Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TernaryTree.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 //using System;
19 //using System.Collections;
20 //using System.Text;
21 
22 //namespace Lucene.Net.Analysis.Compound.Hyphenation
23 //{
24 // /*
25 // * <h2>Ternary Search Tree.</h2>
26 // *
27 // * <p>
28 // * A ternary search tree is a hybrid between a binary tree and a digital search
29 // * tree (trie). Keys are limited to strings. A data value of type char is stored
30 // * in each leaf node. It can be used as an index (or pointer) to the data.
31 // * Branches that only contain one key are compressed to one node by storing a
32 // * pointer to the trailer substring of the key. This class is intended to serve
33 // * as base class or helper class to implement Dictionary collections or the
34 // * like. Ternary trees have some nice properties as the following: the tree can
35 // * be traversed in sorted order, partial matches (wildcard) can be implemented,
36 // * retrieval of all keys within a given distance from the target, etc. The
37 // * storage requirements are higher than a binary tree but a lot less than a
38 // * trie. Performance is comparable with a hash table, sometimes it outperforms a
39 // * hash function (most of the time can determine a miss faster than a hash).
40 // * </p>
41 // *
42 // * <p>
43 // * The main purpose of this java port is to serve as a base for implementing
44 // * TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
45 // * requires from 5000 to 15000 hyphenation patterns which will be keys in this
46 // * tree. The strings patterns are usually small (from 2 to 5 characters), but
47 // * each char in the tree is stored in a node. Thus memory usage is the main
48 // * concern. We will sacrifice 'elegance' to keep memory requirements to the
49 // * minimum. Using java's char type as pointer (yes, I know pointer it is a
50 // * forbidden word in java) we can keep the size of the node to be just 8 bytes
51 // * (3 pointers and the data char). This gives room for about 65000 nodes. In my
52 // * tests the english patterns took 7694 nodes and the german patterns 10055
53 // * nodes, so I think we are safe.
54 // * </p>
55 // *
56 // * <p>
57 // * All said, this is a map with strings as keys and char as value. Pretty
58 // * limited!. It can be extended to a general map by using the string
59 // * representation of an object and using the char value as an index to an array
60 // * that contains the object values.
61 // * </p>
62 // *
63 // * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
64 // */
65 // [Serializable]
66 // public class TernaryTree : ICloneable
67 // {
68 
69 // /*
70 // * We use 4 arrays to represent a node. I guess I should have created a proper
71 // * node class, but somehow Knuth's pascal code made me forget we now have a
72 // * portable language with virtual memory management and automatic garbage
73 // * collection! And now is kind of late, furthermore, if it ain't broken, don't
74 // * fix it.
75 // */
76 
77 // /*
78 // * Pointer to low branch and to rest of the key when it is stored directly in
79 // * this node, we don't have unions in java!
80 // */
81 // protected char[] lo;
82 
83 // /*
84 // * Pointer to high branch.
85 // */
86 // protected char[] hi;
87 
88 // /*
89 // * Pointer to equal branch and to data when this node is a string terminator.
90 // */
91 // protected char[] eq;
92 
93 // /*
94 // * <P>
95 // * The character stored in this node: splitchar. Two special values are
96 // * reserved:
97 // * </P>
98 // * <ul>
99 // * <li>0x0000 as string terminator</li>
100 // * <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
101 // * </ul>
102 // * <p>
103 // * This shouldn't be a problem if we give the usual semantics to strings since
104 // * 0xFFFF is guaranteed not to be an Unicode character.
105 // * </p>
106 // */
107 // protected char[] sc;
108 
109 // /*
110 // * This vector holds the trailing of the keys when the branch is compressed.
111 // */
112 // protected CharVector kv;
113 
114 // protected char root;
115 
116 // protected char freenode;
117 
118 // protected int length; // number of items in tree
119 
120 // protected static readonly int BLOCK_SIZE = 2048; // allocation size for arrays
121 
122 // private TernaryTree()
123 // {
124 // Init();
125 // }
126 
127 // protected void Init()
128 // {
129 // root = (char)0;
130 // freenode = (char)1;
131 // length = 0;
132 // lo = new char[BLOCK_SIZE];
133 // hi = new char[BLOCK_SIZE];
134 // eq = new char[BLOCK_SIZE];
135 // sc = new char[BLOCK_SIZE];
136 // kv = new CharVector();
137 // }
138 
139 // /*
140 // * Branches are initially compressed, needing one node per key plus the size
141 // * of the string key. They are decompressed as needed when another key with
142 // * same prefix is inserted. This saves a lot of space, specially for long
143 // * keys.
144 // */
145 // public void insert(String key, char val)
146 // {
147 // // make sure we have enough room in the arrays
148 // int len = key.Length + 1; // maximum number of nodes that may be generated
149 // if (freenode + len > eq.Length)
150 // {
151 // redimNodeArrays(eq.Length + BLOCK_SIZE);
152 // }
153 // char[] strkey = new char[len--];
154 // key.GetChars(0, len, strkey, 0);
155 // strkey[len] = (char)0;
156 // root = insert(root, strkey, 0, val);
157 // }
158 
159 // public void insert(char[] key, int start, char val)
160 // {
161 // int len = strlen(key) + 1;
162 // if (freenode + len > eq.Length)
163 // {
164 // redimNodeArrays(eq.Length + BLOCK_SIZE);
165 // }
166 // root = insert(root, key, start, val);
167 // }
168 
169 // /*
170 // * The actual insertion function, recursive version.
171 // */
172 // private char insert(char p, char[] key, int start, char val)
173 // {
174 // int len = strlen(key, start);
175 // if (p == 0)
176 // {
177 // // this means there is no branch, this node will start a new branch.
178 // // Instead of doing that, we store the key somewhere else and create
179 // // only one node with a pointer to the key
180 // p = freenode++;
181 // eq[p] = val; // holds data
182 // length++;
183 // hi[p] = (char)0;
184 // if (len > 0)
185 // {
186 // sc[p] = (char)0xFFFF; // indicates branch is compressed
187 // lo[p] = (char)kv.Alloc(len + 1); // use 'lo' to hold pointer to key
188 // strcpy(kv.GetArray(), lo[p], key, start);
189 // }
190 // else
191 // {
192 // sc[p] = (char)0;
193 // lo[p] = (char)0;
194 // }
195 // return p;
196 // }
197 
198 // if (sc[p] == 0xFFFF)
199 // {
200 // // branch is compressed: need to decompress
201 // // this will generate garbage in the external key array
202 // // but we can do some garbage collection later
203 // char pp = freenode++;
204 // lo[pp] = lo[p]; // previous pointer to key
205 // eq[pp] = eq[p]; // previous pointer to data
206 // lo[p] = (char)0;
207 // if (len > 0)
208 // {
209 // sc[p] = kv.Get(lo[pp]);
210 // eq[p] = pp;
211 // lo[pp]++;
212 // if (kv.Get(lo[pp]) == 0)
213 // {
214 // // key completly decompressed leaving garbage in key array
215 // lo[pp] = (char)0;
216 // sc[pp] = (char)0;
217 // hi[pp] = (char)0;
218 // }
219 // else
220 // {
221 // // we only got first char of key, rest is still there
222 // sc[pp] = (char)0xFFFF;
223 // }
224 // }
225 // else
226 // {
227 // // In this case we can save a node by swapping the new node
228 // // with the compressed node
229 // sc[pp] = (char)0xFFFF;
230 // hi[p] = pp;
231 // sc[p] = (char)0;
232 // eq[p] = val;
233 // length++;
234 // return p;
235 // }
236 // }
237 // char s = key[start];
238 // if (s < sc[p])
239 // {
240 // lo[p] = insert(lo[p], key, start, val);
241 // }
242 // else if (s == sc[p])
243 // {
244 // if (s != 0)
245 // {
246 // eq[p] = insert(eq[p], key, start + 1, val);
247 // }
248 // else
249 // {
250 // // key already in tree, overwrite data
251 // eq[p] = val;
252 // }
253 // }
254 // else
255 // {
256 // hi[p] = insert(hi[p], key, start, val);
257 // }
258 // return p;
259 // }
260 
261 // /*
262 // * Compares 2 null terminated char arrays
263 // */
264 // public static int strcmp(char[] a, int startA, char[] b, int startB)
265 // {
266 // for (; a[startA] == b[startB]; startA++, startB++)
267 // {
268 // if (a[startA] == 0)
269 // {
270 // return 0;
271 // }
272 // }
273 // return a[startA] - b[startB];
274 // }
275 
276 // /*
277 // * Compares a string with null terminated char array
278 // */
279 // public static int strcmp(String str, char[] a, int start)
280 // {
281 // int i, d, len = str.Length;
282 // for (i = 0; i < len; i++)
283 // {
284 // d = (int)str[i] - a[start + i];
285 // if (d != 0)
286 // {
287 // return d;
288 // }
289 // if (a[start + i] == 0)
290 // {
291 // return d;
292 // }
293 // }
294 // if (a[start + i] != 0)
295 // {
296 // return (int)-a[start + i];
297 // }
298 // return 0;
299 
300 // }
301 
302 // public static void strcpy(char[] dst, int di, char[] src, int si)
303 // {
304 // while (src[si] != 0)
305 // {
306 // dst[di++] = src[si++];
307 // }
308 // dst[di] = (char)0;
309 // }
310 
311 // public static int strlen(char[] a, int start)
312 // {
313 // int len = 0;
314 // for (int i = start; i < a.Length && a[i] != 0; i++)
315 // {
316 // len++;
317 // }
318 // return len;
319 // }
320 
321 // public static int strlen(char[] a)
322 // {
323 // return strlen(a, 0);
324 // }
325 
326 // public int find(String key)
327 // {
328 // int len = key.Length;
329 // char[] strkey = new char[len + 1];
330 // key.GetChars(0, len, strkey, 0);
331 // strkey[len] = (char)0;
332 
333 // return find(strkey, 0);
334 // }
335 
336 // public int find(char[] key, int start)
337 // {
338 // int d;
339 // char p = root;
340 // int i = start;
341 // char c;
342 
343 // while (p != 0)
344 // {
345 // if (sc[p] == 0xFFFF)
346 // {
347 // if (strcmp(key, i, kv.GetArray(), lo[p]) == 0)
348 // {
349 // return eq[p];
350 // }
351 // else
352 // {
353 // return -1;
354 // }
355 // }
356 // c = key[i];
357 // d = c - sc[p];
358 // if (d == 0)
359 // {
360 // if (c == 0)
361 // {
362 // return eq[p];
363 // }
364 // i++;
365 // p = eq[p];
366 // }
367 // else if (d < 0)
368 // {
369 // p = lo[p];
370 // }
371 // else
372 // {
373 // p = hi[p];
374 // }
375 // }
376 // return -1;
377 // }
378 
379 // public bool knows(String key)
380 // {
381 // return (find(key) >= 0);
382 // }
383 
384 // // redimension the arrays
385 // private void redimNodeArrays(int newsize)
386 // {
387 // int len = newsize < lo.Length ? newsize : lo.Length;
388 // char[] na = new char[newsize];
389 // Array.Copy(lo, 0, na, 0, len);
390 // lo = na;
391 // na = new char[newsize];
392 // Array.Copy(hi, 0, na, 0, len);
393 // hi = na;
394 // na = new char[newsize];
395 // Array.Copy(eq, 0, na, 0, len);
396 // eq = na;
397 // na = new char[newsize];
398 // Array.Copy(sc, 0, na, 0, len);
399 // sc = na;
400 // }
401 
402 // public int size()
403 // {
404 // return length;
405 // }
406 
407 // public Object clone()
408 // {
409 // TernaryTree t = new TernaryTree();
410 // t.lo = (char[])this.lo.Clone();
411 // t.hi = (char[])this.hi.Clone();
412 // t.eq = (char[])this.eq.Clone();
413 // t.sc = (char[])this.sc.Clone();
414 // t.kv = (CharVector)this.kv.Clone();
415 // t.root = this.root;
416 // t.freenode = this.freenode;
417 // t.length = this.length;
418 
419 // return t;
420 // }
421 
422 // /*
423 // * Recursively insert the median first and then the median of the lower and
424 // * upper halves, and so on in order to get a balanced tree. The array of keys
425 // * is assumed to be sorted in ascending order.
426 // */
427 // protected void insertBalanced(String[] k, char[] v, int offset, int n)
428 // {
429 // int m;
430 // if (n < 1)
431 // {
432 // return;
433 // }
434 // m = n >> 1;
435 
436 // insert(k[m + offset], v[m + offset]);
437 // insertBalanced(k, v, offset, m);
438 
439 // insertBalanced(k, v, offset + m + 1, n - m - 1);
440 // }
441 
442 // /*
443 // * Balance the tree for best search performance
444 // */
445 // public void balance()
446 // {
447 // // System.out.print("Before root splitchar = ");
448 // // System.out.println(sc[root]);
449 
450 // int i = 0, n = length;
451 // String[] k = new String[n];
452 // char[] v = new char[n];
453 // Iterator iter = new Iterator();
454 // while (iter.HasMoreElements())
455 // {
456 // v[i] = iter.getValue();
457 // k[i++] = (String)iter.nextElement();
458 // }
459 // Init();
460 // insertBalanced(k, v, 0, n);
461 
462 // // With uniform letter distribution sc[root] should be around 'm'
463 // // System.out.print("After root splitchar = ");
464 // // System.out.println(sc[root]);
465 // }
466 
467 // /*
468 // * Each node stores a character (splitchar) which is part of some key(s). In a
469 // * compressed branch (one that only contain a single string key) the trailer
470 // * of the key which is not already in nodes is stored externally in the kv
471 // * array. As items are inserted, key substrings decrease. Some substrings may
472 // * completely disappear when the whole branch is totally decompressed. The
473 // * tree is traversed to find the key substrings actually used. In addition,
474 // * duplicate substrings are removed using a map (implemented with a
475 // * TernaryTree!).
476 // *
477 // */
478 // public void trimToSize()
479 // {
480 // // first balance the tree for best performance
481 // balance();
482 
483 // // redimension the node arrays
484 // redimNodeArrays(freenode);
485 
486 // // ok, compact kv array
487 // CharVector kx = new CharVector();
488 // kx.Alloc(1);
489 // TernaryTree map = new TernaryTree();
490 // compact(kx, map, root);
491 // kv = kx;
492 // kv.TrimToSize();
493 // }
494 
495 // private void compact(CharVector kx, TernaryTree map, char p)
496 // {
497 // int k;
498 // if (p == 0)
499 // {
500 // return;
501 // }
502 // if (sc[p] == 0xFFFF)
503 // {
504 // k = map.find(kv.GetArray(), lo[p]);
505 // if (k < 0)
506 // {
507 // k = kx.Alloc(strlen(kv.GetArray(), lo[p]) + 1);
508 // strcpy(kx.GetArray(), k, kv.GetArray(), lo[p]);
509 // map.insert(kx.GetArray(), k, (char)k);
510 // }
511 // lo[p] = (char)k;
512 // }
513 // else
514 // {
515 // compact(kx, map, lo[p]);
516 // if (sc[p] != 0)
517 // {
518 // compact(kx, map, eq[p]);
519 // }
520 // compact(kx, map, hi[p]);
521 // }
522 // }
523 
524 // public IEnumerator keys()
525 // {
526 // return new Iterator();
527 // }
528 
529 // public class Iterator : IEnumerator
530 // {
531 
532 // /*
533 // * current node index
534 // */
535 // int cur;
536 
537 // /*
538 // * current key
539 // */
540 // String curkey;
541 
542 // private class Item : ICloneable
543 // {
544 // char parent;
545 
546 // char child;
547 
548 // public Item()
549 // {
550 // parent = (char)0;
551 // child = (char)0;
552 // }
553 
554 // public Item(char p, char c)
555 // {
556 // parent = p;
557 // child = c;
558 // }
559 
560 // public Object Clone()
561 // {
562 // return new Item(parent, child);
563 // }
564 
565 // }
566 
567 // /*
568 // * Node stack
569 // */
570 // Stack ns;
571 
572 // /*
573 // * key stack implemented with a StringBuilder
574 // */
575 // StringBuilder ks;
576 
577 // public Iterator()
578 // {
579 // cur = -1;
580 // ns = new Stack();
581 // ks = new StringBuilder();
582 // rewind();
583 // }
584 
585 // public void rewind()
586 // {
587 // ns.Clear();
588 // ks.SetLength(0);
589 // cur = root;
590 // Run();
591 // }
592 
593 // public Object nextElement()
594 // {
595 // String res = curkey;
596 // cur = up();
597 // Run();
598 // return res;
599 // }
600 
601 // public char getValue()
602 // {
603 // if (cur >= 0)
604 // {
605 // return eq[cur];
606 // }
607 // return 0;
608 // }
609 
610 // public bool hasMoreElements()
611 // {
612 // return (cur != -1);
613 // }
614 
615 // /*
616 // * traverse upwards
617 // */
618 // private int up()
619 // {
620 // Item i = new Item();
621 // int res = 0;
622 
623 // if (ns.Empty())
624 // {
625 // return -1;
626 // }
627 
628 // if (cur != 0 && sc[cur] == 0)
629 // {
630 // return lo[cur];
631 // }
632 
633 // bool climb = true;
634 
635 // while (climb)
636 // {
637 // i = (Item)ns.Pop();
638 // i.child++;
639 // switch (i.child)
640 // {
641 // case 1:
642 // if (sc[i.parent] != 0)
643 // {
644 // res = eq[i.parent];
645 // ns.Push(i.Clone());
646 // ks.Append(sc[i.parent]);
647 // }
648 // else
649 // {
650 // i.child++;
651 // ns.Push(i.Clone());
652 // res = hi[i.parent];
653 // }
654 // climb = false;
655 // break;
656 
657 // case 2:
658 // res = hi[i.parent];
659 // ns.Push(i.Clone());
660 // if (ks.Length() > 0)
661 // {
662 // ks.SetLength(ks.Length() - 1); // pop
663 // }
664 // climb = false;
665 // break;
666 
667 // default:
668 // if (ns.Clear())
669 // {
670 // return -1;
671 // }
672 // climb = true;
673 // break;
674 // }
675 // }
676 // return res;
677 // }
678 
679 // /*
680 // * traverse the tree to find next key
681 // */
682 // private int Run()
683 // {
684 // if (cur == -1)
685 // {
686 // return -1;
687 // }
688 
689 // bool leaf = false;
690 // while (true)
691 // {
692 // // first go down on low branch until leaf or compressed branch
693 // while (cur != 0)
694 // {
695 // if (sc[cur] == 0xFFFF)
696 // {
697 // leaf = true;
698 // break;
699 // }
700 // ns.Push(new Item((char)cur, '\u0000'));
701 // if (sc[cur] == 0)
702 // {
703 // leaf = true;
704 // break;
705 // }
706 // cur = lo[cur];
707 // }
708 // if (leaf)
709 // {
710 // break;
711 // }
712 // // nothing found, go up one node and try again
713 // cur = up();
714 // if (cur == -1)
715 // {
716 // return -1;
717 // }
718 // }
719 // // The current node should be a data node and
720 // // the key should be in the key stack (at least partially)
721 // StringBuilder buf = new StringBuilder(ks.ToString());
722 // if (sc[cur] == 0xFFFF)
723 // {
724 // int p = lo[cur];
725 // while (kv.Get(p) != 0)
726 // {
727 // buf.Append(kv.Get(p++));
728 // }
729 // }
730 // curkey = buf.ToString();
731 // return 0;
732 // }
733 
734 // }
735 
736 // public void PrintStats()
737 // {
738 // Console.WriteLine("Number of keys = " + length);
739 // Console.WriteLine("Node count = " + freenode);
740 // // System.out.println("Array length = " + Integer.toString(eq.length));
741 // Console.WriteLine("Key Array length = " + kv.Length());
742 
743 // /*
744 // * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
745 // * System.out.print(kv.get(i)); else System.out.println("");
746 // * System.out.println("Keys:"); for(Enumeration enum = keys();
747 // * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
748 // */
749 
750 // }
751 
752 // public static void Main(String[] args)
753 // {
754 // TernaryTree tt = new TernaryTree();
755 // tt.insert("Carlos", 'C');
756 // tt.insert("Car", 'r');
757 // tt.insert("palos", 'l');
758 // tt.insert("pa", 'p');
759 // tt.trimToSize();
760 // Console.WriteLine((char)tt.find("Car"));
761 // Console.WriteLine((char)tt.find("Carlos"));
762 // Console.WriteLine((char)tt.find("alto"));
763 // tt.PrintStats();
764 // }
765 // }
766 //}