Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
HyphenationTree.cs
Go to the documentation of this file.
1 ////*
2 // * Licensed to the Apache Software Foundation (ASF) under one or more
3 // * contributor license agreements. See the NOTICE file distributed with
4 // * this work for additional information regarding copyright ownership.
5 // * The ASF licenses this file to You under the Apache License, Version 2.0
6 // * (the "License"); you may not use this file except in compliance with
7 // * the License. You may obtain a copy of the License at
8 // *
9 // * http://www.apache.org/licenses/LICENSE-2.0
10 // *
11 // * Unless required by applicable law or agreed to in writing, software
12 // * distributed under the License is distributed on an "AS IS" BASIS,
13 // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // * See the License for the specific language governing permissions and
15 // * limitations under the License.
16 // */
17 
18 //using System;
19 //using System.Collections.Generic;
20 //using System.IO;
21 //using System.Linq;
22 //using System.Text;
23 //using Lucene.Net.Support;
24 
25 //namespace Lucene.Net.Analysis.Compound.Hyphenation
26 //{
27 ////*
28 // * This tree structure stores the hyphenation patterns in an efficient way for
29 // * fast lookup. It provides the provides the method to hyphenate a word.
30 // *
31 // * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
32 // */
33 //[Serializable]
34 //public class HyphenationTree : TernaryTree, PatternConsumer
35 //{
36 
37 // private static readonly long serialVersionUID = -7842107987915665573L;
38 
39 // /*
40 // * value space: stores the interletter values
41 // */
42 // protected ByteVector vspace;
43 
44 // /*
45 // * This map stores hyphenation exceptions
46 // */
47 // protected HashMap<String,ArrayList> stoplist;
48 
49 // /*
50 // * This map stores the character classes
51 // */
52 // protected TernaryTree classmap;
53 
54 // /*
55 // * Temporary map to store interletter values on pattern loading.
56 // */
57 // [NonSerialized]
58 // private TernaryTree ivalues;
59 
60 // public HyphenationTree() {
61 // stoplist = new HashMap<string,ArrayList>(23); // usually a small table
62 // classmap = new TernaryTree();
63 // vspace = new ByteVector();
64 // vspace.Alloc(1); // this reserves index 0, which we don't use
65 // }
66 
67 // /*
68 // * Packs the values by storing them in 4 bits, two values into a byte Values
69 // * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
70 // * value.
71 // *
72 // * @param values a string of digits from '0' to '9' representing the
73 // * interletter values.
74 // * @return the index into the vspace array where the packed values are stored.
75 // */
76 // protected int packValues(String values) {
77 // int i, n = values.Length;
78 // int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
79 // int offset = vspace.Alloc(m);
80 // byte[] va = vspace.GetArray();
81 // for (i = 0; i < n; i++) {
82 // int j = i >> 1;
83 // byte v = (byte) ((values[i] - '0' + 1) & 0x0f);
84 // if ((i & 1) == 1) {
85 // va[j + offset] = (byte) (va[j + offset] | v);
86 // } else {
87 // va[j + offset] = (byte) (v << 4); // big endian
88 // }
89 // }
90 // va[m - 1 + offset] = 0; // terminator
91 // return offset;
92 // }
93 
94 // protected String unpackValues(int k) {
95 // StringBuilder buf = new StringBuilder();
96 // byte v = vspace.Get(k++);
97 // while (v != 0) {
98 // char c = (char) ((v >>> 4) - 1 + '0');
99 // buf.Append(c);
100 // c = (char) (v & 0x0f);
101 // if (c == 0) {
102 // break;
103 // }
104 // c = (char) (c - 1 + '0');
105 // buf.Append(c);
106 // v = vspace.Get(k++);
107 // }
108 // return buf.ToString();
109 // }
110 
111 // /*
112 // * Read hyphenation patterns from an XML file.
113 // *
114 // * @param f the filename
115 // * @throws HyphenationException In case the parsing fails
116 // */
117 // public void loadPatterns(FileInfo f)
118 // {
119 // try {
120 // InputSource src = new InputSource(f.toURL().toExternalForm());
121 // loadPatterns(src);
122 // } catch (MalformedURLException e) {
123 // throw new HyphenationException("Error converting the File '" + f
124 // + "' to a URL: " + e.getMessage());
125 // }
126 // }
127 
128 // /*
129 // * Read hyphenation patterns from an XML file.
130 // *
131 // * @param source the InputSource for the file
132 // * @throws HyphenationException In case the parsing fails
133 // */
134 // public void loadPatterns(InputSource source)
135 // {
136 // PatternParser pp = new PatternParser(this);
137 // ivalues = new TernaryTree();
138 
139 // pp.parse(source);
140 
141 // // patterns/values should be now in the tree
142 // // let's optimize a bit
143 // trimToSize();
144 // vspace.trimToSize();
145 // classmap.trimToSize();
146 
147 // // get rid of the auxiliary map
148 // ivalues = null;
149 // }
150 
151 // public String findPattern(String pat) {
152 // int k = super.find(pat);
153 // if (k >= 0) {
154 // return unpackValues(k);
155 // }
156 // return "";
157 // }
158 
159 // /*
160 // * String compare, returns 0 if equal or t is a substring of s
161 // */
162 // protected int hstrcmp(char[] s, int si, char[] t, int ti) {
163 // for (; s[si] == t[ti]; si++, ti++) {
164 // if (s[si] == 0) {
165 // return 0;
166 // }
167 // }
168 // if (t[ti] == 0) {
169 // return 0;
170 // }
171 // return s[si] - t[ti];
172 // }
173 
174 // protected byte[] getValues(int k) {
175 // StringBuilder buf = new StringBuilder();
176 // byte v = vspace.get(k++);
177 // while (v != 0) {
178 // char c = (char) ((v >>> 4) - 1);
179 // buf.append(c);
180 // c = (char) (v & 0x0f);
181 // if (c == 0) {
182 // break;
183 // }
184 // c = (char) (c - 1);
185 // buf.append(c);
186 // v = vspace.get(k++);
187 // }
188 // byte[] res = new byte[buf.length()];
189 // for (int i = 0; i < res.length; i++) {
190 // res[i] = (byte) buf.charAt(i);
191 // }
192 // return res;
193 // }
194 
195 // /*
196 // * <p>
197 // * Search for all possible partial matches of word starting at index an update
198 // * interletter values. In other words, it does something like:
199 // * </p>
200 // * <code>
201 // * for(i=0; i<patterns.length; i++) {
202 // * if ( word.substring(index).startsWidth(patterns[i]) )
203 // * update_interletter_values(patterns[i]);
204 // * }
205 // * </code>
206 // * <p>
207 // * But it is done in an efficient way since the patterns are stored in a
208 // * ternary tree. In fact, this is the whole purpose of having the tree: doing
209 // * this search without having to test every single pattern. The number of
210 // * patterns for languages such as English range from 4000 to 10000. Thus,
211 // * doing thousands of string comparisons for each word to hyphenate would be
212 // * really slow without the tree. The tradeoff is memory, but using a ternary
213 // * tree instead of a trie, almost halves the the memory used by Lout or TeX.
214 // * It's also faster than using a hash table
215 // * </p>
216 // *
217 // * @param word null terminated word to match
218 // * @param index start index from word
219 // * @param il interletter values array to update
220 // */
221 // protected void searchPatterns(char[] word, int index, byte[] il) {
222 // byte[] values;
223 // int i = index;
224 // char p, q;
225 // char sp = word[i];
226 // p = root;
227 
228 // while (p > 0 && p < sc.length) {
229 // if (sc[p] == 0xFFFF) {
230 // if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
231 // values = getValues(eq[p]); // data pointer is in eq[]
232 // int j = index;
233 // for (int k = 0; k < values.length; k++) {
234 // if (j < il.length && values[k] > il[j]) {
235 // il[j] = values[k];
236 // }
237 // j++;
238 // }
239 // }
240 // return;
241 // }
242 // int d = sp - sc[p];
243 // if (d == 0) {
244 // if (sp == 0) {
245 // break;
246 // }
247 // sp = word[++i];
248 // p = eq[p];
249 // q = p;
250 
251 // // look for a pattern ending at this position by searching for
252 // // the null char ( splitchar == 0 )
253 // while (q > 0 && q < sc.length) {
254 // if (sc[q] == 0xFFFF) { // stop at compressed branch
255 // break;
256 // }
257 // if (sc[q] == 0) {
258 // values = getValues(eq[q]);
259 // int j = index;
260 // for (int k = 0; k < values.length; k++) {
261 // if (j < il.length && values[k] > il[j]) {
262 // il[j] = values[k];
263 // }
264 // j++;
265 // }
266 // break;
267 // } else {
268 // q = lo[q];
269 
270 // /*
271 // * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
272 // * java chars are unsigned
273 // */
274 // }
275 // }
276 // } else {
277 // p = d < 0 ? lo[p] : hi[p];
278 // }
279 // }
280 // }
281 
282 // /*
283 // * Hyphenate word and return a Hyphenation object.
284 // *
285 // * @param word the word to be hyphenated
286 // * @param remainCharCount Minimum number of characters allowed before the
287 // * hyphenation point.
288 // * @param pushCharCount Minimum number of characters allowed after the
289 // * hyphenation point.
290 // * @return a {@link Hyphenation Hyphenation} object representing the
291 // * hyphenated word or null if word is not hyphenated.
292 // */
293 // public Hyphenation hyphenate(String word, int remainCharCount,
294 // int pushCharCount) {
295 // char[] w = word.toCharArray();
296 // return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
297 // }
298 
299 // /*
300 // * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
301 // * may be absent, the first n is at offset, the first l is at offset +
302 // * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
303 // * into word. In the first part of the routine len = w.length, in the second
304 // * part of the routine len = word.length. Three indices are used: index(w),
305 // * the index in w, index(word), the index in word, letterindex(word), the
306 // * index in the letter part of word. The following relations exist: index(w) =
307 // * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
308 // * index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
309 // * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
310 // * iIgnoreAtBeginning
311 // */
312 
313 // /*
314 // * Hyphenate word and return an array of hyphenation points.
315 // *
316 // * @param w char array that contains the word
317 // * @param offset Offset to first character in word
318 // * @param len Length of word
319 // * @param remainCharCount Minimum number of characters allowed before the
320 // * hyphenation point.
321 // * @param pushCharCount Minimum number of characters allowed after the
322 // * hyphenation point.
323 // * @return a {@link Hyphenation Hyphenation} object representing the
324 // * hyphenated word or null if word is not hyphenated.
325 // */
326 // public Hyphenation hyphenate(char[] w, int offset, int len,
327 // int remainCharCount, int pushCharCount) {
328 // int i;
329 // char[] word = new char[len + 3];
330 
331 // // normalize word
332 // char[] c = new char[2];
333 // int iIgnoreAtBeginning = 0;
334 // int iLength = len;
335 // boolean bEndOfLetters = false;
336 // for (i = 1; i <= len; i++) {
337 // c[0] = w[offset + i - 1];
338 // int nc = classmap.find(c, 0);
339 // if (nc < 0) { // found a non-letter character ...
340 // if (i == (1 + iIgnoreAtBeginning)) {
341 // // ... before any letter character
342 // iIgnoreAtBeginning++;
343 // } else {
344 // // ... after a letter character
345 // bEndOfLetters = true;
346 // }
347 // iLength--;
348 // } else {
349 // if (!bEndOfLetters) {
350 // word[i - iIgnoreAtBeginning] = (char) nc;
351 // } else {
352 // return null;
353 // }
354 // }
355 // }
356 // len = iLength;
357 // if (len < (remainCharCount + pushCharCount)) {
358 // // word is too short to be hyphenated
359 // return null;
360 // }
361 // int[] result = new int[len + 1];
362 // int k = 0;
363 
364 // // check exception list first
365 // String sw = new String(word, 1, len);
366 // if (stoplist.containsKey(sw)) {
367 // // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
368 // // null)
369 // ArrayList hw = stoplist.get(sw);
370 // int j = 0;
371 // for (i = 0; i < hw.size(); i++) {
372 // Object o = hw.get(i);
373 // // j = index(sw) = letterindex(word)?
374 // // result[k] = corresponding index(w)
375 // if (o instanceof String) {
376 // j += ((String) o).length();
377 // if (j >= remainCharCount && j < (len - pushCharCount)) {
378 // result[k++] = j + iIgnoreAtBeginning;
379 // }
380 // }
381 // }
382 // } else {
383 // // use algorithm to get hyphenation points
384 // word[0] = '.'; // word start marker
385 // word[len + 1] = '.'; // word end marker
386 // word[len + 2] = 0; // null terminated
387 // byte[] il = new byte[len + 3]; // initialized to zero
388 // for (i = 0; i < len + 1; i++) {
389 // searchPatterns(word, i, il);
390 // }
391 
392 // // hyphenation points are located where interletter value is odd
393 // // i is letterindex(word),
394 // // i + 1 is index(word),
395 // // result[k] = corresponding index(w)
396 // for (i = 0; i < len; i++) {
397 // if (((il[i + 1] & 1) == 1) && i >= remainCharCount
398 // && i <= (len - pushCharCount)) {
399 // result[k++] = i + iIgnoreAtBeginning;
400 // }
401 // }
402 // }
403 
404 // if (k > 0) {
405 // // trim result array
406 // int[] res = new int[k+2];
407 // System.arraycopy(result, 0, res, 1, k);
408 // // We add the synthetical hyphenation points
409 // // at the beginning and end of the word
410 // res[0]=0;
411 // res[k+1]=len;
412 // return new Hyphenation(res);
413 // } else {
414 // return null;
415 // }
416 // }
417 
418 // /*
419 // * Add a character class to the tree. It is used by
420 // * {@link PatternParser PatternParser} as callback to add character classes.
421 // * Character classes define the valid word characters for hyphenation. If a
422 // * word contains a character not defined in any of the classes, it is not
423 // * hyphenated. It also defines a way to normalize the characters in order to
424 // * compare them with the stored patterns. Usually pattern files use only lower
425 // * case characters, in this case a class for letter 'a', for example, should
426 // * be defined as "aA", the first character being the normalization char.
427 // */
428 // public void AddClass(String chargroup) {
429 // if (chargroup.Length > 0) {
430 // char equivChar = chargroup[0];
431 // char[] key = new char[2];
432 // key[1] = (char)0;
433 // for (int i = 0; i < chargroup.Length; i++) {
434 // key[0] = chargroup[i];
435 // classmap.Insert(key, 0, equivChar);
436 // }
437 // }
438 // }
439 
440 // /*
441 // * Add an exception to the tree. It is used by
442 // * {@link PatternParser PatternParser} class as callback to store the
443 // * hyphenation exceptions.
444 // *
445 // * @param word normalized word
446 // * @param hyphenatedword a vector of alternating strings and
447 // * {@link Hyphen hyphen} objects.
448 // */
449 // public void AddException(String word, ArrayList hyphenatedword) {
450 // stoplist.Add(word, hyphenatedword);
451 // }
452 
453 // /*
454 // * Add a pattern to the tree. Mainly, to be used by
455 // * {@link PatternParser PatternParser} class as callback to add a pattern to
456 // * the tree.
457 // *
458 // * @param pattern the hyphenation pattern
459 // * @param ivalue interletter weight values indicating the desirability and
460 // * priority of hyphenating at a given point within the pattern. It
461 // * should contain only digit characters. (i.e. '0' to '9').
462 // */
463 // public void AddPattern(String pattern, String ivalue) {
464 // int k = ivalues.Find(ivalue);
465 // if (k <= 0) {
466 // k = packValues(ivalue);
467 // ivalues.Insert(ivalue, (char)k);
468 // }
469 // Insert(pattern, (char) k);
470 // }
471 
472 // public override void PrintStats() {
473 // Console.WriteLine("Value space size = " + vspace.Length());
474 // base.PrintStats();
475 
476 // }
477 //}
478 
479 //}
480