Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
PatternParser.cs
Go to the documentation of this file.
1 //using System;
2 //using System.Collections;
3 //using System.Collections.Generic;
4 //using System.IO;
5 //using System.Linq;
6 //using System.Text;
7 //using Lucene.Net.Analysis.Compound.Hyphenation;
8 
9 //namespace Lucene.Net.Analyzers.Compound.Hyphenation
10 //{
11 // /*
12 // * A SAX document handler to read and parse hyphenation patterns from an XML
13 // * file.
14 // *
15 // * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
16 // */
17 //public class PatternParser : DefaultHandler, PatternConsumer {
18 
19 // XMLReader parser;
20 
21 // int currElement;
22 
23 // PatternConsumer consumer;
24 
25 // StringBuilder token;
26 
27 // ArrayList exception;
28 
29 // char hyphenChar;
30 
31 // String errMsg;
32 
33 // static readonly int ELEM_CLASSES = 1;
34 
35 // static readonly int ELEM_EXCEPTIONS = 2;
36 
37 // static readonly int ELEM_PATTERNS = 3;
38 
39 // static readonly int ELEM_HYPHEN = 4;
40 
41 // public PatternParser()
42 // {
43 // token = new StringBuilder();
44 // parser = CreateParser();
45 // parser.SetContentHandler(this);
46 // parser.SetErrorHandler(this);
47 // parser.SetEntityResolver(this);
48 // hyphenChar = '-'; // default
49 
50 // }
51 
52 // public PatternParser(PatternConsumer consumer)
53 // : this()
54 // {
55 // this.consumer = consumer;
56 // }
57 
58 // public void setConsumer(PatternConsumer consumer) {
59 // this.consumer = consumer;
60 // }
61 
62 // /*
63 // * Parses a hyphenation pattern file.
64 // *
65 // * @param filename the filename
66 // * @throws HyphenationException In case of an exception while parsing
67 // */
68 // public void parse(String filename)
69 // {
70 // parse(new FileInfo(filename));
71 // }
72 
73 // /*
74 // * Parses a hyphenation pattern file.
75 // *
76 // * @param file the pattern file
77 // * @throws HyphenationException In case of an exception while parsing
78 // */
79 // public void parse(FileInfo file)
80 // {
81 // try {
82 // InputSource src = new InputSource(file.toURL().toExternalForm());
83 // parse(src);
84 // } catch (MalformedURLException e) {
85 // throw new HyphenationException("Error converting the File '" + file
86 // + "' to a URL: " + e.GetMessage());
87 // }
88 // }
89 
90 // /*
91 // * Parses a hyphenation pattern file.
92 // *
93 // * @param source the InputSource for the file
94 // * @throws HyphenationException In case of an exception while parsing
95 // */
96 // public void parse(InputSource source)
97 // {
98 // try {
99 // parser.parse(source);
100 // } catch (FileNotFoundException fnfe) {
101 // throw new HyphenationException("File not found: " + fnfe.GetMessage());
102 // } catch (IOException ioe) {
103 // throw new HyphenationException(ioe.GetMessage());
104 // } catch (SAXException e) {
105 // throw new HyphenationException(errMsg);
106 // }
107 // }
108 
109 // /*
110 // * Creates a SAX parser using JAXP
111 // *
112 // * @return the created SAX parser
113 // */
114 // static XMLReader createParser() {
115 // try {
116 // SAXParserFactory factory = SAXParserFactory.newInstance();
117 // factory.SetNamespaceAware(true);
118 // return factory.newSAXParser().GetXMLReader();
119 // } catch (Exception e) {
120 // throw new RuntimeException("Couldn't create XMLReader: " + e.GetMessage());
121 // }
122 // }
123 
124 // protected String readToken(StringBuffer chars) {
125 // String word;
126 // bool space = false;
127 // int i;
128 // for (i = 0; i < chars.Length(); i++) {
129 // if (char.isWhitespace(chars.charAt(i))) {
130 // space = true;
131 // } else {
132 // break;
133 // }
134 // }
135 // if (space) {
136 // // chars.delete(0,i);
137 // for (int countr = i; countr < chars.Length(); countr++) {
138 // chars.SetCharAt(countr - i, chars.charAt(countr));
139 // }
140 // chars.SetLength(chars.Length() - i);
141 // if (token.Length() > 0) {
142 // word = token.ToString();
143 // token.SetLength(0);
144 // return word;
145 // }
146 // }
147 // space = false;
148 // for (i = 0; i < chars.Length(); i++) {
149 // if (char.isWhitespace(chars.charAt(i))) {
150 // space = true;
151 // break;
152 // }
153 // }
154 // token.Append(chars.ToString().substring(0, i));
155 // // chars.delete(0,i);
156 // for (int countr = i; countr < chars.Length(); countr++) {
157 // chars.SetCharAt(countr - i, chars.charAt(countr));
158 // }
159 // chars.SetLength(chars.Length() - i);
160 // if (space) {
161 // word = token.ToString();
162 // token.SetLength(0);
163 // return word;
164 // }
165 // token.Append(chars);
166 // return null;
167 // }
168 
169 // protected static String getPattern(String word) {
170 // StringBuilder pat = new StringBuilder();
171 // int len = word.Length();
172 // for (int i = 0; i < len; i++) {
173 // if (!char.isDigit(word.charAt(i))) {
174 // pat.Append(word.charAt(i));
175 // }
176 // }
177 // return pat.ToString();
178 // }
179 
180 // protected ArrayList normalizeException(ArrayList ex) {
181 // ArrayList res = new ArrayList();
182 // for (int i = 0; i < ex.size(); i++) {
183 // Object item = ex.Get(i);
184 // if (item instanceof String) {
185 // String str = (String) item;
186 // StringBuilder buf = new StringBuilder();
187 // for (int j = 0; j < str.Length(); j++) {
188 // char c = str.charAt(j);
189 // if (c != hyphenChar) {
190 // buf.Append(c);
191 // } else {
192 // res.add(buf.ToString());
193 // buf.SetLength(0);
194 // char[] h = new char[1];
195 // h[0] = hyphenChar;
196 // // we use here hyphenChar which is not necessarily
197 // // the one to be printed
198 // res.add(new Hyphen(new String(h), null, null));
199 // }
200 // }
201 // if (buf.Length() > 0) {
202 // res.add(buf.ToString());
203 // }
204 // } else {
205 // res.add(item);
206 // }
207 // }
208 // return res;
209 // }
210 
211 // protected String getExceptionWord(ArrayList ex) {
212 // StringBuilder res = new StringBuilder();
213 // for (int i = 0; i < ex.size(); i++) {
214 // Object item = ex.Get(i);
215 // if (item instanceof String) {
216 // res.Append((String) item);
217 // } else {
218 // if (((Hyphen) item).noBreak != null) {
219 // res.Append(((Hyphen) item).noBreak);
220 // }
221 // }
222 // }
223 // return res.ToString();
224 // }
225 
226 // protected static String getInterletterValues(String pat) {
227 // StringBuilder il = new StringBuilder();
228 // String word = pat + "a"; // add dummy letter to serve as sentinel
229 // int len = word.Length();
230 // for (int i = 0; i < len; i++) {
231 // char c = word.charAt(i);
232 // if (char.isDigit(c)) {
233 // il.Append(c);
234 // i++;
235 // } else {
236 // il.Append('0');
237 // }
238 // }
239 // return il.ToString();
240 // }
241 
242 // //
243 // // EntityResolver methods
244 // //
245 // public override InputSource resolveEntity(String publicId, String systemId) {
246 // return HyphenationDTDGenerator.generateDTD();
247 // }
248 
249 // //
250 // // ContentHandler methods
251 // //
252 
253 // /*
254 // * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
255 // * java.lang.String, java.lang.String, org.xml.sax.Attributes)
256 // */
257 // public override void startElement(String uri, String local, String raw,
258 // Attributes attrs) {
259 // if (local.equals("hyphen-char")) {
260 // String h = attrs.GetValue("value");
261 // if (h != null && h.Length() == 1) {
262 // hyphenChar = h.charAt(0);
263 // }
264 // } else if (local.equals("classes")) {
265 // currElement = ELEM_CLASSES;
266 // } else if (local.equals("patterns")) {
267 // currElement = ELEM_PATTERNS;
268 // } else if (local.equals("exceptions")) {
269 // currElement = ELEM_EXCEPTIONS;
270 // exception = new ArrayList();
271 // } else if (local.equals("hyphen")) {
272 // if (token.Length() > 0) {
273 // exception.add(token.ToString());
274 // }
275 // exception.add(new Hyphen(attrs.GetValue("pre"), attrs.GetValue("no"),
276 // attrs.GetValue("post")));
277 // currElement = ELEM_HYPHEN;
278 // }
279 // token.SetLength(0);
280 // }
281 
282 // /*
283 // * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
284 // * java.lang.String, java.lang.String)
285 // */
286 // public override void endElement(String uri, String local, String raw) {
287 
288 // if (token.Length() > 0) {
289 // String word = token.ToString();
290 // switch (currElement) {
291 // case ELEM_CLASSES:
292 // consumer.addClass(word);
293 // break;
294 // case ELEM_EXCEPTIONS:
295 // exception.add(word);
296 // exception = normalizeException(exception);
297 // consumer.addException(getExceptionWord(exception),
298 // (ArrayList) exception.clone());
299 // break;
300 // case ELEM_PATTERNS:
301 // consumer.addPattern(getPattern(word), getInterletterValues(word));
302 // break;
303 // case ELEM_HYPHEN:
304 // // nothing to do
305 // break;
306 // }
307 // if (currElement != ELEM_HYPHEN) {
308 // token.SetLength(0);
309 // }
310 // }
311 // if (currElement == ELEM_HYPHEN) {
312 // currElement = ELEM_EXCEPTIONS;
313 // } else {
314 // currElement = 0;
315 // }
316 
317 // }
318 
319 // /*
320 // * @see org.xml.sax.ContentHandler#characters(char[], int, int)
321 // */
322 // public override void chars(char ch[], int start, int Length) {
323 // StringBuffer chars = new StringBuffer(Length);
324 // chars.Append(ch, start, Length);
325 // String word = readToken(chars);
326 // while (word != null) {
327 // // Console.WriteLine("\"" + word + "\"");
328 // switch (currElement) {
329 // case ELEM_CLASSES:
330 // consumer.addClass(word);
331 // break;
332 // case ELEM_EXCEPTIONS:
333 // exception.add(word);
334 // exception = normalizeException(exception);
335 // consumer.addException(getExceptionWord(exception),
336 // (ArrayList) exception.clone());
337 // exception.clear();
338 // break;
339 // case ELEM_PATTERNS:
340 // consumer.addPattern(getPattern(word), getInterletterValues(word));
341 // break;
342 // }
343 // word = readToken(chars);
344 // }
345 
346 // }
347 
348 // //
349 // // ErrorHandler methods
350 // //
351 
352 // /*
353 // * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
354 // */
355 // public override void warning(SAXParseException ex) {
356 // errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.GetMessage();
357 // }
358 
359 // /*
360 // * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
361 // */
362 // public override void error(SAXParseException ex) {
363 // errMsg = "[Error] " + getLocationString(ex) + ": " + ex.GetMessage();
364 // }
365 
366 // /*
367 // * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
368 // */
369 // public override void fatalError(SAXParseException ex) throws SAXException {
370 // errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.GetMessage();
371 // throw ex;
372 // }
373 
374 // /*
375 // * Returns a string of the location.
376 // */
377 // private String getLocationString(SAXParseException ex) {
378 // StringBuilder str = new StringBuilder();
379 
380 // String systemId = ex.GetSystemId();
381 // if (systemId != null) {
382 // int index = systemId.lastIndexOf('/');
383 // if (index != -1) {
384 // systemId = systemId.substring(index + 1);
385 // }
386 // str.Append(systemId);
387 // }
388 // str.Append(':');
389 // str.Append(ex.GetLineNumber());
390 // str.Append(':');
391 // str.Append(ex.GetColumnNumber());
392 
393 // return str.ToString();
394 
395 // } // getLocationString(SAXParseException):String
396 
397 // // PatternConsumer implementation for testing purposes
398 // public void addClass(String c) {
399 // Console.WriteLine("class: " + c);
400 // }
401 
402 // public void addException(String w, ArrayList e) {
403 // Console.WriteLine("exception: " + w + " : " + e.ToString());
404 // }
405 
406 // public void addPattern(String p, String v) {
407 // Console.WriteLine("pattern: " + p + " : " + v);
408 // }
409 
410 // public static void main(String[] args)
411 // {
412 // if (args.Length > 0) {
413 // PatternParser pp = new PatternParser();
414 // pp.SetConsumer(pp);
415 // pp.parse(args[0]);
416 // }
417 // }
418 //}
419 
420 //class HyphenationDTDGenerator {
421 // public static readonly String DTD_STRING=
422 // "<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n"+
423 // "<!--\n"+
424 // " Copyright 1999-2004 The Apache Software Foundation\n"+
425 // "\n"+
426 // " Licensed under the Apache License, Version 2.0 (the \"License\");\n"+
427 // " you may not use this file except in compliance with the License.\n"+
428 // " You may obtain a copy of the License at\n"+
429 // "\n"+
430 // " http://www.apache.org/licenses/LICENSE-2.0\n"+
431 // "\n"+
432 // " Unless required by applicable law or agreed to in writing, software\n"+
433 // " distributed under the License is distributed on an \"AS IS\" BASIS,\n"+
434 // " WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"+
435 // " See the License for the specific language governing permissions and\n"+
436 // " limitations under the License.\n"+
437 // "-->\n"+
438 // "<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n"+
439 // "\n"+
440 // "<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n"+
441 // " classes, exceptions?, patterns)>\n"+
442 // "\n"+
443 // "<!-- Hyphen char to be used in the exception list as shortcut for\n"+
444 // " <hyphen pre-break=\"-\"/>. Defaults to '-'\n"+
445 // "-->\n"+
446 // "<!ELEMENT hyphen-char EMPTY>\n"+
447 // "<!ATTLIST hyphen-char value CDATA #REQUIRED>\n"+
448 // "\n"+
449 // "<!-- Default minimum length in chars of hyphenated word fragments\n"+
450 // " before and after the line break. For some languages this is not\n"+
451 // " only for aesthetic purposes, wrong hyphens may be generated if this\n"+
452 // " is not accounted for.\n"+
453 // "-->\n"+
454 // "<!ELEMENT hyphen-min EMPTY>\n"+
455 // "<!ATTLIST hyphen-min before CDATA #REQUIRED>\n"+
456 // "<!ATTLIST hyphen-min after CDATA #REQUIRED>\n"+
457 // "\n"+
458 // "<!-- char equivalent classes: space separated list of char groups, all\n"+
459 // " chars in a group are to be treated equivalent as far as\n"+
460 // " the hyphenation algorithm is concerned. The first char in a group\n"+
461 // " is the group's equivalent char. Patterns should only contain\n"+
462 // " first chars. It also defines word chars, i.e. a word that\n"+
463 // " contains chars not present in any of the classes is not hyphenated.\n"+
464 // "-->\n"+
465 // "<!ELEMENT classes (#PCDATA)>\n"+
466 // "\n"+
467 // "<!-- Hyphenation exceptions: space separated list of hyphenated words.\n"+
468 // " A hyphen is indicated by the hyphen tag, but you can use the\n"+
469 // " hyphen-char defined previously as shortcut. This is in cases\n"+
470 // " when the algorithm procedure finds wrong hyphens or you want\n"+
471 // " to provide your own hyphenation for some words.\n"+
472 // "-->\n"+
473 // "<!ELEMENT exceptions (#PCDATA|hyphen)* >\n"+
474 // "\n"+
475 // "<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n"+
476 // " chars as described before, between any two word chars a digit\n"+
477 // " in the range 0 to 9 may be specified. The absence of a digit is equivalent\n"+
478 // " to zero. The '.' char is reserved to indicate beginning or ending\n"+
479 // " of words. -->\n"+
480 // "<!ELEMENT patterns (#PCDATA)>\n"+
481 // "\n"+
482 // "<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n"+
483 // " with pre-break, post-break and no-break attributes.\n"+
484 // " To be used in the exceptions list, the hyphen char is not\n"+
485 // " automatically added -->\n"+
486 // "<!ELEMENT hyphen EMPTY>\n"+
487 // "<!ATTLIST hyphen pre CDATA #IMPLIED>\n"+
488 // "<!ATTLIST hyphen no CDATA #IMPLIED>\n"+
489 // "<!ATTLIST hyphen post CDATA #IMPLIED>\n";
490 
491 // public static InputSource generateDTD() {
492 // return new InputSource(new StringReader(DTD_STRING));
493 // }
494 //}
495 //}