Lucene.Net
3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
Main Page
Packages
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Properties
contrib
Analyzers
Compound
Hyphenation
PatternParser.cs
Go to the documentation of this file.
1
//using System;
2
//using System.Collections;
3
//using System.Collections.Generic;
4
//using System.IO;
5
//using System.Linq;
6
//using System.Text;
7
//using Lucene.Net.Analysis.Compound.Hyphenation;
8
9
//namespace Lucene.Net.Analyzers.Compound.Hyphenation
10
//{
11
// /*
12
// * A SAX document handler to read and parse hyphenation patterns from an XML
13
// * file.
14
// *
15
// * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
16
// */
17
//public class PatternParser : DefaultHandler, PatternConsumer {
18
19
// XMLReader parser;
20
21
// int currElement;
22
23
// PatternConsumer consumer;
24
25
// StringBuilder token;
26
27
// ArrayList exception;
28
29
// char hyphenChar;
30
31
// String errMsg;
32
33
// static readonly int ELEM_CLASSES = 1;
34
35
// static readonly int ELEM_EXCEPTIONS = 2;
36
37
// static readonly int ELEM_PATTERNS = 3;
38
39
// static readonly int ELEM_HYPHEN = 4;
40
41
// public PatternParser()
42
// {
43
// token = new StringBuilder();
44
// parser = CreateParser();
45
// parser.SetContentHandler(this);
46
// parser.SetErrorHandler(this);
47
// parser.SetEntityResolver(this);
48
// hyphenChar = '-'; // default
49
50
// }
51
52
// public PatternParser(PatternConsumer consumer)
53
// : this()
54
// {
55
// this.consumer = consumer;
56
// }
57
58
// public void setConsumer(PatternConsumer consumer) {
59
// this.consumer = consumer;
60
// }
61
62
// /*
63
// * Parses a hyphenation pattern file.
64
// *
65
// * @param filename the filename
66
// * @throws HyphenationException In case of an exception while parsing
67
// */
68
// public void parse(String filename)
69
// {
70
// parse(new FileInfo(filename));
71
// }
72
73
// /*
74
// * Parses a hyphenation pattern file.
75
// *
76
// * @param file the pattern file
77
// * @throws HyphenationException In case of an exception while parsing
78
// */
79
// public void parse(FileInfo file)
80
// {
81
// try {
82
// InputSource src = new InputSource(file.toURL().toExternalForm());
83
// parse(src);
84
// } catch (MalformedURLException e) {
85
// throw new HyphenationException("Error converting the File '" + file
86
// + "' to a URL: " + e.GetMessage());
87
// }
88
// }
89
90
// /*
91
// * Parses a hyphenation pattern file.
92
// *
93
// * @param source the InputSource for the file
94
// * @throws HyphenationException In case of an exception while parsing
95
// */
96
// public void parse(InputSource source)
97
// {
98
// try {
99
// parser.parse(source);
100
// } catch (FileNotFoundException fnfe) {
101
// throw new HyphenationException("File not found: " + fnfe.GetMessage());
102
// } catch (IOException ioe) {
103
// throw new HyphenationException(ioe.GetMessage());
104
// } catch (SAXException e) {
105
// throw new HyphenationException(errMsg);
106
// }
107
// }
108
109
// /*
110
// * Creates a SAX parser using JAXP
111
// *
112
// * @return the created SAX parser
113
// */
114
// static XMLReader createParser() {
115
// try {
116
// SAXParserFactory factory = SAXParserFactory.newInstance();
117
// factory.SetNamespaceAware(true);
118
// return factory.newSAXParser().GetXMLReader();
119
// } catch (Exception e) {
120
// throw new RuntimeException("Couldn't create XMLReader: " + e.GetMessage());
121
// }
122
// }
123
124
// protected String readToken(StringBuffer chars) {
125
// String word;
126
// bool space = false;
127
// int i;
128
// for (i = 0; i < chars.Length(); i++) {
129
// if (char.isWhitespace(chars.charAt(i))) {
130
// space = true;
131
// } else {
132
// break;
133
// }
134
// }
135
// if (space) {
136
// // chars.delete(0,i);
137
// for (int countr = i; countr < chars.Length(); countr++) {
138
// chars.SetCharAt(countr - i, chars.charAt(countr));
139
// }
140
// chars.SetLength(chars.Length() - i);
141
// if (token.Length() > 0) {
142
// word = token.ToString();
143
// token.SetLength(0);
144
// return word;
145
// }
146
// }
147
// space = false;
148
// for (i = 0; i < chars.Length(); i++) {
149
// if (char.isWhitespace(chars.charAt(i))) {
150
// space = true;
151
// break;
152
// }
153
// }
154
// token.Append(chars.ToString().substring(0, i));
155
// // chars.delete(0,i);
156
// for (int countr = i; countr < chars.Length(); countr++) {
157
// chars.SetCharAt(countr - i, chars.charAt(countr));
158
// }
159
// chars.SetLength(chars.Length() - i);
160
// if (space) {
161
// word = token.ToString();
162
// token.SetLength(0);
163
// return word;
164
// }
165
// token.Append(chars);
166
// return null;
167
// }
168
169
// protected static String getPattern(String word) {
170
// StringBuilder pat = new StringBuilder();
171
// int len = word.Length();
172
// for (int i = 0; i < len; i++) {
173
// if (!char.isDigit(word.charAt(i))) {
174
// pat.Append(word.charAt(i));
175
// }
176
// }
177
// return pat.ToString();
178
// }
179
180
// protected ArrayList normalizeException(ArrayList ex) {
181
// ArrayList res = new ArrayList();
182
// for (int i = 0; i < ex.size(); i++) {
183
// Object item = ex.Get(i);
184
// if (item instanceof String) {
185
// String str = (String) item;
186
// StringBuilder buf = new StringBuilder();
187
// for (int j = 0; j < str.Length(); j++) {
188
// char c = str.charAt(j);
189
// if (c != hyphenChar) {
190
// buf.Append(c);
191
// } else {
192
// res.add(buf.ToString());
193
// buf.SetLength(0);
194
// char[] h = new char[1];
195
// h[0] = hyphenChar;
196
// // we use here hyphenChar which is not necessarily
197
// // the one to be printed
198
// res.add(new Hyphen(new String(h), null, null));
199
// }
200
// }
201
// if (buf.Length() > 0) {
202
// res.add(buf.ToString());
203
// }
204
// } else {
205
// res.add(item);
206
// }
207
// }
208
// return res;
209
// }
210
211
// protected String getExceptionWord(ArrayList ex) {
212
// StringBuilder res = new StringBuilder();
213
// for (int i = 0; i < ex.size(); i++) {
214
// Object item = ex.Get(i);
215
// if (item instanceof String) {
216
// res.Append((String) item);
217
// } else {
218
// if (((Hyphen) item).noBreak != null) {
219
// res.Append(((Hyphen) item).noBreak);
220
// }
221
// }
222
// }
223
// return res.ToString();
224
// }
225
226
// protected static String getInterletterValues(String pat) {
227
// StringBuilder il = new StringBuilder();
228
// String word = pat + "a"; // add dummy letter to serve as sentinel
229
// int len = word.Length();
230
// for (int i = 0; i < len; i++) {
231
// char c = word.charAt(i);
232
// if (char.isDigit(c)) {
233
// il.Append(c);
234
// i++;
235
// } else {
236
// il.Append('0');
237
// }
238
// }
239
// return il.ToString();
240
// }
241
242
// //
243
// // EntityResolver methods
244
// //
245
// public override InputSource resolveEntity(String publicId, String systemId) {
246
// return HyphenationDTDGenerator.generateDTD();
247
// }
248
249
// //
250
// // ContentHandler methods
251
// //
252
253
// /*
254
// * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
255
// * java.lang.String, java.lang.String, org.xml.sax.Attributes)
256
// */
257
// public override void startElement(String uri, String local, String raw,
258
// Attributes attrs) {
259
// if (local.equals("hyphen-char")) {
260
// String h = attrs.GetValue("value");
261
// if (h != null && h.Length() == 1) {
262
// hyphenChar = h.charAt(0);
263
// }
264
// } else if (local.equals("classes")) {
265
// currElement = ELEM_CLASSES;
266
// } else if (local.equals("patterns")) {
267
// currElement = ELEM_PATTERNS;
268
// } else if (local.equals("exceptions")) {
269
// currElement = ELEM_EXCEPTIONS;
270
// exception = new ArrayList();
271
// } else if (local.equals("hyphen")) {
272
// if (token.Length() > 0) {
273
// exception.add(token.ToString());
274
// }
275
// exception.add(new Hyphen(attrs.GetValue("pre"), attrs.GetValue("no"),
276
// attrs.GetValue("post")));
277
// currElement = ELEM_HYPHEN;
278
// }
279
// token.SetLength(0);
280
// }
281
282
// /*
283
// * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
284
// * java.lang.String, java.lang.String)
285
// */
286
// public override void endElement(String uri, String local, String raw) {
287
288
// if (token.Length() > 0) {
289
// String word = token.ToString();
290
// switch (currElement) {
291
// case ELEM_CLASSES:
292
// consumer.addClass(word);
293
// break;
294
// case ELEM_EXCEPTIONS:
295
// exception.add(word);
296
// exception = normalizeException(exception);
297
// consumer.addException(getExceptionWord(exception),
298
// (ArrayList) exception.clone());
299
// break;
300
// case ELEM_PATTERNS:
301
// consumer.addPattern(getPattern(word), getInterletterValues(word));
302
// break;
303
// case ELEM_HYPHEN:
304
// // nothing to do
305
// break;
306
// }
307
// if (currElement != ELEM_HYPHEN) {
308
// token.SetLength(0);
309
// }
310
// }
311
// if (currElement == ELEM_HYPHEN) {
312
// currElement = ELEM_EXCEPTIONS;
313
// } else {
314
// currElement = 0;
315
// }
316
317
// }
318
319
// /*
320
// * @see org.xml.sax.ContentHandler#characters(char[], int, int)
321
// */
322
// public override void chars(char ch[], int start, int Length) {
323
// StringBuffer chars = new StringBuffer(Length);
324
// chars.Append(ch, start, Length);
325
// String word = readToken(chars);
326
// while (word != null) {
327
// // Console.WriteLine("\"" + word + "\"");
328
// switch (currElement) {
329
// case ELEM_CLASSES:
330
// consumer.addClass(word);
331
// break;
332
// case ELEM_EXCEPTIONS:
333
// exception.add(word);
334
// exception = normalizeException(exception);
335
// consumer.addException(getExceptionWord(exception),
336
// (ArrayList) exception.clone());
337
// exception.clear();
338
// break;
339
// case ELEM_PATTERNS:
340
// consumer.addPattern(getPattern(word), getInterletterValues(word));
341
// break;
342
// }
343
// word = readToken(chars);
344
// }
345
346
// }
347
348
// //
349
// // ErrorHandler methods
350
// //
351
352
// /*
353
// * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
354
// */
355
// public override void warning(SAXParseException ex) {
356
// errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.GetMessage();
357
// }
358
359
// /*
360
// * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
361
// */
362
// public override void error(SAXParseException ex) {
363
// errMsg = "[Error] " + getLocationString(ex) + ": " + ex.GetMessage();
364
// }
365
366
// /*
367
// * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
368
// */
369
// public override void fatalError(SAXParseException ex) throws SAXException {
370
// errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.GetMessage();
371
// throw ex;
372
// }
373
374
// /*
375
// * Returns a string of the location.
376
// */
377
// private String getLocationString(SAXParseException ex) {
378
// StringBuilder str = new StringBuilder();
379
380
// String systemId = ex.GetSystemId();
381
// if (systemId != null) {
382
// int index = systemId.lastIndexOf('/');
383
// if (index != -1) {
384
// systemId = systemId.substring(index + 1);
385
// }
386
// str.Append(systemId);
387
// }
388
// str.Append(':');
389
// str.Append(ex.GetLineNumber());
390
// str.Append(':');
391
// str.Append(ex.GetColumnNumber());
392
393
// return str.ToString();
394
395
// } // getLocationString(SAXParseException):String
396
397
// // PatternConsumer implementation for testing purposes
398
// public void addClass(String c) {
399
// Console.WriteLine("class: " + c);
400
// }
401
402
// public void addException(String w, ArrayList e) {
403
// Console.WriteLine("exception: " + w + " : " + e.ToString());
404
// }
405
406
// public void addPattern(String p, String v) {
407
// Console.WriteLine("pattern: " + p + " : " + v);
408
// }
409
410
// public static void main(String[] args)
411
// {
412
// if (args.Length > 0) {
413
// PatternParser pp = new PatternParser();
414
// pp.SetConsumer(pp);
415
// pp.parse(args[0]);
416
// }
417
// }
418
//}
419
420
//class HyphenationDTDGenerator {
421
// public static readonly String DTD_STRING=
422
// "<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n"+
423
// "<!--\n"+
424
// " Copyright 1999-2004 The Apache Software Foundation\n"+
425
// "\n"+
426
// " Licensed under the Apache License, Version 2.0 (the \"License\");\n"+
427
// " you may not use this file except in compliance with the License.\n"+
428
// " You may obtain a copy of the License at\n"+
429
// "\n"+
430
// " http://www.apache.org/licenses/LICENSE-2.0\n"+
431
// "\n"+
432
// " Unless required by applicable law or agreed to in writing, software\n"+
433
// " distributed under the License is distributed on an \"AS IS\" BASIS,\n"+
434
// " WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"+
435
// " See the License for the specific language governing permissions and\n"+
436
// " limitations under the License.\n"+
437
// "-->\n"+
438
// "<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n"+
439
// "\n"+
440
// "<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n"+
441
// " classes, exceptions?, patterns)>\n"+
442
// "\n"+
443
// "<!-- Hyphen char to be used in the exception list as shortcut for\n"+
444
// " <hyphen pre-break=\"-\"/>. Defaults to '-'\n"+
445
// "-->\n"+
446
// "<!ELEMENT hyphen-char EMPTY>\n"+
447
// "<!ATTLIST hyphen-char value CDATA #REQUIRED>\n"+
448
// "\n"+
449
// "<!-- Default minimun Length in chars of hyphenated word fragments\n"+
450
// " before and after the line break. For some languages this is not\n"+
451
// " only for aesthetic purposes, wrong hyphens may be generated if this\n"+
452
// " is not accounted for.\n"+
453
// "-->\n"+
454
// "<!ELEMENT hyphen-min EMPTY>\n"+
455
// "<!ATTLIST hyphen-min before CDATA #REQUIRED>\n"+
456
// "<!ATTLIST hyphen-min after CDATA #REQUIRED>\n"+
457
// "\n"+
458
// "<!-- char equivalent classes: space separated list of char groups, all\n"+
459
// " chars in a group are to be treated equivalent as far as\n"+
460
// " the hyphenation algorithm is concerned. The first char in a group\n"+
461
// " is the group's equivalent char. Patterns should only contain\n"+
462
// " first chars. It also defines word chars, i.e. a word that\n"+
463
// " contains chars not present in any of the classes is not hyphenated.\n"+
464
// "-->\n"+
465
// "<!ELEMENT classes (#PCDATA)>\n"+
466
// "\n"+
467
// "<!-- Hyphenation exceptions: space separated list of hyphenated words.\n"+
468
// " A hyphen is indicated by the hyphen tag, but you can use the\n"+
469
// " hyphen-char defined previously as shortcut. This is in cases\n"+
470
// " when the algorithm procedure finds wrong hyphens or you want\n"+
471
// " to provide your own hyphenation for some words.\n"+
472
// "-->\n"+
473
// "<!ELEMENT exceptions (#PCDATA|hyphen)* >\n"+
474
// "\n"+
475
// "<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n"+
476
// " chars as described before, between any two word chars a digit\n"+
477
// " in the range 0 to 9 may be specified. The absence of a digit is equivalent\n"+
478
// " to zero. The '.' char is reserved to indicate begining or ending\n"+
479
// " of words. -->\n"+
480
// "<!ELEMENT patterns (#PCDATA)>\n"+
481
// "\n"+
482
// "<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n"+
483
// " with pre-break, post-break and no-break attributes.\n"+
484
// " To be used in the exceptions list, the hyphen char is not\n"+
485
// " automatically added -->\n"+
486
// "<!ELEMENT hyphen EMPTY>\n"+
487
// "<!ATTLIST hyphen pre CDATA #IMPLIED>\n"+
488
// "<!ATTLIST hyphen no CDATA #IMPLIED>\n"+
489
// "<!ATTLIST hyphen post CDATA #IMPLIED>\n";
490
491
// public static InputSource generateDTD() {
492
// return new InputSource(new StringReader(DTD_STRING));
493
// }
494
//}
495
//}
Generated on Thu Jan 3 2013 02:12:42 for Lucene.Net by
1.8.3