Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
HunspellDictionary.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Globalization;
21 using System.IO;
22 using System.Text;
23 using System.Text.RegularExpressions;
24 
25 namespace Lucene.Net.Analysis.Hunspell {
26  public class HunspellDictionary {
// Shared word form assigned to dictionary entries that carry no flags.
private static readonly HunspellWord NoFlags = new HunspellWord();

// Directive keywords matched against the start of each affix-file line.
private const String PREFIX_KEY = "PFX";
private const String SUFFIX_KEY = "SFX";
private const String FLAG_KEY = "FLAG";
private const String AF_KEY = "AF";

// Values accepted after the FLAG directive.
private const String NUM_FLAG_TYPE = "num";
private const String UTF8_FLAG_TYPE = "UTF-8";
private const String LONG_FLAG_TYPE = "long";

// Format strings that anchor an affix condition at the start (prefix) or end (suffix) of a word.
private const String PREFIX_CONDITION_REGEX_PATTERN = @"^{0}";
private const String SUFFIX_CONDITION_REGEX_PATTERN = @"{0}$";

// Affix rules keyed by the text they append.
private readonly Dictionary<String, List<HunspellAffix>> _prefixes = new Dictionary<String, List<HunspellAffix>>();
private readonly Dictionary<String, List<HunspellAffix>> _suffixes = new Dictionary<String, List<HunspellAffix>>();

// Dictionary entries keyed by the word text; one word may have several forms.
private readonly Dictionary<String, List<HunspellWord>> _words = new Dictionary<String, List<HunspellWord>>();

// Flag sets declared through AF directives, keyed by their 1-based position rendered as a string.
private readonly Dictionary<String, Char[]> _aliases = new Dictionary<String, Char[]>();

// Strategy used until (and unless) a FLAG directive replaces it.
private FlagParsingStrategy _flagParsingStrategy = new SimpleFlagParsingStrategy();
46 
/// <summary>
/// Creates a new HunspellDictionary from one affix stream and a single dictionary stream.
/// The work is delegated to the constructor that accepts several dictionary streams.
/// </summary>
/// <param name="affix">Stream for reading the hunspell affix file.</param>
/// <param name="dictionary">Stream for reading the hunspell dictionary file.</param>
/// <exception cref="IOException">Can be thrown while reading from the streams.</exception>
/// <exception cref="InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
public HunspellDictionary(Stream affix, Stream dictionary)
    : this(affix, new Stream[] { dictionary }) {
}
57 
/// <summary>
/// Creates a new HunspellDictionary from one affix stream and any number of dictionary streams.
/// The encoding named by the affix file is used to decode the affix file and every dictionary.
/// </summary>
/// <param name="affix">Stream for reading the hunspell affix file.</param>
/// <param name="dictionaries">Streams for reading the hunspell dictionary files.</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="affix"/> or <paramref name="dictionaries"/> is <c>null</c>.</exception>
/// <exception cref="IOException">Can be thrown while reading from the streams.</exception>
/// <exception cref="InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
public HunspellDictionary(Stream affix, IEnumerable<Stream> dictionaries) {
    if (affix == null) {
        throw new ArgumentNullException("affix");
    }
    if (dictionaries == null) {
        throw new ArgumentNullException("dictionaries");
    }

    // The encoding name is read from the raw stream first; the same stream is then
    // handed on, so affix parsing continues after the line that named the encoding.
    var encoding = Encoding.GetEncoding(ReadDictionaryEncoding(affix));

    ReadAffixFile(affix, encoding);
    foreach (var dictionaryStream in dictionaries) {
        ReadDictionaryFile(dictionaryStream, encoding);
    }
}
76 
/// <summary>
/// Looks up the HunspellWords registered for the given word.
/// </summary>
/// <param name="word">Word to look up, exactly as it appears in the dictionary.</param>
/// <returns>The word forms stored for <paramref name="word"/>, or <c>null</c> if the word is unknown.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellWord> LookupWord(String word) {
    if (word == null) {
        throw new ArgumentNullException("word");
    }

    List<HunspellWord> matches;
    return _words.TryGetValue(word, out matches) ? matches : null;
}
89 
/// <summary>
/// Looks up the prefix rules whose appended text equals the given slice of a char array.
/// </summary>
/// <param name="word">Char array containing the text to look up.</param>
/// <param name="offset">Index in <paramref name="word"/> at which the text starts.</param>
/// <param name="length">Number of chars, counted from <paramref name="offset"/>, that make up the text.</param>
/// <returns>The prefix rules registered for that text, or <c>null</c> if none are found.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellAffix> LookupPrefix(char[] word, int offset, int length) {
    if (word == null) {
        throw new ArgumentNullException("word");
    }

    // TryGetValue leaves the out value at null when the key is absent.
    List<HunspellAffix> found;
    _prefixes.TryGetValue(new String(word, offset, length), out found);
    return found;
}
107 
/// <summary>
/// Looks up the suffix rules whose appended text equals the given slice of a char array.
/// </summary>
/// <param name="word">Char array containing the text to look up.</param>
/// <param name="offset">Index in <paramref name="word"/> at which the text starts.</param>
/// <param name="length">Number of chars, counted from <paramref name="offset"/>, that make up the text.</param>
/// <returns>The suffix rules registered for that text, or <c>null</c> if none are found.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellAffix> LookupSuffix(char[] word, int offset, int length) {
    if (word == null) {
        throw new ArgumentNullException("word");
    }

    var appendText = new String(word, offset, length);

    List<HunspellAffix> rules;
    var known = _suffixes.TryGetValue(appendText, out rules);
    return known ? rules : null;
}
125 
/// <summary>
/// Reads the affix file through the provided Stream, building up the prefix and suffix maps,
/// the alias table and the flag parsing strategy.
/// </summary>
/// <remarks>
/// The StreamReader created here is disposed when reading finishes, which also closes
/// <paramref name="affixStream"/>.
/// </remarks>
/// <param name="affixStream">Stream to read the content of the affix file from.</param>
/// <param name="encoding">Encoding to decode the content of the file.</param>
/// <exception cref="ArgumentNullException">Thrown if either argument is <c>null</c>.</exception>
/// <exception cref="IOException">Can be thrown while reading from the Stream.</exception>
private void ReadAffixFile(Stream affixStream, Encoding encoding) {
    if (affixStream == null) {
        throw new ArgumentNullException("affixStream");
    }
    if (encoding == null) {
        throw new ArgumentNullException("encoding");
    }

    using (var reader = new StreamReader(affixStream, encoding)) {
        String line;
        while ((line = reader.ReadLine()) != null) {
            // Directive keywords are fixed tokens, so they are matched ordinally;
            // the culture-sensitive overload depends on the current thread culture.
            if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal)) {
                ParseAffix(_prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
            } else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal)) {
                ParseAffix(_suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
            } else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal)) {
                // Assume that the FLAG line comes before any prefix or suffixes.
                // Store the strategy so it can be used when parsing the dic file.
                _flagParsingStrategy = GetFlagParsingStrategy(line);
            } else if (line.StartsWith(AF_KEY, StringComparison.Ordinal)) {
                // Alias block: the header line is followed by the alias definitions.
                ParseAliasFlag(line, reader);
            }
        }
    }
}
154 
/// <summary>
/// Parses a block of AF (alias flag) directives and stores each flag set in the alias table.
/// Aliases are keyed by their 1-based position within the block, rendered as a string.
/// </summary>
/// <param name="line">Header line of the block; its second field is the number of alias lines that follow.</param>
/// <param name="reader">TextReader positioned just after the header line.</param>
/// <exception cref="ArgumentNullException">Thrown if either argument is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the header has no valid count, the file ends before all aliases were read,
/// or a following line is not an AF directive with a flag field.
/// </exception>
private void ParseAliasFlag(String line, TextReader reader) {
    if (line == null) {
        throw new ArgumentNullException("line");
    }
    if (reader == null) {
        throw new ArgumentNullException("reader");
    }

    var args = Regex.Split(line, "\\s+");
    int numLines;
    if (args.Length < 2 ||
        !Int32.TryParse(args[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out numLines)) {
        throw new InvalidDataException("Invalid AF header, expected 'AF <count>' : " + line);
    }

    for (var i = 0; i < numLines; i++) {
        var ruleLine = reader.ReadLine();
        if (ruleLine == null) {
            // ReadLine returns null at end of input; fail with a format error rather than a null dereference.
            throw new InvalidDataException("Unexpected end of affix file while reading AF directives.");
        }

        var ruleArgs = Regex.Split(ruleLine, "\\s+");
        if (ruleArgs.Length < 2 || ruleArgs[0] != "AF") {
            throw new InvalidDataException("File corrupted, should be AF directive : " + ruleLine);
        }

        var appendFlags = _flagParsingStrategy.ParseFlags(ruleArgs[1]);
        _aliases.Add((i + 1).ToString(CultureInfo.InvariantCulture), appendFlags);
    }
}
176 
/// <summary>
/// Parses one affix rule group (a header line followed by its rule lines) and adds every
/// rule to the provided affix map, keyed by the text the rule appends.
/// </summary>
/// <param name="affixes">Map where the result of the parsing will be put.</param>
/// <param name="header">Header line of the affix rule; field 3 is the cross-product marker and field 4 the rule count.</param>
/// <param name="reader">TextReader to read the rule lines from.</param>
/// <param name="conditionPattern">Format string used to turn a rule condition into a regex pattern.</param>
/// <exception cref="ArgumentNullException">Thrown if any argument is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the header or a rule line has too few fields, the rule count is not a number,
/// the file ends before all rules were read, or a rule refers to an undefined flag alias.
/// </exception>
private void ParseAffix(Dictionary<String, List<HunspellAffix>> affixes, String header, TextReader reader, String conditionPattern) {
    if (affixes == null) {
        throw new ArgumentNullException("affixes");
    }
    if (header == null) {
        throw new ArgumentNullException("header");
    }
    if (reader == null) {
        throw new ArgumentNullException("reader");
    }
    if (conditionPattern == null) {
        throw new ArgumentNullException("conditionPattern");
    }

    var args = Regex.Split(header, "\\s+");
    int numLines;
    if (args.Length < 4 ||
        !Int32.TryParse(args[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out numLines)) {
        throw new InvalidDataException("Invalid affix header, expected '<PFX|SFX> <flag> <Y|N> <count>' : " + header);
    }
    var crossProduct = args[2].Equals("Y");

    var hasAliases = _aliases.Count > 0;
    for (var i = 0; i < numLines; i++) {
        var line = reader.ReadLine();
        if (line == null) {
            throw new InvalidDataException("Unexpected end of affix file while reading the rules of : " + header);
        }

        var ruleArgs = Regex.Split(line, "\\s+");
        if (ruleArgs.Length < 4) {
            throw new InvalidDataException("Invalid affix rule, expected at least 4 fields : " + line);
        }

        var affix = new HunspellAffix();

        affix.Flag = _flagParsingStrategy.ParseFlag(ruleArgs[1]);
        // "0" in the strip field means nothing is stripped.
        affix.Strip = (ruleArgs[2] == "0") ? "" : ruleArgs[2];

        var affixArg = ruleArgs[3];

        // Text after the last '/' holds continuation flags (or an alias for them).
        var flagSep = affixArg.LastIndexOf('/');
        if (flagSep != -1) {
            var cflag = affixArg.Substring(flagSep + 1);

            Char[] appendFlags;
            if (hasAliases) {
                if (!_aliases.TryGetValue(cflag, out appendFlags)) {
                    throw new InvalidDataException("Unknown flag alias '" + cflag + "' in affix rule : " + line);
                }
            } else {
                appendFlags = _flagParsingStrategy.ParseFlags(cflag);
            }

            // Sorts in place; with aliases this is the very array stored in the alias table.
            Array.Sort(appendFlags);
            affix.AppendFlags = appendFlags;
            affix.Append = affixArg.Substring(0, flagSep);
        } else {
            affix.Append = affixArg;
        }

        // The condition field is optional in the affix format; "." matches any character.
        var condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
        affix.SetCondition(condition, String.Format(conditionPattern, condition));
        affix.IsCrossProduct = crossProduct;

        List<HunspellAffix> list;
        if (!affixes.TryGetValue(affix.Append, out list)) {
            list = new List<HunspellAffix>();
            affixes.Add(affix.Append, list);
        }

        list.Add(affix);
    }
}
228 
/// <summary>
/// Reads the encoding name from the affix file readable through the provided Stream.
/// The stream is consumed byte by byte up to and including the line that names the encoding,
/// because no text decoder can be chosen before that name is known.
/// </summary>
/// <param name="affix">Stream for reading the affix file.</param>
/// <returns>Encoding name specified in the affix file, with surrounding whitespace removed.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="affix"/> is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the file ends before a directive is found, or if the first non-empty
/// non-comment line does not adhere to the format <c>SET encoding</c>.
/// </exception>
private static String ReadDictionaryEncoding(Stream affix) {
    if (affix == null) {
        throw new ArgumentNullException("affix");
    }

    const String setDirective = "SET ";

    var builder = new StringBuilder();
    var isFirstLine = true;
    for (; ; ) {
        builder.Length = 0;
        int ch;
        while ((ch = affix.ReadByte()) >= 0) {
            if (ch == '\n') {
                break;
            }
            if (ch != '\r') {
                builder.Append((char)ch);
            }
        }

        if (isFirstLine) {
            isFirstLine = false;
            // A UTF-8 byte order mark shows up here as three raw bytes in front of the
            // first line; drop it so it cannot hide a leading '#' or the SET directive.
            if (builder.Length >= 3 &&
                builder[0] == '\u00EF' && builder[1] == '\u00BB' && builder[2] == '\u00BF') {
                builder.Remove(0, 3);
            }
        }

        var line = builder.ToString();

        // Skip empty lines, comments and lines consisting only of whitespace.
        if (line.Length == 0 || line[0] == '#' || line.Trim().Length == 0) {
            if (ch < 0) {
                throw new InvalidDataException("Unexpected end of affix file.");
            }

            continue;
        }

        // An ordinal prefix test is safe for lines shorter than the directive, so those
        // are reported through the format error below rather than an out-of-range error.
        if (line.StartsWith(setDirective, StringComparison.Ordinal)) {
            // cleanup the encoding string, too (whitespace)
            return line.Substring(setDirective.Length).Trim();
        }

        throw new InvalidDataException("The first non-comment line in the affix file must " +
                                       "be a 'SET charset', was: '" + line + "'");
    }
}
274 
/// <summary>
/// Determines the appropriate <see cref="FlagParsingStrategy"/> based on the FLAG definition line taken from the affix file.
/// </summary>
/// <param name="flagLine">Line containing the flag information, starting with the FLAG keyword.</param>
/// <returns>FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="flagLine"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">Thrown if the flag type is missing or not one of the known types.</exception>
private static FlagParsingStrategy GetFlagParsingStrategy(String flagLine) {
    if (flagLine == null) {
        throw new ArgumentNullException("flagLine");
    }

    // Take whatever follows the keyword and trim it, so that a bare "FLAG" line, repeated
    // separators or trailing whitespace are reported or handled instead of failing on an index.
    var flagType = flagLine.Length > FLAG_KEY.Length
        ? flagLine.Substring(FLAG_KEY.Length).Trim()
        : String.Empty;

    if (NUM_FLAG_TYPE.Equals(flagType)) {
        return new NumFlagParsingStrategy();
    }

    if (UTF8_FLAG_TYPE.Equals(flagType)) {
        return new SimpleFlagParsingStrategy();
    }

    if (LONG_FLAG_TYPE.Equals(flagType)) {
        return new DoubleASCIIFlagParsingStrategy();
    }

    throw new ArgumentException("Unknown flag type: " + flagType);
}
295 
/// <summary>
/// Reads the dictionary file through the provided Stream, building up the words map.
/// </summary>
/// <remarks>
/// The first line must hold the number of entries; it is validated but not otherwise used.
/// Every following non-empty line is a word, optionally followed by <c>/flags</c>.
/// </remarks>
/// <param name="dictionary">Stream to read the dictionary file through.</param>
/// <param name="encoding">Encoding used to decode the contents of the file.</param>
/// <exception cref="ArgumentNullException">Thrown if either argument is <c>null</c>.</exception>
/// <exception cref="IOException">Can be thrown while reading from the file.</exception>
/// <exception cref="InvalidDataException">
/// Thrown if the first line is missing or not a number, or if an entry refers to an undefined flag alias.
/// </exception>
private void ReadDictionaryFile(Stream dictionary, Encoding encoding) {
    if (dictionary == null) {
        throw new ArgumentNullException("dictionary");
    }
    if (encoding == null) {
        throw new ArgumentNullException("encoding");
    }

    // The reader is not disposed: disposing it would also close the caller-supplied stream.
    var reader = new StreamReader(dictionary, encoding);

    // First line is the number of entries.
    var line = reader.ReadLine();
    int numEntries;
    if (line == null ||
        !Int32.TryParse(line, NumberStyles.Integer, CultureInfo.InvariantCulture, out numEntries)) {
        throw new InvalidDataException("The first line of the dictionary file must be the number of entries, was: '" + line + "'");
    }

    var hasAliases = _aliases.Count > 0;

    // The flags themselves can be double-chars (long) or numeric; either way they are
    // encoded as chars by the active flag parsing strategy.
    while ((line = reader.ReadLine()) != null) {
        if (line.Length == 0) {
            // A blank line is not an entry; without this check the empty string would become a known word.
            continue;
        }

        String entry;
        HunspellWord wordForm;

        var flagSep = line.LastIndexOf('/');
        if (flagSep == -1) {
            wordForm = NoFlags;
            entry = line;
        } else {
            // Note: there can be comments (morph description) after a flag;
            // only a tab is recognised as their separator here.
            var end = line.IndexOf('\t', flagSep);
            var cflag = end == -1
                ? line.Substring(flagSep + 1)
                : line.Substring(flagSep + 1, end - flagSep - 1);

            Char[] flags;
            if (hasAliases) {
                if (!_aliases.TryGetValue(cflag, out flags)) {
                    throw new InvalidDataException("Unknown flag alias '" + cflag + "' in dictionary entry: " + line);
                }
            } else {
                flags = _flagParsingStrategy.ParseFlags(cflag);
            }

            wordForm = new HunspellWord(flags);
            entry = line.Substring(0, flagSep);
        }

        List<HunspellWord> entries;
        if (!_words.TryGetValue(entry, out entries)) {
            entries = new List<HunspellWord>();
            _words.Add(entry, entries);
        }

        entries.Add(wordForm);
    }
}
340 
341  #region Nested type: DoubleASCIIFlagParsingStrategy
342 
/// <summary>
/// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as
/// two ASCII characters whose codes are combined into a single character.
/// </summary>
/// <remarks>
/// The first character fills the high byte and the second the low byte of the result, so
/// two different character pairs never produce the same flag (adding the codes would make
/// "ab" and "ba" indistinguishable).
/// </remarks>
private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    /// Parses a run of two-character flags.
    /// </summary>
    /// <param name="rawFlags">Concatenated two-character flags; may be empty.</param>
    /// <returns>One combined char per pair, in input order.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown if the input has an odd number of characters or contains a character above 0xFF.
    /// </exception>
    public override Char[] ParseFlags(String rawFlags) {
        if (rawFlags.Length == 0) {
            return new Char[0];
        }

        if (rawFlags.Length % 2 != 0) {
            throw new ArgumentException("Invalid flags (should be an even number of characters): " + rawFlags);
        }

        var flags = new Char[rawFlags.Length / 2];
        for (var i = 0; i < flags.Length; i++) {
            var high = rawFlags[2 * i];
            var low = rawFlags[2 * i + 1];

            // Each half must fit in one byte, otherwise the combined value would not fit in a char.
            if (high > 0xFF || low > 0xFF) {
                throw new ArgumentException("Invalid flags (long flags must be two single-byte characters): " + rawFlags);
            }

            flags[i] = (Char)((high << 8) | low);
        }

        return flags;
    }
}
361 
362  #endregion
363 
364  #region Nested type: FlagParsingStrategy
/// <summary>
/// Base type for the different ways flags can be written in the affix and dic files.
/// </summary>
private abstract class FlagParsingStrategy {
    /// <summary>
    /// Parses the given String into all the flags it contains.
    /// </summary>
    /// <param name="rawFlags">String to parse into flags.</param>
    /// <returns>Parsed flags.</returns>
    public abstract Char[] ParseFlags(String rawFlags);

    /// <summary>
    /// Parses the given String and returns the first flag found in it.
    /// </summary>
    /// <param name="rawFlag">String to parse into a flag.</param>
    /// <returns>First flag produced by <see cref="ParseFlags"/> for the input.</returns>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="rawFlag"/> is <c>null</c>.</exception>
    public Char ParseFlag(String rawFlag) {
        if (rawFlag == null) {
            throw new ArgumentNullException("rawFlag");
        }

        var parsed = ParseFlags(rawFlag);
        return parsed[0];
    }
}
388 
389  #endregion
390 
391  #region Nested type: NumFlagParsingStrategy
392 
/// <summary>
/// Implementation of <see cref="FlagParsingStrategy"/> for flags written as numbers.
/// Several flags are written as a comma separated list.
/// </summary>
private class NumFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    /// Parses a comma separated list of numeric flags.
    /// </summary>
    /// <param name="rawFlags">Comma separated numbers; surrounding whitespace is ignored.</param>
    /// <returns>One char per list element, holding the element's numeric value.</returns>
    public override Char[] ParseFlags(String rawFlags) {
        var parts = rawFlags.Trim().Split(',');
        var parsed = new Char[parts.Length];

        var index = 0;
        foreach (var part in parts) {
            // Everything that is not a digit is dropped before the number is read
            // (e.g. a trailing X or leading I, as seen in the Nepali dictionary).
            var digitsOnly = Regex.Replace(part, "[^0-9]", "");
            parsed[index] = (Char)Int32.Parse(digitsOnly);
            index++;
        }

        return parsed;
    }
}
411 
412  #endregion
413 
414  #region Nested type: SimpleFlagParsingStrategy
415 
/// <summary>
/// Implementation of <see cref="FlagParsingStrategy"/> in which every char of the input
/// is one flag. Can be used with both the ASCII and UTF-8 flag types.
/// </summary>
private class SimpleFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    /// Splits the input into its individual chars.
    /// </summary>
    /// <param name="rawFlags">String whose chars are the flags.</param>
    /// <returns>A new array holding the chars of <paramref name="rawFlags"/> in order.</returns>
    public override Char[] ParseFlags(String rawFlags) {
        var flags = rawFlags.ToCharArray();
        return flags;
    }
}
425 
426  #endregion
427  }
428 }