19 using System.Collections.Generic;
20 using System.Globalization;
23 using System.Text.RegularExpressions;
25 namespace Lucene.Net.Analysis.Hunspell {
// Directive keywords that ReadAffixFile matches against the start of each affix-file line.
29 private static readonly String PREFIX_KEY =
"PFX";
30 private static readonly String SUFFIX_KEY =
"SFX";
31 private static readonly String FLAG_KEY =
"FLAG";
32 private static readonly String AF_KEY =
"AF";
// Values of the FLAG directive that GetFlagParsingStrategy recognises.
34 private static readonly String NUM_FLAG_TYPE =
"num";
35 private static readonly String UTF8_FLAG_TYPE =
"UTF-8";
36 private static readonly String LONG_FLAG_TYPE =
"long";
// String.Format templates that ParseAffix applies to a rule's condition:
// '^' anchors a prefix condition at the start, '$' anchors a suffix condition at the end.
38 private static readonly String PREFIX_CONDITION_REGEX_PATTERN =
@"^{0}";
39 private static readonly String SUFFIX_CONDITION_REGEX_PATTERN =
@"{0}$";
// Prefix and suffix rules, keyed by the affix text itself (HunspellAffix.Append).
41 private readonly Dictionary<String, List<HunspellAffix>> _prefixes =
new Dictionary<String, List<HunspellAffix>>();
42 private readonly Dictionary<String, List<HunspellAffix>> _suffixes =
new Dictionary<String, List<HunspellAffix>>();
// Dictionary entries, keyed by the entry text; each key maps to a list of word forms.
43 private readonly Dictionary<String, List<HunspellWord>> _words =
new Dictionary<String, List<HunspellWord>>();
// Flag aliases read from AF directives, keyed by the 1-based alias number
// rendered as an invariant-culture string ("1", "2", ...).
44 private readonly Dictionary<String, Char[]> _aliases =
new Dictionary<String, Char[]>();
// Strategy used to decode flag strings. Starts as the one-character-per-flag
// strategy and is replaced when ReadAffixFile encounters a FLAG directive.
45 private FlagParsingStrategy _flagParsingStrategy =
new SimpleFlagParsingStrategy();
// Single-dictionary convenience overload: wraps the one dictionary in an array
// and delegates to the overload that accepts several dictionaries.
55 : this(affix, new[] { dictionary }) {
// Reject missing inputs before any stream is touched.
66 if (affix == null)
throw new ArgumentNullException(
"affix");
67 if (dictionaries == null)
throw new ArgumentNullException(
"dictionaries");
// The affix file names its own charset; resolve it first so that the affix
// file and every dictionary are decoded with the same encoding.
69 var encodingName = ReadDictionaryEncoding(affix);
70 var encoding = Encoding.GetEncoding(encodingName);
// NOTE(review): ReadDictionaryEncoding has already consumed bytes from this
// stream, so ReadAffixFile presumably resumes from that position rather than
// from the start of the file — confirm.
72 ReadAffixFile(affix, encoding);
// Every dictionary stream is merged into the same word table.
73 foreach (var dictionary
in dictionaries)
74 ReadDictionaryFile(dictionary, encoding);
/// <summary>
/// Looks up the word forms stored in the word table under the exact text <paramref name="word"/>.
/// </summary>
/// <param name="word">Entry text to look up; must not be null.</param>
/// <returns>Presumably the stored list on a hit — TODO confirm the result for an unknown word.</returns>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
80 public IEnumerable<HunspellWord>
LookupWord(String word) {
81 if (word == null)
throw new ArgumentNullException(
"word");
// One hash lookup; TryGetValue avoids a second probe of the table.
83 List<HunspellWord> list;
84 if (_words.TryGetValue(word, out list))
/// <summary>
/// Looks up the prefix rules whose affix text equals the given slice of <paramref name="word"/>.
/// </summary>
/// <param name="word">Character buffer containing the candidate prefix; must not be null.</param>
/// <param name="offset">Index in <paramref name="word"/> where the candidate prefix starts.</param>
/// <param name="length">Number of characters in the candidate prefix.</param>
/// <returns>Presumably the stored rule list on a hit — TODO confirm the result when no rule matches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Raised by the String constructor when <paramref name="offset"/>/<paramref name="length"/> fall outside the buffer.
/// </exception>
97 public IEnumerable<HunspellAffix>
LookupPrefix(
char[] word,
int offset,
int length) {
98 if (word == null)
throw new ArgumentNullException(
"word");
// The prefix table is keyed by String, so the slice is materialised as a new string for the lookup.
99 var key =
new String(word, offset, length);
101 List<HunspellAffix> list;
102 if (_prefixes.TryGetValue(key, out list))
/// <summary>
/// Looks up the suffix rules whose affix text equals the given slice of <paramref name="word"/>.
/// </summary>
/// <param name="word">Character buffer containing the candidate suffix; must not be null.</param>
/// <param name="offset">Index in <paramref name="word"/> where the candidate suffix starts.</param>
/// <param name="length">Number of characters in the candidate suffix.</param>
/// <returns>Presumably the stored rule list on a hit — TODO confirm the result when no rule matches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Raised by the String constructor when <paramref name="offset"/>/<paramref name="length"/> fall outside the buffer.
/// </exception>
115 public IEnumerable<HunspellAffix>
LookupSuffix(
char[] word,
int offset,
int length) {
116 if (word == null)
throw new ArgumentNullException(
"word");
// The suffix table is keyed by String, so the slice is materialised as a new string for the lookup.
117 var key =
new String(word, offset, length);
119 List<HunspellAffix> list;
120 if (_suffixes.TryGetValue(key, out list))
/// <summary>
/// Reads the affix file line by line and hands each recognised directive
/// (PFX, SFX, FLAG, AF) to its parser.
/// </summary>
/// <param name="affixStream">Stream holding the affix file; must not be null.</param>
/// <param name="encoding">Encoding used to decode the stream; must not be null.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
132 private void ReadAffixFile(Stream affixStream, Encoding encoding) {
133 if (affixStream == null)
throw new ArgumentNullException(
"affixStream");
134 if (encoding == null)
throw new ArgumentNullException(
"encoding");
// The reader is disposed when the using block exits; with this StreamReader
// constructor that also closes affixStream.
136 using (var reader =
new StreamReader(affixStream, encoding)) {
// Directives are recognised purely by line prefix, tested in the order below.
138 while ((line = reader.ReadLine()) != null) {
139 if (line.StartsWith(PREFIX_KEY)) {
// The header's rule lines are consumed from the same reader inside ParseAffix.
140 ParseAffix(_prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
141 }
else if (line.StartsWith(SUFFIX_KEY)) {
142 ParseAffix(_suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
143 }
else if (line.StartsWith(FLAG_KEY)) {
// Changes how every flag parsed after this point is decoded.
146 _flagParsingStrategy = GetFlagParsingStrategy(line);
147 }
else if (line.StartsWith(AF_KEY)) {
// Alias lines are decoded with whichever flag strategy is active at this point.
149 ParseAliasFlag(line, reader);
/// <summary>
/// Parses an AF (flag alias) table. The header line announces how many alias
/// lines follow; each alias line is decoded with the active flag strategy and
/// stored under its 1-based position in the table.
/// </summary>
/// <param name="line">The AF header line, whose second field is the alias count.</param>
/// <param name="reader">Reader positioned immediately after the header line; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// The header has no count, the input ends before all announced aliases were
/// read, or an alias line is not a well-formed AF directive.
/// </exception>
private void ParseAliasFlag(String line, TextReader reader) {
    if (reader == null) throw new ArgumentNullException("reader");

    var args = Regex.Split(line, "\\s+");
    if (args.Length < 2)
        throw new InvalidDataException("File corrupted, AF directive is missing its alias count : " + line);

    // The affix file is machine-readable data, so the count is parsed culture-independently.
    var numLines = Int32.Parse(args[1], CultureInfo.InvariantCulture);

    for (var i = 0; i < numLines; i++) {
        line = reader.ReadLine();

        // ReadLine returns null at end of input; report the truncated table
        // instead of failing inside Regex.Split.
        if (line == null)
            throw new InvalidDataException("Unexpected end of affix file.");

        var ruleArgs = Regex.Split(line, "\\s+");

        // Every alias line must be "AF <flags>"; the length check keeps the
        // ruleArgs[1] access below in range.
        if (ruleArgs.Length < 2 || ruleArgs[0] != AF_KEY)
            throw new InvalidDataException("File corrupted, should be AF directive : " + line);

        var appendFlags = _flagParsingStrategy.ParseFlags(ruleArgs[1]);

        // Aliases are referenced elsewhere by their 1-based position in the table.
        _aliases.Add((i + 1).ToString(CultureInfo.InvariantCulture), appendFlags);
    }
}
/// <summary>
/// Parses one PFX/SFX block: the header line already read by the caller plus the
/// rule lines it announces. A HunspellAffix is built for each rule line, and the
/// list keyed by the rule's affix text is looked up (or created) in <paramref name="affixes"/>.
/// </summary>
/// <param name="affixes">Table to populate (prefixes or suffixes); must not be null.</param>
/// <param name="header">The PFX/SFX header line; must not be null.</param>
/// <param name="reader">Reader positioned immediately after the header line; must not be null.</param>
/// <param name="conditionPattern">String.Format template wrapped around each rule's condition; must not be null.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
184 private void ParseAffix(Dictionary<String, List<HunspellAffix>> affixes, String header, TextReader reader, String conditionPattern) {
185 if (affixes == null)
throw new ArgumentNullException(
"affixes");
186 if (header == null)
throw new ArgumentNullException(
"header");
187 if (reader == null)
throw new ArgumentNullException(
"reader");
188 if (conditionPattern == null)
throw new ArgumentNullException(
"conditionPattern");
// Header fields after splitting on whitespace: [2] is the cross-product marker
// ("Y" enables it) and [3] is the number of rule lines that follow.
190 var args = Regex.Split(header,
"\\s+");
191 var crossProduct = args[2].Equals(
"Y");
192 var numLines = Int32.Parse(args[3]);
// Once any AF alias has been loaded, flag fields are treated as alias numbers
// rather than raw flags.
194 var hasAliases = _aliases.Count > 0;
195 for (var i = 0; i < numLines; i++) {
// NOTE(review): ReadLine returns null at end of input, which Regex.Split
// rejects — confirm truncated files are handled somewhere.
196 var line = reader.ReadLine();
197 var ruleArgs = Regex.Split(line,
"\\s+");
199 var affix =
new HunspellAffix();
// Rule fields: [1] flag, [2] characters to strip, [3] affix text (optionally
// followed by '/' and flags), [4] condition.
201 affix.Flag = _flagParsingStrategy.ParseFlag(ruleArgs[1]);
// "0" in the strip field is stored as the empty string.
202 affix.Strip = (ruleArgs[2] ==
"0") ?
"" : ruleArgs[2];
204 var affixArg = ruleArgs[3];
// Anything after the last '/' in the affix field is a flag field.
206 var flagSep = affixArg.LastIndexOf(
'/');
208 var cflag = affixArg.Substring(flagSep + 1);
// NOTE(review): with aliases loaded, an unknown alias number makes the indexer
// throw KeyNotFoundException — confirm that is the intended failure mode.
209 var appendFlags = hasAliases ? _aliases[cflag] : _flagParsingStrategy.ParseFlags(cflag);
// Sorted in place; when the array came from _aliases this sorts the shared alias array itself.
210 Array.Sort(appendFlags);
211 affix.AppendFlags = appendFlags;
// The affix text is everything before the separator.
212 affix.Append = affixArg.Substring(0, flagSep);
// Presumably the branch taken when the field has no '/' separator — TODO confirm.
214 affix.Append = affixArg;
// The condition is handed over both raw and wrapped in the anchored regex template.
217 var condition = ruleArgs[4];
218 affix.SetCondition(condition, String.Format(conditionPattern, condition));
219 affix.IsCrossProduct = crossProduct;
// Rules sharing the same affix text are grouped in one list under that text.
221 List<HunspellAffix> list;
222 if (!affixes.TryGetValue(affix.Append, out list))
223 affixes.Add(affix.Append, list =
new List<HunspellAffix>());
/// <summary>
/// Determines the charset name declared by the affix file's "SET" line, reading
/// raw bytes directly from the stream.
/// </summary>
/// <param name="affix">Stream holding the affix file; must not be null.</param>
/// <returns>The trimmed text that follows "SET " on the declaring line.</returns>
/// <exception cref="ArgumentNullException"><paramref name="affix"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// Raised with "Unexpected end of affix file.", or when the line examined does not start with "SET ".
/// </exception>
238 private static String ReadDictionaryEncoding(Stream affix) {
239 if (affix == null)
throw new ArgumentNullException(
"affix");
241 var builder =
new StringBuilder();
// ReadByte returns a negative value once the stream is exhausted.
245 while ((ch = affix.ReadByte()) >= 0) {
// Each byte is widened to a char as-is, since the real encoding is not known yet.
250 builder.Append((
char)ch);
// Empty and whitespace-only lines are singled out here.
254 if (builder.Length == 0 ||
257 builder.ToString().Trim().Length == 0
260 throw new InvalidDataException(
"Unexpected end of affix file.");
// NOTE(review): ToString(0, 4) throws ArgumentOutOfRangeException when the builder
// holds fewer than four characters — confirm a shorter line cannot reach this check.
265 if (
"SET ".Equals(builder.ToString(0, 4))) {
// The charset name is everything after the four-character "SET " prefix.
267 return builder.ToString(4, builder.Length - 4).Trim();
270 throw new InvalidDataException(
"The first non-comment line in the affix file must " +
271 "be a 'SET charset', was: '" + builder +
"'");
/// <summary>
/// Selects the flag parsing strategy named by a FLAG directive line.
/// </summary>
/// <param name="flagLine">The complete directive line: the FLAG keyword, a separator and the flag type.</param>
/// <returns>A new strategy instance for the declared flag type.</returns>
/// <exception cref="ArgumentNullException"><paramref name="flagLine"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The flag type is missing or is not one of "num", "UTF-8" or "long".
/// </exception>
private static FlagParsingStrategy GetFlagParsingStrategy(String flagLine) {
    if (flagLine == null) throw new ArgumentNullException("flagLine");

    // The value starts after the keyword and its one-character separator. A line too
    // short to carry a value yields an empty type, which is reported below as unknown
    // rather than failing inside Substring. Trimming tolerates extra blanks around the
    // value, which would otherwise make a valid type look unknown.
    var valueStart = FLAG_KEY.Length + 1;
    var flagType = flagLine.Length > valueStart
        ? flagLine.Substring(valueStart).Trim()
        : String.Empty;

    if (NUM_FLAG_TYPE.Equals(flagType))
        return new NumFlagParsingStrategy();

    // UTF-8 flags are single characters, so the simple strategy handles them.
    if (UTF8_FLAG_TYPE.Equals(flagType))
        return new SimpleFlagParsingStrategy();

    if (LONG_FLAG_TYPE.Equals(flagType))
        return new DoubleASCIIFlagParsingStrategy();

    throw new ArgumentException("Unknown flag type: " + flagType);
}
/// <summary>
/// Loads one dictionary stream into the word table: a leading count line
/// followed by one entry per line.
/// </summary>
/// <param name="dictionary">Stream holding the dictionary; must not be null.</param>
/// <param name="encoding">Encoding used to decode the stream; must not be null.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
302 private void ReadDictionaryFile(Stream dictionary, Encoding encoding) {
303 if (dictionary == null)
throw new ArgumentNullException(
"dictionary");
304 if (encoding == null)
throw new ArgumentNullException(
"encoding");
// NOTE(review): unlike ReadAffixFile, this reader is not wrapped in a using block,
// so this method never disposes it — confirm whether leaving the caller's stream
// open is intentional.
305 var reader =
new StreamReader(dictionary, encoding);
// The first line is parsed as the entry count, so a non-numeric first line throws.
// NOTE(review): numEntries does not appear to be used afterwards — confirm.
308 var line = reader.ReadLine();
309 var numEntries = Int32.Parse(line);
// Once any AF alias has been loaded, flag fields are treated as alias numbers.
310 var hasAliases = _aliases.Count > 0;
314 while ((line = reader.ReadLine()) != null) {
316 HunspellWord wordForm;
// Flags, if present, follow the last '/' on the line.
318 var flagSep = line.LastIndexOf(
'/');
// The flag field ends at the first tab after the separator, or at end of line
// when there is no tab.
325 var end = line.IndexOf(
'\t', flagSep);
326 var cflag = end == -1 ? line.Substring(flagSep + 1) : line.Substring(flagSep + 1, end - flagSep - 1);
// Resolve the flags through the alias table when aliases exist, otherwise decode
// them with the active flag strategy.
328 wordForm =
new HunspellWord(hasAliases ? _aliases[cflag] : _flagParsingStrategy.ParseFlags(cflag));
// The entry text is everything before the separator.
330 entry = line.Substring(0, flagSep);
// Several word forms may share one entry text, so each key maps to a list.
333 List<HunspellWord> entries;
334 if (!_words.TryGetValue(entry, out entries))
335 _words.Add(entry, entries =
new List<HunspellWord>());
337 entries.Add(wordForm);
341 #region Nested type: DoubleASCIIFlagParsingStrategy
// Flag strategy selected for the "long" flag type: the raw string is consumed
// two characters at a time, and each pair becomes one flag.
347 private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy {
348 public override Char[] ParseFlags(String rawFlags) {
349 if (rawFlags.Length == 0)
352 var builder =
new StringBuilder();
// Each pair is folded into a single Char by adding the two code units.
// NOTE(review): addition is commutative, so "AB" and "BA" produce the same flag;
// and an odd-length input reads rawFlags[i + 1] past the end of the string —
// confirm inputs are always even-length.
353 for (var i = 0; i < rawFlags.Length; i += 2) {
354 var cookedFlag = (Char)(rawFlags[i] + rawFlags[i + 1]);
355 builder.Append(cookedFlag);
358 return builder.ToString().ToCharArray();
364 #region Nested type: FlagParsingStrategy
// Base class for the strategies that turn a textual flag field into Char flags.
368 private abstract class FlagParsingStrategy {
/// <summary>
/// Parses a flag field and returns the first flag it contains.
/// </summary>
/// <param name="rawFlag">Textual flag field.</param>
/// <returns>The first element produced by <see cref="ParseFlags"/>.</returns>
/// <exception cref="ArgumentNullException">Presumably raised when <paramref name="rawFlag"/> is null — TODO confirm.</exception>
374 public Char ParseFlag(String rawFlag) {
376 throw new ArgumentNullException(
"rawFlag");
// Delegates to the multi-flag parser and keeps only the first result.
// NOTE(review): indexing [0] fails if the parser returns an empty array.
378 return ParseFlags(rawFlag)[0];
/// <summary>
/// Parses a flag field into its individual flags.
/// </summary>
/// <param name="rawFlags">Textual flag field.</param>
/// <returns>One Char per flag.</returns>
386 public abstract Char[] ParseFlags(String rawFlags);
391 #region Nested type: NumFlagParsingStrategy
// Flag strategy selected for the "num" flag type: the field is a comma-separated
// list of decimal numbers, each stored as one Char.
397 private class NumFlagParsingStrategy : FlagParsingStrategy {
398 public override Char[] ParseFlags(String rawFlags) {
// Surrounding whitespace is trimmed before the list is split on commas.
399 var rawFlagParts = rawFlags.Trim().Split(
',');
400 var flags =
new Char[rawFlagParts.Length];
402 for (var i = 0; i < rawFlagParts.Length; i++) {
// Every non-digit character is removed before parsing.
// NOTE(review): a part with no digits becomes an empty string, which Int32.Parse
// rejects with FormatException; numbers above 65535 do not fit in a Char —
// confirm how such input should behave.
404 var replaced = Regex.Replace(rawFlagParts[i],
"[^0-9]",
"");
405 flags[i] = (Char)Int32.Parse(replaced);
414 #region Nested type: SimpleFlagParsingStrategy
// Default flag strategy, also selected for the "UTF-8" flag type:
// every character of the field is one flag.
420 private class SimpleFlagParsingStrategy : FlagParsingStrategy {
421 public override Char[] ParseFlags(String rawFlags) {
422 return rawFlags.ToCharArray();