Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
HunspellStemmer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using System.Text;
22 
23 namespace Lucene.Net.Analysis.Hunspell {
24  /// <summary>
25  /// HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or
26  /// more stems for a word. It conforms to the algorithm in the original hunspell algorithm,
27  /// including recursive suffix stripping.
28  /// </summary>
29  /// <author>Chris Male</author>
30  public class HunspellStemmer {
31  private static Int32 RECURSION_CAP = 2;
32  private readonly HunspellDictionary _dictionary;
33 
34  /// <summary>
35  /// Constructs a new HunspellStemmer which will use the provided HunspellDictionary
36  /// to create its stems.
37  /// </summary>
38  /// <param name="dictionary">HunspellDictionary that will be used to create the stems.</param>
39  public HunspellStemmer(HunspellDictionary dictionary) {
40  if (dictionary == null) throw new ArgumentNullException("dictionary");
41  _dictionary = dictionary;
42  }
43 
44  /// <summary>
45  /// Find the stem(s) of the provided word.
46  /// </summary>
47  /// <param name="word">Word to find the stems for.</param>
48  /// <returns>List of stems for the word.</returns>
49  public IEnumerable<HunspellStem> Stem(String word) {
50  if (word == null) throw new ArgumentNullException("word");
51 
52  var stems = new List<HunspellStem>();
53  if (_dictionary.LookupWord(word) != null)
54  stems.Add(new HunspellStem(word));
55 
56  stems.AddRange(Stem(word, null, 0));
57  return stems;
58  }
59 
60  /// <summary>
61  /// Find the unique stem(s) of the provided word.
62  /// </summary>
63  /// <param name="word">Word to find the stems for.</param>
64  /// <returns>List of stems for the word.</returns>
65  public IEnumerable<HunspellStem> UniqueStems(String word) {
66  if (word == null) throw new ArgumentNullException("word");
67 
68  var stems = new List<HunspellStem>();
69  var terms = new CharArraySet(8, false);
70  if (_dictionary.LookupWord(word) != null) {
71  stems.Add(new HunspellStem(word));
72  terms.Add(word);
73  }
74 
75  var otherStems = Stem(word, null, 0);
76  foreach (var s in otherStems) {
77  if (!terms.Contains(s.Stem)) {
78  stems.Add(s);
79  terms.Add(s.Stem);
80  }
81  }
82 
83  return stems;
84  }
85 
86  /// <summary>
87  /// Generates a list of stems for the provided word.
88  /// </summary>
89  /// <param name="word">Word to generate the stems for.</param>
90  /// <param name="flags">Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step.</param>
91  /// <param name="recursionDepth">Level of recursion this stemming step is at.</param>
92  /// <returns>List of stems, pr an empty if no stems are found.</returns>
93  private IEnumerable<HunspellStem> Stem(String word, Char[] flags, Int32 recursionDepth) {
94  if (word == null) throw new ArgumentNullException("word");
95 
96  var stems = new List<HunspellStem>();
97  var chars = word.ToCharArray();
98  var length = word.Length;
99 
100  for (var i = 0; i < length; i++) {
101  var suffixes = _dictionary.LookupSuffix(chars, i, length - i);
102  if (suffixes != null) {
103  foreach (var suffix in suffixes) {
104  if (HasCrossCheckedFlag(suffix.Flag, flags)) {
105  var deAffixedLength = length - suffix.Append.Length;
106 
107  // TODO: can we do this in-place?
108  var strippedWord = new StringBuilder()
109  .Append(word, 0, deAffixedLength)
110  .Append(suffix.Strip)
111  .ToString();
112 
113  var stemList = ApplyAffix(strippedWord, suffix, recursionDepth);
114  foreach (var stem in stemList) {
115  stem.AddSuffix(suffix);
116  }
117 
118  stems.AddRange(stemList);
119  }
120  }
121  }
122  }
123 
124  for (var i = length - 1; i >= 0; i--) {
125  var prefixes = _dictionary.LookupPrefix(chars, 0, i);
126  if (prefixes != null) {
127  foreach (var prefix in prefixes) {
128  if (HasCrossCheckedFlag(prefix.Flag, flags)) {
129  var deAffixedStart = prefix.Append.Length;
130  var deAffixedLength = length - deAffixedStart;
131 
132  var strippedWord = new StringBuilder()
133  .Append(prefix.Strip)
134  .Append(word, deAffixedStart, deAffixedLength)
135  .ToString();
136 
137  var stemList = ApplyAffix(strippedWord, prefix, recursionDepth);
138  foreach (var stem in stemList) {
139  stem.AddPrefix(prefix);
140  }
141 
142  stems.AddRange(stemList);
143  }
144  }
145  }
146  }
147 
148  return stems;
149  }
150 
151  /// <summary>
152  /// Applies the affix rule to the given word, producing a list of stems if any are found.
153  /// </summary>
154  /// <param name="strippedWord">Word the affix has been removed and the strip added.</param>
155  /// <param name="affix">HunspellAffix representing the affix rule itself.</param>
156  /// <param name="recursionDepth">Level of recursion this stemming step is at.</param>
157  /// <returns>List of stems for the word, or an empty list if none are found.</returns>
158  public IEnumerable<HunspellStem> ApplyAffix(String strippedWord, HunspellAffix affix, Int32 recursionDepth) {
159  if (strippedWord == null) throw new ArgumentNullException("strippedWord");
160  if (affix == null) throw new ArgumentNullException("affix");
161 
162  if (!affix.CheckCondition(strippedWord)) {
163  return new List<HunspellStem>();
164  }
165 
166  var words = _dictionary.LookupWord(strippedWord);
167  if (words == null) {
168  return new List<HunspellStem>();
169  }
170 
171  var stems = new List<HunspellStem>();
172 
173  foreach (var hunspellWord in words) {
174  if (hunspellWord.HasFlag(affix.Flag)) {
175  if (affix.IsCrossProduct && recursionDepth < RECURSION_CAP) {
176  var recursiveStems = Stem(strippedWord, affix.AppendFlags, ++recursionDepth);
177  if (recursiveStems.Any()) {
178  stems.AddRange(recursiveStems);
179  } else {
180  stems.Add(new HunspellStem(strippedWord));
181  }
182  } else {
183  stems.Add(new HunspellStem(strippedWord));
184  }
185  }
186  }
187 
188  return stems;
189  }
190 
191  /// <summary>
192  /// Checks if the given flag cross checks with the given array of flags.
193  /// </summary>
194  /// <param name="flag">Flag to cross check with the array of flags.</param>
195  /// <param name="flags">Array of flags to cross check against. Can be <c>null</c>.</param>
196  /// <returns><c>true</c> if the flag is found in the array or the array is <c>null</c>, <c>false</c> otherwise.</returns>
197  private static Boolean HasCrossCheckedFlag(Char flag, Char[] flags) {
198  return flags == null || Array.BinarySearch(flags, flag) >= 0;
199  }
200  }
201 }