Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
HunspellStemmer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using System.Text;
22 
23 namespace Lucene.Net.Analysis.Hunspell {
30  public class HunspellStemmer {
31  private static Int32 RECURSION_CAP = 2;
32  private readonly HunspellDictionary _dictionary;
33 
39  public HunspellStemmer(HunspellDictionary dictionary) {
40  if (dictionary == null) throw new ArgumentNullException("dictionary");
41  _dictionary = dictionary;
42  }
43 
49  public IEnumerable<HunspellStem> Stem(String word) {
50  if (word == null) throw new ArgumentNullException("word");
51 
52  var stems = new List<HunspellStem>();
53  if (_dictionary.LookupWord(word) != null)
54  stems.Add(new HunspellStem(word));
55 
56  stems.AddRange(Stem(word, null, 0));
57  return stems;
58  }
59 
65  public IEnumerable<HunspellStem> UniqueStems(String word) {
66  if (word == null) throw new ArgumentNullException("word");
67 
68  var stems = new List<HunspellStem>();
69  var terms = new CharArraySet(8, false);
70  if (_dictionary.LookupWord(word) != null) {
71  stems.Add(new HunspellStem(word));
72  terms.Add(word);
73  }
74 
75  var otherStems = Stem(word, null, 0);
76  foreach (var s in otherStems) {
77  if (!terms.Contains(s.Stem)) {
78  stems.Add(s);
79  terms.Add(s.Stem);
80  }
81  }
82 
83  return stems;
84  }
85 
93  private IEnumerable<HunspellStem> Stem(String word, Char[] flags, Int32 recursionDepth) {
94  if (word == null) throw new ArgumentNullException("word");
95 
96  var stems = new List<HunspellStem>();
97  var chars = word.ToCharArray();
98  var length = word.Length;
99 
100  for (var i = 0; i < length; i++) {
101  var suffixes = _dictionary.LookupSuffix(chars, i, length - i);
102  if (suffixes != null) {
103  foreach (var suffix in suffixes) {
104  if (HasCrossCheckedFlag(suffix.Flag, flags)) {
105  var deAffixedLength = length - suffix.Append.Length;
106 
107  // TODO: can we do this in-place?
108  var strippedWord = new StringBuilder()
109  .Append(word, 0, deAffixedLength)
110  .Append(suffix.Strip)
111  .ToString();
112 
113  var stemList = ApplyAffix(strippedWord, suffix, recursionDepth);
114  foreach (var stem in stemList) {
115  stem.AddSuffix(suffix);
116  }
117 
118  stems.AddRange(stemList);
119  }
120  }
121  }
122  }
123 
124  for (var i = length - 1; i >= 0; i--) {
125  var prefixes = _dictionary.LookupPrefix(chars, 0, i);
126  if (prefixes != null) {
127  foreach (var prefix in prefixes) {
128  if (HasCrossCheckedFlag(prefix.Flag, flags)) {
129  var deAffixedStart = prefix.Append.Length;
130  var deAffixedLength = length - deAffixedStart;
131 
132  var strippedWord = new StringBuilder()
133  .Append(prefix.Strip)
134  .Append(word, deAffixedStart, deAffixedLength)
135  .ToString();
136 
137  var stemList = ApplyAffix(strippedWord, prefix, recursionDepth);
138  foreach (var stem in stemList) {
139  stem.AddPrefix(prefix);
140  }
141 
142  stems.AddRange(stemList);
143  }
144  }
145  }
146  }
147 
148  return stems;
149  }
150 
158  public IEnumerable<HunspellStem> ApplyAffix(String strippedWord, HunspellAffix affix, Int32 recursionDepth) {
159  if (strippedWord == null) throw new ArgumentNullException("strippedWord");
160  if (affix == null) throw new ArgumentNullException("affix");
161 
162  if (!affix.CheckCondition(strippedWord)) {
163  return new List<HunspellStem>();
164  }
165 
166  var words = _dictionary.LookupWord(strippedWord);
167  if (words == null) {
168  return new List<HunspellStem>();
169  }
170 
171  var stems = new List<HunspellStem>();
172 
173  foreach (var hunspellWord in words) {
174  if (hunspellWord.HasFlag(affix.Flag)) {
175  if (affix.IsCrossProduct && recursionDepth < RECURSION_CAP) {
176  var recursiveStems = Stem(strippedWord, affix.AppendFlags, ++recursionDepth);
177  if (recursiveStems.Any()) {
178  stems.AddRange(recursiveStems);
179  } else {
180  stems.Add(new HunspellStem(strippedWord));
181  }
182  } else {
183  stems.Add(new HunspellStem(strippedWord));
184  }
185  }
186  }
187 
188  return stems;
189  }
190 
197  private static Boolean HasCrossCheckedFlag(Char flag, Char[] flags) {
198  return flags == null || Array.BinarySearch(flags, flag) >= 0;
199  }
200  }
201 }