Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
DutchStemmer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.IO;
24 using System.Text;
25 using System.Collections;
26 using System.Collections.Generic;
27 
28 namespace Lucene.Net.Analysis.Nl
29 {
/// <summary>
/// A stemmer for Dutch words.
/// <para>
/// The algorithm is an implementation of the
/// <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
/// algorithm in Martin Porter's snowball project.
/// </para>
/// <para>
/// The working buffer and the R1/R2 region markers are instance fields that
/// <c>Stem</c> overwrites on every call, so one instance must not be used
/// from several threads at the same time without external synchronization.
/// </para>
/// </summary>
public class DutchStemmer
{
    /// <summary>The "en" family of suffixes, longest first so the longest match wins.</summary>
    private static readonly string[] EnEndings = { "ene", "en" };

    /// <summary>Buffer for the terms while stemming them.</summary>
    private readonly StringBuilder sb = new StringBuilder();

    /// <summary>Set by <c>step2</c>; tells <c>step3b</c> whether a trailing "e" was removed.</summary>
    private bool _removedE;

    /// <summary>Optional exception dictionary mapping a lower-cased term to its stem.</summary>
    private IDictionary<string, string> _stemDict;

    /// <summary>Start index of region R1 in the buffer (never less than 3, see <c>Stem</c>).</summary>
    private int _R1;

    /// <summary>Start index of region R2 in the buffer.</summary>
    private int _R2;

    //TODO convert to internal
    /// <summary>
    /// Stems the given term to an unique <tt>discriminator</tt>.
    /// </summary>
    /// <param name="term">The term that should be stemmed.</param>
    /// <returns>
    /// The lower-cased term itself when it is empty or contains a non-letter,
    /// the dictionary entry when a stem dictionary is set and contains the
    /// lower-cased term (this may be <c>null</c> if the stored value is null),
    /// otherwise the computed stem.
    /// </returns>
    public String Stem(String term)
    {
        // Invariant lower-casing: the result must not depend on the current
        // culture (e.g. Turkish maps 'I' to dotless 'ı').
        term = term.ToLowerInvariant();

        // An empty term has nothing to stem; the steps below index into the buffer.
        if (term.Length == 0 || !isStemmable(term))
            return term;

        string knownStem;
        if (_stemDict != null && _stemDict.TryGetValue(term, out knownStem))
            return knownStem;

        // Reset the StringBuilder.
        sb.Length = 0;
        sb.Append(term);

        // Stemming starts here...
        substitute(sb);
        storeYandI(sb);
        // R1 is adjusted so that the region before it holds at least 3 letters.
        _R1 = Math.Max(3, getRIndex(sb, 0));
        step1(sb);
        step2(sb);
        // R2 is computed on the buffer as modified by steps 1 and 2.
        _R2 = getRIndex(sb, _R1);
        step3a(sb);
        step3b(sb);
        step4(sb);
        reStoreYandI(sb);
        return sb.ToString();
    }

    /// <summary>Ordinal (culture-independent) suffix test.</summary>
    private static bool endsWith(string s, string suffix)
    {
        return s.EndsWith(suffix, StringComparison.Ordinal);
    }

    /// <summary>
    /// Removes a trailing "ene" or "en" when it lies in R1 and is a valid
    /// en-ending, then undoubles the new ending.
    /// </summary>
    /// <param name="sb">String being stemmed</param>
    /// <returns>true if a suffix was removed</returns>
    private bool enEnding(StringBuilder sb)
    {
        string s = sb.ToString();
        foreach (string end in EnEndings)
        {
            int index = s.Length - end.Length;
            if (index >= _R1 &&
                endsWith(s, end) &&
                isValidEnEnding(sb, index - 1))
            {
                sb.Remove(index, end.Length);
                unDouble(sb, index);
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Step 1: handles the suffixes "heden", "ene"/"en" and "se"/"s".
    /// </summary>
    /// <param name="sb">String being stemmed</param>
    private void step1(StringBuilder sb)
    {
        if (_R1 >= sb.Length)
            return;

        string s = sb.ToString();
        int index;

        if (endsWith(s, "heden"))
        {
            // Replace only the suffix itself, and only when it lies in R1.
            index = s.Length - 5;
            if (index >= _R1)
            {
                sb.Remove(index, 5);
                sb.Append("heid");
            }
            return;
        }

        if (enEnding(sb))
            return;

        // enEnding left the buffer untouched here, so s is still current.
        index = s.Length - 2;
        if (index >= _R1 && endsWith(s, "se") && isValidSEnding(sb, index - 1))
        {
            sb.Remove(index, 2);
            return;
        }

        index = s.Length - 1;
        if (index >= _R1 && s[index] == 's' && isValidSEnding(sb, index - 1))
        {
            sb.Remove(index, 1);
        }
    }

    /// <summary>
    /// Remove suffix e if in R1 and
    /// preceded by a non-vowel, and then undouble the ending
    /// </summary>
    /// <param name="sb">String being stemmed</param>
    private void step2(StringBuilder sb)
    {
        _removedE = false;
        if (_R1 >= sb.Length)
            return;

        // index >= _R1 >= 3 inside the condition, so index - 1 is a valid position.
        int index = sb.Length - 1;
        if (index >= _R1 &&
            sb[index] == 'e' &&
            !isVowel(sb[index - 1]))
        {
            sb.Remove(index, 1);
            unDouble(sb);
            _removedE = true;
        }
    }

    /// <summary>
    /// Remove "heid" if in R2 and not preceded by 'c', then treat a
    /// preceding "en" ending as in step 1.
    /// </summary>
    /// <param name="sb">String being stemmed</param>
    private void step3a(StringBuilder sb)
    {
        if (_R2 >= sb.Length)
            return;

        string s = sb.ToString();
        int index = s.Length - 4;
        if (index >= _R2 && endsWith(s, "heid") && sb[index - 1] != 'c')
        {
            sb.Remove(index, 4); //remove heid
            enEnding(sb);
        }
    }

    /// <summary>
    /// <para>A d-suffix, or derivational suffix, enables a new word,
    /// often with a different grammatical category, or with a different
    /// sense, to be built from another word. Whether a d-suffix can be
    /// attached is discovered not from the rules of grammar, but by
    /// referring to a dictionary. So in English, ness can be added to
    /// certain adjectives to form corresponding nouns (littleness,
    /// kindness, foolishness ...) but not to all adjectives
    /// (not for example, to big, cruel, wise ...) d-suffixes can be
    /// used to change meaning, often in rather exotic ways.</para>
    /// Remove "ing", "end", "ig", "lijk", "baar" and "bar"
    /// </summary>
    /// <param name="sb">String being stemmed</param>
    private void step3b(StringBuilder sb)
    {
        if (_R2 >= sb.Length)
            return;

        string s = sb.ToString();

        int index = s.Length - 3;
        if (index >= _R2 && (endsWith(s, "end") || endsWith(s, "ing")))
        {
            sb.Remove(index, 3);
            if (sb[index - 2] == 'i' && sb[index - 1] == 'g')
            {
                // A preceding "ig" is removed too, if it is in R2 and not preceded by 'e'.
                if (sb[index - 3] != 'e' && index - 2 >= _R2)
                {
                    index -= 2;
                    sb.Remove(index, 2);
                }
            }
            else
            {
                unDouble(sb, index);
            }
            return;
        }

        index = s.Length - 2;
        if (index >= _R2 && endsWith(s, "ig"))
        {
            if (sb[index - 1] != 'e')
                sb.Remove(index, 2);
            return;
        }

        index = s.Length - 4;
        if (index >= _R2 && endsWith(s, "lijk"))
        {
            sb.Remove(index, 4);
            step2(sb);
            return;
        }

        if (index >= _R2 && endsWith(s, "baar"))
        {
            sb.Remove(index, 4);
            return;
        }

        index = s.Length - 3;
        if (index >= _R2 && endsWith(s, "bar"))
        {
            // "bar" is only removed when step 2 actually removed an "e".
            if (_removedE)
                sb.Remove(index, 3);
        }
    }

    /// <summary>
    /// undouble vowel
    /// If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
    /// </summary>
    /// <param name="sb">String being stemmed</param>
    private void step4(StringBuilder sb)
    {
        int length = sb.Length;
        if (length < 4)
            return;

        char c = sb[length - 4];
        char v1 = sb[length - 3];
        char v2 = sb[length - 2];
        char d = sb[length - 1];
        if (v1 == v2 &&
            d != 'I' &&
            v1 != 'i' &&
            isVowel(v1) &&
            !isVowel(d) &&
            !isVowel(c))
        {
            sb.Remove(length - 2, 1);
        }
    }

    /// <summary>
    /// Checks if a term could be stemmed.
    /// </summary>
    /// <returns>true if, and only if, the given term consists in letters.</returns>
    private static bool isStemmable(String term)
    {
        foreach (char ch in term)
        {
            if (!char.IsLetter(ch))
                return false;
        }
        return true;
    }

    /// <summary>
    /// Substitute ä, ë, ï, ö, ü, á, é, í, ó, ú by their unaccented vowel.
    /// </summary>
    private static void substitute(StringBuilder buffer)
    {
        for (int i = 0; i < buffer.Length; i++)
        {
            switch (buffer[i])
            {
                case 'ä':
                case 'á':
                    buffer[i] = 'a';
                    break;
                case 'ë':
                case 'é':
                    buffer[i] = 'e';
                    break;
                case 'ü':
                case 'ú':
                    buffer[i] = 'u';
                    break;
                case 'ï':
                case 'í':
                    buffer[i] = 'i';
                    break;
                case 'ö':
                case 'ó':
                    buffer[i] = 'o';
                    break;
            }
        }
    }

    /// <summary>
    /// An s-ending is valid when the character at <paramref name="index"/>
    /// (the one preceding the ending) is neither a vowel nor 'j'.
    /// </summary>
    private static bool isValidSEnding(StringBuilder sb, int index)
    {
        char c = sb[index];
        return !(isVowel(c) || c == 'j');
    }

    /// <summary>
    /// An en-ending is valid when the character at <paramref name="index"/>
    /// (the one preceding the ending) is a non-vowel and the buffer does not
    /// read "gem" immediately before the ending.
    /// </summary>
    private static bool isValidEnEnding(StringBuilder sb, int index)
    {
        char c = sb[index];
        if (isVowel(c))
            return false;
        // ends with "gem"? (only look back when two earlier characters exist)
        if (c == 'm' && index >= 2 && sb[index - 2] == 'g' && sb[index - 1] == 'e')
            return false;
        return true;
    }

    /// <summary>Undoubles the ending of the whole buffer.</summary>
    private static void unDouble(StringBuilder sb)
    {
        unDouble(sb, sb.Length);
    }

    /// <summary>
    /// If the first <paramref name="endIndex"/> characters of the buffer end in
    /// kk, tt, dd, nn, mm or ff, removes the second letter of that pair.
    /// </summary>
    private static void unDouble(StringBuilder sb, int endIndex)
    {
        if (endIndex < 2)
            return;

        char lastChar = sb[endIndex - 1];
        if (lastChar == sb[endIndex - 2] && "ktdnmf".IndexOf(lastChar) >= 0)
        {
            sb.Remove(endIndex - 1, 1);
        }
    }

    /// <summary>
    /// Scans from <paramref name="start"/> (at least 1) for the first
    /// non-vowel preceded by a vowel and returns the index just after it.
    /// When no such position exists the result is <c>sb.Length + 1</c> (or
    /// <c>start + 1</c> if start is already past the end), i.e. an empty region.
    /// </summary>
    private static int getRIndex(StringBuilder sb, int start)
    {
        int i = start == 0 ? 1 : start;
        for (; i < sb.Length; i++)
        {
            //first non-vowel preceded by a vowel
            if (!isVowel(sb[i]) && isVowel(sb[i - 1]))
                break;
        }
        return i + 1;
    }

    /// <summary>
    /// Upper-cases every y/i that acts as a consonant so that later steps do
    /// not see it as a vowel: an initial y, a y after a vowel, and an i
    /// between two vowels.
    /// </summary>
    private static void storeYandI(StringBuilder sb)
    {
        if (sb.Length == 0)
            return;

        if (sb[0] == 'y')
            sb[0] = 'Y';

        int last = sb.Length - 1;

        for (int i = 1; i < last; i++)
        {
            char current = sb[i];
            if (current == 'i')
            {
                if (isVowel(sb[i - 1]) && isVowel(sb[i + 1]))
                    sb[i] = 'I';
            }
            else if (current == 'y')
            {
                if (isVowel(sb[i - 1]))
                    sb[i] = 'Y';
            }
        }

        // The final character has no successor, so only the y rule can apply.
        if (last > 0 && sb[last] == 'y' && isVowel(sb[last - 1]))
            sb[last] = 'Y';
    }

    /// <summary>Turns the markers 'I' and 'Y' back into 'i' and 'y'.</summary>
    private static void reStoreYandI(StringBuilder sb)
    {
        sb.Replace('I', 'i').Replace('Y', 'y');
    }

    /// <summary>True for the letters this stemmer treats as vowels: a, e, i, o, u, y and è.</summary>
    private static bool isVowel(char c)
    {
        switch (c)
        {
            case 'e':
            case 'a':
            case 'o':
            case 'i':
            case 'u':
            case 'y':
            case 'è':
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Sets the exception dictionary consulted by <c>Stem</c> before the
    /// algorithm runs; pass <c>null</c> to disable the lookup.
    /// </summary>
    /// <param name="dict">Maps a lower-cased term to the stem to return for it.</param>
    protected internal void SetStemDictionary(IDictionary<string, string> dict)
    {
        _stemDict = dict;
    }
}
462 }