Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
RussianStemmer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Text;
24 
25 namespace Lucene.Net.Analysis.Ru
26 {
27  /*
28  * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
29  */
30  public class RussianStemmer
31  {
32  // positions of RV, R1 and R2 respectively
33  private int RV, R1, R2;
34 
35  // letters (currently unused letters are commented out)
36  private const char A = '\u0430';
37  //private const char B = '\u0431';
38  private const char V = '\u0432';
39  private const char G = '\u0433';
40  //private const char D = '\u0434';
41  private const char E = '\u0435';
42  //private const char ZH = '\u0436';
43  //private const char Z = '\u0437';
44  private const char I = '\u0438';
45  private const char I_ = '\u0439';
46  //private const char K = '\u043A';
47  private const char L = '\u043B';
48  private const char M = '\u043C';
49  private const char N = '\u043D';
50  private const char O = '\u043E';
51  //private const char P = '\u043F';
52  //private const char R = '\u0440';
53  private const char S = '\u0441';
54  private const char T = '\u0442';
55  private const char U = '\u0443';
56  //private const char F = '\u0444';
57  private const char X = '\u0445';
58  //private const char TS = '\u0446';
59  //private const char CH = '\u0447';
60  private const char SH = '\u0448';
61  private const char SHCH = '\u0449';
62  //private const char HARD = '\u044A';
63  private const char Y = '\u044B';
64  private const char SOFT = '\u044C';
65  private const char AE = '\u044D';
66  private const char IU = '\u044E';
67  private const char IA = '\u044F';
68 
69  // stem definitions
70  private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
71 
72  private static char[][] perfectiveGerundEndings1 = {
73  new[] {V},
74  new[] {V, SH, I},
75  new[] {V, SH, I, S, SOFT}
76  };
77 
78  private static char[][] perfectiveGerund1Predessors = {
79  new[] {A},
80  new[] {IA}
81  };
82 
83  private static char[][] perfectiveGerundEndings2 = {
84  new[] {I, V},
85  new[] {Y, V},
86  new[] {I, V, SH, I},
87  new[] {Y, V, SH, I},
88  new[] {I, V, SH, I, S, SOFT},
89  new[] {Y, V, SH, I, S, SOFT}
90  };
91 
92  private static char[][] adjectiveEndings = {
93  new[] {E, E},
94  new[] {I, E},
95  new[] {Y, E},
96  new[] {O, E},
97  new[] {E, I_},
98  new[] {I, I_},
99  new[] {Y, I_},
100  new[] {O, I_},
101  new[] {E, M},
102  new[] {I, M},
103  new[] {Y, M},
104  new[] {O, M},
105  new[] {I, X},
106  new[] {Y, X},
107  new[] {U, IU},
108  new[] {IU, IU},
109  new[] {A, IA},
110  new[] {IA, IA},
111  new[] {O, IU},
112  new[] {E, IU},
113  new[] {I, M, I},
114  new[] {Y, M, I},
115  new[] {E, G, O},
116  new[] {O, G, O},
117  new[] {E, M, U},
118  new[] {O, M, U}
119  };
120 
121  private static char[][] participleEndings1 = {
122  new[] {SHCH},
123  new[] {E, M},
124  new[] {N, N},
125  new[] {V, SH},
126  new[] {IU, SHCH}
127  };
128 
129  private static char[][] participleEndings2 = {
130  new[] {I, V, SH},
131  new[] {Y, V, SH},
132  new[] {U, IU, SHCH}
133  };
134 
135  private static char[][] participle1Predessors = {
136  new[] {A},
137  new[] {IA}
138  };
139 
140  private static char[][] reflexiveEndings = {
141  new[] {S, IA},
142  new[] {S, SOFT}
143  };
144 
145  private static char[][] verbEndings1 = {
146  new[] {I_},
147  new[] {L},
148  new[] {N},
149  new[] {L, O},
150  new[] {N, O},
151  new[] {E, T},
152  new[] {IU, T},
153  new[] {L, A},
154  new[] {N, A},
155  new[] {L, I},
156  new[] {E, M},
157  new[] {N, Y},
158  new[] {E, T, E},
159  new[] {I_, T, E},
160  new[] {T, SOFT},
161  new[] {E, SH, SOFT},
162  new[] {N, N, O}
163  };
164 
165  private static char[][] verbEndings2 = {
166  new[] {IU},
167  new[] {U, IU},
168  new[] {E, N},
169  new[] {E, I_},
170  new[] {IA, T},
171  new[] {U, I_},
172  new[] {I, L},
173  new[] {Y, L},
174  new[] {I, M},
175  new[] {Y, M},
176  new[] {I, T},
177  new[] {Y, T},
178  new[] {I, L, A},
179  new[] {Y, L, A},
180  new[] {E, N, A},
181  new[] {I, T, E},
182  new[] {I, L, I},
183  new[] {Y, L, I},
184  new[] {I, L, O},
185  new[] {Y, L, O},
186  new[] {E, N, O},
187  new[] {U, E, T},
188  new[] {U, IU, T},
189  new[] {E, N, Y},
190  new[] {I, T, SOFT},
191  new[] {Y, T, SOFT},
192  new[] {I, SH, SOFT},
193  new[] {E, I_, T, E},
194  new[] {U, I_, T, E}
195  };
196 
197  private static char[][] verb1Predessors = {
198  new[] {A},
199  new[] {IA}
200  };
201 
202  private static char[][] nounEndings = {
203  new[] {A},
204  new[] {U},
205  new[] {I_},
206  new[] {O},
207  new[] {U},
208  new[] {E},
209  new[] {Y},
210  new[] {I},
211  new[] {SOFT},
212  new[] {IA},
213  new[] {E, V},
214  new[] {O, V},
215  new[] {I, E},
216  new[] {SOFT, E},
217  new[] {IA, X},
218  new[] {I, IU},
219  new[] {E, I},
220  new[] {I, I},
221  new[] {E, I_},
222  new[] {O, I_},
223  new[] {E, M},
224  new[] {A, M},
225  new[] {O, M},
226  new[] {A, X},
227  new[] {SOFT, IU},
228  new[] {I, IA},
229  new[] {SOFT, IA},
230  new[] {I, I_},
231  new[] {IA, M},
232  new[] {IA, M, I},
233  new[] {A, M, I},
234  new[] {I, E, I_},
235  new[] {I, IA, M},
236  new[] {I, E, M},
237  new[] {I, IA, X},
238  new[] {I, IA, M, I}
239  };
240 
241  private static char[][] superlativeEndings = {
242  new[] {E, I_, SH},
243  new[] {E, I_, SH, E}
244  };
245 
246  private static char[][] derivationalEndings = {
247  new[] {O, S, T},
248  new[] {O, S, T, SOFT}
249  };
250 
251  /*
252  * RussianStemmer constructor comment.
253  */
254  public RussianStemmer()
255  {
256  }
257 
258  /*
259  * Adjectival ending is an adjective ending,
260  * optionally preceded by participle ending.
261  * Creation date: (17/03/2002 12:14:58 AM)
262  * @param stemmingZone java.lang.StringBuilder
263  */
264  private bool adjectival(StringBuilder stemmingZone)
265  {
266  // look for adjective ending in a stemming zone
267  if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
268  return false;
269  // if adjective ending was found, try for participle ending.
270  // variable r is unused, we are just interested in the side effect of
271  // findAndRemoveEnding():
272  bool r =
273  findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
274  ||
275  findAndRemoveEnding(stemmingZone, participleEndings2);
276  return true;
277  }
278 
279  /*
280  * Derivational endings
281  * Creation date: (17/03/2002 12:14:58 AM)
282  * @param stemmingZone java.lang.StringBuilder
283  */
284  private bool derivational(StringBuilder stemmingZone)
285  {
286  int endingLength = findEnding(stemmingZone, derivationalEndings);
287  if (endingLength == 0)
288  // no derivational ending found
289  return false;
290  else
291  {
292  // Ensure that the ending locates in R2
293  if (R2 - RV <= stemmingZone.Length - endingLength)
294  {
295  stemmingZone.Length = stemmingZone.Length - endingLength;
296  return true;
297  }
298  else
299  {
300  return false;
301  }
302  }
303  }
304 
305  /*
306  * Finds ending among given ending class and returns the length of ending found(0, if not found).
307  * Creation date: (17/03/2002 8:18:34 PM)
308  */
309  private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
310  {
311  bool match = false;
312  for (int i = theEndingClass.Length - 1; i >= 0; i--)
313  {
314  char[] theEnding = theEndingClass[i];
315  // check if the ending is bigger than stemming zone
316  if (startIndex < theEnding.Length - 1)
317  {
318  match = false;
319  continue;
320  }
321  match = true;
322  int stemmingIndex = startIndex;
323  for (int j = theEnding.Length - 1; j >= 0; j--)
324  {
325  if (stemmingZone[stemmingIndex--] != theEnding[j])
326  {
327  match = false;
328  break;
329  }
330  }
331  // check if ending was found
332  if (match)
333  {
334  return theEndingClass[i].Length; // cut ending
335  }
336  }
337  return 0;
338  }
339 
340  private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
341  {
342  return findEnding(stemmingZone, stemmingZone.Length - 1, theEndingClass);
343  }
344 
345  /*
346  * Finds the ending among the given class of endings and removes it from stemming zone.
347  * Creation date: (17/03/2002 8:18:34 PM)
348  */
349  private bool findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
350  {
351  int endingLength = findEnding(stemmingZone, theEndingClass);
352  if (endingLength == 0)
353  // not found
354  return false;
355  else
356  {
357  stemmingZone.Length = stemmingZone.Length - endingLength;
358  // cut the ending found
359  return true;
360  }
361  }
362 
363  /*
364  * Finds the ending among the given class of endings, then checks if this ending was
365  * preceded by any of given predecessors, and if so, removes it from stemming zone.
366  * Creation date: (17/03/2002 8:18:34 PM)
367  */
368  private bool findAndRemoveEnding(StringBuilder stemmingZone,
369  char[][] theEndingClass, char[][] thePredessors)
370  {
371  int endingLength = findEnding(stemmingZone, theEndingClass);
372  if (endingLength == 0)
373  // not found
374  return false;
375  else
376  {
377  int predessorLength =
378  findEnding(stemmingZone,
379  stemmingZone.Length - endingLength - 1,
380  thePredessors);
381  if (predessorLength == 0)
382  return false;
383  else
384  {
385  stemmingZone.Length = stemmingZone.Length - endingLength;
386  // cut the ending found
387  return true;
388  }
389  }
390 
391  }
392 
393  /*
394  * Marks positions of RV, R1 and R2 in a given word.
395  * Creation date: (16/03/2002 3:40:11 PM)
396  */
397  private void markPositions(String word)
398  {
399  RV = 0;
400  R1 = 0;
401  R2 = 0;
402  int i = 0;
403  // find RV
404  while (word.Length > i && !isVowel(word[i]))
405  {
406  i++;
407  }
408  if (word.Length - 1 < ++i)
409  return; // RV zone is empty
410  RV = i;
411  // find R1
412  while (word.Length > i && isVowel(word[i]))
413  {
414  i++;
415  }
416  if (word.Length - 1 < ++i)
417  return; // R1 zone is empty
418  R1 = i;
419  // find R2
420  while (word.Length > i && !isVowel(word[i]))
421  {
422  i++;
423  }
424  if (word.Length - 1 < ++i)
425  return; // R2 zone is empty
426  while (word.Length > i && isVowel(word[i]))
427  {
428  i++;
429  }
430  if (word.Length - 1 < ++i)
431  return; // R2 zone is empty
432  R2 = i;
433  }
434 
435  /*
436  * Checks if character is a vowel..
437  * Creation date: (16/03/2002 10:47:03 PM)
438  * @return bool
439  * @param letter char
440  */
441  private bool isVowel(char letter)
442  {
443  for (int i = 0; i < vowels.Length; i++)
444  {
445  if (letter == vowels[i])
446  return true;
447  }
448  return false;
449  }
450 
451  /*
452  * Noun endings.
453  * Creation date: (17/03/2002 12:14:58 AM)
454  * @param stemmingZone java.lang.StringBuilder
455  */
456  private bool noun(StringBuilder stemmingZone)
457  {
458  return findAndRemoveEnding(stemmingZone, nounEndings);
459  }
460 
461  /*
462  * Perfective gerund endings.
463  * Creation date: (17/03/2002 12:14:58 AM)
464  * @param stemmingZone java.lang.StringBuilder
465  */
466  private bool perfectiveGerund(StringBuilder stemmingZone)
467  {
468  return findAndRemoveEnding(
469  stemmingZone,
470  perfectiveGerundEndings1,
471  perfectiveGerund1Predessors)
472  || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
473  }
474 
475  /*
476  * Reflexive endings.
477  * Creation date: (17/03/2002 12:14:58 AM)
478  * @param stemmingZone java.lang.StringBuilder
479  */
480  private bool reflexive(StringBuilder stemmingZone)
481  {
482  return findAndRemoveEnding(stemmingZone, reflexiveEndings);
483  }
484 
485  /*
486  * Insert the method's description here.
487  * Creation date: (17/03/2002 12:14:58 AM)
488  * @param stemmingZone java.lang.StringBuilder
489  */
490  private bool removeI(StringBuilder stemmingZone)
491  {
492  if (stemmingZone.Length > 0
493  && stemmingZone[stemmingZone.Length - 1] == I)
494  {
495  stemmingZone.Length = stemmingZone.Length - 1;
496  return true;
497  }
498  else
499  {
500  return false;
501  }
502  }
503 
504  /*
505  * Insert the method's description here.
506  * Creation date: (17/03/2002 12:14:58 AM)
507  * @param stemmingZone java.lang.StringBuilder
508  */
509  private bool removeSoft(StringBuilder stemmingZone)
510  {
511  if (stemmingZone.Length > 0
512  && stemmingZone[stemmingZone.Length - 1] == SOFT)
513  {
514  stemmingZone.Length = stemmingZone.Length - 1;
515  return true;
516  }
517  else
518  {
519  return false;
520  }
521  }
522 
523  /*
524  * Finds the stem for given Russian word.
525  * Creation date: (16/03/2002 3:36:48 PM)
526  * @return java.lang.String
527  * @param input java.lang.String
528  */
529  public virtual String Stem(String input)
530  {
531  markPositions(input);
532  if (RV == 0)
533  return input; //RV wasn't detected, nothing to stem
534  StringBuilder stemmingZone = new StringBuilder(input.Substring(RV));
535  // stemming goes on in RV
536  // Step 1
537 
538  if (!perfectiveGerund(stemmingZone))
539  {
540  reflexive(stemmingZone);
541  // variable r is unused, we are just interested in the flow that gets
542  // created by logical expression: apply adjectival(); if that fails,
543  // apply verb() etc
544  bool r =
545  adjectival(stemmingZone)
546  || Verb(stemmingZone)
547  || noun(stemmingZone);
548  }
549  // Step 2
550  removeI(stemmingZone);
551  // Step 3
552  derivational(stemmingZone);
553  // Step 4
554  Superlative(stemmingZone);
555  UndoubleN(stemmingZone);
556  removeSoft(stemmingZone);
557  // return result
558  return input.Substring(0, RV) + stemmingZone.ToString();
559  }
560 
561  /*
562  * Superlative endings.
563  * Creation date: (17/03/2002 12:14:58 AM)
564  * @param stemmingZone java.lang.StringBuilder
565  */
566  private bool Superlative(StringBuilder stemmingZone)
567  {
568  return findAndRemoveEnding(stemmingZone, superlativeEndings);
569  }
570 
571  /*
572  * Undoubles N.
573  * Creation date: (17/03/2002 12:14:58 AM)
574  * @param stemmingZone java.lang.StringBuilder
575  */
576  private bool UndoubleN(StringBuilder stemmingZone)
577  {
578  char[][] doubleN = {
579  new[] {N, N}
580  };
581  if (findEnding(stemmingZone, doubleN) != 0)
582  {
583  stemmingZone.Length = stemmingZone.Length - 1;
584  return true;
585  }
586  else
587  {
588  return false;
589  }
590  }
591 
592  /*
593  * Verb endings.
594  * Creation date: (17/03/2002 12:14:58 AM)
595  * @param stemmingZone java.lang.StringBuilder
596  */
597  private bool Verb(StringBuilder stemmingZone)
598  {
599  return findAndRemoveEnding(
600  stemmingZone,
601  verbEndings1,
602  verb1Predessors)
603  || findAndRemoveEnding(stemmingZone, verbEndings2);
604  }
605 
606  /*
607  * Static method for stemming.
608  */
609  public static String StemWord(String theWord)
610  {
611  RussianStemmer stemmer = new RussianStemmer();
612  return stemmer.Stem(theWord);
613  }
614  }
615 }