Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
PorterStemmer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /*
19 
20 Porter stemmer in Java. The original paper is in
21 
22 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
23 no. 3, pp 130-137,
24 
25 See also http://www.tartarus.org/~martin/PorterStemmer/index.html
26 
27 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
28 Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
29 is then out outside the bounds of b.
30 
31 Similarly,
32 
33 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
34 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
35 b[j] is then outside the bounds of b.
36 
37 Release 3.
38 
39 [ This version is derived from Release 3, modified by Brian Goetz to
40 optimize for fewer object creations. ]
41 */
42 using System;
43 namespace Lucene.Net.Analysis
44 {
45 
46  /// <summary>
47  /// Stemmer, implementing the Porter Stemming Algorithm
48  ///
49  /// The Stemmer class transforms a word into its root form. The input
50  /// word can be provided a character at time (by calling add()), or at once
51  /// by calling one of the various stem(something) methods.
52  /// </summary>
53 
55  {
56  private char[] b;
57  private int i, j, k, k0;
58  private bool dirty = false;
59  private const int INC = 50; /* unit of size whereby b is increased */
60  private const int EXTRA = 1;
61 
62  public PorterStemmer()
63  {
64  b = new char[INC];
65  i = 0;
66  }
67 
68  /// <summary> reset() resets the stemmer so it can stem another word. If you invoke
69  /// the stemmer by calling add(char) and then stem(), you must call reset()
70  /// before starting another word.
71  /// </summary>
72  public virtual void Reset()
73  {
74  i = 0; dirty = false;
75  }
76 
77  /// <summary> Add a character to the word being stemmed. When you are finished
78  /// adding characters, you can call stem(void) to process the word.
79  /// </summary>
80  public virtual void Add(char ch)
81  {
82  if (b.Length <= i + EXTRA)
83  {
84  var new_b = new char[b.Length + INC];
85  Array.Copy(b, 0, new_b, 0, b.Length);
86  b = new_b;
87  }
88  b[i++] = ch;
89  }
90 
91  /// <summary> After a word has been stemmed, it can be retrieved by toString(),
92  /// or a reference to the internal buffer can be retrieved by getResultBuffer
93  /// and getResultLength (which is generally more efficient.)
94  /// </summary>
95  public override System.String ToString()
96  {
97  return new System.String(b, 0, i);
98  }
99 
100  /// <summary> Returns the length of the word resulting from the stemming process.</summary>
101  public virtual int ResultLength
102  {
103  get { return i; }
104  }
105 
106  /// <summary> Returns a reference to a character buffer containing the results of
107  /// the stemming process. You also need to consult getResultLength()
108  /// to determine the length of the result.
109  /// </summary>
110  public virtual char[] ResultBuffer
111  {
112  get { return b; }
113  }
114 
115  /* cons(i) is true <=> b[i] is a consonant. */
116 
117  private bool Cons(int i)
118  {
119  switch (b[i])
120  {
121 
122  case 'a':
123  case 'e':
124  case 'i':
125  case 'o':
126  case 'u':
127  return false;
128 
129  case 'y':
130  return (i == k0)?true:!Cons(i - 1);
131 
132  default:
133  return true;
134 
135  }
136  }
137 
138  /* m() measures the number of consonant sequences between k0 and j. if c is
139  a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
140  presence,
141 
142  <c><v> gives 0
143  <c>vc<v> gives 1
144  <c>vcvc<v> gives 2
145  <c>vcvcvc<v> gives 3
146  ....
147  */
148 
149  private int M()
150  {
151  int n = 0;
152  int i = k0;
153  while (true)
154  {
155  if (i > j)
156  return n;
157  if (!Cons(i))
158  break;
159  i++;
160  }
161  i++;
162  while (true)
163  {
164  while (true)
165  {
166  if (i > j)
167  return n;
168  if (Cons(i))
169  break;
170  i++;
171  }
172  i++;
173  n++;
174  while (true)
175  {
176  if (i > j)
177  return n;
178  if (!Cons(i))
179  break;
180  i++;
181  }
182  i++;
183  }
184  }
185 
186  /* vowelinstem() is true <=> k0,...j contains a vowel */
187 
188  private bool Vowelinstem()
189  {
190  int i;
191  for (i = k0; i <= j; i++)
192  if (!Cons(i))
193  return true;
194  return false;
195  }
196 
197  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
198 
199  private bool Doublec(int j)
200  {
201  if (j < k0 + 1)
202  return false;
203  if (b[j] != b[j - 1])
204  return false;
205  return Cons(j);
206  }
207 
208  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
209  and also if the second c is not w,x or y. this is used when trying to
210  restore an e at the end of a short word. e.g.
211 
212  cav(e), lov(e), hop(e), crim(e), but
213  snow, box, tray.
214 
215  */
216 
217  private bool Cvc(int i)
218  {
219  if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
220  return false;
221  else
222  {
223  int ch = b[i];
224  if (ch == 'w' || ch == 'x' || ch == 'y')
225  return false;
226  }
227  return true;
228  }
229 
230  private bool Ends(System.String s)
231  {
232  int l = s.Length;
233  int o = k - l + 1;
234  if (o < k0)
235  return false;
236  for (int i = 0; i < l; i++)
237  if (b[o + i] != s[i])
238  return false;
239  j = k - l;
240  return true;
241  }
242 
243  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
244  k. */
245 
246  internal virtual void Setto(System.String s)
247  {
248  int l = s.Length;
249  int o = j + 1;
250  for (int i = 0; i < l; i++)
251  b[o + i] = s[i];
252  k = j + l;
253  dirty = true;
254  }
255 
256  /* r(s) is used further down. */
257 
258  internal virtual void R(System.String s)
259  {
260  if (M() > 0)
261  Setto(s);
262  }
263 
264  /* step1() gets rid of plurals and -ed or -ing. e.g.
265 
266  caresses -> caress
267  ponies -> poni
268  ties -> ti
269  caress -> caress
270  cats -> cat
271 
272  feed -> feed
273  agreed -> agree
274  disabled -> disable
275 
276  matting -> mat
277  mating -> mate
278  meeting -> meet
279  milling -> mill
280  messing -> mess
281 
282  meetings -> meet
283 
284  */
285 
286  private void Step1()
287  {
288  if (b[k] == 's')
289  {
290  if (Ends("sses"))
291  k -= 2;
292  else if (Ends("ies"))
293  Setto("i");
294  else if (b[k - 1] != 's')
295  k--;
296  }
297  if (Ends("eed"))
298  {
299  if (M() > 0)
300  k--;
301  }
302  else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
303  {
304  k = j;
305  if (Ends("at"))
306  Setto("ate");
307  else if (Ends("bl"))
308  Setto("ble");
309  else if (Ends("iz"))
310  Setto("ize");
311  else if (Doublec(k))
312  {
313  int ch = b[k--];
314  if (ch == 'l' || ch == 's' || ch == 'z')
315  k++;
316  }
317  else if (M() == 1 && Cvc(k))
318  Setto("e");
319  }
320  }
321 
322  /* step2() turns terminal y to i when there is another vowel in the stem. */
323 
324  private void Step2()
325  {
326  if (Ends("y") && Vowelinstem())
327  {
328  b[k] = 'i';
329  dirty = true;
330  }
331  }
332 
333  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
334  -ation) maps to -ize etc. note that the string before the suffix must give
335  m() > 0. */
336 
337  private void Step3()
338  {
339  if (k == k0)
340  return ; /* For Bug 1 */
341  switch (b[k - 1])
342  {
343 
344  case 'a':
345  if (Ends("ational"))
346  {
347  R("ate"); break;
348  }
349  if (Ends("tional"))
350  {
351  R("tion"); break;
352  }
353  break;
354 
355  case 'c':
356  if (Ends("enci"))
357  {
358  R("ence"); break;
359  }
360  if (Ends("anci"))
361  {
362  R("ance"); break;
363  }
364  break;
365 
366  case 'e':
367  if (Ends("izer"))
368  {
369  R("ize"); break;
370  }
371  break;
372 
373  case 'l':
374  if (Ends("bli"))
375  {
376  R("ble"); break;
377  }
378  if (Ends("alli"))
379  {
380  R("al"); break;
381  }
382  if (Ends("entli"))
383  {
384  R("ent"); break;
385  }
386  if (Ends("eli"))
387  {
388  R("e"); break;
389  }
390  if (Ends("ousli"))
391  {
392  R("ous"); break;
393  }
394  break;
395 
396  case 'o':
397  if (Ends("ization"))
398  {
399  R("ize"); break;
400  }
401  if (Ends("ation"))
402  {
403  R("ate"); break;
404  }
405  if (Ends("ator"))
406  {
407  R("ate"); break;
408  }
409  break;
410 
411  case 's':
412  if (Ends("alism"))
413  {
414  R("al"); break;
415  }
416  if (Ends("iveness"))
417  {
418  R("ive"); break;
419  }
420  if (Ends("fulness"))
421  {
422  R("ful"); break;
423  }
424  if (Ends("ousness"))
425  {
426  R("ous"); break;
427  }
428  break;
429 
430  case 't':
431  if (Ends("aliti"))
432  {
433  R("al"); break;
434  }
435  if (Ends("iviti"))
436  {
437  R("ive"); break;
438  }
439  if (Ends("biliti"))
440  {
441  R("ble"); break;
442  }
443  break;
444 
445  case 'g':
446  if (Ends("logi"))
447  {
448  R("log"); break;
449  }
450  break;
451  }
452  }
453 
454  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
455 
456  private void Step4()
457  {
458  switch (b[k])
459  {
460 
461  case 'e':
462  if (Ends("icate"))
463  {
464  R("ic"); break;
465  }
466  if (Ends("ative"))
467  {
468  R(""); break;
469  }
470  if (Ends("alize"))
471  {
472  R("al"); break;
473  }
474  break;
475 
476  case 'i':
477  if (Ends("iciti"))
478  {
479  R("ic"); break;
480  }
481  break;
482 
483  case 'l':
484  if (Ends("ical"))
485  {
486  R("ic"); break;
487  }
488  if (Ends("ful"))
489  {
490  R(""); break;
491  }
492  break;
493 
494  case 's':
495  if (Ends("ness"))
496  {
497  R(""); break;
498  }
499  break;
500  }
501  }
502 
503  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
504 
505  private void Step5()
506  {
507  if (k == k0)
508  return ; /* for Bug 1 */
509  switch (b[k - 1])
510  {
511 
512  case 'a':
513  if (Ends("al"))
514  break;
515  return ;
516 
517  case 'c':
518  if (Ends("ance"))
519  break;
520  if (Ends("ence"))
521  break;
522  return ;
523 
524  case 'e':
525  if (Ends("er"))
526  break; return ;
527 
528  case 'i':
529  if (Ends("ic"))
530  break; return ;
531 
532  case 'l':
533  if (Ends("able"))
534  break;
535  if (Ends("ible"))
536  break; return ;
537 
538  case 'n':
539  if (Ends("ant"))
540  break;
541  if (Ends("ement"))
542  break;
543  if (Ends("ment"))
544  break;
545  /* element etc. not stripped before the m */
546  if (Ends("ent"))
547  break;
548  return ;
549 
550  case 'o':
551  if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
552  break;
553  /* j >= 0 fixes Bug 2 */
554  if (Ends("ou"))
555  break;
556  return ;
557  /* takes care of -ous */
558 
559  case 's':
560  if (Ends("ism"))
561  break;
562  return ;
563 
564  case 't':
565  if (Ends("ate"))
566  break;
567  if (Ends("iti"))
568  break;
569  return ;
570 
571  case 'u':
572  if (Ends("ous"))
573  break;
574  return ;
575 
576  case 'v':
577  if (Ends("ive"))
578  break;
579  return ;
580 
581  case 'z':
582  if (Ends("ize"))
583  break;
584  return ;
585 
586  default:
587  return ;
588 
589  }
590  if (M() > 1)
591  k = j;
592  }
593 
594  /* step6() removes a final -e if m() > 1. */
595 
596  private void Step6()
597  {
598  j = k;
599  if (b[k] == 'e')
600  {
601  int a = M();
602  if (a > 1 || a == 1 && !Cvc(k - 1))
603  k--;
604  }
605  if (b[k] == 'l' && Doublec(k) && M() > 1)
606  k--;
607  }
608 
609 
610  /// <summary> Stem a word provided as a String. Returns the result as a String.</summary>
611  public virtual System.String Stem(System.String s)
612  {
613  if (Stem(s.ToCharArray(), s.Length))
614  {
615  return ToString();
616  }
617  else
618  return s;
619  }
620 
621  /// <summary>Stem a word contained in a char[]. Returns true if the stemming process
622  /// resulted in a word different from the input. You can retrieve the
623  /// result with getResultLength()/getResultBuffer() or toString().
624  /// </summary>
625  public virtual bool Stem(char[] word)
626  {
627  return Stem(word, word.Length);
628  }
629 
630  /// <summary>Stem a word contained in a portion of a char[] array. Returns
631  /// true if the stemming process resulted in a word different from
632  /// the input. You can retrieve the result with
633  /// getResultLength()/getResultBuffer() or toString().
634  /// </summary>
635  public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
636  {
637  Reset();
638  if (b.Length < wordLen)
639  {
640  var new_b = new char[wordLen + EXTRA];
641  b = new_b;
642  }
643  Array.Copy(wordBuffer, offset, b, 0, wordLen);
644  i = wordLen;
645  return Stem(0);
646  }
647 
648  /// <summary>Stem a word contained in a leading portion of a char[] array.
649  /// Returns true if the stemming process resulted in a word different
650  /// from the input. You can retrieve the result with
651  /// getResultLength()/getResultBuffer() or toString().
652  /// </summary>
653  public virtual bool Stem(char[] word, int wordLen)
654  {
655  return Stem(word, 0, wordLen);
656  }
657 
658  /// <summary>Stem the word placed into the Stemmer buffer through calls to add().
659  /// Returns true if the stemming process resulted in a word different
660  /// from the input. You can retrieve the result with
661  /// getResultLength()/getResultBuffer() or toString().
662  /// </summary>
663  public virtual bool Stem()
664  {
665  return Stem(0);
666  }
667 
668  public virtual bool Stem(int i0)
669  {
670  k = i - 1;
671  k0 = i0;
672  if (k > k0 + 1)
673  {
674  Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
675  }
676  // Also, a word is considered dirty if we lopped off letters
677  // Thanks to Ifigenia Vairelles for pointing this out.
678  if (i != k + 1)
679  dirty = true;
680  i = k + 1;
681  return dirty;
682  }
683 
684  /// <summary>Test program for demonstrating the Stemmer. It reads a file and
685  /// stems each word, writing the result to standard out.
686  /// Usage: Stemmer file-name
687  /// </summary>
688  [STAThread]
689  public static void Main(System.String[] args)
690  {
691  var s = new PorterStemmer();
692 
693  for (int i = 0; i < args.Length; i++)
694  {
695  try
696  {
697  System.IO.Stream in_Renamed = new System.IO.FileStream(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read);
698  var buffer = new byte[1024];
699 
700  int bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
701  int offset = 0;
702  s.Reset();
703 
704  while (true)
705  {
706  int ch;
707  if (offset < bufferLen)
708  ch = buffer[offset++];
709  else
710  {
711  bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
712  offset = 0;
713  if (bufferLen < 0)
714  ch = - 1;
715  else
716  ch = buffer[offset++];
717  }
718 
719  if (Char.IsLetter((char) ch))
720  {
721  s.Add(Char.ToLower((char) ch));
722  }
723  else
724  {
725  s.Stem();
726  Console.Out.Write(s.ToString());
727  s.Reset();
728  if (ch < 0)
729  break;
730  else
731  {
732  System.Console.Out.Write((char) ch);
733  }
734  }
735  }
736 
737  in_Renamed.Close();
738  }
739  catch (System.IO.IOException)
740  {
741  Console.Out.WriteLine("error reading " + args[i]);
742  }
743  }
744  }
745  }
746 }