Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
FrenchStemmer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Text;
24 
25 namespace Lucene.Net.Analysis.Fr
26 {
27 
28 /*
29  * A stemmer for French words.
30  * <p>
31  * The algorithm is based on the work of
32  * Dr Martin Porter on his snowball project<br>
33  * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
34  * (French stemming algorithm) for details
35  * </p>
36  */
37 
/// <summary>
/// A stemmer for French words, based on the French stemming algorithm of
/// Dr Martin Porter's Snowball project.
/// </summary>
/// <remarks>
/// An instance keeps the word being stemmed and its search regions in mutable
/// fields that are reused by every call to <see cref="Stem"/>, so one instance
/// should not be used by several callers at the same time.
/// All suffix tests are ordinal: after a match the buffer is shortened by the
/// suffix's exact char count, which is only valid for an exact char-by-char match.
/// </remarks>
public class FrenchStemmer
{
    // ---- Suffix tables (searched in order; the first match wins) ----

    private static readonly String[] Step1DeleteSuffixes =
        { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
    private static readonly String[] LogieSuffixes = { "logies", "logie" };
    private static readonly String[] UsionSuffixes = { "usions", "utions", "usion", "ution" };
    private static readonly String[] EnceSuffixes = { "ences", "ence" };
    private static readonly String[] AtionSuffixes =
        { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
    private static readonly String[] EmentSuffixes = { "ements", "ement" };
    private static readonly String[] IssementSuffixes = { "issements", "issement" };
    private static readonly String[] IteSuffixes = { "it\u00e9s", "it\u00e9" };
    private static readonly String[] IfSuffixes = { "ifs", "ives", "if", "ive" };
    private static readonly String[] EauxSuffixes = { "eaux" };
    private static readonly String[] AuxSuffixes = { "aux" };
    private static readonly String[] EuseSuffixes = { "euses", "euse" };
    private static readonly String[] EuxSuffixes = { "eux" };
    private static readonly String[] AmmentSuffixes = { "amment" };
    private static readonly String[] EmmentSuffixes = { "emment" };
    private static readonly String[] MentSuffixes = { "ments", "ment" };

    private static readonly String[] Step2ASuffixes =
    {
        "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira",
        "irent", "iriez", "irez", "irions", "irons", "iront",
        "issaIent", "issais", "issantes", "issante", "issants", "issant",
        "issait", "issions", "issons", "issiez", "issez", "issent",
        "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i"
    };

    private static readonly String[] Step2BDeleteSuffixes =
    {
        "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
        "erons", "eront", "erez", "\u00e8rent", "era", "\u00e9es", "iez",
        "\u00e9e", "\u00e9s", "er", "ez", "\u00e9"
    };

    private static readonly String[] Step2BASuffixes =
    {
        "assions", "assiez", "assent", "asses", "asse", "aIent",
        "antes", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant",
        "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a"
    };

    private static readonly String[] IonsSuffixes = { "ions" };
    private static readonly String[] IonSuffixes = { "ion" };
    private static readonly String[] IereSuffixes = { "I\u00e8re", "i\u00e8re", "Ier", "ier" };
    private static readonly String[] ESuffixes = { "e" };
    private static readonly String[] EDiaeresisSuffixes = { "\u00eb" };

    // ---- Working state ----

    /// <summary>Buffer holding the term while it is being stemmed.</summary>
    private readonly StringBuilder sb = new StringBuilder();

    /// <summary>Temporary buffer used to derive R2 from R1.</summary>
    private readonly StringBuilder tb = new StringBuilder();

    /// <summary>Region R0: the whole buffer.</summary>
    private String R0;

    /// <summary>
    /// Region RV: "If the word begins with two vowels, RV is the region after the third letter,
    /// otherwise the region after the first vowel not at the beginning of the word,
    /// or the end of the word if these positions cannot be found."
    /// <c>null</c> stands for the empty region.
    /// </summary>
    private String RV;

    /// <summary>
    /// Region R1: "the region after the first non-vowel following a vowel,
    /// or the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    private String R1;

    /// <summary>
    /// Region R2: "the region after the first non-vowel in R1 following a vowel,
    /// or the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    private String R2;

    /// <summary>Set by step 1 when one of amment / emment / ments / ment was handled, which forces step 2.</summary>
    private bool suite;

    /// <summary>Set to true as soon as one of the suffix helpers changed the buffer.</summary>
    private bool modified;

    /// <summary>
    /// Stems the given term.
    /// </summary>
    /// <param name="term">The term to stem.</param>
    /// <returns>
    /// The stem of <paramref name="term"/>, or <paramref name="term"/> itself (unchanged)
    /// when <see cref="IsStemmable"/> rejects it.
    /// </returns>
    protected internal String Stem(String term)
    {
        if (!IsStemmable(term))
        {
            return term;
        }

        // Culture-invariant lower-casing: the suffix tables are plain lower-case
        // literals, and a culture-specific mapping (e.g. Turkish "I" -> dotless i)
        // would make otherwise identical terms stem differently per machine.
        term = term.ToLowerInvariant();

        // Reset the working state for this term.
        sb.Length = 0;
        sb.Append(term);
        modified = false;
        suite = false;

        TreatVowels(sb);
        SetStrings();

        Step1();

        // Step 2 runs when step 1 removed nothing, or when it asked for it via "suite".
        if ((!modified || suite) && RV != null)
        {
            suite = Step2A();
            if (!suite)
            {
                Step2B();
            }
        }

        if (modified || suite)
        {
            Step3();
        }
        else
        {
            Step4();
        }

        Step5();
        Step6();

        return sb.ToString();
    }

    /// <summary>
    /// Recomputes the region strings R0, RV, R1 and R2 from the buffer.
    /// Must be called after every modification of the buffer.
    /// </summary>
    private void SetStrings()
    {
        R0 = sb.ToString();
        RV = RetrieveRV(sb);
        R1 = RetrieveR(sb);

        if (R1 == null)
        {
            R2 = null;
            return;
        }

        // R2 is obtained by applying the R rule to R1 itself.
        tb.Length = 0;
        tb.Append(R1);
        R2 = RetrieveR(tb);
    }

    /// <summary>
    /// Step 1: standard suffix removal. The helpers are applied one after another,
    /// each one seeing the regions as left by the previous one.
    /// </summary>
    private void Step1()
    {
        DeleteFrom(R2, Step1DeleteSuffixes);

        ReplaceFrom(R2, LogieSuffixes, "log");
        ReplaceFrom(R2, UsionSuffixes, "u");
        ReplaceFrom(R2, EnceSuffixes, "ent");

        DeleteButSuffixFromElseReplace(R2, AtionSuffixes, "ic", true, R0, "iqU");

        DeleteButSuffixFromElseReplace(R2, EmentSuffixes, "eus", false, R0, "eux");
        DeleteButSuffixFrom(R2, EmentSuffixes, "ativ", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "iv", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "abl", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "iqU", false);

        DeleteFromIfTestVowelBeforeIn(R1, IssementSuffixes, false, R0);
        DeleteFrom(RV, EmentSuffixes);

        DeleteButSuffixFromElseReplace(R2, IteSuffixes, "abil", false, R0, "abl");
        DeleteButSuffixFromElseReplace(R2, IteSuffixes, "ic", false, R0, "iqU");
        DeleteButSuffixFrom(R2, IteSuffixes, "iv", true);

        DeleteButSuffixFromElseReplace(R2, IfSuffixes, "icat", false, R0, "iqU");
        DeleteButSuffixFromElseReplace(R2, IfSuffixes, "at", true, R2, "iqU");

        ReplaceFrom(R0, EauxSuffixes, "eau");

        ReplaceFrom(R1, AuxSuffixes, "al");

        DeleteButSuffixFromElseReplace(R2, EuseSuffixes, "", true, R1, "eux");

        DeleteFrom(R2, EuxSuffixes);

        // If any of the following three applies, step 2 has to run afterwards.
        if (ReplaceFrom(RV, AmmentSuffixes, "ant"))
        {
            suite = true;
        }
        if (ReplaceFrom(RV, EmmentSuffixes, "ent"))
        {
            suite = true;
        }
        if (DeleteFromIfTestVowelBeforeIn(RV, MentSuffixes, true, RV))
        {
            suite = true;
        }
    }

    /// <summary>
    /// Step 2a: removes a verb suffix beginning with "i" from RV when the letter
    /// before it is not a vowel.
    /// </summary>
    /// <returns><c>true</c> if the buffer was changed.</returns>
    private bool Step2A()
    {
        return DeleteFromIfTestVowelBeforeIn(RV, Step2ASuffixes, false, RV);
    }

    /// <summary>
    /// Step 2b: removes other verb suffixes. Called when step 2a changed nothing.
    /// </summary>
    private void Step2B()
    {
        DeleteFrom(RV, Step2BDeleteSuffixes);

        // "a..." suffixes are deleted, together with a preceding "e" when present.
        DeleteButSuffixFrom(RV, Step2BASuffixes, "e", true);

        DeleteFrom(R2, IonsSuffixes);
    }

    /// <summary>
    /// Step 3: replaces a final 'Y' with 'i' or a final c-cedilla with 'c'.
    /// </summary>
    private void Step3()
    {
        if (sb.Length == 0)
        {
            return;
        }

        int last = sb.Length - 1;
        char ch = sb[last];
        if (ch == 'Y')
        {
            sb[last] = 'i';
            SetStrings();
        }
        else if (ch == '\u00e7') // c with cedilla
        {
            sb[last] = 'c';
            SetStrings();
        }
    }

    /// <summary>
    /// Step 4: residual suffix removal.
    /// </summary>
    private void Step4()
    {
        // Drop a final 's' unless it is preceded by one of a, i, o, u, e-grave or s.
        if (sb.Length > 1 && sb[sb.Length - 1] == 's')
        {
            char b = sb[sb.Length - 2];
            if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != '\u00e8' && b != 's')
            {
                sb.Length = sb.Length - 1;
                SetStrings();
            }
        }

        // Delete "ion" in R2 when RV shows it is preceded by 's', otherwise by 't'.
        if (!DeleteFromIfPrecededIn(R2, IonSuffixes, RV, "s"))
        {
            DeleteFromIfPrecededIn(R2, IonSuffixes, RV, "t");
        }

        ReplaceFrom(RV, IereSuffixes, "i");
        DeleteFrom(RV, ESuffixes);
        DeleteFromIfPrecededIn(RV, EDiaeresisSuffixes, R0, "gu");
    }

    /// <summary>
    /// Step 5: un-doubles the ending - if the word ends with enn, onn, ett, ell or eill,
    /// the last letter is removed.
    /// </summary>
    private void Step5()
    {
        if (R0 == null)
        {
            return;
        }

        if (R0.EndsWith("enn", StringComparison.Ordinal)
            || R0.EndsWith("onn", StringComparison.Ordinal)
            || R0.EndsWith("ett", StringComparison.Ordinal)
            || R0.EndsWith("ell", StringComparison.Ordinal)
            || R0.EndsWith("eill", StringComparison.Ordinal))
        {
            sb.Length = sb.Length - 1;
            SetStrings();
        }
    }

    /// <summary>
    /// Step 6: if the word ends with e-acute or e-grave followed by at least one
    /// non-vowel, the accent is removed from that letter.
    /// </summary>
    private void Step6()
    {
        if (R0 == null || R0.Length == 0)
        {
            return;
        }

        // Skip the trailing run of non-vowels to reach the last vowel of the word.
        int lastVowel = R0.Length - 1;
        while (lastVowel >= 0 && !IsVowel(R0[lastVowel]))
        {
            lastVowel--;
        }

        bool followedByNonVowel = lastVowel < R0.Length - 1;
        if (lastVowel >= 0 && followedByNonVowel
            && (R0[lastVowel] == '\u00e9' || R0[lastVowel] == '\u00e8'))
        {
            sb[lastVowel] = 'e';
        }
    }

    /// <summary>
    /// Deletes the first suffix of <paramref name="search"/> that ends
    /// <paramref name="source"/>, provided <paramref name="from"/> ends with
    /// <paramref name="prefix"/> followed by that suffix.
    /// Does not touch the <c>modified</c> flag.
    /// </summary>
    /// <param name="source">The primary zone to search; <c>null</c> means nothing to do.</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="from">The secondary zone in which the prefix is tested; may be <c>null</c>.</param>
    /// <param name="prefix">The text that must precede the suffix in <paramref name="from"/>.</param>
    /// <returns><c>true</c> if the buffer was changed.</returns>
    private bool DeleteFromIfPrecededIn(String source, String[] search, String from, String prefix)
    {
        if (source == null)
        {
            return false;
        }

        foreach (String suffix in search)
        {
            if (source.EndsWith(suffix, StringComparison.Ordinal)
                && from != null
                && from.EndsWith(prefix + suffix, StringComparison.Ordinal))
            {
                sb.Length = sb.Length - suffix.Length;
                SetStrings();
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Deletes the first suffix of <paramref name="search"/> that ends
    /// <paramref name="source"/> and whose preceding letter in the buffer is
    /// (or is not) a vowel, as requested by <paramref name="vowel"/>.
    /// </summary>
    /// <param name="source">The primary zone to search.</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="vowel"><c>true</c> to require a vowel before the suffix, <c>false</c> to require a non-vowel.</param>
    /// <param name="from">The zone that must be long enough to contain the suffix plus the tested letter.</param>
    /// <returns><c>true</c> if the buffer was changed.</returns>
    private bool DeleteFromIfTestVowelBeforeIn(String source, String[] search, bool vowel, String from)
    {
        if (source == null || from == null)
        {
            return false;
        }

        foreach (String suffix in search)
        {
            if (!source.EndsWith(suffix, StringComparison.Ordinal))
            {
                continue;
            }
            // The tested letter must itself lie inside "from".
            if (suffix.Length + 1 > from.Length)
            {
                continue;
            }

            bool precededByVowel = IsVowel(sb[sb.Length - (suffix.Length + 1)]);
            if (precededByVowel == vowel)
            {
                sb.Length = sb.Length - suffix.Length;
                modified = true;
                SetStrings();
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// For the first suffix that applies: deletes <paramref name="prefix"/> plus the suffix
    /// when <paramref name="source"/> ends with both, or - if <paramref name="without"/> is
    /// set - just the suffix when <paramref name="source"/> ends with it.
    /// </summary>
    /// <param name="source">The zone to search; <c>null</c> means nothing to do.</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="prefix">The text expected in front of the suffix.</param>
    /// <param name="without"><c>true</c> to delete the suffix even when the prefix is absent.</param>
    private void DeleteButSuffixFrom(String source, String[] search, String prefix, bool without)
    {
        if (source == null)
        {
            return;
        }

        foreach (String suffix in search)
        {
            int removed;
            if (source.EndsWith(prefix + suffix, StringComparison.Ordinal))
            {
                removed = prefix.Length + suffix.Length;
            }
            else if (without && source.EndsWith(suffix, StringComparison.Ordinal))
            {
                removed = suffix.Length;
            }
            else
            {
                continue;
            }

            sb.Length = sb.Length - removed;
            modified = true;
            SetStrings();
            return;
        }
    }

    /// <summary>
    /// For the first suffix that applies, in this order of preference:
    /// deletes <paramref name="prefix"/> plus the suffix when <paramref name="source"/> ends with both;
    /// else replaces <paramref name="prefix"/> plus the suffix with <paramref name="replace"/>
    /// when <paramref name="from"/> ends with both;
    /// else - if <paramref name="without"/> is set - deletes just the suffix when
    /// <paramref name="source"/> ends with it.
    /// </summary>
    /// <param name="source">The primary zone to search; <c>null</c> means nothing to do.</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="prefix">The text expected in front of the suffix.</param>
    /// <param name="without"><c>true</c> to delete the suffix even when the prefix is absent.</param>
    /// <param name="from">The secondary zone tested for the replacement case; may be <c>null</c>.</param>
    /// <param name="replace">The replacement text for the replacement case.</param>
    private void DeleteButSuffixFromElseReplace(String source, String[] search, String prefix, bool without, String from, String replace)
    {
        if (source == null)
        {
            return;
        }

        foreach (String suffix in search)
        {
            String prefixed = prefix + suffix;

            if (source.EndsWith(prefixed, StringComparison.Ordinal))
            {
                sb.Length = sb.Length - prefixed.Length;
            }
            else if (from != null && from.EndsWith(prefixed, StringComparison.Ordinal))
            {
                sb.Length = sb.Length - prefixed.Length;
                sb.Append(replace);
            }
            else if (without && source.EndsWith(suffix, StringComparison.Ordinal))
            {
                sb.Length = sb.Length - suffix.Length;
            }
            else
            {
                continue;
            }

            modified = true;
            SetStrings();
            return;
        }
    }

    /// <summary>
    /// Replaces the first suffix of <paramref name="search"/> that ends
    /// <paramref name="source"/> with <paramref name="replace"/>.
    /// </summary>
    /// <param name="source">The zone to search; <c>null</c> means nothing to do.</param>
    /// <param name="search">The suffixes to look for, in priority order.</param>
    /// <param name="replace">The replacement text.</param>
    /// <returns><c>true</c> if the buffer was changed.</returns>
    private bool ReplaceFrom(String source, String[] search, String replace)
    {
        if (source == null)
        {
            return false;
        }

        foreach (String suffix in search)
        {
            if (source.EndsWith(suffix, StringComparison.Ordinal))
            {
                sb.Length = sb.Length - suffix.Length;
                sb.Append(replace);

                modified = true;
                SetStrings();
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Deletes the first suffix of <paramref name="suffix"/> that ends <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The zone to search; <c>null</c> means nothing to do.</param>
    /// <param name="suffix">The suffixes to look for, in priority order.</param>
    private void DeleteFrom(String source, String[] suffix)
    {
        if (source == null)
        {
            return;
        }

        foreach (String candidate in suffix)
        {
            if (source.EndsWith(candidate, StringComparison.Ordinal))
            {
                sb.Length = sb.Length - candidate.Length;
                modified = true;
                SetStrings();
                return;
            }
        }
    }

    /// <summary>
    /// Tests whether a char is a lower-case French vowel, accented ones included.
    /// Upper-case letters are never vowels here, which is what lets
    /// <see cref="TreatVowels"/> hide a letter from the vowel tests by upper-casing it.
    /// </summary>
    /// <param name="ch">The char to test.</param>
    /// <returns><c>true</c> if the char is a vowel.</returns>
    private bool IsVowel(char ch)
    {
        switch (ch)
        {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case '\u00e2': // a circumflex
            case '\u00e0': // a grave
            case '\u00eb': // e diaeresis
            case '\u00e9': // e acute
            case '\u00ea': // e circumflex
            case '\u00e8': // e grave
            case '\u00ef': // i diaeresis
            case '\u00ee': // i circumflex
            case '\u00f4': // o circumflex
            case '\u00fc': // u diaeresis
            case '\u00f9': // u grave
            case '\u00fb': // u circumflex
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Retrieves the "R zone" of a buffer (R1 when given the word, R2 when given R1):
    /// the region after the first non-vowel following a vowel.
    /// </summary>
    /// <param name="buffer">The buffer to analyse.</param>
    /// <returns>The region, or <c>null</c> when it is empty or cannot be found.</returns>
    private String RetrieveR(StringBuilder buffer)
    {
        int len = buffer.Length;

        // Locate the first vowel.
        int firstVowel = -1;
        for (int c = 0; c < len; c++)
        {
            if (IsVowel(buffer[c]))
            {
                firstVowel = c;
                break;
            }
        }
        if (firstVowel < 0)
        {
            return null;
        }

        // Locate the first non-vowel after it.
        int consonne = -1;
        for (int c = firstVowel; c < len; c++)
        {
            if (!IsVowel(buffer[c]))
            {
                consonne = c;
                break;
            }
        }
        if (consonne < 0 || consonne + 1 >= len)
        {
            return null;
        }

        return buffer.ToString(consonne + 1, len - (consonne + 1));
    }

    /// <summary>
    /// Retrieves the "RV zone" of a buffer: "If the word begins with two vowels, RV is the
    /// region after the third letter, otherwise the region after the first vowel not at the
    /// beginning of the word, or the end of the word if these positions cannot be found."
    /// </summary>
    /// <remarks>
    /// Buffers of three chars or fewer always yield <c>null</c>.
    /// NOTE(review): when no vowel is found after the first char, the region after the
    /// first char is returned instead of the empty region the quoted rule calls for.
    /// This is left unchanged so existing stems do not shift - confirm against the
    /// reference algorithm before altering it.
    /// </remarks>
    /// <param name="buffer">The buffer to analyse.</param>
    /// <returns>The region, or <c>null</c> when it is empty.</returns>
    private String RetrieveRV(StringBuilder buffer)
    {
        int len = buffer.Length;
        if (len <= 3)
        {
            return null;
        }

        if (IsVowel(buffer[0]) && IsVowel(buffer[1]))
        {
            return buffer.ToString(3, len - 3);
        }

        // First vowel that is not the first letter; stays 0 when there is none.
        int pos = 0;
        for (int c = 1; c < len; c++)
        {
            if (IsVowel(buffer[c]))
            {
                pos = c;
                break;
            }
        }

        if (pos + 1 >= len)
        {
            return null;
        }
        return buffer.ToString(pos + 1, len - (pos + 1));
    }

    /// <summary>
    /// Upper-cases, in place, the vowels that must be treated as consonants:
    /// 'u' and 'i' both preceded and followed by a vowel, 'y' preceded or followed
    /// by a vowel, and 'u' preceded by 'q'.
    /// </summary>
    /// <remarks>
    /// Neighbours are read from the buffer as it currently is, so a preceding letter
    /// that was just upper-cased no longer counts as a vowel.
    /// </remarks>
    /// <param name="buffer">The buffer to treat.</param>
    /// <returns>The same <paramref name="buffer"/> instance.</returns>
    private StringBuilder TreatVowels(StringBuilder buffer)
    {
        int last = buffer.Length - 1;

        for (int c = 0; c <= last; c++)
        {
            char ch = buffer[c];

            if (c == 0)
            {
                // First char: only 'y' followed by a vowel is affected.
                if (last > 0 && ch == 'y' && IsVowel(buffer[1]))
                {
                    buffer[c] = 'Y';
                }
            }
            else if (c == last)
            {
                // Last char: only the preceding letter can be tested.
                if (ch == 'u' && buffer[c - 1] == 'q')
                {
                    buffer[c] = 'U';
                }
                if (ch == 'y' && IsVowel(buffer[c - 1]))
                {
                    buffer[c] = 'Y';
                }
            }
            else
            {
                switch (ch)
                {
                    case 'u':
                        if (buffer[c - 1] == 'q'
                            || (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1])))
                        {
                            buffer[c] = 'U';
                        }
                        break;
                    case 'i':
                        if (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'I';
                        }
                        break;
                    case 'y':
                        if (IsVowel(buffer[c - 1]) || IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'Y';
                        }
                        break;
                }
            }
        }

        return buffer;
    }

    /// <summary>
    /// Checks whether a term can be processed: it must consist of letters only and
    /// may contain an upper-case letter only as its first char.
    /// </summary>
    /// <param name="term">The term to check.</param>
    /// <returns><c>true</c> if the term can be stemmed.</returns>
    private bool IsStemmable(String term)
    {
        for (int c = 0; c < term.Length; c++)
        {
            // Discard terms that contain non-letter chars.
            if (!char.IsLetter(term[c]))
            {
                return false;
            }
            // Discard terms with an upper-case letter anywhere but at the start
            // (this also covers terms with several upper-case letters).
            if (c > 0 && char.IsUpper(term[c]))
            {
                return false;
            }
        }
        return true;
    }
}
725 
726 }