Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
FrenchAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Text;
26 using System.Collections;
27 
28 using Lucene.Net.Analysis;
29 using Lucene.Net.Analysis.De;
30 using Lucene.Net.Analysis.Standard;
31 using Version = Lucene.Net.Util.Version;
32 
33 namespace Lucene.Net.Analysis.Fr
34 {
35  /*
36  * {@link Analyzer} for French language.
37  * <p>
38  * Supports an external list of stopwords (words that
39  * will not be indexed at all) and an external list of exclusions (words that will
40  * not be stemmed, but indexed).
41  * A default set of stopwords is used unless an alternative list is specified, but the
42  * exclusion list is empty by default.
43  * </p>
44  *
45  * <a name="version"/>
46  * <p>You must specify the required {@link Version}
47  * compatibility when creating FrenchAnalyzer:
48  * <ul>
49  * <li> As of 2.9, StopFilter preserves position
50  * increments</li>
51  * </ul>
52  *
53  * <p><b>NOTE</b>: This class uses the same {@link Version}
54  * dependent settings as {@link StandardAnalyzer}.</p>
55  */
/// <summary>
/// <see cref="Analyzer"/> for the French language. Tokens pass through a
/// <see cref="StandardTokenizer"/>, <see cref="StandardFilter"/>,
/// <see cref="StopFilter"/>, <see cref="FrenchStemFilter"/> and finally a
/// <see cref="LowerCaseFilter"/>.
/// </summary>
public sealed class FrenchAnalyzer : Analyzer
{
    /// <summary>
    /// Extended list of typical French stopwords.
    /// Deprecated: use <see cref="GetDefaultStopSet"/> instead.
    /// </summary>
    // TODO make this private in 3.1
    public static readonly string[] FRENCH_STOP_WORDS = {
        "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
        "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
        "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
        "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
        "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
        "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
        "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
        "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
        "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
        "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
        "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
        "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
        "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
        "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
        "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
        "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
        "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
        "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
        "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
        "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
        "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
        "été", "être", "ô"
    };

    /// <summary>
    /// Contains the stopwords used with the <see cref="StopFilter"/>.
    /// </summary>
    private readonly ISet<string> stoptable;

    /// <summary>
    /// Contains words that should be indexed but not stemmed.
    /// Cannot be readonly while the deprecated SetStemExclusionTable overloads
    /// still reassign it. Every constructor chains to the three-argument one,
    /// which always assigns this field, so no field initializer is needed.
    /// </summary>
    private ISet<string> excltable;

    /// <summary>
    /// Lucene compatibility version handed to the tokenizer and used to pick
    /// the StopFilter position-increment default.
    /// </summary>
    private readonly Version matchVersion;

    /// <summary>
    /// Returns an unmodifiable instance of the default stop-words set.
    /// </summary>
    /// <returns>An unmodifiable instance of the default stop-words set.</returns>
    public static ISet<string> GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holder for the shared default stop set, built from
    /// <see cref="FRENCH_STOP_WORDS"/> and wrapped as unmodifiable.
    /// </summary>
    static class DefaultSetHolder
    {
        internal static readonly ISet<string> DEFAULT_STOP_SET =
            CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false));
    }

    /// <summary>
    /// Builds an analyzer with the default stop words
    /// (<see cref="FRENCH_STOP_WORDS"/>).
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    public FrenchAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and an empty stemming
    /// exclusion set.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">A stopword set.</param>
    public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords)
        : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and stemming exclusion
    /// set. Both sets are copied and wrapped as unmodifiable.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">A stopword set.</param>
    /// <param name="stemExclutionSet">A stemming exclusion set.</param>
    public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclutionSet)
    {
        this.matchVersion = matchVersion;
        this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
        this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclutionSet));
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// Deprecated: use the (Version, ISet&lt;string&gt;) constructor instead.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">Stop words, converted with StopFilter.MakeStopSet.</param>
    public FrenchAnalyzer(Version matchVersion, params string[] stopwords)
        : this(matchVersion, StopFilter.MakeStopSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the stop words loaded from the given file via
    /// WordlistLoader.GetWordSet.
    /// Deprecated: use the (Version, ISet&lt;string&gt;) constructor instead.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">File containing the stop words.</param>
    public FrenchAnalyzer(Version matchVersion, FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an exclusion list from an array of strings.
    /// Deprecated: use the (Version, ISet&lt;string&gt;, ISet&lt;string&gt;)
    /// constructor instead.
    /// </summary>
    /// <param name="exclusionlist">Words that must not be stemmed.</param>
    public void SetStemExclusionTable(params string[] exclusionlist)
    {
        excltable = StopFilter.MakeStopSet(exclusionlist);
        PreviousTokenStream = null; // force a new stemmer to be created
    }

    /// <summary>
    /// Builds an exclusion list from the keys of a dictionary.
    /// Deprecated: use the (Version, ISet&lt;string&gt;, ISet&lt;string&gt;)
    /// constructor instead.
    /// </summary>
    /// <param name="exclusionlist">Dictionary whose keys must not be stemmed.</param>
    public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
    {
        excltable = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
        PreviousTokenStream = null; // force a new stemmer to be created
    }

    /// <summary>
    /// Builds an exclusion list from the words contained in the given file.
    /// Deprecated: use the (Version, ISet&lt;string&gt;, ISet&lt;string&gt;)
    /// constructor instead.
    /// </summary>
    /// <param name="exclusionlist">File containing words that must not be stemmed.</param>
    public void SetStemExclusionTable(FileInfo exclusionlist)
    {
        excltable = WordlistLoader.GetWordSet(exclusionlist);
        PreviousTokenStream = null; // force a new stemmer to be created
    }

    /// <summary>
    /// Wraps a tokenizer in the analyzer's filter chain:
    /// <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
    /// <see cref="FrenchStemFilter"/>, then <see cref="LowerCaseFilter"/>.
    /// Shared by TokenStream and ReusableTokenStream so both paths always
    /// apply the same filters in the same order.
    /// </summary>
    /// <param name="source">The tokenizer producing the raw tokens.</param>
    /// <returns>The outermost filter of the chain.</returns>
    private TokenStream BuildFilterChain(Tokenizer source)
    {
        TokenStream chain = new StandardFilter(source);
        chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                               chain, stoptable);
        chain = new FrenchStemFilter(chain, excltable);
        // Convert to lowercase after stemming!
        chain = new LowerCaseFilter(chain);
        return chain;
    }

    /// <summary>
    /// Creates a <see cref="Lucene.Net.Analysis.TokenStream"/> which tokenizes
    /// all the text in the provided reader.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A stream built from a <see cref="StandardTokenizer"/> filtered with
    /// <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
    /// <see cref="FrenchStemFilter"/> and <see cref="LowerCaseFilter"/>.
    /// </returns>
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        return BuildFilterChain(new StandardTokenizer(matchVersion, reader));
    }

    /// <summary>
    /// Tokenizer/filter-chain pair cached in PreviousTokenStream for reuse.
    /// </summary>
    private sealed class SavedStreams
    {
        internal Tokenizer source;
        internal TokenStream result;
    }

    /// <summary>
    /// Returns a (possibly reused) <see cref="Lucene.Net.Analysis.TokenStream"/>
    /// which tokenizes all the text in the provided reader. The chain is
    /// built on first use and stored in PreviousTokenStream; later calls only
    /// reset the cached tokenizer onto the new reader.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A stream built from a <see cref="StandardTokenizer"/> filtered with
    /// <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
    /// <see cref="FrenchStemFilter"/> and <see cref="LowerCaseFilter"/>.
    /// </returns>
    public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
    {
        SavedStreams streams = (SavedStreams)PreviousTokenStream;
        if (streams == null)
        {
            streams = new SavedStreams();
            streams.source = new StandardTokenizer(matchVersion, reader);
            streams.result = BuildFilterChain(streams.source);
            PreviousTokenStream = streams;
        }
        else
        {
            streams.source.Reset(reader);
        }
        return streams.result;
    }
}
262 }