Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
DutchAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Collections;
26 using Lucene.Net.Analysis.Standard;
27 using Lucene.Net.Support;
28 using Version = Lucene.Net.Util.Version;
29 
namespace Lucene.Net.Analysis.Nl
{
    /// <summary>
    /// <see cref="Analyzer"/> for the Dutch language.
    /// <para>
    /// Supports an external list of stopwords (words that will not be indexed at all),
    /// an external list of exclusions (words that will not be stemmed, but indexed) and
    /// an external list of word-stem pairs that overrule the algorithm (dictionary stemming).
    /// A default set of stopwords is used unless an alternative list is specified, but the
    /// exclusion list is empty by default.
    /// </para>
    /// <para>
    /// <b>NOTE</b>: This class uses the same <c>Version</c> dependent settings as
    /// <see cref="StandardAnalyzer"/>.
    /// </para>
    /// </summary>
    public class DutchAnalyzer : Analyzer
    {
        /// <summary>
        /// List of typical Dutch stopwords.
        /// </summary>
        /// <remarks>Deprecated: use <see cref="getDefaultStopSet"/> instead.</remarks>
        public static readonly String[] DUTCH_STOP_WORDS =
        {
            "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
            "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
            "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
            "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
            "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
            "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
            "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
            "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
            "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
            "uw", "iemand", "geweest", "andere"
        };

        /// <summary>
        /// Returns an unmodifiable instance of the default stop-words set.
        /// </summary>
        /// <returns>An unmodifiable instance of the default stop-words set.</returns>
        public static ISet<string> getDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_STOP_SET;
        }

        /// <summary>
        /// Holder for the default stop set, which is built from
        /// <see cref="DUTCH_STOP_WORDS"/> when this holder type is initialized.
        /// </summary>
        private static class DefaultSetHolder
        {
            // The explicit IEnumerable<string> cast is kept from the original code;
            // presumably it selects the intended CharArraySet constructor overload.
            internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet
                .UnmodifiableSet(new CharArraySet((IEnumerable<string>)DUTCH_STOP_WORDS, false));
        }

        /// <summary>
        /// Contains the stopwords used with the <see cref="StopFilter"/>.
        /// </summary>
        private readonly ISet<string> stoptable;

        /// <summary>
        /// Contains words that should be indexed but not stemmed.
        /// </summary>
        private ISet<string> excltable = Support.Compatibility.SetFactory.CreateHashSet<string>();

        /// <summary>
        /// Word-to-stem pairs handed to the <see cref="DutchStemFilter"/>.
        /// </summary>
        private IDictionary<String, String> stemdict = new HashMap<String, String>();

        /// <summary>
        /// Version value forwarded to the tokenizer and to the stop filter configuration.
        /// </summary>
        private readonly Version matchVersion;

        /// <summary>
        /// Builds an analyzer with the default stop words (<see cref="DUTCH_STOP_WORDS"/>)
        /// and a few default entries for the stem dictionary.
        /// </summary>
        /// <param name="matchVersion">Version compatibility setting.</param>
        public DutchAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
        {
            // These default entries are only registered by this constructor overload.
            stemdict.Add("fiets", "fiets"); //otherwise fiet
            stemdict.Add("bromfiets", "bromfiets"); //otherwise bromfiet
            stemdict.Add("ei", "eier");
            stemdict.Add("kind", "kinder");
        }

        /// <summary>
        /// Builds an analyzer with the given stop words and an empty stem exclusion set.
        /// </summary>
        /// <param name="matchVersion">Version compatibility setting.</param>
        /// <param name="stopwords">Words that will not be indexed.</param>
        public DutchAnalyzer(Version matchVersion, ISet<string> stopwords)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words and stem exclusion words.
        /// Both sets are copied and wrapped as unmodifiable sets.
        /// </summary>
        /// <param name="matchVersion">Version compatibility setting.</param>
        /// <param name="stopwords">Words that will not be indexed.</param>
        /// <param name="stemExclusionTable">Words that are indexed but not stemmed.</param>
        public DutchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
        {
            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionTable));
            this.matchVersion = matchVersion;
            SetOverridesTokenStreamMethod<DutchAnalyzer>();
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Version compatibility setting.</param>
        /// <param name="stopwords">Words that will not be indexed.</param>
        /// <remarks>Deprecated: use the <c>DutchAnalyzer(Version, ISet&lt;string&gt;)</c> constructor instead.</remarks>
        public DutchAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Version compatibility setting.</param>
        /// <param name="stopwords">Words that will not be indexed.</param>
        /// <remarks>Deprecated: use the <c>DutchAnalyzer(Version, ISet&lt;string&gt;)</c> constructor instead.</remarks>
        public DutchAnalyzer(Version matchVersion, HashSet<string> stopwords)
            : this(matchVersion, (ISet<string>)stopwords)
        {
        }

        /// <summary>
        /// Builds an analyzer with the stop words read from the given file.
        /// </summary>
        /// <param name="matchVersion">Version compatibility setting.</param>
        /// <param name="stopwords">File containing the stop words.</param>
        /// <exception cref="Exception">
        /// Wraps the <see cref="IOException"/> raised while reading the file.
        /// </exception>
        /// <remarks>Deprecated: use the <c>DutchAnalyzer(Version, ISet&lt;string&gt;)</c> constructor instead.</remarks>
        public DutchAnalyzer(Version matchVersion, FileInfo stopwords)
        {
            // this is completely broken!
            // (Comment carried over from the original source. Unlike the set-based
            // constructor, the loaded set is stored as returned by the loader, without
            // the copy/unmodifiable wrapping, and the exclusion set keeps its
            // field-initialized empty value.)
            SetOverridesTokenStreamMethod<DutchAnalyzer>();
            try
            {
                stoptable = WordlistLoader.GetWordSet(stopwords);
            }
            catch (IOException e)
            {
                // TODO: throw IOException
                throw new Exception("Unable to read the stop word file.", e);
            }
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Builds an exclusion list from an array of strings.
        /// </summary>
        /// <param name="exclusionlist">Words that are indexed but not stemmed.</param>
        /// <remarks>Deprecated: use the <c>DutchAnalyzer(Version, ISet&lt;string&gt;, ISet&lt;string&gt;)</c> constructor instead.</remarks>
        public void SetStemExclusionTable(params string[] exclusionlist)
        {
            excltable = StopFilter.MakeStopSet(exclusionlist);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Uses the given set as the exclusion list. The set is stored as passed, not copied.
        /// </summary>
        /// <param name="exclusionlist">Words that are indexed but not stemmed.</param>
        /// <remarks>Deprecated: use the <c>DutchAnalyzer(Version, ISet&lt;string&gt;, ISet&lt;string&gt;)</c> constructor instead.</remarks>
        public void SetStemExclusionTable(ISet<string> exclusionlist)
        {
            excltable = exclusionlist;
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Builds an exclusion list from the words contained in the given file.
        /// </summary>
        /// <param name="exclusionlist">File containing the words to exclude from stemming.</param>
        /// <exception cref="Exception">
        /// Wraps the <see cref="IOException"/> raised while reading the file.
        /// </exception>
        /// <remarks>Deprecated: use the <c>DutchAnalyzer(Version, ISet&lt;string&gt;, ISet&lt;string&gt;)</c> constructor instead.</remarks>
        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            try
            {
                excltable = WordlistLoader.GetWordSet(exclusionlist);
                PreviousTokenStream = null; // force a new stemmer to be created
            }
            catch (IOException e)
            {
                // TODO: throw IOException
                throw new Exception("Unable to read the stem exclusion file.", e);
            }
        }

        /// <summary>
        /// Reads a stem dictionary file that overrules the stemming algorithm.
        /// This is a text file that contains per line
        /// <c>word\tstem</c>, i.e. two tab separated words.
        /// </summary>
        /// <param name="stemdictFile">File containing the word/stem pairs.</param>
        /// <exception cref="Exception">
        /// Wraps the <see cref="IOException"/> raised while reading the file.
        /// </exception>
        public void SetStemDictionary(FileInfo stemdictFile)
        {
            try
            {
                stemdict = WordlistLoader.GetStemDict(stemdictFile);
                PreviousTokenStream = null; // force a new stemmer to be created
            }
            catch (IOException e)
            {
                // TODO: throw IOException
                throw new Exception("Unable to read the stem dictionary file.", e);
            }
        }

        /// <summary>
        /// Creates a <c>TokenStream</c> which tokenizes all the text in the
        /// provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used by this method).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// A <c>TokenStream</c> built from a <see cref="StandardTokenizer"/>
        /// filtered with <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
        /// and <see cref="DutchStemFilter"/>.
        /// </returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new StandardFilter(result);
            // Same stop filter configuration as the chain built in ReusableTokenStream.
            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    result, stoptable);
            result = new DutchStemFilter(result, excltable, stemdict);
            return result;
        }

        /// <summary>
        /// Tokenizer and filter chain cached between calls to
        /// <c>ReusableTokenStream</c>.
        /// </summary>
        private sealed class SavedStreams
        {
            internal Tokenizer source;
            internal TokenStream result;
        }

        /// <summary>
        /// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// A <c>TokenStream</c> built from a <see cref="StandardTokenizer"/>
        /// filtered with <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
        /// and <see cref="DutchStemFilter"/>.
        /// </returns>
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return TokenStream(fieldName, reader);
            }

            SavedStreams streams = (SavedStreams)PreviousTokenStream;
            if (streams == null)
            {
                // Nothing cached yet (or the cache was cleared by one of the
                // Set* methods): build the chain once and remember it.
                streams = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new StandardFilter(streams.source);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stoptable);
                streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse the cached chain; only the tokenizer is pointed at the new reader.
                streams.source.Reset(reader);
            }
            return streams.result;
        }
    }
}