Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
GermanAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Collections;
26 using System.Linq;
27 using Lucene.Net.Analysis.Standard;
28 using Lucene.Net.Analysis;
29 using Version = Lucene.Net.Util.Version;
30 
31 namespace Lucene.Net.Analysis.De
32 {
/// <summary>
/// Analyzer for the German language. Supports an external list of stop words (words that
/// will not be indexed at all) and an external list of exclusions (words that will
/// be indexed but not stemmed).
/// A default set of stop words is used unless an alternative list is specified; the
/// exclusion list is empty by default.
/// </summary>
public class GermanAnalyzer : Analyzer
{
    /// <summary>
    /// List of typical German stop words. This array is the source of the set
    /// returned by <see cref="GetDefaultStopSet"/>.
    /// </summary>
    private static readonly String[] GERMAN_STOP_WORDS =
    {
        "einer", "eine", "eines", "einem", "einen",
        "der", "die", "das", "dass", "daß",
        "du", "er", "sie", "es",
        "was", "wer", "wie", "wir",
        "und", "oder", "ohne", "mit",
        "am", "im", "in", "aus", "auf",
        "ist", "sein", "war", "wird",
        "ihr", "ihre", "ihres",
        "als", "für", "von",
        "dich", "dir", "mich", "mir",
        "mein", "kein",
        "durch", "wegen"
    };

    /// <summary>
    /// Returns the shared, unmodifiable set of default German stop words.
    /// </summary>
    /// <returns>The same set instance on every call.</returns>
    public static ISet<string> GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_SET;
    }

    /// <summary>
    /// Holds the single shared instance of the default stop word set.
    /// </summary>
    private static class DefaultSetHolder
    {
        // The explicit IEnumerable<string> cast is kept from the original so the same
        // CharArraySet constructor overload is selected. The second argument (false)
        // is presumably the ignore-case flag - confirm against CharArraySet.
        internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(
            (IEnumerable<string>)GERMAN_STOP_WORDS,
            false));
    }

    /// <summary>
    /// Contains the stop words used with the StopFilter.
    /// </summary>
    private readonly ISet<string> stopSet;

    /// <summary>
    /// Contains words that should be indexed but not stemmed.
    /// </summary>
    // Cannot be readonly yet: the obsolete SetStemExclusionTable overloads reassign it.
    private ISet<string> exclusionSet;

    /// <summary>
    /// Lucene compatibility version handed to the tokenizer and the stop filter.
    /// </summary>
    private readonly Version matchVersion;

    /// <summary>
    /// Whether the DIN-5007-2 style normalization is requested from the stem filter.
    /// </summary>
    private readonly bool _normalizeDin2;

    /// <summary>
    /// Builds an analyzer with the default stop words:
    /// <see cref="GetDefaultStopSet"/>
    /// </summary>
    [Obsolete("Use GermanAnalyzer(Version) instead")]
    public GermanAnalyzer()
        : this(Version.LUCENE_CURRENT)
    {
    }

    /// <summary>
    /// Builds an analyzer with the default stop words:
    /// <see cref="GetDefaultStopSet"/>
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    public GermanAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the default stop words
    /// (<see cref="GetDefaultStopSet"/>) and an explicit choice of stemmer normalization.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
    /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
    /// respectively, before the DIN1 stemmer is invoked.</param>
    public GermanAnalyzer(Version matchVersion, bool normalizeDin2)
        : this(matchVersion, DefaultSetHolder.DEFAULT_SET, normalizeDin2)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and an empty stem exclusion set,
    /// using the default DIN-5007-1 stemmer.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">a stopword set</param>
    public GermanAnalyzer(Version matchVersion, ISet<string> stopwords)
        : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and an empty stem exclusion set.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">a stopword set</param>
    /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
    /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
    /// respectively, before the DIN1 stemmer is invoked.</param>
    public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, bool normalizeDin2)
        : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, normalizeDin2)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and stem exclusions,
    /// using the default DIN-5007-1 stemmer.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">a stopword set</param>
    /// <param name="stemExclusionSet">a stemming exclusion set</param>
    public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
        : this(matchVersion, stopwords, stemExclusionSet, false)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and stem exclusions. Every other
    /// constructor of this class ends up here.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">a stopword set</param>
    /// <param name="stemExclusionSet">a stemming exclusion set</param>
    /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
    /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
    /// respectively, before the DIN1 stemmer is invoked.</param>
    public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool normalizeDin2)
    {
        // Both sets are copied and wrapped so that later changes made by the caller
        // to the sets passed in are not seen through these fields.
        stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
        exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
        this.matchVersion = matchVersion;
        _normalizeDin2 = normalizeDin2;

        // Hook inherited from Analyzer; see the base class for what it records.
        SetOverridesTokenStreamMethod<GermanAnalyzer>();
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">the stop words, turned into a set via StopFilter.MakeStopSet</param>
    [Obsolete("use GermanAnalyzer(Version, Set) instead")]
    public GermanAnalyzer(Version matchVersion, params string[] stopwords)
        : this(matchVersion, StopFilter.MakeStopSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">dictionary whose keys are used as the stop words; the values are ignored</param>
    [Obsolete("Use GermanAnalyzer(Version, ISet)")]
    public GermanAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
        : this(matchVersion, stopwords.Keys.ToArray())
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">file that WordlistLoader.GetWordSet reads the stop words from</param>
    [Obsolete("Use GermanAnalyzer(Version, ISet)")]
    public GermanAnalyzer(Version matchVersion, FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an exclusion list from an array of strings and resets
    /// PreviousTokenStream to null.
    /// </summary>
    /// <param name="exclusionlist">words that should be indexed but not stemmed</param>
    [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
    public void SetStemExclusionTable(String[] exclusionlist)
    {
        exclusionSet = StopFilter.MakeStopSet(exclusionlist);
        PreviousTokenStream = null;
    }

    /// <summary>
    /// Builds an exclusion list from the keys of an IDictionary and resets
    /// PreviousTokenStream to null.
    /// </summary>
    /// <param name="exclusionlist">dictionary whose keys should be indexed but not stemmed; the values are ignored</param>
    [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
    public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
    {
        exclusionSet = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
        PreviousTokenStream = null;
    }

    /// <summary>
    /// Builds an exclusion list from the words contained in the given file and resets
    /// PreviousTokenStream to null.
    /// </summary>
    /// <param name="exclusionlist">file that WordlistLoader.GetWordSet reads the exclusion words from</param>
    [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
    public void SetStemExclusionTable(FileInfo exclusionlist)
    {
        exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
        PreviousTokenStream = null;
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
    /// </summary>
    /// <param name="fieldName">name of the field being analyzed; not used by this implementation</param>
    /// <param name="reader">source of the text to tokenize</param>
    /// <returns>A TokenStream built from a StandardTokenizer filtered with StandardFilter,
    /// LowerCaseFilter, StopFilter and GermanStemFilter, in that order</returns>
    public override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        TokenStream result = new StandardTokenizer(matchVersion, reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
        result = new GermanStemFilter(result, exclusionSet, _normalizeDin2);
        return result;
    }
}
250 }