Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
GermanStemmer.cs
Go to the documentation of this file.
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
21 
22 using System;
23 using System.IO;
24 using System.Text;
25 using System.Collections;
26 
namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// A stemmer for German words. The algorithm is based on the report
    /// "A Fast and Simple Stemming Algorithm for German Words" by J&#246;rg
    /// Caumanns (joerg.caumanns@isst.fhg.de).
    /// </summary>
    /// <remarks>
    /// Every call to <see cref="Stem"/> reuses the same working buffer and
    /// substitution counter, which are instance fields; a single instance
    /// therefore must not be used by several threads at the same time.
    /// </remarks>
    public class GermanStemmer
    {
        // Placeholder characters that stand in for common letter groups while
        // suffixes are stripped. Term characters are letters only (see
        // IsStemmable), so these non-letter tokens cannot clash with them.
        private const char DoubledCharToken = '*';   // second char of a doubled pair
        private const char SchToken = '$';           // "sch"
        private const char ChToken = '\u00a7';       // "ch" (section sign)
        private const char EiToken = '%';            // "ei"
        private const char IeToken = '&';            // "ie"
        private const char IgToken = '#';            // "ig"
        private const char StToken = '!';            // "st"

        /// <summary>
        /// Buffer for the terms while stemming them.
        /// </summary>
        private readonly StringBuilder sb = new StringBuilder();

        /// <summary>
        /// Number of characters that were removed by <c>Substitute()</c> while
        /// stemming the current term. It is also incremented once for every
        /// sharp s that is expanded to "ss". <c>Strip()</c> adds it to the
        /// buffer length when it applies its minimum-length rules.
        /// </summary>
        protected int substCount = 0;

        /// <summary>
        /// Stems the given term to a unique <c>discriminator</c>.
        /// </summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>
        /// Discriminator for <c>term</c>. Terms that contain a non-letter
        /// character are returned lower-cased but otherwise unchanged; the
        /// empty string is returned as the empty string.
        /// </returns>
        internal String Stem( String term )
        {
            // Use lowercase for medium stemming. The invariant culture keeps the
            // result independent of the current thread culture.
            term = term.ToLowerInvariant();
            if ( !IsStemmable( term ) )
            {
                return term;
            }

            // Reset the shared buffer and load the term.
            sb.Length = 0;
            sb.Append( term );

            // Stemming starts here...
            Substitute( sb );
            Strip( sb );
            Optimize( sb );
            Resubstitute( sb );
            RemoveParticleDenotion( sb );
            return sb.ToString();
        }

        /// <summary>
        /// Checks if a term could be stemmed.
        /// </summary>
        /// <param name="term">The term to inspect.</param>
        /// <returns>
        /// true if, and only if, the given term consists of letters only
        /// (which is trivially the case for the empty string).
        /// </returns>
        private bool IsStemmable( String term )
        {
            for ( int c = 0; c < term.Length; c++ )
            {
                if ( !Char.IsLetter( term[c] ) )
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Tells whether the buffer ends with the given suffix, comparing
        /// character by character without copying the buffer into a string.
        /// </summary>
        /// <param name="buffer">The buffer to inspect.</param>
        /// <param name="suffix">The suffix to look for.</param>
        /// <returns>true if the last characters of the buffer equal the suffix.</returns>
        private static bool EndsWith( StringBuilder buffer, String suffix )
        {
            int offset = buffer.Length - suffix.Length;
            if ( offset < 0 )
            {
                return false;
            }
            for ( int i = 0; i < suffix.Length; i++ )
            {
                if ( buffer[offset + i] != suffix[i] )
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Suffix stripping (stemming) on the current term. The stripping is reduced
        /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
        /// from which all regular suffixes are built. The simplification causes
        /// some overstemming, and way more irregular stems, but still provides unique
        /// discriminators in most of those cases.
        /// The algorithm is context free, except for the length restrictions.
        /// </summary>
        /// <param name="buffer">The buffer holding the substituted term; modified in place.</param>
        private void Strip( StringBuilder buffer )
        {
            // Never strip a term down to fewer than three characters.
            while ( buffer.Length > 3 )
            {
                // Length of the term including the characters that Substitute() masked away.
                int effectiveLength = buffer.Length + substCount;
                char last = buffer[buffer.Length - 1];

                if ( effectiveLength > 5 && EndsWith( buffer, "nd" ) )
                {
                    buffer.Length -= 2;
                }
                else if ( effectiveLength > 4 &&
                    ( EndsWith( buffer, "em" ) || EndsWith( buffer, "er" ) ) )
                {
                    buffer.Length -= 2;
                }
                else if ( last == 'e' || last == 's' || last == 'n' || last == 't' )
                {
                    // "t" occurs only as suffix of verbs.
                    buffer.Length -= 1;
                }
                else
                {
                    break;
                }
            }
        }

        /// <summary>
        /// Does some optimizations on the term. These optimizations are contextual.
        /// </summary>
        /// <param name="buffer">The buffer holding the stripped term; modified in place.</param>
        private void Optimize( StringBuilder buffer )
        {
            // Additional step for female plurals of professions and inhabitants:
            // "erin*" is what is left of "...erinnen" after substitution and stripping.
            if ( buffer.Length > 5 && EndsWith( buffer, "erin*" ) )
            {
                buffer.Length -= 1;
                Strip( buffer );
            }

            // Additional step for irregular plural nouns like "Matrizen -> Matrix".
            // The length check keeps an empty term from indexing before the buffer start.
            if ( buffer.Length > 0 && buffer[buffer.Length - 1] == 'z' )
            {
                buffer[buffer.Length - 1] = 'x';
            }
        }

        /// <summary>
        /// Removes a particle denotion ("ge") from a term: the first occurrence
        /// of "gege" is shortened to "ge".
        /// </summary>
        /// <param name="buffer">The buffer holding the resubstituted term; modified in place.</param>
        private void RemoveParticleDenotion( StringBuilder buffer )
        {
            if ( buffer.Length > 4 )
            {
                int index = buffer.ToString().IndexOf( "gege", StringComparison.Ordinal );
                if ( index >= 0 )
                {
                    buffer.Remove( index, 2 );
                }
            }
        }

        /// <summary>
        /// Replaces the character group starting at <paramref name="index"/> with a
        /// single token character and records how many characters were dropped.
        /// </summary>
        /// <param name="buffer">The buffer to modify.</param>
        /// <param name="index">Position of the first character of the group.</param>
        /// <param name="token">The token that replaces the group.</param>
        /// <param name="removed">Number of characters removed after the token.</param>
        private void Mask( StringBuilder buffer, int index, char token, int removed )
        {
            buffer[index] = token;
            buffer.Remove( index + 1, removed );
            // Accumulate: earlier substitutions in the same term must stay counted.
            substCount += removed;
        }

        /// <summary>
        /// Do some substitutions for the term to reduce overstemming:
        ///
        /// - Substitute Umlauts with their corresponding vowel: &#228;&#246;&#252; -&gt; aou,
        ///   "&#223;" is substituted by "ss"
        /// - Substitute a second char of a pair of equal characters with
        ///   an asterisk: ?? -&gt; ?*
        /// - Substitute some common character combinations with a token:
        ///   sch/ch/ei/ie/ig/st -&gt; $/&#167;/%/&amp;/#/!
        /// </summary>
        /// <param name="buffer">The buffer holding the lower-cased term; modified in place.</param>
        protected virtual void Substitute( StringBuilder buffer )
        {
            substCount = 0;
            for ( int c = 0; c < buffer.Length; c++ )
            {
                // Replace the second char of a pair of equal characters with an asterisk.
                if ( c > 0 && buffer[c] == buffer[c - 1] )
                {
                    buffer[c] = DoubledCharToken;
                }
                // Substitute Umlauts.
                else if ( buffer[c] == '\u00e4' ) // a-umlaut
                {
                    buffer[c] = 'a';
                }
                else if ( buffer[c] == '\u00f6' ) // o-umlaut
                {
                    buffer[c] = 'o';
                }
                else if ( buffer[c] == '\u00fc' ) // u-umlaut
                {
                    buffer[c] = 'u';
                }
                // Expand the sharp s here, outside the look-ahead section below,
                // so that it is also replaced at the end of a word.
                else if ( buffer[c] == '\u00df' ) // sharp s
                {
                    buffer[c] = 's';
                    buffer.Insert( c + 1, 's' );
                    substCount++;
                }

                // Take care that at least one character follows the current one.
                if ( c < buffer.Length - 1 )
                {
                    char current = buffer[c];
                    char next = buffer[c + 1];

                    // Mask several common character combinations with a token.
                    if ( c < buffer.Length - 2 && current == 's' && next == 'c' && buffer[c + 2] == 'h' )
                    {
                        Mask( buffer, c, SchToken, 2 );
                    }
                    else if ( current == 'c' && next == 'h' )
                    {
                        Mask( buffer, c, ChToken, 1 );
                    }
                    else if ( current == 'e' && next == 'i' )
                    {
                        Mask( buffer, c, EiToken, 1 );
                    }
                    else if ( current == 'i' && next == 'e' )
                    {
                        Mask( buffer, c, IeToken, 1 );
                    }
                    else if ( current == 'i' && next == 'g' )
                    {
                        Mask( buffer, c, IgToken, 1 );
                    }
                    else if ( current == 's' && next == 't' )
                    {
                        Mask( buffer, c, StToken, 1 );
                    }
                }
            }
        }

        /// <summary>
        /// Undoes the changes made by Substitute(). That are character pairs and
        /// character combinations. Umlauts will remain as their corresponding vowel,
        /// as "&#223;" remains as "ss".
        /// </summary>
        /// <param name="buffer">The buffer holding the stemmed term; modified in place.</param>
        private void Resubstitute( StringBuilder buffer )
        {
            for ( int c = 0; c < buffer.Length; c++ )
            {
                switch ( buffer[c] )
                {
                    case DoubledCharToken:
                        // Restore the doubled character from its left neighbour.
                        buffer[c] = buffer[c - 1];
                        break;
                    case SchToken:
                        buffer[c] = 's';
                        buffer.Insert( c + 1, "ch" );
                        break;
                    case ChToken:
                        buffer[c] = 'c';
                        buffer.Insert( c + 1, 'h' );
                        break;
                    case EiToken:
                        buffer[c] = 'e';
                        buffer.Insert( c + 1, 'i' );
                        break;
                    case IeToken:
                        buffer[c] = 'i';
                        buffer.Insert( c + 1, 'e' );
                        break;
                    case IgToken:
                        buffer[c] = 'i';
                        buffer.Insert( c + 1, 'g' );
                        break;
                    case StToken:
                        buffer[c] = 's';
                        buffer.Insert( c + 1, 't' );
                        break;
                }
            }
        }
    }
}