Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
RussianAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.Linq;
25 using System.Text;
26 using System.IO;
27 using System.Collections;
28 using Lucene.Net.Analysis;
29 using Version = Lucene.Net.Util.Version;
30 
namespace Lucene.Net.Analysis.Ru
{
    /// <summary>
    /// Analyzer for the Russian language. Supports an external list of stopwords
    /// (words that will not be indexed at all).
    /// A default set of stopwords is used unless an alternative list is specified.
    /// </summary>
    public sealed class RussianAnalyzer : Analyzer
    {
        /// <summary>
        /// List of typical Russian stopwords.
        /// </summary>
        private static readonly String[] RUSSIAN_STOP_WORDS = {
            "а", "без", "более", "бы", "был", "была", "были",
            "было", "быть", "в",
            "вам", "вас", "весь", "во", "вот", "все", "всего",
            "всех", "вы", "где",
            "да", "даже", "для", "до", "его", "ее", "ей", "ею",
            "если", "есть",
            "еще", "же", "за", "здесь", "и", "из", "или", "им",
            "их", "к", "как",
            "ко", "когда", "кто", "ли", "либо", "мне", "может",
            "мы", "на", "надо",
            "наш", "не", "него", "нее", "нет", "ни", "них", "но",
            "ну", "о", "об",
            "однако", "он", "она", "они", "оно", "от", "очень",
            "по", "под", "при",
            "с", "со", "так", "также", "такой", "там", "те", "тем",
            "то", "того",
            "тоже", "той", "только", "том", "ты", "у", "уже",
            "хотя", "чего", "чей",
            "чем", "что", "чтобы", "чье", "чья", "эта", "эти",
            "это", "я"
        };

        /// <summary>
        /// Holds the default stop set: an unmodifiable <see cref="CharArraySet"/>
        /// built from <see cref="RUSSIAN_STOP_WORDS"/>.
        /// </summary>
        private static class DefaultSetHolder
        {
            // NOTE(review): the second CharArraySet argument is presumably the
            // ignoreCase flag -- confirm against the CharArraySet constructor.
            internal static readonly ISet<string> DEFAULT_STOP_SET =
                CharArraySet.UnmodifiableSet(
                    new CharArraySet((IEnumerable<string>)RUSSIAN_STOP_WORDS, false));
        }

        /// <summary>
        /// Contains the stopwords used with the StopFilter.
        /// </summary>
        private readonly ISet<string> stopSet;

        /// <summary>
        /// Compatibility version; used to choose the StopFilter
        /// position-increment default.
        /// </summary>
        private readonly Version matchVersion;

        /// <summary>
        /// Builds an analyzer that uses the default Russian stop word set.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        public RussianAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// Deprecated: use <see cref="RussianAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">Stop words, converted with <c>StopFilter.MakeStopSet</c>.</param>
        public RussianAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">A stopword set; the analyzer keeps its own unmodifiable copy.</param>
        public RussianAnalyzer(Version matchVersion, ISet<string> stopwords)
        {
            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Builds an analyzer whose stop words are the keys of the given dictionary.
        /// Deprecated: use <see cref="RussianAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">Dictionary whose keys are the stop words; values are ignored.</param>
        public RussianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
            : this(matchVersion, stopwords.Keys.ToArray())
        {
        }

        /// <summary>
        /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
        /// provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to analyze.</param>
        /// <returns>
        /// A <see cref="TokenStream"/> built from a <see cref="RussianLetterTokenizer"/>
        /// filtered with <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// and <see cref="RussianStemFilter"/>.
        /// </returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new RussianLetterTokenizer(reader);
            result = new LowerCaseFilter(result);
            // Same stop-filter stage as ReusableTokenStream, so both entry points
            // produce the same chain.
            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    result, stopSet);
            result = new RussianStemFilter(result);
            return result;
        }

        /// <summary>
        /// Pairs the tokenizer at the head of a chain with the final filtered
        /// stream so the chain can be re-pointed at a new reader.
        /// </summary>
        private class SavedStreams
        {
            internal Tokenizer source;
            internal TokenStream result;
        }

        /// <summary>
        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all
        /// the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to analyze.</param>
        /// <returns>
        /// A <see cref="TokenStream"/> built from a <see cref="RussianLetterTokenizer"/>
        /// filtered with <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// and <see cref="RussianStemFilter"/>.
        /// </returns>
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            SavedStreams streams = (SavedStreams)PreviousTokenStream;
            if (streams == null)
            {
                // First use: build the chain and store it in PreviousTokenStream.
                streams = new SavedStreams();
                streams.source = new RussianLetterTokenizer(reader);
                streams.result = new LowerCaseFilter(streams.source);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stopSet);
                streams.result = new RussianStemFilter(streams.result);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse the stored chain; only the tokenizer is pointed at the new reader.
                streams.source.Reset(reader);
            }
            return streams.result;
        }
    }
}