Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
StopAnalyzer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System.Collections.Generic;
19 using Version = Lucene.Net.Util.Version;
20 
21 namespace Lucene.Net.Analysis
22 {
23 
/// <summary> An <see cref="Analyzer" /> that tokenizes text with
/// <see cref="LowerCaseTokenizer" /> and then removes stop words with
/// <see cref="StopFilter" />.
///
/// <a name="version"/>
/// <p/>
/// You must specify the required <see cref="Version" /> compatibility when creating
/// StopAnalyzer:
/// <list type="bullet">
/// <item>As of 2.9, position increments are preserved</item>
/// </list>
/// </summary>
public sealed class StopAnalyzer : Analyzer
{
    /// <summary>Stop words handed to every <see cref="StopFilter" /> this analyzer creates.</summary>
    private readonly ISet<string> stopWords;

    /// <summary>Position-increment setting handed to every <see cref="StopFilter" /> this analyzer creates.</summary>
    private readonly bool enablePositionIncrements;

    /// <summary>An unmodifiable set containing some common English words that are not usually useful
    /// for searching.
    /// </summary>
    /// <remarks>
    /// Declared <c>readonly</c> so the shared default cannot be swapped for another set after
    /// type initialization; it is assigned exactly once, in the static constructor.
    /// </remarks>
    public static readonly ISet<string> ENGLISH_STOP_WORDS_SET;

    /// <summary>Populates <see cref="ENGLISH_STOP_WORDS_SET" /> with the built-in English stop words.</summary>
    static StopAnalyzer()
    {
        var englishStopWords = new string[]
        {
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
            "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
            "there", "these", "they", "this", "to", "was", "will", "with"
        };

        // NOTE(review): the second argument is presumably "ignoreCase" - confirm against CharArraySet.
        var stopSet = new CharArraySet(englishStopWords.Length, false);
        stopSet.AddAll(englishStopWords);
        ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet);
    }

    /// <summary>Builds an analyzer which removes the words in <see cref="ENGLISH_STOP_WORDS_SET" />.</summary>
    /// <param name="matchVersion">See <a href="#version">above</a>
    /// </param>
    public StopAnalyzer(Version matchVersion)
    {
        stopWords = ENGLISH_STOP_WORDS_SET;
        enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    }

    /// <summary>Builds an analyzer with the stop words from the given set.</summary>
    /// <param name="matchVersion">See <a href="#version">above</a>
    /// </param>
    /// <param name="stopWords">Set of stop words; stored as given, without copying
    /// </param>
    public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
    {
        this.stopWords = stopWords;
        enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    }

    /// <summary>Builds an analyzer with the stop words from the given file.</summary>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)" />
    /// <param name="matchVersion">See <a href="#version">above</a>
    /// </param>
    /// <param name="stopwordsFile">File to load stop words from
    /// </param>
    public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
    {
        stopWords = WordlistLoader.GetWordSet(stopwordsFile);
        enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    }

    /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)" />
    /// <param name="matchVersion">See <a href="#version">above</a>
    /// </param>
    /// <param name="stopwords">Reader to load stop words from
    /// </param>
    public StopAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
    {
        stopWords = WordlistLoader.GetWordSet(stopwords);
        enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    }

    /// <summary>Filters LowerCaseTokenizer with StopFilter.</summary>
    /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer
    /// </param>
    /// <param name="reader">Reader supplying the text to tokenize
    /// </param>
    /// <returns>A newly created <see cref="StopFilter" /> wrapping a new <see cref="LowerCaseTokenizer" />
    /// </returns>
    public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
    {
        return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
    }

    /// <summary>
    /// Holds the tokenizer and the filter chain built on top of it, so that
    /// <see cref="ReusableTokenStream" /> can store both in <c>PreviousTokenStream</c>
    /// and reuse them on later calls.
    /// </summary>
    private sealed class SavedStreams
    {
        /// <summary>Tokenizer at the bottom of the chain; reset with a new reader on reuse.</summary>
        internal Tokenizer source;

        /// <summary>Stream returned to callers (the stop filter wrapping <see cref="source" />).</summary>
        internal TokenStream result;
    }

    /// <summary>
    /// Filters LowerCaseTokenizer with StopFilter, reusing the chain saved in
    /// <c>PreviousTokenStream</c> when one exists instead of building a new one.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer
    /// </param>
    /// <param name="reader">Reader supplying the text to tokenize
    /// </param>
    /// <returns>The saved filter chain, with its tokenizer reading from <paramref name="reader" />
    /// </returns>
    public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
    {
        var streams = (SavedStreams) PreviousTokenStream;
        if (streams == null)
        {
            // First use: build the chain once and save it for later calls.
            Tokenizer source = new LowerCaseTokenizer(reader);
            streams = new SavedStreams
            {
                source = source,
                result = new StopFilter(enablePositionIncrements, source, stopWords)
            };
            PreviousTokenStream = streams;
        }
        else
        {
            // Later use: only the tokenizer needs to be pointed at the new reader.
            streams.source.Reset(reader);
        }

        return streams.result;
    }
}
141 }