Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
StopFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using Lucene.Net.Analysis.Tokenattributes;
21 using Lucene.Net.Util;
22 using QueryParser = Lucene.Net.QueryParsers.QueryParser;
23 using Version = Lucene.Net.Util.Version;
24 
namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Removes stop words from a token stream.
    /// </summary>
    /// <remarks>
    /// Tokens whose term text is contained in the configured stop set are dropped.
    /// When <see cref="EnablePositionIncrements"/> is <c>true</c>, the position
    /// increments of the dropped tokens are added to the next token that is let through.
    /// </remarks>
    public sealed class StopFilter : TokenFilter
    {
        private readonly CharArraySet stopWords;
        private bool enablePositionIncrements;

        private readonly ITermAttribute termAtt;
        private readonly IPositionIncrementAttribute posIncrAtt;

        /// <summary>
        /// Constructs a token stream filtering the given input.
        /// <para>
        /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
        /// <c>MakeStopSet()</c> was used to construct the set) it is used directly
        /// and <c>ignoreCase</c> is ignored, since the <c>CharArraySet</c>
        /// itself controls case sensitivity.
        /// </para>
        /// <para>
        /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
        /// a new <c>CharArraySet</c> is constructed, <c>ignoreCase</c> is passed to it
        /// to specify its case sensitivity, and the given words are copied into it.
        /// </para>
        /// </summary>
        /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
        /// <param name="input">Input TokenStream</param>
        /// <param name="stopWords">The set of stop words; must not be <c>null</c></param>
        /// <param name="ignoreCase">Case-insensitivity flag handed to the newly created <see cref="CharArraySet" />;
        /// unused when <c>stopWords</c> already is a <see cref="CharArraySet" /></param>
        /// <exception cref="ArgumentNullException"><c>stopWords</c> is <c>null</c></exception>
        public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
            : base(input)
        {
            if (stopWords == null)
            {
                throw new ArgumentNullException("stopWords");
            }

            // Reuse an existing CharArraySet as-is; otherwise copy the words into a new one.
            var charArraySet = stopWords as CharArraySet;
            if (charArraySet == null)
            {
                charArraySet = new CharArraySet(stopWords.Count, ignoreCase);
                charArraySet.AddAll(stopWords);
            }

            this.stopWords = charArraySet;
            this.enablePositionIncrements = enablePositionIncrements;
            termAtt = AddAttribute<ITermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        }

        /// <summary>
        /// Constructs a filter which removes words from the input
        /// TokenStream that are named in the set. Equivalent to calling the
        /// four-argument constructor with <c>ignoreCase</c> set to <c>false</c>.
        /// </summary>
        /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
        /// <param name="in">Input stream</param>
        /// <param name="stopWords">The set of stop words; must not be <c>null</c></param>
        /// <exception cref="ArgumentNullException"><c>stopWords</c> is <c>null</c></exception>
        /// <seealso cref="MakeStopSet(string[])"/>
        public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords)
            : this(enablePositionIncrements, @in, stopWords, false)
        {
        }

        /// <summary>
        /// Builds a set from an array of stop words,
        /// appropriate for passing into the StopFilter constructor.
        /// This permits the stop word set to be constructed once and cached when
        /// an Analyzer is constructed.
        /// </summary>
        /// <param name="stopWords">The stop words</param>
        /// <returns>A set containing the words</returns>
        /// <seealso cref="MakeStopSet(string[], bool)">passing false to ignoreCase</seealso>
        public static ISet<string> MakeStopSet(params string[] stopWords)
        {
            return MakeStopSet(stopWords, false);
        }

        /// <summary>
        /// Builds a set from a list of stop words,
        /// appropriate for passing into the StopFilter constructor.
        /// This permits the stop word set to be constructed once and cached when
        /// an Analyzer is constructed.
        /// </summary>
        /// <param name="stopWords">A list of objects whose <c>ToString()</c> values are the stop words</param>
        /// <returns>A set containing the words</returns>
        /// <seealso cref="MakeStopSet(IList{object}, bool)">passing false to ignoreCase</seealso>
        public static ISet<string> MakeStopSet(IList<object> stopWords)
        {
            return MakeStopSet(stopWords, false);
        }

        /// <summary>
        /// Builds a set from an array of stop words, with the given case-insensitivity flag.
        /// </summary>
        /// <param name="stopWords">An array of stop words; must not be <c>null</c></param>
        /// <param name="ignoreCase">Case-insensitivity flag handed to the created <see cref="CharArraySet" /></param>
        /// <returns>A set (<see cref="CharArraySet"/>) containing the words</returns>
        /// <exception cref="ArgumentNullException"><c>stopWords</c> is <c>null</c></exception>
        public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
        {
            if (stopWords == null)
            {
                throw new ArgumentNullException("stopWords");
            }

            var stopSet = new CharArraySet(stopWords.Length, ignoreCase);
            stopSet.AddAll(stopWords);
            return stopSet;
        }

        /// <summary>
        /// Builds a set from a list of objects, adding the <c>ToString()</c> value of
        /// each element, with the given case-insensitivity flag.
        /// </summary>
        /// <param name="stopWords">A list of objects whose <c>ToString()</c> values are the stop words;
        /// must not be <c>null</c>, and <c>ToString()</c> is invoked on every element</param>
        /// <param name="ignoreCase">Case-insensitivity flag handed to the created <see cref="CharArraySet" /></param>
        /// <returns>A set (<see cref="CharArraySet"/>) containing the words</returns>
        /// <exception cref="ArgumentNullException"><c>stopWords</c> is <c>null</c></exception>
        public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
        {
            if (stopWords == null)
            {
                throw new ArgumentNullException("stopWords");
            }

            var stopSet = new CharArraySet(stopWords.Count, ignoreCase);
            foreach (var word in stopWords)
            {
                stopSet.Add(word.ToString());
            }
            return stopSet;
        }

        /// <summary>
        /// Advances the input until a token whose term is not a stop word is found.
        /// </summary>
        /// <returns><c>true</c> if such a token was found; <c>false</c> once the input is exhausted</returns>
        public override bool IncrementToken()
        {
            // Sum of the position increments of the stop words dropped so far in this call.
            int skippedPositions = 0;
            while (input.IncrementToken())
            {
                if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength()))
                {
                    if (enablePositionIncrements)
                    {
                        // Fold the dropped tokens' increments into the token being returned.
                        posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
                    }
                    return true;
                }
                skippedPositions += posIncrAtt.PositionIncrement;
            }
            // reached EOS -- return false
            return false;
        }

        /// <summary>
        /// Returns the version-dependent default for enablePositionIncrements. Analyzers
        /// that embed StopFilter use this method when creating the StopFilter. Prior
        /// to 2.9, this returns false. On 2.9 or later, it returns true.
        /// </summary>
        /// <param name="matchVersion">The version to compare against <c>Version.LUCENE_29</c></param>
        /// <returns><c>true</c> if <c>matchVersion</c> is on or after <c>Version.LUCENE_29</c></returns>
        public static bool GetEnablePositionIncrementsVersionDefault(Version matchVersion)
        {
            return matchVersion.OnOrAfter(Version.LUCENE_29);
        }

        /// <summary>
        /// If <c>true</c>, this StopFilter will preserve
        /// positions of the incoming tokens (i.e., accumulate and
        /// set position increments of the removed stop tokens).
        /// Generally, <c>true</c> is best as it does not
        /// lose information (positions of the original tokens)
        /// during indexing.
        /// <para>
        /// When set, when a token is stopped
        /// (omitted), the position increment of the following
        /// token is incremented.
        /// </para>
        /// <para>
        /// <b>NOTE</b>: be sure to also
        /// set <see cref="QueryParser.EnablePositionIncrements" /> if
        /// you use QueryParser to create queries.
        /// </para>
        /// </summary>
        public bool EnablePositionIncrements
        {
            get { return enablePositionIncrements; }
            set { enablePositionIncrements = value; }
        }
    }
}