Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
QueryAutoStopWordAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Linq;
26 using System.Text;
27 using Lucene.Net.Index;
28 using Lucene.Net.Support;
29 using Lucene.Net.Util;
30 using Version = Lucene.Net.Util.Version;
31 
32 namespace Lucene.Net.Analysis.Query
33 {
34 /*
35  * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
36  * which prevents very common words from being passed into queries.
37  * <p>
38  * For very large indexes the cost
39  * of reading TermDocs for a very common word can be high. This analyzer was created after experience with
40  * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
41  * this term to take 2 seconds.
42  * </p>
43  * <p>
44  * Use the various "AddStopWords" overloads in this class to automate the identification and addition of
45  * stop words found in an already existing index.
46  * </p>
47  */
/*
 * The default maximum percentage (40%) of index documents which
 * can contain a term, after which the term is considered to be a stop word.
 */
public const float defaultMaxDocFreqPercent = 0.4f;

/* Version value handed to StopFilter.GetEnablePositionIncrementsVersionDefault when a stop filter is built. */
private readonly Version matchVersion;

/* The wrapped analyzer; it produces the token streams that this class filters. */
private Analyzer _delegate;

/* Stop words identified so far, keyed by field name. */
private HashMap<String, ISet<String>> stopWordsPerField = new HashMap<String, ISet<String>>();
55 
/*
 * Creates an analyzer that filters the token streams produced by another analyzer.
 *
 * @param matchVersion Version value kept for use when stop filters are constructed
 * @param _delegate The {@link Analyzer} that actually produces the token stream which needs filtering
 */
public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer _delegate)
{
    this.matchVersion = matchVersion;
    this._delegate = _delegate;
    SetOverridesTokenStreamMethod<QueryAutoStopWordAnalyzer>();
}
67 
/*
 * Adds stop words for every indexed field, using defaultMaxDocFreqPercent as the threshold.
 *
 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
 * exceed the required document frequency
 * @return The number of stop words identified.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader)
{
    // A float-typed argument selects the percentage-based overload.
    float threshold = defaultMaxDocFreqPercent;
    return AddStopWords(reader, threshold);
}
80 
/*
 * Automatically adds stop words for all indexed fields with terms exceeding maxDocFreq
 *
 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
 * exceed the required document frequency
 * @param maxDocFreq The maximum number of index documents which can contain a term, after which
 * the term is considered to be a stop word
 * @return The number of stop words identified, summed over all indexed fields.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader, int maxDocFreq)
{
    int numStopWords = 0;
    ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
    // foreach disposes the enumerator when the loop ends, which the manual loop did not do.
    foreach (String fieldName in fieldNames)
    {
        numStopWords += AddStopWords(reader, fieldName, maxDocFreq);
    }
    return numStopWords;
}
101 
/*
 * Automatically adds stop words for all indexed fields with terms exceeding maxPercentDocs
 *
 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
 * exceed the required document frequency
 * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 * contain a term, after which the word is considered to be a stop word.
 * @return The number of stop words identified, summed over all indexed fields.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader, float maxPercentDocs)
{
    int numStopWords = 0;
    ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
    // foreach disposes the enumerator when the loop ends, which the manual loop did not do.
    foreach (String fieldName in fieldNames)
    {
        numStopWords += AddStopWords(reader, fieldName, maxPercentDocs);
    }
    return numStopWords;
}
122 
/*
 * Adds stop words for one field, with the threshold given as a fraction of the index's documents.
 *
 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
 * exceed the required document frequency
 * @param fieldName The field for which stopwords will be added
 * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
 * contain a term, after which the word is considered to be a stop word.
 * @return The number of stop words identified.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader, String fieldName, float maxPercentDocs)
{
    // Convert the fraction to an absolute document count; the cast truncates the fractional part.
    int maxDocFreq = (int) (reader.NumDocs() * maxPercentDocs);
    return AddStopWords(reader, fieldName, maxDocFreq);
}
138 
/*
 * Automatically adds stop words for the given field with terms exceeding maxDocFreq
 *
 * Walks the term enumeration starting at the given field, collects every term of that field
 * whose document frequency is strictly greater than maxDocFreq, and stores the resulting set
 * under fieldName.
 *
 * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
 * exceed the required document frequency
 * @param fieldName The field for which stopwords will be added
 * @param maxDocFreq The maximum number of index documents which
 * can contain a term, after which the term is considered to be a stop word.
 * @return The number of stop words identified.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
{
    var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
    String internedFieldName = StringHelper.Intern(fieldName);

    // Dispose the term enumeration once scanning is finished, including on an exception.
    using (TermEnum te = reader.Terms(new Term(fieldName)))
    {
        Term term = te.Term;
        while (term != null)
        {
            // The enumeration continues into later fields; stop at the first foreign term.
            if (term.Field != internedFieldName)
            {
                break;
            }
            if (te.DocFreq() > maxDocFreq)
            {
                stopWords.Add(term.Text);
            }
            if (!te.Next())
            {
                break;
            }
            term = te.Term;
        }
    }
    stopWordsPerField.Add(fieldName, stopWords);

    /* if the stopwords for a field are changed,
     * then saved streams for that field are erased.
     */
    IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>) PreviousTokenStream;
    if (streamMap != null)
    {
        streamMap.Remove(fieldName);
    }

    return stopWords.Count;
}
179 
/*
 * Produces the wrapped analyzer's token stream for the field and, when stop words have been
 * recorded for that field, wraps it in a StopFilter.
 *
 * @param fieldName The field whose text is being analyzed
 * @param reader The source of the text to tokenize
 * @return the wrapped analyzer's stream, filtered if the field has stop words
 */
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result;
    try
    {
        result = _delegate.ReusableTokenStream(fieldName, reader);
    }
    catch (IOException)
    {
        // Fall back to a non-reusable stream if the reusable one cannot be obtained.
        result = _delegate.TokenStream(fieldName, reader);
    }

    var stopWords = stopWordsPerField[fieldName];
    if (stopWords != null)
    {
        // Same construction as in ReusableTokenStream; the head of this statement was missing.
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                result, stopWords);
    }
    return result;
}
194 
/* Per-field pair of streams kept between ReusableTokenStream calls. */
private class SavedStreams
{
    /* the stream obtained from the wrapped analyzer */
    protected internal TokenStream Wrapped;

    /*
     * the stream handed back to callers: Wrapped itself when the field has
     * no stopwords, otherwise a StopFilter around Wrapped.
     */
    protected internal TokenStream WithStopFilter;
}
205 
/*
 * Returns a token stream for the field, reusing the saved per-field streams where possible.
 *
 * @param fieldName The field whose text is being analyzed
 * @param reader The source of the text to tokenize
 * @return the wrapped analyzer's stream, filtered if the field has stop words
 */
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // LUCENE-1678: force fallback to tokenStream() if we
        // have been subclassed and that subclass overrides
        // tokenStream but not reusableTokenStream
        return TokenStream(fieldName, reader);
    }

    /* map of SavedStreams for each field, created on first use */
    IDictionary<String, SavedStreams> savedByField = (IDictionary<String, SavedStreams>)PreviousTokenStream;
    if (savedByField == null)
    {
        savedByField = new HashMap<String, SavedStreams>();
        PreviousTokenStream = savedByField;
    }

    SavedStreams saved = savedByField[fieldName];
    if (saved == null)
    {
        /* no entry for this field yet: register one, then fill it in */
        saved = new SavedStreams();
        savedByField.Add(fieldName, saved);
        saved.Wrapped = _delegate.ReusableTokenStream(fieldName, reader);
        saved.WithStopFilter = WrapWithStopFilter(fieldName, saved.Wrapped);
        return saved.WithStopFilter;
    }

    /*
     * an entry exists: check whether the wrapped analyzer handed back the
     * same stream instance as last time.
     */
    TokenStream current = _delegate.ReusableTokenStream(fieldName, reader);
    if (current != saved.Wrapped)
    {
        /* the wrapped analyzer produced a new stream, so wrap that one */
        saved.Wrapped = current;
        saved.WithStopFilter = WrapWithStopFilter(fieldName, saved.Wrapped);
    }
    else
    {
        /* the wrapped analyzer reused the stream */
        saved.WithStopFilter.Reset();
    }

    return saved.WithStopFilter;
}

/* Wraps the stream in a StopFilter when the field has stop words; otherwise returns it unchanged. */
private TokenStream WrapWithStopFilter(String fieldName, TokenStream wrapped)
{
    var stopWords = stopWordsPerField[fieldName];
    if (stopWords == null)
    {
        return wrapped;
    }
    return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                          wrapped, stopWords);
}
263 
/*
 * Reports the stop words recorded for a single field.
 *
 * @param fieldName The field for which stop words identified in "AddStopWords"
 * method calls will be returned
 * @return the stop words identified for the field, or an empty array when none are recorded
 */
public String[] GetStopWords(String fieldName)
{
    var stopWords = stopWordsPerField[fieldName];
    if (stopWords == null)
    {
        return new String[0];
    }
    return stopWords.ToArray();
}
281 
/*
 * Reports the stop words recorded for all fields
 *
 * @return the stop words, one Term (field name plus word text) per recorded stop word
 */
public Term[] GetStopWords()
{
    // Flatten the field -> words map into a single sequence of terms.
    return stopWordsPerField.Keys
        .SelectMany(fieldName => stopWordsPerField[fieldName].Select(text => new Term(fieldName, text)))
        .ToArray();
}
298 
299 }
300 }