Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
MultiTermQuery.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Runtime.InteropServices;
21 using IndexReader = Lucene.Net.Index.IndexReader;
22 using Term = Lucene.Net.Index.Term;
23 using QueryParser = Lucene.Net.QueryParsers.QueryParser;
24 using ToStringUtils = Lucene.Net.Util.ToStringUtils;
25 
26 namespace Lucene.Net.Search
27 {
28 
29  /// <summary> An abstract <see cref="Query" /> that matches documents
30  /// containing a subset of terms provided by a <see cref="FilteredTermEnum" />
31  /// enumeration.
32  ///
33  /// <p/>This query cannot be used directly; you must subclass
34  /// it and define <see cref="GetEnum" /> to provide a <see cref="FilteredTermEnum" />
35  /// that iterates through the terms to be
36  /// matched.
37  ///
/// <p/><b>NOTE</b>: if <see cref="RewriteMethod" /> is either
/// <see cref="CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" /> or
/// <see cref="SCORING_BOOLEAN_QUERY_REWRITE" />, you may encounter a
/// <see cref="BooleanQuery.TooManyClauses" /> exception during
/// searching, which happens when the number of terms to be
/// searched exceeds <see cref="BooleanQuery.MaxClauseCount" />.
/// Setting <see cref="RewriteMethod" /> to
/// <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> prevents this.
///
/// <p/>The recommended rewrite method is
/// <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />: it doesn't spend CPU
/// computing unhelpful scores, and it tries to pick the most
/// performant rewrite method given the query.
52  ///
53  /// Note that <see cref="QueryParser" /> produces
54  /// MultiTermQueries using <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />
55  /// by default.
56  /// </summary>
57  [Serializable]
58  public abstract class MultiTermQuery:Query
59  {
60  [Serializable]
62  {
/// <summary>
/// Setter override that rejects every assignment with a
/// <see cref="System.NotSupportedException" />. Only the <c>set</c>
/// accessor is overridden here.
/// </summary>
public override int TermCountCutoff
{
    set
    {
        throw new NotSupportedException("Please create a private instance");
    }
}
67 
/// <summary>
/// Setter override that rejects every assignment with a
/// <see cref="System.NotSupportedException" />. Only the <c>set</c>
/// accessor is overridden here.
/// </summary>
public override double DocCountPercent
{
    set
    {
        throw new NotSupportedException("Please create a private instance");
    }
}
72 
/// <summary>
/// Returns the shared <see cref="MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />
/// instance, so that this type can remain a singleton after deserialization.
/// </summary>
/// <remarks>
/// NOTE(review): nothing in this file calls this method; confirm that the
/// serializer in use actually invokes it.
/// </remarks>
protected internal virtual System.Object ReadResolve()
{
    System.Object sharedDefault = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
    return sharedDefault;
}
78  }
/// <summary>
/// Rewrite strategy this query delegates to; initialized to
/// <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />.
/// </summary>
protected internal RewriteMethod internalRewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;

/// <summary>
/// Running total of terms reported through <c>IncTotalNumberOfTerms</c>.
/// Starts at zero and is excluded from serialization.
/// </summary>
[NonSerialized] internal int numberOfTerms;
82 
/// <summary>
/// Rewrite method that wraps the query in a
/// <see cref="MultiTermQueryWrapperFilter{T}" /> and returns that filter as a
/// <see cref="ConstantScoreQuery" /> carrying the original query's boost.
/// </summary>
[Serializable]
private sealed class ConstantScoreFilterRewrite : RewriteMethod
{
    /// <summary>Builds the filter-backed constant-score query for <paramref name="query" />.</summary>
    /// <param name="reader">Not used by this rewrite method.</param>
    /// <param name="query">The multi-term query to wrap; its boost is copied to the result.</param>
    /// <returns>The constant-score query wrapping the filter.</returns>
    public override Query Rewrite(IndexReader reader, MultiTermQuery query)
    {
        MultiTermQueryWrapperFilter<MultiTermQuery> termFilter =
            new MultiTermQueryWrapperFilter<MultiTermQuery>(query);
        Query constantScore = new ConstantScoreQuery(termFilter);
        constantScore.Boost = query.Boost;
        return constantScore;
    }

    /// <summary>
    /// Returns the shared <see cref="MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE" />
    /// instance, so that this type can remain a singleton after deserialization.
    /// </summary>
    /// <remarks>
    /// NOTE(review): nothing in this file calls this method; confirm that the
    /// serializer in use actually invokes it.
    /// </remarks>
    internal System.Object ReadResolve()
    {
        System.Object shared = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
        return shared;
    }
}
99 
100  /// <summary>A rewrite method that first creates a private Filter,
101  /// by visiting each term in sequence and marking all docs
102  /// for that term. Matching documents are assigned a
103  /// constant score equal to the query's boost.
104  ///
105  /// <p/> This method is faster than the BooleanQuery
106  /// rewrite methods when the number of matched terms or
107  /// matched documents is non-trivial. Also, it will never
108  /// hit an errant <see cref="BooleanQuery.TooManyClauses" />
109  /// exception.
110  ///
111  /// </summary>
112  /// <seealso cref="RewriteMethod">
113  /// </seealso>
114  public static readonly RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite();
115 
/// <summary>
/// Rewrite method that turns every term produced by the query's enumeration
/// into a <see cref="Occur.SHOULD" /> <see cref="TermQuery" /> clause of a
/// single <see cref="BooleanQuery" />, keeping per-term boosts.
/// </summary>
[Serializable]
private class ScoringBooleanQueryRewrite : RewriteMethod
{
    /// <summary>Expands <paramref name="query" /> into a boolean query of term clauses.</summary>
    /// <param name="reader">Reader handed to <c>query.GetEnum</c> to enumerate terms.</param>
    /// <param name="query">The multi-term query being rewritten; its term counter is
    /// incremented by the number of clauses added.</param>
    /// <returns>The boolean query holding one clause per enumerated term.</returns>
    public override Query Rewrite(IndexReader reader, MultiTermQuery query)
    {
        FilteredTermEnum terms = query.GetEnum(reader);
        BooleanQuery disjunction = new BooleanQuery(true);
        int matched = 0;
        try
        {
            // The enumeration is already positioned on its first term (possibly
            // null), so inspect the current term before advancing.
            bool hasMore = true;
            while (hasMore)
            {
                Term current = terms.Term;
                if (current != null)
                {
                    TermQuery clause = new TermQuery(current);
                    // Scale the query boost by the enumerator's difference value.
                    clause.Boost = query.Boost * terms.Difference();
                    disjunction.Add(clause, Occur.SHOULD);
                    matched++;
                }
                hasMore = terms.Next();
            }
        }
        finally
        {
            // Always release the enumeration, even if term iteration throws.
            terms.Close();
        }

        query.IncTotalNumberOfTerms(matched);
        return disjunction;
    }

    /// <summary>
    /// Returns the shared <see cref="MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE" />
    /// instance, so that this type can remain a singleton after deserialization.
    /// </summary>
    /// <remarks>
    /// NOTE(review): nothing in this file calls this method; confirm that the
    /// serializer in use actually invokes it.
    /// </remarks>
    protected internal virtual System.Object ReadResolve()
    {
        System.Object shared = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
        return shared;
    }
}
154 
155  /// <summary>A rewrite method that first translates each term into
156  /// <see cref="Occur.SHOULD" /> clause in a
157  /// BooleanQuery, and keeps the scores as computed by the
158  /// query. Note that typically such scores are
159  /// meaningless to the user, and require non-trivial CPU
160  /// to compute, so it's almost always better to use <see cref="CONSTANT_SCORE_AUTO_REWRITE_DEFAULT" />
161  /// instead.
162  ///
163  /// <p/><b>NOTE</b>: This rewrite method will hit <see cref="BooleanQuery.TooManyClauses" />
164  /// if the number of terms
165  /// exceeds <see cref="BooleanQuery.MaxClauseCount" />.
166  ///
167  /// </summary>
168  /// <seealso cref="RewriteMethod">
169  /// </seealso>
170  public static readonly RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
171 
/// <summary>
/// Variant of <see cref="ScoringBooleanQueryRewrite" /> that discards the
/// per-term scores: the boolean query built by the base class is wrapped in a
/// <see cref="QueryWrapperFilter" /> and returned as a
/// <see cref="ConstantScoreQuery" /> carrying the query's boost.
/// </summary>
[Serializable]
private class ConstantScoreBooleanQueryRewrite : ScoringBooleanQueryRewrite
{
    /// <summary>Builds the base boolean rewrite, then wraps it so scores are constant.</summary>
    /// <param name="reader">Reader passed through to the base rewrite.</param>
    /// <param name="query">The multi-term query being rewritten; its boost is copied to the result.</param>
    /// <returns>The constant-score query wrapping the boolean rewrite.</returns>
    public override Query Rewrite(IndexReader reader, MultiTermQuery query)
    {
        Query scored = base.Rewrite(reader, query);
        QueryWrapperFilter scoreless = new QueryWrapperFilter(scored);
        Query constantScore = new ConstantScoreQuery(scoreless);
        constantScore.Boost = query.Boost;
        return constantScore;
    }

    /// <summary>
    /// Returns the shared <see cref="MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" />
    /// instance, so that this type can remain a singleton after deserialization.
    /// </summary>
    /// <remarks>
    /// NOTE(review): nothing in this file calls this method; confirm that the
    /// serializer in use actually invokes it.
    /// </remarks>
    protected internal override System.Object ReadResolve()
    {
        System.Object shared = MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
        return shared;
    }
}
189 
190  /// <summary>Like <see cref="SCORING_BOOLEAN_QUERY_REWRITE" /> except
191  /// scores are not computed. Instead, each matching
192  /// document receives a constant score equal to the
193  /// query's boost.
194  ///
195  /// <p/><b>NOTE</b>: This rewrite method will hit <see cref="BooleanQuery.TooManyClauses" />
196  /// if the number of terms
197  /// exceeds <see cref="BooleanQuery.MaxClauseCount" />.
198  ///
199  /// </summary>
200  /// <seealso cref="RewriteMethod">
201  /// </seealso>
202  public static readonly RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite();
203 
204 
205  /// <summary>A rewrite method that tries to pick the best
206  /// constant-score rewrite method based on term and
207  /// document counts from the query. If both the number of
208  /// terms and documents is small enough, then <see cref="CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE" />
209  /// is used.
210  /// Otherwise, <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> is
211  /// used.
212  /// </summary>
213  [Serializable]
215  {
217  {
218  InitBlock();
219  }
/// <summary>
/// Sets both cutoffs to the current values of the static defaults
/// <c>DEFAULT_TERM_COUNT_CUTOFF</c> and <c>DEFAULT_DOC_COUNT_PERCENT</c>.
/// </summary>
private void InitBlock()
{
    this.docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
    this.termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
}
225 
226  // Defaults derived from rough tests with a 20.0 million
227  // doc Wikipedia index. With more than 350 terms in the
228  // query, the filter method is fastest:
229  public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
230 
231  // If the query will hit more than 1 in 1000 of the docs
232  // in the index (0.1%), the filter method is fastest:
233  public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
234 
235  private int termCountCutoff;
236  private double docCountPercent;
237 
/// <summary>
/// Term-count threshold. Once the number of collected terms reaches this value
/// (or <see cref="BooleanQuery.MaxClauseCount" />, whichever is smaller), the
/// rewrite uses the filter-based form, as
/// <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> does.
/// </summary>
public virtual int TermCountCutoff
{
    get
    {
        return this.termCountCutoff;
    }
    set
    {
        this.termCountCutoff = value;
    }
}
247 
/// <summary>
/// Document-count threshold, expressed as a percentage of the reader's MaxDoc.
/// Once the summed document frequency of the collected terms reaches that
/// share, the rewrite uses the filter-based form, as
/// <see cref="CONSTANT_SCORE_FILTER_REWRITE" /> does.
/// </summary>
/// <value> 0.0 to 100.0 </value>
public virtual double DocCountPercent
{
    get
    {
        return this.docCountPercent;
    }
    set
    {
        this.docCountPercent = value;
    }
}
259 
/// <summary>
/// Walks the query's term enumeration and chooses between two constant-score
/// forms. If the term-count or document-count cutoff is reached while
/// enumerating, the query is wrapped in a
/// <see cref="MultiTermQueryWrapperFilter{T}" />. If the enumeration is
/// exhausted first, the collected terms become a <see cref="BooleanQuery" />
/// wrapped in a <see cref="QueryWrapperFilter" />.
/// </summary>
/// <param name="reader">Supplies MaxDoc and per-term document frequencies, and is
/// passed to <c>query.GetEnum</c>.</param>
/// <param name="query">The multi-term query being rewritten. Its boost is copied to
/// the result; its term counter is incremented only on the boolean-query path.</param>
/// <returns>A <see cref="ConstantScoreQuery" /> in one of the two forms above.</returns>
public override Query Rewrite(IndexReader reader, MultiTermQuery query)
{
    List<Term> collected = new List<Term>();
    int docLimit = (int) ((docCountPercent / 100.0) * reader.MaxDoc);
    int termLimit = System.Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);
    int docsSeen = 0;

    FilteredTermEnum terms = query.GetEnum(reader);
    try
    {
        do
        {
            Term current = terms.Term;
            if (current != null)
            {
                collected.Add(current);
                // Looking up the doc frequency here should be cheap: the
                // query/filter loads the same term info when it runs, and
                // the terms dictionary is cached.
                docsSeen += reader.DocFreq(current);
            }

            if (collected.Count >= termLimit || docsSeen >= docLimit)
            {
                // A cutoff was reached -- give up collecting and use a filter.
                Query filtered = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query));
                filtered.Boost = query.Boost;
                return filtered;
            }
        }
        while (terms.Next());

        // Enumeration finished below both cutoffs: a plain BooleanQuery over
        // the collected terms is small enough.
        BooleanQuery disjunction = new BooleanQuery(true);
        foreach (Term term in collected)
        {
            disjunction.Add(new TermQuery(term), Occur.SHOULD);
        }

        // Wrap in a filter so the per-clause scores are dropped.
        Query constantScore = new ConstantScoreQuery(new QueryWrapperFilter(disjunction));
        constantScore.Boost = query.Boost;
        query.IncTotalNumberOfTerms(collected.Count);
        return constantScore;
    }
    finally
    {
        // Runs on every exit path, including both returns above.
        terms.Close();
    }
}
318 
/// <summary>
/// Hash code derived from the term-count cutoff and the raw 64-bit pattern of
/// the document-count percentage: the same two values <c>Equals</c> compares.
/// </summary>
/// <returns>The combined hash, truncated to 32 bits.</returns>
public override int GetHashCode()
{
    const int prime = 1279;

    // The double's bit pattern almost never fits in an Int32, so the
    // narrowing cast must be allowed to wrap. Without an explicit unchecked
    // context this throws OverflowException when the assembly is compiled
    // with overflow checking enabled.
    unchecked
    {
        return (int) (prime * termCountCutoff + BitConverter.DoubleToInt64Bits(docCountPercent));
    }
}
324 
/// <summary>
/// Two instances are equal when they have exactly the same runtime type, the
/// same term-count cutoff, and bit-identical document-count percentages.
/// </summary>
/// <param name="obj">The object to compare with; may be null.</param>
/// <returns><c>true</c> if <paramref name="obj" /> is an equivalent configuration.</returns>
public override bool Equals(System.Object obj)
{
    if (ReferenceEquals(this, obj))
    {
        return true;
    }
    if (obj == null)
    {
        return false;
    }
    if (GetType() != obj.GetType())
    {
        return false;
    }

    // The exact-type check above makes this cast safe. The declaration of
    // `other` was absent from the listing although the comparisons below use it.
    ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;

    if (other.termCountCutoff != termCountCutoff)
    {
        return false;
    }

    // Compare bit patterns rather than using ==, matching GetHashCode.
    if (BitConverter.DoubleToInt64Bits(other.docCountPercent) != BitConverter.DoubleToInt64Bits(docCountPercent))
    {
        return false;
    }

    return true;
}
347  }
348 
/// <summary>Read-only default instance of <see cref="ConstantScoreAutoRewrite" />,
/// with <see cref="ConstantScoreAutoRewrite.TermCountCutoff" /> set to
/// <see cref="ConstantScoreAutoRewrite.DEFAULT_TERM_COUNT_CUTOFF" /> and
/// <see cref="ConstantScoreAutoRewrite.DocCountPercent" /> set to
/// <see cref="ConstantScoreAutoRewrite.DEFAULT_DOC_COUNT_PERCENT" />.
/// Note that you cannot alter the configuration of this
/// instance; you'll need to create a private instance
/// instead.
/// </summary>
363 
364  /// <summary> Constructs a query matching terms that cannot be represented with a single
365  /// Term.
366  /// </summary>
367  protected MultiTermQuery()
368  {
369  }
370 
371  /// <summary>Construct the enumeration to be used, expanding the pattern term. </summary>
372  protected internal abstract FilteredTermEnum GetEnum(IndexReader reader);
373 
/// <summary> Expert: the number of unique terms visited while executing the query.
/// If the number is large, consider using another query type or reducing the
/// total term count in the index.
/// <p/>This member is not thread safe; only read it when no query is running.
/// If you re-use the same query instance for another search, first reset the
/// counter with <see cref="ClearTotalNumberOfTerms" />.
/// <p/>On optimized indexes / no MultiReaders, you get the correct number of
/// unique terms for the whole index. Use this number to compare different queries.
/// For non-optimized indexes this number can also be achieved in
/// non-constant-score mode. In constant-score mode you get the total number of
/// terms seeked for all segments / sub-readers.
/// </summary>
/// <seealso cref="ClearTotalNumberOfTerms" />
public virtual int TotalNumberOfTerms
{
    get
    {
        return this.numberOfTerms;
    }
}
393 
/// <summary> Expert: sets the unique-term counter back to zero.
/// Call this before executing the query/filter.
/// </summary>
/// <seealso cref="TotalNumberOfTerms" />
public virtual void ClearTotalNumberOfTerms()
{
    this.numberOfTerms = 0;
}
403 
/// <summary>Adds <paramref name="inc" /> to the running term counter.</summary>
/// <param name="inc">Number of terms to add to the counter.</param>
protected internal virtual void IncTotalNumberOfTerms(int inc)
{
    this.numberOfTerms = this.numberOfTerms + inc;
}
408 
/// <summary>
/// Rewrites this query by delegating to the currently configured rewrite method.
/// </summary>
/// <param name="reader">Reader passed through to the rewrite method.</param>
/// <returns>Whatever query the rewrite method produces for this instance.</returns>
public override Query Rewrite(IndexReader reader)
{
    Query rewritten = this.internalRewriteMethod.Rewrite(reader, this);
    return rewritten;
}
413 
/// <summary> Gets or sets the rewrite method used when executing the
/// query. Use one of the four core methods declared on this class, or
/// supply your own subclass of <see cref="Search.RewriteMethod" />.
/// </summary>
public virtual RewriteMethod RewriteMethod
{
    get
    {
        return this.internalRewriteMethod;
    }
    set
    {
        this.internalRewriteMethod = value;
    }
}
423 
/// <summary>
/// Hash code combining the boost and the configured rewrite method, the same
/// two values <see cref="Equals(System.Object)" /> compares.
/// </summary>
/// <returns>The combined hash code.</returns>
public override int GetHashCode()
{
    const int prime = 31;

    // Hash arithmetic is expected to wrap; make that explicit so the result
    // does not depend on the compiler's overflow-checking setting.
    unchecked
    {
        int result = 1;
        // Hash the boost's actual value. Convert.ToInt32 rounds it to the
        // nearest integer and throws OverflowException for boosts outside
        // the Int32 range.
        result = prime * result + Boost.GetHashCode();
        result = prime * result;
        result += internalRewriteMethod.GetHashCode();
        return result;
    }
}

/// <summary>
/// Two queries are equal when they have exactly the same runtime type, equal
/// boosts, and equal rewrite methods.
/// </summary>
/// <param name="obj">The object to compare with; may be null.</param>
/// <returns><c>true</c> if <paramref name="obj" /> is an equivalent query.</returns>
public override bool Equals(System.Object obj)
{
    if (ReferenceEquals(this, obj))
    {
        return true;
    }
    if (obj == null)
    {
        return false;
    }
    if (GetType() != obj.GetType())
    {
        return false;
    }

    MultiTermQuery other = (MultiTermQuery) obj;

    // Compare the boosts themselves. Rounding through Convert.ToInt32 made
    // distinct boosts such as 1.0 and 1.4 compare equal.
    if (!Boost.Equals(other.Boost))
    {
        return false;
    }

    return internalRewriteMethod.Equals(other.internalRewriteMethod);
}
453  static MultiTermQuery()
454  {
455  CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new AnonymousClassConstantScoreAutoRewrite();
456  }
457  }
458 
/// <summary>
/// Base class for the strategies that decide how a <see cref="MultiTermQuery" />
/// is rewritten.
/// </summary>
[Serializable]
public abstract class RewriteMethod
{
    /// <summary>Produces the query to execute in place of <paramref name="query" />.</summary>
    /// <param name="reader">The index reader the rewrite is performed against.</param>
    /// <param name="query">The multi-term query to rewrite.</param>
    /// <returns>The rewritten query.</returns>
    public abstract Query Rewrite(IndexReader reader, MultiTermQuery query);
}
465 }