Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
FuzzyLikeThisQuery.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections;
20 using System.Collections.Generic;
21 using System.Linq;
22 using System.Text;
23 
24 using Lucene.Net.Search;
25 using Lucene.Net.Index;
26 using Lucene.Net.Analysis;
27 using Lucene.Net.Analysis.Tokenattributes;
28 using Lucene.Net.Support;
29 using Lucene.Net.Util;
30 
31 namespace Lucene.Net.Search
32 {
33  /// <summary>
34  /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
35  /// In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
36  /// of fuzzy scoring factors.
37  /// This generally produces good results for queries where users may provide details in a number of
38  /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
39  /// a fast query.
40  ///
41  /// For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
42  /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
43  /// TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
44  /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
45  /// term) and this is factored into the variant's boost. If the source query term does not exist in the
46  /// index the average IDF of the variants is used.
47  /// </summary>
48  public class FuzzyLikeThisQuery : Query
49  {
50  static Similarity sim = new DefaultSimilarity();
51  Query rewrittenQuery = null;
52  EquatableList<FieldVals> fieldVals = new EquatableList<FieldVals>();
53  Analyzer analyzer;
54 
55  ScoreTermQueue q;
56  int MAX_VARIANTS_PER_TERM = 50;
57  bool ignoreTF = false;
58  private int maxNumTerms;
59 
60  public override int GetHashCode()
61  {
62  int prime = 31;
63  int result = 1;
64  result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode());
65  result = prime * result
66  + ((fieldVals == null) ? 0 : fieldVals.GetHashCode());
67  result = prime * result + (ignoreTF ? 1231 : 1237);
68  result = prime * result + maxNumTerms;
69  return result;
70  }
71 
72  public override bool Equals(Object obj)
73  {
74  if (this == obj)
75  return true;
76  if (obj == null)
77  return false;
78  if (GetType() != obj.GetType())
79  return false;
81  if (analyzer == null)
82  {
83  if (other.analyzer != null)
84  return false;
85  }
86  else if (!analyzer.Equals(other.analyzer))
87  return false;
88  if (fieldVals == null)
89  {
90  if (other.fieldVals != null)
91  return false;
92  }
93  else if (!fieldVals.Equals(other.fieldVals))
94  return false;
95  if (ignoreTF != other.ignoreTF)
96  return false;
97  if (maxNumTerms != other.maxNumTerms)
98  return false;
99  return true;
100  }
101 
102 
103  /*
104  *
105  * <param name="maxNumTerms">The total number of terms clauses that will appear once rewritten as a BooleanQuery</param>
106  * <param name="analyzer"></param>
107  */
108  public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
109  {
110  q = new ScoreTermQueue(maxNumTerms);
111  this.analyzer = analyzer;
112  this.maxNumTerms = maxNumTerms;
113  }
114 
115  class FieldVals
116  {
117  internal String queryString;
118  internal String fieldName;
119  internal float minSimilarity;
120  internal int prefixLength;
121  public FieldVals(String name, float similarity, int length, String queryString)
122  {
123  fieldName = name;
124  minSimilarity = similarity;
125  prefixLength = length;
126  this.queryString = queryString;
127  }
128 
129  public override int GetHashCode()
130  {
131  int prime = 31;
132  int result = 1;
133  result = prime * result
134  + ((fieldName == null) ? 0 : fieldName.GetHashCode());
135  result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity),0);
136  result = prime * result + prefixLength;
137  result = prime * result
138  + ((queryString == null) ? 0 : queryString.GetHashCode());
139  return result;
140  }
141 
142  public override bool Equals(Object obj)
143  {
144  if (this == obj)
145  return true;
146  if (obj == null)
147  return false;
148  if (GetType() != obj.GetType())
149  return false;
150  FieldVals other = (FieldVals)obj;
151  if (fieldName == null)
152  {
153  if (other.fieldName != null)
154  return false;
155  }
156  else if (!fieldName.Equals(other.fieldName))
157  return false;
158  if (BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity), 0) != BitConverter.ToInt32(BitConverter.GetBytes(other.minSimilarity), 0))
159  //if (Float.floatToIntBits(minSimilarity) != Float.floatToIntBits(other.minSimilarity))
160  return false;
161  if (prefixLength != other.prefixLength)
162  return false;
163  if (queryString == null)
164  {
165  if (other.queryString != null)
166  return false;
167  }
168  else if (!queryString.Equals(other.queryString))
169  return false;
170  return true;
171  }
172 
173 
174 
175  }
176 
177  /*
178  * <summary>Adds user input for "fuzzification" </summary>
179  * <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be parsed</param>
180  * <param name="fieldName"></param>
181  * <param name="minSimilarity">The minimum similarity of the term variants (see FuzzyTermEnum)</param>
182  * <param name="prefixLength">Length of required common prefix on variant terms (see FuzzyTermEnum)</param>
183  */
184  public void AddTerms(String queryString, String fieldName, float minSimilarity, int prefixLength)
185  {
186  fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString));
187  }
188 
189 
190  private void AddTerms(IndexReader reader, FieldVals f)
191  {
192  if (f.queryString == null) return;
193  TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
194  ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
195 
196  int corpusNumDocs = reader.NumDocs();
197  Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
198  HashSet<string> processedTerms = new HashSet<string>();
199  while (ts.IncrementToken())
200  {
201  String term = termAtt.Term;
202  if (!processedTerms.Contains(term))
203  {
204  processedTerms.Add(term);
205  ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
206  float minScore = 0;
207  Term startTerm = internSavingTemplateTerm.CreateTerm(term);
208  FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
209  TermEnum origEnum = reader.Terms(startTerm);
210  int df = 0;
211  if (startTerm.Equals(origEnum.Term))
212  {
213  df = origEnum.DocFreq(); //store the df so all variants use same idf
214  }
215  int numVariants = 0;
216  int totalVariantDocFreqs = 0;
217  do
218  {
219  Term possibleMatch = fe.Term;
220  if (possibleMatch != null)
221  {
222  numVariants++;
223  totalVariantDocFreqs += fe.DocFreq();
224  float score = fe.Difference();
225  if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
226  {
227  ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
228  variantsQ.InsertWithOverflow(st);
229  minScore = variantsQ.Top().Score; // maintain minScore
230  }
231  }
232  }
233  while (fe.Next());
234  if (numVariants > 0)
235  {
236  int avgDf = totalVariantDocFreqs / numVariants;
237  if (df == 0)//no direct match we can use as df for all variants
238  {
239  df = avgDf; //use avg df of all variants
240  }
241 
242  // take the top variants (scored by edit distance) and reset the score
243  // to include an IDF factor then add to the global queue for ranking
244  // overall top query terms
245  int size = variantsQ.Size();
246  for (int i = 0; i < size; i++)
247  {
248  ScoreTerm st = variantsQ.Pop();
249  st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
250  q.InsertWithOverflow(st);
251  }
252  }
253  }
254  }
255  }
256 
257  public override Query Rewrite(IndexReader reader)
258  {
259  if (rewrittenQuery != null)
260  {
261  return rewrittenQuery;
262  }
263  //load up the list of possible terms
264  foreach (FieldVals f in fieldVals)
265  {
266  AddTerms(reader, f);
267  }
268  //clear the list of fields
269  fieldVals.Clear();
270 
271  BooleanQuery bq = new BooleanQuery();
272 
273 
274  //create BooleanQueries to hold the variants for each token/field pair and ensure it
275  // has no coord factor
276  //Step 1: sort the termqueries by term/field
277  HashMap<Term, List<ScoreTerm>> variantQueries = new HashMap<Term, List<ScoreTerm>>();
278  int size = q.Size();
279  for (int i = 0; i < size; i++)
280  {
281  ScoreTerm st = q.Pop();
282  var l = variantQueries[st.fuzziedSourceTerm];
283  if (l == null)
284  {
285  l = new List<ScoreTerm>();
286  variantQueries.Add(st.fuzziedSourceTerm, l);
287  }
288  l.Add(st);
289  }
290  //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
291  foreach(var variants in variantQueries.Values)
292  {
293  if (variants.Count == 1)
294  {
295  //optimize where only one selected variant
296  ScoreTerm st = variants[0];
297  TermQuery tq = new FuzzyTermQuery(st.Term, ignoreTF);
298  tq.Boost = st.Score; // set the boost to a mix of IDF and score
299  bq.Add(tq, Occur.SHOULD);
300  }
301  else
302  {
303  BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
304  foreach(ScoreTerm st in variants)
305  {
306  TermQuery tq = new FuzzyTermQuery(st.Term, ignoreTF); // found a match
307  tq.Boost = st.Score; // set the boost using the ScoreTerm's score
308  termVariants.Add(tq, Occur.SHOULD); // add to query
309  }
310  bq.Add(termVariants, Occur.SHOULD); // add to query
311  }
312  }
313  //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
314  // booleans with a minimum-should-match of NumFields-1?
315  bq.Boost = Boost;
316  this.rewrittenQuery = bq;
317  return bq;
318  }
319 
320  //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
321  // term variants) then is reset with IDF for use in ranking against all other
322  // terms/fields
323  private class ScoreTerm
324  {
325  public Term Term { get; set; }
326  public float Score { get; set; }
327 
328  internal Term fuzziedSourceTerm;
329 
330  public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
331  {
332  this.Term = term;
333  this.Score = score;
334  this.fuzziedSourceTerm = fuzziedSourceTerm;
335  }
336  }
337 
338  private class ScoreTermQueue : PriorityQueue<ScoreTerm>
339  {
340  public ScoreTermQueue(int size)
341  {
342  Initialize(size);
343  }
344 
345  /* (non-Javadoc)
346  * <see cref="org.apache.lucene.util.PriorityQueue.lessThan(java.lang.Object, java.lang.Object)"/>
347  */
348  public override bool LessThan(ScoreTerm termA, ScoreTerm termB)
349  {
350  if (termA.Score == termB.Score)
351  return termA.Term.CompareTo(termB.Term) > 0;
352  else
353  return termA.Score < termB.Score;
354  }
355 
356  }
357 
358  //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
359  private class FuzzyTermQuery : TermQuery
360  {
361  bool ignoreTF;
362 
363  public FuzzyTermQuery(Term t, bool ignoreTF): base(t)
364  {
365  this.ignoreTF = ignoreTF;
366  }
367 
368  public override Similarity GetSimilarity(Searcher searcher)
369  {
370  Similarity result = base.GetSimilarity(searcher);
371  result = new AnonymousSimilarityDelegator(this,result);
372  return result;
373  }
374 
375  class AnonymousSimilarityDelegator : SimilarityDelegator
376  {
377  FuzzyTermQuery parent = null;
378  public AnonymousSimilarityDelegator(FuzzyTermQuery parent,Similarity result) : base(result)
379  {
380  this.parent = parent;
381  }
382 
383  public override float Tf(float freq)
384  {
385  if (parent.ignoreTF)
386  {
387  return 1; //ignore tf
388  }
389  return base.Tf(freq);
390  }
391 
392  public override float Idf(int docFreq, int numDocs)
393  {
394  //IDF is already factored into individual term boosts
395  return 1;
396  }
397  }
398  }
399 
400 
401  /* (non-Javadoc)
402  * <see cref="org.apache.lucene.search.Query.toString(java.lang.String)"/>
403  */
404  public override String ToString(String field)
405  {
406  return null;
407  }
408 
409 
410  public bool IsIgnoreTF()
411  {
412  return ignoreTF;
413  }
414 
415 
416  public void SetIgnoreTF(bool ignoreTF)
417  {
418  this.ignoreTF = ignoreTF;
419  }
420 
421  }
422 }