Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
FuzzyQuery.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using Lucene.Net.Index;
22 using Lucene.Net.Support;
23 using Lucene.Net.Util;
24 using IndexReader = Lucene.Net.Index.IndexReader;
25 using Single = Lucene.Net.Support.Single;
26 using Term = Lucene.Net.Index.Term;
27 using ToStringUtils = Lucene.Net.Util.ToStringUtils;
28 
29 namespace Lucene.Net.Search
30 {
31 
32  /// <summary>Implements the fuzzy search query. The similarity measurement
33  /// is based on the Levenshtein (edit distance) algorithm.
34  ///
35  /// Warning: this query is not very scalable with its default prefix
36  /// length of 0 - in this case, *every* term will be enumerated and
37  /// cause an edit score calculation.
38  ///
39  /// </summary>
40  [Serializable]
41  public class FuzzyQuery : MultiTermQuery
42  {
43 
44  public const float defaultMinSimilarity = 0.5f;
45  public const int defaultPrefixLength = 0;
46 
47  private float minimumSimilarity;
48  private int prefixLength;
49  private bool termLongEnough = false;
50 
51  /// <summary> Returns the pattern term.</summary>
52  public Term Term { get; protected internal set; }
53 
54  /// <summary> Create a new FuzzyQuery that will match terms with a similarity
55  /// of at least <c>minimumSimilarity</c> to <c>term</c>.
56  /// If a <c>prefixLength</c> &gt; 0 is specified, a common prefix
57  /// of that length is also required.
58  ///
59  /// </summary>
60  /// <param name="term">the term to search for
61  /// </param>
62  /// <param name="minimumSimilarity">a value between 0 and 1 to set the required similarity
63  /// between the query term and the matching terms. For example, for a
64  /// <c>minimumSimilarity</c> of <c>0.5</c> a term of the same length
65  /// as the query term is considered similar to the query term if the edit distance
66  /// between both terms is less than <c>length(term)*0.5</c>
67  /// </param>
68  /// <param name="prefixLength">length of common (non-fuzzy) prefix
69  /// </param>
70  /// <throws> IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0 </throws>
71  /// <summary> or if prefixLength &lt; 0
72  /// </summary>
73  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
74  {
75  this.Term = term;
76 
77  if (minimumSimilarity >= 1.0f)
78  throw new System.ArgumentException("minimumSimilarity >= 1");
79  else if (minimumSimilarity < 0.0f)
80  throw new System.ArgumentException("minimumSimilarity < 0");
81  if (prefixLength < 0)
82  throw new System.ArgumentException("prefixLength < 0");
83 
84  if (term.Text.Length > 1.0f / (1.0f - minimumSimilarity))
85  {
86  this.termLongEnough = true;
87  }
88 
89  this.minimumSimilarity = minimumSimilarity;
90  this.prefixLength = prefixLength;
91  internalRewriteMethod = SCORING_BOOLEAN_QUERY_REWRITE;
92  }
93 
94  /// <summary> Calls <see cref="FuzzyQuery(Index.Term, float)">FuzzyQuery(term, minimumSimilarity, 0)</see>.</summary>
95  public FuzzyQuery(Term term, float minimumSimilarity):this(term, minimumSimilarity, defaultPrefixLength)
96  {
97  }
98 
99  /// <summary> Calls <see cref="FuzzyQuery(Index.Term, float)">FuzzyQuery(term, 0.5f, 0)</see>.</summary>
100  public FuzzyQuery(Term term):this(term, defaultMinSimilarity, defaultPrefixLength)
101  {
102  }
103 
104  /// <summary> Returns the minimum similarity that is required for this query to match.</summary>
105  /// <value> float value between 0.0 and 1.0 </value>
106  public virtual float MinSimilarity
107  {
108  get { return minimumSimilarity; }
109  }
110 
111  /// <summary> Returns the non-fuzzy prefix length. This is the number of characters at the start
112  /// of a term that must be identical (not fuzzy) to the query term if the query
113  /// is to match that term.
114  /// </summary>
115  public virtual int PrefixLength
116  {
117  get { return prefixLength; }
118  }
119 
120  protected internal override FilteredTermEnum GetEnum(IndexReader reader)
121  {
122  return new FuzzyTermEnum(reader, Term, minimumSimilarity, prefixLength);
123  }
124 
125  public override RewriteMethod RewriteMethod
126  {
127  set { throw new System.NotSupportedException("FuzzyQuery cannot change rewrite method"); }
128  }
129 
130  public override Query Rewrite(IndexReader reader)
131  {
132  if (!termLongEnough)
133  {
134  // can only match if it's exact
135  return new TermQuery(Term);
136  }
137 
138  int maxSize = BooleanQuery.MaxClauseCount;
139 
140  // TODO: Java uses a PriorityQueue. Using Linq, we can emulate it,
141  // however it's considerable slower than the java counterpart.
142  // this should be a temporary thing, fixed before release
143  SortedList<ScoreTerm, ScoreTerm> stQueue = new SortedList<ScoreTerm, ScoreTerm>();
144  FilteredTermEnum enumerator = GetEnum(reader);
145 
146  try
147  {
148  ScoreTerm st = new ScoreTerm();
149  do
150  {
151  Term t = enumerator.Term;
152  if (t == null) break;
153  float score = enumerator.Difference();
154  //ignore uncompetetive hits
155  if (stQueue.Count >= maxSize && score <= stQueue.Keys.First().score)
156  continue;
157  // add new entry in PQ
158  st.term = t;
159  st.score = score;
160  stQueue.Add(st, st);
161  // possibly drop entries from queue
162  if (stQueue.Count > maxSize)
163  {
164  st = stQueue.Keys.First();
165  stQueue.Remove(st);
166  }
167  else
168  {
169  st = new ScoreTerm();
170  }
171  }
172  while (enumerator.Next());
173  }
174  finally
175  {
176  enumerator.Close();
177  }
178 
179  BooleanQuery query = new BooleanQuery(true);
180  foreach(ScoreTerm st in stQueue.Keys)
181  {
182  TermQuery tq = new TermQuery(st.term); // found a match
183  tq.Boost = Boost * st.score; // set the boost
184  query.Add(tq, Occur.SHOULD); // add to query
185  }
186 
187  return query;
188  }
189 
190  public override System.String ToString(System.String field)
191  {
192  System.Text.StringBuilder buffer = new System.Text.StringBuilder();
193  if (!Term.Field.Equals(field))
194  {
195  buffer.Append(Term.Field);
196  buffer.Append(":");
197  }
198  buffer.Append(Term.Text);
199  buffer.Append('~');
200  buffer.Append(Single.ToString(minimumSimilarity));
201  buffer.Append(ToStringUtils.Boost(Boost));
202  return buffer.ToString();
203  }
204 
205  protected internal class ScoreTerm : IComparable<ScoreTerm>
206  {
207  public Term term;
208  public float score;
209 
210  public int CompareTo(ScoreTerm other)
211  {
212  if (Comparer<float>.Default.Compare(this.score, other.score) == 0)
213  {
214  return other.term.CompareTo(this.term);
215  }
216  else
217  {
218  return Comparer<float>.Default.Compare(this.score, other.score);
219  }
220  }
221  }
222 
223  public override int GetHashCode()
224  {
225  int prime = 31;
226  int result = base.GetHashCode();
227  result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(minimumSimilarity), 0);
228  result = prime * result + prefixLength;
229  result = prime * result + ((Term == null)?0:Term.GetHashCode());
230  return result;
231  }
232 
233  public override bool Equals(System.Object obj)
234  {
235  if (this == obj)
236  return true;
237  if (!base.Equals(obj))
238  return false;
239  if (GetType() != obj.GetType())
240  return false;
241  FuzzyQuery other = (FuzzyQuery) obj;
242  if (BitConverter.ToInt32(BitConverter.GetBytes(minimumSimilarity), 0) != BitConverter.ToInt32(BitConverter.GetBytes(other.minimumSimilarity), 0))
243  return false;
244  if (prefixLength != other.prefixLength)
245  return false;
246  if (Term == null)
247  {
248  if (other.Term != null)
249  return false;
250  }
251  else if (!Term.Equals(other.Term))
252  return false;
253  return true;
254  }
255  }
256 }