Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
FuzzyTermEnum.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 using IndexReader = Lucene.Net.Index.IndexReader;
21 using Term = Lucene.Net.Index.Term;
22 
23 namespace Lucene.Net.Search
24 {
25 
26  /// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar
27  /// to the specified filter term.
28  ///
29  /// <p/>Term enumerations are always ordered by Term.compareTo(). Each term in
30  /// the enumeration is greater than all that precede it.
31  /// </summary>
32  public sealed class FuzzyTermEnum:FilteredTermEnum
33  {
34  /* Allows us save time required to create a new array
35  * everytime similarity is called.
36  */
37  private int[] p;
38  private int[] d;
39 
40  private float similarity;
41  private bool endEnum = false;
42 
43  private bool isDisposed;
44 
45  private Term searchTerm = null;
46  private System.String field;
47  private System.String text;
48  private System.String prefix;
49 
50  private float minimumSimilarity;
51  private float scale_factor;
52 
53  /// <summary> Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
54  /// <p/>
55  /// After calling the constructor the enumeration is already pointing to the first
56  /// valid term if such a term exists.
57  ///
58  /// </summary>
59  /// <param name="reader">
60  /// </param>
61  /// <param name="term">
62  /// </param>
63  /// <throws> IOException </throws>
64  /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
65  /// </seealso>
66  public FuzzyTermEnum(IndexReader reader, Term term):this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength)
67  {
68  }
69 
70  /// <summary> Creates a FuzzyTermEnum with an empty prefix.
71  /// <p/>
72  /// After calling the constructor the enumeration is already pointing to the first
73  /// valid term if such a term exists.
74  ///
75  /// </summary>
76  /// <param name="reader">
77  /// </param>
78  /// <param name="term">
79  /// </param>
80  /// <param name="minSimilarity">
81  /// </param>
82  /// <throws> IOException </throws>
83  /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
84  /// </seealso>
85  public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity):this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength)
86  {
87  }
88 
89  /// <summary> Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
90  /// length <c>prefixLength</c> with <c>term</c> and which have a fuzzy similarity &gt;
91  /// <c>minSimilarity</c>.
92  /// <p/>
93  /// After calling the constructor the enumeration is already pointing to the first
94  /// valid term if such a term exists.
95  ///
96  /// </summary>
97  /// <param name="reader">Delivers terms.
98  /// </param>
99  /// <param name="term">Pattern term.
100  /// </param>
101  /// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
102  /// </param>
103  /// <param name="prefixLength">Length of required common prefix. Default value is 0.
104  /// </param>
105  /// <throws> IOException </throws>
106  public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength):base()
107  {
108 
109  if (minSimilarity >= 1.0f)
110  throw new System.ArgumentException("minimumSimilarity cannot be greater than or equal to 1");
111  else if (minSimilarity < 0.0f)
112  throw new System.ArgumentException("minimumSimilarity cannot be less than 0");
113  if (prefixLength < 0)
114  throw new System.ArgumentException("prefixLength cannot be less than 0");
115 
116  this.minimumSimilarity = minSimilarity;
117  this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
118  this.searchTerm = term;
119  this.field = searchTerm.Field;
120 
121  //The prefix could be longer than the word.
122  //It's kind of silly though. It means we must match the entire word.
123  int fullSearchTermLength = searchTerm.Text.Length;
124  int realPrefixLength = prefixLength > fullSearchTermLength?fullSearchTermLength:prefixLength;
125 
126  this.text = searchTerm.Text.Substring(realPrefixLength);
127  this.prefix = searchTerm.Text.Substring(0, (realPrefixLength) - (0));
128 
129  this.p = new int[this.text.Length + 1];
130  this.d = new int[this.text.Length + 1];
131 
132  SetEnum(reader.Terms(new Term(searchTerm.Field, prefix)));
133  }
134 
135  /// <summary> The termCompare method in FuzzyTermEnum uses Levenshtein distance to
136  /// calculate the distance between the given term and the comparing term.
137  /// </summary>
138  protected internal override bool TermCompare(Term term)
139  {
140  if ((System.Object) field == (System.Object) term.Field && term.Text.StartsWith(prefix))
141  {
142  System.String target = term.Text.Substring(prefix.Length);
143  this.similarity = Similarity(target);
144  return (similarity > minimumSimilarity);
145  }
146  endEnum = true;
147  return false;
148  }
149 
150  public override float Difference()
151  {
152  return ((similarity - minimumSimilarity) * scale_factor);
153  }
154 
155  public override bool EndEnum()
156  {
157  return endEnum;
158  }
159 
160  // <summary>
161  // ***************************
162  // Compute Levenshtein distance
163  // ****************************
164  // </summary>
165 
166  /// <summary> <p/>Similarity returns a number that is 1.0f or less (including negative numbers)
167  /// based on how similar the Term is compared to a target term. It returns
168  /// exactly 0.0f when
169  /// <c>
170  /// editDistance &gt; maximumEditDistance</c>
171  /// Otherwise it returns:
172  /// <c>
173  /// 1 - (editDistance / length)</c>
174  /// where length is the length of the shortest term (text or target) including a
175  /// prefix that are identical and editDistance is the Levenshtein distance for
176  /// the two words.<p/>
177  ///
178  /// <p/>Embedded within this algorithm is a fail-fast Levenshtein distance
179  /// algorithm. The fail-fast algorithm differs from the standard Levenshtein
180  /// distance algorithm in that it is aborted if it is discovered that the
181  /// mimimum distance between the words is greater than some threshold.
182  ///
183  /// <p/>To calculate the maximum distance threshold we use the following formula:
184  /// <c>
185  /// (1 - minimumSimilarity) * length</c>
186  /// where length is the shortest term including any prefix that is not part of the
187  /// similarity comparision. This formula was derived by solving for what maximum value
188  /// of distance returns false for the following statements:
189  /// <code>
190  /// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
191  /// return (similarity > minimumSimilarity);</code>
192  /// where distance is the Levenshtein distance for the two words.
193  /// <p/>
194  /// <p/>Levenshtein distance (also known as edit distance) is a measure of similiarity
195  /// between two strings where the distance is measured as the number of character
196  /// deletions, insertions or substitutions required to transform one string to
197  /// the other string.
198  /// </summary>
199  /// <param name="target">the target word or phrase
200  /// </param>
201  /// <returns> the similarity, 0.0 or less indicates that it matches less than the required
202  /// threshold and 1.0 indicates that the text and target are identical
203  /// </returns>
204  private float Similarity(System.String target)
205  {
206 
207  int m = target.Length;
208  int n = text.Length;
209  if (n == 0)
210  {
211  //we don't have anything to compare. That means if we just add
212  //the letters for m we get the new word
213  return prefix.Length == 0 ? 0.0f : 1.0f - ((float)m / prefix.Length);
214  }
215  if (m == 0)
216  {
217  return prefix.Length == 0 ? 0.0f : 1.0f - ((float)n / prefix.Length);
218  }
219 
220  int maxDistance = CalculateMaxDistance(m);
221 
222  if (maxDistance < System.Math.Abs(m - n))
223  {
224  //just adding the characters of m to n or vice-versa results in
225  //too many edits
226  //for example "pre" length is 3 and "prefixes" length is 8. We can see that
227  //given this optimal circumstance, the edit distance cannot be less than 5.
228  //which is 8-3 or more precisesly Math.abs(3-8).
229  //if our maximum edit distance is 4, then we can discard this word
230  //without looking at it.
231  return 0.0f;
232  }
233 
234  // init matrix d
235  for (int i = 0; i < n; ++i)
236  {
237  p[i] = i;
238  }
239 
240  // start computing edit distance
241  for (int j = 1; j <= m; ++j)
242  {
243  int bestPossibleEditDistance = m;
244  char t_j = target[j - 1];
245  d[0] = j;
246  for (int i = 1; i <= n; ++i)
247  {
248  // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
249  if (t_j != text[i - 1])
250  {
251  d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
252  }
253  else
254  {
255  d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
256  }
257  bestPossibleEditDistance = System.Math.Min(bestPossibleEditDistance, d[i]);
258  }
259 
260  //After calculating row i, the best possible edit distance
261  //can be found by found by finding the smallest value in a given column.
262  //If the bestPossibleEditDistance is greater than the max distance, abort.
263 
264  if (j > maxDistance && bestPossibleEditDistance > maxDistance)
265  {
266  //equal is okay, but not greater
267  //the closest the target can be to the text is just too far away.
268  //this target is leaving the party early.
269  return 0.0f;
270  }
271 
272  // copy current distance counts to 'previous row' distance counts: swap p and d
273  int[] _d = p;
274  p = d;
275  d = _d;
276  }
277 
278  // our last action in the above loop was to switch d and p, so p now
279  // actually has the most recent cost counts
280 
281  // this will return less than 0.0 when the edit distance is
282  // greater than the number of characters in the shorter word.
283  // but this was the formula that was previously used in FuzzyTermEnum,
284  // so it has not been changed (even though minimumSimilarity must be
285  // greater than 0.0)
286  return 1.0f - (p[n] / (float)(prefix.Length + System.Math.Min(n, m)));
287 
288  }
289 
290  /// <summary> The max Distance is the maximum Levenshtein distance for the text
291  /// compared to some other value that results in score that is
292  /// better than the minimum similarity.
293  /// </summary>
294  /// <param name="m">the length of the "other value"
295  /// </param>
296  /// <returns> the maximum levenshtein distance that we care about
297  /// </returns>
298  private int CalculateMaxDistance(int m)
299  {
300  return (int) ((1 - minimumSimilarity) * (System.Math.Min(text.Length, m) + prefix.Length));
301  }
302 
303  protected override void Dispose(bool disposing)
304  {
305  if (isDisposed) return;
306 
307  if (disposing)
308  {
309  p = null;
310  d = null;
311  searchTerm = null;
312  }
313 
314  isDisposed = true;
315  base.Dispose(disposing); //call super.close() and let the garbage collector do its work.
316  }
317  }
318 }