Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
FuzzyTermEnum.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 using IndexReader = Lucene.Net.Index.IndexReader;
21 using Term = Lucene.Net.Index.Term;
22 
23 namespace Lucene.Net.Search
24 {
25 
32  public sealed class FuzzyTermEnum:FilteredTermEnum
33  {
34  /* Allows us save time required to create a new array
35  * everytime similarity is called.
36  */
37  private int[] p;
38  private int[] d;
39 
40  private float similarity;
41  private bool endEnum = false;
42 
43  private bool isDisposed;
44 
45  private Term searchTerm = null;
46  private System.String field;
47  private System.String text;
48  private System.String prefix;
49 
50  private float minimumSimilarity;
51  private float scale_factor;
52 
66  public FuzzyTermEnum(IndexReader reader, Term term):this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength)
67  {
68  }
69 
85  public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity):this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength)
86  {
87  }
88 
106  public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength):base()
107  {
108 
109  if (minSimilarity >= 1.0f)
110  throw new System.ArgumentException("minimumSimilarity cannot be greater than or equal to 1");
111  else if (minSimilarity < 0.0f)
112  throw new System.ArgumentException("minimumSimilarity cannot be less than 0");
113  if (prefixLength < 0)
114  throw new System.ArgumentException("prefixLength cannot be less than 0");
115 
116  this.minimumSimilarity = minSimilarity;
117  this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
118  this.searchTerm = term;
119  this.field = searchTerm.Field;
120 
121  //The prefix could be longer than the word.
122  //It's kind of silly though. It means we must match the entire word.
123  int fullSearchTermLength = searchTerm.Text.Length;
124  int realPrefixLength = prefixLength > fullSearchTermLength?fullSearchTermLength:prefixLength;
125 
126  this.text = searchTerm.Text.Substring(realPrefixLength);
127  this.prefix = searchTerm.Text.Substring(0, (realPrefixLength) - (0));
128 
129  this.p = new int[this.text.Length + 1];
130  this.d = new int[this.text.Length + 1];
131 
132  SetEnum(reader.Terms(new Term(searchTerm.Field, prefix)));
133  }
134 
138  protected internal override bool TermCompare(Term term)
139  {
140  if ((System.Object) field == (System.Object) term.Field && term.Text.StartsWith(prefix))
141  {
142  System.String target = term.Text.Substring(prefix.Length);
143  this.similarity = Similarity(target);
144  return (similarity > minimumSimilarity);
145  }
146  endEnum = true;
147  return false;
148  }
149 
150  public override float Difference()
151  {
152  return ((similarity - minimumSimilarity) * scale_factor);
153  }
154 
155  public override bool EndEnum()
156  {
157  return endEnum;
158  }
159 
160  // <summary>
161  // ***************************
162  // Compute Levenshtein distance
163  // ****************************
164  // </summary>
165 
204  private float Similarity(System.String target)
205  {
206 
207  int m = target.Length;
208  int n = text.Length;
209  if (n == 0)
210  {
211  //we don't have anything to compare. That means if we just add
212  //the letters for m we get the new word
213  return prefix.Length == 0 ? 0.0f : 1.0f - ((float)m / prefix.Length);
214  }
215  if (m == 0)
216  {
217  return prefix.Length == 0 ? 0.0f : 1.0f - ((float)n / prefix.Length);
218  }
219 
220  int maxDistance = CalculateMaxDistance(m);
221 
222  if (maxDistance < System.Math.Abs(m - n))
223  {
224  //just adding the characters of m to n or vice-versa results in
225  //too many edits
226  //for example "pre" length is 3 and "prefixes" length is 8. We can see that
227  //given this optimal circumstance, the edit distance cannot be less than 5.
228  //which is 8-3 or more precisesly Math.abs(3-8).
229  //if our maximum edit distance is 4, then we can discard this word
230  //without looking at it.
231  return 0.0f;
232  }
233 
234  // init matrix d
235  for (int i = 0; i < n; ++i)
236  {
237  p[i] = i;
238  }
239 
240  // start computing edit distance
241  for (int j = 1; j <= m; ++j)
242  {
243  int bestPossibleEditDistance = m;
244  char t_j = target[j - 1];
245  d[0] = j;
246  for (int i = 1; i <= n; ++i)
247  {
248  // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
249  if (t_j != text[i - 1])
250  {
251  d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
252  }
253  else
254  {
255  d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
256  }
257  bestPossibleEditDistance = System.Math.Min(bestPossibleEditDistance, d[i]);
258  }
259 
260  //After calculating row i, the best possible edit distance
261  //can be found by found by finding the smallest value in a given column.
262  //If the bestPossibleEditDistance is greater than the max distance, abort.
263 
264  if (j > maxDistance && bestPossibleEditDistance > maxDistance)
265  {
266  //equal is okay, but not greater
267  //the closest the target can be to the text is just too far away.
268  //this target is leaving the party early.
269  return 0.0f;
270  }
271 
272  // copy current distance counts to 'previous row' distance counts: swap p and d
273  int[] _d = p;
274  p = d;
275  d = _d;
276  }
277 
278  // our last action in the above loop was to switch d and p, so p now
279  // actually has the most recent cost counts
280 
281  // this will return less than 0.0 when the edit distance is
282  // greater than the number of characters in the shorter word.
283  // but this was the formula that was previously used in FuzzyTermEnum,
284  // so it has not been changed (even though minimumSimilarity must be
285  // greater than 0.0)
286  return 1.0f - (p[n] / (float)(prefix.Length + System.Math.Min(n, m)));
287 
288  }
289 
298  private int CalculateMaxDistance(int m)
299  {
300  return (int) ((1 - minimumSimilarity) * (System.Math.Min(text.Length, m) + prefix.Length));
301  }
302 
303  protected override void Dispose(bool disposing)
304  {
305  if (isDisposed) return;
306 
307  if (disposing)
308  {
309  p = null;
310  d = null;
311  searchTerm = null;
312  }
313 
314  isDisposed = true;
315  base.Dispose(disposing); //call super.close() and let the garbage collector do its work.
316  }
317  }
318 }