Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
LevenshteinDistance.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Text;
21 using SpellChecker.Net.Search.Spell;
22 
23 namespace SpellChecker.Net.Search.Spell
24 {
25  /// <summary>
26  /// Levenshtein edit distance
27  /// </summary>
29  {
30  /// <summary>
31  /// Returns a float between 0 and 1 based on how similar the specified strings are to one another.
32  /// Returning a value of 1 means the specified strings are identical and 0 means the
33  /// string are maximally different.
34  /// </summary>
35  /// <param name="target">The first string.</param>
36  /// <param name="other">The second string.</param>
37  /// <returns>a float between 0 and 1 based on how similar the specified strings are to one another.</returns>
38  public float GetDistance(String target, String other)
39  {
40  char[] sa;
41  int n;
42  int[] p; //'previous' cost array, horizontally
43  int[] d; // cost array, horizontally
44  int[] _d; //placeholder to assist in swapping p and d
45 
46  /*
47  The difference between this impl. and the previous is that, rather
48  than creating and retaining a matrix of size s.length()+1 by t.length()+1,
49  we maintain two single-dimensional arrays of length s.length()+1. The first, d,
50  is the 'current working' distance array that maintains the newest distance cost
51  counts as we iterate through the characters of String s. Each time we increment
52  the index of String t we are comparing, d is copied to p, the second int[]. Doing so
53  allows us to retain the previous cost counts as required by the algorithm (taking
54  the minimum of the cost count to the left, up one, and diagonally up and to the left
55  of the current cost count being calculated). (Note that the arrays aren't really
56  copied anymore, just switched...this is clearly much better than cloning an array
57  or doing a System.arraycopy() each time through the outer loop.)
58 
59  Effectively, the difference between the two implementations is this one does not
60  cause an out of memory condition when calculating the LD over two very large strings.
61  */
62 
63  sa = target.ToCharArray();
64  n = sa.Length;
65  p = new int[n + 1];
66  d = new int[n + 1];
67  int m = other.Length;
68 
69  if (n == 0 || m == 0)
70  {
71  if (n == m)
72  {
73  return 1;
74  }
75  else
76  {
77  return 0;
78  }
79  }
80 
81 
82  // indexes into strings s and t
83  int i; // iterates through s
84  int j; // iterates through t
85 
86  char t_j; // jth character of t
87 
88  int cost; // cost
89 
90  for (i = 0; i <= n; i++)
91  {
92  p[i] = i;
93  }
94 
95  for (j = 1; j <= m; j++)
96  {
97  t_j = other[j - 1];
98  d[0] = j;
99 
100  for (i = 1; i <= n; i++)
101  {
102  cost = sa[i - 1] == t_j ? 0 : 1;
103  // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
104  d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
105  }
106 
107  // copy current distance counts to 'previous row' distance counts
108  _d = p;
109  p = d;
110  d = _d;
111  }
112 
113  // our last action in the above loop was to switch d and p, so p now
114  // actually has the most recent cost counts
115  return 1.0f - ((float)p[n] / Math.Max(other.Length, sa.Length));
116  }
117  }
118 }