Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
NGramDistance.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 
19 using System;
20 using System.Collections.Generic;
21 using System.Text;
22 
23 namespace SpellChecker.Net.Search.Spell
24 {
26  {
27  private int n;
28 
33  public NGramDistance(int size)
34  {
35  this.n = size;
36  }
37 
41  public NGramDistance()
42  : this(2)
43  {
44  }
45 
46  public float GetDistance(String source, String target)
47  {
48  int sl = source.Length;
49  int tl = target.Length;
50 
51  if (sl == 0 || tl == 0)
52  {
53  if (sl == tl)
54  {
55  return 1;
56  }
57  else
58  {
59  return 0;
60  }
61  }
62 
63  int cost = 0;
64  if (sl < n || tl < n)
65  {
66  for (int ii = 0, ni = Math.Min(sl, tl); ii < ni; ii++)
67  {
68  if (source[ii] == target[ii])
69  {
70  cost++;
71  }
72  }
73  return (float)cost / Math.Max(sl, tl);
74  }
75 
76  char[] sa = new char[sl + n - 1];
77  float[] p; //'previous' cost array, horizontally
78  float[] d; // cost array, horizontally
79  float[] _d; //placeholder to assist in swapping p and d
80 
81  //construct sa with prefix
82  for (int ii = 0; ii < sa.Length; ii++)
83  {
84  if (ii < n - 1)
85  {
86  sa[ii] = (char)0; //add prefix
87  }
88  else
89  {
90  sa[ii] = source[ii - n + 1];
91  }
92  }
93  p = new float[sl + 1];
94  d = new float[sl + 1];
95 
96  // indexes into strings s and t
97  int i; // iterates through source
98  int j; // iterates through target
99 
100  char[] t_j = new char[n]; // jth n-gram of t
101 
102  for (i = 0; i <= sl; i++)
103  {
104  p[i] = i;
105  }
106 
107  for (j = 1; j <= tl; j++)
108  {
109  //construct t_j n-gram
110  if (j < n)
111  {
112  for (int ti = 0; ti < n - j; ti++)
113  {
114  t_j[ti] = (char)0; //add prefix
115  }
116  for (int ti = n - j; ti < n; ti++)
117  {
118  t_j[ti] = target[ti - (n - j)];
119  }
120  }
121  else
122  {
123  t_j = target.Substring(j - n, n).ToCharArray();
124  }
125  d[0] = j;
126  for (i = 1; i <= sl; i++)
127  {
128  cost = 0;
129  int tn = n;
130  //compare sa to t_j
131  for (int ni = 0; ni < n; ni++)
132  {
133  if (sa[i - 1 + ni] != t_j[ni])
134  {
135  cost++;
136  }
137  else if (sa[i - 1 + ni] == 0)
138  { //discount matches on prefix
139  tn--;
140  }
141  }
142  float ec = (float)cost / tn;
143  // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
144  d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
145  }
146  // copy current distance counts to 'previous row' distance counts
147  _d = p;
148  p = d;
149  d = _d;
150  }
151 
152  // our last action in the above loop was to switch d and p, so p now
153  // actually has the most recent cost counts
154  return 1.0f - ((float)p[sl] / Math.Max(tl, sl));
155  }
156 
157 
158  }
159 }