Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
SpellChecker.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 
19 using System.Collections.Generic;
20 
21 namespace SpellChecker.Net.Search.Spell
22 {
23  using System;
24 
25  using Lucene.Net.Search;
26  using Lucene.Net.Store;
27  using BooleanClause = Lucene.Net.Search.BooleanClause;
28  using BooleanQuery = Lucene.Net.Search.BooleanQuery;
29  using Directory = Lucene.Net.Store.Directory;
30  using Document = Lucene.Net.Documents.Document;
31  using Field = Lucene.Net.Documents.Field;
32  using IndexReader = Lucene.Net.Index.IndexReader;
33  using IndexSearcher = Lucene.Net.Search.IndexSearcher;
34  using IndexWriter = Lucene.Net.Index.IndexWriter;
35  using Query = Lucene.Net.Search.Query;
36  using Term = Lucene.Net.Index.Term;
37  using TermQuery = Lucene.Net.Search.TermQuery;
38  using WhitespaceAnalyzer = Lucene.Net.Analysis.WhitespaceAnalyzer;
39 
40 
41  /// <summary> <p>
42  /// Spell Checker class (Main class) <br/>
43  /// (initially inspired by the David Spencer code).
44  /// </p>
45  ///
46  /// <p>Example Usage:</p>
47  ///
48  /// <pre>
49  /// SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
50  /// // To index a field of a user index:
51  /// spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
52  /// // To index a file containing words:
53  /// spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
54  /// String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
55  /// </pre>
56  ///
57  /// </summary>
58  /// <author> Nicolas Maisonneuve
59  /// </author>
60  /// <version> 1.0
61  /// </version>
62  public class SpellChecker : IDisposable
63  {
64  /// <summary> Field name for each word in the ngram index.</summary>
65  public const System.String F_WORD = "word";
66  private readonly Term F_WORD_TERM = new Term(F_WORD);
67 
68  /// <summary> the spell index</summary>
69  internal Directory spellindex;
70 
71  /// <summary> Boost value for start and end grams</summary>
72  private const float bStart = 2.0f;
73  private const float bEnd = 1.0f;
74 
75  // don't use this searcher directly - see #swapSearcher()
76  private IndexSearcher searcher;
77 
78  /// <summary>
79  /// this locks all modifications to the current searcher.
80  /// </summary>
81  private static readonly System.Object searcherLock = new System.Object();
82 
83  /*
84  * this lock synchronizes all possible modifications to the
85  * current index directory. It should not be possible to try modifying
86  * the same index concurrently. Note: Do not acquire the searcher lock
87  * before acquiring this lock!
88  */
89  private static readonly System.Object modifyCurrentIndexLock = new System.Object();
90  private volatile bool closed = false;
91 
92  internal float minScore = 0.5f; //LUCENENET-359 Spellchecker accuracy gets overwritten
93 
94  private StringDistance sd;
95 
96  /// <summary>
97  /// Use the given directory as a spell checker index. The directory
98  /// is created if it doesn't exist yet.
99  /// </summary>
100  /// <param name="spellIndex">the spell index directory</param>
101  /// <param name="sd">the <see cref="StringDistance"/> measurement to use </param>
102  public SpellChecker(Directory spellIndex, StringDistance sd)
103  {
104  this.SetSpellIndex(spellIndex);
105  this.setStringDistance(sd);
106  }
107 
108  /// <summary>
109  /// Use the given directory as a spell checker index with a
110  /// <see cref="LevenshteinDistance"/> as the default <see cref="StringDistance"/>. The
111  /// directory is created if it doesn't exist yet.
112  /// </summary>
113  /// <param name="spellIndex">the spell index directory</param>
114  public SpellChecker(Directory spellIndex)
115  : this(spellIndex, new LevenshteinDistance())
116  { }
117 
118  /// <summary>
119  /// Use a different index as the spell checker index or re-open
120  /// the existing index if <c>spellIndex</c> is the same value
121  /// as given in the constructor.
122  /// </summary>
123  /// <param name="spellIndexDir">spellIndexDir the spell directory to use </param>
124  /// <throws>AlreadyClosedException if the Spellchecker is already closed</throws>
125  /// <throws>IOException if spellchecker can not open the directory</throws>
126  virtual public void SetSpellIndex(Directory spellIndexDir)
127  {
128  // this could be the same directory as the current spellIndex
129  // modifications to the directory should be synchronized
130  lock (modifyCurrentIndexLock)
131  {
132  EnsureOpen();
133  if (!IndexReader.IndexExists(spellIndexDir))
134  {
135  var writer = new IndexWriter(spellIndexDir, null, true,
136  IndexWriter.MaxFieldLength.UNLIMITED);
137  writer.Close();
138  }
139  SwapSearcher(spellIndexDir);
140  }
141  }
142 
143  /// <summary>
144  /// Sets the <see cref="StringDistance"/> implementation for this
145  /// <see cref="SpellChecker"/> instance.
146  /// </summary>
147  /// <param name="sd">the <see cref="StringDistance"/> implementation for this
148  /// <see cref="SpellChecker"/> instance.</param>
149  public void setStringDistance(StringDistance sd)
150  {
151  this.sd = sd;
152  }
153 
154  /// <summary>
155  /// Returns the <see cref="StringDistance"/> instance used by this
156  /// <see cref="SpellChecker"/> instance.
157  /// </summary>
158  /// <returns>
159  /// Returns the <see cref="StringDistance"/> instance used by this
160  /// <see cref="SpellChecker"/> instance.
161  /// </returns>
162  public StringDistance GetStringDistance()
163  {
164  return sd;
165  }
166 
167 
168  /// <summary> Set the accuracy 0 &lt; min &lt; 1; default 0.5</summary>
169  virtual public void SetAccuracy(float minScore)
170  {
171  this.minScore = minScore;
172  }
173 
174  /// <summary> Suggest similar words</summary>
175  /// <param name="word">String the word you want a spell check done on
176  /// </param>
177  /// <param name="num_sug">int the number of suggest words
178  /// </param>
179  /// <throws> IOException </throws>
180  /// <returns> String[]
181  /// </returns>
182  public virtual System.String[] SuggestSimilar(System.String word, int num_sug)
183  {
184  return this.SuggestSimilar(word, num_sug, null, null, false);
185  }
186 
187 
188  /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
189  /// <param name="word">String the word you want a spell check done on
190  /// </param>
191  /// <param name="numSug">int the number of suggest words
192  /// </param>
193  /// <param name="ir">the indexReader of the user index (can be null see field param)
194  /// </param>
195  /// <param name="field">String the field of the user index: if field is not null, the suggested
196  /// words are restricted to the words present in this field.
197  /// </param>
198  /// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
199  /// (only if restricted mode = (indexReader!=null and field!=null)
200  /// </param>
201  /// <throws> IOException </throws>
202  /// <returns> String[] the sorted list of the suggest words with this 2 criteria:
203  /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
204  /// of the suggest words in the field of the user index
205  /// </returns>
206  public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
207  { // obtainSearcher calls ensureOpen
208  IndexSearcher indexSearcher = ObtainSearcher();
209  try
210  {
211  float min = this.minScore;
212  int lengthWord = word.Length;
213 
214  int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
215  int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
216  // if the word exists in the real index and we don't care for word frequency, return the word itself
217  if (!morePopular && freq > 0)
218  {
219  return new String[] { word };
220  }
221 
222  var query = new BooleanQuery();
223  String[] grams;
224  String key;
225 
226  var alreadySeen = new HashSet<string>();
227  for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
228  {
229  key = "gram" + ng; // form key
230 
231  grams = FormGrams(word, ng); // form word into ngrams (allow dups too)
232 
233  if (grams.Length == 0)
234  {
235  continue; // hmm
236  }
237 
238  if (bStart > 0)
239  { // should we boost prefixes?
240  Add(query, "start" + ng, grams[0], bStart); // matches start of word
241 
242  }
243  if (bEnd > 0)
244  { // should we boost suffixes
245  Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
246 
247  }
248  for (int i = 0; i < grams.Length; i++)
249  {
250  Add(query, key, grams[i]);
251  }
252  }
253 
254  int maxHits = 10 * numSug;
255 
256  // System.out.println("Q: " + query);
257  ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
258  // System.out.println("HITS: " + hits.length());
259  SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
260 
261  // go thru more than 'maxr' matches in case the distance filter triggers
262  int stop = Math.Min(hits.Length, maxHits);
263  SuggestWord sugWord = new SuggestWord();
264  for (int i = 0; i < stop; i++)
265  {
266  sugWord.termString = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word
267 
268  // don't suggest a word for itself, that would be silly
269  if (sugWord.termString.Equals(word))
270  {
271  continue;
272  }
273 
274  // edit distance
275  sugWord.score = sd.GetDistance(word, sugWord.termString);
276  if (sugWord.score < min)
277  {
278  continue;
279  }
280 
281  if (ir != null && field != null)
282  { // use the user index
283  sugWord.freq = ir.DocFreq(new Term(field, sugWord.termString)); // freq in the index
284  // don't suggest a word that is not present in the field
285  if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
286  {
287  continue;
288  }
289  }
290 
291  if (alreadySeen.Add(sugWord.termString) == false) // we already seen this word, no point returning it twice
292  continue;
293 
294  sugQueue.InsertWithOverflow(sugWord);
295  if (sugQueue.Size() == numSug)
296  {
297  // if queue full, maintain the minScore score
298  min = ((SuggestWord)sugQueue.Top()).score;
299  }
300  sugWord = new SuggestWord();
301  }
302 
303  // convert to array string
304  String[] list = new String[sugQueue.Size()];
305  for (int i = sugQueue.Size() - 1; i >= 0; i--)
306  {
307  list[i] = ((SuggestWord)sugQueue.Pop()).termString;
308  }
309 
310  return list;
311  }
312  finally
313  {
314  ReleaseSearcher(indexSearcher);
315  }
316 
317  }
318 
319 
320  /// <summary> Add a clause to a boolean query.</summary>
321  private static void Add(BooleanQuery q, System.String k, System.String v, float boost)
322  {
323  Query tq = new TermQuery(new Term(k, v));
324  tq.Boost = boost;
325  q.Add(new BooleanClause(tq, Occur.SHOULD));
326  }
327 
328 
329  /// <summary> Add a clause to a boolean query.</summary>
330  private static void Add(BooleanQuery q, System.String k, System.String v)
331  {
332  q.Add(new BooleanClause(new TermQuery(new Term(k, v)), Occur.SHOULD));
333  }
334 
335 
336  /// <summary> Form all ngrams for a given word.</summary>
337  /// <param name="text">the word to parse
338  /// </param>
339  /// <param name="ng">the ngram length e.g. 3
340  /// </param>
341  /// <returns> an array of all ngrams in the word and note that duplicates are not removed
342  /// </returns>
343  private static System.String[] FormGrams(System.String text, int ng)
344  {
345  int len = text.Length;
346  System.String[] res = new System.String[len - ng + 1];
347  for (int i = 0; i < len - ng + 1; i++)
348  {
349  res[i] = text.Substring(i, (i + ng) - (i));
350  }
351  return res;
352  }
353 
354  /// <summary>
355  /// Removes all terms from the spell check index.
356  /// </summary>
357  public virtual void ClearIndex()
358  {
359  lock (modifyCurrentIndexLock)
360  {
361  EnsureOpen();
362  Directory dir = this.spellindex;
363  IndexWriter writer = new IndexWriter(dir, null, true, IndexWriter.MaxFieldLength.UNLIMITED);
364  writer.Close();
365  SwapSearcher(dir);
366  }
367  }
368 
369 
370  /// <summary> Check whether the word exists in the index.</summary>
371  /// <param name="word">String
372  /// </param>
373  /// <throws> IOException </throws>
374  /// <returns> true iff the word exists in the index
375  /// </returns>
376  public virtual bool Exist(System.String word)
377  {
378  // obtainSearcher calls ensureOpen
379  IndexSearcher indexSearcher = ObtainSearcher();
380  try
381  {
382  return indexSearcher.DocFreq(F_WORD_TERM.CreateTerm(word)) > 0;
383  }
384  finally
385  {
386  ReleaseSearcher(indexSearcher);
387  }
388  }
389 
390 
391  /// <summary> Index a Dictionary</summary>
392  /// <param name="dict">the dictionary to index</param>
393  /// <param name="mergeFactor">mergeFactor to use when indexing</param>
394  /// <param name="ramMB">the max amount or memory in MB to use</param>
395  /// <throws> IOException </throws>
396  /// <throws>AlreadyClosedException if the Spellchecker is already closed</throws>
397  public virtual void IndexDictionary(IDictionary dict, int mergeFactor, int ramMB)
398  {
399  lock (modifyCurrentIndexLock)
400  {
401  EnsureOpen();
402  Directory dir = this.spellindex;
403  IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
404  writer.MergeFactor = mergeFactor;
405  writer.SetMaxBufferedDocs(ramMB);
406 
407  System.Collections.IEnumerator iter = dict.GetWordsIterator();
408  while (iter.MoveNext())
409  {
410  System.String word = (System.String)iter.Current;
411 
412  int len = word.Length;
413  if (len < 3)
414  {
415  continue; // too short we bail but "too long" is fine...
416  }
417 
418  if (this.Exist(word))
419  {
420  // if the word already exist in the gramindex
421  continue;
422  }
423 
424  // ok index the word
425  Document doc = CreateDocument(word, GetMin(len), GetMax(len));
426  writer.AddDocument(doc);
427  }
428  // close writer
429  writer.Optimize();
430  writer.Close();
431  // also re-open the spell index to see our own changes when the next suggestion
432  // is fetched:
433  SwapSearcher(dir);
434  }
435  }
436 
437  /// <summary>
438  /// Indexes the data from the given <see cref="IDictionary"/>.
439  /// </summary>
440  /// <param name="dict">dict the dictionary to index</param>
441  public void IndexDictionary(IDictionary dict)
442  {
443  IndexDictionary(dict, 300, 10);
444  }
445 
446  private int GetMin(int l)
447  {
448  if (l > 5)
449  {
450  return 3;
451  }
452  if (l == 5)
453  {
454  return 2;
455  }
456  return 1;
457  }
458 
459 
460  private int GetMax(int l)
461  {
462  if (l > 5)
463  {
464  return 4;
465  }
466  if (l == 5)
467  {
468  return 3;
469  }
470  return 2;
471  }
472 
473 
474  private static Document CreateDocument(System.String text, int ng1, int ng2)
475  {
476  Document doc = new Document();
477  doc.Add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
478  AddGram(text, doc, ng1, ng2);
479  return doc;
480  }
481 
482 
483  private static void AddGram(System.String text, Document doc, int ng1, int ng2)
484  {
485  int len = text.Length;
486  for (int ng = ng1; ng <= ng2; ng++)
487  {
488  System.String key = "gram" + ng;
489  System.String end = null;
490  for (int i = 0; i < len - ng + 1; i++)
491  {
492  System.String gram = text.Substring(i, (i + ng) - (i));
493  doc.Add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
494  if (i == 0)
495  {
496  doc.Add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
497  }
498  end = gram;
499  }
500  if (end != null)
501  {
502  // may not be present if len==ng1
503  doc.Add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
504  }
505  }
506  }
507 
508  private IndexSearcher ObtainSearcher()
509  {
510  lock (searcherLock)
511  {
512  EnsureOpen();
513  searcher.IndexReader.IncRef();
514  return searcher;
515  }
516  }
517 
518  private void ReleaseSearcher(IndexSearcher aSearcher)
519  {
520  // don't check if open - always decRef
521  // don't decrement the private searcher - could have been swapped
522  aSearcher.IndexReader.DecRef();
523  }
524 
525  private void EnsureOpen()
526  {
527  if (closed)
528  {
529  throw new AlreadyClosedException("Spellchecker has been closed");
530  }
531  }
532 
533  public void Close()
534  {
535  lock (searcherLock)
536  {
537  EnsureOpen();
538  closed = true;
539  if (searcher != null)
540  {
541  searcher.Close();
542  }
543  searcher = null;
544  }
545  }
546 
547  private void SwapSearcher(Directory dir)
548  {
549  /*
550  * opening a searcher is possibly very expensive.
551  * We rather close it again if the Spellchecker was closed during
552  * this operation than block access to the current searcher while opening.
553  */
554  IndexSearcher indexSearcher = CreateSearcher(dir);
555  lock (searcherLock)
556  {
557  if (closed)
558  {
559  indexSearcher.Close();
560  throw new AlreadyClosedException("Spellchecker has been closed");
561  }
562  if (searcher != null)
563  {
564  searcher.Close();
565  }
566  // set the spellindex in the sync block - ensure consistency.
567  searcher = indexSearcher;
568  this.spellindex = dir;
569  }
570  }
571 
572  /// <summary>
573  /// Creates a new read-only IndexSearcher (for testing purposes)
574  /// </summary>
575  /// <param name="dir">dir the directory used to open the searcher</param>
576  /// <returns>a new read-only IndexSearcher. (throws IOException f there is a low-level IO error)</returns>
577  public virtual IndexSearcher CreateSearcher(Directory dir)
578  {
579  return new IndexSearcher(dir, true);
580  }
581 
582  /// <summary>
583  /// Returns <c>true</c> if and only if the <see cref="SpellChecker"/> is
584  /// closed, otherwise <c>false</c>.
585  /// </summary>
586  /// <returns><c>true</c> if and only if the <see cref="SpellChecker"/> is
587  /// closed, otherwise <c>false</c>.
588  ///</returns>
589  bool IsClosed()
590  {
591  return closed;
592  }
593 
594  ~SpellChecker()
595  {
596  this.Dispose(false);
597  }
598 
599  public void Dispose()
600  {
601  this.Dispose(true);
602  GC.SuppressFinalize(this);
603  }
604 
605  protected void Dispose(bool disposeOfManagedResources)
606  {
607  if (disposeOfManagedResources)
608  {
609  if (!this.closed)
610  this.Close();
611  }
612  }
613  }
614 }