Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
SimilarityQueries.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using Lucene.Net.Analysis;
21 using Lucene.Net.Index;
22 using Lucene.Net.Analysis.Tokenattributes;
23 
24 namespace Lucene.Net.Search.Similar
25 {
26 
/// <summary>
/// Simple similarity measures.
/// </summary>
/// <seealso cref="Lucene.Net.Search.Similar.MoreLikeThis">
/// </seealso>
public sealed class SimilarityQueries
{
    /// <summary>
    /// Private constructor: this class only exposes static members and is
    /// not meant to be instantiated.
    /// </summary>
    private SimilarityQueries()
    {
    }

    /// <summary> Simple similarity query generator.
    /// Takes every unique word and forms a boolean query where all words are optional.
    /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
    /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
    /// need to then ignore that.
    ///
    /// <p/>
    ///
    /// So, if you have a code fragment like this:
    /// <br/>
    /// <code>
    /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
    /// </code>
    ///
    /// <p/>
    ///
    /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
    ///
    /// <p/>
    /// The philosophy behind this method is "two documents are similar if they share lots of words".
    /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher
    /// similarity score if they share more uncommon words.
    ///
    /// <p/>
    /// This method is fail-safe in that if a long 'body' is passed in and
    /// <see cref="BooleanQuery.Add"/> (used internally)
    /// throws
    /// <see cref="BooleanQuery.TooManyClauses"/>, the
    /// query as it is will be returned.
    /// </summary>
    /// <param name="body">the body of the document you want to find similar documents to
    /// </param>
    /// <param name="a">the analyzer to use to parse the body
    /// </param>
    /// <param name="field">the field you want to search on, probably something like "contents" or "body"
    /// </param>
    /// <param name="stop">optional set of stop words to ignore; may be <c>null</c>
    /// </param>
    /// <returns> a query with all unique words in 'body'
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="body"/> or <paramref name="a"/> is <c>null</c>.
    /// </exception>
    /// <remarks>
    /// The body is read through an in-memory <see cref="System.IO.StringReader"/>, so an
    /// I/O failure while tokenizing is not expected.
    /// </remarks>
    public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
    {
        // Fail fast with a descriptive exception instead of a NullReferenceException
        // (or an ArgumentNullException naming StringReader's own parameter) further down.
        if (body == null)
        {
            throw new ArgumentNullException("body");
        }
        if (a == null)
        {
            throw new ArgumentNullException("a");
        }

        BooleanQuery tmp = new BooleanQuery();
        ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups

        // Dispose the token stream (and the reader it wraps) even when tokenizing throws.
        using (TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)))
        {
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

            while (ts.IncrementToken())
            {
                String word = termAtt.Term;

                // ignore optional stop words
                if (stop != null && stop.Contains(word))
                {
                    continue;
                }

                // ignore dups: Add returns false when the word has been seen before
                if (!already.Add(word))
                {
                    continue;
                }

                // add to query as an optional clause
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
        }

        return tmp;
    }
}
112 }