Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
SynExpand.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.IO;
21 using System.Linq;
22 using Lucene.Net.Analysis;
23 using Lucene.Net.Analysis.Standard;
24 using Lucene.Net.Analysis.Tokenattributes;
25 using Lucene.Net.Index;
26 using Lucene.Net.Search;
27 using Lucene.Net.Store;
28 
29 namespace WorldNet.Net
30 {
31 
32 
/// <summary>
/// Expand a query by looking up synonyms for every term.
/// You need to invoke <see cref="Syns2Index"/> first to build the synonym index.
/// </summary>
/// <remarks>
/// All working state (the set of words already emitted and the query under
/// construction) is local to each <see cref="Expand"/> call, so separate calls
/// do not share or overwrite each other's state.
/// </remarks>
/// <seealso cref="Syns2Index" />
public sealed class SynExpand
{
    /// <summary> Test driver for synonym expansion.
    /// Uses boost factor of 0.9 for illustrative purposes.
    ///
    /// If you pass in the query "big dog" then it prints out:
    ///
    /// <pre>
    /// Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9
    /// dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
    /// </pre>
    /// </summary>
    /// <param name="args">exactly two values: the path of the synonym index and the query text; otherwise a usage line is printed </param>
    [STAThread]
    public static void Main(String[] args)
    {
        if (args.Length != 2)
        {
            Console.Out.WriteLine(typeof(SynExpand) + " <index path> <query>");
            return;
        }

        var directory = FSDirectory.Open(new DirectoryInfo(args[0]));
        try
        {
            var searcher = new IndexSearcher(directory, true);
            try
            {
                String query = args[1];
                const string field = "contents";

                Query q = Expand(query, searcher, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT), field, 0.9f);
                Console.Out.WriteLine("Query: " + q.ToString(field));
            }
            finally
            {
                // Release the searcher even when expansion or printing throws.
                searcher.Close();
            }
        }
        finally
        {
            // The directory is closed last, after the searcher that reads from it.
            directory.Close();
        }
    }


    /// <summary>
    /// Perform synonym expansion on a query.
    /// </summary>
    /// <param name="query">users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser </param>
    /// <param name="syns">an opened searcher over the Lucene index you previously created with <see cref="Syns2Index"/>. The searcher is not closed by this method. </param>
    /// <param name="a">optional analyzer used to parse the users query else <see cref="StandardAnalyzer"/> is used </param>
    /// <param name="field">optional field name to search in or null if you want the default of "contents" </param>
    /// <param name="boost">optional boost applied to synonyms; a value that is not greater than zero leaves the default boost in place </param>
    /// <returns>the expanded Query: one SHOULD clause per distinct query word, followed by one SHOULD clause per distinct synonym found for it </returns>
    /// <exception cref="ArgumentNullException">if <paramref name="query"/> is null </exception>
    public static Query Expand(String query,
                               Searcher syns,
                               Analyzer a,
                               String field,
                               float boost)
    {
        if (query == null)
        {
            throw new ArgumentNullException("query");
        }

        if (field == null)
        {
            field = "contents";
        }

        if (a == null)
        {
            a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
        }

        var seen = new HashSet<String>(); // every word or synonym already emitted, to avoid dups
        var top = new List<String>();     // the query's own words, in first-seen order

        // [1] Parse query into separate words so that when we expand we can avoid dups
        var ts = a.TokenStream(field, new StringReader(query));
        try
        {
            var termAtt = ts.AddAttribute<TermAttribute>();

            while (ts.IncrementToken())
            {
                var word = termAtt.Term;

                // HashSet.Add returns false for a repeated word, so each word is listed once.
                if (seen.Add(word))
                {
                    top.Add(word);
                }
            }
        }
        finally
        {
            // Close the stream once all tokens are consumed, also on failure.
            ts.Close();
        }

        var expanded = new BooleanQuery();

        // The collector appends synonym clauses straight into 'expanded' and records
        // them in 'seen'; its reader is re-assigned by the searcher on every search.
        var collector = new CollectorImpl(field, boost, seen, expanded);

        // [2] form query
        foreach (var word in top)
        {
            // [2a] add the top level word itself
            expanded.Add(new TermQuery(new Term(field, word)), Occur.SHOULD);

            // [2b] add every not-yet-seen synonym stored for that word
            syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), collector);
        }

        return expanded;
    }


    /// <summary>
    /// From project WordNet.Net.Syns2Index
    /// </summary>
    public class Syns2Index
    {
        /// <summary> Name of the index field read for the synonyms of a word. </summary>
        public const String F_SYN = "syn";

        /// <summary> Name of the index field that is searched for the word itself. </summary>
        public const String F_WORD = "word";
    }

    /// <summary>
    /// Collector that, for every matching synonym-index document, appends each
    /// not-yet-seen value of the <see cref="Syns2Index.F_SYN"/> field to the
    /// target query as an optional (SHOULD) term clause.
    /// </summary>
    internal sealed class CollectorImpl : Collector
    {
        private IndexReader reader;
        private readonly string field;
        private readonly float boost;
        private readonly HashSet<String> seen;
        private readonly BooleanQuery target;

        /// <summary>
        /// Creates a collector bound to the state of one expansion.
        /// </summary>
        /// <param name="field">field name used for the generated synonym term queries </param>
        /// <param name="boost">boost set on each synonym clause when greater than zero </param>
        /// <param name="seen">words already present in the query; collected synonyms are added to it </param>
        /// <param name="target">query that receives the synonym clauses </param>
        public CollectorImpl(string field, float boost, HashSet<String> seen, BooleanQuery target)
        {
            this.field = field;
            this.boost = boost;
            this.seen = seen;
            this.target = target;
        }

        /// <summary> Scores are not used by this collector, so the scorer is ignored. </summary>
        public override void SetScorer(Scorer scorer)
        {
            // Ignore
        }

        /// <summary>
        /// Loads the matching document from the current reader and adds its unseen synonyms to the target query.
        /// </summary>
        /// <param name="doc">document number passed to the reader supplied by the last SetNextReader call </param>
        public override void Collect(int doc)
        {
            var d = reader.Document(doc);

            foreach (var syn in d.GetValues(Syns2Index.F_SYN))
            {
                // Add returns false when the synonym was already emitted (as a query
                // word or as an earlier synonym), so each term appears only once.
                if (!seen.Add(syn))
                {
                    continue;
                }

                var tq = new TermQuery(new Term(field, syn));
                if (boost > 0) // else keep normal 1.0
                {
                    tq.Boost = boost;
                }

                target.Add(tq, Occur.SHOULD);
            }
        }

        /// <summary> Remembers the reader that subsequent <see cref="Collect"/> calls load documents from. </summary>
        public override void SetNextReader(IndexReader reader, int docBase)
        {
            this.reader = reader;
        }

        /// <summary> Always true: this collector reports that it accepts documents out of order. </summary>
        public override bool AcceptsDocsOutOfOrder
        {
            get { return true; }
        }

    }

}
194 }