Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
MoreLikeThis.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.IO;
21 using System.Linq;
22 using Lucene.Net.Index;
23 using Lucene.Net.Store;
24 using Lucene.Net.Support;
25 using Lucene.Net.Util;
26 using IndexReader = Lucene.Net.Index.IndexReader;
27 using Term = Lucene.Net.Index.Term;
28 using BooleanClause = Lucene.Net.Search.BooleanClause;
29 using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
30 using TermQuery = Lucene.Net.Search.TermQuery;
31 using BooleanQuery = Lucene.Net.Search.BooleanQuery;
32 using IndexSearcher = Lucene.Net.Search.IndexSearcher;
33 using Query = Lucene.Net.Search.Query;
34 using Analyzer = Lucene.Net.Analysis.Analyzer;
35 using TokenStream = Lucene.Net.Analysis.TokenStream;
36 using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
37 using Document = Lucene.Net.Documents.Document;
38 using Lucene.Net.Analysis.Tokenattributes;
39 
40 namespace Lucene.Net.Search.Similar
41 {
42  /// <summary> Generate "more like this" similarity queries.
43  /// Based on this mail:
44  /// <pre>
45  /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq().
46  /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
47  /// is usually fast enough. But looking up the DocFreq() of every term in the document is
48  /// probably too slow.
49  ///
50  /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much,
51  /// or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
52  /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
53  /// reduce the number of terms under consideration. Another heuristic is that terms with a
54  /// high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
55  /// number of characters, not selecting anything less than, e.g., six or seven characters.
56  /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
57  /// that do a pretty good job of characterizing a document.
58  ///
 59  /// It all depends on what you're trying to do. If you're trying to eke out that last percent
60  /// of precision and recall regardless of computational difficulty so that you can win a TREC
61  /// competition, then the techniques I mention above are useless. But if you're trying to
62  /// provide a "more like this" button on a search results page that does a decent job and has
63  /// good performance, such techniques might be useful.
64  ///
65  /// An efficient, effective "more-like-this" query generator would be a great contribution, if
66  /// anyone's interested. I'd imagine that it would take a Reader or a String (the document's
67  /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
68  /// above. The frequency and length thresholds could be parameters, etc.
69  ///
70  /// Doug
71  /// </pre>
72  ///
73  ///
74  /// <p/>
75  /// <h3>Initial Usage</h3>
76  ///
77  /// This class has lots of options to try to make it efficient and flexible.
78  /// See the body of <see cref="Main"/> below in the source for real code, or
 79  /// if you want pseudo code, the simplest possible usage is as follows. The bold
80  /// fragment is specific to this class.
81  ///
82  /// <pre>
83  ///
84  /// IndexReader ir = ...
85  /// IndexSearcher is = ...
86  /// <b>
87  /// MoreLikeThis mlt = new MoreLikeThis(ir);
88  /// Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
89  /// Query query = mlt.Like( target);
90  /// </b>
91  /// Hits hits = is.Search(query);
92  /// <em>// now the usual iteration thru 'hits' - the only thing to watch for is to make sure
93  /// you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
94  ///
95  /// </pre>
96  ///
97  /// Thus you:
98  /// <ol>
99  /// <li> do your normal, Lucene setup for searching,</li>
100  /// <li> create a MoreLikeThis,</li>
 101  /// <li> get the text of the doc you want to find similarities to</li>
102  /// <li> then call one of the Like() calls to generate a similarity query</li>
103  /// <li> call the searcher to find the similar docs</li>
104  /// </ol>
105  ///
106  /// <h3>More Advanced Usage</h3>
107  ///
108  /// You may want to use <see cref="SetFieldNames"/> so you can examine
109  /// multiple fields (e.g. body and title) for similarity.
110  /// <p/>
111  ///
112  /// Depending on the size of your index and the size and makeup of your documents you
113  /// may want to call the other set methods to control how the similarity queries are
114  /// generated:
115  /// <ul>
116  /// <li> <see cref="MinTermFreq"/> </li>
117  /// <li> <see cref="MinDocFreq"/> </li>
118  /// <li> <see cref="MaxDocFreq"/></li>
119  /// <li> <see cref="SetMaxDocFreqPct(int)"/></li>
120  /// <li> <see cref="MinWordLen"/> </li>
121  /// <li> <see cref="MaxWordLen"/></li>
122  /// <li> <see cref="MaxQueryTerms"/></li>
123  /// <li> <see cref="MaxNumTokensParsed"/></li>
124  /// <li> <see cref="SetStopWords(ISet{string})"/> </li>
125  /// </ul>
126  ///
127  /// <hr/>
128  /// <pre>
129  /// Changes: Mark Harwood 29/02/04
130  /// Some bugfixing, some refactoring, some optimisation.
131  /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
132  /// - bugfix: No significant terms being created for fields with a termvector - because
 133  /// was only counting one occurrence per term/field pair in calculations (ie not including frequency info from TermVector)
134  /// - refactor: moved common code into isNoiseWord()
135  /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
136  /// </pre>
137  /// </summary>
138  public sealed class MoreLikeThis
139  {
140 
141  /// <summary> Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
142  /// <seealso cref="MaxNumTokensParsed">
143  /// </seealso>
144  public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
145 
146 
147  /// <summary> Default analyzer to parse source doc with.</summary>
148  /// <seealso cref="Analyzer">
149  /// </seealso>
150  public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);
151 
152  /// <summary> Ignore terms with less than this frequency in the source doc.</summary>
153  /// <seealso cref="MinTermFreq">
154  /// </seealso>
155  /// <seealso cref="MinTermFreq">
156  /// </seealso>
157  public const int DEFAULT_MIN_TERM_FREQ = 2;
158 
159  /// <summary> Ignore words which do not occur in at least this many docs.</summary>
160  /// <seealso cref="MinDocFreq">
161  /// </seealso>
162  /// <seealso cref="MinDocFreq">
163  /// </seealso>
164  public const int DEFAULT_MIN_DOC_FREQ = 5;
165 
166  /// <summary>
167  /// Ignore words wich occur in more than this many docs
168  /// </summary>
169  /// <seealso cref="MaxDocFreq"/>
170  /// <seealso cref="MaxDocFreq"/>
171  public const int DEFAULT_MAX_DOC_FREQ = int.MaxValue;
172 
173  /// <summary> Boost terms in query based on score.</summary>
174  /// <seealso cref="Boost">
175  /// </seealso>
176  /// <seealso cref="Boost">
177  /// </seealso>
178  public const bool DEFAULT_BOOST = false;
179 
180  /// <summary> Default field names. Null is used to specify that the field names should be looked
181  /// up at runtime from the provided reader.
182  /// </summary>
183  public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" };
184 
185  /// <summary> Ignore words less than this length or if 0 then this has no effect.</summary>
186  /// <seealso cref="MinWordLen">
187  /// </seealso>
188  /// <seealso cref="MinWordLen">
189  /// </seealso>
190  public const int DEFAULT_MIN_WORD_LENGTH = 0;
191 
192  /// <summary> Ignore words greater than this length or if 0 then this has no effect.</summary>
193  /// <seealso cref="MaxWordLen">
194  /// </seealso>
195  /// <seealso cref="MaxWordLen">
196  /// </seealso>
197  public const int DEFAULT_MAX_WORD_LENGTH = 0;
198 
199  /// <summary> Default set of stopwords.
200  /// If null means to allow stop words.
201  ///
202  /// </summary>
203  /// <seealso cref="SetStopWords">
204  /// </seealso>
205  /// <seealso cref="GetStopWords">
206  /// </seealso>
207  public static readonly ISet<string> DEFAULT_STOP_WORDS = null;
208 
209  /// <summary> Current set of stop words.</summary>
210  private ISet<string> stopWords = DEFAULT_STOP_WORDS;
211 
212  /// <summary> Return a Query with no more than this many terms.
213  ///
214  /// </summary>
215  /// <seealso cref="BooleanQuery.MaxClauseCount">
216  /// </seealso>
217  /// <seealso cref="MaxQueryTerms">
218  /// </seealso>
219  /// <seealso cref="MaxQueryTerms">
220  /// </seealso>
221  public const int DEFAULT_MAX_QUERY_TERMS = 25;
222 
223  /// <summary> Analyzer that will be used to parse the doc.</summary>
224  private Analyzer analyzer = DEFAULT_ANALYZER;
225 
226  /// <summary> Ignore words less freqent that this.</summary>
227  private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
228 
229  /// <summary> Ignore words which do not occur in at least this many docs.</summary>
230  private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
231 
232  /// <summary>
233  /// Ignore words which occur in more than this many docs.
234  /// </summary>
235  private int maxDocfreq = DEFAULT_MAX_DOC_FREQ;
236 
237  /// <summary> Should we apply a boost to the Query based on the scores?</summary>
238  private bool boost = DEFAULT_BOOST;
239 
240  /// <summary> Field name we'll analyze.</summary>
241  private System.String[] fieldNames = DEFAULT_FIELD_NAMES;
242 
243  /// <summary> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support</summary>
244  private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
245 
246  /// <summary> Ignore words if less than this len.</summary>
247  private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
248 
249  /// <summary> Ignore words if greater than this len.</summary>
250  private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
251 
252  /// <summary> Don't return a query longer than this.</summary>
253  private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
254 
255  /// <summary> For idf() calculations.</summary>
256  private Lucene.Net.Search.Similarity similarity = null;
257 
258  /// <summary> IndexReader to use</summary>
259  private IndexReader ir;
260 
261  /// <summary> Boost factor to use when boosting the terms </summary>
262  private float boostFactor = 1;
263 
264  /// <summary>
265  /// Gets or sets the boost factor used when boosting terms
266  /// </summary>
267  public float BoostFactor
268  {
269  get { return boostFactor; }
270  set { this.boostFactor = value; }
271  }
272 
273  /// <summary> Constructor requiring an IndexReader.</summary>
274  public MoreLikeThis(IndexReader ir) : this(ir,new DefaultSimilarity())
275  {
276  }
277 
278  public MoreLikeThis(IndexReader ir, Lucene.Net.Search.Similarity sim)
279  {
280  this.ir = ir;
281  this.similarity = sim;
282  }
283 
284  public Similarity Similarity
285  {
286  get { return similarity; }
287  set { this.similarity = value; }
288  }
289 
290  /// <summary> Gets or sets the analyzer used to parse source doc with. The default analyzer
291  /// is the <see cref="DEFAULT_ANALYZER"/>.
292  /// <para />
293  /// An analyzer is not required for generating a query with the
294  /// <see cref="Like(int)"/> method, all other 'like' methods require an analyzer.
295  /// </summary>
296  /// <value> the analyzer that will be used to parse source doc with. </value>
297  /// <seealso cref="DEFAULT_ANALYZER">
298  /// </seealso>
299  public Analyzer Analyzer
300  {
301  get { return analyzer; }
302  set { this.analyzer = value; }
303  }
304 
305  /// <summary>
306  /// Gets or sets the frequency below which terms will be ignored in the source doc. The default
307  /// frequency is the <see cref="DEFAULT_MIN_TERM_FREQ"/>.
308  /// </summary>
309  public int MinTermFreq
310  {
311  get { return minTermFreq; }
312  set { this.minTermFreq = value; }
313  }
314 
315  /// <summary>
316  /// Gets or sets the frequency at which words will be ignored which do not occur in at least this
317  /// many docs. The default frequency is <see cref="DEFAULT_MIN_DOC_FREQ"/>.
318  /// </summary>
319  public int MinDocFreq
320  {
321  get { return minDocFreq; }
322  set { this.minDocFreq = value; }
323  }
324 
325  /// <summary>
326  /// Gets or sets the maximum frequency in which words may still appear.
327  /// Words that appear in more than this many docs will be ignored. The default frequency is
328  /// <see cref="DEFAULT_MAX_DOC_FREQ"/>
329  /// </summary>
330  public int MaxDocFreq
331  {
332  get { return this.maxDocfreq; }
333  set { this.maxDocfreq = value; }
334  }
335 
336  /// <summary>
337  /// Set the maximum percentage in which words may still appear. Words that appear
338  /// in more than this many percent of all docs will be ignored.
339  /// </summary>
340  /// <param name="maxPercentage">
341  /// the maximum percentage of documents (0-100) that a term may appear
342  /// in to be still considered relevant
343  /// </param>
344  public void SetMaxDocFreqPct(int maxPercentage)
345  {
346  this.maxDocfreq = maxPercentage * ir.NumDocs() / 100;
347  }
348 
349  /// <summary> Gets or sets a boolean indicating whether to boost terms in query based
350  /// on "score" or not. The default is <see cref="DEFAULT_BOOST"/>.
351  /// </summary>
352  public bool Boost
353  {
354  get { return boost; }
355  set { this.boost = value; }
356  }
357 
358  /// <summary> Returns the field names that will be used when generating the 'More Like This' query.
359  /// The default field names that will be used is <see cref="DEFAULT_FIELD_NAMES"/>.
360  ///
361  /// </summary>
362  /// <returns> the field names that will be used when generating the 'More Like This' query.
363  /// </returns>
364  public System.String[] GetFieldNames()
365  {
366  return fieldNames;
367  }
368 
369  /// <summary> Sets the field names that will be used when generating the 'More Like This' query.
370  /// Set this to null for the field names to be determined at runtime from the IndexReader
371  /// provided in the constructor.
372  ///
373  /// </summary>
374  /// <param name="fieldNames">the field names that will be used when generating the 'More Like This'
375  /// query.
376  /// </param>
377  public void SetFieldNames(System.String[] fieldNames)
378  {
379  this.fieldNames = fieldNames;
380  }
381 
382  /// <summary>
383  /// Gets or sets the minimum word length below which words will be ignored.
384  /// Set this to 0 for no minimum word length. The default is <see cref="DEFAULT_MIN_WORD_LENGTH"/>.
385  /// </summary>
386  public int MinWordLen
387  {
388  get { return minWordLen; }
389  set { this.minWordLen = value; }
390  }
391 
392  /// <summary>
393  /// Gets or sets the maximum word length above which words will be ignored. Set this to 0 for no
394  /// maximum word length. The default is <see cref="DEFAULT_MAX_WORD_LENGTH"/>.
395  /// </summary>
396  public int MaxWordLen
397  {
398  get { return maxWordLen; }
399  set { this.maxWordLen = value; }
400  }
401 
402  /// <summary> Set the set of stopwords.
403  /// Any word in this set is considered "uninteresting" and ignored.
404  /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
405  /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
406  ///
407  /// </summary>
408  /// <param name="stopWords">set of stopwords, if null it means to allow stop words
409  ///
410  /// </param>
411  /// <seealso cref="Lucene.Net.Analysis.StopFilter.MakeStopSet(string[])">
412  /// </seealso>
413  /// <seealso cref="GetStopWords">
414  /// </seealso>
415  public void SetStopWords(ISet<string> stopWords)
416  {
417  this.stopWords = stopWords;
418  }
419 
420  /// <summary> Get the current stop words being used.</summary>
421  /// <seealso cref="SetStopWords">
422  /// </seealso>
423  public ISet<string> GetStopWords()
424  {
425  return stopWords;
426  }
427 
428 
429  /// <summary>
430  /// Gets or sets the maximum number of query terms that will be included in any generated query.
431  /// The default is <see cref="DEFAULT_MAX_QUERY_TERMS"/>.
432  /// </summary>
433  public int MaxQueryTerms
434  {
435  get { return maxQueryTerms; }
436  set { this.maxQueryTerms = value; }
437  }
438 
439  /// <summary>
440  /// Gets or sets the maximum number of tokens to parse in each example doc
441  /// field that is not stored with TermVector support
442  /// </summary>
443  /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED" />
444  public int MaxNumTokensParsed
445  {
446  get { return maxNumTokensParsed; }
447  set { maxNumTokensParsed = value; }
448  }
449 
450  /// <summary>Return a query that will return docs like the passed lucene document ID.</summary>
451  /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This" query for.</param>
452  /// <returns> a query that will return docs like the passed lucene document ID.</returns>
453  public Query Like(int docNum)
454  {
455  if (fieldNames == null)
456  {
457  // gather list of valid fields from lucene
458  ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
459  fieldNames = fields.ToArray();
460  }
461 
462  return CreateQuery(RetrieveTerms(docNum));
463  }
464 
465  /// <summary> Return a query that will return docs like the passed file.
466  ///
467  /// </summary>
468  /// <returns> a query that will return docs like the passed file.
469  /// </returns>
470  public Query Like(System.IO.FileInfo f)
471  {
472  if (fieldNames == null)
473  {
474  // gather list of valid fields from lucene
475  ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
476  fieldNames = fields.ToArray();
477  }
478 
479  return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
480  }
481 
482  /// <summary> Return a query that will return docs like the passed URL.
483  ///
484  /// </summary>
485  /// <returns> a query that will return docs like the passed URL.
486  /// </returns>
487  public Query Like(System.Uri u)
488  {
489  return Like(new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
490  }
491 
492  /// <summary> Return a query that will return docs like the passed stream.
493  ///
494  /// </summary>
495  /// <returns> a query that will return docs like the passed stream.
496  /// </returns>
497  public Query Like(System.IO.Stream is_Renamed)
498  {
499  return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
500  }
501 
502  /// <summary> Return a query that will return docs like the passed Reader.
503  ///
504  /// </summary>
505  /// <returns> a query that will return docs like the passed Reader.
506  /// </returns>
507  public Query Like(System.IO.TextReader r)
508  {
509  return CreateQuery(RetrieveTerms(r));
510  }
511 
512  /// <summary> Create the More like query from a PriorityQueue</summary>
513  private Query CreateQuery(PriorityQueue<object[]> q)
514  {
515  BooleanQuery query = new BooleanQuery();
516  System.Object cur;
517  int qterms = 0;
518  float bestScore = 0;
519 
520  while (((cur = q.Pop()) != null))
521  {
522  System.Object[] ar = (System.Object[])cur;
523  TermQuery tq = new TermQuery(new Term((System.String)ar[1], (System.String)ar[0]));
524 
525  if (boost)
526  {
527  if (qterms == 0)
528  {
529  bestScore = (float)ar[2];
530  }
531  float myScore = (float)ar[2];
532 
533  tq.Boost = boostFactor * myScore / bestScore;
534  }
535 
536  try
537  {
538  query.Add(tq, Occur.SHOULD);
539  }
540  catch (BooleanQuery.TooManyClauses ignore)
541  {
542  break;
543  }
544 
545  qterms++;
546  if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
547  {
548  break;
549  }
550  }
551 
552  return query;
553  }
554 
555  /// <summary> Create a PriorityQueue from a word->tf map.
556  ///
557  /// </summary>
558  /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
559  /// </param>
560  private PriorityQueue<object[]> CreateQueue(IDictionary<string,Int> words)
561  {
562  // have collected all words in doc and their freqs
563  int numDocs = ir.NumDocs();
564  FreqQ res = new FreqQ(words.Count); // will order words by score
565 
566  var it = words.Keys.GetEnumerator();
567  while (it.MoveNext())
568  {
569  // for every word
570  System.String word = it.Current;
571 
572  int tf = words[word].x; // term freq in the source doc
573  if (minTermFreq > 0 && tf < minTermFreq)
574  {
575  continue; // filter out words that don't occur enough times in the source
576  }
577 
578  // go through all the fields and find the largest document frequency
579  System.String topField = fieldNames[0];
580  int docFreq = 0;
581  for (int i = 0; i < fieldNames.Length; i++)
582  {
583  int freq = ir.DocFreq(new Term(fieldNames[i], word));
584  topField = (freq > docFreq) ? fieldNames[i] : topField;
585  docFreq = (freq > docFreq) ? freq : docFreq;
586  }
587 
588  if (minDocFreq > 0 && docFreq < minDocFreq)
589  {
590  continue; // filter out words that don't occur in enough docs
591  }
592 
593  if (docFreq > maxDocfreq)
594  {
595  continue; // filter out words that occur in too many docs
596  }
597 
598  if (docFreq == 0)
599  {
600  continue; // index update problem?
601  }
602 
603  float idf = similarity.Idf(docFreq, numDocs);
604  float score = tf * idf;
605 
606  // only really need 1st 3 entries, other ones are for troubleshooting
607  res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
608  }
609  return res;
610  }
611 
612  /// <summary> Describe the parameters that control how the "more like this" query is formed.</summary>
613  public System.String DescribeParams()
614  {
615  System.Text.StringBuilder sb = new System.Text.StringBuilder();
616  sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
617  sb.Append("\t" + "minWordLen : " + minWordLen + "\n");
618  sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n");
619  sb.Append("\t" + "fieldNames : \"");
620  System.String delim = "";
621  for (int i = 0; i < fieldNames.Length; i++)
622  {
623  System.String fieldName = fieldNames[i];
624  sb.Append(delim).Append(fieldName);
625  delim = ", ";
626  }
627  sb.Append("\n");
628  sb.Append("\t" + "boost : " + boost + "\n");
629  sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n");
630  sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n");
631  return sb.ToString();
632  }
633 
634  /// <summary> Test driver.
635  /// Pass in "-i INDEX" and then either "-fn FILE" or "-url URL".
636  /// </summary>
637  [STAThread]
638  public static void Main(System.String[] a)
639  {
640  System.String indexName = "localhost_index";
641  System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
642  System.Uri url = null;
643  for (int i = 0; i < a.Length; i++)
644  {
645  if (a[i].Equals("-i"))
646  {
647  indexName = a[++i];
648  }
649  else if (a[i].Equals("-f"))
650  {
651  fn = a[++i];
652  }
653  else if (a[i].Equals("-url"))
654  {
655  url = new System.Uri(a[++i]);
656  }
657  }
658 
659  System.IO.StreamWriter temp_writer;
660  temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
661  temp_writer.AutoFlush = true;
662  System.IO.StreamWriter o = temp_writer;
663  FSDirectory dir = FSDirectory.Open(new DirectoryInfo(indexName));
664  IndexReader r = IndexReader.Open(dir, true);
665  o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");
666 
667  MoreLikeThis mlt = new MoreLikeThis(r);
668 
669  o.WriteLine("Query generation parameters:");
670  o.WriteLine(mlt.DescribeParams());
671  o.WriteLine();
672 
673  Query query = null;
674  if (url != null)
675  {
676  o.WriteLine("Parsing URL: " + url);
677  query = mlt.Like(url);
678  }
679  else if (fn != null)
680  {
681  o.WriteLine("Parsing file: " + fn);
682  query = mlt.Like(new System.IO.FileInfo(fn));
683  }
684 
685  o.WriteLine("q: " + query);
686  o.WriteLine();
687  IndexSearcher searcher = new IndexSearcher(dir, true);
688 
689  TopDocs hits = searcher.Search(query, null, 25);
690  int len = hits.TotalHits;
691  o.WriteLine("found: " + len + " documents matching");
692  o.WriteLine();
693  ScoreDoc[] scoreDocs = hits.ScoreDocs;
694  for (int i = 0; i < System.Math.Min(25, len); i++)
695  {
696  Document d = searcher.Doc(scoreDocs[i].Doc);
697  System.String summary = d.Get("summary");
698  o.WriteLine("score : " + scoreDocs[i].Score);
699  o.WriteLine("url : " + d.Get("url"));
700  o.WriteLine("\ttitle : " + d.Get("title"));
701  if (summary != null)
702  o.WriteLine("\tsummary: " + d.Get("summary"));
703  o.WriteLine();
704  }
705  }
706 
707  /// <summary> Find words for a more-like-this query former.
708  ///
709  /// </summary>
710  /// <param name="docNum">the id of the lucene document from which to find terms
711  /// </param>
712  private PriorityQueue<object[]> RetrieveTerms(int docNum)
713  {
714  IDictionary<string,Int> termFreqMap = new HashMap<string,Int>();
715  for (int i = 0; i < fieldNames.Length; i++)
716  {
717  System.String fieldName = fieldNames[i];
718  ITermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);
719 
720  // field does not store term vector info
721  if (vector == null)
722  {
723  Document d = ir.Document(docNum);
724  System.String[] text = d.GetValues(fieldName);
725  if (text != null)
726  {
727  for (int j = 0; j < text.Length; j++)
728  {
729  AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
730  }
731  }
732  }
733  else
734  {
735  AddTermFrequencies(termFreqMap, vector);
736  }
737  }
738 
739  return CreateQueue(termFreqMap);
740  }
741 
742  /// <summary> Adds terms and frequencies found in vector into the Map termFreqMap</summary>
743  /// <param name="termFreqMap">a Map of terms and their frequencies
744  /// </param>
745  /// <param name="vector">List of terms and their frequencies for a doc/field
746  /// </param>
747  private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, ITermFreqVector vector)
748  {
749  System.String[] terms = vector.GetTerms();
750  int[] freqs = vector.GetTermFrequencies();
751  for (int j = 0; j < terms.Length; j++)
752  {
753  System.String term = terms[j];
754 
755  if (IsNoiseWord(term))
756  {
757  continue;
758  }
759  // increment frequency
760  Int cnt = termFreqMap[term];
761  if (cnt == null)
762  {
763  cnt = new Int();
764  termFreqMap[term] = cnt;
765  cnt.x = freqs[j];
766  }
767  else
768  {
769  cnt.x += freqs[j];
770  }
771  }
772  }
773  /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
774  /// <param name="r">a source of text to be tokenized
775  /// </param>
776  /// <param name="termFreqMap">a Map of terms and their frequencies
777  /// </param>
778  /// <param name="fieldName">Used by analyzer for any special per-field analysis
779  /// </param>
780  private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string,Int> termFreqMap, System.String fieldName)
781  {
782  TokenStream ts = analyzer.TokenStream(fieldName, r);
783  int tokenCount=0;
784  // for every token
785  ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
786 
787  while (ts.IncrementToken()) {
788  string word = termAtt.Term;
789  tokenCount++;
790  if(tokenCount>maxNumTokensParsed)
791  {
792  break;
793  }
794  if(IsNoiseWord(word)){
795  continue;
796  }
797 
798  // increment frequency
799  Int cnt = termFreqMap[word];
800  if (cnt == null) {
801  termFreqMap[word] = new Int();
802  }
803  else {
804  cnt.x++;
805  }
806  }
807  }
808 
809 
810  /// <summary>determines if the passed term is likely to be of interest in "more like" comparisons
811  ///
812  /// </summary>
813  /// <param name="term">The word being considered
814  /// </param>
815  /// <returns> true if should be ignored, false if should be used in further analysis
816  /// </returns>
817  private bool IsNoiseWord(System.String term)
818  {
819  int len = term.Length;
820  if (minWordLen > 0 && len < minWordLen)
821  {
822  return true;
823  }
824  if (maxWordLen > 0 && len > maxWordLen)
825  {
826  return true;
827  }
828  if (stopWords != null && stopWords.Contains(term))
829  {
830  return true;
831  }
832  return false;
833  }
834 
835 
836  /// <summary> Find words for a more-like-this query former.
837  /// The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
838  /// Each array has 6 elements.
839  /// The elements are:
840  /// <ol>
841  /// <li> The word (String)</li>
842  /// <li> The top field that this word comes from (String)</li>
843  /// <li> The score for this word (Float)</li>
844  /// <li> The IDF value (Float)</li>
845  /// <li> The frequency of this word in the index (Integer)</li>
846  /// <li> The frequency of this word in the source document (Integer)</li>
847  /// </ol>
848  /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
849  /// This method is exposed so that you can identify the "interesting words" in a document.
850  /// For an easier method to call see <see cref="RetrieveInterestingTerms(System.IO.TextReader)"/>.
851  ///
852  /// </summary>
853  /// <param name="r">the reader that has the content of the document
854  /// </param>
855  /// <returns> the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
856  ///
857  /// </returns>
858  /// <seealso cref="RetrieveInterestingTerms(System.IO.TextReader)">
859  /// </seealso>
860  public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r)
861  {
862  IDictionary<string, Int> words = new HashMap<string,Int>();
863  for (int i = 0; i < fieldNames.Length; i++)
864  {
865  System.String fieldName = fieldNames[i];
866  AddTermFrequencies(r, words, fieldName);
867  }
868  return CreateQueue(words);
869  }
870 
871 
872  public System.String[] RetrieveInterestingTerms(int docNum)
873  {
874  List<object> al = new List<object>(maxQueryTerms);
875  PriorityQueue<object[]> pq = RetrieveTerms(docNum);
876  System.Object cur;
877  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
878  // we just want to return the top words
879  while (((cur = pq.Pop()) != null) && lim-- > 0)
880  {
881  System.Object[] ar = (System.Object[])cur;
882  al.Add(ar[0]); // the 1st entry is the interesting word
883  }
884  //System.String[] res = new System.String[al.Count];
885  //return al.toArray(res);
886  return al.Select(x => x.ToString()).ToArray();
887  }
888 
889  /// <summary> Convenience routine to make it easy to return the most interesting words in a document.
890  /// More advanced users will call <see cref="RetrieveTerms(System.IO.TextReader)"/> directly.
891  /// </summary>
892  /// <param name="r">the source document
893  /// </param>
894  /// <returns> the most interesting words in the document
895  ///
896  /// </returns>
897  /// <seealso cref="RetrieveTerms(System.IO.TextReader)">
898  /// </seealso>
899  /// <seealso cref="MaxQueryTerms">
900  /// </seealso>
901  public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
902  {
903  List<object> al = new List<object>(maxQueryTerms);
904  PriorityQueue<object[]> pq = RetrieveTerms(r);
905  System.Object cur;
906  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
907  // we just want to return the top words
908  while (((cur = pq.Pop()) != null) && lim-- > 0)
909  {
910  System.Object[] ar = (System.Object[])cur;
911  al.Add(ar[0]); // the 1st entry is the interesting word
912  }
913  //System.String[] res = new System.String[al.Count];
914  // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
915  return al.Select(x => x.ToString()).ToArray();
916  }
917 
918  /// <summary> PriorityQueue that orders words by score.</summary>
919  private class FreqQ : PriorityQueue<object[]>
920  {
921  internal FreqQ(int s)
922  {
923  Initialize(s);
924  }
925 
926  override public bool LessThan(System.Object[] aa, System.Object[] bb)
927  {
928  float fa = (float)aa[2];
929  float fb = (float)bb[2];
930  return (float)fa > (float)fb;
931  }
932  }
933 
934  /// <summary> Use for frequencies and to avoid renewing Integers.</summary>
935  private class Int
936  {
937  internal int x;
938 
939  internal Int()
940  {
941  x = 1;
942  }
943  }
944  }
945 }