Lucene.Net 3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
MoreLikeThis.cs

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using BooleanClause = Lucene.Net.Search.BooleanClause;
using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
using TermQuery = Lucene.Net.Search.TermQuery;
using BooleanQuery = Lucene.Net.Search.BooleanQuery;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using Query = Lucene.Net.Search.Query;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Document = Lucene.Net.Documents.Document;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Search.Similar
{
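    /// <summary>
    /// Generates a query that will find documents "more like" a given document or
    /// piece of text, built from its most interesting (highest tf*idf) terms.
    /// </summary>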
    public sealed class MoreLikeThis
    {
        /// <summary>Default maximum number of tokens to parse in each example field that is not stored with term vectors.</summary>
        public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

        /// <summary>Default analyzer used to parse source text.</summary>
        public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);

        /// <summary>Ignore terms that occur fewer than this many times in the source document.</summary>
        public const int DEFAULT_MIN_TERM_FREQ = 2;

        /// <summary>Ignore terms that appear in fewer than this many index documents.</summary>
        public const int DEFAULT_MIN_DOC_FREQ = 5;

        /// <summary>Ignore terms that appear in more than this many index documents.</summary>
        public const int DEFAULT_MAX_DOC_FREQ = int.MaxValue;

        /// <summary>Whether generated term queries are boosted by their relative score.</summary>
        public const bool DEFAULT_BOOST = false;

        /// <summary>Default field names from which terms are drawn.</summary>
        public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" };

        /// <summary>Ignore words shorter than this length (0 disables the check).</summary>
        public const int DEFAULT_MIN_WORD_LENGTH = 0;

        /// <summary>Ignore words longer than this length (0 disables the check).</summary>
        public const int DEFAULT_MAX_WORD_LENGTH = 0;

        /// <summary>Default set of stop words (none).</summary>
        public static readonly ISet<string> DEFAULT_STOP_WORDS = null;

        private ISet<string> stopWords = DEFAULT_STOP_WORDS;

        /// <summary>Maximum number of terms that will be included in any generated query.</summary>
        public const int DEFAULT_MAX_QUERY_TERMS = 25;

        private Analyzer analyzer = DEFAULT_ANALYZER;

        private int minTermFreq = DEFAULT_MIN_TERM_FREQ;

        private int minDocFreq = DEFAULT_MIN_DOC_FREQ;

        private int maxDocfreq = DEFAULT_MAX_DOC_FREQ;

        private bool boost = DEFAULT_BOOST;

        private System.String[] fieldNames = DEFAULT_FIELD_NAMES;

        private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;

        private int minWordLen = DEFAULT_MIN_WORD_LENGTH;

        private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;

        private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;

        private Lucene.Net.Search.Similarity similarity = null;

        private IndexReader ir;

        private float boostFactor = 1;

        /// <summary>Gets or sets the boost factor applied to each term query when <see cref="Boost"/> is true.</summary>
        public float BoostFactor
        {
            get { return boostFactor; }
            set { this.boostFactor = value; }
        }

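        /// <summary>Constructs a MoreLikeThis over the given <see cref="IndexReader"/>, using a <see cref="DefaultSimilarity"/> for idf calculations.</summary>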
        public MoreLikeThis(IndexReader ir) : this(ir, new DefaultSimilarity())
        {
        }

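        /// <summary>Constructs a MoreLikeThis over the given <see cref="IndexReader"/> with an explicit <see cref="Similarity"/> implementation.</summary>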
        public MoreLikeThis(IndexReader ir, Lucene.Net.Search.Similarity sim)
        {
            this.ir = ir;
            this.similarity = sim;
        }

        /// <summary>Gets or sets the similarity used for idf calculations.</summary>
        public Similarity Similarity
        {
            get { return similarity; }
            set { this.similarity = value; }
        }

        /// <summary>Gets or sets the analyzer used to parse source text; only needed for fields that are not stored with term vectors.</summary>
        public Analyzer Analyzer
        {
            get { return analyzer; }
            set { this.analyzer = value; }
        }

        /// <summary>Gets or sets the minimum term frequency below which terms in the source document are ignored.</summary>
        public int MinTermFreq
        {
            get { return minTermFreq; }
            set { this.minTermFreq = value; }
        }

        /// <summary>Gets or sets the minimum document frequency below which terms are ignored.</summary>
        public int MinDocFreq
        {
            get { return minDocFreq; }
            set { this.minDocFreq = value; }
        }

        /// <summary>Gets or sets the maximum document frequency above which terms are ignored.</summary>
        public int MaxDocFreq
        {
            get { return this.maxDocfreq; }
            set { this.maxDocfreq = value; }
        }

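        /// <summary>Sets the maximum document frequency as a percentage (0-100) of the number of documents in the index.</summary>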
        public void SetMaxDocFreqPct(int maxPercentage)
        {
            this.maxDocfreq = maxPercentage * ir.NumDocs() / 100;
        }

        /// <summary>Gets or sets whether each generated term query is boosted by its score relative to the best term.</summary>
        public bool Boost
        {
            get { return boost; }
            set { this.boost = value; }
        }

        /// <summary>Returns the field names from which terms are drawn.</summary>
        public System.String[] GetFieldNames()
        {
            return fieldNames;
        }

        /// <summary>Sets the field names from which terms are drawn.</summary>
        public void SetFieldNames(System.String[] fieldNames)
        {
            this.fieldNames = fieldNames;
        }

        /// <summary>Gets or sets the minimum word length below which words are ignored (0 disables the check).</summary>
        public int MinWordLen
        {
            get { return minWordLen; }
            set { this.minWordLen = value; }
        }

        /// <summary>Gets or sets the maximum word length above which words are ignored (0 disables the check).</summary>
        public int MaxWordLen
        {
            get { return maxWordLen; }
            set { this.maxWordLen = value; }
        }

        /// <summary>Sets the set of stop words to be ignored when selecting terms.</summary>
        public void SetStopWords(ISet<string> stopWords)
        {
            this.stopWords = stopWords;
        }

        /// <summary>Returns the current set of stop words.</summary>
        public ISet<string> GetStopWords()
        {
            return stopWords;
        }

        /// <summary>Gets or sets the maximum number of query terms included in a generated query.</summary>
        public int MaxQueryTerms
        {
            get { return maxQueryTerms; }
            set { this.maxQueryTerms = value; }
        }

        /// <summary>Gets or sets the maximum number of tokens parsed from each field when no term vector is stored.</summary>
        public int MaxNumTokensParsed
        {
            get { return maxNumTokensParsed; }
            set { maxNumTokensParsed = value; }
        }

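        /// <summary>Returns a query that will match documents similar to the indexed document with the given document number.</summary>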
        public Query Like(int docNum)
        {
            if (fieldNames == null)
            {
                // gather list of valid fields from lucene
                ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                fieldNames = fields.ToArray();
            }

            return CreateQuery(RetrieveTerms(docNum));
        }

        /// <summary>Returns a query that will match documents similar to the contents of the given file.</summary>
        public Query Like(System.IO.FileInfo f)
        {
            if (fieldNames == null)
            {
                // gather list of valid fields from lucene
                ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                fieldNames = fields.ToArray();
            }

            return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
        }

        /// <summary>Returns a query that will match documents similar to the resource at the given URL.</summary>
        public Query Like(System.Uri u)
        {
            return Like(new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
        }

        /// <summary>Returns a query that will match documents similar to the contents of the given stream.</summary>
        public Query Like(System.IO.Stream is_Renamed)
        {
            return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
        }

        /// <summary>Returns a query that will match documents similar to the text supplied by the given reader.</summary>
        public Query Like(System.IO.TextReader r)
        {
            return CreateQuery(RetrieveTerms(r));
        }

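        /// <summary>
        /// Builds a boolean query of SHOULD term clauses from the highest-scoring terms in the queue,
        /// stopping at maxQueryTerms or when BooleanQuery.TooManyClauses is hit. When boosting is
        /// enabled, each clause is boosted by boostFactor times its score relative to the best term.
        /// </summary>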
        private Query CreateQuery(PriorityQueue<object[]> q)
        {
            BooleanQuery query = new BooleanQuery();
            System.Object cur;
            int qterms = 0;
            float bestScore = 0;

            while (((cur = q.Pop()) != null))
            {
                System.Object[] ar = (System.Object[])cur;
                TermQuery tq = new TermQuery(new Term((System.String)ar[1], (System.String)ar[0]));

                if (boost)
                {
                    if (qterms == 0)
                    {
                        bestScore = (float)ar[2];
                    }
                    float myScore = (float)ar[2];

                    tq.Boost = boostFactor * myScore / bestScore;
                }

                try
                {
                    query.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // too many clauses; return what has been built so far
                    break;
                }

                qterms++;
                if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
                {
                    break;
                }
            }

            return query;
        }

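        /// <summary>
        /// Builds a priority queue of candidate terms, ordered by score. Each term is scored as
        /// tf * idf, where tf is its frequency in the source document and idf is computed from the
        /// largest document frequency found across the configured fields. Terms failing the
        /// minTermFreq, minDocFreq, or maxDocFreq thresholds are skipped.
        /// </summary>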
        private PriorityQueue<object[]> CreateQueue(IDictionary<string, Int> words)
        {
            // have collected all words in doc and their freqs
            int numDocs = ir.NumDocs();
            FreqQ res = new FreqQ(words.Count); // will order words by score

            var it = words.Keys.GetEnumerator();
            while (it.MoveNext())
            {
                // for every word
                System.String word = it.Current;

                int tf = words[word].x; // term freq in the source doc
                if (minTermFreq > 0 && tf < minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                System.String topField = fieldNames[0];
                int docFreq = 0;
                for (int i = 0; i < fieldNames.Length; i++)
                {
                    int freq = ir.DocFreq(new Term(fieldNames[i], word));
                    topField = (freq > docFreq) ? fieldNames[i] : topField;
                    docFreq = (freq > docFreq) ? freq : docFreq;
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq > maxDocfreq)
                {
                    continue; // filter out words that occur in too many docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
            }
            return res;
        }

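        /// <summary>Describes the parameters that control query generation, for diagnostic output.</summary>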
        public System.String DescribeParams()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
            sb.Append("\t" + "minWordLen : " + minWordLen + "\n");
            sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n");
            sb.Append("\t" + "fieldNames : \"");
            System.String delim = "";
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                sb.Append(delim).Append(fieldName);
                delim = ", ";
            }
            sb.Append("\n");
            sb.Append("\t" + "boost : " + boost + "\n");
            sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n");
            sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n");
            return sb.ToString();
        }

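        /// <summary>
        /// Simple command-line test driver: opens the index named by -i, builds a MoreLikeThis query
        /// from the file (-f) or URL (-url) given, and prints the top 25 matching documents.
        /// </summary>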
        [STAThread]
        public static void Main(System.String[] a)
        {
            System.String indexName = "localhost_index";
            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
            System.Uri url = null;
            for (int i = 0; i < a.Length; i++)
            {
                if (a[i].Equals("-i"))
                {
                    indexName = a[++i];
                }
                else if (a[i].Equals("-f"))
                {
                    fn = a[++i];
                }
                else if (a[i].Equals("-url"))
                {
                    url = new System.Uri(a[++i]);
                }
            }

            System.IO.StreamWriter temp_writer;
            temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
            temp_writer.AutoFlush = true;
            System.IO.StreamWriter o = temp_writer;
            FSDirectory dir = FSDirectory.Open(new DirectoryInfo(indexName));
            IndexReader r = IndexReader.Open(dir, true);
            o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

            MoreLikeThis mlt = new MoreLikeThis(r);

            o.WriteLine("Query generation parameters:");
            o.WriteLine(mlt.DescribeParams());
            o.WriteLine();

            Query query = null;
            if (url != null)
            {
                o.WriteLine("Parsing URL: " + url);
                query = mlt.Like(url);
            }
            else if (fn != null)
            {
                o.WriteLine("Parsing file: " + fn);
                query = mlt.Like(new System.IO.FileInfo(fn));
            }

            o.WriteLine("q: " + query);
            o.WriteLine();
            IndexSearcher searcher = new IndexSearcher(dir, true);

            TopDocs hits = searcher.Search(query, null, 25);
            int len = hits.TotalHits;
            o.WriteLine("found: " + len + " documents matching");
            o.WriteLine();
            ScoreDoc[] scoreDocs = hits.ScoreDocs;
            for (int i = 0; i < System.Math.Min(25, len); i++)
            {
                Document d = searcher.Doc(scoreDocs[i].Doc);
                System.String summary = d.Get("summary");
                o.WriteLine("score : " + scoreDocs[i].Score);
                o.WriteLine("url : " + d.Get("url"));
                o.WriteLine("\ttitle : " + d.Get("title"));
                if (summary != null)
                    o.WriteLine("\tsummary: " + d.Get("summary"));
                o.WriteLine();
            }
        }

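        /// <summary>
        /// Collects term frequencies for all configured fields of the given document: from the stored
        /// term vector when one is available, otherwise by re-analyzing the stored field values.
        /// </summary>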
        private PriorityQueue<object[]> RetrieveTerms(int docNum)
        {
            IDictionary<string, Int> termFreqMap = new HashMap<string, Int>();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                ITermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);

                // field does not store term vector info
                if (vector == null)
                {
                    Document d = ir.Document(docNum);
                    System.String[] text = d.GetValues(fieldName);
                    if (text != null)
                    {
                        for (int j = 0; j < text.Length; j++)
                        {
                            AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                        }
                    }
                }
                else
                {
                    AddTermFrequencies(termFreqMap, vector);
                }
            }

            return CreateQueue(termFreqMap);
        }

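        /// <summary>Adds the term frequencies found in the given term vector to the term frequency map.</summary>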
        private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, ITermFreqVector vector)
        {
            System.String[] terms = vector.GetTerms();
            int[] freqs = vector.GetTermFrequencies();
            for (int j = 0; j < terms.Length; j++)
            {
                System.String term = terms[j];

                if (IsNoiseWord(term))
                {
                    continue;
                }
                // increment frequency
                Int cnt = termFreqMap[term];
                if (cnt == null)
                {
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x = freqs[j];
                }
                else
                {
                    cnt.x += freqs[j];
                }
            }
        }
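
        /// <summary>Tokenizes the given reader with the configured analyzer, adding up to maxNumTokensParsed term frequencies to the map.</summary>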
        private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string, Int> termFreqMap, System.String fieldName)
        {
            TokenStream ts = analyzer.TokenStream(fieldName, r);
            int tokenCount = 0;
            // for every token
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

            while (ts.IncrementToken())
            {
                string word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }

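        /// <summary>Determines whether the given term is skipped: too short, too long, or in the stop-word set.</summary>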
        private bool IsNoiseWord(System.String term)
        {
            int len = term.Length;
            if (minWordLen > 0 && len < minWordLen)
            {
                return true;
            }
            if (maxWordLen > 0 && len > maxWordLen)
            {
                return true;
            }
            if (stopWords != null && stopWords.Contains(term))
            {
                return true;
            }
            return false;
        }

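        /// <summary>
        /// Finds candidate terms in the given reader and returns them in a priority queue ordered by score;
        /// each entry is an object array of { word, field, score, idf, docFreq, tf }.
        /// </summary>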
        public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r)
        {
            IDictionary<string, Int> words = new HashMap<string, Int>();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                AddTermFrequencies(r, words, fieldName);
            }
            return CreateQueue(words);
        }

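        /// <summary>Returns the most interesting terms (up to maxQueryTerms) for the indexed document with the given document number.</summary>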
        public System.String[] RetrieveInterestingTerms(int docNum)
        {
            List<object> al = new List<object>(maxQueryTerms);
            PriorityQueue<object[]> pq = RetrieveTerms(docNum);
            System.Object cur;
            int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
            // we just want to return the top words
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[])cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            //System.String[] res = new System.String[al.Count];
            //return al.toArray(res);
            return al.Select(x => x.ToString()).ToArray();
        }

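        /// <summary>Returns the most interesting terms (up to maxQueryTerms) for the text supplied by the given reader.</summary>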
        public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
        {
            List<object> al = new List<object>(maxQueryTerms);
            PriorityQueue<object[]> pq = RetrieveTerms(r);
            System.Object cur;
            int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
            // we just want to return the top words
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[])cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            //System.String[] res = new System.String[al.Count];
            // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
            return al.Select(x => x.ToString()).ToArray();
        }

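        /// <summary>Priority queue over the term arrays; LessThan is inverted so that Pop returns the highest-scoring term first.</summary>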
        private class FreqQ : PriorityQueue<object[]>
        {
            internal FreqQ(int s)
            {
                Initialize(s);
            }

            public override bool LessThan(System.Object[] aa, System.Object[] bb)
            {
                float fa = (float)aa[2];
                float fb = (float)bb[2];
                return fa > fb;
            }
        }

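        /// <summary>Simple mutable int wrapper used as a term-frequency counter; initialized to 1.</summary>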
        private class Int
        {
            internal int x;

            internal Int()
            {
                x = 1;
            }
        }
    }
}