using System.Collections.Generic;
using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Document = Lucene.Net.Documents.Document;
using Query = Lucene.Net.Search.Query;
namespace Lucene.Net.Search.Similar
{
    /// <summary>
    /// Generates a query that finds documents "more like" a given document, file, URL, stream
    /// or free-text sample, by selecting the most interesting (highest tf*idf) terms.
    /// </summary>
    public sealed class MoreLikeThis
    {
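        // Usage sketch (illustrative only; "reader" and "searcher" are assumed to be an open
        // IndexReader / IndexSearcher over the same index, and 42 an existing document number):
        //
        //   MoreLikeThis mlt = new MoreLikeThis(reader, new DefaultSimilarity());
        //   mlt.SetFieldNames(new[] { "contents" });
        //   mlt.MinTermFreq = 1;
        //   mlt.MinDocFreq = 1;
        //   Query query = mlt.Like(42);                 // find docs similar to doc 42
        //   TopDocs hits = searcher.Search(query, 10);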
        /// <summary>Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
        public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

        /// <summary>Ignore terms with less than this frequency in the source doc.</summary>
        public const int DEFAULT_MIN_TERM_FREQ = 2;

        /// <summary>Ignore words which do not occur in at least this many docs.</summary>
        public const int DEFAULT_MIN_DOC_FREQ = 5;

        /// <summary>Ignore words which occur in more than this many docs.</summary>
        public const int DEFAULT_MAX_DOC_FREQ = int.MaxValue;

        /// <summary>Boost terms in query based on their score (off by default).</summary>
        public const bool DEFAULT_BOOST = false;

        /// <summary>Default field names. Null is used to specify that the field names should be looked up at runtime from the provided reader.</summary>
        public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" };

        /// <summary>Ignore words less than this length, or if 0 then this has no effect.</summary>
        public const int DEFAULT_MIN_WORD_LENGTH = 0;

        /// <summary>Ignore words greater than this length, or if 0 then this has no effect.</summary>
        public const int DEFAULT_MAX_WORD_LENGTH = 0;

        /// <summary>Default set of stopwords. If null, stop words are not filtered out.</summary>
        public static readonly ISet<string> DEFAULT_STOP_WORDS = null;

        /// <summary>Current set of stop words.</summary>
        private ISet<string> stopWords = DEFAULT_STOP_WORDS;

        /// <summary>Return a Query with no more than this many terms.</summary>
        public const int DEFAULT_MAX_QUERY_TERMS = 25;
        private Analyzer analyzer = DEFAULT_ANALYZER;
        private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
        private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
        private int maxDocfreq = DEFAULT_MAX_DOC_FREQ;
        private bool boost = DEFAULT_BOOST;
        private System.String[] fieldNames = DEFAULT_FIELD_NAMES;
        private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
        private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
        private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
        private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
        /// <summary>For idf() calculations.</summary>
        private Lucene.Net.Search.Similarity similarity = null;

        /// <summary>IndexReader to use.</summary>
        private readonly IndexReader ir;

        /// <summary>Boost factor to use when boosting the terms.</summary>
        private float boostFactor = 1;
        public float BoostFactor
        {
            get { return boostFactor; }
            set { this.boostFactor = value; }
        }
        public MoreLikeThis(IndexReader ir, Lucene.Net.Search.Similarity sim)
        {
            this.ir = ir;
            this.similarity = sim;
        }

        /// <summary>Gets or sets the Similarity used for idf() calculations.</summary>
        public Similarity Similarity
        {
            get { return similarity; }
            set { this.similarity = value; }
        }
        /// <summary>Gets or sets the analyzer used to parse source documents and free text.</summary>
        public Analyzer Analyzer
        {
            get { return analyzer; }
            set { this.analyzer = value; }
        }
        public int MinTermFreq
        {
            get { return minTermFreq; }
            set { this.minTermFreq = value; }
        }
        public int MinDocFreq
        {
            get { return minDocFreq; }
            set { this.minDocFreq = value; }
        }
        public int MaxDocFreq
        {
            get { return this.maxDocfreq; }
            set { this.maxDocfreq = value; }
        }
        /// <summary>Set the maximum document frequency as a percentage (0-100) of the number of docs
        /// in the index; terms that appear in more than that fraction of documents are ignored.</summary>
        public void SetMaxDocFreqPct(int maxPercentage)
        {
            this.maxDocfreq = maxPercentage * ir.NumDocs() / 100;
        }
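        // Worked example (values assumed): for an index with 1,000,000 documents,
        // SetMaxDocFreqPct(10) sets the cutoff to 10 * 1,000,000 / 100 = 100,000, so any term
        // appearing in more than 100,000 documents is skipped as too common to be interesting.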
        /// <summary>Gets or sets whether to boost terms in the query based on their score.</summary>
        public bool Boost
        {
            get { return boost; }
            set { this.boost = value; }
        }
        /// <summary>Returns the field names that will be used when generating the "more like this" query.</summary>
        public System.String[] GetFieldNames()
        {
            return fieldNames;
        }
        public void SetFieldNames(System.String[] fieldNames)
        {
            this.fieldNames = fieldNames;
        }
        public int MinWordLen
        {
            get { return minWordLen; }
            set { this.minWordLen = value; }
        }
        public int MaxWordLen
        {
            get { return maxWordLen; }
            set { this.maxWordLen = value; }
        }
        /// <summary>Set the set of stopwords; any word in this set is considered "uninteresting" and ignored.</summary>
        public void SetStopWords(ISet<string> stopWords)
        {
            this.stopWords = stopWords;
        }
        /// <summary>Get the current stopwords being used.</summary>
        public ISet<string> GetStopWords()
        {
            return stopWords;
        }
        public int MaxQueryTerms
        {
            get { return maxQueryTerms; }
            set { this.maxQueryTerms = value; }
        }
        public int MaxNumTokensParsed
        {
            get { return maxNumTokensParsed; }
            set { maxNumTokensParsed = value; }
        }
        /// <summary>Return a query that will return docs like the passed lucene document ID.</summary>
        public Query Like(int docNum)
        {
            if (fieldNames == null)
            {
                // Gather the list of indexed fields from the reader.
                ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                fieldNames = fields.ToArray();
            }

            return CreateQuery(RetrieveTerms(docNum));
        }
        public Query Like(System.IO.FileInfo f)
        {
            if (fieldNames == null)
            {
                ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
                fieldNames = fields.ToArray();
            }

            return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
        }
        /// <summary>Return a query that will return docs like the content fetched from the passed URL.</summary>
        public Query Like(System.Uri u)
        {
            return Like(new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
        }
        public Query Like(System.IO.Stream is_Renamed)
        {
            return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
        }
        public Query Like(System.IO.TextReader r)
        {
            return CreateQuery(RetrieveTerms(r));
        }
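        // Sketch (names assumed): building a "more like this" query from free text rather than
        // an already-indexed document:
        //
        //   Query q = mlt.Like(new System.IO.StringReader("open source full-text search library"));
        //
        // The sample text is tokenized with the configured Analyzer for each configured field name.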
        /// <summary>Create the "more like this" query from a PriorityQueue of term arrays
        /// (each entry is { word, topField, score, idf, docFreq, tf }).</summary>
        private Query CreateQuery(PriorityQueue<object[]> q)
        {
            BooleanQuery query = new BooleanQuery();
            System.Object cur;
            int qterms = 0;
            float bestScore = 0;

            while ((cur = q.Pop()) != null)
            {
                System.Object[] ar = (System.Object[])cur;
                TermQuery tq = new TermQuery(new Term((System.String)ar[1], (System.String)ar[0]));

                if (boost)
                {
                    if (qterms == 0)
                        bestScore = (float)ar[2];
                    float myScore = (float)ar[2];

                    // Boost each term relative to the best-scoring (first) term.
                    tq.Boost = boostFactor * myScore / bestScore;
                }

                try { query.Add(tq, Occur.SHOULD); }
                catch (BooleanQuery.TooManyClauses) { break; }

                qterms++;
                if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
                    break;
            }
            return query;
        }
        /// <summary>Create a PriorityQueue from a word -> tf map, scoring each word by tf * idf.</summary>
        private PriorityQueue<object[]> CreateQueue(IDictionary<string, Int> words)
        {
            // Have collected all words in the doc and their freqs.
            int numDocs = ir.NumDocs();
            FreqQ res = new FreqQ(words.Count); // will order words by score

            var it = words.Keys.GetEnumerator();
            while (it.MoveNext())
            {
                // For every word...
                System.String word = it.Current;

                int tf = words[word].x; // term freq in the source doc
                if (minTermFreq > 0 && tf < minTermFreq)
                    continue; // filter out words that don't occur enough times in the source

                // Go through all the fields and find the largest document frequency.
                System.String topField = fieldNames[0];
                int docFreq = 0;
                for (int i = 0; i < fieldNames.Length; i++)
                {
                    int freq = ir.DocFreq(new Term(fieldNames[i], word));
                    topField = (freq > docFreq) ? fieldNames[i] : topField;
                    docFreq = (freq > docFreq) ? freq : docFreq;
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                    continue; // filter out words that don't occur in enough docs
                if (docFreq > maxDocfreq)
                    continue; // filter out words that occur in too many docs
                if (docFreq == 0)
                    continue; // index update problem?

                float idf = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // Only the first 3 entries are really needed; the rest help with troubleshooting.
                res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
            }
            return res;
        }
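        // Scoring note: each surviving term is ranked by score = tf * idf, where tf is the term's
        // frequency in the source document and idf comes from the configured Similarity.
        // Illustrative numbers (assumed): with tf = 3 and similarity.Idf(docFreq, numDocs) = 2.0f,
        // the stored score is 3 * 2.0f = 6.0f; terms pop from the FreqQ queue in descending score
        // order, so the best terms are added to the generated query first.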
        /// <summary>Describe the parameters that control how "more like this" queries are formed.</summary>
        public System.String DescribeParams()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
            sb.Append("\t" + "minWordLen : " + minWordLen + "\n");
            sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n");
            sb.Append("\t" + "fieldNames : \"");
            System.String delim = "";
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                sb.Append(delim).Append(fieldName);
                delim = ", ";
            }
            sb.Append("\"\n");
            sb.Append("\t" + "boost : " + boost + "\n");
            sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n");
            sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n");
            return sb.ToString();
        }
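        // With the defaults defined above, DescribeParams() returns roughly:
        //
        //   maxQueryTerms : 25
        //   minWordLen : 0
        //   maxWordLen : 0
        //   fieldNames : "contents"
        //   boost : False
        //   minTermFreq : 2
        //   minDocFreq : 5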
        /// <summary>Test driver.</summary>
        public static void Main(System.String[] a)
        {
            System.String indexName = "localhost_index";
            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
            System.Uri url = null;
            for (int i = 0; i < a.Length; i++)
            {
                if (a[i].Equals("-i"))
                {
                    indexName = a[++i];
                }
                else if (a[i].Equals("-f"))
                {
                    fn = a[++i];
                }
                else if (a[i].Equals("-url"))
                {
                    url = new System.Uri(a[++i]);
                }
            }

            System.IO.StreamWriter temp_writer;
            temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
            temp_writer.AutoFlush = true;
            System.IO.StreamWriter o = temp_writer;
            // Open the named index read-only.
            IndexReader r = IndexReader.Open(FSDirectory.Open(new System.IO.DirectoryInfo(indexName)), true);
            o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

            MoreLikeThis mlt = new MoreLikeThis(r, new DefaultSimilarity());

            o.WriteLine("Query generation parameters:");
            o.WriteLine(mlt.DescribeParams());
            o.WriteLine();

            Query query = null;
            if (url != null)
            {
                o.WriteLine("Parsing URL: " + url);
                query = mlt.Like(url);
            }
            else if (fn != null)
            {
                o.WriteLine("Parsing file: " + fn);
                query = mlt.Like(new System.IO.FileInfo(fn));
            }

            o.WriteLine("q: " + query);
            o.WriteLine();

            IndexSearcher searcher = new IndexSearcher(r);
            TopDocs hits = searcher.Search(query, 25);
            int len = hits.TotalHits;
            o.WriteLine("found: " + len + " documents matching");
            o.WriteLine();
            ScoreDoc[] scoreDocs = hits.ScoreDocs;
            for (int i = 0; i < System.Math.Min(25, len); i++)
            {
                Document d = searcher.Doc(scoreDocs[i].Doc);
                System.String summary = d.Get("summary");
                o.WriteLine("score  : " + scoreDocs[i].Score);
                o.WriteLine("url    : " + d.Get("url"));
                o.WriteLine("\ttitle  : " + d.Get("title"));
                if (summary != null)
                    o.WriteLine("\tsummary: " + d.Get("summary"));
                o.WriteLine();
            }
        }
        /// <summary>Find words for a more-like-this query former.</summary>
        /// <param name="docNum">the id of the lucene document from which to find terms</param>
        private PriorityQueue<object[]> RetrieveTerms(int docNum)
        {
            IDictionary<string, Int> termFreqMap = new HashMap<string, Int>();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                ITermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);

                if (vector == null)
                {
                    // Field does not store term vector info: re-analyze the stored field values.
                    Document d = ir.Document(docNum);
                    System.String[] text = d.GetValues(fieldName);
                    if (text != null)
                    {
                        for (int j = 0; j < text.Length; j++)
                        {
                            AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                        }
                    }
                }
                else
                {
                    AddTermFrequencies(termFreqMap, vector);
                }
            }

            return CreateQueue(termFreqMap);
        }
        /// <summary>Adds terms and frequencies found in vector into termFreqMap.</summary>
        private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, ITermFreqVector vector)
        {
            System.String[] terms = vector.GetTerms();
            int[] freqs = vector.GetTermFrequencies();
            for (int j = 0; j < terms.Length; j++)
            {
                System.String term = terms[j];
                if (IsNoiseWord(term))
                    continue;

                // Increment frequency for this term (HashMap returns null for missing keys).
                Int cnt = termFreqMap[term];
                if (cnt == null)
                {
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x = freqs[j];
                }
                else
                    cnt.x += freqs[j];
            }
        }
        /// <summary>Adds term frequencies found by tokenizing the reader's text into termFreqMap.</summary>
        private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string, Int> termFreqMap, System.String fieldName)
        {
            TokenStream ts = analyzer.TokenStream(fieldName, r);
            int tokenCount = 0;
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
            while (ts.IncrementToken())
            {
                string word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                    break;
                if (IsNoiseWord(word))
                    continue;

                // Increment frequency (HashMap returns null for missing keys).
                Int cnt = termFreqMap[word];
                if (cnt == null)
                    termFreqMap[word] = new Int();
                else
                    cnt.x++;
            }
        }
        /// <summary>Determines if the passed term is likely to be of no interest in "more like" comparisons.</summary>
        private bool IsNoiseWord(System.String term)
        {
            int len = term.Length;
            if (minWordLen > 0 && len < minWordLen)
                return true;
            if (maxWordLen > 0 && len > maxWordLen)
                return true;
            if (stopWords != null && stopWords.Contains(term))
                return true;
            return false;
        }
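        // Sketch (values assumed): a typical noise-filtering setup for this class:
        //
        //   mlt.MinWordLen = 3;                                   // skip very short tokens
        //   mlt.MaxWordLen = 30;                                  // skip pathologically long tokens
        //   mlt.SetStopWords(new HashSet<string> { "the", "a", "and" });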
        public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r)
        {
            IDictionary<string, Int> words = new HashMap<string, Int>();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                AddTermFrequencies(r, words, fieldName);
            }
            return CreateQueue(words);
        }
        /// <summary>Convenience routine returning the most interesting words in a stored document.</summary>
        public System.String[] RetrieveInterestingTerms(int docNum)
        {
            List<object> al = new List<object>(maxQueryTerms);
            PriorityQueue<object[]> pq = RetrieveTerms(docNum);
            System.Object cur;
            int lim = maxQueryTerms; // RetrieveTerms returns all words; only keep the best ones.
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[])cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            return al.Select(x => x.ToString()).ToArray();
        }
        /// <summary>Convenience routine returning the most interesting words in a free-text sample.</summary>
        public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
        {
            List<object> al = new List<object>(maxQueryTerms);
            PriorityQueue<object[]> pq = RetrieveTerms(r);
            System.Object cur;
            int lim = maxQueryTerms; // only keep the best terms
            while (((cur = pq.Pop()) != null) && lim-- > 0)
            {
                System.Object[] ar = (System.Object[])cur;
                al.Add(ar[0]); // the 1st entry is the interesting word
            }
            return al.Select(x => x.ToString()).ToArray();
        }
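        // Sketch (names assumed): listing the terms that would drive the generated query for a
        // free-text sample, without building a Query object:
        //
        //   string[] interesting = mlt.RetrieveInterestingTerms(new System.IO.StringReader(sampleText));
        //   foreach (string term in interesting)
        //       System.Console.WriteLine(term);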
        /// <summary>PriorityQueue that orders words by score.</summary>
        private class FreqQ : PriorityQueue<object[]>
        {
            internal FreqQ(int s)
            {
                Initialize(s);
            }

            public override bool LessThan(System.Object[] aa, System.Object[] bb)
            {
                float fa = (float)aa[2];
                float fb = (float)bb[2];
                return fa > fb;
            }
        }