19 using System.Collections.Generic;
 
   22 using Lucene.Net.Index;
 
   23 using Lucene.Net.Store;
 
   24 using Lucene.Net.Support;
 
   25 using Lucene.Net.Util;
 
   33 using Query = Lucene.Net.Search.Query;
 
   37 using Document = Lucene.Net.Documents.Document;
 
   38 using Lucene.Net.Analysis.Tokenattributes;
 
   40 namespace Lucene.Net.Search.Similar
 
  144         public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
 
  157         public const int DEFAULT_MIN_TERM_FREQ = 2;
 
  164         public const int DEFAULT_MIN_DOC_FREQ = 5;
 
  171         public const int DEFAULT_MAX_DOC_FREQ = 
int.MaxValue;
 
  178         public const bool DEFAULT_BOOST = 
false;
 
  183         public static readonly System.String[] DEFAULT_FIELD_NAMES = 
new System.String[] { 
"contents" };
 
  190         public const int DEFAULT_MIN_WORD_LENGTH = 0;
 
  197         public const int DEFAULT_MAX_WORD_LENGTH = 0;
 
  207         public static readonly ISet<string> DEFAULT_STOP_WORDS = null;
 
  210         private ISet<string> stopWords = DEFAULT_STOP_WORDS;
 
  221         public const int DEFAULT_MAX_QUERY_TERMS = 25;
 
  224         private Analyzer analyzer = DEFAULT_ANALYZER;
 
  227         private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
 
  230         private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
 
  235         private int maxDocfreq = DEFAULT_MAX_DOC_FREQ;
 
  238         private bool boost = DEFAULT_BOOST;
 
  241         private System.String[] fieldNames = DEFAULT_FIELD_NAMES;
 
  244         private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
 
  247         private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
 
  250         private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
 
  253         private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
 
  256         private Lucene.Net.Search.Similarity similarity = null;
 
  262         private float boostFactor = 1;
 
  267         public float BoostFactor
 
  269             get { 
return boostFactor; }
 
  270             set { this.boostFactor = value; }
 
  281             this.similarity = sim;
 
  286             get { 
return similarity; }
 
  287             set { this.similarity = value; }
 
  301             get { 
return analyzer; }
 
  302             set { this.analyzer = value; }
 
  309         public int MinTermFreq
 
  311             get { 
return minTermFreq; }
 
  312             set { this.minTermFreq = value; }
 
  319         public int MinDocFreq
 
  321             get { 
return minDocFreq; }
 
  322             set { this.minDocFreq = value; }
 
  330         public int MaxDocFreq
 
  332             get { 
return this.maxDocfreq; }
 
  333             set { this.maxDocfreq = value; }
 
  344         public void SetMaxDocFreqPct(
int maxPercentage)
 
  346             this.maxDocfreq = maxPercentage * ir.NumDocs() / 100;
 
  354             get { 
return boost; }
 
  355             set { this.boost = value; }
 
  364         public System.String[] GetFieldNames()
 
  377         public void SetFieldNames(System.String[] fieldNames)
 
  379             this.fieldNames = fieldNames;
 
  386         public int MinWordLen
 
  388             get { 
return minWordLen; }
 
  389             set { this.minWordLen = value; }
 
  396         public int MaxWordLen
 
  398             get { 
return maxWordLen; }
 
  399             set { this.maxWordLen = value; }
 
  415         public void SetStopWords(ISet<string> stopWords)
 
  417             this.stopWords = stopWords;
 
  423         public ISet<string> GetStopWords()
 
  433         public int MaxQueryTerms
 
  435             get { 
return maxQueryTerms; }
 
  436             set { this.maxQueryTerms = value; }
 
  444         public int MaxNumTokensParsed
 
  446             get { 
return maxNumTokensParsed; }
 
  447             set { maxNumTokensParsed = value; }
 
  455             if (fieldNames == null)
 
  458                 ICollection<string> fields = ir.GetFieldNames(
IndexReader.FieldOption.INDEXED);
 
  459                 fieldNames = fields.ToArray();
 
  462             return CreateQuery(RetrieveTerms(docNum));
 
  470         public Query Like(System.IO.FileInfo f)
 
  472             if (fieldNames == null)
 
  475                 ICollection<string> fields = ir.GetFieldNames(
IndexReader.FieldOption.INDEXED);
 
  476                 fieldNames = fields.ToArray();
 
  479             return Like(
new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
 
  489             return Like(
new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
 
  497         public Query Like(System.IO.Stream is_Renamed)
 
  499             return Like(
new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
 
  507         public Query Like(System.IO.TextReader r)
 
  509             return CreateQuery(RetrieveTerms(r));
 
  520             while (((cur = q.Pop()) != null))
 
  522                 System.Object[] ar = (System.Object[])cur;
 
  529                         bestScore = (float)ar[2];
 
  531                     float myScore = (float)ar[2];
 
  533                     tq.
Boost = boostFactor * myScore / bestScore;
 
  546                 if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
 
  560         private PriorityQueue<object[]> CreateQueue(IDictionary<string,Int> words)
 
  563             int numDocs = ir.NumDocs();
 
  564             FreqQ res = 
new FreqQ(words.Count); 
 
  566             var it = words.Keys.GetEnumerator();
 
  567             while (it.MoveNext())
 
  570                 System.String word = it.Current;
 
  572                 int tf = words[word].x; 
 
  573                 if (minTermFreq > 0 && tf < minTermFreq)
 
  579                 System.String topField = fieldNames[0];
 
  581                 for (
int i = 0; i < fieldNames.Length; i++)
 
  583                     int freq = ir.DocFreq(
new Term(fieldNames[i], word));
 
  584                     topField = (freq > docFreq) ? fieldNames[i] : topField;
 
  585                     docFreq = (freq > docFreq) ? freq : docFreq;
 
  588                 if (minDocFreq > 0 && docFreq < minDocFreq)
 
  593                 if (docFreq > maxDocfreq)
 
  603                 float idf = similarity.Idf(docFreq, numDocs);
 
  604                 float score = tf * idf;
 
  607                 res.InsertWithOverflow(
new System.Object[] { word, topField, score, idf, docFreq, tf });
 
  613         public System.String DescribeParams()
 
  615             System.Text.StringBuilder sb = 
new System.Text.StringBuilder();
 
  616             sb.Append(
"\t" + 
"maxQueryTerms  : " + maxQueryTerms + 
"\n");
 
  617             sb.Append(
"\t" + 
"minWordLen     : " + minWordLen + 
"\n");
 
  618             sb.Append(
"\t" + 
"maxWordLen     : " + maxWordLen + 
"\n");
 
  619             sb.Append(
"\t" + 
"fieldNames     : \"");
 
  620             System.String delim = 
"";
 
  621             for (
int i = 0; i < fieldNames.Length; i++)
 
  623                 System.String fieldName = fieldNames[i];
 
  624                 sb.Append(delim).Append(fieldName);
 
  628             sb.Append(
"\t" + 
"boost          : " + boost + 
"\n");
 
  629             sb.Append(
"\t" + 
"minTermFreq    : " + minTermFreq + 
"\n");
 
  630             sb.Append(
"\t" + 
"minDocFreq     : " + minDocFreq + 
"\n");
 
  631             return sb.ToString();
 
  638         public static void Main(System.String[] a)
 
  640             System.String indexName = 
"localhost_index";
 
  641             System.String fn = 
"c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
 
  642             System.Uri url = null;
 
  643             for (
int i = 0; i < a.Length; i++)
 
  645                 if (a[i].Equals(
"-i"))
 
  649                 else if (a[i].Equals(
"-f"))
 
  653                 else if (a[i].Equals(
"-url"))
 
  655                     url = 
new System.Uri(a[++i]);
 
  659             System.IO.StreamWriter temp_writer;
 
  660             temp_writer = 
new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
 
  661             temp_writer.AutoFlush = 
true;
 
  662             System.IO.StreamWriter o = temp_writer;
 
  665             o.WriteLine(
"Open index " + indexName + 
" which has " + r.
NumDocs() + 
" docs");
 
  669             o.WriteLine(
"Query generation parameters:");
 
  676                 o.WriteLine(
"Parsing URL: " + url);
 
  677                 query = mlt.
Like(url);
 
  681                 o.WriteLine(
"Parsing file: " + fn);
 
  682                 query = mlt.
Like(
new System.IO.FileInfo(fn));
 
  685             o.WriteLine(
"q: " + query);
 
  691             o.WriteLine(
"found: " + len + 
" documents matching");
 
  694             for (
int i = 0; i < System.Math.Min(25, len); i++)
 
  697                 System.String summary = d.Get(
"summary");
 
  698                 o.WriteLine(
"score  : " + scoreDocs[i].Score);
 
  699                 o.WriteLine(
"url    : " + d.Get(
"url"));
 
  700                 o.WriteLine(
"\ttitle  : " + d.Get(
"title"));
 
  702                     o.WriteLine(
"\tsummary: " + d.Get(
"summary"));
 
  712         private PriorityQueue<object[]> RetrieveTerms(
int docNum)
 
  714             IDictionary<string,Int> termFreqMap = 
new HashMap<string,Int>();
 
  715             for (
int i = 0; i < fieldNames.Length; i++)
 
  717                 System.String fieldName = fieldNames[i];
 
  724                     System.String[] text = d.GetValues(fieldName);
 
  727                         for (
int j = 0; j < text.Length; j++)
 
  729                             AddTermFrequencies(
new System.IO.StringReader(text[j]), termFreqMap, fieldName);
 
  735                     AddTermFrequencies(termFreqMap, vector);
 
  739             return CreateQueue(termFreqMap);
 
  747         private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, 
ITermFreqVector vector)
 
  749             System.String[] terms = vector.
GetTerms();
 
  751             for (
int j = 0; j < terms.Length; j++)
 
  753                 System.String term = terms[j];
 
  755                 if (IsNoiseWord(term))
 
  760                 Int cnt = termFreqMap[term];
 
  764                     termFreqMap[term] = cnt;
 
  780         private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string,Int> termFreqMap, System.String fieldName)
 
  782             TokenStream ts = analyzer.TokenStream(fieldName, r);
 
  787             while (ts.IncrementToken()) {
 
  788                 string word = termAtt.
Term;
 
  790                 if(tokenCount>maxNumTokensParsed)
 
  794                 if(IsNoiseWord(word)){
 
  799                 Int cnt = termFreqMap[word];
 
  801                     termFreqMap[word] = 
new Int();
 
  817         private bool IsNoiseWord(System.String term)
 
  819             int len = term.Length;
 
  820             if (minWordLen > 0 && len < minWordLen)
 
  824             if (maxWordLen > 0 && len > maxWordLen)
 
  828             if (stopWords != null && stopWords.Contains(term))
 
  860         public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r)
 
  862             IDictionary<string, Int> words = 
new HashMap<string,Int>();
 
  863             for (
int i = 0; i < fieldNames.Length; i++)
 
  865                 System.String fieldName = fieldNames[i];
 
  866                 AddTermFrequencies(r, words, fieldName);
 
  868             return CreateQueue(words);
 
  872         public System.String[] RetrieveInterestingTerms(
int docNum)
 
  874             List<object> al = 
new List<object>(maxQueryTerms);
 
  875             PriorityQueue<object[]> pq = RetrieveTerms(docNum);
 
  877             int lim = maxQueryTerms; 
 
  879             while (((cur = pq.Pop()) != null) && lim-- > 0)
 
  881                 System.Object[] ar = (System.Object[])cur;
 
  886             return al.Select(x => x.ToString()).ToArray();
 
  901         public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
 
  903             List<object> al = 
new List<object>(maxQueryTerms);
 
  904             PriorityQueue<object[]> pq = RetrieveTerms(r);
 
  906             int lim = maxQueryTerms; 
 
  908             while (((cur = pq.Pop()) != null) && lim-- > 0)
 
  910                 System.Object[] ar = (System.Object[])cur;
 
  915             return al.Select(x => x.ToString()).ToArray();
 
  921             internal FreqQ(
int s)
 
  926             override public bool LessThan(System.Object[] aa, System.Object[] bb)
 
  928                 float fa = (float)aa[2];
 
  929                 float fb = (float)bb[2];
 
  930                 return (
float)fa > (float)fb;