Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
Similarity.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using Lucene.Net.Documents;
21 using FieldInvertState = Lucene.Net.Index.FieldInvertState;
22 using Term = Lucene.Net.Index.Term;
23 using SmallFloat = Lucene.Net.Util.SmallFloat;
24 using IDFExplanation = Lucene.Net.Search.Explanation.IDFExplanation;
25 
26 namespace Lucene.Net.Search
27 {
28 
29  /// <summary>Expert: Scoring API.
30  /// <p/>Subclasses implement search scoring.
31  ///
32  /// <p/>The score of query <c>q</c> for document <c>d</c> correlates to the
33  /// cosine-distance or dot-product between document and query vectors in a
34  /// <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
35  /// Vector Space Model (VSM) of Information Retrieval</a>.
36  /// A document whose vector is closer to the query vector in that model is scored higher.
37  ///
38  /// The score is computed as follows:
39  ///
40  /// <p/>
41  /// <table cellpadding="1" cellspacing="0" border="1" align="center">
42  /// <tr><td>
43  /// <table cellpadding="1" cellspacing="0" border="0" align="center">
44  /// <tr>
45  /// <td valign="middle" align="right" rowspan="1">
46  /// score(q,d) &#160; = &#160;
47  /// <A HREF="#formula_coord">coord(q,d)</A> &#160;&#183;&#160;
48  /// <A HREF="#formula_queryNorm">queryNorm(q)</A> &#160;&#183;&#160;
49  /// </td>
50  /// <td valign="bottom" align="center" rowspan="1">
51  /// <big><big><big>&#8721;</big></big></big>
52  /// </td>
53  /// <td valign="middle" align="right" rowspan="1">
54  /// <big><big>(</big></big>
55  /// <A HREF="#formula_tf">tf(t in d)</A> &#160;&#183;&#160;
56  /// <A HREF="#formula_idf">idf(t)</A><sup>2</sup> &#160;&#183;&#160;
57  /// <A HREF="#formula_termBoost">t.Boost</A>&#160;&#183;&#160;
58  /// <A HREF="#formula_norm">norm(t,d)</A>
59  /// <big><big>)</big></big>
60  /// </td>
61  /// </tr>
62  /// <tr valign="top">
63  /// <td></td>
64  /// <td align="center"><small>t in q</small></td>
65  /// <td></td>
66  /// </tr>
67  /// </table>
68  /// </td></tr>
69  /// </table>
70  ///
71  /// <p/> where
72  /// <list type="bullet">
73  /// <item>
74  /// <A NAME="formula_tf"></A>
75  /// <b>tf(t in d)</b>
76  /// correlates to the term's <i>frequency</i>,
77  /// defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
78  /// Documents that have more occurrences of a given term receive a higher score.
79  /// The default computation for <i>tf(t in d)</i> in
80  /// <see cref="Lucene.Net.Search.DefaultSimilarity.Tf(float)">DefaultSimilarity</see> is:
81  ///
82  /// <br/>&#160;<br/>
83  /// <table cellpadding="2" cellspacing="2" border="0" align="center">
84  /// <tr>
85  /// <td valign="middle" align="right" rowspan="1">
86  /// <see cref="Lucene.Net.Search.DefaultSimilarity.Tf(float)">tf(t in d)</see> &#160; = &#160;
87  /// </td>
88  /// <td valign="top" align="center" rowspan="1">
89  /// frequency<sup><big>&#189;</big></sup>
90  /// </td>
91  /// </tr>
92  /// </table>
93  /// <br/>&#160;<br/>
94  /// </item>
95  ///
96  /// <item>
97  /// <A NAME="formula_idf"></A>
98  /// <b>idf(t)</b> stands for Inverse Document Frequency. This value
99  /// correlates to the inverse of <i>docFreq</i>
100  /// (the number of documents in which the term <i>t</i> appears).
101  /// This means rarer terms give higher contribution to the total score.
102  /// The default computation for <i>idf(t)</i> in
103  /// <see cref="Lucene.Net.Search.DefaultSimilarity.Idf(int, int)">DefaultSimilarity</see> is:
104  ///
105  /// <br/>&#160;<br/>
106  /// <table cellpadding="2" cellspacing="2" border="0" align="center">
107  /// <tr>
108  /// <td valign="middle" align="right">
109  /// <see cref="Lucene.Net.Search.DefaultSimilarity.Idf(int, int)">idf(t)</see>&#160; = &#160;
110  /// </td>
111  /// <td valign="middle" align="center">
112  /// 1 + log <big>(</big>
113  /// </td>
114  /// <td valign="middle" align="center">
115  /// <table>
116  /// <tr><td align="center"><small>numDocs</small></td></tr>
117  /// <tr><td align="center">&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;</td></tr>
118  /// <tr><td align="center"><small>docFreq+1</small></td></tr>
119  /// </table>
120  /// </td>
121  /// <td valign="middle" align="center">
122  /// <big>)</big>
123  /// </td>
124  /// </tr>
125  /// </table>
126  /// <br/>&#160;<br/>
127  /// </item>
128  ///
129  /// <item>
130  /// <A NAME="formula_coord"></A>
131  /// <b>coord(q,d)</b>
132  /// is a score factor based on how many of the query terms are found in the specified document.
133  /// Typically, a document that contains more of the query's terms will receive a higher score
134  /// than another document with fewer query terms.
135  /// This is a search time factor computed in
136  /// <see cref="Coord(int, int)">coord(q,d)</see>
137  /// by the Similarity in effect at search time.
138  /// <br/>&#160;<br/>
139  /// </item>
140  ///
141  /// <item><b>
142  /// <A NAME="formula_queryNorm"></A>
143  /// queryNorm(q)
144  /// </b>
145  /// is a normalizing factor used to make scores between queries comparable.
146  /// This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
147  /// but rather just attempts to make scores from different queries (or even different indexes) comparable.
148  /// This is a search time factor computed by the Similarity in effect at search time.
149  ///
150  /// The default computation in
151  /// <see cref="Lucene.Net.Search.DefaultSimilarity.QueryNorm(float)">DefaultSimilarity</see>
152  /// is:
153  /// <br/>&#160;<br/>
154  /// <table cellpadding="1" cellspacing="0" border="0" align="center">
155  /// <tr>
156  /// <td valign="middle" align="right" rowspan="1">
157  /// queryNorm(q) &#160; = &#160;
158  /// <see cref="Lucene.Net.Search.DefaultSimilarity.QueryNorm(float)">queryNorm(sumOfSquaredWeights)</see>
159  /// &#160; = &#160;
160  /// </td>
161  /// <td valign="middle" align="center" rowspan="1">
162  /// <table>
163  /// <tr><td align="center"><big>1</big></td></tr>
164  /// <tr><td align="center"><big>
165  /// &#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;&#8211;
166  /// </big></td></tr>
167  /// <tr><td align="center">sumOfSquaredWeights<sup><big>&#189;</big></sup></td></tr>
168  /// </table>
169  /// </td>
170  /// </tr>
171  /// </table>
172  /// <br/>&#160;<br/>
173  ///
174  /// The sum of squared weights (of the query terms) is
175  /// computed by the query <see cref="Lucene.Net.Search.Weight" /> object.
176  /// For example, a <see cref="Lucene.Net.Search.BooleanQuery">boolean query</see>
177  /// computes this value as:
178  ///
179  /// <br/>&#160;<br/>
180  /// <table cellpadding="1" cellspacing="0" border="0" align="center">
181  /// <tr>
182  /// <td valign="middle" align="right" rowspan="1">
183  /// <see cref="Lucene.Net.Search.Weight.GetSumOfSquaredWeights">GetSumOfSquaredWeights</see> &#160; = &#160;
184  /// <see cref="Lucene.Net.Search.Query.Boost">q.Boost</see> <sup><big>2</big></sup>
185  /// &#160;&#183;&#160;
186  /// </td>
187  /// <td valign="bottom" align="center" rowspan="1">
188  /// <big><big><big>&#8721;</big></big></big>
189  /// </td>
190  /// <td valign="middle" align="right" rowspan="1">
191  /// <big><big>(</big></big>
192  /// <A HREF="#formula_idf">idf(t)</A> &#160;&#183;&#160;
193  /// <A HREF="#formula_termBoost">t.Boost</A>
194  /// <big><big>) <sup>2</sup> </big></big>
195  /// </td>
196  /// </tr>
197  /// <tr valign="top">
198  /// <td></td>
199  /// <td align="center"><small>t in q</small></td>
200  /// <td></td>
201  /// </tr>
202  /// </table>
203  /// <br/>&#160;<br/>
204  ///
205  /// </item>
206  ///
207  /// <item>
208  /// <A NAME="formula_termBoost"></A>
209  /// <b>t.Boost</b>
210  /// is a search time boost of term <i>t</i> in the query <i>q</i> as
211  /// specified in the query text
212  /// (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
213  /// or as set by application calls to
214  /// <see cref="Lucene.Net.Search.Query.Boost" />.
215  /// Notice that there is really no direct API for accessing a boost of one term in a multi term query,
216  /// but rather multi terms are represented in a query as multi
217  /// <see cref="Lucene.Net.Search.TermQuery">TermQuery</see> objects,
218  /// and so the boost of a term in the query is accessible by calling the sub-query
219  /// <see cref="Lucene.Net.Search.Query.Boost" />.
220  /// <br/>&#160;<br/>
221  /// </item>
222  ///
223  /// <item>
224  /// <A NAME="formula_norm"></A>
225  /// <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
226  ///
227  /// <list type="bullet">
228  /// <item><b>Document boost</b> - set by calling
229  /// <see cref="Lucene.Net.Documents.Document.Boost">doc.Boost</see>
230  /// before adding the document to the index.
231  /// </item>
232  /// <item><b>Field boost</b> - set by calling
233  /// <see cref="IFieldable.Boost">field.Boost</see>
234  /// before adding the field to a document.
235  /// </item>
236  /// <item><see cref="LengthNorm(String, int)">LengthNorm(field)</see> - computed
237  /// when the document is added to the index in accordance with the number of tokens
238  /// of this field in the document, so that shorter fields contribute more to the score.
239  /// LengthNorm is computed by the Similarity class in effect at indexing.
240  /// </item>
241  /// </list>
242  ///
243  /// <p/>
244  /// When a document is added to the index, all the above factors are multiplied.
245  /// If the document has multiple fields with the same name, all their boosts are multiplied together:
246  ///
247  /// <br/>&#160;<br/>
248  /// <table cellpadding="1" cellspacing="0" border="0" align="center">
249  /// <tr>
250  /// <td valign="middle" align="right" rowspan="1">
251  /// norm(t,d) &#160; = &#160;
252  /// <see cref="Lucene.Net.Documents.Document.Boost">doc.Boost</see>
253  /// &#160;&#183;&#160;
254  /// <see cref="LengthNorm(String, int)">LengthNorm(field)</see>
255  /// &#160;&#183;&#160;
256  /// </td>
257  /// <td valign="bottom" align="center" rowspan="1">
258  /// <big><big><big>&#8719;</big></big></big>
259  /// </td>
260  /// <td valign="middle" align="right" rowspan="1">
261  /// <see cref="IFieldable.Boost">field.Boost</see>
262  /// </td>
263  /// </tr>
264  /// <tr valign="top">
265  /// <td></td>
266  /// <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
267  /// <td></td>
268  /// </tr>
269  /// </table>
270  /// <br/>&#160;<br/>
271  /// However the resulted <i>norm</i> value is <see cref="EncodeNorm(float)">encoded</see> as a single byte
272  /// before being stored.
273  /// At search time, the norm byte value is read from the index
274  /// <see cref="Lucene.Net.Store.Directory">directory</see> and
275  /// <see cref="DecodeNorm(byte)">decoded</see> back to a float <i>norm</i> value.
276  /// This encoding/decoding, while reducing index size, comes with the price of
277  /// precision loss - it is not guaranteed that decode(encode(x)) = x.
278  /// For instance, decode(encode(0.89)) = 0.75.
279  /// Also notice that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
280  /// using a different <see cref="Similarity" /> for search.
281  /// <br/>&#160;<br/>
282  /// </item>
283  /// </list>
284  ///
285  /// </summary>
286  /// <seealso cref="Default">
287  /// </seealso>
288  /// <seealso cref="Lucene.Net.Index.IndexWriter.Similarity">
289  /// </seealso>
290  /// <seealso cref="Searcher.Similarity">
291  /// </seealso>
292  [Serializable]
293  public abstract class Similarity
294  {
295  protected Similarity()
296  {
297  InitBlock();
298  }
299  [Serializable]
300  private class AnonymousClassIDFExplanation1:IDFExplanation
301  {
302  public AnonymousClassIDFExplanation1(int df, int max, float idf, Similarity enclosingInstance)
303  {
304  InitBlock(df, max, idf, enclosingInstance);
305  }
306  private void InitBlock(int df, int max, float idf, Similarity enclosingInstance)
307  {
308  this.df = df;
309  this.max = max;
310  this.idf = idf;
311  this.enclosingInstance = enclosingInstance;
312  }
313  private int df;
314  private int max;
315  private float idf;
316  private Similarity enclosingInstance;
317  public Similarity Enclosing_Instance
318  {
319  get
320  {
321  return enclosingInstance;
322  }
323 
324  }
325  //@Override
326  public override System.String Explain()
327  {
328  return "idf(docFreq=" + df + ", maxDocs=" + max + ")";
329  }
330  //@Override
331 
332  public override float Idf
333  {
334  get { return idf; }
335  }
336  }
337  [Serializable]
338  private class AnonymousClassIDFExplanation3:IDFExplanation
339  {
340  public AnonymousClassIDFExplanation3(float fIdf, System.Text.StringBuilder exp, Similarity enclosingInstance)
341  {
342  InitBlock(fIdf, exp, enclosingInstance);
343  }
344  private void InitBlock(float fIdf, System.Text.StringBuilder exp, Similarity enclosingInstance)
345  {
346  this.fIdf = fIdf;
347  this.exp = exp;
348  this.enclosingInstance = enclosingInstance;
349  }
350  private float fIdf;
351  private System.Text.StringBuilder exp;
352  private Similarity enclosingInstance;
353  public Similarity Enclosing_Instance
354  {
355  get
356  {
357  return enclosingInstance;
358  }
359 
360  }
361  //@Override
362 
363  public override float Idf
364  {
365  get { return fIdf; }
366  }
367 
368  //@Override
369  public override System.String Explain()
370  {
371  return exp.ToString();
372  }
373  }
374  private void InitBlock()
375  {
376 
377  }
378 
379  /// <summary>The Similarity implementation used by default.</summary>
380  private static Similarity defaultImpl = new DefaultSimilarity();
381  public const int NO_DOC_ID_PROVIDED = - 1;
382 
383  /// <summary>Gets or sets the default Similarity implementation
384  /// used by indexing and search code.
385  /// <p/>This is initially an instance of <see cref="DefaultSimilarity" />.
386  /// </summary>
387  /// <seealso cref="Searcher.Similarity">
388  /// </seealso>
389  /// <seealso cref="Lucene.Net.Index.IndexWriter.SetSimilarity(Similarity)">
390  /// </seealso>
391  public static Similarity Default
392  {
393  get { return defaultImpl; }
394  set { defaultImpl = value; }
395  }
396 
397  /// <summary>Cache of decoded bytes. </summary>
398  private static readonly float[] NORM_TABLE = new float[256];
399 
400  /// <summary>Decodes a normalization factor stored in an index.</summary>
401  /// <seealso cref="EncodeNorm(float)">
402  /// </seealso>
403  public static float DecodeNorm(byte b)
404  {
405  return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
406  }
407 
408  /// <summary>Returns a table for decoding normalization bytes.</summary>
409  /// <seealso cref="EncodeNorm(float)">
410  /// </seealso>
411  public static float[] GetNormDecoder()
412  {
413  return NORM_TABLE;
414  }
415 
416  /// <summary> Compute the normalization value for a field, given the accumulated
417  /// state of term processing for this field (see <see cref="FieldInvertState" />).
418  ///
419  /// <p/>Implementations should calculate a float value based on the field
420  /// state and then return that value.
421  ///
422  /// <p/>For backward compatibility this method by default calls
423  /// <see cref="LengthNorm(String, int)" /> passing
424  /// <see cref="FieldInvertState.Length" /> as the second argument, and
425  /// then multiplies this value by <see cref="FieldInvertState.Boost" />.<p/>
426  ///
427  /// <p/><b>WARNING</b>: This API is new and experimental and may
428  /// suddenly change.<p/>
429  ///
430  /// </summary>
431  /// <param name="field">field name
432  /// </param>
433  /// <param name="state">current processing state for this field
434  /// </param>
435  /// <returns> the calculated float norm
436  /// </returns>
437  public virtual float ComputeNorm(System.String field, FieldInvertState state)
438  {
439  return (float) (state.Boost * LengthNorm(field, state.Length));
440  }
441 
442  /// <summary>Computes the normalization value for a field given the total number of
443  /// terms contained in a field. These values, together with field boosts, are
444  /// stored in an index and multipled into scores for hits on each field by the
445  /// search code.
446  ///
447  /// <p/>Matches in longer fields are less precise, so implementations of this
448  /// method usually return smaller values when <c>numTokens</c> is large,
449  /// and larger values when <c>numTokens</c> is small.
450  ///
451  /// <p/>Note that the return values are computed under
452  /// <see cref="Lucene.Net.Index.IndexWriter.AddDocument(Lucene.Net.Documents.Document)" />
453  /// and then stored using
454  /// <see cref="EncodeNorm(float)" />.
455  /// Thus they have limited precision, and documents
456  /// must be re-indexed if this method is altered.
457  ///
458  /// </summary>
459  /// <param name="fieldName">the name of the field
460  /// </param>
461  /// <param name="numTokens">the total number of tokens contained in fields named
462  /// <i>fieldName</i> of <i>doc</i>.
463  /// </param>
464  /// <returns> a normalization factor for hits on this field of this document
465  ///
466  /// </returns>
467  /// <seealso cref="Lucene.Net.Documents.AbstractField.Boost" />
468  public abstract float LengthNorm(System.String fieldName, int numTokens);
469 
470  /// <summary>Computes the normalization value for a query given the sum of the squared
471  /// weights of each of the query terms. This value is then multipled into the
472  /// weight of each query term.
473  ///
474  /// <p/>This does not affect ranking, but rather just attempts to make scores
475  /// from different queries comparable.
476  ///
477  /// </summary>
478  /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights
479  /// </param>
480  /// <returns> a normalization factor for query weights
481  /// </returns>
482  public abstract float QueryNorm(float sumOfSquaredWeights);
483 
484  /// <summary>Encodes a normalization factor for storage in an index.
485  ///
486  /// <p/>The encoding uses a three-bit mantissa, a five-bit exponent, and
487  /// the zero-exponent point at 15, thus
488  /// representing values from around 7x10^9 to 2x10^-9 with about one
489  /// significant decimal digit of accuracy. Zero is also represented.
490  /// Negative numbers are rounded up to zero. Values too large to represent
491  /// are rounded down to the largest representable value. Positive values too
492  /// small to represent are rounded up to the smallest positive representable
493  /// value.
494  ///
495  /// </summary>
496  /// <seealso cref="Lucene.Net.Documents.AbstractField.Boost" />
497  /// <seealso cref="Lucene.Net.Util.SmallFloat" />
498  public static byte EncodeNorm(float f)
499  {
500  return (byte) SmallFloat.FloatToByte315(f);
501  }
502 
503 
504  /// <summary>Computes a score factor based on a term or phrase's frequency in a
505  /// document. This value is multiplied by the <see cref="Idf(int, int)" />
506  /// factor for each term in the query and these products are then summed to
507  /// form the initial score for a document.
508  ///
509  /// <p/>Terms and phrases repeated in a document indicate the topic of the
510  /// document, so implementations of this method usually return larger values
511  /// when <c>freq</c> is large, and smaller values when <c>freq</c>
512  /// is small.
513  ///
514  /// <p/>The default implementation calls <see cref="Tf(float)" />.
515  ///
516  /// </summary>
517  /// <param name="freq">the frequency of a term within a document
518  /// </param>
519  /// <returns> a score factor based on a term's within-document frequency
520  /// </returns>
521  public virtual float Tf(int freq)
522  {
523  return Tf((float) freq);
524  }
525 
526  /// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
527  /// This value is summed for each sloppy phrase match in a document to form
528  /// the frequency that is passed to <see cref="Tf(float)" />.
529  ///
530  /// <p/>A phrase match with a small edit distance to a document passage more
531  /// closely matches the document, so implementations of this method usually
532  /// return larger values when the edit distance is small and smaller values
533  /// when it is large.
534  ///
535  /// </summary>
536  /// <seealso cref="PhraseQuery.Slop" />
537  /// <param name="distance">the edit distance of this sloppy phrase match </param>
538  /// <returns> the frequency increment for this match </returns>
539  public abstract float SloppyFreq(int distance);
540 
541  /// <summary>Computes a score factor based on a term or phrase's frequency in a
542  /// document. This value is multiplied by the <see cref="Idf(int, int)" />
543  /// factor for each term in the query and these products are then summed to
544  /// form the initial score for a document.
545  ///
546  /// <p/>Terms and phrases repeated in a document indicate the topic of the
547  /// document, so implementations of this method usually return larger values
548  /// when <c>freq</c> is large, and smaller values when <c>freq</c>
549  /// is small.
550  ///
551  /// </summary>
552  /// <param name="freq">the frequency of a term within a document
553  /// </param>
554  /// <returns> a score factor based on a term's within-document frequency
555  /// </returns>
556  public abstract float Tf(float freq);
557 
558  /// <summary> Computes a score factor for a simple term and returns an explanation
559  /// for that score factor.
560  ///
561  /// <p/>
562  /// The default implementation uses:
563  ///
564  /// <code>
565  /// idf(searcher.docFreq(term), searcher.MaxDoc);
566  /// </code>
567  ///
568  /// Note that <see cref="Searcher.MaxDoc" /> is used instead of
569  /// <see cref="Lucene.Net.Index.IndexReader.NumDocs()" /> because it is
570  /// proportional to <see cref="Searcher.DocFreq(Term)" /> , i.e., when one is
571  /// inaccurate, so is the other, and in the same direction.
572  ///
573  /// </summary>
574  /// <param name="term">the term in question
575  /// </param>
576  /// <param name="searcher">the document collection being searched
577  /// </param>
578  /// <returns> an IDFExplain object that includes both an idf score factor
579  /// and an explanation for the term.
580  /// </returns>
581  /// <throws> IOException </throws>
582  public virtual IDFExplanation IdfExplain(Term term, Searcher searcher)
583  {
584  int df = searcher.DocFreq(term);
585  int max = searcher.MaxDoc;
586  float idf2 = Idf(df, max);
587  return new AnonymousClassIDFExplanation1(df, max, idf2, this);
588  }
589 
590  /// <summary> Computes a score factor for a phrase.
591  ///
592  /// <p/>
593  /// The default implementation sums the idf factor for
594  /// each term in the phrase.
595  ///
596  /// </summary>
597  /// <param name="terms">the terms in the phrase
598  /// </param>
599  /// <param name="searcher">the document collection being searched
600  /// </param>
601  /// <returns> an IDFExplain object that includes both an idf
602  /// score factor for the phrase and an explanation
603  /// for each term.
604  /// </returns>
605  /// <throws> IOException </throws>
606  public virtual IDFExplanation IdfExplain(ICollection<Term> terms, Searcher searcher)
607  {
608  int max = searcher.MaxDoc;
609  float idf2 = 0.0f;
610  System.Text.StringBuilder exp = new System.Text.StringBuilder();
611  foreach (Term term in terms)
612  {
613  int df = searcher.DocFreq(term);
614  idf2 += Idf(df, max);
615  exp.Append(" ");
616  exp.Append(term.Text);
617  exp.Append("=");
618  exp.Append(df);
619  }
620  float fIdf = idf2;
621  return new AnonymousClassIDFExplanation3(fIdf, exp, this);
622  }
623 
624  /// <summary>Computes a score factor based on a term's document frequency (the number
625  /// of documents which contain the term). This value is multiplied by the
626  /// <see cref="Tf(int)" /> factor for each term in the query and these products are
627  /// then summed to form the initial score for a document.
628  ///
629  /// <p/>Terms that occur in fewer documents are better indicators of topic, so
630  /// implementations of this method usually return larger values for rare terms,
631  /// and smaller values for common terms.
632  ///
633  /// </summary>
634  /// <param name="docFreq">the number of documents which contain the term
635  /// </param>
636  /// <param name="numDocs">the total number of documents in the collection
637  /// </param>
638  /// <returns> a score factor based on the term's document frequency
639  /// </returns>
640  public abstract float Idf(int docFreq, int numDocs);
641 
642  /// <summary>Computes a score factor based on the fraction of all query terms that a
643  /// document contains. This value is multiplied into scores.
644  ///
645  /// <p/>The presence of a large portion of the query terms indicates a better
646  /// match with the query, so implementations of this method usually return
647  /// larger values when the ratio between these parameters is large and smaller
648  /// values when the ratio between them is small.
649  ///
650  /// </summary>
651  /// <param name="overlap">the number of query terms matched in the document
652  /// </param>
653  /// <param name="maxOverlap">the total number of terms in the query
654  /// </param>
655  /// <returns> a score factor based on term overlap with the query
656  /// </returns>
657  public abstract float Coord(int overlap, int maxOverlap);
658 
659 
660  /// <summary> Calculate a scoring factor based on the data in the payload. Overriding implementations
661  /// are responsible for interpreting what is in the payload. Lucene makes no assumptions about
662  /// what is in the byte array.
663  /// <p/>
664  /// The default implementation returns 1.
665  ///
666  /// </summary>
667  /// <param name="docId">The docId currently being scored. If this value is <see cref="NO_DOC_ID_PROVIDED" />, then it should be assumed that the PayloadQuery implementation does not provide document information
668  /// </param>
669  /// <param name="fieldName">The fieldName of the term this payload belongs to
670  /// </param>
671  /// <param name="start">The start position of the payload
672  /// </param>
673  /// <param name="end">The end position of the payload
674  /// </param>
675  /// <param name="payload">The payload byte array to be scored
676  /// </param>
677  /// <param name="offset">The offset into the payload array
678  /// </param>
679  /// <param name="length">The length in the array
680  /// </param>
681  /// <returns> An implementation dependent float to be used as a scoring factor
682  ///
683  /// </returns>
684  public virtual float ScorePayload(int docId, System.String fieldName, int start, int end, byte[] payload, int offset, int length)
685  {
686  return 1;
687  }
688 
689  static Similarity()
690  {
691  {
692  for (int i = 0; i < 256; i++)
693  NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
694  }
695  }
696  }
697 }