Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
SloppyPhraseScorer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Linq;
20 using Lucene.Net.Support;
21 using TermPositions = Lucene.Net.Index.TermPositions;
22 
23 namespace Lucene.Net.Search
24 {
25 
27  {
28  private int slop;
29  private PhrasePositions[] repeats;
30  private PhrasePositions[] tmpPos; // for flipping repeating pps.
31  private bool checkedRepeats;
32 
33  internal SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms):base(weight, tps, offsets, similarity, norms)
34  {
35  this.slop = slop;
36  }
37 
38  /// <summary> Score a candidate doc for all slop-valid position-combinations (matches)
39  /// encountered while traversing/hopping the PhrasePositions.
40  /// <br/> The score contribution of a match depends on the distance:
41  /// <br/> - highest score for distance=0 (exact match).
42  /// <br/> - score gets lower as distance gets higher.
43  /// <br/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
44  /// once for "a b" (distance=0), and once for "b a" (distance=2).
45  /// <br/>Possibly not all valid combinations are encountered, because for efficiency
46  /// we always propagate the least PhrasePosition. This allows to base on
47  /// PriorityQueue and move forward faster.
48  /// As result, for example, document "a b c b a"
49  /// would score differently for queries "a b c"~4 and "c b a"~4, although
50  /// they really are equivalent.
51  /// Similarly, for doc "a b c b a f g", query "c b"~2
52  /// would get same score as "g f"~2, although "c b"~2 could be matched twice.
53  /// We may want to fix this in the future (currently not, for performance reasons).
54  /// </summary>
55  protected internal override float PhraseFreq()
56  {
57  int end = InitPhrasePositions();
58 
59  float freq = 0.0f;
60  bool done = (end < 0);
61  while (!done)
62  {
63  PhrasePositions pp = pq.Pop();
64  int start = pp.position;
65  int next = pq.Top().position;
66 
67  bool tpsDiffer = true;
68  for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position)
69  {
70  if (pos <= next && tpsDiffer)
71  start = pos; // advance pp to min window
72  if (!pp.NextPosition())
73  {
74  done = true; // ran out of a term -- done
75  break;
76  }
77  PhrasePositions pp2 = null;
78  tpsDiffer = !pp.repeats || (pp2 = TermPositionsDiffer(pp)) == null;
79  if (pp2 != null && pp2 != pp)
80  {
81  pp = Flip(pp, pp2); // flip pp to pp2
82  }
83  }
84 
85  int matchLength = end - start;
86  if (matchLength <= slop)
87  freq += Similarity.SloppyFreq(matchLength); // score match
88 
89  if (pp.position > end)
90  end = pp.position;
91  pq.Add(pp); // restore pq
92  }
93 
94  return freq;
95  }
96 
97  // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
98  // assumes: pp!=pp2, pp2 in pq, pp not in pq.
99  // called only when there are repeating pps.
100  private PhrasePositions Flip(PhrasePositions pp, PhrasePositions pp2)
101  {
102  int n = 0;
103  PhrasePositions pp3;
104  //pop until finding pp2
105  while ((pp3 = pq.Pop()) != pp2)
106  {
107  tmpPos[n++] = pp3;
108  }
109  //insert back all but pp2
110  for (n--; n >= 0; n--)
111  {
112  pq.InsertWithOverflow(tmpPos[n]);
113  }
114  //insert pp back
115  pq.Add(pp);
116  return pp2;
117  }
118 
119  /// <summary> Init PhrasePositions in place.
120  /// There is a one time initialization for this scorer:
121  /// <br/>- Put in repeats[] each pp that has another pp with same position in the doc.
122  /// <br/>- Also mark each such pp by pp.repeats = true.
123  /// <br/>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
124  /// In particular, this allows to score queries with no repetitions with no overhead due to this computation.
125  /// <br/>- Example 1 - query with no repetitions: "ho my"~2
126  /// <br/>- Example 2 - query with repetitions: "ho my my"~2
127  /// <br/>- Example 3 - query with repetitions: "my ho my"~2
128  /// <br/>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
129  /// </summary>
130  /// <returns> end (max position), or -1 if any term ran out (i.e. done)
131  /// </returns>
132  /// <throws> IOException </throws>
133  private int InitPhrasePositions()
134  {
135  int end = 0;
136 
137  // no repeats at all (most common case is also the simplest one)
138  if (checkedRepeats && repeats == null)
139  {
140  // build queue from list
141  pq.Clear();
142  for (PhrasePositions pp = first; pp != null; pp = pp.next)
143  {
144  pp.FirstPosition();
145  if (pp.position > end)
146  end = pp.position;
147  pq.Add(pp); // build pq from list
148  }
149  return end;
150  }
151 
152  // position the pp's
153  for (PhrasePositions pp = first; pp != null; pp = pp.next)
154  pp.FirstPosition();
155 
156  // one time initializatin for this scorer
157  if (!checkedRepeats)
158  {
159  checkedRepeats = true;
160  // check for repeats
161  HashMap<PhrasePositions, object> m = null;
162  for (PhrasePositions pp = first; pp != null; pp = pp.next)
163  {
164  int tpPos = pp.position + pp.offset;
165  for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next)
166  {
167  int tpPos2 = pp2.position + pp2.offset;
168  if (tpPos2 == tpPos)
169  {
170  if (m == null)
171  {
172  m = new HashMap<PhrasePositions, object>();
173  }
174  pp.repeats = true;
175  pp2.repeats = true;
176  m[pp] = null;
177  m[pp2] = null;
178  }
179  }
180  }
181  if (m != null)
182  {
183  repeats = m.Keys.ToArray();
184  }
185  }
186 
187  // with repeats must advance some repeating pp's so they all start with differing tp's
188  if (repeats != null)
189  {
190  for (int i = 0; i < repeats.Length; i++)
191  {
192  PhrasePositions pp = repeats[i];
193  PhrasePositions pp2;
194  while ((pp2 = TermPositionsDiffer(pp)) != null)
195  {
196  if (!pp2.NextPosition())
197  // out of pps that do not differ, advance the pp with higher offset
198  return - 1; // ran out of a term -- done
199  }
200  }
201  }
202 
203  // build queue from list
204  pq.Clear();
205  for (PhrasePositions pp = first; pp != null; pp = pp.next)
206  {
207  if (pp.position > end)
208  end = pp.position;
209  pq.Add(pp); // build pq from list
210  }
211 
212  if (repeats != null)
213  {
214  tmpPos = new PhrasePositions[pq.Size()];
215  }
216  return end;
217  }
218 
219  /// <summary> We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
220  /// in the query of the same word would go elsewhere in the matched doc.
221  /// </summary>
222  /// <returns> null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
223  /// out of the first two PPs found to not differ.
224  /// </returns>
225  private PhrasePositions TermPositionsDiffer(PhrasePositions pp)
226  {
227  // efficiency note: a more efficient implementation could keep a map between repeating
228  // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
229  // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
230  // However this would complicate code, for a rather rare case, so choice is to compromise here.
231  int tpPos = pp.position + pp.offset;
232  for (int i = 0; i < repeats.Length; i++)
233  {
234  PhrasePositions pp2 = repeats[i];
235  if (pp2 == pp)
236  continue;
237  int tpPos2 = pp2.position + pp2.offset;
238  if (tpPos2 == tpPos)
239  return pp.offset > pp2.offset?pp:pp2; // do not differ: return the one with higher offset.
240  }
241  return null;
242  }
243  }
244 }