Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
NearSpansOrdered.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using IndexReader = Lucene.Net.Index.IndexReader;
21 
22 namespace Lucene.Net.Search.Spans
23 {
24 
25  /// <summary>A Spans that is formed from the ordered subspans of a SpanNearQuery
26  /// where the subspans do not overlap and have a maximum slop between them.
27  /// <p/>
28  /// The formed spans only contains minimum slop matches.<br/>
29  /// The matching slop is computed from the distance(s) between
30  /// the non overlapping matching Spans.<br/>
31  /// Successive matches are always formed from the successive Spans
32  /// of the SpanNearQuery.
33  /// <p/>
34  /// The formed spans may contain overlaps when the slop is at least 1.
35  /// For example, when querying using
36  /// <c>t1 t2 t3</c>
37  /// with slop at least 1, the fragment:
38  /// <c>t1 t2 t1 t3 t2 t3</c>
39  /// matches twice:
40  /// <c>t1 t2 .. t3 </c>
41  /// <c> t1 .. t2 t3</c>
42  ///
43  ///
44  /// Expert:
45  /// Only public for subclassing. Most implementations should not need this class
46  /// </summary>
47  public class NearSpansOrdered:Spans
48  {
49  internal class AnonymousClassComparator : System.Collections.IComparer
50  {
51  public AnonymousClassComparator(NearSpansOrdered enclosingInstance)
52  {
53  InitBlock(enclosingInstance);
54  }
55  private void InitBlock(NearSpansOrdered enclosingInstance)
56  {
57  this.enclosingInstance = enclosingInstance;
58  }
59  private NearSpansOrdered enclosingInstance;
60  public NearSpansOrdered Enclosing_Instance
61  {
62  get
63  {
64  return enclosingInstance;
65  }
66 
67  }
68  public virtual int Compare(System.Object o1, System.Object o2)
69  {
70  return ((Spans) o1).Doc() - ((Spans) o2).Doc();
71  }
72  }
73  private void InitBlock()
74  {
75  spanDocComparator = new AnonymousClassComparator(this);
76  }
77  private int allowedSlop;
78  private bool firstTime = true;
79  private bool more = false;
80 
81  /// <summary>The spans in the same order as the SpanNearQuery </summary>
82  private Spans[] subSpans;
83 
84  /// <summary>Indicates that all subSpans have same doc() </summary>
85  private bool inSameDoc = false;
86 
87  private int matchDoc = - 1;
88  private int matchStart = - 1;
89  private int matchEnd = - 1;
90  private System.Collections.Generic.List<byte[]> matchPayload;
91 
92  private Spans[] subSpansByDoc;
93  private System.Collections.IComparer spanDocComparator;
94 
95  private SpanNearQuery query;
96  private bool collectPayloads = true;
97 
98  public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader):this(spanNearQuery, reader, true)
99  {
100  }
101 
102  public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, bool collectPayloads)
103  {
104  InitBlock();
105  if (spanNearQuery.GetClauses().Length < 2)
106  {
107  throw new System.ArgumentException("Less than 2 clauses: " + spanNearQuery);
108  }
109  this.collectPayloads = collectPayloads;
110  allowedSlop = spanNearQuery.Slop;
111  SpanQuery[] clauses = spanNearQuery.GetClauses();
112  subSpans = new Spans[clauses.Length];
113  matchPayload = new System.Collections.Generic.List<byte[]>();
114  subSpansByDoc = new Spans[clauses.Length];
115  for (int i = 0; i < clauses.Length; i++)
116  {
117  subSpans[i] = clauses[i].GetSpans(reader);
118  subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
119  }
120  query = spanNearQuery; // kept for toString() only.
121  }
122 
123  // inherit javadocs
124  public override int Doc()
125  {
126  return matchDoc;
127  }
128 
129  // inherit javadocs
130  public override int Start()
131  {
132  return matchStart;
133  }
134 
135  // inherit javadocs
136  public override int End()
137  {
138  return matchEnd;
139  }
140 
141  public virtual Spans[] GetSubSpans()
142  {
143  return subSpans;
144  }
145 
146  // TODO: Remove warning after API has been finalized
147  // TODO: Would be nice to be able to lazy load payloads
148 
149  public override ICollection<byte[]> GetPayload()
150  {
151  return matchPayload;
152  }
153 
154  // TODO: Remove warning after API has been finalized
155 
156  public override bool IsPayloadAvailable()
157  {
158  return (matchPayload.Count == 0) == false;
159  }
160 
161  // inherit javadocs
162  public override bool Next()
163  {
164  if (firstTime)
165  {
166  firstTime = false;
167  for (int i = 0; i < subSpans.Length; i++)
168  {
169  if (!subSpans[i].Next())
170  {
171  more = false;
172  return false;
173  }
174  }
175  more = true;
176  }
177  if (collectPayloads)
178  {
179  matchPayload.Clear();
180  }
181  return AdvanceAfterOrdered();
182  }
183 
184  // inherit javadocs
185  public override bool SkipTo(int target)
186  {
187  if (firstTime)
188  {
189  firstTime = false;
190  for (int i = 0; i < subSpans.Length; i++)
191  {
192  if (!subSpans[i].SkipTo(target))
193  {
194  more = false;
195  return false;
196  }
197  }
198  more = true;
199  }
200  else if (more && (subSpans[0].Doc() < target))
201  {
202  if (subSpans[0].SkipTo(target))
203  {
204  inSameDoc = false;
205  }
206  else
207  {
208  more = false;
209  return false;
210  }
211  }
212  if (collectPayloads)
213  {
214  matchPayload.Clear();
215  }
216  return AdvanceAfterOrdered();
217  }
218 
219  /// <summary>Advances the subSpans to just after an ordered match with a minimum slop
220  /// that is smaller than the slop allowed by the SpanNearQuery.
221  /// </summary>
222  /// <returns> true iff there is such a match.
223  /// </returns>
224  private bool AdvanceAfterOrdered()
225  {
226  while (more && (inSameDoc || ToSameDoc()))
227  {
228  if (StretchToOrder() && ShrinkToAfterShortestMatch())
229  {
230  return true;
231  }
232  }
233  return false; // no more matches
234  }
235 
236 
237  /// <summary>Advance the subSpans to the same document </summary>
238  private bool ToSameDoc()
239  {
240  System.Array.Sort(subSpansByDoc, spanDocComparator);
241  int firstIndex = 0;
242  int maxDoc = subSpansByDoc[subSpansByDoc.Length - 1].Doc();
243  while (subSpansByDoc[firstIndex].Doc() != maxDoc)
244  {
245  if (!subSpansByDoc[firstIndex].SkipTo(maxDoc))
246  {
247  more = false;
248  inSameDoc = false;
249  return false;
250  }
251  maxDoc = subSpansByDoc[firstIndex].Doc();
252  if (++firstIndex == subSpansByDoc.Length)
253  {
254  firstIndex = 0;
255  }
256  }
257  for (int i = 0; i < subSpansByDoc.Length; i++)
258  {
259  System.Diagnostics.Debug.Assert((subSpansByDoc [i].Doc() == maxDoc)
260  , "NearSpansOrdered.toSameDoc() spans " + subSpansByDoc [0]
261  + "\n at doc " + subSpansByDoc [i].Doc()
262  + ", but should be at " + maxDoc);
263  }
264  inSameDoc = true;
265  return true;
266  }
267 
268  /// <summary>Check whether two Spans in the same document are ordered.</summary>
269  /// <param name="spans1">
270  /// </param>
271  /// <param name="spans2">
272  /// </param>
273  /// <returns> true iff spans1 starts before spans2
274  /// or the spans start at the same position,
275  /// and spans1 ends before spans2.
276  /// </returns>
277  internal static bool DocSpansOrdered(Spans spans1, Spans spans2)
278  {
279  System.Diagnostics.Debug.Assert(spans1.Doc() == spans2.Doc(), "doc1 " + spans1.Doc() + " != doc2 " + spans2.Doc());
280  int start1 = spans1.Start();
281  int start2 = spans2.Start();
282  /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */
283  return (start1 == start2)?(spans1.End() < spans2.End()):(start1 < start2);
284  }
285 
286  /// <summary>Like <see cref="DocSpansOrdered(Spans,Spans)" />, but use the spans
287  /// starts and ends as parameters.
288  /// </summary>
289  private static bool DocSpansOrdered(int start1, int end1, int start2, int end2)
290  {
291  return (start1 == start2)?(end1 < end2):(start1 < start2);
292  }
293 
294  /// <summary>Order the subSpans within the same document by advancing all later spans
295  /// after the previous one.
296  /// </summary>
297  private bool StretchToOrder()
298  {
299  matchDoc = subSpans[0].Doc();
300  for (int i = 1; inSameDoc && (i < subSpans.Length); i++)
301  {
302  while (!DocSpansOrdered(subSpans[i - 1], subSpans[i]))
303  {
304  if (!subSpans[i].Next())
305  {
306  inSameDoc = false;
307  more = false;
308  break;
309  }
310  else if (matchDoc != subSpans[i].Doc())
311  {
312  inSameDoc = false;
313  break;
314  }
315  }
316  }
317  return inSameDoc;
318  }
319 
320  /// <summary>The subSpans are ordered in the same doc, so there is a possible match.
321  /// Compute the slop while making the match as short as possible by advancing
322  /// all subSpans except the last one in reverse order.
323  /// </summary>
324  private bool ShrinkToAfterShortestMatch()
325  {
326  matchStart = subSpans[subSpans.Length - 1].Start();
327  matchEnd = subSpans[subSpans.Length - 1].End();
328  System.Collections.Generic.Dictionary<byte[], byte[]> possibleMatchPayloads = new System.Collections.Generic.Dictionary<byte[], byte[]>();
329  if (subSpans[subSpans.Length - 1].IsPayloadAvailable())
330  {
331  System.Collections.Generic.ICollection<byte[]> payload = subSpans[subSpans.Length - 1].GetPayload();
332  foreach(byte[] pl in payload)
333  {
334  if (!possibleMatchPayloads.ContainsKey(pl))
335  {
336  possibleMatchPayloads.Add(pl, pl);
337  }
338  }
339  }
340 
341  System.Collections.Generic.List<byte[]> possiblePayload = null;
342 
343  int matchSlop = 0;
344  int lastStart = matchStart;
345  int lastEnd = matchEnd;
346  for (int i = subSpans.Length - 2; i >= 0; i--)
347  {
348  Spans prevSpans = subSpans[i];
349  if (collectPayloads && prevSpans.IsPayloadAvailable())
350  {
351  System.Collections.Generic.ICollection<byte[]> payload = prevSpans.GetPayload();
352  possiblePayload = new System.Collections.Generic.List<byte[]>(payload.Count);
353  possiblePayload.AddRange(payload);
354  }
355 
356  int prevStart = prevSpans.Start();
357  int prevEnd = prevSpans.End();
358  while (true)
359  {
360  // Advance prevSpans until after (lastStart, lastEnd)
361  if (!prevSpans.Next())
362  {
363  inSameDoc = false;
364  more = false;
365  break; // Check remaining subSpans for final match.
366  }
367  else if (matchDoc != prevSpans.Doc())
368  {
369  inSameDoc = false; // The last subSpans is not advanced here.
370  break; // Check remaining subSpans for last match in this document.
371  }
372  else
373  {
374  int ppStart = prevSpans.Start();
375  int ppEnd = prevSpans.End(); // Cannot avoid invoking .end()
376  if (!DocSpansOrdered(ppStart, ppEnd, lastStart, lastEnd))
377  {
378  break; // Check remaining subSpans.
379  }
380  else
381  {
382  // prevSpans still before (lastStart, lastEnd)
383  prevStart = ppStart;
384  prevEnd = ppEnd;
385  if (collectPayloads && prevSpans.IsPayloadAvailable())
386  {
387  System.Collections.Generic.ICollection<byte[]> payload = prevSpans.GetPayload();
388  possiblePayload = new System.Collections.Generic.List<byte[]>(payload.Count);
389  possiblePayload.AddRange(payload);
390  }
391  }
392  }
393  }
394 
395  if (collectPayloads && possiblePayload != null)
396  {
397  foreach (byte[] pl in possiblePayload)
398  {
399  if (!possibleMatchPayloads.ContainsKey(pl))
400  {
401  possibleMatchPayloads.Add(pl, pl);
402  }
403  }
404  }
405 
406  System.Diagnostics.Debug.Assert(prevStart <= matchStart);
407  if (matchStart > prevEnd)
408  {
409  // Only non overlapping spans add to slop.
410  matchSlop += (matchStart - prevEnd);
411  }
412 
413  /* Do not break on (matchSlop > allowedSlop) here to make sure
414  * that subSpans[0] is advanced after the match, if any.
415  */
416  matchStart = prevStart;
417  lastStart = prevStart;
418  lastEnd = prevEnd;
419  }
420 
421  bool match = matchSlop <= allowedSlop;
422 
423  if (collectPayloads && match && possibleMatchPayloads.Count > 0)
424  {
425  matchPayload.AddRange(possibleMatchPayloads.Keys);
426  }
427 
428  return match; // ordered and allowed slop
429  }
430 
431  public override System.String ToString()
432  {
433  return GetType().FullName + "(" + query.ToString() + ")@" + (firstTime?"START":(more?(Doc() + ":" + Start() + "-" + End()):"END"));
434  }
435  }
436 }