Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
WeightedSpanTermExtractor.cs
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Index.Memory;
using Lucene.Net.Search.Spans;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Class used to extract <see cref="WeightedSpanTerm"/>s from a <see cref="Query"/> based on whether
    /// <see cref="Term"/>s from the <see cref="Query"/> are contained in a supplied <see cref="Analysis.TokenStream"/>.
    /// </summary>
    public class WeightedSpanTermExtractor
    {
        private String fieldName;
        private TokenStream tokenStream;
        private IDictionary<String, IndexReader> readers = new HashMap<String, IndexReader>(10);
        private String defaultField;
        private bool expandMultiTermQuery;
        private bool cachedTokenStream;
        private bool wrapToCaching = true;

        public WeightedSpanTermExtractor()
        {
        }

        public WeightedSpanTermExtractor(String defaultField)
        {
            if (defaultField != null)
            {
                this.defaultField = StringHelper.Intern(defaultField);
            }
        }

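        /// <summary>
        /// Closes the per-field readers created by <see cref="GetReaderForField"/>;
        /// IO errors raised while closing are ignored.
        /// </summary>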
        private void CloseReaders()
        {
            ICollection<IndexReader> readerSet = readers.Values;

            foreach (IndexReader reader in readerSet)
            {
                try
                {
                    reader.Close();
                }
                catch (IOException)
                {
                    // alert?
                }
            }
        }

        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>.
        /// </summary>
        /// <param name="query">Query to extract Terms from</param>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        private void Extract(Query query, IDictionary<String, WeightedSpanTerm> terms)
        {
            if (query is BooleanQuery)
            {
                BooleanClause[] queryClauses = ((BooleanQuery) query).GetClauses();

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        Extract(queryClauses[i].Query, terms);
                    }
                }
            }
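            // A PhraseQuery has no span semantics of its own, so it is translated into an
            // equivalent SpanNearQuery; the slop is widened by the largest position increment
            // so phrases that contain position gaps can still be matched.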
            else if (query is PhraseQuery)
            {
                PhraseQuery phraseQuery = ((PhraseQuery) query);
                Term[] phraseQueryTerms = phraseQuery.GetTerms();
                SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }
                int slop = phraseQuery.Slop;
                int[] positions = phraseQuery.GetPositions();
                // add largest position increment to slop
                if (positions.Length > 0)
                {
                    int lastPos = positions[0];
                    int largestInc = 0;
                    int sz = positions.Length;
                    for (int i = 1; i < sz; i++)
                    {
                        int pos = positions[i];
                        int inc = pos - lastPos;
                        if (inc > largestInc)
                        {
                            largestInc = inc;
                        }
                        lastPos = pos;
                    }
                    if (largestInc > 1)
                    {
                        slop += largestInc;
                    }
                }

                bool inorder = slop == 0;

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
                sp.Boost = query.Boost;
                ExtractWeightedSpanTerms(terms, sp);
            }
            else if (query is TermQuery)
            {
                ExtractWeightedTerms(terms, query);
            }
            else if (query is SpanQuery)
            {
                ExtractWeightedSpanTerms(terms, (SpanQuery) query);
            }
            else if (query is FilteredQuery)
            {
                Extract(((FilteredQuery) query).Query, terms);
            }
            else if (query is DisjunctionMaxQuery)
            {
                foreach (var q in ((DisjunctionMaxQuery) query))
                {
                    Extract(q, terms);
                }
            }
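            // Expanded multi-term queries (wildcard, prefix, fuzzy, ...) are first rewritten into a
            // boolean query of concrete terms; the FakeReader is only used to discover which field the
            // query targets so the real rewrite can run against that field's in-memory reader.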
            else if (query is MultiTermQuery && expandMultiTermQuery)
            {
                MultiTermQuery mtq = ((MultiTermQuery) query);
                if (mtq.RewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
                {
                    mtq = (MultiTermQuery) mtq.Clone();
                    mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                    query = mtq;
                }
                FakeReader fReader = new FakeReader();
                MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq);
                if (fReader.Field != null)
                {
                    IndexReader ir = GetReaderForField(fReader.Field);
                    Extract(query.Rewrite(ir), terms);
                }
            }
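            // A MultiPhraseQuery is flattened into a SpanNearQuery whose clauses are SpanOrQuerys,
            // one per distinct position; positions with no terms at all are counted as gaps and
            // added to the slop.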
            else if (query is MultiPhraseQuery)
            {
                MultiPhraseQuery mpq = (MultiPhraseQuery) query;
                IList<Term[]> termArrays = mpq.GetTermArrays();
                int[] positions = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    var disjunctLists = new List<SpanQuery>[maxPosition + 1];
                    int distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[] termArray = termArrays[i];
                        List<SpanQuery> disjuncts = disjunctLists[positions[i]];
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length));
                            ++distinctPositions;
                        }
                        for (int j = 0; j < termArray.Length; ++j)
                        {
                            disjuncts.Add(new SpanTermQuery(termArray[j]));
                        }
                    }

                    int positionGaps = 0;
                    int position = 0;
                    SpanQuery[] clauses = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        List<SpanQuery> disjuncts = disjunctLists[i];
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int slop = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    ExtractWeightedSpanTerms(terms, sp);
                }
            }
        }

        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>SpanQuery</c>.
        /// </summary>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        /// <param name="spanQuery">SpanQuery to extract Terms from</param>
        private void ExtractWeightedSpanTerms(IDictionary<String, WeightedSpanTerm> terms, SpanQuery spanQuery)
        {
            HashSet<String> fieldNames;

            if (fieldName == null)
            {
                fieldNames = new HashSet<String>();
                CollectSpanQueryFields(spanQuery, fieldNames);
            }
            else
            {
                fieldNames = new HashSet<String>();
                fieldNames.Add(fieldName);
            }
            // To support the use of the default field name
            if (defaultField != null)
            {
                fieldNames.Add(defaultField);
            }

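            // Span queries that wrap multi-term queries must be rewritten against each field's
            // reader before their concrete terms can be extracted.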
            IDictionary<String, SpanQuery> queries = new HashMap<String, SpanQuery>();

            var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
            bool mustRewriteQuery = MustRewriteQuery(spanQuery);
            if (mustRewriteQuery)
            {
                foreach (String field in fieldNames)
                {
                    SpanQuery rewrittenQuery = (SpanQuery) spanQuery.Rewrite(GetReaderForField(field));
                    queries[field] = rewrittenQuery;
                    rewrittenQuery.ExtractTerms(nonWeightedTerms);
                }
            }
            else
            {
                spanQuery.ExtractTerms(nonWeightedTerms);
            }

            List<PositionSpan> spanPositions = new List<PositionSpan>();

            foreach (String field in fieldNames)
            {
                IndexReader reader = GetReaderForField(field);
                Spans.Spans spans;
                if (mustRewriteQuery)
                {
                    spans = queries[field].GetSpans(reader);
                }
                else
                {
                    spans = spanQuery.GetSpans(reader);
                }

                // collect span positions
                while (spans.Next())
                {
                    spanPositions.Add(new PositionSpan(spans.Start(), spans.End() - 1));
                }
            }

            if (spanPositions.Count == 0)
            {
                // no spans found
                return;
            }

            foreach (Term queryTerm in nonWeightedTerms)
            {
                if (FieldNameComparator(queryTerm.Field))
                {
                    WeightedSpanTerm weightedSpanTerm = terms[queryTerm.Text];

                    if (weightedSpanTerm == null)
                    {
                        weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text);
                        weightedSpanTerm.AddPositionSpans(spanPositions);
                        weightedSpanTerm.SetPositionSensitive(true);
                        terms[queryTerm.Text] = weightedSpanTerm;
                    }
                    else
                    {
                        if (spanPositions.Count > 0)
                        {
                            weightedSpanTerm.AddPositionSpans(spanPositions);
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>.
        /// </summary>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        /// <param name="query">Query to extract Terms from</param>
        private void ExtractWeightedTerms(IDictionary<String, WeightedSpanTerm> terms, Query query)
        {
            var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
            query.ExtractTerms(nonWeightedTerms);

            foreach (Term queryTerm in nonWeightedTerms)
            {
                if (FieldNameComparator(queryTerm.Field))
                {
                    WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.Boost, queryTerm.Text);
                    terms[queryTerm.Text] = weightedSpanTerm;
                }
            }
        }

        /// <summary>
        /// Necessary to implement matches for queries against <c>defaultField</c>.
        /// </summary>
        private bool FieldNameComparator(String fieldNameToCheck)
        {
            bool rv = fieldName == null || fieldNameToCheck == fieldName
                      || fieldNameToCheck == defaultField;
            return rv;
        }

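        /// <summary>
        /// Returns an <c>IndexReader</c> over a single-document <see cref="MemoryIndex"/> built from the
        /// analyzed token stream for the given field, so span and positional queries can be run directly
        /// against the text being highlighted. Unless disabled via <see cref="SetWrapIfNotCachingTokenFilter"/>,
        /// the stream is wrapped in a <see cref="CachingTokenFilter"/> so it can be replayed for each field.
        /// </summary>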
        private IndexReader GetReaderForField(String field)
        {
            if (wrapToCaching && !cachedTokenStream && !(tokenStream is CachingTokenFilter))
            {
                tokenStream = new CachingTokenFilter(tokenStream);
                cachedTokenStream = true;
            }
            IndexReader reader = readers[field];
            if (reader == null)
            {
                MemoryIndex indexer = new MemoryIndex();
                indexer.AddField(field, tokenStream);
                tokenStream.Reset();
                IndexSearcher searcher = indexer.CreateSearcher();
                reader = searcher.IndexReader;
                readers[field] = reader;
            }

            return reader;
        }

        /// <summary>
        /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>.
        /// </summary>
        /// <param name="query">query that caused hit</param>
        /// <param name="tokenStream">TokenStream of text to be highlighted</param>
        /// <returns>Map containing WeightedSpanTerms</returns>
        public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream)
        {
            return GetWeightedSpanTerms(query, tokenStream, null);
        }

        /// <summary>
        /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>.
        /// </summary>
        /// <param name="query">query that caused hit</param>
        /// <param name="tokenStream">TokenStream of text to be highlighted</param>
        /// <param name="fieldName">restricts Terms used based on field name</param>
        /// <returns>Map containing WeightedSpanTerms</returns>
        public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream,
                                                                          String fieldName)
        {
            if (fieldName != null)
            {
                this.fieldName = StringHelper.Intern(fieldName);
            }
            else
            {
                this.fieldName = null;
            }

            IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
            this.tokenStream = tokenStream;
            try
            {
                Extract(query, terms);
            }
            finally
            {
                CloseReaders();
            }

            return terms;
        }

        /// <summary>
        /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>. Uses a supplied
        /// <c>IndexReader</c> to properly weight terms (for gradient highlighting).
        /// </summary>
        /// <param name="query">Query that caused hit</param>
        /// <param name="tokenStream">TokenStream of text to be highlighted</param>
        /// <param name="fieldName">restricts Terms used based on field name</param>
        /// <param name="reader">to use for scoring</param>
        /// <returns>Map of WeightedSpanTerms with quasi tf/idf scores</returns>
        public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
                                                                                    String fieldName, IndexReader reader)
        {
            if (fieldName != null)
            {
                this.fieldName = StringHelper.Intern(fieldName);
            }
            else
            {
                this.fieldName = null;
            }
            this.tokenStream = tokenStream;

            IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
            Extract(query, terms);

            int totalNumDocs = reader.NumDocs();
            var weightedTerms = terms.Keys;

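            // Scale each term's weight by an idf factor computed as in DefaultSimilarity:
            // idf = log(numDocs / (docFreq + 1)) + 1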
            try
            {
                foreach (var wt in weightedTerms)
                {
                    WeightedSpanTerm weightedSpanTerm = terms[wt];
                    int docFreq = reader.DocFreq(new Term(fieldName, weightedSpanTerm.Term));
                    // docFreq counts deletes
                    if (totalNumDocs < docFreq)
                    {
                        docFreq = totalNumDocs;
                    }
                    // IDF algorithm taken from DefaultSimilarity class
                    float idf = (float) (Math.Log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
                    weightedSpanTerm.Weight *= idf;
                }
            }
            finally
            {
                CloseReaders();
            }

            return terms;
        }

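        /// <summary>
        /// Recursively collects the field names referenced by the clauses of a (possibly nested) <c>SpanQuery</c>.
        /// </summary>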
        private void CollectSpanQueryFields(SpanQuery spanQuery, HashSet<String> fieldNames)
        {
            if (spanQuery is FieldMaskingSpanQuery)
            {
                CollectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).MaskedQuery, fieldNames);
            }
            else if (spanQuery is SpanFirstQuery)
            {
                CollectSpanQueryFields(((SpanFirstQuery) spanQuery).Match, fieldNames);
            }
            else if (spanQuery is SpanNearQuery)
            {
                foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
                {
                    CollectSpanQueryFields(clause, fieldNames);
                }
            }
            else if (spanQuery is SpanNotQuery)
            {
                CollectSpanQueryFields(((SpanNotQuery) spanQuery).Include, fieldNames);
            }
            else if (spanQuery is SpanOrQuery)
            {
                foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
                {
                    CollectSpanQueryFields(clause, fieldNames);
                }
            }
            else
            {
                fieldNames.Add(spanQuery.Field);
            }
        }

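        /// <summary>
        /// Returns <c>true</c> if the given (possibly nested) <c>SpanQuery</c> contains a clause that must be
        /// rewritten (e.g. a wrapped multi-term query such as a SpanRegexQuery) before its terms can be extracted.
        /// </summary>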
        private bool MustRewriteQuery(SpanQuery spanQuery)
        {
            if (!expandMultiTermQuery)
            {
                return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
            }
            else if (spanQuery is FieldMaskingSpanQuery)
            {
                return MustRewriteQuery(((FieldMaskingSpanQuery) spanQuery).MaskedQuery);
            }
            else if (spanQuery is SpanFirstQuery)
            {
                return MustRewriteQuery(((SpanFirstQuery) spanQuery).Match);
            }
            else if (spanQuery is SpanNearQuery)
            {
                foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
                {
                    if (MustRewriteQuery(clause))
                    {
                        return true;
                    }
                }
                return false;
            }
            else if (spanQuery is SpanNotQuery)
            {
                SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery;
                return MustRewriteQuery(spanNotQuery.Include) || MustRewriteQuery(spanNotQuery.Exclude);
            }
            else if (spanQuery is SpanOrQuery)
            {
                foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
                {
                    if (MustRewriteQuery(clause))
                    {
                        return true;
                    }
                }
                return false;
            }
            else if (spanQuery is SpanTermQuery)
            {
                return false;
            }
            else
            {
                return true;
            }
        }

        /// <summary>
        /// This class makes sure that if both position sensitive and insensitive
        /// versions of the same term are added, the position insensitive one wins.
        /// </summary>
        /// <typeparam name="K">Type of the map key</typeparam>
        private class PositionCheckingMap<K> : HashMap<K, WeightedSpanTerm>
        {
            public PositionCheckingMap()
            {
            }

            public PositionCheckingMap(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m)
            {
                PutAll(m);
            }

            public void PutAll(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m)
            {
                foreach (var entry in m)
                {
                    Add(entry.Key, entry.Value);
                }
            }

            public override void Add(K key, WeightedSpanTerm value)
            {
                // Capture any existing entry before it is replaced, so a previously
                // position-insensitive term makes the new entry position insensitive as well.
                WeightedSpanTerm prev = this[key];
                base.Add(key, value);

                if (prev == null) return;

                if (!prev.IsPositionSensitive())
                {
                    value.SetPositionSensitive(false);
                }
            }
        }

        public bool ExpandMultiTermQuery
        {
            set { this.expandMultiTermQuery = value; }
            get { return expandMultiTermQuery; }
        }

        public bool IsCachedTokenStream
        {
            get { return cachedTokenStream; }
        }

        public TokenStream TokenStream
        {
            get { return tokenStream; }
        }

        /// <summary>
        /// By default, <see cref="Analysis.TokenStream"/>s that are not of the type
        /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to
        /// ensure an efficient reset - if you are already using a different caching
        /// <see cref="Analysis.TokenStream"/> impl and you don't want it to be wrapped, set this to
        /// false.
        /// </summary>
        public void SetWrapIfNotCachingTokenFilter(bool wrap)
        {
            this.wrapToCaching = wrap;
        }

        /// <summary>
        /// A fake IndexReader class to extract the field from a MultiTermQuery.
        /// </summary>
        protected internal sealed class FakeReader : FilterIndexReader
        {
            private static readonly IndexReader EMPTY_MEMORY_INDEX_READER = new MemoryIndex().CreateSearcher().IndexReader;

            public String Field { get; private set; }

            protected internal FakeReader()
                : base(EMPTY_MEMORY_INDEX_READER)
            {
            }

            public override TermEnum Terms(Term t)
            {
                // only set first fieldname, maybe use a Set?
                if (t != null && Field == null)
                    Field = t.Field;
                return base.Terms(t);
            }
        }
    }
}
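
A minimal usage sketch (not part of the file above): it assumes a hypothetical "contents" field, some sample text, and a StandardAnalyzer, and shows how GetWeightedSpanTerms is typically combined with a TokenStream over the text being highlighted. Field name, text, and query are illustrative only.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Highlight;

class WeightedSpanTermExtractorExample
{
    static void Main()
    {
        // Hypothetical field name and document text, for illustration only.
        const string field = "contents";
        const string text = "Lucene is a full-text search library; Lucene.Net is its C# port.";

        // The query whose terms should be highlighted.
        Query query = new TermQuery(new Term(field, "lucene"));

        // Analyze the text the same way it will be consumed during highlighting.
        Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        TokenStream stream = analyzer.TokenStream(field, new StringReader(text));

        var extractor = new WeightedSpanTermExtractor();
        var terms = extractor.GetWeightedSpanTerms(query, stream, field);

        foreach (var entry in terms)
        {
            Console.WriteLine("{0} (weight {1})", entry.Key, entry.Value.Weight);
        }
    }
}

The returned map is keyed by term text, and each WeightedSpanTerm carries the boost-derived weight the extractor assigned to it.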