Lucene.Net 3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
Highlighter.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Class used to mark up highlighted terms found in the best sections of a
    /// text, using configurable <see cref="IFragmenter"/>, <see cref="IScorer"/>, <see cref="IFormatter"/>,
    /// <see cref="IEncoder"/> and tokenizers.
    /// </summary>
    public class Highlighter
    {
        public static readonly int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;

        private int _maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
        private IFormatter _formatter;
        private IEncoder _encoder;
        private IFragmenter _textFragmenter = new SimpleFragmenter();
        private IScorer _fragmentScorer = null;

        public Highlighter(IScorer fragmentScorer)
            : this(new SimpleHTMLFormatter(), fragmentScorer)
        {
        }


        public Highlighter(IFormatter formatter, IScorer fragmentScorer)
            : this(formatter, new DefaultEncoder(), fragmentScorer)
        {
        }


        public Highlighter(IFormatter formatter, IEncoder encoder, IScorer fragmentScorer)
        {
            _formatter = formatter;
            _encoder = encoder;
            _fragmentScorer = fragmentScorer;
        }

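        // Usage sketch (illustrative only, not part of this class): a Highlighter is
        // typically constructed with a QueryScorer built from the query whose terms
        // should be highlighted. QueryScorer and SimpleHTMLFormatter are companion types
        // in this Lucene.Net.Search.Highlight namespace; the "query" object is assumed
        // to be supplied by the caller.
        //
        //     IScorer scorer = new QueryScorer(query);                        // score fragments by query terms
        //     IFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");  // wrap hits in <b> tags
        //     var highlighter = new Highlighter(formatter, scorer);
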
        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant section.
        /// This is a convenience method that calls <see cref="GetBestFragment(TokenStream, string)"/>
        /// </summary>
        /// <param name="analyzer">the analyzer that will be used to split <c>text</c> into chunks</param>
        /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy</param>
        /// <param name="text">text to highlight terms in</param>
        /// <returns>highlighted text fragment or null if no terms found</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String GetBestFragment(Analyzer analyzer, String fieldName, String text)
        {
            TokenStream tokenStream = analyzer.TokenStream(fieldName, new StringReader(text));
            return GetBestFragment(tokenStream, text);
        }
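
        // Example (a minimal sketch, assuming a StandardAnalyzer from
        // Lucene.Net.Analysis.Standard and a "highlighter" built as shown above):
        //
        //     var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        //     string fragment = highlighter.GetBestFragment(analyzer, "contents", documentText);
        //     // fragment is null when none of the query terms occur in documentText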

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant section.
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragment with the highest score
        /// is returned
        /// </summary>
        /// <param name="tokenStream">
        /// a stream of tokens identified in the text parameter, including offset information.
        /// This is typically produced by an analyzer re-parsing a document's
        /// text. Some work may be done on retrieving TokenStreams more efficiently
        /// by adding support for storing original text position data in the Lucene
        /// index but this support is not currently available (as of Lucene 1.4 rc2).
        /// </param>
        /// <param name="text">text to highlight terms in</param>
        /// <returns>highlighted text fragment or null if no terms found</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String GetBestFragment(TokenStream tokenStream, String text)
        {
            String[] results = GetBestFragments(tokenStream, text, 1);
            if (results.Length > 0)
            {
                return results[0];
            }
            return null;
        }
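
        // Sketch of one way to obtain a token stream for this overload (an assumption,
        // not part of this class: the companion TokenSources helper in this package can
        // rebuild a stream from stored term vectors or by re-analyzing the stored text):
        //
        //     TokenStream stream = TokenSources.GetAnyTokenStream(reader, docId, "contents", analyzer);
        //     string fragment = highlighter.GetBestFragment(stream, documentText);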

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant sections.
        /// This is a convenience method that calls <see cref="GetBestFragments(TokenStream, string, int)"/>
        /// </summary>
        /// <param name="analyzer">the analyzer that will be used to split <c>text</c> into chunks</param>
        /// <param name="fieldName">the name of the field being highlighted (used by analyzer)</param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="maxNumFragments">the maximum number of fragments.</param>
        /// <returns>highlighted text fragments (between 0 and maxNumFragments number of fragments)</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String[] GetBestFragments(
            Analyzer analyzer,
            String fieldName,
            String text,
            int maxNumFragments)
        {
            TokenStream tokenStream = analyzer.TokenStream(fieldName, new StringReader(text));
            return GetBestFragments(tokenStream, text, maxNumFragments);
        }

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant sections.
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragments with the highest scores
        /// are returned as an array of strings in order of score (contiguous fragments are merged into
        /// one in their original order to improve readability)
        /// </summary>
        /// <param name="tokenStream">a stream of tokens identified in the text parameter, including offset information</param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="maxNumFragments">the maximum number of fragments.</param>
        /// <returns>highlighted text fragments (between 0 and maxNumFragments number of fragments)</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String[] GetBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
        {
            maxNumFragments = Math.Max(1, maxNumFragments); //sanity check

            TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);

            //Get text
            var fragTexts = new List<String>();
            for (int i = 0; i < frag.Length; i++)
            {
                if ((frag[i] != null) && (frag[i].Score > 0))
                {
                    fragTexts.Add(frag[i].ToString());
                }
            }
            return fragTexts.ToArray();
        }
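
        // Example (a minimal sketch): ask for up to three scored fragments and handle
        // the resulting array yourself, or use the separator overload further below.
        //
        //     string[] fragments = highlighter.GetBestFragments(analyzer, "contents", documentText, 3);
        //     // fragments.Length is between 0 and 3; each entry carries <b>...</b> markup
        //     // when the default SimpleHTMLFormatter is in use.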

        /// <summary>
        /// Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in TextFragment objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        public TextFragment[] GetBestTextFragments(
            TokenStream tokenStream,
            String text,
            bool mergeContiguousFragments,
            int maxNumFragments)
        {
            var docFrags = new List<TextFragment>();
            var newText = new StringBuilder();

            var termAtt = tokenStream.AddAttribute<ITermAttribute>();
            var offsetAtt = tokenStream.AddAttribute<IOffsetAttribute>();
            tokenStream.AddAttribute<IPositionIncrementAttribute>();
            tokenStream.Reset();

            var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
            var newStream = _fragmentScorer.Init(tokenStream);
            if (newStream != null)
            {
                tokenStream = newStream;
            }
            _fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            var fragQueue = new FragmentQueue(maxNumFragments);

            try
            {

                String tokenText;
                int startOffset;
                int endOffset;
                int lastEndOffset = 0;
                _textFragmenter.Start(text, tokenStream);

                var tokenGroup = new TokenGroup(tokenStream);

                for (bool next = tokenStream.IncrementToken();
                     next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
                     next = tokenStream.IncrementToken())
                {
                    if ((offsetAtt.EndOffset > text.Length)
                        ||
                        (offsetAtt.StartOffset > text.Length)
                        )
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.Term
                            + " exceeds length of provided text sized " + text.Length);
                    }
                    if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.MatchStartOffset;
                        endOffset = tokenGroup.MatchEndOffset;
                        tokenText = text.Substring(startOffset, endOffset - startOffset);
                        String markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                        newText.Append(markedUpText);
                        lastEndOffset = Math.Max(endOffset, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (_textFragmenter.IsNewFragment())
                        {
                            currentFrag.Score = _fragmentScorer.FragmentScore;
                            //record stats for a new fragment
                            currentFrag.TextEndPos = newText.Length;
                            currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
                            _fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(_fragmentScorer.GetTokenScore());

//                    if(lastEndOffset>maxDocBytesToAnalyze)
//                    {
//                        break;
//                    }
                }
                currentFrag.Score = _fragmentScorer.FragmentScore;

                if (tokenGroup.NumTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.MatchStartOffset;
                    endOffset = tokenGroup.MatchEndOffset;
                    tokenText = text.Substring(startOffset, endOffset - startOffset);
                    var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                        newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                    newText.Append(markedUpText);
                    lastEndOffset = Math.Max(lastEndOffset, endOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing
                if (
                    // if there is text beyond the last token considered..
                    (lastEndOffset < text.Length)
                    &&
                    // and that text is not too large...
                    (text.Length <= _maxDocCharsToAnalyze)
                    )
                {
                    //append it to the last fragment
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.TextEndPos = newText.Length;

                //sort the most relevant sections of the text
                foreach (var f in docFrags)
                {
                    currentFrag = f;

                    //If you are running with a version of Lucene before 11th Sept 03
                    // you do not have PriorityQueue.insert() - so uncomment the code below
                    /*
                    if (currentFrag.getScore() >= minScore)
                    {
                        fragQueue.put(currentFrag);
                        if (fragQueue.size() > maxNumFragments)
                        { // if hit queue overfull
                            fragQueue.pop(); // remove lowest in hit queue
                            minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                        }


                    }
                    */
                    //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                    //fix to PriorityQueue. The correct method to use here is the new "insert" method
                    // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                    fragQueue.InsertWithOverflow(currentFrag);
                }

                //return the most relevant fragments
                var frag = new TextFragment[fragQueue.Size()];
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    frag = frag.Where(t => (t != null) && (t.Score > 0)).ToArray();
                }

                return frag;

            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.Close();
                    }
                    catch (Exception)
                    {
                    }
                }
            }
        }
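
        // Sketch of using this low level API directly to inspect fragment scores
        // (illustrative only; "highlighter", "stream" and "documentText" are assumed
        // to come from the caller as in the earlier examples):
        //
        //     TextFragment[] frags = highlighter.GetBestTextFragments(stream, documentText, true, 5);
        //     foreach (var f in frags)
        //     {
        //         if (f != null && f.Score > 0)
        //             Console.WriteLine("{0:F2}: {1}", f.Score, f);
        //     }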

        /// <summary>
        /// Improves readability of a score-sorted list of TextFragments by merging any fragments
        /// that were contiguous in the original text into one larger fragment with the correct order.
        /// This will leave a "null" in the array entry for the lesser scored fragment.
        /// </summary>
        /// <param name="frag">An array of document fragments in descending order of score</param>
        private void MergeContiguousFragments(TextFragment[] frag)
        {
            bool mergingStillBeingDone;
            if (frag.Length > 1)
                do
                {
                    mergingStillBeingDone = false; //initialise loop control flag
                    //for each fragment, scan other frags looking for contiguous blocks
                    for (int i = 0; i < frag.Length; i++)
                    {
                        if (frag[i] == null)
                        {
                            continue;
                        }
                        //merge any contiguous blocks
                        for (int x = 0; x < frag.Length; x++)
                        {
                            if (frag[x] == null)
                            {
                                continue;
                            }
                            if (frag[i] == null)
                            {
                                break;
                            }
                            TextFragment frag1 = null;
                            TextFragment frag2 = null;
                            int frag1Num = 0;
                            int frag2Num = 0;
                            int bestScoringFragNum;
                            int worstScoringFragNum;
                            //if blocks are contiguous....
                            if (frag[i].Follows(frag[x]))
                            {
                                frag1 = frag[x];
                                frag1Num = x;
                                frag2 = frag[i];
                                frag2Num = i;
                            }
                            else if (frag[x].Follows(frag[i]))
                            {
                                frag1 = frag[i];
                                frag1Num = i;
                                frag2 = frag[x];
                                frag2Num = x;
                            }
                            //merging required..
                            if (frag1 != null)
                            {
                                if (frag1.Score > frag2.Score)
                                {
                                    bestScoringFragNum = frag1Num;
                                    worstScoringFragNum = frag2Num;
                                }
                                else
                                {
                                    bestScoringFragNum = frag2Num;
                                    worstScoringFragNum = frag1Num;
                                }
                                frag1.Merge(frag2);
                                frag[worstScoringFragNum] = null;
                                mergingStillBeingDone = true;
                                frag[bestScoringFragNum] = frag1;
                            }
                        }
                    }
                } while (mergingStillBeingDone);
        }

        /// <summary>
        /// Highlights terms in the text, extracting the most relevant sections
        /// and concatenating the chosen fragments with a separator (typically "...").
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragments with the highest scores
        /// are returned in order as "separator" delimited strings.
        /// </summary>
        /// <param name="tokenStream">a stream of tokens identified in the text parameter, including offset information</param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="maxNumFragments">the maximum number of fragments.</param>
        /// <param name="separator">the separator used to intersperse the document fragments (typically "...")</param>
        /// <returns>highlighted text</returns>
        public String GetBestFragments(
            TokenStream tokenStream,
            String text,
            int maxNumFragments,
            String separator)
        {
            string[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
            StringBuilder result = new StringBuilder();
            for (int i = 0; i < sections.Length; i++)
            {
                if (i > 0)
                {
                    result.Append(separator);
                }
                result.Append(sections[i]);
            }
            return result.ToString();
        }
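
        // Example (a minimal sketch): produce a single snippet string with the best
        // fragments joined by an ellipsis.
        //
        //     string snippet = highlighter.GetBestFragments(stream, documentText, 3, "...");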

        public int MaxDocCharsToAnalyze
        {
            get { return _maxDocCharsToAnalyze; }
            set { this._maxDocCharsToAnalyze = value; }
        }


        public IFragmenter TextFragmenter
        {
            get { return _textFragmenter; }
            set { _textFragmenter = value; }
        }

        public IScorer FragmentScorer
        {
            get { return _fragmentScorer; }
            set { _fragmentScorer = value; }
        }

        public IEncoder Encoder
        {
            get { return _encoder; }
            set { this._encoder = value; }
        }
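
        // Configuration sketch (illustrative, with assumed values): the fragmenter
        // controls fragment size and the encoder controls escaping of non-highlighted
        // text. SimpleFragmenter and SimpleHTMLEncoder are companion types in this package.
        //
        //     highlighter.TextFragmenter = new SimpleFragmenter(120);   // roughly 120 chars per fragment
        //     highlighter.Encoder = new SimpleHTMLEncoder();            // HTML-escape the surrounding text
        //     highlighter.MaxDocCharsToAnalyze = 100 * 1024;            // analyze up to 100 KB of text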
    }

    internal class FragmentQueue : PriorityQueue<TextFragment>
    {
        public FragmentQueue(int size)
        {
            Initialize(size);
        }

        public override bool LessThan(TextFragment fragA, TextFragment fragB)
        {
            if (fragA.Score == fragB.Score)
                return fragA.FragNum > fragB.FragNum;
            else
                return fragA.Score < fragB.Score;
        }
    }
}