19 using System.Collections.Generic;
23 using Lucene.Net.Analysis;
24 using Lucene.Net.Analysis.Tokenattributes;
25 using Lucene.Net.Util;
27 namespace Lucene.Net.Search.Highlight
// Default limit on how much of a document is analyzed: 50*1024 characters.
36 public static readonly
int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
// Per-instance limit, initialised from the default; token start offsets are
// compared against it while fragments are being built.
38 private int _maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
// Scorer that supplies per-token and per-fragment scores; null until assigned.
42 private IScorer _fragmentScorer = null;
// Keep the supplied formatter and fragment scorer on the instance.
// NOTE(review): the enclosing member's signature is not visible here —
// presumably a constructor; confirm.
58 _formatter = formatter;
60 _fragmentScorer = fragmentScorer;
// Convenience overload: takes an analyzer, a field name and the text, and
// forwards to the TokenStream-based GetBestFragment overload.
// NOTE(review): how tokenStream is obtained is not visible here — presumably
// from analyzer/fieldName/text; confirm.
72 public String GetBestFragment(
Analyzer analyzer, String fieldName, String text)
75 return GetBestFragment(tokenStream, text);
// Asks GetBestFragments for at most one fragment of the given text and then
// checks whether anything came back.
94 public String GetBestFragment(
TokenStream tokenStream, String text)
96 String[] results = GetBestFragments(tokenStream, text, 1);
// NOTE(review): the branch bodies are not visible here — presumably the first
// element is returned when the array is non-empty; confirm.
97 if (results.Length > 0)
// Overload that delegates to GetBestFragments(tokenStream, text, maxNumFragments).
// NOTE(review): this overload's parameter list and the creation of tokenStream
// are not visible here; confirm against the full declaration.
114 public String[] GetBestFragments(
121 return GetBestFragments(tokenStream, text, maxNumFragments);
// Returns the string form of the best fragments of the text.
//   tokenStream      - token source handed on to GetBestTextFragments
//   text             - the text the fragments are taken from
//   maxNumFragments  - requested number of fragments; values below 1 are raised to 1
// Returns an array containing ToString() of every non-null fragment whose
// Score is greater than zero (so the array may be empty).
136 public String[] GetBestFragments(
TokenStream tokenStream, String text,
int maxNumFragments)
// Always ask for at least one fragment.
138 maxNumFragments = Math.Max(1, maxNumFragments);
// The literal 'true' is passed through to GetBestTextFragments
// (presumably its mergeContiguousFragments flag — confirm against its signature).
140 TextFragment[] frag = GetBestTextFragments(tokenStream, text,
true, maxNumFragments);
143 var fragTexts =
new List<String>();
144 for (
int i = 0; i < frag.Length; i++)
// Skip empty slots and fragments that did not score above zero.
146 if ((frag[i] != null) && (frag[i].
Score > 0))
148 fragTexts.Add(frag[i].ToString());
151 return fragTexts.ToArray();
// Core fragment builder: walks the token stream, writes encoded and highlighted
// text into one shared buffer, splits that buffer into scored TextFragments and
// returns the best ones via a bounded FragmentQueue.
// NOTE(review): most of the signature and the loop scaffolding are not visible
// here; the comments below describe only the statements that are.
162 bool mergeContiguousFragments,
// docFrags records every fragment created; newText is the shared output buffer.
165 var docFrags =
new List<TextFragment>();
166 var newText =
new StringBuilder();
// The first fragment starts at the current buffer length and is numbered by
// the count of fragments created so far.
173 var currentFrag =
new TextFragment(newText, newText.Length, docFrags.Count);
// The scorer's Init may hand back a replacement stream; use it when non-null.
174 var newStream = _fragmentScorer.Init(tokenStream);
175 if (newStream != null)
177 tokenStream = newStream;
179 _fragmentScorer.StartFragment(currentFrag);
180 docFrags.Add(currentFrag);
// Queue sized by maxNumFragments; fragments are added with InsertWithOverflow below.
182 var fragQueue =
new FragmentQueue(maxNumFragments);
// Position in 'text' up to which content has already been copied into newText.
190 int lastEndOffset = 0;
191 _textFragmenter.Start(text, tokenStream);
// Loop condition: keep going while there is a token and its start offset is
// still below the analysis limit.
196 next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
// Token offsets beyond the text length are reported with the message below
// (the statement that raises it is not visible here).
199 if ((offsetAtt.EndOffset > text.Length)
201 (offsetAtt.StartOffset > text.Length)
205 +
" exceeds length of provided text sized " + text.Length);
// When the pending token group holds tokens and IsDistinct() is true, write
// the group out before processing the current token.
207 if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
211 startOffset = tokenGroup.MatchStartOffset;
212 endOffset = tokenGroup.MatchEndOffset;
213 tokenText = text.Substring(startOffset, endOffset - startOffset);
// Encode the raw token text first, then let the formatter wrap it.
214 String markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
// Copy (encoded) any text lying between the last written position and this group.
216 if (startOffset > lastEndOffset)
217 newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
218 newText.Append(markedUpText);
// Math.Max keeps lastEndOffset from moving backwards.
219 lastEndOffset = Math.Max(endOffset, lastEndOffset);
// Fragment boundary: finalise the current fragment's score and end position,
// then start, register and track a new fragment at the current buffer length.
223 if (_textFragmenter.IsNewFragment())
225 currentFrag.Score = _fragmentScorer.FragmentScore;
227 currentFrag.TextEndPos = newText.Length;
228 currentFrag =
new TextFragment(newText, newText.Length, docFrags.Count);
229 _fragmentScorer.StartFragment(currentFrag);
230 docFrags.Add(currentFrag);
// Add the current token to the group together with its score from the scorer.
234 tokenGroup.AddToken(_fragmentScorer.GetTokenScore());
// Record the score of the last fragment.
241 currentFrag.Score = _fragmentScorer.FragmentScore;
// Write out whatever is still pending in the token group (same steps as above).
243 if (tokenGroup.NumTokens > 0)
246 startOffset = tokenGroup.MatchStartOffset;
247 endOffset = tokenGroup.MatchEndOffset;
248 tokenText = text.Substring(startOffset, endOffset - startOffset);
249 var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
251 if (startOffset > lastEndOffset)
252 newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
253 newText.Append(markedUpText);
254 lastEndOffset = Math.Max(lastEndOffset, endOffset);
// Tail handling: the remaining text is appended only when some is left AND the
// whole text fits within the analysis limit.
260 (lastEndOffset < text.Length)
263 (text.Length <= _maxDocCharsToAnalyze)
267 newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
// Close the final fragment at the end of the buffer.
270 currentFrag.TextEndPos = newText.Length;
// Walk every fragment created and add it to the queue with InsertWithOverflow.
// NOTE(review): the loop variable is f but currentFrag is inserted; any
// assignment between them is not visible here — confirm.
273 foreach (var f
in docFrags)
295 fragQueue.InsertWithOverflow(currentFrag);
// Fill the result array from the last index down to 0 with successive Pop() results
// (presumably Pop yields the lowest-ranked entry first, giving best-first order — confirm).
300 for (
int i = frag.Length - 1; i >= 0; i--)
302 frag[i] = fragQueue.Pop();
// Optional merge step; afterwards drop null slots and fragments with Score <= 0.
306 if (mergeContiguousFragments)
308 MergeContiguousFragments(frag);
309 frag = frag.Where(t => (t != null) && (t.Score > 0)).ToArray();
// NOTE(review): the action taken for a non-null stream is not visible here —
// presumably cleanup of the token stream; confirm.
317 if (tokenStream != null)
// Merges fragments in the array where one Follows() the other, nulling the
// slot freed by each merge. Works in place on 'frag'.
// NOTE(review): several statements (null checks, frag1/frag2 and
// frag1Num/frag2Num assignments, the merge call itself) are not visible here.
336 private void MergeContiguousFragments(
TextFragment[] frag)
// Set when a pass merges something; the do/while repeats until a pass does not.
338 bool mergingStillBeingDone;
342 mergingStillBeingDone =
false;
// Compare every slot i against every slot x.
344 for (
int i = 0; i < frag.Length; i++)
351 for (
int x = 0; x < frag.Length; x++)
361 TextFragment frag1 = null;
362 TextFragment frag2 = null;
365 int bestScoringFragNum;
366 int worstScoringFragNum;
// Adjacency is tested in both directions (i after x, or x after i).
368 if (frag[i].Follows(frag[x]))
375 else if (frag[x].Follows(frag[i]))
// The slot of the higher-scoring fragment is kept (frag2's slot wins ties,
// since the comparison is a strict '>').
385 if (frag1.Score > frag2.Score)
387 bestScoringFragNum = frag1Num;
388 worstScoringFragNum = frag2Num;
392 bestScoringFragNum = frag2Num;
393 worstScoringFragNum = frag1Num;
// Clear the losing slot, flag another pass, and store frag1 in the kept slot.
396 frag[worstScoringFragNum] = null;
397 mergingStillBeingDone =
true;
398 frag[bestScoringFragNum] = frag1;
402 }
while (mergingStillBeingDone);
// Returns the best fragments concatenated into a single string, using
// 'separator' between sections.
// NOTE(review): the parameter list and the condition guarding the separator
// append are not visible here — presumably the separator is skipped before the
// first section; confirm.
417 public String GetBestFragments(
423 string[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
424 StringBuilder result =
new StringBuilder();
425 for (
int i = 0; i < sections.Length; i++)
429 result.Append(separator);
431 result.Append(sections[i]);
433 return result.ToString();
// Gets or sets the character limit stored in _maxDocCharsToAnalyze; the setter
// stores the value as given.
436 public int MaxDocCharsToAnalyze
438 get {
return _maxDocCharsToAnalyze; }
439 set { this._maxDocCharsToAnalyze = value; }
// Accessors for the fragmenter held in _textFragmenter (property header not visible here).
445 get {
return _textFragmenter; }
446 set { _textFragmenter = value; }
// Accessors for the scorer held in _fragmentScorer (property header not visible here).
451 get {
return _fragmentScorer; }
452 set { _fragmentScorer = value; }
// Accessors for the encoder held in _encoder (property header not visible here).
457 get {
return _encoder; }
458 set { this._encoder = value; }
// Constructs a fragment queue from an integer size.
// NOTE(review): the constructor body is not visible here — presumably 'size'
// sets the queue capacity; confirm.
464 public FragmentQueue(
int size)
// Ordering used by the queue: returns true when fragA ranks below fragB.
// A lower Score ranks lower; on an exactly equal Score the fragment with the
// larger FragNum ranks lower.
469 public override bool LessThan(TextFragment fragA, TextFragment fragB)
// Tie-break on fragment number when scores are exactly equal.
471 if (fragA.Score == fragB.Score)
472 return fragA.FragNum > fragB.FragNum;
474 return fragA.Score < fragB.Score;