Lucene.Net 3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
TokenSources.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Created on 28-Oct-2004
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace Lucene.Net.Search.Highlight
{
    /// <summary> Hides implementation issues associated with obtaining a TokenStream for use with
    /// the Highlighter - one can be obtained from TermFreqVectors with offsets and (optionally)
    /// positions, or from an Analyzer re-parsing the stored content.
    /// </summary>
    public class TokenSources
    {
        public class StoredTokenStream : TokenStream
        {
            protected internal Token[] tokens;
            protected internal int currentToken = 0;
            protected internal ITermAttribute termAtt;
            protected internal IOffsetAttribute offsetAtt;

            protected internal StoredTokenStream(Token[] tokens)
            {
                this.tokens = tokens;
                termAtt = AddAttribute<ITermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            public override bool IncrementToken()
            {
                if (currentToken >= tokens.Length)
                {
                    return false;
                }
                ClearAttributes();
                Token token = tokens[currentToken++];
                termAtt.SetTermBuffer(token.Term);
                offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
                return true;
            }

            protected override void Dispose(bool disposing)
            {
                // do nothing
            }
        }
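
        // A minimal usage sketch (not from the original source; the method name and
        // sample token are illustrative only): StoredTokenStream simply replays a
        // pre-built Token array, so a caller already holding tokens can wrap them and
        // consume the result like any other TokenStream.
        private static TokenStream ExampleReplayStoredTokens()
        {
            var token = new Token("hello", 0, 5); // term "hello" covering offsets 0..5
            return new StoredTokenStream(new[] { token });
        }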

        /// <summary>
        /// A convenience method that first tries to get a TermPositionVector for the specified docId, then falls back to
        /// using the passed-in <see cref="Document"/> to retrieve the TokenStream. This is useful when
        /// you already have the document but would prefer to use the vector first.
        /// </summary>
        /// <param name="reader">The <see cref="IndexReader"/> to use to try and get the vector from</param>
        /// <param name="docId">The docId to retrieve.</param>
        /// <param name="field">The field to retrieve on the document</param>
        /// <param name="doc">The document to fall back on</param>
        /// <param name="analyzer">The analyzer to use for creating the TokenStream if the vector doesn't exist</param>
        /// <returns>The <see cref="TokenStream"/> for the <see cref="IFieldable"/> on the <see cref="Document"/></returns>
        /// <exception cref="IOException">if there was an error loading</exception>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Document doc,
                                                    Analyzer analyzer)
        {
            TokenStream ts = null;

            var tfv = reader.GetTermFreqVector(docId, field);
            if (tfv != null)
            {
                var termPositionVector = tfv as TermPositionVector;
                if (termPositionVector != null)
                {
                    ts = GetTokenStream(termPositionVector);
                }
            }
            // No token info stored, so fall back to analyzing raw content
            return ts ?? GetTokenStream(doc, field, analyzer);
        }
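
        // A minimal usage sketch (assumptions: the field name "contents", and that the
        // supplied analyzer matches the one used at index time). It shows the intended
        // call pattern: hand over the already-loaded Document so the method can fall
        // back to re-analysis when no term vector is stored.
        private static TokenStream ExampleGetAnyTokenStream(IndexReader reader, int docId, Analyzer analyzer)
        {
            Document doc = reader.Document(docId); // the stored document, loaded once
            return GetAnyTokenStream(reader, docId, "contents", doc, analyzer);
        }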

        /// <summary>
        /// A convenience method that tries a number of approaches to getting a token stream.
        /// The cost of finding there are no termVectors in the index is minimal (1000 invocations still
        /// register 0 ms), so this "lazy" (flexible?) approach to coding is probably acceptable.
        /// </summary>
        /// <returns>null if field not stored correctly</returns>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            TokenStream ts = null;

            var tfv = reader.GetTermFreqVector(docId, field);
            if (tfv != null)
            {
                var termPositionVector = tfv as TermPositionVector;
                if (termPositionVector != null)
                {
                    ts = GetTokenStream(termPositionVector);
                }
            }
            // No token info stored, so fall back to analyzing raw content
            return ts ?? GetTokenStream(reader, docId, field, analyzer);
        }

        public static TokenStream GetTokenStream(TermPositionVector tpv)
        {
            // Assumes the worst and makes no assumptions about token position sequences.
            return GetTokenStream(tpv, false);
        }

        /// <summary>
        /// Low level API.
        /// Returns a token stream, or null if no offset info is available in the index.
        /// This can be used to feed the highlighter with a pre-parsed token stream.
        ///
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// - with TermVector offset only data stored - 420 milliseconds
        /// - with TermVector offset AND position data stored - 271 milliseconds
        /// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
        /// positions - no overlaps or gaps)
        /// The cost of not using TermPositionVector to store
        /// pre-parsed content and using an analyzer to re-parse the original content:
        /// - reanalyzing the original content - 980 milliseconds
        ///
        /// The re-analyze timings will typically vary depending on -
        /// 1) The complexity of the analyzer code (timings above were using a
        /// stemmer/lowercaser/stopword combo)
        /// 2) The number of other fields (Lucene reads ALL fields off the disk
        /// when accessing just one document field - this can cost dearly!)
        /// 3) Use of compression on field storage - could be faster due to compression (less disk IO)
        /// or slower (more CPU burn) depending on the content.
        /// </summary>
        /// <param name="tpv">The TermPositionVector holding the stored offset (and optionally position) data</param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            // Code to reconstruct the original sequence of Tokens
            String[] terms = tpv.GetTerms();
            int[] freq = tpv.GetTermFrequencies();

            int totalTokens = freq.Sum();

            var tokensInOriginalOrder = new Token[totalTokens];
            List<Token> unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }

                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    // Try to get the token position info to speed up assembly of tokens into a sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    // Tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new List<Token>();
                    }

                    foreach (TermVectorOffsetInfo t1 in offsets)
                    {
                        var token = new Token(t1.StartOffset, t1.EndOffset);
                        token.SetTermBuffer(terms[t]);
                        unsortedTokens.Add(token);
                    }
                }
                else
                {
                    // We have positions stored and a guarantee that the token position information is contiguous.

                    // This may be fast BUT won't work if tokenizers are used which create >1 token in the same
                    // position or create jumps in position numbers - this code would fail under those circumstances.

                    // Tokens stored with positions - can use this to index straight into the sorted array
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                        tokensInOriginalOrder[pos[tp]] = token;
                    }
                }
            }
            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                // Sort tokens into ascending start-offset order
                Array.Sort(tokensInOriginalOrder, (t1, t2) =>
                {
                    if (t1.StartOffset > t2.StartOffset)
                        return 1;
                    if (t1.StartOffset < t2.StartOffset)
                        return -1;
                    return 0;
                });
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }
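
        // A minimal consumption sketch (not part of the original file; the method name is
        // illustrative): iterate the reconstructed stream and read back each term with its
        // offsets. Passing false for tokenPositionsGuaranteedContiguous is the safe default
        // recommended in the remarks above.
        private static void ExampleDumpReconstructedTokens(TermPositionVector tpv)
        {
            TokenStream ts = GetTokenStream(tpv, false);
            if (ts == null)
            {
                return; // no offset info stored for this field
            }
            var termAtt = ts.AddAttribute<ITermAttribute>();
            var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
            while (ts.IncrementToken())
            {
                Console.WriteLine("{0} [{1}-{2}]", termAtt.Term, offsetAtt.StartOffset, offsetAtt.EndOffset);
            }
        }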

        public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
        {
            var tfv = reader.GetTermFreqVector(docId, field);
            if (tfv == null)
            {
                throw new ArgumentException(field + " in doc #" + docId
                                            + " does not have any term position data stored");
            }
            var tpv = tfv as TermPositionVector;
            if (tpv != null)
            {
                return GetTokenStream(tpv);
            }
            throw new ArgumentException(field + " in doc #" + docId
                                        + " does not have any term position data stored");
        }

        // Convenience method
        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            Document doc = reader.Document(docId);
            return GetTokenStream(doc, field, analyzer);
        }

        public static TokenStream GetTokenStream(Document doc, String field, Analyzer analyzer)
        {
            String contents = doc.Get(field);
            if (contents == null)
            {
                throw new ArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
            }
            return GetTokenStream(field, contents, analyzer);
        }

        // Convenience method
        public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
        {
            return analyzer.TokenStream(field, new StringReader(contents));
        }
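
        // A minimal usage sketch (the field name "f" and the sample text are illustrative
        // assumptions): the analyzer-based overload works on any raw string, which is the
        // path the highlighter takes when no term vector data exists for the field.
        private static TokenStream ExampleAnalyzeRawText(Analyzer analyzer)
        {
            return GetTokenStream("f", "The quick brown fox jumps over the lazy dog", analyzer);
        }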
    }
}