Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
TokenSources.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /*
19 * Created on 28-Oct-2004
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Linq;
26 using Lucene.Net.Analysis;
27 using Lucene.Net.Analysis.Tokenattributes;
28 using Lucene.Net.Documents;
29 using Lucene.Net.Index;
30 
31 namespace Lucene.Net.Search.Highlight
32 {
33 
/// <summary>
/// Convenience methods for obtaining a <c>TokenStream</c> for use with the
/// highlighter - can obtain it from a stored term vector or by re-analyzing
/// the stored field content.
/// </summary>
public class TokenSources
{
    /// <summary>
    /// A <c>TokenStream</c> that replays a fixed, pre-built array of
    /// <c>Token</c>s, exposing each token's term text and offsets through the
    /// attribute API.
    /// NOTE(review): the declaration line of this nested class was lost in
    /// extraction; restored as deriving from <c>TokenStream</c> - the body
    /// overrides <c>IncrementToken</c>/<c>Dispose(bool)</c> and calls
    /// <c>AddAttribute</c>/<c>ClearAttributes</c>, all TokenStream members.
    /// </summary>
    public class StoredTokenStream : TokenStream
    {
        protected internal Token[] tokens;       // tokens to replay, in emission order
        protected internal int currentToken = 0; // index of the next token to emit
        protected internal ITermAttribute termAtt;
        protected internal IOffsetAttribute offsetAtt;

        /// <summary>Wraps the given token array; does not copy it.</summary>
        protected internal StoredTokenStream(Token[] tokens)
        {
            this.tokens = tokens;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        /// <summary>
        /// Advances to the next stored token, copying its term text and
        /// start/end offsets into the attributes.
        /// </summary>
        /// <returns>false once all tokens have been emitted</returns>
        public override bool IncrementToken()
        {
            if (currentToken >= tokens.Length)
            {
                return false;
            }
            ClearAttributes();
            Token token = tokens[currentToken++];
            termAtt.SetTermBuffer(token.Term);
            offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
            return true;
        }

        protected override void Dispose(bool disposing)
        {
            // No resources to release; tokens are plain in-memory objects.
        }
    }
72 
85  public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Document doc,
86  Analyzer analyzer)
87  {
88  TokenStream ts = null;
89 
90  var tfv = reader.GetTermFreqVector(docId, field);
91  if (tfv != null)
92  {
93  var termPositionVector = tfv as TermPositionVector;
94  if (termPositionVector != null)
95  {
96  ts = GetTokenStream(termPositionVector);
97  }
98  }
99  //No token info stored so fall back to analyzing raw content
100  return ts ?? GetTokenStream(doc, field, analyzer);
101  }
102 
109  public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
110  {
111  TokenStream ts = null;
112 
113  var tfv = reader.GetTermFreqVector(docId, field);
114  if (tfv != null)
115  {
116  var termPositionVector = tfv as TermPositionVector;
117  if (termPositionVector != null)
118  {
119  ts = GetTokenStream(termPositionVector);
120  }
121  }
122  //No token info stored so fall back to analyzing raw content
123  return ts ?? GetTokenStream(reader, docId, field, analyzer);
124  }
125 
126  public static TokenStream GetTokenStream(TermPositionVector tpv)
127  {
128  //assumes the worst and makes no assumptions about token position sequences.
129  return GetTokenStream(tpv, false);
130  }
131 
157  public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
158  {
159  //code to reconstruct the original sequence of Tokens
160  String[] terms = tpv.GetTerms();
161  int[] freq = tpv.GetTermFrequencies();
162 
163  int totalTokens = freq.Sum();
164 
165  var tokensInOriginalOrder = new Token[totalTokens];
166  List<Token> unsortedTokens = null;
167  for (int t = 0; t < freq.Length; t++)
168  {
169  TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
170  if (offsets == null)
171  {
172  return null;
173  }
174 
175  int[] pos = null;
176  if (tokenPositionsGuaranteedContiguous)
177  {
178  //try get the token position info to speed up assembly of tokens into sorted sequence
179  pos = tpv.GetTermPositions(t);
180  }
181  if (pos == null)
182  {
183  //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
184  if (unsortedTokens == null)
185  {
186  unsortedTokens = new List<Token>();
187  }
188 
189  foreach (TermVectorOffsetInfo t1 in offsets)
190  {
191  var token = new Token(t1.StartOffset, t1.EndOffset);
192  token.SetTermBuffer(terms[t]);
193  unsortedTokens.Add(token);
194  }
195  }
196  else
197  {
198  //We have positions stored and a guarantee that the token position information is contiguous
199 
200  // This may be fast BUT wont work if Tokenizers used which create >1 token in same position or
201  // creates jumps in position numbers - this code would fail under those circumstances
202 
203  //tokens stored with positions - can use this to index straight into sorted array
204  for (int tp = 0; tp < pos.Length; tp++)
205  {
206  var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
207  tokensInOriginalOrder[pos[tp]] = token;
208  }
209  }
210  }
211  //If the field has been stored without position data we must perform a sort
212  if (unsortedTokens != null)
213  {
214  tokensInOriginalOrder = unsortedTokens.ToArray();
215  Array.Sort(tokensInOriginalOrder, (t1, t2) =>
216  {
217  if (t1.StartOffset > t2.EndOffset)
218  return 1;
219  if (t1.StartOffset < t2.StartOffset)
220  return -1;
221  return 0;
222  });
223  }
224  return new StoredTokenStream(tokensInOriginalOrder);
225  }
226 
227  public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
228  {
229  var tfv = reader.GetTermFreqVector(docId, field);
230  if (tfv == null)
231  {
232  throw new ArgumentException(field + " in doc #" + docId
233  + "does not have any term position data stored");
234  }
235  if (tfv is TermPositionVector)
236  {
237  var tpv = (TermPositionVector) reader.GetTermFreqVector(docId, field);
238  return GetTokenStream(tpv);
239  }
240  throw new ArgumentException(field + " in doc #" + docId
241  + "does not have any term position data stored");
242  }
243 
244  //convenience method
245  public static TokenStream GetTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
246  {
247  Document doc = reader.Document(docId);
248  return GetTokenStream(doc, field, analyzer);
249  }
250 
251  public static TokenStream GetTokenStream(Document doc, String field, Analyzer analyzer)
252  {
253  String contents = doc.Get(field);
254  if (contents == null)
255  {
256  throw new ArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
257  }
258  return GetTokenStream(field, contents, analyzer);
259  }
260 
261  //convenience method
262  public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
263  {
264  return analyzer.TokenStream(field, new StringReader(contents));
265  }
266  }
267 }