Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TermVectorsReader.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
21 using Directory = Lucene.Net.Store.Directory;
22 using IndexInput = Lucene.Net.Store.IndexInput;
23 
24 namespace Lucene.Net.Index
25 {
26  class TermVectorsReader : System.ICloneable, IDisposable
27  {
28 
// Term vector file format versions. Any newly introduced format must be
// numbered higher than every format that already exists.
internal const int FORMAT_VERSION = 2;

// Format that changed the layout to speed up bulk merging of term vectors.
internal const int FORMAT_VERSION2 = 3;

// Format in which strings are stored as UTF8 with a length counted in
// bytes instead of chars.
internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

// The newest format this reader accepts. Update this whenever a new
// format is added.
internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

// Number of bytes occupied by the format version at the start of each file.
internal const int FORMAT_SIZE = 4;

// Bits of the per-field flag byte read from the tvf stream.
internal const byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
internal const byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

// Field metadata used to translate between field names and field numbers.
private FieldInfos fieldInfos;

// Input streams over the vectors index (tvx), vectors documents (tvd) and
// vectors fields (tvf) files. All three stay null when the segment has no
// term vector files.
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;

// Number of documents served by this reader, and the total number of
// document entries found in the tvx file.
private int size;
private int numTotalDocs;

// The docID offset at which this reader's documents begin inside the
// index file. It is 0 when the reader has its own private file.
private int docStoreOffset;

// Format version read from the tvx header (0 when no files exist).
private int format;

// Set once Dispose(bool) has run.
private bool isDisposed;
62 
/// <summary>Creates a reader for the term vectors of <paramref name="segment"/>,
/// reading with a buffer of <c>BufferedIndexInput.BUFFER_SIZE</c>.</summary>
/// <param name="d">Directory that holds the segment files.</param>
/// <param name="segment">Name of the segment; file names are derived from it.</param>
/// <param name="fieldInfos">Field metadata of the segment.</param>
internal TermVectorsReader(Directory d, string segment, FieldInfos fieldInfos)
    : this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
{
}
66 
/// <summary>Creates a reader for the term vectors of <paramref name="segment"/>
/// with an explicit read buffer size. A doc store offset of -1 and a size
/// of 0 are forwarded, which makes the reader cover every document found
/// in the index file.</summary>
/// <param name="d">Directory that holds the segment files.</param>
/// <param name="segment">Name of the segment; file names are derived from it.</param>
/// <param name="fieldInfos">Field metadata of the segment.</param>
/// <param name="readBufferSize">Buffer size passed to <c>Directory.OpenInput</c>.</param>
internal TermVectorsReader(Directory d, string segment, FieldInfos fieldInfos, int readBufferSize)
    : this(d, segment, fieldInfos, readBufferSize, -1, 0)
{
}
70 
/// <summary>Opens the tvx, tvd and tvf files of <paramref name="segment"/> (when the
/// tvx file exists), validates their format headers and works out how many
/// documents the reader covers.</summary>
/// <param name="d">Directory that holds the segment files.</param>
/// <param name="segment">Name of the segment; file names are derived from it.</param>
/// <param name="fieldInfos">Field metadata of the segment.</param>
/// <param name="readBufferSize">Buffer size passed to <c>Directory.OpenInput</c>.</param>
/// <param name="docStoreOffset">docID offset of this reader's documents inside the
/// files, or -1 when the reader covers the whole file starting at 0.</param>
/// <param name="size">Number of documents covered; only used when
/// <paramref name="docStoreOffset"/> is not -1.</param>
internal TermVectorsReader(Directory d, string segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
{
    bool initialized = false;

    try
    {
        string indexFileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;

        if (!d.FileExists(indexFileName))
        {
            // If every document flushed in a segment hit a non-aborting
            // exception, FieldInfos.hasVectors may return true even though
            // the term vector files do not exist.
            format = 0;
        }
        else
        {
            tvx = d.OpenInput(indexFileName, readBufferSize);
            format = CheckValidFormat(tvx);

            tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
            int tvdFormat = CheckValidFormat(tvd);

            tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
            int tvfFormat = CheckValidFormat(tvf);

            System.Diagnostics.Debug.Assert(format == tvdFormat);
            System.Diagnostics.Debug.Assert(format == tvfFormat);

            // Each document owns a 16 byte tvx entry from FORMAT_VERSION2 on
            // and an 8 byte entry in older formats.
            bool wideEntries = format >= FORMAT_VERSION2;
            int entrySize = wideEntries ? 16 : 8;
            int entryShift = wideEntries ? 4 : 3;

            System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % entrySize == 0);
            numTotalDocs = (int) (tvx.Length() >> entryShift);

            if (docStoreOffset == -1)
            {
                // The reader covers the whole file.
                this.docStoreOffset = 0;
                this.size = numTotalDocs;
                System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
            }
            else
            {
                this.docStoreOffset = docStoreOffset;
                this.size = size;
                // Verify the file is long enough to hold all of our docs.
                System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
            }
        }

        this.fieldInfos = fieldInfos;
        initialized = true;
    }
    finally
    {
        // With lock-less commits it is entirely possible (and fine) to hit
        // a FileNotFound exception above. In that case explicitly close
        // whatever subset of the files was opened instead of waiting for
        // a GC to do so.
        if (!initialized)
        {
            Dispose();
        }
    }
}
141 
/// <summary>Exposes the tvd input stream; used for bulk copy when merging.</summary>
/// <returns>The tvd stream, or null when no term vector files were opened.</returns>
internal virtual IndexInput GetTvdStream()
{
    return this.tvd;
}
147 
/// <summary>Exposes the tvf input stream; used for bulk copy when merging.</summary>
/// <returns>The tvf stream, or null when no term vector files were opened.</returns>
internal virtual IndexInput GetTvfStream()
{
    return this.tvf;
}
153 
/// <summary>Positions the tvx stream on the entry of <paramref name="docNum"/>,
/// skipping the format header and applying the doc store offset.</summary>
private void SeekTvx(int docNum)
{
    // Entries are 8 bytes wide before FORMAT_VERSION2 and 16 bytes from then on.
    long entrySize = format < FORMAT_VERSION2 ? 8L : 16L;
    tvx.Seek((docNum + docStoreOffset) * entrySize + FORMAT_SIZE);
}
161 
/// <summary>Tells whether the files were written with at least the
/// FORMAT_UTF8_LENGTH_IN_BYTES format.</summary>
/// <returns>true when <c>format</c> is FORMAT_UTF8_LENGTH_IN_BYTES or newer.</returns>
internal virtual bool CanReadRawDocs()
{
    bool isUtf8Format = format >= FORMAT_UTF8_LENGTH_IN_BYTES;
    return isUtf8Format;
}
166 
/// <summary>Retrieve the length (in bytes) of the tvd and tvf
/// entries for the next numDocs starting with
/// startDocID. This is used for bulk copying when
/// merging segments, if the field numbers are
/// congruent. Once this returns, the tvf and tvd streams
/// are seeked to the startDocID.
/// </summary>
/// <param name="tvdLengths">Receives one tvd entry length per document.</param>
/// <param name="tvfLengths">Receives one tvf entry length per document.</param>
/// <param name="startDocID">First document (relative to this reader) to measure.</param>
/// <param name="numDocs">Number of consecutive documents to measure.</param>
/// <exception cref="System.InvalidOperationException">The files use a format older
/// than FORMAT_VERSION2.</exception>
internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
{
    if (tvx == null)
    {
        // No term vector files were opened: every entry has length zero.
        Array.Clear(tvdLengths, 0, tvdLengths.Length);
        Array.Clear(tvfLengths, 0, tvfLengths.Length);
        return;
    }

    // SegmentMerger calls CanReadRawDocs() first and should
    // not call us if that returns false.
    if (format < FORMAT_VERSION2)
    {
        // InvalidOperationException derives from SystemException, so callers
        // that caught the previously thrown base type still catch this.
        throw new InvalidOperationException("cannot read raw docs with older term vector formats");
    }

    SeekTvx(startDocID);

    long tvdPosition = tvx.ReadLong();
    tvd.Seek(tvdPosition);

    long tvfPosition = tvx.ReadLong();
    tvf.Seek(tvfPosition);

    long lastTvdPosition = tvdPosition;
    long lastTvfPosition = tvfPosition;

    for (int count = 0; count < numDocs; count++)
    {
        // An entry's length is the distance to the start of the next
        // document's entry, so look at the document after the current one.
        int docID = docStoreOffset + startDocID + count + 1;
        System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
        if (docID < numTotalDocs)
        {
            tvdPosition = tvx.ReadLong();
            tvfPosition = tvx.ReadLong();
        }
        else
        {
            // The last document in the file ends where the files end.
            tvdPosition = tvd.Length();
            tvfPosition = tvf.Length();
            System.Diagnostics.Debug.Assert(count == numDocs - 1);
        }
        tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
        tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
        lastTvdPosition = tvdPosition;
        lastTvfPosition = tvfPosition;
    }
}
229 
/// <summary>Reads the format version from the start of a term vector file and
/// rejects versions newer than FORMAT_CURRENT.</summary>
/// <param name="in_Renamed">Stream positioned on the format header.</param>
/// <returns>The format version that was read.</returns>
/// <exception cref="CorruptIndexException">The version is greater than FORMAT_CURRENT.</exception>
private int CheckValidFormat(IndexInput in_Renamed)
{
    int version = in_Renamed.ReadInt();
    if (version <= FORMAT_CURRENT)
    {
        return version;
    }

    throw new CorruptIndexException("Incompatible format version: " + version + " expected " + FORMAT_CURRENT + " or less");
}
239 
/// <summary>Releases the reader by running <c>Dispose(bool)</c> with
/// <c>disposing</c> set to true.</summary>
public void Dispose()
{
    this.Dispose(true);
}
244 
/// <summary>Closes the tvx, tvd and tvf streams. Every stream is attempted
/// even if an earlier one fails to close; the first failure is reported
/// once all of them have been tried.</summary>
/// <param name="disposing">true to close the streams; false only marks the
/// reader as disposed.</param>
/// <exception cref="System.IO.IOException">At least one stream failed to close.
/// The first failure is available as the inner exception.</exception>
protected virtual void Dispose(bool disposing)
{
    if (isDisposed) return;

    // Mark the reader as disposed up front: every stream gets exactly one
    // close attempt below, so a failure must not leave the reader in a
    // state where a later Dispose() closes the same streams again.
    isDisposed = true;

    if (!disposing) return;

    // Make all effort to close up. Keep the first exception
    // and throw it wrapped in a new one.
    System.IO.IOException keep = null;
    foreach (IndexInput stream in new IndexInput[] { tvx, tvd, tvf })
    {
        if (stream == null) continue;

        try
        {
            stream.Close();
        }
        catch (System.IO.IOException e)
        {
            if (keep == null)
                keep = e;
        }
    }

    if (keep != null)
    {
        // Preserve the original message and chain the original exception
        // (and therefore its stack trace) as the inner exception.
        throw new System.IO.IOException(keep.Message, keep);
    }
}
292 
/// <summary>Reports how many documents this reader covers.</summary>
/// <returns>The number of documents in the reader.</returns>
internal virtual int Size()
{
    return this.size;
}
300 
/// <summary>Feeds the term vector of one field of one document to
/// <paramref name="mapper"/>. Nothing happens when the segment has no term
/// vector files or when the document has no vector for the field.</summary>
/// <param name="docNum">The document number to retrieve the vector for.</param>
/// <param name="field">The field within the document to retrieve.</param>
/// <param name="mapper">Receives the document number and the term vector data.</param>
public virtual void Get(int docNum, string field, TermVectorMapper mapper)
{
    if (tvx == null)
    {
        // No tvx file.
        return;
    }

    int wantedField = fieldInfos.FieldNumber(field);

    // Only the tvx seek has to account for FORMAT_SIZE; the pointers
    // stored in the files are used for the other seeks as they are.
    SeekTvx(docNum);
    long tvdPosition = tvx.ReadLong();

    tvd.Seek(tvdPosition);
    int fieldCount = tvd.ReadVInt();

    // There are only a few fields per document, so do a full scan rather
    // than requiring them to be ordered. All field numbers must be read
    // anyway to reach the tvf pointers that follow them.
    int currentField = 0;
    int matchIndex = -1;
    for (int i = 0; i < fieldCount; i++)
    {
        int value = tvd.ReadVInt();
        // From FORMAT_VERSION on the stored value is the field number
        // itself; older formats store the difference to the previous one.
        currentField = format >= FORMAT_VERSION ? value : currentField + value;

        if (currentField == wantedField)
        {
            matchIndex = i;
        }
    }

    if (matchIndex == -1)
    {
        // The field, although valid in the segment, was not found in this
        // document.
        return;
    }

    // Compute the position in the tvf file: a base pointer followed by
    // one delta for every field up to the matching one.
    long tvfPosition = format >= FORMAT_VERSION2 ? tvx.ReadLong() : tvd.ReadVLong();
    for (int remaining = matchIndex; remaining > 0; remaining--)
    {
        tvfPosition += tvd.ReadVLong();
    }

    mapper.SetDocumentNumber(docNum);
    ReadTermVector(field, tvfPosition, mapper);
}
359 
360 
361 
/// <summary> Retrieve the term vector for the given document and field</summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <param name="field">The field within the document to retrieve
/// </param>
/// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field)
{
    // Collect the vector through a parallel-array mapper. When no term
    // vectors are available for this segment, or the document has none
    // for the field, the mapper is never filled and MaterializeVector()
    // returns null.
    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
    Get(docNum, field, mapper);

    return mapper.MaterializeVector();
}
378 
/// <summary>Reads <paramref name="fieldCount"/> field numbers from the tvd stream
/// and resolves them to field names. The tvd stream must already be
/// positioned on the first field number.</summary>
/// <param name="fieldCount">Number of fields to read.</param>
/// <returns>The field names, in the order they were read.</returns>
private string[] ReadFields(int fieldCount)
{
    string[] names = new string[fieldCount];

    // From FORMAT_VERSION on each stored value is the field number itself;
    // older formats store the difference to the previous field number.
    bool absoluteNumbers = format >= FORMAT_VERSION;
    int fieldNumber = 0;

    for (int index = 0; index < fieldCount; index++)
    {
        int value = tvd.ReadVInt();
        fieldNumber = absoluteNumbers ? value : fieldNumber + value;
        names[index] = fieldInfos.FieldName(fieldNumber);
    }

    return names;
}
398 
/// <summary>Reads the tvf offsets of <paramref name="fieldCount"/> fields. The tvx
/// and tvd streams must already be positioned on the right point.</summary>
/// <param name="fieldCount">Number of fields; callers pass at least 1.</param>
/// <returns>One absolute tvf position per field.</returns>
private long[] ReadTvfPointers(int fieldCount)
{
    // The first position comes from tvx from FORMAT_VERSION2 on and from
    // tvd in older formats.
    long current = format >= FORMAT_VERSION2 ? tvx.ReadLong() : tvd.ReadVLong();

    long[] pointers = new long[fieldCount];
    pointers[0] = current;

    // Every further position is stored in tvd as a delta to the previous one.
    int index = 1;
    while (index < fieldCount)
    {
        current += tvd.ReadVLong();
        pointers[index] = current;
        index++;
    }

    return pointers;
}
421 
/// <summary> Return all term vectors stored for this document, or null if they
/// could not be read in.</summary>
/// <param name="docNum">The document number to retrieve the vector for.</param>
/// <returns> All term frequency vectors; null when the segment has no term
/// vector files or no field of the document is vectorized.</returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
{
    if (tvx == null)
    {
        // No tvx file.
        return null;
    }

    SeekTvx(docNum);
    long tvdPosition = tvx.ReadLong();

    tvd.Seek(tvdPosition);
    int fieldCount = tvd.ReadVInt();

    if (fieldCount == 0)
    {
        // No fields are vectorized for this document.
        return null;
    }

    string[] fields = ReadFields(fieldCount);
    long[] tvfPointers = ReadTvfPointers(fieldCount);
    return ReadTermVectors(docNum, fields, tvfPointers);
}
456 
/// <summary>Feeds the term vectors of every vectorized field of a document to
/// <paramref name="mapper"/>. Nothing happens when the segment has no term
/// vector files or the document has no vectorized fields.</summary>
/// <param name="docNumber">The document number to retrieve the vectors for.</param>
/// <param name="mapper">Receives the document number and the term vector data.</param>
public virtual void Get(int docNumber, TermVectorMapper mapper)
{
    // No term vectors are available for this segment at all.
    if (tvx == null)
    {
        return;
    }

    SeekTvx(docNumber);
    long tvdPosition = tvx.ReadLong();

    tvd.Seek(tvdPosition);
    int fieldCount = tvd.ReadVInt();

    // No fields are vectorized for this document.
    if (fieldCount == 0)
    {
        return;
    }

    string[] fields = ReadFields(fieldCount);
    long[] tvfPointers = ReadTvfPointers(fieldCount);
    mapper.SetDocumentNumber(docNumber);
    ReadTermVectors(fields, tvfPointers, mapper);
}
484 
485 
/// <summary>Reads one term vector per field, each through its own
/// <c>ParallelArrayTermVectorMapper</c>.</summary>
/// <param name="docNum">Document number handed to every mapper.</param>
/// <param name="fields">Names of the fields to read.</param>
/// <param name="tvfPointers">tvf position of each field, parallel to <paramref name="fields"/>.</param>
/// <returns>The materialized vectors, parallel to <paramref name="fields"/>.</returns>
private SegmentTermVector[] ReadTermVectors(int docNum, string[] fields, long[] tvfPointers)
{
    int count = fields.Length;
    SegmentTermVector[] vectors = new SegmentTermVector[count];

    for (int index = 0; index < count; index++)
    {
        ParallelArrayTermVectorMapper collector = new ParallelArrayTermVectorMapper();
        collector.SetDocumentNumber(docNum);
        ReadTermVector(fields[index], tvfPointers[index], collector);
        vectors[index] = (SegmentTermVector) collector.MaterializeVector();
    }

    return vectors;
}
498 
/// <summary>Feeds the term vector of every listed field to the same
/// <paramref name="mapper"/>, in field order.</summary>
/// <param name="fields">Names of the fields to read.</param>
/// <param name="tvfPointers">tvf position of each field, parallel to <paramref name="fields"/>.</param>
/// <param name="mapper">Receives the term vector data of every field.</param>
private void ReadTermVectors(string[] fields, long[] tvfPointers, TermVectorMapper mapper)
{
    int index = 0;
    while (index < fields.Length)
    {
        ReadTermVector(fields[index], tvfPointers[index], mapper);
        index++;
    }
}
506 
507 
/// <summary>Reads the term vector stored at <paramref name="tvfPointer"/> in the tvf
/// stream and reports every term to <paramref name="mapper"/>.</summary>
/// <param name="field">The field to read in.</param>
/// <param name="tvfPointer">The pointer within the tvf file where reading should start.</param>
/// <param name="mapper">The mapper used to map the TermVector.</param>
/// <throws> IOException </throws>
private void ReadTermVector(string field, long tvfPointer, TermVectorMapper mapper)
{
    // The pointer is used as stored, so no FORMAT_SIZE adjustment is
    // needed here.
    tvf.Seek(tvfPointer);

    int termCount = tvf.ReadVInt();
    if (termCount == 0)
    {
        // No terms: nothing to report. This should never occur.
        return;
    }

    bool hasPositions = false;
    bool hasOffsets = false;
    if (format < FORMAT_VERSION)
    {
        // Older formats carry a VInt here that is read and discarded;
        // they store neither positions nor offsets.
        tvf.ReadVInt();
    }
    else
    {
        byte flags = tvf.ReadByte();
        hasPositions = (flags & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        hasOffsets = (flags & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    mapper.SetExpectations(field, termCount, hasOffsets, hasPositions);

    // Term text is held in a char buffer before the UTF8 format and in a
    // byte buffer from then on; only the one in use is allocated.
    bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
    char[] charBuffer = preUTF8 ? new char[10] : null;
    byte[] byteBuffer = preUTF8 ? null : new byte[20];

    for (int termIndex = 0; termIndex < termCount; termIndex++)
    {
        // The buffer keeps the previous term: the first keptLength units
        // are reused and addedLength new units are read in behind them.
        int keptLength = tvf.ReadVInt();
        int addedLength = tvf.ReadVInt();
        int termLength = keptLength + addedLength;

        string term;
        if (!preUTF8)
        {
            // Term stored as utf8 bytes.
            if (byteBuffer.Length < termLength)
            {
                byte[] grown = new byte[(int) (1.5 * termLength)];
                Array.Copy(byteBuffer, 0, grown, 0, keptLength);
                byteBuffer = grown;
            }
            tvf.ReadBytes(byteBuffer, keptLength, addedLength);
            term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, termLength);
        }
        else
        {
            // Term stored as java chars.
            if (charBuffer.Length < termLength)
            {
                char[] grown = new char[(int) (1.5 * termLength)];
                Array.Copy(charBuffer, 0, grown, 0, keptLength);
                charBuffer = grown;
            }
            tvf.ReadChars(charBuffer, keptLength, addedLength);
            term = new string(charBuffer, 0, termLength);
        }

        int freq = tvf.ReadVInt();

        int[] positions = null;
        if (hasPositions)
        {
            if (mapper.IsIgnoringPositions)
            {
                // The mapper does not want positions, but they are VInts of
                // unknown width, so each one has to be read to skip it.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                }
            }
            else
            {
                // Positions are stored as deltas to the previous position.
                positions = new int[freq];
                int lastPosition = 0;
                for (int j = 0; j < freq; j++)
                {
                    lastPosition += tvf.ReadVInt();
                    positions[j] = lastPosition;
                }
            }
        }

        TermVectorOffsetInfo[] offsets = null;
        if (hasOffsets)
        {
            if (mapper.IsIgnoringOffsets)
            {
                // Skip the start/end pair of every occurrence.
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                    tvf.ReadVInt();
                }
            }
            else
            {
                // A start is a delta to the previous end offset and an end
                // is a delta to its own start.
                offsets = new TermVectorOffsetInfo[freq];
                int lastEnd = 0;
                for (int j = 0; j < freq; j++)
                {
                    int startOffset = lastEnd + tvf.ReadVInt();
                    int endOffset = startOffset + tvf.ReadVInt();
                    offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                    lastEnd = endOffset;
                }
            }
        }

        mapper.Map(term, freq, offsets, positions);
    }
}
650 
/// <summary>Creates a copy of this reader that owns clones of the three input
/// streams; all other fields are copied member-wise.</summary>
/// <returns>The cloned <c>TermVectorsReader</c>.</returns>
public virtual object Clone()
{
    TermVectorsReader copy = (TermVectorsReader) MemberwiseClone();

    // The streams are null when the reader was created on a segment that
    // did not have term vectors saved; in that case there is nothing
    // further to clone.
    bool hasStreams = tvx != null && tvd != null && tvf != null;
    if (hasStreams)
    {
        copy.tvx = (IndexInput) tvx.Clone();
        copy.tvd = (IndexInput) tvd.Clone();
        copy.tvf = (IndexInput) tvf.Clone();
    }

    return copy;
}
667  }
668 
669 
/// <summary> Models the existing parallel array structure: the terms of one field
/// are collected into parallel arrays (terms, frequencies and, when stored,
/// positions and offsets) from which a term vector is materialized.</summary>
class ParallelArrayTermVectorMapper : TermVectorMapper
{

    private System.String[] terms;
    private int[] termFreqs;
    private int[][] positions;
    private TermVectorOffsetInfo[][] offsets;
    // Index of the slot that the next call to Map() fills.
    private int currentPosition;
    private bool storingOffsets;
    private bool storingPositions;
    private System.String field;

    /// <summary>Records the field being read and allocates the parallel arrays.</summary>
    /// <param name="field">Name of the field the following terms belong to.</param>
    /// <param name="numTerms">Number of terms that will be mapped.</param>
    /// <param name="storeOffsets">Whether offsets are stored for the field.</param>
    /// <param name="storePositions">Whether positions are stored for the field.</param>
    public override void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions)
    {
        this.field = field;
        terms = new System.String[numTerms];
        termFreqs = new int[numTerms];
        this.storingOffsets = storeOffsets;
        this.storingPositions = storePositions;
        // The position and offset arrays are only allocated when the
        // field stores that kind of data.
        if (storePositions)
            this.positions = new int[numTerms][];
        if (storeOffsets)
            this.offsets = new TermVectorOffsetInfo[numTerms][];
    }

    /// <summary>Stores one term in the next free slot of the parallel arrays.</summary>
    /// <param name="term">The term text.</param>
    /// <param name="frequency">The frequency of the term.</param>
    /// <param name="offsets">Offsets of the term; kept only when offsets are stored.</param>
    /// <param name="positions">Positions of the term; kept only when positions are stored.</param>
    public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
    {
        terms[currentPosition] = term;
        termFreqs[currentPosition] = frequency;
        if (storingOffsets)
        {
            this.offsets[currentPosition] = offsets;
        }
        if (storingPositions)
        {
            this.positions[currentPosition] = positions;
        }
        currentPosition++;
    }

    /// <summary> Construct the vector</summary>
    /// <returns> The <see cref="ITermFreqVector" /> based on the mappings, or null
    /// when no field or no terms were ever set.
    /// </returns>
    public virtual ITermFreqVector MaterializeVector()
    {
        SegmentTermVector tv = null;
        if (field != null && terms != null)
        {
            if (storingPositions || storingOffsets)
            {
                tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
            }
            else
            {
                tv = new SegmentTermVector(field, terms, termFreqs);
            }
        }
        return tv;
    }
}
731 }