Lucene.Net 3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
FieldsReader.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.IO;
using Lucene.Net.Support;
using Lucene.Net.Util;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Lucene.Net.Documents;
using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException;
using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;

namespace Lucene.Net.Index
{

    /// <summary> Class responsible for access to stored document fields.
    /// <p/>
    /// It uses the &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
    /// </summary>
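    /// <remarks>
    /// Illustrative sketch only (not part of the original sources): FieldsReader is
    /// normally driven by SegmentReader, but direct use would look roughly like this,
    /// assuming a Directory <c>dir</c>, a segment named "_0", and its FieldInfos:
    /// <code>
    /// var reader = new FieldsReader(dir, "_0", fieldInfos);
    /// try
    /// {
    ///     Document doc = reader.Doc(0, null);   // a null FieldSelector loads all stored fields
    ///     string title = doc.Get("title");      // "title" is a hypothetical stored field
    /// }
    /// finally
    /// {
    ///     reader.Dispose();
    /// }
    /// </code>
    /// </remarks>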
    public sealed class FieldsReader : ICloneable, IDisposable
    {
        private readonly FieldInfos fieldInfos;

        // The main fields stream, used only for cloning.
        private readonly IndexInput cloneableFieldsStream;

        // This is a clone of cloneableFieldsStream used for reading documents.
        // It should not be cloned outside of a synchronized context.
        private readonly IndexInput fieldsStream;

        private readonly IndexInput cloneableIndexStream;
        private readonly IndexInput indexStream;
        private readonly int numTotalDocs;
        private readonly int size;
        private bool closed;
        private readonly int format;
        private readonly int formatSize;

        // The docID offset where our docs begin in the index
        // file. This will be 0 if we have our own private file.
        private readonly int docStoreOffset;

        private readonly CloseableThreadLocal<IndexInput> fieldsStreamTL = new CloseableThreadLocal<IndexInput>();
        private readonly bool isOriginal = false;

        /// <summary>Returns a cloned FieldsReader that shares open
        /// IndexInputs with the original one. It is the caller's
        /// responsibility not to close the original FieldsReader until all
        /// clones are no longer in use (currently SegmentReader manages
        /// this logic).
        /// </summary>
        public System.Object Clone()
        {
            EnsureOpen();
            return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream);
        }
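
        // Illustrative sketch (not part of the original sources): per-thread readers
        // can be taken via Clone() while the original stays open; a clone's Dispose()
        // closes only its private streams, and the original closes the shared
        // cloneable streams last.
        //
        //     var perThread = (FieldsReader) original.Clone();
        //     Document doc = perThread.Doc(42, null);
        //     perThread.Dispose();   // closes the clone's streams only
        //     original.Dispose();    // closes the shared streams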

        // Used only by Clone()
        private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize, int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream)
        {
            this.fieldInfos = fieldInfos;
            this.numTotalDocs = numTotalDocs;
            this.size = size;
            this.format = format;
            this.formatSize = formatSize;
            this.docStoreOffset = docStoreOffset;
            this.cloneableFieldsStream = cloneableFieldsStream;
            this.cloneableIndexStream = cloneableIndexStream;
            fieldsStream = (IndexInput) cloneableFieldsStream.Clone();
            indexStream = (IndexInput) cloneableIndexStream.Clone();
        }

        public /*internal*/ FieldsReader(Directory d, String segment, FieldInfos fn) : this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0)
        {
        }

        internal FieldsReader(Directory d, System.String segment, FieldInfos fn, int readBufferSize) : this(d, segment, fn, readBufferSize, -1, 0)
        {
        }

        internal FieldsReader(Directory d, System.String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size)
        {
            bool success = false;
            isOriginal = true;
            try
            {
                fieldInfos = fn;

                cloneableFieldsStream = d.OpenInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize);
                cloneableIndexStream = d.OpenInput(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize);

                // The first version of the fdx file did not include a format header,
                // but the first int will always be 0 in that case, so reading the
                // first int yields the right format value either way.
                int firstInt = cloneableIndexStream.ReadInt();
                format = firstInt;

                if (format > FieldsWriter.FORMAT_CURRENT)
                    throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FieldsWriter.FORMAT_CURRENT + " or lower");

                formatSize = format > FieldsWriter.FORMAT ? 4 : 0;

                if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
                    cloneableFieldsStream.SetModifiedUTF8StringsMode();

                fieldsStream = (IndexInput) cloneableFieldsStream.Clone();

                long indexSize = cloneableIndexStream.Length() - formatSize;

                if (docStoreOffset != -1)
                {
                    // We read only a slice out of this shared fields file
                    this.docStoreOffset = docStoreOffset;
                    this.size = size;

                    // Verify the file is long enough to hold all of our
                    // docs (each fdx entry is one 8-byte pointer)
                    System.Diagnostics.Debug.Assert(((int) (indexSize / 8)) >= size + this.docStoreOffset, "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset);
                }
                else
                {
                    this.docStoreOffset = 0;
                    this.size = (int) (indexSize >> 3);
                }

                indexStream = (IndexInput) cloneableIndexStream.Clone();
                numTotalDocs = (int) (indexSize >> 3);
                success = true;
            }
            finally
            {
                // With lock-less commits, it's entirely possible (and
                // fine) to hit a FileNotFound exception above. In
                // this case, we want to explicitly close any subset
                // of things that were opened so that we don't have to
                // wait for a GC to do so.
                if (!success)
                {
                    Dispose();
                }
            }
        }
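
        // Worked example for the constructor: a segment with its own (non-shared)
        // stored-fields files and an 84-byte .fdx file with a 4-byte format header
        // has indexSize = 80, so it holds 80 >> 3 = 10 documents (one 8-byte
        // pointer per document).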

        /// <throws>AlreadyClosedException if this FieldsReader is closed</throws>
        internal void EnsureOpen()
        {
            if (closed)
            {
                throw new AlreadyClosedException("this FieldsReader is closed");
            }
        }

        /// <summary> Closes the underlying <see cref="Lucene.Net.Store.IndexInput" /> streams, including any
        /// associated with a lazy implementation of a Field. This means that lazy field values
        /// will no longer be accessible.
        /// </summary>
        /// <throws>IOException</throws>
        public void Dispose()
        {
            // Move to a protected method if this class becomes unsealed
            if (!closed)
            {
                if (fieldsStream != null)
                {
                    fieldsStream.Close();
                }
                if (isOriginal)
                {
                    if (cloneableFieldsStream != null)
                    {
                        cloneableFieldsStream.Close();
                    }
                    if (cloneableIndexStream != null)
                    {
                        cloneableIndexStream.Close();
                    }
                }
                if (indexStream != null)
                {
                    indexStream.Close();
                }
                fieldsStreamTL.Close();
                closed = true;
            }
        }

        public /*internal*/ int Size()
        {
            return size;
        }

        private void SeekIndex(int docID)
        {
            indexStream.Seek(formatSize + (docID + docStoreOffset) * 8L);
        }
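
        // Worked example for SeekIndex: each fdx entry is a single 8-byte pointer
        // into the fdt file, so with a 4-byte format header (formatSize == 4) and
        // docStoreOffset == 0, the pointer for docID 3 lives at byte
        // 4 + 3 * 8 = 28 of the .fdx file.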

        internal bool CanReadRawDocs()
        {
            // Disable reading raw docs for pre-3.0 (2.x format) segments, because
            // compressed fields were removed in 3.0. We don't want RawDocs() to have to
            // decode field bits to figure out whether a field was compressed, hence we
            // enforce ordinary (non-raw) stored field merges for <3.0 indexes.
            return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
        }

        public /*internal*/ Document Doc(int n, FieldSelector fieldSelector)
        {
            SeekIndex(n);
            long position = indexStream.ReadLong();
            fieldsStream.Seek(position);

            var doc = new Document();
            int numFields = fieldsStream.ReadVInt();
            for (int i = 0; i < numFields; i++)
            {
                int fieldNumber = fieldsStream.ReadVInt();
                FieldInfo fi = fieldInfos.FieldInfo(fieldNumber);
                FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.Accept(fi.name);

                byte bits = fieldsStream.ReadByte();
                System.Diagnostics.Debug.Assert(bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY);

                bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
                System.Diagnostics.Debug.Assert(
                    (!compressed || (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS)),
                    "compressed fields are only allowed in indexes of version <= 2.9");
                bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
                bool binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
                // TODO: Find an alternative approach here if this list continues to grow
                // beyond the 5 or 6 cases currently here. See LUCENE-762 for discussion.
                if (acceptField.Equals(FieldSelectorResult.LOAD))
                {
                    AddField(doc, fi, binary, compressed, tokenize);
                }
                else if (acceptField.Equals(FieldSelectorResult.LOAD_AND_BREAK))
                {
                    AddField(doc, fi, binary, compressed, tokenize);
                    break; // Get out of this loop
                }
                else if (acceptField.Equals(FieldSelectorResult.LAZY_LOAD))
                {
                    AddFieldLazy(doc, fi, binary, compressed, tokenize);
                }
                else if (acceptField.Equals(FieldSelectorResult.SIZE))
                {
                    SkipField(binary, compressed, AddFieldSize(doc, fi, binary, compressed));
                }
                else if (acceptField.Equals(FieldSelectorResult.SIZE_AND_BREAK))
                {
                    AddFieldSize(doc, fi, binary, compressed);
                    break;
                }
                else
                {
                    SkipField(binary, compressed);
                }
            }

            return doc;
        }
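
        // Illustrative sketch (assumes the stock Lucene.Net MapFieldSelector): a
        // FieldSelector lets Doc() skip or lazily load stored fields instead of
        // materializing all of them. For example, to eagerly load only a
        // hypothetical "id" field and skip the rest:
        //
        //     var selector = new MapFieldSelector("id");
        //     Document doc = reader.Doc(n, selector);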

        /// <summary>Returns the length in bytes of each raw document in a
        /// contiguous range of length numDocs starting with
        /// startDocID. Returns the IndexInput (the fieldsStream),
        /// already positioned at the starting point for startDocID.
        /// </summary>
        internal IndexInput RawDocs(int[] lengths, int startDocID, int numDocs)
        {
            SeekIndex(startDocID);
            long startOffset = indexStream.ReadLong();
            long lastOffset = startOffset;
            int count = 0;
            while (count < numDocs)
            {
                long offset;
                int docID = docStoreOffset + startDocID + count + 1;
                System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
                if (docID < numTotalDocs)
                    offset = indexStream.ReadLong();
                else
                    offset = fieldsStream.Length();
                lengths[count++] = (int) (offset - lastOffset);
                lastOffset = offset;
            }

            fieldsStream.Seek(startOffset);

            return fieldsStream;
        }
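
        // Worked example for RawDocs: lengths are differences between consecutive fdx
        // pointers. In an 8-document segment whose fdx entries for docs 5, 6 and 7 are
        // 100, 160 and 220, with the fdt file ending at byte 300, RawDocs(lengths, 5, 3)
        // fills lengths = { 60, 60, 80 } and returns fieldsStream seeked to byte 100.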

        /// <summary> Skip the field. We still have to read some of the information about the field,
        /// but can skip past the actual content. This will have the most payoff on large fields.
        /// </summary>
        private void SkipField(bool binary, bool compressed)
        {
            SkipField(binary, compressed, fieldsStream.ReadVInt());
        }

        private void SkipField(bool binary, bool compressed, int toRead)
        {
            if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed)
            {
                fieldsStream.Seek(fieldsStream.FilePointer + toRead);
            }
            else
            {
                // We need to skip chars, not bytes. This is slower, but still better
                // than reading the field contents.
                fieldsStream.SkipChars(toRead);
            }
        }

        private void AddFieldLazy(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
        {
            if (binary)
            {
                int toRead = fieldsStream.ReadVInt();
                long pointer = fieldsStream.FilePointer;
                //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
                doc.Add(new LazyField(this, fi.name, Field.Store.YES, toRead, pointer, binary, compressed));

                // Need to move the pointer ahead by toRead positions
                fieldsStream.Seek(pointer + toRead);
            }
            else
            {
                const Field.Store store = Field.Store.YES;
                Field.Index index = FieldExtensions.ToIndex(fi.isIndexed, tokenize);
                Field.TermVector termVector = FieldExtensions.ToTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);

                AbstractField f;
                if (compressed)
                {
                    int toRead = fieldsStream.ReadVInt();
                    long pointer = fieldsStream.FilePointer;
                    f = new LazyField(this, fi.name, store, toRead, pointer, binary, compressed);
                    // Skip over the part that we aren't loading
                    fieldsStream.Seek(pointer + toRead);
                    f.OmitNorms = fi.omitNorms;
                    f.OmitTermFreqAndPositions = fi.omitTermFreqAndPositions;
                }
                else
                {
                    int length = fieldsStream.ReadVInt();
                    long pointer = fieldsStream.FilePointer;
                    // Skip ahead of where we are by the length of what is stored
                    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
                    {
                        fieldsStream.Seek(pointer + length);
                    }
                    else
                    {
                        fieldsStream.SkipChars(length);
                    }
                    f = new LazyField(this, fi.name, store, index, termVector, length, pointer, binary, compressed)
                        { OmitNorms = fi.omitNorms, OmitTermFreqAndPositions = fi.omitTermFreqAndPositions };
                }

                doc.Add(f);
            }
        }

        private void AddField(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
        {
            // We have a binary stored field, and it may be compressed
            if (binary)
            {
                int toRead = fieldsStream.ReadVInt();
                var b = new byte[toRead];
                fieldsStream.ReadBytes(b, 0, b.Length);
                doc.Add(compressed ? new Field(fi.name, Uncompress(b), Field.Store.YES) : new Field(fi.name, b, Field.Store.YES));
            }
            else
            {
                const Field.Store store = Field.Store.YES;
                Field.Index index = FieldExtensions.ToIndex(fi.isIndexed, tokenize);
                Field.TermVector termVector = FieldExtensions.ToTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);

                AbstractField f;
                if (compressed)
                {
                    int toRead = fieldsStream.ReadVInt();

                    var b = new byte[toRead];
                    fieldsStream.ReadBytes(b, 0, b.Length);
                    f = new Field(fi.name, false, System.Text.Encoding.GetEncoding("UTF-8").GetString(Uncompress(b)), store, index,
                                  termVector) { OmitTermFreqAndPositions = fi.omitTermFreqAndPositions, OmitNorms = fi.omitNorms };
                }
                else
                {
                    f = new Field(fi.name, false, fieldsStream.ReadString(), store, index, termVector)
                        { OmitTermFreqAndPositions = fi.omitTermFreqAndPositions, OmitNorms = fi.omitNorms };
                }

                doc.Add(f);
            }
        }

        // Adds the size of the field as a stored byte[] containing the 4 bytes of the
        // integer byte size (high-order byte first; a char counts as 2 bytes). Reads
        // just the size -- the caller must skip the field content to continue reading
        // fields. Returns the size in bytes or chars, depending on the field type.
        private int AddFieldSize(Document doc, FieldInfo fi, bool binary, bool compressed)
        {
            int size = fieldsStream.ReadVInt(), bytesize = binary || compressed ? size : 2 * size;
            var sizebytes = new byte[4];
            sizebytes[0] = (byte) (Number.URShift(bytesize, 24));
            sizebytes[1] = (byte) (Number.URShift(bytesize, 16));
            sizebytes[2] = (byte) (Number.URShift(bytesize, 8));
            sizebytes[3] = (byte) bytesize;
            doc.Add(new Field(fi.name, sizebytes, Field.Store.YES));
            return size;
        }
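
        // Worked example for AddFieldSize: a text field of 5 chars has
        // bytesize = 2 * 5 = 10 = 0x0000000A, stored big-endian as
        // sizebytes = { 0x00, 0x00, 0x00, 0x0A }.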

        /// <summary> A lazy implementation of Fieldable that defers loading of fields
        /// until asked for, instead of when the Document is loaded.
        /// </summary>
        [Serializable]
        private sealed class LazyField : AbstractField
        {
            private void InitBlock(FieldsReader enclosingInstance)
            {
                this.Enclosing_Instance = enclosingInstance;
            }

            private FieldsReader Enclosing_Instance { get; set; }

            private int toRead;
            private long pointer;
            [Obsolete("Only kept for backward-compatibility with <3.0 indexes. Will be removed in 4.0.")]
            private readonly bool isCompressed;

            public LazyField(FieldsReader enclosingInstance, System.String name, Field.Store store, int toRead, long pointer, bool isBinary, bool isCompressed) : base(name, store, Field.Index.NO, Field.TermVector.NO)
            {
                InitBlock(enclosingInstance);
                this.toRead = toRead;
                this.pointer = pointer;
                this.internalIsBinary = isBinary;
                if (isBinary)
                    internalBinaryLength = toRead;
                lazy = true;
                this.isCompressed = isCompressed;
            }

            public LazyField(FieldsReader enclosingInstance, System.String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, bool isBinary, bool isCompressed) : base(name, store, index, termVector)
            {
                InitBlock(enclosingInstance);
                this.toRead = toRead;
                this.pointer = pointer;
                this.internalIsBinary = isBinary;
                if (isBinary)
                    internalBinaryLength = toRead;
                lazy = true;
                this.isCompressed = isCompressed;
            }

            private IndexInput GetFieldStream()
            {
                // Each thread gets its own clone of the fields stream, cached in a
                // CloseableThreadLocal, so lazy loads from different threads don't
                // clobber each other's file positions.
                IndexInput localFieldsStream = Enclosing_Instance.fieldsStreamTL.Get();
                if (localFieldsStream == null)
                {
                    localFieldsStream = (IndexInput) Enclosing_Instance.cloneableFieldsStream.Clone();
                    Enclosing_Instance.fieldsStreamTL.Set(localFieldsStream);
                }
                return localFieldsStream;
            }

            /// <summary>The value of the field as a TextReader, or null. If null, the String value,
            /// binary value, or TokenStream value is used. Exactly one of StringValue,
            /// ReaderValue, GetBinaryValue(), and TokenStreamValue must be set.
            /// </summary>
            public override TextReader ReaderValue
            {
                get
                {
                    Enclosing_Instance.EnsureOpen();
                    return null;
                }
            }

            /// <summary>The value of the field as a TokenStream, or null. If null, the TextReader value,
            /// String value, or binary value is used. Exactly one of StringValue,
            /// ReaderValue, GetBinaryValue(), and TokenStreamValue must be set.
            /// </summary>
            public override TokenStream TokenStreamValue
            {
                get
                {
                    Enclosing_Instance.EnsureOpen();
                    return null;
                }
            }

            /// <summary>The value of the field as a String, or null. If null, the TextReader value,
            /// binary value, or TokenStream value is used. Exactly one of StringValue,
            /// ReaderValue, GetBinaryValue(), and TokenStreamValue must be set.
            /// </summary>
            public override string StringValue
            {
                get
                {
                    Enclosing_Instance.EnsureOpen();
                    if (internalIsBinary)
                        return null;

                    if (fieldsData == null)
                    {
                        IndexInput localFieldsStream = GetFieldStream();
                        try
                        {
                            localFieldsStream.Seek(pointer);
                            if (isCompressed)
                            {
                                var b = new byte[toRead];
                                localFieldsStream.ReadBytes(b, 0, b.Length);
                                fieldsData = System.Text.Encoding.GetEncoding("UTF-8").GetString(Enclosing_Instance.Uncompress(b));
                            }
                            else
                            {
                                if (Enclosing_Instance.format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
                                {
                                    var bytes = new byte[toRead];
                                    localFieldsStream.ReadBytes(bytes, 0, toRead);
                                    fieldsData = System.Text.Encoding.GetEncoding("UTF-8").GetString(bytes);
                                }
                                else
                                {
                                    // Read in chars because we already know the length we need to read
                                    var chars = new char[toRead];
                                    localFieldsStream.ReadChars(chars, 0, toRead);
                                    fieldsData = new System.String(chars);
                                }
                            }
                        }
                        catch (System.IO.IOException e)
                        {
                            throw new FieldReaderException(e);
                        }
                    }
                    return (System.String) fieldsData;
                }
            }

            public long Pointer
            {
                get
                {
                    Enclosing_Instance.EnsureOpen();
                    return pointer;
                }
                set
                {
                    Enclosing_Instance.EnsureOpen();
                    this.pointer = value;
                }
            }

            public int ToRead
            {
                get
                {
                    Enclosing_Instance.EnsureOpen();
                    return toRead;
                }
                set
                {
                    Enclosing_Instance.EnsureOpen();
                    this.toRead = value;
                }
            }

            public override byte[] GetBinaryValue(byte[] result)
            {
                Enclosing_Instance.EnsureOpen();

                if (internalIsBinary)
                {
                    if (fieldsData == null)
                    {
                        // Allocate a new buffer if result is null or too small
                        byte[] b;
                        if (result == null || result.Length < toRead)
                            b = new byte[toRead];
                        else
                            b = result;

                        IndexInput localFieldsStream = GetFieldStream();

                        // Wrap the IOException in a FieldReaderException, since
                        // IndexReader.Document throws anyway and callers are already
                        // handling that case when getting the document.
                        try
                        {
                            localFieldsStream.Seek(pointer);
                            localFieldsStream.ReadBytes(b, 0, toRead);
                            fieldsData = isCompressed ? Enclosing_Instance.Uncompress(b) : b;
                        }
                        catch (IOException e)
                        {
                            throw new FieldReaderException(e);
                        }

                        internalbinaryOffset = 0;
                        internalBinaryLength = toRead;
                    }

                    return (byte[]) fieldsData;
                }
                return null;
            }
        }

        private byte[] Uncompress(byte[] b)
        {
            try
            {
                return CompressionTools.Decompress(b);
            }
            catch (Exception e)
            {
                // This will happen if the field is not compressed
                throw new CorruptIndexException("field data is in the wrong format: " + e, e);
            }
        }
    }
}