Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TermVectorsWriter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 using Directory = Lucene.Net.Store.Directory;
21 using IndexOutput = Lucene.Net.Store.IndexOutput;
22 using StringHelper = Lucene.Net.Util.StringHelper;
23 using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
24 
25 namespace Lucene.Net.Index
26 {
/// <summary>
/// Writes term vectors for one segment into three outputs created from the supplied
/// directory: an index output (tvx) holding, per document, the positions of that document's
/// data in the other two outputs; a documents output (tvd) holding field numbers and field
/// pointers; and a fields output (tvf) holding terms, frequencies and optional
/// positions/offsets.
/// </summary>
sealed class TermVectorsWriter : IDisposable
{
    private readonly IndexOutput tvx = null;
    private readonly IndexOutput tvd = null;
    private readonly IndexOutput tvf = null;
    private readonly FieldInfos fieldInfos;

    // Two scratch buffers used alternately so that each term's UTF-8 bytes can be compared
    // against the previous term's bytes when computing the shared prefix.
    internal UnicodeUtil.UTF8Result[] utf8Results = new[]{new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()};

    /// <summary>
    /// Creates the three term vector outputs for <paramref name="segment"/> and writes the
    /// current format marker at the start of each one.
    /// </summary>
    /// <param name="directory">Directory in which the outputs are created.</param>
    /// <param name="segment">Segment name; each output is named segment + "." + extension.</param>
    /// <param name="fieldInfos">Used to map field names to field numbers when writing.</param>
    /// <remarks>
    /// If creating or initialising any output fails, the outputs opened so far are closed
    /// (ignoring secondary I/O failures) before the original exception propagates, so a
    /// failed construction does not leave open outputs behind.
    /// </remarks>
    public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
    {
        this.fieldInfos = fieldInfos;

        bool success = false;
        try
        {
            // Open files for TermVector storage
            tvx = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
            tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            tvd = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
            tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            tvf = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
            tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            success = true;
        }
        finally
        {
            if (!success)
            {
                // Best effort only: the exception already in flight is the one the caller
                // needs to see, so failures while closing are deliberately dropped here.
                CloseOutput(tvx, null);
                CloseOutput(tvd, null);
                CloseOutput(tvf, null);
            }
        }
    }

    /// <summary>
    /// Adds a complete document specified by all its term vectors. If the document has no
    /// term vectors (<paramref name="vectors"/> is null), only the tvx entry and a zero
    /// field count are written.
    /// </summary>
    /// <param name="vectors">One term vector per field of the document, or null.</param>
    /// <exception cref="System.InvalidOperationException">
    /// A vector reported positions or offsets for its first term but returned null for a
    /// later term.
    /// </exception>
    /// <exception cref="System.IO.IOException">Presumably raised by the underlying outputs on write failure.</exception>
    public void AddAllDocVectors(ITermFreqVector[] vectors)
    {
        // Record where this document's data starts in tvd and tvf.
        tvx.WriteLong(tvd.FilePointer);
        tvx.WriteLong(tvf.FilePointer);

        if (vectors == null)
        {
            tvd.WriteVInt(0);
            return;
        }

        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);

        var fieldPointers = new long[numFields];

        for (int i = 0; i < numFields; i++)
        {
            ITermFreqVector vector = vectors[i];
            fieldPointers[i] = tvf.FilePointer;

            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldInfos.FieldNumber(vector.Field));

            int numTerms = vector.Size;
            tvf.WriteVInt(numTerms);

            // Only a TermPositionVector may carry positions & offsets. Whether they are
            // stored for the whole field is decided by looking at the first term.
            TermPositionVector tpVector = vector as TermPositionVector;
            bool storePositions = tpVector != null && tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
            bool storeOffsets = tpVector != null && tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
            byte bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte) 0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte) 0));

            tvf.WriteVInt(bits);

            System.String[] terms = vector.GetTerms();
            int[] freqs = vector.GetTermFrequencies();

            // Start each field with an empty "previous term" so the first term shares no prefix.
            int utf8Upto = 0;
            utf8Results[1].length = 0;

            for (int j = 0; j < numTerms; j++)
            {
                UnicodeUtil.UTF8Result current = utf8Results[utf8Upto];
                UnicodeUtil.UTF8Result previous = utf8Results[1 - utf8Upto];

                UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, current);

                int start = StringHelper.BytesDifference(previous.result, previous.length, current.result, current.length);
                int length = current.length - start;
                tvf.WriteVInt(start); // write shared prefix length
                tvf.WriteVInt(length); // write delta length
                tvf.WriteBytes(current.result, start, length); // write delta bytes
                utf8Upto = 1 - utf8Upto; // the term just written becomes "previous"

                int termFreq = freqs[j];
                tvf.WriteVInt(termFreq);

                if (storePositions)
                {
                    WritePositions(tpVector.GetTermPositions(j), termFreq);
                }

                if (storeOffsets)
                {
                    WriteOffsets(tpVector.GetOffsets(j), termFreq);
                }
            }
        }

        // 2nd pass: write field pointers to tvd (deltas between consecutive fields;
        // the first field's pointer is the tvf position already stored in tvx).
        if (numFields > 1)
        {
            long lastFieldPointer = fieldPointers[0];
            for (int i = 1; i < numFields; i++)
            {
                long fieldPointer = fieldPointers[i];
                tvd.WriteVLong(fieldPointer - lastFieldPointer);
                lastFieldPointer = fieldPointer;
            }
        }
    }

    /// <summary>Writes the positions of one term to tvf, delta-encoded against the previous position.</summary>
    private void WritePositions(int[] positions, int termFreq)
    {
        if (positions == null)
        {
            throw new System.InvalidOperationException("Trying to write positions that are null!");
        }
        System.Diagnostics.Debug.Assert(positions.Length == termFreq);

        // use delta encoding for positions
        int lastPosition = 0;
        foreach (int position in positions)
        {
            tvf.WriteVInt(position - lastPosition);
            lastPosition = position;
        }
    }

    /// <summary>
    /// Writes the offsets of one term to tvf: each start offset as a delta from the previous
    /// end offset, followed by the length (end - start).
    /// </summary>
    private void WriteOffsets(TermVectorOffsetInfo[] offsets, int termFreq)
    {
        if (offsets == null)
        {
            throw new System.InvalidOperationException("Trying to write offsets that are null!");
        }
        System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

        // use delta encoding for offsets
        int lastEndOffset = 0;
        foreach (TermVectorOffsetInfo t in offsets)
        {
            int startOffset = t.StartOffset;
            int endOffset = t.EndOffset;
            tvf.WriteVInt(startOffset - lastEndOffset);
            tvf.WriteVInt(endOffset - startOffset);
            lastEndOffset = endOffset;
        }
    }

    /// <summary>
    /// Do a bulk copy of numDocs documents from reader to our streams. This is used to
    /// expedite merging, if the field numbers are congruent.
    /// </summary>
    /// <param name="reader">Source whose tvd/tvf streams are copied from.</param>
    /// <param name="tvdLengths">Per-document byte length in the tvd stream.</param>
    /// <param name="tvfLengths">Per-document byte length in the tvf stream.</param>
    /// <param name="numDocs">Number of documents (leading entries of the length arrays) to copy.</param>
    internal void AddRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs)
    {
        long tvdStart = tvd.FilePointer;
        long tvfStart = tvf.FilePointer;
        long tvdPosition = tvdStart;
        long tvfPosition = tvfStart;

        // Write one tvx entry per document, advancing the expected positions by each
        // document's recorded lengths.
        for (int i = 0; i < numDocs; i++)
        {
            tvx.WriteLong(tvdPosition);
            tvdPosition += tvdLengths[i];
            tvx.WriteLong(tvfPosition);
            tvfPosition += tvfLengths[i];
        }

        // Copy the raw bytes for all documents in one go per stream.
        tvd.CopyBytes(reader.GetTvdStream(), tvdPosition - tvdStart);
        tvf.CopyBytes(reader.GetTvfStream(), tvfPosition - tvfStart);
        System.Diagnostics.Debug.Assert(tvd.FilePointer == tvdPosition);
        System.Diagnostics.Debug.Assert(tvf.FilePointer == tvfPosition);
    }

    /// <summary>Close all streams.</summary>
    /// <exception cref="System.IO.IOException">
    /// At least one stream failed to close. All streams are still attempted; the first
    /// failure is available as <c>InnerException</c> and supplies the message.
    /// </exception>
    public void Dispose()
    {
        // Move to a protected method if class becomes unsealed

        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        keep = CloseOutput(tvx, keep);
        keep = CloseOutput(tvd, keep);
        keep = CloseOutput(tvf, keep);

        if (keep != null)
        {
            // Wrap rather than "throw keep;" so the original stack trace survives on the
            // inner exception, and carry the real message instead of the stack-trace text.
            throw new System.IO.IOException(keep.Message, keep);
        }
    }

    /// <summary>
    /// Closes <paramref name="output"/> if it is not null and returns the first I/O failure
    /// seen so far: <paramref name="firstFailure"/> when it is already set, otherwise the
    /// exception raised by this close (or null when the close succeeded).
    /// </summary>
    private static System.IO.IOException CloseOutput(IndexOutput output, System.IO.IOException firstFailure)
    {
        if (output == null)
        {
            return firstFailure;
        }

        try
        {
            output.Close();
        }
        catch (System.IO.IOException e)
        {
            if (firstFailure == null)
            {
                firstFailure = e;
            }
        }
        return firstFailure;
    }
}
246 }