Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TermVectorsTermsWriterPerField.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Analysis.Tokenattributes;
20 using Lucene.Net.Documents;
21 using IndexOutput = Lucene.Net.Store.IndexOutput;
22 using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
23 
24 namespace Lucene.Net.Index
25 {
26 
28  {
29 
 30  internal TermVectorsTermsWriterPerThread perThread; // owning per-thread writer state
 31  internal TermsHashPerField termsHashPerField; // shared postings hash for this field
 32  internal TermVectorsTermsWriter termsWriter; // parent writer (taken from perThread in the ctor)
 33  internal FieldInfo fieldInfo; // metadata for the field being inverted
 34  internal DocumentsWriter.DocState docState; // current document state (docID, test points)
 35  internal FieldInvertState fieldState; // per-field inversion state (position, offset)
 36 
 37  internal bool doVectors; // write term vectors for this field in the current doc?
 38  internal bool doVectorPositions; // also store positions (written to stream 0)?
 39  internal bool doVectorOffsets; // also store offsets (written to stream 1)?
 40 
 41  internal int maxNumPostings; // high-water posting count, consumed by ShrinkHash()
 42  internal IOffsetAttribute offsetAttribute = null; // set in Start(IFieldable) when offsets are stored
43 
45  {
46  this.termsHashPerField = termsHashPerField;
47  this.perThread = perThread;
48  this.termsWriter = perThread.termsWriter;
49  this.fieldInfo = fieldInfo;
50  docState = termsHashPerField.docState;
51  fieldState = termsHashPerField.fieldState;
52  }
53 
54  internal override int GetStreamCount()
55  {
56  return 2;
57  }
58 
59  internal override bool Start(IFieldable[] fields, int count)
60  {
61  doVectors = false;
62  doVectorPositions = false;
63  doVectorOffsets = false;
64 
65  for (int i = 0; i < count; i++)
66  {
67  IFieldable field = fields[i];
68  if (field.IsIndexed && field.IsTermVectorStored)
69  {
70  doVectors = true;
71  doVectorPositions |= field.IsStorePositionWithTermVector;
72  doVectorOffsets |= field.IsStoreOffsetWithTermVector;
73  }
74  }
75 
76  if (doVectors)
77  {
78  if (perThread.doc == null)
79  {
80  perThread.doc = termsWriter.GetPerDoc();
81  perThread.doc.docID = docState.docID;
82  System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
83  System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
84  System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
85  }
86 
87  System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);
88  if (termsHashPerField.numPostings != 0)
89  {
90  // Only necessary if previous doc hit a
91  // non-aborting exception while writing vectors in
92  // this field:
93  termsHashPerField.Reset();
94  perThread.termsHashPerThread.Reset(false);
95  }
96  }
97 
98  // TODO: only if needed for performance
99  //perThread.postingsCount = 0;
100 
101  return doVectors;
102  }
103 
 // Aborts processing of the current document. Intentionally empty:
 // this consumer performs no per-field cleanup on abort.
 104  public void Abort()
 105  {
 106  }
107 
108  /// <summary>Called once per field per document if term vectors
109  /// are enabled, to write the vectors to
110  /// RAMOutputStream, which is then quickly flushed to
111  /// the real term vectors files in the Directory.
112  /// </summary>
113  internal override void Finish()
114  {
115 
116  System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));
117 
118  int numPostings = termsHashPerField.numPostings;
119 
120  System.Diagnostics.Debug.Assert(numPostings >= 0);
121 
122  if (!doVectors || numPostings == 0)
123  return ;
124 
125  if (numPostings > maxNumPostings)
126  maxNumPostings = numPostings;
127 
128  IndexOutput tvf = perThread.doc.perDocTvf;
129 
130  // This is called once, after inverting all occurences
131  // of a given field in the doc. At this point we flush
132  // our hash into the DocWriter.
133 
134  System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
135  System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));
136 
137  perThread.doc.AddField(termsHashPerField.fieldInfo.number);
138 
139  RawPostingList[] postings = termsHashPerField.SortPostings();
140 
141  tvf.WriteVInt(numPostings);
142  byte bits = (byte) (0x0);
143  if (doVectorPositions)
144  bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
145  if (doVectorOffsets)
146  bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
147  tvf.WriteByte(bits);
148 
149  int encoderUpto = 0;
150  int lastTermBytesCount = 0;
151 
152  ByteSliceReader reader = perThread.vectorSliceReader;
153  char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
154  for (int j = 0; j < numPostings; j++)
155  {
156  TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
157  int freq = posting.freq;
158 
159  char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
160  int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
161 
162  // We swap between two encoders to save copying
163  // last Term's byte array
164  UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
165 
166  // TODO: we could do this incrementally
167  UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
168  int termBytesCount = utf8Result.length;
169 
170  // TODO: UTF16toUTF8 could tell us this prefix
171  // Compute common prefix between last term and
172  // this term
173  int prefix = 0;
174  if (j > 0)
175  {
176  byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
177  byte[] termBytes = perThread.utf8Results[encoderUpto].result;
178  while (prefix < lastTermBytesCount && prefix < termBytesCount)
179  {
180  if (lastTermBytes[prefix] != termBytes[prefix])
181  break;
182  prefix++;
183  }
184  }
185  encoderUpto = 1 - encoderUpto;
186  lastTermBytesCount = termBytesCount;
187 
188  int suffix = termBytesCount - prefix;
189  tvf.WriteVInt(prefix);
190  tvf.WriteVInt(suffix);
191  tvf.WriteBytes(utf8Result.result, prefix, suffix);
192  tvf.WriteVInt(freq);
193 
194  if (doVectorPositions)
195  {
196  termsHashPerField.InitReader(reader, posting, 0);
197  reader.WriteTo(tvf);
198  }
199 
200  if (doVectorOffsets)
201  {
202  termsHashPerField.InitReader(reader, posting, 1);
203  reader.WriteTo(tvf);
204  }
205  }
206 
207  termsHashPerField.Reset();
208 
209  // NOTE: we clear, per-field, at the thread level,
210  // because term vectors fully write themselves on each
211  // field; this saves RAM (eg if large doc has two large
212  // fields w/ term vectors on) because we recycle/reuse
213  // all RAM after each field:
214  perThread.termsHashPerThread.Reset(false);
215  }
216 
 // Shrinks the postings hash back down using the largest posting count
 // seen since the last shrink, then resets the high-water mark.
 217  internal void ShrinkHash()
 218  {
 219  termsHashPerField.ShrinkHash(maxNumPostings);
 220  maxNumPostings = 0;
 221  }
222 
223  internal override void Start(IFieldable f)
224  {
225  if (doVectorOffsets)
226  {
227  offsetAttribute = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
228  }
229  else
230  {
231  offsetAttribute = null;
232  }
233  }
234 
235  internal override void NewTerm(RawPostingList p0)
236  {
237 
238  System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));
239 
240  TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
241 
242  p.freq = 1;
243 
244  if (doVectorOffsets)
245  {
246  int startOffset = fieldState.offset + offsetAttribute.StartOffset; ;
247  int endOffset = fieldState.offset + offsetAttribute.EndOffset;
248 
249  termsHashPerField.WriteVInt(1, startOffset);
250  termsHashPerField.WriteVInt(1, endOffset - startOffset);
251  p.lastOffset = endOffset;
252  }
253 
254  if (doVectorPositions)
255  {
256  termsHashPerField.WriteVInt(0, fieldState.position);
257  p.lastPosition = fieldState.position;
258  }
259  }
260 
261  internal override void AddTerm(RawPostingList p0)
262  {
263 
264  System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));
265 
266  TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
267  p.freq++;
268 
269  if (doVectorOffsets)
270  {
271  int startOffset = fieldState.offset + offsetAttribute.StartOffset; ;
272  int endOffset = fieldState.offset + offsetAttribute.EndOffset;
273 
274  termsHashPerField.WriteVInt(1, startOffset - p.lastOffset);
275  termsHashPerField.WriteVInt(1, endOffset - startOffset);
276  p.lastOffset = endOffset;
277  }
278 
279  if (doVectorPositions)
280  {
281  termsHashPerField.WriteVInt(0, fieldState.position - p.lastPosition);
282  p.lastPosition = fieldState.position;
283  }
284  }
285 
 // Hook invoked when the indexer skips an over-long term; intentionally
 // a no-op for term vectors.
 286  internal override void SkippingLongTerm()
 287  {
 288  }
289  }
290 }