Lucene.Net 3.0.3
Lucene.Net is a .NET port of the Java Lucene indexing library.
FreqProxTermsWriter.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;

namespace Lucene.Net.Index
{
    internal sealed class FreqProxTermsWriter : TermsHashConsumer
    {
        public override TermsHashConsumerPerThread AddThread(TermsHashPerThread perThread)
        {
            return new FreqProxTermsWriterPerThread(perThread);
        }

        internal override void CreatePostings(RawPostingList[] postings, int start, int count)
        {
            int end = start + count;
            for (int i = start; i < end; i++)
                postings[i] = new PostingList();
        }

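        // In-memory term text is terminated by the 0xffff sentinel char, so the
        // comparison below can stop as soon as either term reaches its sentinel.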
        private static int compareText(char[] text1, int pos1, char[] text2, int pos2)
        {
            while (true)
            {
                char c1 = text1[pos1++];
                char c2 = text2[pos2++];
                if (c1 != c2)
                {
                    if (0xffff == c2)
                        return 1;
                    else if (0xffff == c1)
                        return -1;
                    else
                        return c1 - c2;
                }
                else if (0xffff == c1)
                    return 0;
            }
        }

        internal override void CloseDocStore(SegmentWriteState state)
        {
        }

        public override void Abort()
        {
        }

        // TODO: would be nice to factor out more of this, eg the
        // FreqProxFieldMergeState, and code to visit all Fields
        // under the same FieldInfo together, up into TermsHash*.
        // Other writers would presumably share a lot of this...
        public override void Flush(IDictionary<TermsHashConsumerPerThread, ICollection<TermsHashConsumerPerField>> threadsAndFields, SegmentWriteState state)
        {
            // Gather all FieldData's that have postings, across all
            // ThreadStates
            var allFields = new List<FreqProxTermsWriterPerField>();

            foreach (var entry in threadsAndFields)
            {
                var fields = entry.Value;

                foreach (var i in fields)
                {
                    var perField = (FreqProxTermsWriterPerField) i;
                    if (perField.termsHashPerField.numPostings > 0)
                        allFields.Add(perField);
                }
            }

            // Sort by field name
            allFields.Sort();
            int numAllFields = allFields.Count;

            // TODO: allow Lucene user to customize this consumer:
            FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
            /*
            Current writer chain:
              FormatPostingsFieldsConsumer
                -> IMPL: FormatPostingsFieldsWriter
                  -> FormatPostingsTermsConsumer
                    -> IMPL: FormatPostingsTermsWriter
                      -> FormatPostingsDocConsumer
                        -> IMPL: FormatPostingsDocWriter
                          -> FormatPostingsPositionsConsumer
                            -> IMPL: FormatPostingsPositionsWriter
            */

            int start = 0;
            while (start < numAllFields)
            {
                FieldInfo fieldInfo = allFields[start].fieldInfo;
                System.String fieldName = fieldInfo.name;

                int end = start + 1;
                while (end < numAllFields && allFields[end].fieldInfo.name.Equals(fieldName))
                    end++;

                FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end - start];
                for (int i = start; i < end; i++)
                {
                    fields[i - start] = allFields[i];

                    // Aggregate the storePayload as seen by the same
                    // field across multiple threads
                    fieldInfo.storePayloads |= fields[i - start].hasPayloads;
                }

                // If this field has postings then add them to the
                // segment
                AppendPostings(fields, consumer);

                for (int i = 0; i < fields.Length; i++)
                {
                    TermsHashPerField perField = fields[i].termsHashPerField;
                    int numPostings = perField.numPostings;
                    perField.Reset();
                    perField.ShrinkHash(numPostings);
                    fields[i].Reset();
                }

                start = end;
            }

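            // Reset each thread's TermsHash state now that this segment's
            // postings have been flushed.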
            foreach (var entry in threadsAndFields)
            {
                var perThread = (FreqProxTermsWriterPerThread) entry.Key;
                perThread.termsHashPerThread.Reset(true);
            }

            consumer.Finish();
        }

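        // Scratch buffer, reused and grown as needed, for copying payload
        // bytes out of the ByteSliceReader in AppendPostings.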
        private byte[] payloadBuffer;

        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        internal void AppendPostings(FreqProxTermsWriterPerField[] fields, FormatPostingsFieldsConsumer consumer)
        {
            int numFields = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.NextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            FormatPostingsTermsConsumer termsConsumer = consumer.AddField(fields[0].fieldInfo);

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            bool currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;

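            // N-way merge: on each pass, find the smallest pending term across
            // all per-thread states and interleave the postings that share it.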
            while (numFields > 0)
            {
                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text = mergeStates[i].text;
                    int textOffset = mergeStates[i].textOffset;
                    int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge = 1;
                    }
                    else if (cmp == 0)
                        termStates[numToMerge++] = mergeStates[i];
                }

                FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(termStates[0].text, termStates[0].textOffset);

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term. Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {
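                    // Pick the state with the smallest docID to write next.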
                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                        if (termStates[i].docID < minState.docID)
                            minState = termStates[i];

                    int termDocFreq = minState.termFreq;

                    FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(minState.docID, termDocFreq);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTermFreqAndPositions)
                    {
                        // omitTermFreqAndPositions == false so we do write positions &
                        // payload
                        int position = 0;
                        for (int j = 0; j < termDocFreq; j++)
                        {
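                            // Each prox entry is a VInt holding the position delta
                            // shifted left by one bit; the low bit flags whether a
                            // payload (length + bytes) follows.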
                            int code = prox.ReadVInt();
                            position += (code >> 1);

                            int payloadLength;
                            if ((code & 1) != 0)
                            {
                                // This position has a payload
                                payloadLength = prox.ReadVInt();

                                if (payloadBuffer == null || payloadBuffer.Length < payloadLength)
                                    payloadBuffer = new byte[payloadLength];

                                prox.ReadBytes(payloadBuffer, 0, payloadLength);
                            }
                            else
                                payloadLength = 0;

                            posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength);
                        } // End for

                        posConsumer.Finish();
                    }

                    if (!minState.NextDoc())
                    {
                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                            if (termStates[i] != minState)
                                termStates[upto++] = termStates[i];
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.NextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                                if (mergeStates[i] != minState)
                                    mergeStates[upto++] = mergeStates[i];
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                docConsumer.Finish();
            }

            termsConsumer.Finish();
        }

        internal UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();

        internal sealed class PostingList : RawPostingList
        {
            internal int docFreq;      // # times this term occurs in the current doc
            internal int lastDocID;    // Last docID where this term occurred
            internal int lastDocCode;  // Code for prior doc
            internal int lastPosition; // Last position where this term occurred
        }

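        // Each posting occupies the base RawPostingList size plus the four
        // int fields declared in PostingList above.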
        internal override int BytesPerPosting()
        {
            return RawPostingList.BYTES_SIZE + 4 * DocumentsWriter.INT_NUM_BYTE;
        }
    }
}