Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TermsHashPerField.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Analysis.Tokenattributes;
20 using Lucene.Net.Documents;
21 using Lucene.Net.Support;
22 using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
23 
24 namespace Lucene.Net.Index
25 {
26 
28  {
29  private void InitBlock()
30  {
31  postingsHashHalfSize = postingsHashSize / 2;
32  postingsHashMask = postingsHashSize - 1;
33  postingsHash = new RawPostingList[postingsHashSize];
34  }
35 
        internal TermsHashConsumerPerField consumer;   // downstream consumer receiving NewTerm/AddTerm calls
        internal TermsHashPerField nextPerField;       // next TermsHash in the chain, or null
        internal TermsHashPerThread perThread;         // owning per-thread state (pools, free posting list)
        internal DocumentsWriter.DocState docState;
        internal FieldInvertState fieldState;
        internal ITermAttribute termAtt;               // source of the current token's text

        // Copied from our perThread
        internal CharBlockPool charPool;               // holds interned term text (0xffff-terminated)
        internal IntBlockPool intPool;                 // per-posting "upto" pointers, one per stream
        internal ByteBlockPool bytePool;               // byte slices the streams are written into

        internal int streamCount;                      // parallel byte streams per posting (from consumer)
        internal int numPostingInt;                    // 2 * streamCount

        internal FieldInfo fieldInfo;

        internal bool postingsCompacted;               // true after CompactPostings() until next Reset()
        internal int numPostings;                      // live entries currently in postingsHash
        private int postingsHashSize = 4;              // table capacity; always a power of 2
        private int postingsHashHalfSize;              // postingsHashSize / 2 (grow when reached)
        private int postingsHashMask;                  // postingsHashSize - 1
        private RawPostingList[] postingsHash;         // open-addressed hash table of postings
        private RawPostingList p;                      // posting for the term currently being added
60 
        /// <summary>
        /// Builds per-field term-hashing state. Pools are shared with the
        /// owning <paramref name="perThread"/>; when
        /// <paramref name="nextPerThread"/> is non-null a chained
        /// TermsHashPerField is created so a second TermsHash consumes the
        /// same field.
        /// </summary>
        public TermsHashPerField(DocInverterPerField docInverterPerField, TermsHashPerThread perThread, TermsHashPerThread nextPerThread, FieldInfo fieldInfo)
        {
            InitBlock();
            this.perThread = perThread;
            intPool = perThread.intPool;
            charPool = perThread.charPool;
            bytePool = perThread.bytePool;
            docState = perThread.docState;
            fieldState = docInverterPerField.fieldState;
            // Register with the consumer first: it determines how many
            // parallel byte streams each posting needs.
            this.consumer = perThread.consumer.AddField(this, fieldInfo);
            streamCount = consumer.GetStreamCount();
            numPostingInt = 2 * streamCount;
            this.fieldInfo = fieldInfo;
            if (nextPerThread != null)
                nextPerField = (TermsHashPerField) nextPerThread.AddField(docInverterPerField, fieldInfo);
            else
                nextPerField = null;
        }
79 
80  internal void ShrinkHash(int targetSize)
81  {
82  System.Diagnostics.Debug.Assert(postingsCompacted || numPostings == 0);
83 
84  int newSize = 4;
85 
86  if (newSize != postingsHash.Length)
87  {
88  postingsHash = new RawPostingList[newSize];
89  postingsHashSize = newSize;
90  postingsHashHalfSize = newSize / 2;
91  postingsHashMask = newSize - 1;
92  }
93  System.Array.Clear(postingsHash,0,postingsHash.Length);
94  }
95 
96  public void Reset()
97  {
98  if (!postingsCompacted)
99  CompactPostings();
100  System.Diagnostics.Debug.Assert(numPostings <= postingsHash.Length);
101  if (numPostings > 0)
102  {
103  perThread.termsHash.RecyclePostings(postingsHash, numPostings);
104  Array.Clear(postingsHash, 0, numPostings);
105  numPostings = 0;
106  }
107  postingsCompacted = false;
108  if (nextPerField != null)
109  nextPerField.Reset();
110  }
111 
        /// <summary>
        /// Discards all buffered postings for this field, then aborts the
        /// chained field (if any).
        /// </summary>
        public override void Abort()
        {
            // NOTE(review): lock(this) matches the synchronization used by
            // CompactPostings; keep them consistent.
            lock (this)
            {
                Reset();
                if (nextPerField != null)
                    nextPerField.Abort();
            }
        }
121 
        /// <summary>
        /// Positions <paramref name="reader"/> over the byte slice of the
        /// given <paramref name="stream"/> for posting <paramref name="p"/>.
        /// </summary>
        public void InitReader(ByteSliceReader reader, RawPostingList p, int stream)
        {
            System.Diagnostics.Debug.Assert(stream < streamCount);
            // p.intStart is a global offset: split into buffer index and
            // position within that buffer.
            int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
            int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
            // Stream slices start back-to-back at byteStart (one
            // FIRST_LEVEL_SIZE slice per stream); ints[upto + stream] is the
            // stream's current end offset.
            reader.Init(bytePool, p.byteStart + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
        }
129 
130  private void CompactPostings()
131  {
132  lock (this)
133  {
134  int upto = 0;
135  for (int i = 0; i < postingsHashSize; i++)
136  {
137  if (postingsHash[i] != null)
138  {
139  if (upto < i)
140  {
141  postingsHash[upto] = postingsHash[i];
142  postingsHash[i] = null;
143  }
144  upto++;
145  }
146  }
147 
148  System.Diagnostics.Debug.Assert(upto == numPostings);
149  postingsCompacted = true;
150  }
151  }
152 
        /// <summary>Collapse the hash table &amp; sort in-place. </summary>
        /// <returns>The postings array; the first numPostings entries are
        /// sorted by term (see <see cref="ComparePostings"/>).</returns>
        public RawPostingList[] SortPostings()
        {
            CompactPostings();
            // After compaction the live entries occupy [0, numPostings);
            // sort just that prefix.
            QuickSort(postingsHash, 0, numPostings - 1);
            return postingsHash;
        }
160 
        /// <summary>
        /// In-place quicksort of postings[lo..hi] (inclusive), ordered by
        /// <see cref="ComparePostings"/>, with median-of-three pivot
        /// selection.
        /// </summary>
        internal void QuickSort(RawPostingList[] postings, int lo, int hi)
        {
            if (lo >= hi)
                return ;
            else if (hi == 1 + lo)
            {
                // Two elements: a single compare-and-swap finishes the job.
                if (ComparePostings(postings[lo], postings[hi]) > 0)
                {
                    RawPostingList tmp = postings[lo];
                    postings[lo] = postings[hi];
                    postings[hi] = tmp;
                }
                return ;
            }

            // Unsigned shift avoids sign trouble if (lo + hi) overflows.
            int mid = Number.URShift((lo + hi), 1);

            // Order lo/mid/hi so postings[mid] becomes the median of three.
            if (ComparePostings(postings[lo], postings[mid]) > 0)
            {
                RawPostingList tmp = postings[lo];
                postings[lo] = postings[mid];
                postings[mid] = tmp;
            }

            if (ComparePostings(postings[mid], postings[hi]) > 0)
            {
                RawPostingList tmp = postings[mid];
                postings[mid] = postings[hi];
                postings[hi] = tmp;

                if (ComparePostings(postings[lo], postings[mid]) > 0)
                {
                    RawPostingList tmp2 = postings[lo];
                    postings[lo] = postings[mid];
                    postings[mid] = tmp2;
                }
            }

            int left = lo + 1;
            int right = hi - 1;

            if (left >= right)
                return ;

            RawPostingList partition = postings[mid];

            // Hoare-style partition: advance from both ends, swapping
            // out-of-place elements, until the cursors meet.
            for (; ; )
            {
                while (ComparePostings(postings[right], partition) > 0)
                    --right;

                while (left < right && ComparePostings(postings[left], partition) <= 0)
                    ++left;

                if (left < right)
                {
                    RawPostingList tmp = postings[left];
                    postings[left] = postings[right];
                    postings[right] = tmp;
                    --right;
                }
                else
                {
                    break;
                }
            }

            // Recurse on both halves; 'left' is the final split point.
            QuickSort(postings, lo, left);
            QuickSort(postings, left + 1, hi);
        }
231 
        /// <summary>Compares term text for two Posting instance and
        /// returns -1 if p1 &lt; p2; 1 if p1 &gt; p2; else 0.
        /// Term text lives in the shared char pool and is terminated by the
        /// sentinel char 0xffff, so the shorter term sorts first.
        /// </summary>
        internal int ComparePostings(RawPostingList p1, RawPostingList p2)
        {

            if (p1 == p2)
                return 0;

            // Resolve each textStart into (buffer, offset) within the pool.
            char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
            char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

            System.Diagnostics.Debug.Assert(text1 != text2 || pos1 != pos2);

            while (true)
            {
                char c1 = text1[pos1++];
                char c2 = text2[pos2++];
                if (c1 != c2)
                {
                    // 0xffff is the end-of-term sentinel: the ended term is
                    // the smaller one.
                    if (0xffff == c2)
                        return 1;
                    else if (0xffff == c1)
                        return - 1;
                    else
                        return c1 - c2;
                }
                else
                    // This method should never compare equal postings
                    // unless p1==p2
                    System.Diagnostics.Debug.Assert(c1 != 0xffff);
            }
        }
267 
        /// <summary>Test whether the text for current RawPostingList p equals
        /// current tokenText.
        /// </summary>
        private bool PostingEquals(char[] tokenText, int tokenTextLen)
        {

            // Locate p's interned text inside the shared char pool.
            char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            System.Diagnostics.Debug.Assert(text != null);
            int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

            int tokenPos = 0;
            for (; tokenPos < tokenTextLen; pos++, tokenPos++)
                if (tokenText[tokenPos] != text[pos])
                    return false;
            // Equal only if the pooled text also ends exactly here (0xffff is
            // the end-of-term sentinel written by Add()).
            return 0xffff == text[pos];
        }
284 
        private bool doCall;       // last result of consumer.Start(fields, count)
        private bool doNextCall;   // last result of nextPerField.Start(fields, count); stays false when unchained
287 
288  internal override void Start(IFieldable f)
289  {
290  termAtt = fieldState.attributeSource.AddAttribute<ITermAttribute>();
291  consumer.Start(f);
292  if (nextPerField != null)
293  {
294  nextPerField.Start(f);
295  }
296  }
297 
298  internal override bool Start(IFieldable[] fields, int count)
299  {
300  doCall = consumer.Start(fields, count);
301  if (nextPerField != null)
302  doNextCall = nextPerField.Start(fields, count);
303  return doCall || doNextCall;
304  }
305 
        // Secondary entry point (for 2nd & subsequent TermsHash),
        // because token text has already been "interned" into
        // textStart, so we hash by textStart
        public void Add(int textStart)
        {

            // textStart itself is the hash code, since equal terms share an
            // interned textStart.
            int code = textStart;

            int hashPos = code & postingsHashMask;

            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && p.textStart != textStart)
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;      // odd increment => visits every slot
                do
                {
                    code += inc;
                    hashPos = code & postingsHashMask;
                    p = postingsHash[hashPos];
                }
                while (p != null && p.textStart != textStart);
            }

            if (p == null)
            {

                // First time we are seeing this token since we last
                // flushed the hash.

                // Refill?
                if (0 == perThread.freePostingsCount)
                    perThread.MorePostings();

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                p.textStart = textStart;

                System.Diagnostics.Debug.Assert(postingsHash [hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                // Grow at 50% load to keep probe chains short.
                if (numPostings == postingsHashHalfSize)
                    RehashPostings(2 * postingsHashSize);

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                    intPool.NextBuffer();

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                    bytePool.NextBuffer();

                // Reserve one int pointer per stream for this posting.
                intUptos = intPool.buffer;
                intUptoStart = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                // Allocate the first byte slice for each stream and record
                // its global offset in the int pointers.
                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.NewTerm(p);
            }
            else
            {
                // Existing term: re-point intUptos at its stream pointers.
                intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.AddTerm(p);
            }
        }
387 
        // Primary entry point (for first TermsHash)
        internal override void Add()
        {

            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // We are first in the chain so we must "intern" the
            // term text into textStart address

            // Get the text of this term.
            char[] tokenText = termAtt.TermBuffer();
            ;
            int tokenTextLen = termAtt.TermLength();

            // Compute hashcode & replace any invalid UTF16 sequences
            // (scanning back-to-front so a low surrogate can look behind for
            // its matching high surrogate).
            int downto = tokenTextLen;
            int code = 0;
            while (downto > 0)
            {
                char ch = tokenText[--downto];

                if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
                {
                    if (0 == downto)
                    {
                        // Unpaired
                        ch = tokenText[downto] = (char) (UnicodeUtil.UNI_REPLACEMENT_CHAR);
                    }
                    else
                    {
                        char ch2 = tokenText[downto - 1];
                        if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                        {
                            // OK: high followed by low. This is a valid
                            // surrogate pair.
                            code = ((code * 31) + ch) * 31 + ch2;
                            downto--;
                            continue;
                        }
                        else
                        {
                            // Unpaired
                            ch = tokenText[downto] = (char) (UnicodeUtil.UNI_REPLACEMENT_CHAR);
                        }
                    }
                }
                else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff))
                {
                    // Unpaired or 0xffff (0xffff would collide with the
                    // end-of-term sentinel used in the char pool)
                    ch = tokenText[downto] = (char) (UnicodeUtil.UNI_REPLACEMENT_CHAR);
                }

                code = (code * 31) + ch;
            }

            int hashPos = code & postingsHashMask;

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && !PostingEquals(tokenText, tokenTextLen))
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;      // odd increment => visits every slot
                do
                {
                    code += inc;
                    hashPos = code & postingsHashMask;
                    p = postingsHash[hashPos];
                }
                while (p != null && !PostingEquals(tokenText, tokenTextLen));
            }

            if (p == null)
            {

                // First time we are seeing this token since we last
                // flushed the hash.
                int textLen1 = 1 + tokenTextLen;   // +1 for the 0xffff terminator
                if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
                    {
                        // Just skip this term, to remain as robust as
                        // possible during indexing. A TokenFilter
                        // can be inserted into the analyzer chain if
                        // other behavior is wanted (pruning the term
                        // to a prefix, throwing an exception, etc).

                        // Remember a 30-char prefix for the skipped-term
                        // diagnostic (term is longer than a block, so >= 30).
                        if (docState.maxTermPrefix == null)
                            docState.maxTermPrefix = new System.String(tokenText, 0, 30);

                        consumer.SkippingLongTerm();
                        return ;
                    }
                    charPool.NextBuffer();
                }

                // Refill?
                if (0 == perThread.freePostingsCount)
                    perThread.MorePostings();

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                // Intern the term text into the char pool, terminated by the
                // 0xffff sentinel.
                char[] text = charPool.buffer;
                int textUpto = charPool.charUpto;
                p.textStart = textUpto + charPool.charOffset;
                charPool.charUpto += textLen1;
                Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
                text[textUpto + tokenTextLen] = (char) (0xffff);

                System.Diagnostics.Debug.Assert(postingsHash [hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                // Grow at 50% load to keep probe chains short.
                if (numPostings == postingsHashHalfSize)
                    RehashPostings(2 * postingsHashSize);

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                    intPool.NextBuffer();

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                    bytePool.NextBuffer();

                // Reserve one int pointer per stream for this posting.
                intUptos = intPool.buffer;
                intUptoStart = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                // Allocate the first byte slice for each stream and record
                // its global offset in the int pointers.
                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.NewTerm(p);
            }
            else
            {
                // Existing term: re-point intUptos at its stream pointers.
                intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.AddTerm(p);
            }

            // Hand the interned textStart to the next TermsHash in the chain.
            if (doNextCall)
                nextPerField.Add(p.textStart);
        }
541 
        internal int[] intUptos;     // int buffer holding the current posting's stream pointers
        internal int intUptoStart;   // offset of the posting's first stream pointer within intUptos
544 
        /// <summary>
        /// Appends one byte to the given stream of the current posting,
        /// growing the stream's byte slice when it is full.
        /// </summary>
        internal void WriteByte(int stream, byte b)
        {
            // Current global write position for this stream.
            int upto = intUptos[intUptoStart + stream];
            byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
            System.Diagnostics.Debug.Assert(bytes != null);
            int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
            // A non-zero byte at the write position marks the end of the
            // slice (slices are zero-filled until their terminator).
            if (bytes[offset] != 0)
            {
                // End of slice; allocate a new one
                offset = bytePool.AllocSlice(bytes, offset);
                bytes = bytePool.buffer;
                intUptos[intUptoStart + stream] = offset + bytePool.byteOffset;
            }
            bytes[offset] = b;
            (intUptos[intUptoStart + stream])++;
        }
561 
562  public void WriteBytes(int stream, byte[] b, int offset, int len)
563  {
564  // TODO: optimize
565  int end = offset + len;
566  for (int i = offset; i < end; i++)
567  WriteByte(stream, b[i]);
568  }
569 
570  internal void WriteVInt(int stream, int i)
571  {
572  System.Diagnostics.Debug.Assert(stream < streamCount);
573  while ((i & ~ 0x7F) != 0)
574  {
575  WriteByte(stream, (byte) ((i & 0x7f) | 0x80));
576  i = Number.URShift(i, 7);
577  }
578  WriteByte(stream, (byte) i);
579  }
580 
581  internal override void Finish()
582  {
583  consumer.Finish();
584  if (nextPerField != null)
585  nextPerField.Finish();
586  }
587 
        /// <summary>Called when the postings hash is too full (&gt; 50%
        /// occupied) or too empty (&lt; 20% occupied) to resize it to
        /// <paramref name="newSize"/> (a power of 2) and re-insert every
        /// entry.
        /// </summary>
        internal void RehashPostings(int newSize)
        {

            int newMask = newSize - 1;

            RawPostingList[] newHash = new RawPostingList[newSize];
            for (int i = 0; i < postingsHashSize; i++)
            {
                RawPostingList p0 = postingsHash[i];
                if (p0 != null)
                {
                    int code;
                    if (perThread.primary)
                    {
                        // Primary TermsHash hashes by term text: recompute the
                        // same code Add() produced by walking the interned
                        // text (terminated by 0xffff) back-to-front.
                        int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
                        char[] text = charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                        int pos = start;
                        while (text[pos] != 0xffff)
                            pos++;
                        code = 0;
                        while (pos > start)
                            code = (code * 31) + text[--pos];
                    }
                    else
                        // Secondary TermsHash hashes by the interned textStart.
                        code = p0.textStart;

                    int hashPos = code & newMask;
                    System.Diagnostics.Debug.Assert(hashPos >= 0);
                    if (newHash[hashPos] != null)
                    {
                        // Same odd-increment probing as Add().
                        int inc = ((code >> 8) + code) | 1;
                        do
                        {
                            code += inc;
                            hashPos = code & newMask;
                        }
                        while (newHash[hashPos] != null);
                    }
                    newHash[hashPos] = p0;
                }
            }

            postingsHashMask = newMask;
            postingsHash = newHash;
            postingsHashSize = newSize;
            postingsHashHalfSize = newSize >> 1;
        }
638  }
639 }