Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TermInfosWriter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 using Directory = Lucene.Net.Store.Directory;
21 using IndexOutput = Lucene.Net.Store.IndexOutput;
22 using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
23 
24 namespace Lucene.Net.Index
25 {
26 
/// <summary>
/// Stores a monotonically increasing set of &lt;Term, TermInfo&gt; pairs in a
/// Directory. A TermInfos can be written once, in order.
/// <para>
/// The writer created through the internal constructor writes the term
/// dictionary (<c>segment + ".tis"</c>) and owns a second, private instance
/// that writes the term index (<c>segment + ".tii"</c>). Both files start with
/// the same header: format (int), entry count (long, patched in by
/// <see cref="Dispose" />), indexInterval (int), skipInterval (int) and
/// maxSkipLevels (int).
/// </para>
/// </summary>
internal sealed class TermInfosWriter : IDisposable
{
    /// <summary>The file format version, a negative number. </summary>
    public const int FORMAT = -3;

    // Changed strings to true utf8 with length-in-bytes not
    // length-in-chars
    public const int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;

    // NOTE: always change this if you switch to a new format!
    public static readonly int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;

    // Set once Dispose() has run so that repeated calls are no-ops.
    private bool isDisposed;

    private FieldInfos fieldInfos;
    private IndexOutput output;

    // TermInfo of the most recently added term; pointers are written as deltas against it.
    private TermInfo lastTi = new TermInfo();

    // Number of entries written so far; stored in the header by Dispose().
    private long size;

    // TODO: the default values for these two parameters should be settable from
    // IndexWriter. However, once that's done, folks will start setting them to
    // ridiculous values and complaining that things don't work well, as with
    // mergeFactor. So, let's wait until a number of folks find that alternate
    // values work better. Note that both of these values are stored in the
    // segment, so that it's safe to change these w/o rebuilding all indexes.

    /// <summary>Expert: The fraction of terms in the "dictionary" which should be stored
    /// in RAM. Smaller values use more memory, but make searching slightly
    /// faster, while larger values use less memory and make searching slightly
    /// slower. Searching is typically not dominated by dictionary lookup, so
    /// tweaking this is rarely useful.
    /// </summary>
    internal int indexInterval = 128;

    /// <summary>Expert: The fraction of <see cref="TermDocs" /> entries stored in skip tables,
    /// used to accelerate <see cref="TermDocs.SkipTo(int)" />. Larger values result in
    /// smaller indexes, greater acceleration, but fewer accelerable cases, while
    /// smaller values result in bigger indexes, less acceleration and more
    /// accelerable cases. More detailed experiments would be useful here.
    /// </summary>
    internal int skipInterval = 16;

    /// <summary>Expert: The maximum number of skip levels. Smaller values result in
    /// slightly smaller indexes, but slower skipping in big posting lists.
    /// </summary>
    internal int maxSkipLevels = 10;

    // Index writer only: file pointer of the companion output recorded with the previous index entry.
    private long lastIndexPointer;

    // True for the instance writing the ".tii" file, false for the ".tis" instance.
    private bool isIndex;

    // UTF-8 bytes of the previously written term; grown on demand by WriteTerm.
    private byte[] lastTermBytes = new byte[10];
    private int lastTermBytesLength = 0;
    private int lastFieldNumber = -1;

    // The companion writer: the index writer for the dictionary writer, and vice versa.
    private TermInfosWriter other;

    // Scratch buffer reused by Add(Term, TermInfo) for the UTF-16 -> UTF-8 conversion.
    private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();

    // Currently used only by assert statements
    internal UnicodeUtil.UTF16Result utf16Result1;
    internal UnicodeUtil.UTF16Result utf16Result2;

    /// <summary>
    /// Creates the term dictionary writer for <paramref name="segment"/> together
    /// with its companion term index writer.
    /// </summary>
    /// <param name="directory">Directory in which both outputs are created.</param>
    /// <param name="segment">Segment name used as the file name prefix.</param>
    /// <param name="fis">Field infos used to map field names to numbers and back.</param>
    /// <param name="interval">Value stored in <see cref="indexInterval"/>.</param>
    internal TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval)
    {
        Initialize(directory, segment, fis, interval, false);

        bool success = false;
        try
        {
            other = new TermInfosWriter(directory, segment, fis, interval, true);
            other.other = this;
            success = true;
        }
        finally
        {
            // If the index writer could not be created nobody will ever call
            // Dispose() on this instance, so release the dictionary output here.
            if (!success)
            {
                DisposeQuietly(output);
            }
        }
    }

    /// <summary>Creates a single writer without a companion; used for the index writer.</summary>
    private TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval, bool isIndex)
    {
        Initialize(directory, segment, fis, interval, isIndex);
    }

    /// <summary>
    /// Stores the configuration, creates the output file (".tii" for the index
    /// writer, ".tis" otherwise) and writes the file header.
    /// </summary>
    private void Initialize(Directory directory, System.String segment, FieldInfos fis, int interval, bool isi)
    {
        indexInterval = interval;
        fieldInfos = fis;
        isIndex = isi;
        output = directory.CreateOutput(segment + (isIndex ? ".tii" : ".tis"));

        bool success = false;
        try
        {
            output.WriteInt(FORMAT_CURRENT); // write format
            output.WriteLong(0); // leave space for size
            output.WriteInt(indexInterval); // write indexInterval
            output.WriteInt(skipInterval); // write skipInterval
            output.WriteInt(maxSkipLevels); // write maxSkipLevels

            // Only executed in builds where Debug.Assert is compiled in; the
            // UTF-16 buffers are needed solely by the ordering assertion in Add.
            System.Diagnostics.Debug.Assert(InitUTF16Results());
            success = true;
        }
        finally
        {
            // The constructor is about to fail, so the caller never gets an
            // instance to dispose: close the freshly created output ourselves.
            if (!success)
            {
                DisposeQuietly(output);
            }
        }
    }

    /// <summary>
    /// Disposes <paramref name="stream"/> while suppressing any exception it
    /// throws. Only used on failure paths, where the original exception is
    /// already propagating and must not be replaced by a secondary one.
    /// </summary>
    private static void DisposeQuietly(IndexOutput stream)
    {
        if (stream == null)
        {
            return;
        }

        try
        {
            stream.Dispose();
        }
        catch (Exception)
        {
            // Intentionally ignored: see summary.
        }
    }

    /// <summary>
    /// Converts the term text to UTF-8 and adds the pair through
    /// <see cref="Add(int, byte[], int, TermInfo)"/>.
    /// </summary>
    internal void Add(Term term, TermInfo ti)
    {
        UnicodeUtil.UTF16toUTF8(term.Text, 0, term.Text.Length, utf8Result);
        Add(fieldInfos.FieldNumber(term.Field), utf8Result.result, utf8Result.length, ti);
    }

    // Currently used only by assert statements
    private bool InitUTF16Results()
    {
        utf16Result1 = new UnicodeUtil.UTF16Result();
        utf16Result2 = new UnicodeUtil.UTF16Result();
        return true;
    }

    /// <summary>
    /// Compares the previously written term with the given one: field names
    /// first (ordinal), then the term text in UTF-16 code unit order.
    /// Returns a negative value when the previous term sorts first.
    /// Currently used only by assert statements.
    /// </summary>
    private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
    {
        if (lastFieldNumber != fieldNumber)
        {
            int cmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
            // If there is a field named "" (empty string) then we
            // will get 0 on this comparison, yet, it's "OK". But
            // it's not OK if two different field numbers map to
            // the same name.
            if (cmp != 0 || lastFieldNumber != -1)
            {
                return cmp;
            }
        }

        UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
        UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

        int len = Math.Min(utf16Result1.length, utf16Result2.length);
        for (int i = 0; i < len; i++)
        {
            char ch1 = utf16Result1.result[i];
            char ch2 = utf16Result2.result[i];
            if (ch1 != ch2)
            {
                return ch1 - ch2;
            }
        }

        // One text is a prefix of the other: the shorter one sorts first.
        return utf16Result1.length - utf16Result2.length;
    }

    /// <summary>Adds a new &lt;&lt;fieldNumber, termBytes&gt;, TermInfo&gt; pair to the set.
    /// Term must be lexicographically greater than all previous Terms added.
    /// TermInfo pointers must be positive and greater than all previous.
    /// </summary>
    /// <param name="fieldNumber">Number of the term's field.</param>
    /// <param name="termBytes">Buffer holding the UTF-8 encoded term text.</param>
    /// <param name="termBytesLength">Number of valid bytes in <paramref name="termBytes"/>.</param>
    /// <param name="ti">Term info to store for the term.</param>
    internal void Add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
    {
        // The second clause admits the very first index entry, which is the
        // empty "previous" term and therefore compares equal rather than less.
        System.Diagnostics.Debug.Assert(CompareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
            (isIndex && termBytesLength == 0 && lastTermBytesLength == 0),
            "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + " (number " + fieldNumber + ")" +
            " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
            " text=" + System.Text.Encoding.UTF8.GetString(termBytes, 0, termBytesLength) + " lastText=" + System.Text.Encoding.UTF8.GetString(lastTermBytes, 0, lastTermBytesLength));

        System.Diagnostics.Debug.Assert(ti.freqPointer >= lastTi.freqPointer, "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
        System.Diagnostics.Debug.Assert(ti.proxPointer >= lastTi.proxPointer, "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

        // Must happen before WriteTerm: the index entry is the PREVIOUS term,
        // and the index writer records this output's position before the
        // current term is written.
        if (!isIndex && size % indexInterval == 0)
        {
            other.Add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
        }

        WriteTerm(fieldNumber, termBytes, termBytesLength); // write term

        output.WriteVInt(ti.docFreq); // write doc freq
        output.WriteVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
        output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

        if (ti.docFreq >= skipInterval)
        {
            output.WriteVInt(ti.skipOffset);
        }

        if (isIndex)
        {
            // For the index writer "other" is the dictionary writer.
            output.WriteVLong(other.output.FilePointer - lastIndexPointer);
            lastIndexPointer = other.output.FilePointer; // write pointer
        }

        lastFieldNumber = fieldNumber;
        lastTi.Set(ti);
        size++;
    }

    /// <summary>
    /// Writes the term as (shared prefix length, suffix length, suffix bytes,
    /// field number) relative to the previously written term, then remembers
    /// the term's bytes as the new "previous term".
    /// </summary>
    private void WriteTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
    {
        // TODO: UTF16toUTF8 could tell us this prefix
        // Compute prefix in common with last term:
        int start = 0;
        int limit = Math.Min(termBytesLength, lastTermBytesLength);
        while (start < limit)
        {
            if (termBytes[start] != lastTermBytes[start])
            {
                break;
            }
            start++;
        }

        int length = termBytesLength - start;
        output.WriteVInt(start); // write shared prefix length
        output.WriteVInt(length); // write delta length
        output.WriteBytes(termBytes, start, length); // write delta bytes
        output.WriteVInt(fieldNumber); // write field num

        if (lastTermBytes.Length < termBytesLength)
        {
            // Grow by 1.5x; only the shared prefix has to be carried over
            // because the rest is overwritten by the copy below.
            byte[] newArray = new byte[(int) (termBytesLength * 1.5)];
            Array.Copy(lastTermBytes, 0, newArray, 0, start);
            lastTermBytes = newArray;
        }
        Array.Copy(termBytes, start, lastTermBytes, start, length);
        lastTermBytesLength = termBytesLength;
    }

    /// <summary>
    /// Called to complete TermInfos creation: writes the entry count into the
    /// header, closes this writer's output and, for the dictionary writer,
    /// disposes the companion index writer. The outputs are released even if
    /// writing the count fails. Calls after the first are no-ops.
    /// </summary>
    public void Dispose()
    {
        // Move to protected method if class becomes unsealed
        if (isDisposed)
        {
            return;
        }

        // Flag first: after a failure the outputs are closed anyway, so a
        // retry could only fail again on a disposed stream.
        isDisposed = true;

        try
        {
            output.Seek(4); // write size after format
            output.WriteLong(size);
        }
        finally
        {
            try
            {
                output.Dispose();
            }
            finally
            {
                if (!isIndex && other != null)
                {
                    other.Dispose();
                }
            }
        }
    }
}
250 }