Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
NumericUtils.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Documents;
20 using Lucene.Net.Search;
21 using Lucene.Net.Support;
22 using NumericTokenStream = Lucene.Net.Analysis.NumericTokenStream;
23 
24 namespace Lucene.Net.Util
25 {
26 
27  /// <summary> This is a helper class to generate prefix-encoded representations for numerical values
28  /// and supplies converters to represent float/double values as sortable integers/longs.
29  ///
30  /// <p/>To quickly execute range queries in Apache Lucene, a range is divided recursively
31  /// into multiple intervals for searching: The center of the range is searched only with
32  /// the lowest possible precision in the trie, while the boundaries are matched
33  /// more exactly. This reduces the number of terms dramatically.
34  ///
35  /// <p/>This class generates terms to achive this: First the numerical integer values need to
36  /// be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
37  /// and the bits are converted to ASCII chars with each 7 bit. The resulting string is
38  /// sortable like the original integer value. Each value is also prefixed
39  /// (in the first char) by the <c>shift</c> value (number of bits removed) used
40  /// during encoding.
41  ///
42  /// <p/>To also index floating point numbers, this class supplies two methods to convert them
43  /// to integer values by changing their bit layout: <see cref="DoubleToSortableLong" />,
44  /// <see cref="FloatToSortableInt" />. You will have no precision loss by
45  /// converting floating point numbers to integers and back (only that the integer form
46  /// is not usable). Other data types like dates can easily converted to longs or ints (e.g.
47  /// date to long: <see cref="DateTime" />).
48  ///
49  /// <p/>For easy usage, the trie algorithm is implemented for indexing inside
50  /// <see cref="NumericTokenStream" /> that can index <c>int</c>, <c>long</c>,
51  /// <c>float</c>, and <c>double</c>. For querying,
52  /// <see cref="NumericRangeQuery{T}" /> and <see cref="NumericRangeFilter{T}" /> implement the query part
53  /// for the same data types.
54  ///
55  /// <p/>This class can also be used, to generate lexicographically sortable (according
56  /// <see cref="String.CompareTo(String)" />) representations of numeric data types for other
57  /// usages (e.g. sorting).
58  ///
59  /// <p/><font color="red"><b>NOTE:</b> This API is experimental and
60  /// might change in incompatible ways in the next release.</font>
61  ///
62  /// </summary>
63  /// <since> 2.9
64  /// </since>
65  public sealed class NumericUtils
66  {
67 
68  private NumericUtils()
69  {
70  } // no instance!
71 
72  /// <summary> The default precision step used by <see cref="NumericField" />, <see cref="NumericTokenStream" />,
73  /// <see cref="NumericRangeQuery{T}" />, and <see cref="NumericRangeFilter{T}" /> as default
74  /// </summary>
75  public const int PRECISION_STEP_DEFAULT = 4;
76 
77  /// <summary> Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is
78  /// stored as <c>SHIFT_START_LONG+shift</c> in the first character
79  /// </summary>
80  public static char SHIFT_START_LONG = (char) 0x20;
81 
82  /// <summary> Expert: The maximum term length (used for <c>char[]</c> buffer size)
83  /// for encoding <c>long</c> values.
84  /// </summary>
85  /// <seealso cref="LongToPrefixCoded(long,int,char[])">
86  /// </seealso>
87  public const int BUF_SIZE_LONG = 63 / 7 + 2;
88 
89  /// <summary> Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is
90  /// stored as <c>SHIFT_START_INT+shift</c> in the first character
91  /// </summary>
92  public static char SHIFT_START_INT = (char) 0x60;
93 
94  /// <summary> Expert: The maximum term length (used for <c>char[]</c> buffer size)
95  /// for encoding <c>int</c> values.
96  /// </summary>
97  /// <seealso cref="IntToPrefixCoded(int,int,char[])">
98  /// </seealso>
99  public const int BUF_SIZE_INT = 31 / 7 + 2;
100 
101  /// <summary> Expert: Returns prefix coded bits after reducing the precision by <c>shift</c> bits.
102  /// This is method is used by <see cref="NumericTokenStream" />.
103  /// </summary>
104  /// <param name="val">the numeric value
105  /// </param>
106  /// <param name="shift">how many bits to strip from the right
107  /// </param>
108  /// <param name="buffer">that will contain the encoded chars, must be at least of <see cref="BUF_SIZE_LONG" />
109  /// length
110  /// </param>
111  /// <returns> number of chars written to buffer
112  /// </returns>
113  public static int LongToPrefixCoded(long val, int shift, char[] buffer)
114  {
115  if (shift > 63 || shift < 0)
116  throw new System.ArgumentException("Illegal shift value, must be 0..63");
117  int nChars = (63 - shift) / 7 + 1, len = nChars + 1;
118  buffer[0] = (char) (SHIFT_START_LONG + shift);
119  ulong sortableBits = BitConverter.ToUInt64(BitConverter.GetBytes(val), 0) ^ 0x8000000000000000L;
120  sortableBits = sortableBits >> shift;
121  while (nChars >= 1)
122  {
123  // Store 7 bits per character for good efficiency when UTF-8 encoding.
124  // The whole number is right-justified so that lucene can prefix-encode
125  // the terms more efficiently.
126  buffer[nChars--] = (char) (sortableBits & 0x7f);
127  sortableBits = sortableBits >> 7;
128  }
129  return len;
130  }
131 
132  /// <summary> Expert: Returns prefix coded bits after reducing the precision by <c>shift</c> bits.
133  /// This is method is used by <see cref="LongRangeBuilder" />.
134  /// </summary>
135  /// <param name="val">the numeric value
136  /// </param>
137  /// <param name="shift">how many bits to strip from the right
138  /// </param>
139  public static System.String LongToPrefixCoded(long val, int shift)
140  {
141  char[] buffer = new char[BUF_SIZE_LONG];
142  int len = LongToPrefixCoded(val, shift, buffer);
143  return new System.String(buffer, 0, len);
144  }
145 
146  /// <summary> This is a convenience method, that returns prefix coded bits of a long without
147  /// reducing the precision. It can be used to store the full precision value as a
148  /// stored field in index.
149  /// <p/>To decode, use <see cref="PrefixCodedToLong" />.
150  /// </summary>
151  public static System.String LongToPrefixCoded(long val)
152  {
153  return LongToPrefixCoded(val, 0);
154  }
155 
156  /// <summary> Expert: Returns prefix coded bits after reducing the precision by <c>shift</c> bits.
157  /// This is method is used by <see cref="NumericTokenStream" />.
158  /// </summary>
159  /// <param name="val">the numeric value
160  /// </param>
161  /// <param name="shift">how many bits to strip from the right
162  /// </param>
163  /// <param name="buffer">that will contain the encoded chars, must be at least of <see cref="BUF_SIZE_INT" />
164  /// length
165  /// </param>
166  /// <returns> number of chars written to buffer
167  /// </returns>
168  public static int IntToPrefixCoded(int val, int shift, char[] buffer)
169  {
170  if (shift > 31 || shift < 0)
171  throw new System.ArgumentException("Illegal shift value, must be 0..31");
172  int nChars = (31 - shift) / 7 + 1, len = nChars + 1;
173  buffer[0] = (char) (SHIFT_START_INT + shift);
174  int sortableBits = val ^ unchecked((int) 0x80000000);
175  sortableBits = Number.URShift(sortableBits, shift);
176  while (nChars >= 1)
177  {
178  // Store 7 bits per character for good efficiency when UTF-8 encoding.
179  // The whole number is right-justified so that lucene can prefix-encode
180  // the terms more efficiently.
181  buffer[nChars--] = (char) (sortableBits & 0x7f);
182  sortableBits = Number.URShift(sortableBits, 7);
183  }
184  return len;
185  }
186 
187  /// <summary> Expert: Returns prefix coded bits after reducing the precision by <c>shift</c> bits.
188  /// This is method is used by <see cref="IntRangeBuilder" />.
189  /// </summary>
190  /// <param name="val">the numeric value
191  /// </param>
192  /// <param name="shift">how many bits to strip from the right
193  /// </param>
194  public static System.String IntToPrefixCoded(int val, int shift)
195  {
196  char[] buffer = new char[BUF_SIZE_INT];
197  int len = IntToPrefixCoded(val, shift, buffer);
198  return new System.String(buffer, 0, len);
199  }
200 
201  /// <summary> This is a convenience method, that returns prefix coded bits of an int without
202  /// reducing the precision. It can be used to store the full precision value as a
203  /// stored field in index.
204  /// <p/>To decode, use <see cref="PrefixCodedToInt" />.
205  /// </summary>
206  public static System.String IntToPrefixCoded(int val)
207  {
208  return IntToPrefixCoded(val, 0);
209  }
210 
211  /// <summary> Returns a long from prefixCoded characters.
212  /// Rightmost bits will be zero for lower precision codes.
213  /// This method can be used to decode e.g. a stored field.
214  /// </summary>
215  /// <throws> NumberFormatException if the supplied string is </throws>
216  /// <summary> not correctly prefix encoded.
217  /// </summary>
218  /// <seealso cref="LongToPrefixCoded(long)">
219  /// </seealso>
220  public static long PrefixCodedToLong(System.String prefixCoded)
221  {
222  int shift = prefixCoded[0] - SHIFT_START_LONG;
223  if (shift > 63 || shift < 0)
224  throw new System.FormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
225  ulong sortableBits = 0UL;
226  for (int i = 1, len = prefixCoded.Length; i < len; i++)
227  {
228  sortableBits <<= 7;
229  char ch = prefixCoded[i];
230  if (ch > 0x7f)
231  {
232  throw new System.FormatException("Invalid prefixCoded numerical value representation (char " + System.Convert.ToString((int) ch, 16) + " at position " + i + " is invalid)");
233  }
234  sortableBits |= (ulong) ch;
235  }
236  return BitConverter.ToInt64(BitConverter.GetBytes((sortableBits << shift) ^ 0x8000000000000000L), 0);
237  }
238 
239  /// <summary> Returns an int from prefixCoded characters.
240  /// Rightmost bits will be zero for lower precision codes.
241  /// This method can be used to decode e.g. a stored field.
242  /// </summary>
243  /// <throws> NumberFormatException if the supplied string is </throws>
244  /// <summary> not correctly prefix encoded.
245  /// </summary>
246  /// <seealso cref="IntToPrefixCoded(int)">
247  /// </seealso>
248  public static int PrefixCodedToInt(System.String prefixCoded)
249  {
250  int shift = prefixCoded[0] - SHIFT_START_INT;
251  if (shift > 31 || shift < 0)
252  throw new System.FormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)");
253  int sortableBits = 0;
254  for (int i = 1, len = prefixCoded.Length; i < len; i++)
255  {
256  sortableBits <<= 7;
257  char ch = prefixCoded[i];
258  if (ch > 0x7f)
259  {
260  throw new System.FormatException("Invalid prefixCoded numerical value representation (char " + System.Convert.ToString((int) ch, 16) + " at position " + i + " is invalid)");
261  }
262  sortableBits |= (int) ch;
263  }
264  return (sortableBits << shift) ^ unchecked((int) 0x80000000);
265  }
266 
267  /// <summary> Converts a <c>double</c> value to a sortable signed <c>long</c>.
268  /// The value is converted by getting their IEEE 754 floating-point &quot;double format&quot;
269  /// bit layout and then some bits are swapped, to be able to compare the result as long.
270  /// By this the precision is not reduced, but the value can easily used as a long.
271  /// </summary>
272  /// <seealso cref="SortableLongToDouble">
273  /// </seealso>
274  public static long DoubleToSortableLong(double val)
275  {
276  long f = BitConverter.DoubleToInt64Bits(val); // {{Aroush-2.9}} will this work the same as 'java.lang.Double.doubleToRawLongBits()'?
277  if (f < 0)
278  f ^= 0x7fffffffffffffffL;
279  return f;
280  }
281 
282  /// <summary> Convenience method: this just returns:
283  /// longToPrefixCoded(doubleToSortableLong(val))
284  /// </summary>
285  public static System.String DoubleToPrefixCoded(double val)
286  {
287  return LongToPrefixCoded(DoubleToSortableLong(val));
288  }
289 
290  /// <summary> Converts a sortable <c>long</c> back to a <c>double</c>.</summary>
291  /// <seealso cref="DoubleToSortableLong">
292  /// </seealso>
293  public static double SortableLongToDouble(long val)
294  {
295  if (val < 0)
296  val ^= 0x7fffffffffffffffL;
297  return BitConverter.Int64BitsToDouble(val);
298  }
299 
300  /// <summary> Convenience method: this just returns:
301  /// sortableLongToDouble(prefixCodedToLong(val))
302  /// </summary>
303  public static double PrefixCodedToDouble(System.String val)
304  {
305  return SortableLongToDouble(PrefixCodedToLong(val));
306  }
307 
308  /// <summary> Converts a <c>float</c> value to a sortable signed <c>int</c>.
309  /// The value is converted by getting their IEEE 754 floating-point &quot;float format&quot;
310  /// bit layout and then some bits are swapped, to be able to compare the result as int.
311  /// By this the precision is not reduced, but the value can easily used as an int.
312  /// </summary>
313  /// <seealso cref="SortableIntToFloat">
314  /// </seealso>
315  public static int FloatToSortableInt(float val)
316  {
317  int f = BitConverter.ToInt32(BitConverter.GetBytes(val), 0);
318  if (f < 0)
319  f ^= 0x7fffffff;
320  return f;
321  }
322 
323  /// <summary> Convenience method: this just returns:
324  /// intToPrefixCoded(floatToSortableInt(val))
325  /// </summary>
326  public static System.String FloatToPrefixCoded(float val)
327  {
328  return IntToPrefixCoded(FloatToSortableInt(val));
329  }
330 
331  /// <summary> Converts a sortable <c>int</c> back to a <c>float</c>.</summary>
332  /// <seealso cref="FloatToSortableInt">
333  /// </seealso>
334  public static float SortableIntToFloat(int val)
335  {
336  if (val < 0)
337  val ^= 0x7fffffff;
338  return BitConverter.ToSingle(BitConverter.GetBytes(val), 0);
339  }
340 
341  /// <summary> Convenience method: this just returns:
342  /// sortableIntToFloat(prefixCodedToInt(val))
343  /// </summary>
344  public static float PrefixCodedToFloat(System.String val)
345  {
346  return SortableIntToFloat(PrefixCodedToInt(val));
347  }
348 
349  /// <summary> Expert: Splits a long range recursively.
350  /// You may implement a builder that adds clauses to a
351  /// <see cref="Lucene.Net.Search.BooleanQuery" /> for each call to its
352  /// <see cref="LongRangeBuilder.AddRange(String,String)" />
353  /// method.
354  /// <p/>This method is used by <see cref="NumericRangeQuery{T}" />.
355  /// </summary>
356  public static void SplitLongRange(LongRangeBuilder builder, int precisionStep, long minBound, long maxBound)
357  {
358  SplitRange(builder, 64, precisionStep, minBound, maxBound);
359  }
360 
361  /// <summary> Expert: Splits an int range recursively.
362  /// You may implement a builder that adds clauses to a
363  /// <see cref="Lucene.Net.Search.BooleanQuery" /> for each call to its
364  /// <see cref="IntRangeBuilder.AddRange(String,String)" />
365  /// method.
366  /// <p/>This method is used by <see cref="NumericRangeQuery{T}" />.
367  /// </summary>
368  public static void SplitIntRange(IntRangeBuilder builder, int precisionStep, int minBound, int maxBound)
369  {
370  SplitRange(builder, 32, precisionStep, (long) minBound, (long) maxBound);
371  }
372 
373  /// <summary>This helper does the splitting for both 32 and 64 bit. </summary>
374  private static void SplitRange(System.Object builder, int valSize, int precisionStep, long minBound, long maxBound)
375  {
376  if (precisionStep < 1)
377  throw new System.ArgumentException("precisionStep must be >=1");
378  if (minBound > maxBound)
379  return ;
380  for (int shift = 0; ; shift += precisionStep)
381  {
382  // calculate new bounds for inner precision
383  long diff = 1L << (shift + precisionStep);
384  long mask = ((1L << precisionStep) - 1L) << shift;
385  bool hasLower = (minBound & mask) != 0L;
386  bool hasUpper = (maxBound & mask) != mask;
387  long nextMinBound = (hasLower?(minBound + diff):minBound) & ~ mask;
388  long nextMaxBound = (hasUpper?(maxBound - diff):maxBound) & ~ mask;
389  bool lowerWrapped = nextMinBound < minBound,
390  upperWrapped = nextMaxBound > maxBound;
391 
392  if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound || lowerWrapped || upperWrapped)
393  {
394  // We are in the lowest precision or the next precision is not available.
395  AddRange(builder, valSize, minBound, maxBound, shift);
396  // exit the split recursion loop
397  break;
398  }
399 
400  if (hasLower)
401  AddRange(builder, valSize, minBound, minBound | mask, shift);
402  if (hasUpper)
403  AddRange(builder, valSize, maxBound & ~ mask, maxBound, shift);
404 
405  // recurse to next precision
406  minBound = nextMinBound;
407  maxBound = nextMaxBound;
408  }
409  }
410 
411  /// <summary>Helper that delegates to correct range builder </summary>
412  private static void AddRange(System.Object builder, int valSize, long minBound, long maxBound, int shift)
413  {
414  // for the max bound set all lower bits (that were shifted away):
415  // this is important for testing or other usages of the splitted range
416  // (e.g. to reconstruct the full range). The prefixEncoding will remove
417  // the bits anyway, so they do not hurt!
418  maxBound |= (1L << shift) - 1L;
419  // delegate to correct range builder
420  switch (valSize)
421  {
422 
423  case 64:
424  ((LongRangeBuilder) builder).AddRange(minBound, maxBound, shift);
425  break;
426 
427  case 32:
428  ((IntRangeBuilder) builder).AddRange((int) minBound, (int) maxBound, shift);
429  break;
430 
431  default:
432  // Should not happen!
433  throw new System.ArgumentException("valSize must be 32 or 64.");
434 
435  }
436  }
437 
438  /// <summary> Expert: Callback for <see cref="SplitLongRange" />.
439  /// You need to overwrite only one of the methods.
440  /// <p/><font color="red"><b>NOTE:</b> This is a very low-level interface,
441  /// the method signatures may change in later versions.</font>
442  /// </summary>
443  public abstract class LongRangeBuilder
444  {
445 
446  /// <summary> Overwrite this method, if you like to receive the already prefix encoded range bounds.
447  /// You can directly build classical (inclusive) range queries from them.
448  /// </summary>
449  public virtual void AddRange(System.String minPrefixCoded, System.String maxPrefixCoded)
450  {
451  throw new System.NotSupportedException();
452  }
453 
454  /// <summary> Overwrite this method, if you like to receive the raw long range bounds.
455  /// You can use this for e.g. debugging purposes (print out range bounds).
456  /// </summary>
457  public virtual void AddRange(long min, long max, int shift)
458  {
459  AddRange(Lucene.Net.Util.NumericUtils.LongToPrefixCoded(min, shift), Lucene.Net.Util.NumericUtils.LongToPrefixCoded(max, shift));
460  }
461  }
462 
463  /// <summary> Expert: Callback for <see cref="SplitIntRange" />.
464  /// You need to overwrite only one of the methods.
465  /// <p/><font color="red"><b>NOTE:</b> This is a very low-level interface,
466  /// the method signatures may change in later versions.</font>
467  /// </summary>
468  public abstract class IntRangeBuilder
469  {
470 
471  /// <summary> Overwrite this method, if you like to receive the already prefix encoded range bounds.
472  /// You can directly build classical range (inclusive) queries from them.
473  /// </summary>
474  public virtual void AddRange(System.String minPrefixCoded, System.String maxPrefixCoded)
475  {
476  throw new System.NotSupportedException();
477  }
478 
479  /// <summary> Overwrite this method, if you like to receive the raw int range bounds.
480  /// You can use this for e.g. debugging purposes (print out range bounds).
481  /// </summary>
482  public virtual void AddRange(int min, int max, int shift)
483  {
484  AddRange(Lucene.Net.Util.NumericUtils.IntToPrefixCoded(min, shift), Lucene.Net.Util.NumericUtils.IntToPrefixCoded(max, shift));
485  }
486  }
487  }
488 }