Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
IndexableBinaryStringTools.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Support;
20 
21 // {{Aroush-2.9}} Port issue? Both of those were treated as: System.IO.MemoryStream
22 //using CharBuffer = java.nio.CharBuffer;
23 //using ByteBuffer = java.nio.ByteBuffer;
24 
25 namespace Lucene.Net.Util
26 {
27 
28  /// <summary> Provides support for converting byte sequences to Strings and back again.
29  /// The resulting Strings preserve the original byte sequences' sort order.
30  ///
31  /// The Strings are constructed using a Base 8000h encoding of the original
32  /// binary data - each char of an encoded String represents a 15-bit chunk
33  /// from the byte sequence. Base 8000h was chosen because it allows for all
34  /// lower 15 bits of char to be used without restriction; the surrogate range
35  /// [U+D8000-U+DFFF] does not represent valid chars, and would require
36  /// complicated handling to avoid them and allow use of char's high bit.
37  ///
38  /// Although unset bits are used as padding in the final char, the original
39  /// byte sequence could contain trailing bytes with no set bits (null bytes):
40  /// padding is indistinguishable from valid information. To overcome this
41  /// problem, a char is appended, indicating the number of encoded bytes in the
42  /// final content char.
43  ///
44  /// This class's operations are defined over CharBuffers and ByteBuffers, to
45  /// allow for wrapped arrays to be reused, reducing memory allocation costs for
46  /// repeated operations. Note that this class calls array() and arrayOffset()
47  /// on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be
48  /// used. This class interprets the arrayOffset() and limit() values returned by
49  /// its input buffers as beginning and end+1 positions on the wrapped array,
50  /// resprectively; similarly, on the output buffer, arrayOffset() is the first
51  /// position written to, and limit() is set to one past the final output array
52  /// position.
53  /// </summary>
55  {
56 
57  private static readonly CodingCase[] CODING_CASES = new CodingCase[]{new CodingCase(7, 1), new CodingCase(14, 6, 2), new CodingCase(13, 5, 3), new CodingCase(12, 4, 4), new CodingCase(11, 3, 5), new CodingCase(10, 2, 6), new CodingCase(9, 1, 7), new CodingCase(8, 0)};
58 
59  // Export only static methods
61  {
62  }
63 
64  /// <summary> Returns the number of chars required to encode the given byte sequence.
65  ///
66  /// </summary>
67  /// <param name="original">The byte sequence to be encoded. Must be backed by an array.
68  /// </param>
69  /// <returns> The number of chars required to encode the given byte sequence
70  /// </returns>
71  /// <throws> IllegalArgumentException If the given ByteBuffer is not backed by an array </throws>
72  public static int GetEncodedLength(System.Collections.Generic.List<byte> original)
73  {
74  return (original.Count == 0) ? 0 : ((original.Count * 8 + 14) / 15) + 1;
75  }
76 
77  /// <summary> Returns the number of bytes required to decode the given char sequence.
78  ///
79  /// </summary>
80  /// <param name="encoded">The char sequence to be encoded. Must be backed by an array.
81  /// </param>
82  /// <returns> The number of bytes required to decode the given char sequence
83  /// </returns>
84  /// <throws> IllegalArgumentException If the given CharBuffer is not backed by an array </throws>
85  public static int GetDecodedLength(System.Collections.Generic.List<char> encoded)
86  {
87  int numChars = encoded.Count - 1;
88  if (numChars <= 0)
89  {
90  return 0;
91  }
92  else
93  {
94  int numFullBytesInFinalChar = encoded[encoded.Count - 1];
95  int numEncodedChars = numChars - 1;
96  return ((numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar);
97  }
98  }
99 
100  /// <summary> Encodes the input byte sequence into the output char sequence. Before
101  /// calling this method, ensure that the output CharBuffer has sufficient
102  /// capacity by calling <see cref="GetEncodedLength(System.Collections.Generic.List{byte})" />.
103  ///
104  /// </summary>
105  /// <param name="input">The byte sequence to encode
106  /// </param>
107  /// <param name="output">Where the char sequence encoding result will go. The limit
108  /// is set to one past the position of the final char.
109  /// </param>
110  /// <throws> IllegalArgumentException If either the input or the output buffer </throws>
111  /// <summary> is not backed by an array
112  /// </summary>
113  public static void Encode(System.Collections.Generic.List<byte> input, System.Collections.Generic.List<char> output)
114  {
115  int outputLength = GetEncodedLength(input);
116  // only adjust capacity if needed
117  if (output.Capacity < outputLength)
118  {
119  output.Capacity = outputLength;
120  }
121 
122  // ensure the buffer we are writing into is occupied with nulls
123  if (output.Count < outputLength)
124  {
125  for (int i = output.Count; i < outputLength; i++)
126  {
127  output.Add(Char.MinValue);
128  }
129  }
130 
131  if (input.Count > 0)
132  {
133  int inputByteNum = 0;
134  int caseNum = 0;
135  int outputCharNum = 0;
136  CodingCase codingCase;
137  for (; inputByteNum + CODING_CASES[caseNum].numBytes <= input.Count; ++outputCharNum)
138  {
139  codingCase = CODING_CASES[caseNum];
140  if (2 == codingCase.numBytes)
141  {
142  output[outputCharNum] = (char)(((input[inputByteNum] & 0xFF) << codingCase.initialShift) + ((Number.URShift((input[inputByteNum + 1] & 0xFF), codingCase.finalShift)) & codingCase.finalMask) & (short)0x7FFF);
143  }
144  else
145  {
146  // numBytes is 3
147  output[outputCharNum] = (char)(((input[inputByteNum] & 0xFF) << codingCase.initialShift) + ((input[inputByteNum + 1] & 0xFF) << codingCase.middleShift) + ((Number.URShift((input[inputByteNum + 2] & 0xFF), codingCase.finalShift)) & codingCase.finalMask) & (short)0x7FFF);
148  }
149  inputByteNum += codingCase.advanceBytes;
150  if (++caseNum == CODING_CASES.Length)
151  {
152  caseNum = 0;
153  }
154  }
155  // Produce final char (if any) and trailing count chars.
156  codingCase = CODING_CASES[caseNum];
157 
158  if (inputByteNum + 1 < input.Count)
159  {
160  // codingCase.numBytes must be 3
161  output[outputCharNum++] = (char) ((((input[inputByteNum] & 0xFF) << codingCase.initialShift) + ((input[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short) 0x7FFF);
162  // Add trailing char containing the number of full bytes in final char
163  output[outputCharNum++] = (char) 1;
164  }
165  else if (inputByteNum < input.Count)
166  {
167  output[outputCharNum++] = (char) (((input[inputByteNum] & 0xFF) << codingCase.initialShift) & (short) 0x7FFF);
168  // Add trailing char containing the number of full bytes in final char
169  output[outputCharNum++] = caseNum == 0?(char) 1:(char) 0;
170  }
171  else
172  {
173  // No left over bits - last char is completely filled.
174  // Add trailing char containing the number of full bytes in final char
175  output[outputCharNum++] = (char) 1;
176  }
177  }
178  }
179 
180  /// <summary> Decodes the input char sequence into the output byte sequence. Before
181  /// calling this method, ensure that the output ByteBuffer has sufficient
182  /// capacity by calling <see cref="GetDecodedLength(System.Collections.Generic.List{char})" />.
183  ///
184  /// </summary>
185  /// <param name="input">The char sequence to decode
186  /// </param>
187  /// <param name="output">Where the byte sequence decoding result will go. The limit
188  /// is set to one past the position of the final char.
189  /// </param>
190  /// <throws> IllegalArgumentException If either the input or the output buffer </throws>
191  /// <summary> is not backed by an array
192  /// </summary>
193  public static void Decode(System.Collections.Generic.List<char> input, System.Collections.Generic.List<byte> output)
194  {
195  int numOutputBytes = GetDecodedLength(input);
196  if (output.Capacity < numOutputBytes)
197  {
198  output.Capacity = numOutputBytes;
199  }
200 
201  // ensure the buffer we are writing into is occupied with nulls
202  if (output.Count < numOutputBytes)
203  {
204  for (int i = output.Count; i < numOutputBytes; i++)
205  {
206  output.Add(Byte.MinValue);
207  }
208  }
209 
210  if (input.Count > 0)
211  {
212  int caseNum = 0;
213  int outputByteNum = 0;
214  int inputCharNum = 0;
215  short inputChar;
216  CodingCase codingCase;
217  for (; inputCharNum < input.Count - 2; ++inputCharNum)
218  {
219  codingCase = CODING_CASES[caseNum];
220  inputChar = (short) input[inputCharNum];
221  if (2 == codingCase.numBytes)
222  {
223  if (0 == caseNum)
224  {
225  output[outputByteNum] = (byte) (Number.URShift(inputChar, codingCase.initialShift));
226  }
227  else
228  {
229  output[outputByteNum] = (byte) (output[outputByteNum] + (byte) (Number.URShift(inputChar, codingCase.initialShift)));
230  }
231  output[outputByteNum + 1] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
232  }
233  else
234  {
235  // numBytes is 3
236  output[outputByteNum] = (byte) (output[outputByteNum] + (byte) (Number.URShift(inputChar, codingCase.initialShift)));
237  output[outputByteNum + 1] = (byte) (Number.URShift((inputChar & codingCase.middleMask), codingCase.middleShift));
238  output[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
239  }
240  outputByteNum += codingCase.advanceBytes;
241  if (++caseNum == CODING_CASES.Length)
242  {
243  caseNum = 0;
244  }
245  }
246  // Handle final char
247  inputChar = (short) input[inputCharNum];
248  codingCase = CODING_CASES[caseNum];
249  if (0 == caseNum)
250  {
251  output[outputByteNum] = 0;
252  }
253  output[outputByteNum] = (byte) (output[outputByteNum] + (byte) (Number.URShift(inputChar, codingCase.initialShift)));
254  long bytesLeft = numOutputBytes - outputByteNum;
255  if (bytesLeft > 1)
256  {
257  if (2 == codingCase.numBytes)
258  {
259  output[outputByteNum + 1] = (byte) (Number.URShift((inputChar & codingCase.finalMask), codingCase.finalShift));
260  }
261  else
262  {
263  // numBytes is 3
264  output[outputByteNum + 1] = (byte) (Number.URShift((inputChar & codingCase.middleMask), codingCase.middleShift));
265  if (bytesLeft > 2)
266  {
267  output[outputByteNum + 2] = (byte) ((inputChar & codingCase.finalMask) << codingCase.finalShift);
268  }
269  }
270  }
271  }
272  }
273 
274  /// <summary> Decodes the given char sequence, which must have been encoded by
275  /// <see cref="Encode(System.Collections.Generic.List{byte})" /> or
276  /// <see cref="Encode(System.Collections.Generic.List{byte}, System.Collections.Generic.List{char})" />.
277  ///
278  /// </summary>
279  /// <param name="input">The char sequence to decode
280  /// </param>
281  /// <returns> A byte sequence containing the decoding result. The limit
282  /// is set to one past the position of the final char.
283  /// </returns>
284  /// <throws> IllegalArgumentException If the input buffer is not backed by an </throws>
285  /// <summary> array
286  /// </summary>
287  public static System.Collections.Generic.List<byte> Decode(System.Collections.Generic.List<char> input)
288  {
289  System.Collections.Generic.List<byte> output =
290  new System.Collections.Generic.List<byte>(new byte[GetDecodedLength(input)]);
291  Decode(input, output);
292  return output;
293  }
294 
295  /// <summary> Encodes the input byte sequence.
296  ///
297  /// </summary>
298  /// <param name="input">The byte sequence to encode
299  /// </param>
300  /// <returns> A char sequence containing the encoding result. The limit is set
301  /// to one past the position of the final char.
302  /// </returns>
303  /// <throws> IllegalArgumentException If the input buffer is not backed by an </throws>
304  /// <summary> array
305  /// </summary>
306  public static System.Collections.Generic.List<char> Encode(System.Collections.Generic.List<byte> input)
307  {
308  System.Collections.Generic.List<char> output =
309  new System.Collections.Generic.List<char>(new char[GetEncodedLength(input)]);
310  Encode(input, output);
311  return output;
312  }
313 
314  internal class CodingCase
315  {
316  internal int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2;
317  internal short middleMask, finalMask;
318 
319  internal CodingCase(int initialShift, int middleShift, int finalShift)
320  {
321  this.numBytes = 3;
322  this.initialShift = initialShift;
323  this.middleShift = middleShift;
324  this.finalShift = finalShift;
325  this.finalMask = (short) (Number.URShift((short) 0xFF, finalShift));
326  this.middleMask = (short) ((short) 0xFF << middleShift);
327  }
328 
329  internal CodingCase(int initialShift, int finalShift)
330  {
331  this.numBytes = 2;
332  this.initialShift = initialShift;
333  this.finalShift = finalShift;
334  this.finalMask = (short) (Number.URShift((short) 0xFF, finalShift));
335  if (finalShift != 0)
336  {
337  advanceBytes = 1;
338  }
339  }
340  }
341  }
342 }