Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
UnicodeUtil.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 namespace Lucene.Net.Util
21 {
22 
23 
24  /*
25  * Some of this code came from the excellent Unicode
26  * conversion examples from:
27  *
28  * http://www.unicode.org/Public/PROGRAMS/CVTUTF
29  *
30  * Full Copyright for that code follows:*/
31 
32  /*
33  * Copyright 2001-2004 Unicode, Inc.
34  *
35  * Disclaimer
36  *
37  * This source code is provided as is by Unicode, Inc. No claims are
38  * made as to fitness for any particular purpose. No warranties of any
39  * kind are expressed or implied. The recipient agrees to determine
40  * applicability of information provided. If this file has been
41  * purchased on magnetic or optical media from Unicode, Inc., the
42  * sole remedy for any claim will be exchange of defective media
43  * within 90 days of receipt.
44  *
45  * Limitations on Rights to Redistribute This Code
46  *
47  * Unicode, Inc. hereby grants the right to freely use the information
48  * supplied in this file in the creation of products supporting the
49  * Unicode Standard, and to make copies of this file in any form
50  * for internal or external distribution as long as this notice
51  * remains attached.
52  */
53 
54  /// <summary> Class to encode java's UTF16 char[] into UTF8 byte[]
55  /// without always allocating a new byte[] as
56  /// String.getBytes("UTF-8") does.
57  ///
58  /// <p/><b>WARNING</b>: This API is new and experimental and
59  /// may suddenly change. <p/>
60  /// </summary>
61 
62  public static class UnicodeUtil
63  {
64 
65  public const int UNI_SUR_HIGH_START = 0xD800;
66  public const int UNI_SUR_HIGH_END = 0xDBFF;
67  public const int UNI_SUR_LOW_START = 0xDC00;
68  public const int UNI_SUR_LOW_END = 0xDFFF;
69  public const int UNI_REPLACEMENT_CHAR = 0xFFFD;
70 
71  private const long UNI_MAX_BMP = 0x0000FFFF;
72 
73  private const int HALF_BASE = 0x0010000;
74  private const long HALF_SHIFT = 10;
75  private const long HALF_MASK = 0x3FFL;
76 
77  public sealed class UTF8Result
78  {
79  public byte[] result = new byte[10];
80  public int length;
81 
82  public void SetLength(int newLength)
83  {
84  if (result.Length < newLength)
85  {
86  byte[] newArray = new byte[(int) (1.5 * newLength)];
87  Array.Copy(result, 0, newArray, 0, length);
88  result = newArray;
89  }
90  length = newLength;
91  }
92  }
93 
94  public sealed class UTF16Result
95  {
96  public char[] result = new char[10];
97  public int[] offsets = new int[10];
98  public int length;
99 
100  public void SetLength(int newLength)
101  {
102  if (result.Length < newLength)
103  {
104  char[] newArray = new char[(int) (1.5 * newLength)];
105  Array.Copy(result, 0, newArray, 0, length);
106  result = newArray;
107  }
108  length = newLength;
109  }
110 
111  public void CopyText(UTF16Result other)
112  {
113  SetLength(other.length);
114  Array.Copy(other.result, 0, result, 0, length);
115  }
116  }
117 
118  /// <summary>Encode characters from a char[] source, starting at
119  /// offset and stopping when the character 0xffff is seen.
120  /// Returns the number of bytes written to bytesOut.
121  /// </summary>
122  public static void UTF16toUTF8(char[] source, int offset, UTF8Result result)
123  {
124 
125  int upto = 0;
126  int i = offset;
127  byte[] out_Renamed = result.result;
128 
129  while (true)
130  {
131 
132  int code = (int) source[i++];
133 
134  if (upto + 4 > out_Renamed.Length)
135  {
136  byte[] newOut = new byte[2 * out_Renamed.Length];
137  System.Diagnostics.Debug.Assert(newOut.Length >= upto + 4);
138  Array.Copy(out_Renamed, 0, newOut, 0, upto);
139  result.result = out_Renamed = newOut;
140  }
141  if (code < 0x80)
142  out_Renamed[upto++] = (byte) code;
143  else if (code < 0x800)
144  {
145  out_Renamed[upto++] = (byte) (0xC0 | (code >> 6));
146  out_Renamed[upto++] = (byte) (0x80 | (code & 0x3F));
147  }
148  else if (code < 0xD800 || code > 0xDFFF)
149  {
150  if (code == 0xffff)
151  // END
152  break;
153  out_Renamed[upto++] = (byte) (0xE0 | (code >> 12));
154  out_Renamed[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
155  out_Renamed[upto++] = (byte) (0x80 | (code & 0x3F));
156  }
157  else
158  {
159  // surrogate pair
160  // confirm valid high surrogate
161  if (code < 0xDC00 && source[i] != 0xffff)
162  {
163  int utf32 = (int) source[i];
164  // confirm valid low surrogate and write pair
165  if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
166  {
167  utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
168  i++;
169  out_Renamed[upto++] = (byte) (0xF0 | (utf32 >> 18));
170  out_Renamed[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
171  out_Renamed[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
172  out_Renamed[upto++] = (byte) (0x80 | (utf32 & 0x3F));
173  continue;
174  }
175  }
176  // replace unpaired surrogate or out-of-order low surrogate
177  // with substitution character
178  out_Renamed[upto++] = (byte) (0xEF);
179  out_Renamed[upto++] = (byte) (0xBF);
180  out_Renamed[upto++] = (byte) (0xBD);
181  }
182  }
183  //assert matches(source, offset, i-offset-1, out, upto);
184  result.length = upto;
185  }
186 
187  /// <summary>Encode characters from a char[] source, starting at
188  /// offset for length chars. Returns the number of bytes
189  /// written to bytesOut.
190  /// </summary>
191  public static void UTF16toUTF8(char[] source, int offset, int length, UTF8Result result)
192  {
193 
194  int upto = 0;
195  int i = offset;
196  int end = offset + length;
197  byte[] out_Renamed = result.result;
198 
199  while (i < end)
200  {
201 
202  int code = (int) source[i++];
203 
204  if (upto + 4 > out_Renamed.Length)
205  {
206  byte[] newOut = new byte[2 * out_Renamed.Length];
207  System.Diagnostics.Debug.Assert(newOut.Length >= upto + 4);
208  Array.Copy(out_Renamed, 0, newOut, 0, upto);
209  result.result = out_Renamed = newOut;
210  }
211  if (code < 0x80)
212  out_Renamed[upto++] = (byte) code;
213  else if (code < 0x800)
214  {
215  out_Renamed[upto++] = (byte) (0xC0 | (code >> 6));
216  out_Renamed[upto++] = (byte) (0x80 | (code & 0x3F));
217  }
218  else if (code < 0xD800 || code > 0xDFFF)
219  {
220  out_Renamed[upto++] = (byte) (0xE0 | (code >> 12));
221  out_Renamed[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
222  out_Renamed[upto++] = (byte) (0x80 | (code & 0x3F));
223  }
224  else
225  {
226  // surrogate pair
227  // confirm valid high surrogate
228  if (code < 0xDC00 && i < end && source[i] != 0xffff)
229  {
230  int utf32 = (int) source[i];
231  // confirm valid low surrogate and write pair
232  if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
233  {
234  utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
235  i++;
236  out_Renamed[upto++] = (byte) (0xF0 | (utf32 >> 18));
237  out_Renamed[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
238  out_Renamed[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
239  out_Renamed[upto++] = (byte) (0x80 | (utf32 & 0x3F));
240  continue;
241  }
242  }
243  // replace unpaired surrogate or out-of-order low surrogate
244  // with substitution character
245  out_Renamed[upto++] = (byte) (0xEF);
246  out_Renamed[upto++] = (byte) (0xBF);
247  out_Renamed[upto++] = (byte) (0xBD);
248  }
249  }
250  //assert matches(source, offset, length, out, upto);
251  result.length = upto;
252  }
253 
254  /// <summary>Encode characters from this String, starting at offset
255  /// for length characters. Returns the number of bytes
256  /// written to bytesOut.
257  /// </summary>
258  public static void UTF16toUTF8(System.String s, int offset, int length, UTF8Result result)
259  {
260  int end = offset + length;
261 
262  byte[] out_Renamed = result.result;
263 
264  int upto = 0;
265  for (int i = offset; i < end; i++)
266  {
267  int code = (int) s[i];
268 
269  if (upto + 4 > out_Renamed.Length)
270  {
271  byte[] newOut = new byte[2 * out_Renamed.Length];
272  System.Diagnostics.Debug.Assert(newOut.Length >= upto + 4);
273  Array.Copy(out_Renamed, 0, newOut, 0, upto);
274  result.result = out_Renamed = newOut;
275  }
276  if (code < 0x80)
277  out_Renamed[upto++] = (byte) code;
278  else if (code < 0x800)
279  {
280  out_Renamed[upto++] = (byte) (0xC0 | (code >> 6));
281  out_Renamed[upto++] = (byte) (0x80 | (code & 0x3F));
282  }
283  else if (code < 0xD800 || code > 0xDFFF)
284  {
285  out_Renamed[upto++] = (byte) (0xE0 | (code >> 12));
286  out_Renamed[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
287  out_Renamed[upto++] = (byte) (0x80 | (code & 0x3F));
288  }
289  else
290  {
291  // surrogate pair
292  // confirm valid high surrogate
293  if (code < 0xDC00 && (i < end - 1))
294  {
295  int utf32 = (int) s[i + 1];
296  // confirm valid low surrogate and write pair
297  if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
298  {
299  utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
300  i++;
301  out_Renamed[upto++] = (byte) (0xF0 | (utf32 >> 18));
302  out_Renamed[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
303  out_Renamed[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
304  out_Renamed[upto++] = (byte) (0x80 | (utf32 & 0x3F));
305  continue;
306  }
307  }
308  // replace unpaired surrogate or out-of-order low surrogate
309  // with substitution character
310  out_Renamed[upto++] = (byte) (0xEF);
311  out_Renamed[upto++] = (byte) (0xBF);
312  out_Renamed[upto++] = (byte) (0xBD);
313  }
314  }
315  //assert matches(s, offset, length, out, upto);
316  result.length = upto;
317  }
318 
319  /// <summary>Convert UTF8 bytes into UTF16 characters. If offset
320  /// is non-zero, conversion starts at that starting point
321  /// in utf8, re-using the results from the previous call
322  /// up until offset.
323  /// </summary>
324  public static void UTF8toUTF16(byte[] utf8, int offset, int length, UTF16Result result)
325  {
326 
327  int end = offset + length;
328  char[] out_Renamed = result.result;
329  if (result.offsets.Length <= end)
330  {
331  int[] newOffsets = new int[2 * end];
332  Array.Copy(result.offsets, 0, newOffsets, 0, result.offsets.Length);
333  result.offsets = newOffsets;
334  }
335  int[] offsets = result.offsets;
336 
337  // If incremental decoding fell in the middle of a
338  // single unicode character, rollback to its start:
339  int upto = offset;
340  while (offsets[upto] == - 1)
341  upto--;
342 
343  int outUpto = offsets[upto];
344 
345  // Pre-allocate for worst case 1-for-1
346  if (outUpto + length >= out_Renamed.Length)
347  {
348  char[] newOut = new char[2 * (outUpto + length)];
349  Array.Copy(out_Renamed, 0, newOut, 0, outUpto);
350  result.result = out_Renamed = newOut;
351  }
352 
353  while (upto < end)
354  {
355 
356  int b = utf8[upto] & 0xff;
357  int ch;
358 
359  offsets[upto++] = outUpto;
360 
361  if (b < 0xc0)
362  {
363  System.Diagnostics.Debug.Assert(b < 0x80);
364  ch = b;
365  }
366  else if (b < 0xe0)
367  {
368  ch = ((b & 0x1f) << 6) + (utf8[upto] & 0x3f);
369  offsets[upto++] = - 1;
370  }
371  else if (b < 0xf0)
372  {
373  ch = ((b & 0xf) << 12) + ((utf8[upto] & 0x3f) << 6) + (utf8[upto + 1] & 0x3f);
374  offsets[upto++] = - 1;
375  offsets[upto++] = - 1;
376  }
377  else
378  {
379  System.Diagnostics.Debug.Assert(b < 0xf8);
380  ch = ((b & 0x7) << 18) + ((utf8[upto] & 0x3f) << 12) + ((utf8[upto + 1] & 0x3f) << 6) + (utf8[upto + 2] & 0x3f);
381  offsets[upto++] = - 1;
382  offsets[upto++] = - 1;
383  offsets[upto++] = - 1;
384  }
385 
386  if (ch <= UNI_MAX_BMP)
387  {
388  // target is a character <= 0xFFFF
389  out_Renamed[outUpto++] = (char) ch;
390  }
391  else
392  {
393  // target is a character in range 0xFFFF - 0x10FFFF
394  int chHalf = ch - HALF_BASE;
395  out_Renamed[outUpto++] = (char) ((chHalf >> (int) HALF_SHIFT) + UNI_SUR_HIGH_START);
396  out_Renamed[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
397  }
398  }
399 
400  offsets[upto] = outUpto;
401  result.length = outUpto;
402  }
403 
404  // Only called from assert
405  /*
406  private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
407  try {
408  String s1 = new String(source, offset, length);
409  String s2 = new String(result, 0, upto, "UTF-8");
410  if (!s1.equals(s2)) {
411  //System.out.println("DIFF: s1 len=" + s1.length());
412  //for(int i=0;i<s1.length();i++)
413  // System.out.println(" " + i + ": " + (int) s1.charAt(i));
414  //System.out.println("s2 len=" + s2.length());
415  //for(int i=0;i<s2.length();i++)
416  // System.out.println(" " + i + ": " + (int) s2.charAt(i));
417 
418  // If the input string was invalid, then the
419  // difference is OK
420  if (!validUTF16String(s1))
421  return true;
422 
423  return false;
424  }
425  return s1.equals(s2);
426  } catch (UnsupportedEncodingException uee) {
427  return false;
428  }
429  }
430 
431  // Only called from assert
432  private static boolean matches(String source, int offset, int length, byte[] result, int upto) {
433  try {
434  String s1 = source.substring(offset, offset+length);
435  String s2 = new String(result, 0, upto, "UTF-8");
436  if (!s1.equals(s2)) {
437  // Allow a difference if s1 is not valid UTF-16
438 
439  //System.out.println("DIFF: s1 len=" + s1.length());
440  //for(int i=0;i<s1.length();i++)
441  // System.out.println(" " + i + ": " + (int) s1.charAt(i));
442  //System.out.println(" s2 len=" + s2.length());
443  //for(int i=0;i<s2.length();i++)
444  // System.out.println(" " + i + ": " + (int) s2.charAt(i));
445 
446  // If the input string was invalid, then the
447  // difference is OK
448  if (!validUTF16String(s1))
449  return true;
450 
451  return false;
452  }
453  return s1.equals(s2);
454  } catch (UnsupportedEncodingException uee) {
455  return false;
456  }
457  }
458 
459  public static final boolean validUTF16String(String s) {
460  final int size = s.length();
461  for(int i=0;i<size;i++) {
462  char ch = s.charAt(i);
463  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
464  if (i < size-1) {
465  i++;
466  char nextCH = s.charAt(i);
467  if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
468  // Valid surrogate pair
469  } else
470  // Unmatched high surrogate
471  return false;
472  } else
473  // Unmatched high surrogate
474  return false;
475  } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
476  // Unmatched low surrogate
477  return false;
478  }
479 
480  return true;
481  }
482 
483  public static final boolean validUTF16String(char[] s, int size) {
484  for(int i=0;i<size;i++) {
485  char ch = s[i];
486  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
487  if (i < size-1) {
488  i++;
489  char nextCH = s[i];
490  if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
491  // Valid surrogate pair
492  } else
493  return false;
494  } else
495  return false;
496  } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
497  // Unmatched low surrogate
498  return false;
499  }
500 
501  return true;
502  }
503  */
504  }
505 }