Class UnicodeUtil
  
  Class to encode .NET's UTF16 char[] into UTF8 byte[]
without always allocating a new byte[], as
System.Text.Encoding.GetBytes(System.String) of System.Text.Encoding.UTF8 does.
This is a Lucene.NET INTERNAL API; use at your own risk.
    Inheritance
    System.Object
    UnicodeUtil
   
  
    Inherited Members
    
      System.Object.Equals(System.Object)
    
    
      System.Object.Equals(System.Object, System.Object)
    
    
      System.Object.GetHashCode()
    
    
      System.Object.GetType()
    
    
      System.Object.MemberwiseClone()
    
    
      System.Object.ReferenceEquals(System.Object, System.Object)
    
    
      System.Object.ToString()
    
   
  
  Assembly: Lucene.Net.dll
  Syntax
  
    public static class UnicodeUtil
   
  Fields
  
  
    |
    Improve this Doc
  
  
    View Source
  
  BIG_TERM
  A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
(e.g. collation keys) one would normally encounter, and definitely bigger than any UTF-8 terms.
WARNING: This is not a valid UTF8 Term.
Declaration
  
    public static readonly BytesRef BIG_TERM
   
  Field Value
  
  
    |
    Improve this Doc
  
  
    View Source
  
  UNI_REPLACEMENT_CHAR
  
  
  Declaration
  
    public const int UNI_REPLACEMENT_CHAR = 65533
   
  Field Value
  
    
      
        | Type | Description | 
    
    
      
        | System.Int32 |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  UNI_SUR_HIGH_END
  
  
  Declaration
  
    public const int UNI_SUR_HIGH_END = 56319
   
  Field Value
  
    
      
        | Type | Description | 
    
    
      
        | System.Int32 |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  UNI_SUR_HIGH_START
  
  
  Declaration
  
    public const int UNI_SUR_HIGH_START = 55296
   
  Field Value
  
    
      
        | Type | Description | 
    
    
      
        | System.Int32 |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  UNI_SUR_LOW_END
  
  
  Declaration
  
    public const int UNI_SUR_LOW_END = 57343
   
  Field Value
  
    
      
        | Type | Description | 
    
    
      
        | System.Int32 |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  UNI_SUR_LOW_START
  
  
  Declaration
  
    public const int UNI_SUR_LOW_START = 56320
   
  Field Value
  
    
      
        | Type | Description | 
    
    
      
        | System.Int32 |  | 
    
  
  Methods
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  CodePointCount(BytesRef)
  Returns the number of code points in this UTF8 sequence.
This method assumes valid UTF8 input. It does not perform
full UTF8 validation; it checks only the
first byte of each codepoint (for multi-byte sequences, any bytes after
the head are skipped).
Declaration
  
    public static int CodePointCount(BytesRef utf8)
   
  Parameters
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.Int32 |  | 
    
  
  Exceptions
  
    
      
        | Type | Condition | 
    
    
      
        | System.ArgumentException | If an invalid codepoint header byte occurs or the
  content is prematurely truncated.  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  NewString(Int32[], Int32, Int32)
  Covers the JDK 1.5 API: creates a String from an array of code points.
Declaration
  
    public static string NewString(int[] codePoints, int offset, int count)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.Int32[] | codePoints | The code array.  | 
      
        | System.Int32 | offset | The start of the text in the code point array.  | 
      
        | System.Int32 | count | The number of code points.  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.String | A String representing the count code points that start at offset in the code point array.  | 
    
  
  Exceptions
  
    
      
        | Type | Condition | 
    
    
      
        | System.ArgumentException | If an invalid code point is encountered.  | 
      
        | System.IndexOutOfRangeException | If offset or count is out of bounds.  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  ToCharArray(Int32[], Int32, Int32)
  Generates char array that represents the provided input code points.
LUCENENET specific.
Declaration
  
    public static char[] ToCharArray(int[] codePoints, int offset, int count)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.Int32[] | codePoints | The code array.  | 
      
        | System.Int32 | offset | The start of the text in the code point array.  | 
      
        | System.Int32 | count | The number of code points.  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.Char[] | A char array representing the count code points that start at offset in the code point array.  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  ToHexString(String)
  
  
  Declaration
  
    public static string ToHexString(string s)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.String | s |  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.String |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  UTF16toUTF8(ICharSequence, Int32, Int32, BytesRef)
  Encode characters from this J2N.Text.ICharSequence, starting at offset
for length characters. After encoding, result.Offset will always be 0.
Declaration
  
    public static void UTF16toUTF8(ICharSequence s, int offset, int length, BytesRef result)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | J2N.Text.ICharSequence | s |  | 
      
        | System.Int32 | offset |  | 
      
        | System.Int32 | length |  | 
      
        | BytesRef | result |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  UTF16toUTF8(Char[], Int32, Int32, BytesRef)
  Encode characters from a char[] source, starting at
offset for length chars. After encoding, result.Offset will always be 0.
Declaration
  
    public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.Char[] | source |  | 
      
        | System.Int32 | offset |  | 
      
        | System.Int32 | length |  | 
      
        | BytesRef | result |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  UTF16toUTF8(String, Int32, Int32, BytesRef)
  Encode characters from this System.String, starting at offset
for length characters. After encoding, result.Offset will always be 0.
LUCENENET specific.
Declaration
  
    public static void UTF16toUTF8(string s, int offset, int length, BytesRef result)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.String | s |  | 
      
        | System.Int32 | offset |  | 
      
        | System.Int32 | length |  | 
      
        | BytesRef | result |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  UTF8toUTF16(BytesRef, CharsRef)
  
  
  Declaration
  
    public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
   
  Parameters
  
  See Also
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  UTF8toUTF16(Byte[], Int32, Int32, CharsRef)
  Interprets the given byte array as UTF-8 and converts to UTF-16. The CharsRef will be extended if
it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
NOTE: Full characters are read, even if this reads past the length passed (and
can result in a System.IndexOutOfRangeException if invalid UTF-8 is passed).
Explicit checks for valid UTF-8 are not performed.
Declaration
  
    public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.Byte[] | utf8 |  | 
      
        | System.Int32 | offset |  | 
      
        | System.Int32 | length |  | 
      
        | CharsRef | chars |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  UTF8toUTF32(BytesRef, Int32sRef)
  This method assumes valid UTF8 input. It does not perform
full UTF8 validation; it checks only the
first byte of each codepoint (for multi-byte sequences, any bytes after
the head are skipped).
Declaration
  
    public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
   
  Parameters
  
  Exceptions
  
    
      
        | Type | Condition | 
    
    
      
        | System.ArgumentException | If an invalid codepoint header byte occurs or the
  content is prematurely truncated.  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  ValidUTF16String(ICharSequence)
  
  
  Declaration
  
    public static bool ValidUTF16String(ICharSequence s)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | J2N.Text.ICharSequence | s |  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.Boolean |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  ValidUTF16String(Char[], Int32)
  
  
  Declaration
  
    public static bool ValidUTF16String(char[] s, int size)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.Char[] | s |  | 
      
        | System.Int32 | size |  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.Boolean |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  ValidUTF16String(String)
  
  
  Declaration
  
    public static bool ValidUTF16String(string s)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.String | s |  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.Boolean |  | 
    
  
  
    |
    Improve this Doc
  
  
    View Source
  
  
  ValidUTF16String(StringBuilder)
  
  
  Declaration
  
    public static bool ValidUTF16String(StringBuilder s)
   
  Parameters
  
    
      
        | Type | Name | Description | 
    
    
      
        | System.Text.StringBuilder | s |  | 
    
  
  Returns
  
    
      
        | Type | Description | 
    
    
      
        | System.Boolean |  |