// d9/d94/_c_j_k_tokenizer_8cs_source.html

/*

 *

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 *

*/


using System;

using System.Globalization;

using System.IO;

using System.Text;

using System.Text.RegularExpressions;

using Lucene.Net.Analysis;

using Lucene.Net.Analysis.Tokenattributes;

using Lucene.Net.Util;


namespace Lucene.Net.Analysis.CJK

{

    /// <summary>
    /// <p>
    /// Tokenizer for text that mixes Latin-script and CJK (Chinese, Japanese,
    /// Korean) characters. It was modified from StopTokenizer, which does a decent
    /// job for most European languages, and applies a different method to
    /// double-byte characters: a token is returned for every two characters, with
    /// overlapping matches.<br/>
    /// Example: "java C1C2C3C4" is segmented into "java" "C1C2" "C2C3" "C3C4".
    /// Zero-length tokens are never returned.<br/>
    /// Digits, '_', '+' and '#' are treated like letters.<br/>
    /// Characters in the range U+FF01..U+FF5E are mapped onto their Basic Latin
    /// equivalents before being classified, and single-byte tokens are
    /// lower-cased using the invariant culture.<br/>
    /// For more information on Asian language (Chinese, Japanese, Korean) text
    /// segmentation, please search <a
    /// href="http://www.google.com/search?q=word+chinese+segment">google</a>
    /// </p>
    ///
    /// @author Che, Dong
    /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
    /// </summary>
    public sealed class CJKTokenizer : Tokenizer
    {
        //~ Static fields/initializers ---------------------------------------------

        /// <summary>
        /// Word token type (index into <see cref="TOKEN_TYPE_NAMES"/>); the initial
        /// value of the current token type before any character has been classified.
        /// </summary>
        internal static readonly int WORD_TYPE = 0;

        /// <summary>
        /// Single byte token type (index into <see cref="TOKEN_TYPE_NAMES"/>).
        /// </summary>
        internal static readonly int SINGLE_TOKEN_TYPE = 1;

        /// <summary>
        /// Double byte token type (index into <see cref="TOKEN_TYPE_NAMES"/>).
        /// </summary>
        internal static readonly int DOUBLE_TOKEN_TYPE = 2;

        /// <summary>
        /// Names for token types, indexed by the token type constants above.
        /// </summary>
        internal static readonly String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

        /// <summary>
        /// Max word length; also the size of the token buffer.
        /// </summary>
        internal static readonly int MAX_WORD_LEN = 255;

        /// <summary>
        /// Size of the I/O buffer used when reading from the input.
        /// </summary>
        internal static readonly int IO_BUFFER_SIZE = 256;

        /// <summary>
        /// First character (U+FF01) of the range that is folded to Basic Latin.
        /// </summary>
        private const char FullwidthAsciiFirst = '\uFF01';

        /// <summary>
        /// Last character (U+FF5E) of the range that is folded to Basic Latin.
        /// </summary>
        private const char FullwidthAsciiLast = '\uFF5E';

        /// <summary>
        /// Distance between a character in U+FF01..U+FF5E and its Basic Latin
        /// counterpart (U+FF01 - U+0021 = 0xFEE0 = 65248).
        /// </summary>
        private const int FullwidthToAsciiDelta = 0xFEE0;

        //~ Instance fields --------------------------------------------------------

        /// <summary>
        /// Number of characters consumed from the input so far; used to compute
        /// token start offsets and the final offset reported by <see cref="End"/>.
        /// </summary>
        private int offset = 0;

        /// <summary>
        /// Index of the next unread character in <see cref="ioBuffer"/>.
        /// </summary>
        private int bufferIndex = 0;

        /// <summary>
        /// Number of characters returned by the most recent read into
        /// <see cref="ioBuffer"/>.
        /// </summary>
        private int dataLen = 0;

        /// <summary>
        /// Character buffer that stores the characters used to compose the
        /// returned token.
        /// </summary>
        private char[] buffer = new char[MAX_WORD_LEN];

        /// <summary>
        /// I/O buffer that stores content read from the input (one of the members
        /// of Tokenizer).
        /// </summary>
        private char[] ioBuffer = new char[IO_BUFFER_SIZE];

        /// <summary>
        /// Type of the current token: single for ASCII, double for non-ASCII,
        /// word as the default.
        /// </summary>
        private int tokenType = WORD_TYPE;

        /// <summary>
        /// Tag: the previous character is a cached double-byte character that has
        /// already been returned as the second half of a pair. "C1C2C3C4"
        /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
        /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
        /// </summary>
        private bool preIsTokened = false;

        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private ITypeAttribute typeAtt;

        //~ Constructors -----------------------------------------------------------

        /// <summary>
        /// Construct a token stream processing the given input.
        /// </summary>
        /// <param name="_in">I/O reader</param>
        public CJKTokenizer(TextReader _in)
            : base(_in)
        {
            Init();
        }

        /// <summary>
        /// Construct a token stream processing the given input; the attribute
        /// source is passed through to the base Tokenizer constructor.
        /// </summary>
        /// <param name="source">attribute source handed to the base class</param>
        /// <param name="_in">I/O reader</param>
        public CJKTokenizer(AttributeSource source, TextReader _in)
            : base(source, _in)
        {
            Init();
        }

        /// <summary>
        /// Construct a token stream processing the given input; the attribute
        /// factory is passed through to the base Tokenizer constructor.
        /// </summary>
        /// <param name="factory">attribute factory handed to the base class</param>
        /// <param name="_in">I/O reader</param>
        public CJKTokenizer(AttributeFactory factory, TextReader _in)
            : base(factory, _in)
        {
            Init();
        }

        /// <summary>
        /// Registers the term, offset and type attributes this tokenizer fills in.
        /// </summary>
        private void Init()
        {
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        //~ Methods ----------------------------------------------------------------

        /// <summary>
        /// Tells whether the character lies in the Unicode "Basic Latin" block
        /// (U+0000..U+007F).
        /// </summary>
        private static bool IsBasicLatin(char c)
        {
            return c <= '\u007F';
        }

        /// <summary>
        /// Tells whether the character lies in the Unicode "Halfwidth and Fullwidth
        /// Forms" block (U+FF00..U+FFEF).
        /// </summary>
        private static bool IsHalfwidthOrFullwidthForm(char c)
        {
            return c >= '\uFF00' && c <= '\uFFEF';
        }

        /// <summary>
        /// Advances to the next token in the stream and fills in the term, offset
        /// and type attributes.
        /// </summary>
        /// <returns>false for end of stream, true otherwise</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            while (true)
            {
                // loop until we find a non-empty token

                /* how many character(s) have been stored in buffer */
                int length = 0;

                /* the position used to create the token */
                int start = offset;

                while (true)
                {
                    // loop until we've found a full token
                    offset++;

                    if (bufferIndex >= dataLen)
                    {
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        bufferIndex = 0;
                    }

                    // TextReader.Read returns 0 at end of input (not -1 as in Java);
                    // a negative count is treated as end of input as well.
                    if (dataLen <= 0)
                    {
                        if (length == 0)
                        {
                            // nothing pending: undo the speculative increment
                            offset--;
                            return false;
                        }

                        if (preIsTokened)
                        {
                            // the only pending character was already returned as the
                            // second half of the previous pair: drop it
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            offset--;
                        }

                        break;
                    }

                    /* current character */
                    char c = ioBuffer[bufferIndex++];

                    bool isHalfFullForm = IsHalfwidthOrFullwidthForm(c);

                    // if the current character is ASCII or Extend ASCII
                    if (IsBasicLatin(c) || isHalfFullForm)
                    {
                        if (isHalfFullForm && c >= FullwidthAsciiFirst && c <= FullwidthAsciiLast)
                        {
                            // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                            c = (char) (c - FullwidthToAsciiDelta);
                        }

                        // if the current character is a letter, a digit or "_" "+" "#"
                        if (char.IsLetterOrDigit(c) || c == '_' || c == '+' || c == '#')
                        {
                            if (length == 0)
                            {
                                // "javaC1C2C3C4linux"
                                //  ^--: the current character begins an ASCII token
                                start = offset - 1;
                            }
                            else if (tokenType == DOUBLE_TOKEN_TYPE)
                            {
                                // "javaC1C2C3C4linux"
                                //             ^--: the previous character was non-ASCII,
                                // so push the current character back for the next call
                                offset--;
                                bufferIndex--;

                                if (preIsTokened)
                                {
                                    // only one non-ASCII character has been stored and
                                    // it was already returned: drop it
                                    length = 0;
                                    preIsTokened = false;
                                }

                                break;
                            }

                            // store the lower-cased character; the invariant culture
                            // keeps the produced terms independent of the thread's
                            // current culture
                            buffer[length++] = char.ToLowerInvariant(c);
                            tokenType = SINGLE_TOKEN_TYPE;

                            // stop before the buffer overflows
                            if (length == MAX_WORD_LEN)
                            {
                                break;
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                // empty the buffer and keep scanning
                                length = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // non-ASCII letter, e.g. "C1C2C3C4"
                        if (char.IsLetter(c))
                        {
                            if (length == 0)
                            {
                                start = offset - 1;
                                buffer[length++] = c;
                                tokenType = DOUBLE_TOKEN_TYPE;
                            }
                            else if (tokenType == SINGLE_TOKEN_TYPE)
                            {
                                // push the character back and return the previous
                                // ASCII characters
                                offset--;
                                bufferIndex--;
                                break;
                            }
                            else
                            {
                                buffer[length++] = c;
                                tokenType = DOUBLE_TOKEN_TYPE;

                                if (length == 2)
                                {
                                    // push the second character back so that it also
                                    // starts the next overlapping pair
                                    offset--;
                                    bufferIndex--;
                                    preIsTokened = true;
                                    break;
                                }
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                // empty the buffer and keep scanning
                                length = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                }

                if (length > 0)
                {
                    termAtt.SetTermBuffer(buffer, 0, length);
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
                    typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
                    return true;
                }

                if (dataLen <= 0)
                {
                    offset--;
                    return false;
                }

                // Cycle back and try for the next token (don't
                // return an empty string)
            }
        }

        /// <summary>
        /// Sets the offset attribute to the corrected final offset, i.e. the number
        /// of characters consumed from the input.
        /// </summary>
        public override void End()
        {
            // set final offset
            int finalOffset = CorrectOffset(offset);
            this.offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>
        /// Resets the base stream and clears all scanning state of this tokenizer.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            offset = bufferIndex = dataLen = 0;
            preIsTokened = false;
            tokenType = WORD_TYPE;
        }

        /// <summary>
        /// Switches to a new input reader and clears all scanning state.
        /// </summary>
        /// <param name="reader">the new I/O reader</param>
        public override void Reset(TextReader reader)
        {
            base.Reset(reader);
            Reset();
        }
    }

}