d3/d03/_chinese_tokenizer_8cs_source.html

/*

 *

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 *

*/


using System;

using System.IO;

using System.Text;

using System.Collections;

using System.Globalization;


using Lucene.Net.Analysis;

using Lucene.Net.Analysis.Tokenattributes;

using Lucene.Net.Util;


namespace Lucene.Net.Analysis.Cn

{

    /// <summary>

    /// Tokenize Chinese text as individual chinese chars.

    /// <p>

    /// The difference between ChineseTokenizer and

    /// CJKTokenizer is that they have different

    /// token parsing logic.

    /// </p>

    /// <p>

    /// For example, if the Chinese text

    /// "C1C2C3C4" is to be indexed:

    /// <ul>

    /// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</li>

    /// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li>

    /// </ul>

    /// </p>

    /// <p>

    /// Therefore the index created by CJKTokenizer is much larger.

    /// </p>

    /// <p>

    /// The problem is that when searching for C1, C1C2, C1C3,

    /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the

    /// CJKTokenizer will not work.

    /// </p>

    /// </summary>

    public sealed class ChineseTokenizer : Tokenizer

    {

        public ChineseTokenizer(TextReader _in)

            : base(_in)

        {

            Init();

        }


        public ChineseTokenizer(AttributeSource source, TextReader _in)

            : base(source, _in)

        {

            Init();

        }


        public ChineseTokenizer(AttributeFactory factory, TextReader _in)

            : base(factory, _in)

        {

            Init();

        }


        private void Init()

        {

            termAtt = AddAttribute<ITermAttribute>();

            offsetAtt = AddAttribute<IOffsetAttribute>();

        }


        private int offset = 0, bufferIndex = 0, dataLen = 0;

        private static readonly int MAX_WORD_LEN = 255;

        private static readonly int IO_BUFFER_SIZE = 1024;

        private readonly char[] buffer = new char[MAX_WORD_LEN];

        private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];


        private int length;

        private int start;


        private ITermAttribute termAtt;

        private IOffsetAttribute offsetAtt;


        private void Push(char c)

        {

            if (length == 0) start = offset - 1; // start of token

            buffer[length++] = Char.ToLower(c); // buffer it

        }


        private bool Flush()

        {


            if (length > 0)

            {

                termAtt.SetTermBuffer(buffer, 0, length);

                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));

                return true;

            }

            else

                return false;

        }


        public override bool IncrementToken()

        {

            ClearAttributes();


            length = 0;

            start = offset;


            while (true)

            {


                char c;

                offset++;


                if (bufferIndex >= dataLen)

                {

                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);

                    bufferIndex = 0;

                }


                if (dataLen == 0)

                {

                    offset--;

                    return Flush();

                }

                else

                    c = ioBuffer[bufferIndex++];


                switch (char.GetUnicodeCategory(c))

                {


                    case UnicodeCategory.DecimalDigitNumber:

                    case UnicodeCategory.LowercaseLetter:

                    case UnicodeCategory.UppercaseLetter:

                        Push(c);

                        if (length == MAX_WORD_LEN) return Flush();

                        break;


                    case UnicodeCategory.OtherLetter:

                        if (length > 0)

                        {

                            bufferIndex--;

                            offset--;

                            return Flush();

                        }

                        Push(c);

                        return Flush();


                    default:

                        if (length > 0) return Flush();

                        break;

                }

            }

        }


        public override sealed void End()

        {

            // set final offset

            int finalOffset = CorrectOffset(offset);

            this.offsetAtt.SetOffset(finalOffset, finalOffset);

        }


        public override void Reset()

        {

            base.Reset();

            offset = bufferIndex = dataLen = 0;

        }


        public override void Reset(TextReader input)

        {

            base.Reset(input);

            Reset();

        }

    }

}