// dc/db6/_standard_tokenizer_8cs_source.html

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 * http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


using System;

using Lucene.Net.Analysis.Tokenattributes;

using Lucene.Net.Util;

using CharReader = Lucene.Net.Analysis.CharReader;

using Token = Lucene.Net.Analysis.Token;

using Tokenizer = Lucene.Net.Analysis.Tokenizer;

using AttributeSource = Lucene.Net.Util.AttributeSource;

using Version = Lucene.Net.Util.Version;


namespace Lucene.Net.Analysis.Standard

{


    /// <summary>A grammar-based tokenizer constructed with JFlex

    ///

    /// <p/> This should be a good tokenizer for most European-language documents:

    ///

    /// <list type="bullet">

    /// <item>Splits words at punctuation characters, removing punctuation. However, a

    /// dot that's not followed by whitespace is considered part of a token.</item>

    /// <item>Splits words at hyphens, unless there's a number in the token, in which case

    /// the whole token is interpreted as a product number and is not split.</item>

    /// <item>Recognizes email addresses and internet hostnames as one token.</item>

    /// </list>

    ///

    /// <p/>Many applications have specific tokenizer needs.  If this tokenizer does

    /// not suit your application, please consider copying this source code

    /// directory to your project and maintaining your own grammar-based tokenizer.

    ///

    /// <a name="version"/>

    /// <p/>

    /// You must specify the required <see cref="Version" /> compatibility when creating

    /// StandardTokenizer:

    /// <list type="bullet">

    /// <item>As of 2.4, tokens incorrectly identified as acronyms are corrected (see

    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>).</item>

    /// </list>

    /// </summary>


    public sealed class StandardTokenizer : Tokenizer
    {
        // Integer token type ids. Their values line up with the positions of
        // the corresponding strings in TOKEN_TYPES below.
        public const int ALPHANUM   = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM    = 2;
        public const int COMPANY    = 3;
        public const int EMAIL      = 4;
        public const int HOST       = 5;
        public const int NUM        = 6;
        public const int CJ         = 7;

        /// <summary>
        /// Token type kept to work around a bug where HOSTs that end with '.'
        /// are identified as ACRONYMs. <see cref="IncrementToken()" /> reports
        /// such tokens either as HOST (with the trailing '.' removed) or as
        /// ACRONYM, depending on the "replace invalid acronym" setting.
        /// </summary>
        [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
        public const int ACRONYM_DEP = 8;

        /// <summary>
        /// String token types that correspond to the token type int constants,
        /// listed in the same order as the constants' values.
        /// </summary>
        public static readonly System.String[] TOKEN_TYPES = new System.String[]
        {
            "<ALPHANUM>",
            "<APOSTROPHE>",
            "<ACRONYM>",
            "<COMPANY>",
            "<EMAIL>",
            "<HOST>",
            "<NUM>",
            "<CJ>",
            "<ACRONYM_DEP>"
        };

        /// <summary>A private instance of the JFlex-constructed scanner.</summary>
        private StandardTokenizerImpl scanner;

        // When true, ACRONYM_DEP tokens are emitted as HOST instead of ACRONYM.
        private bool replaceInvalidAcronym;

        // Tokens longer than this are skipped by IncrementToken().
        private int maxTokenLength;

        // This tokenizer populates four attributes:
        // term, offset, position increment and type.
        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private IPositionIncrementAttribute posIncrAtt;
        private ITypeAttribute typeAtt;

        /// <summary>
        /// Creates a new instance of the
        /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" /> and
        /// attaches the <c>input</c> to the newly created JFlex scanner.
        /// </summary>
        /// <param name="matchVersion">Compatibility version; from
        /// <c>Version.LUCENE_24</c> onwards invalid acronyms are replaced by HOST.
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <param name="input">The input reader.</param>
        public StandardTokenizer(Version matchVersion, System.IO.TextReader input)
            : base()
        {
            Init(matchVersion, input);
        }

        /// <summary>
        /// Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.
        /// </summary>
        public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input)
            : base(source)
        {
            Init(matchVersion, input);
        }

        /// <summary>
        /// Creates a new StandardTokenizer with a given
        /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />.
        /// </summary>
        public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input)
            : base(factory)
        {
            Init(matchVersion, input);
        }

        /// <summary>
        /// Gets or sets the max allowed token length. Any token longer than
        /// this is skipped. Starts out as
        /// <c>StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH</c>.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        /// <summary>
        /// Shared construction logic for all constructors: applies the default
        /// max token length, creates the scanner over <c>reader</c>, derives the
        /// acronym handling from <c>matchVersion</c>, stores the reader and
        /// registers the attributes this tokenizer fills in.
        /// </summary>
        private void Init(Version matchVersion, System.IO.TextReader reader)
        {
            maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
            scanner = new StandardTokenizerImpl(reader);

            // Invalid acronyms are only replaced for 2.4 compatibility and later.
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);

            this.input = reader;

            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        /// <summary>
        /// Advances to the next token that is not longer than the max token
        /// length and fills in the term, offset, position increment and type
        /// attributes for it.
        /// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
        /// </summary>
        /// <returns><c>false</c> once the scanner reports end of input,
        /// <c>true</c> when a token has been produced.</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int positionIncrement = 1;

            for (;;)
            {
                int tokenType = scanner.GetNextToken();
                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() > maxTokenLength)
                {
                    // A too-long term is skipped, but it still bumps the
                    // position increment of the next emitted token.
                    positionIncrement++;
                    continue;
                }

                posIncrAtt.PositionIncrement = positionIncrement;
                scanner.GetText(termAtt);

                // Offsets are taken before any trailing '.' is trimmed below.
                int tokenStart = scanner.Yychar();
                int correctedStart = CorrectOffset(tokenStart);
                int correctedEnd = CorrectOffset(tokenStart + termAtt.TermLength());
                offsetAtt.SetOffset(correctedStart, correctedEnd);

                if (tokenType != StandardTokenizerImpl.ACRONYM_DEP)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
                }
                // The ACRONYM_DEP branches below are slated for removal in a later
                // release; for now they convert invalid acronyms to HOST. Once
                // removed, only the plain lookup above should remain.
                else if (replaceInvalidAcronym)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                    termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                }

                return true;
            }
        }

        /// <summary>
        /// Sets the final offset: start and end are both set to the corrected
        /// position just past the scanner's current match.
        /// </summary>
        public override void End()
        {
            int rawEnd = scanner.Yychar() + scanner.Yylength();
            int correctedEnd = CorrectOffset(rawEnd);
            offsetAtt.SetOffset(correctedEnd, correctedEnd);
        }

        /// <summary>
        /// Resets the base tokenizer and then the scanner onto a new reader.
        /// </summary>
        /// <param name="reader">The reader to tokenize from now on.</param>
        public override void Reset(System.IO.TextReader reader)
        {
            base.Reset(reader);
            scanner.Reset(reader);
        }

        /// <summary>
        /// Remove in 3.X and make true the only valid value.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <param name="replaceInvalidAcronym">Set to true to replace
        /// mischaracterized acronyms as HOST.</param>
        [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
        public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }
    }

}