// Extracted from docs/3.0.3/_i_s_o_latin1_accent_filter_8cs_source.html

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 * http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


using System;

using Lucene.Net.Analysis.Tokenattributes;


namespace Lucene.Net.Analysis

{


    /// <summary>
    /// Token filter that rewrites accented letters from the ISO Latin 1
    /// range (plus a handful of ligatures and digraphs outside it) to
    /// plain ASCII replacements, e.g. '\u00E9' becomes 'e' and '\u00C6'
    /// becomes "AE". Characters without a mapping pass through unchanged.
    /// </summary>
    [Obsolete("If you build a new index, use ASCIIFoldingFilter which covers a superset of Latin 1.  This class is included for use with existing indexes and will be removed in a future release (possible Lucene 4.0).")]
    public class ISOLatin1AccentFilter : TokenFilter
    {
        // Lowest and highest code points that have an entry in the folding
        // table; anything outside this window is copied without a lookup.
        private const char FirstFoldable = '\u00c0';
        private const char LastFoldable = '\uFB06';

        // Scratch buffer holding the folded term; grown by doubling on demand.
        private char[] output = new char[256];

        // Number of valid characters currently stored in the scratch buffer.
        private int outputPos;

        private readonly ITermAttribute termAtt;

        /// <summary>
        /// Wraps <paramref name="input"/> and registers the term attribute
        /// this filter reads and rewrites.
        /// </summary>
        /// <param name="input">The upstream token stream.</param>
        public ISOLatin1AccentFilter(TokenStream input)
            : base(input)
        {
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Advances the wrapped stream by one token and, if the term holds
        /// at least one character inside the foldable window, replaces the
        /// term text with its folded form.
        /// </summary>
        /// <returns>
        /// <c>true</c> if a token was produced, <c>false</c> once the
        /// wrapped stream is exhausted.
        /// </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] termChars = termAtt.TermBuffer();
            int termLength = termAtt.TermLength();

            // Terms with nothing to rewrite are returned exactly as they came in.
            if (ContainsFoldCandidate(termChars, termLength))
            {
                RemoveAccents(termChars, termLength);
                termAtt.SetTermBuffer(output, 0, outputPos);
            }

            return true;
        }

        /// <summary>
        /// Folds the first <paramref name="length"/> characters of
        /// <paramref name="input"/> into the internal scratch buffer,
        /// restarting the buffer's write position at zero.
        /// </summary>
        /// <param name="input">Characters to fold.</param>
        /// <param name="length">Number of characters of <paramref name="input"/> to process.</param>
        public void RemoveAccents(char[] input, int length)
        {
            // A single character expands to at most two, so twice the
            // input length is always enough room.
            int required = 2 * length;

            int capacity = output.Length;
            while (capacity < required)
            {
                capacity *= 2;
            }

            if (capacity != output.Length)
            {
                output = new char[capacity];
            }

            outputPos = 0;

            for (int index = 0; index < length; index++)
            {
                char c = input[index];

                // Only consult the table for characters inside the window.
                string folded = IsFoldCandidate(c) ? Fold(c) : null;

                if (folded == null)
                {
                    output[outputPos++] = c;
                    continue;
                }

                for (int k = 0; k < folded.Length; k++)
                {
                    output[outputPos++] = folded[k];
                }
            }
        }

        // True when c lies inside the inclusive window covered by the folding table.
        private static bool IsFoldCandidate(char c)
        {
            return c >= FirstFoldable && c <= LastFoldable;
        }

        // True when any of the first count characters lies inside the foldable window.
        private static bool ContainsFoldCandidate(char[] chars, int count)
        {
            for (int k = 0; k < count; k++)
            {
                if (IsFoldCandidate(chars[k]))
                {
                    return true;
                }
            }

            return false;
        }

        // Returns the ASCII replacement for c, or null when c has no mapping
        // and must be kept as it is.
        private static string Fold(char c)
        {
            switch (c)
            {
                case '\u00C0': // À
                case '\u00C1': // Á
                case '\u00C2': // Â
                case '\u00C3': // Ã
                case '\u00C4': // Ä
                case '\u00C5': // Å
                    return "A";

                case '\u00C6': // Æ
                    return "AE";

                case '\u00C7': // Ç
                    return "C";

                case '\u00C8': // È
                case '\u00C9': // É
                case '\u00CA': // Ê
                case '\u00CB': // Ë
                    return "E";

                case '\u00CC': // Ì
                case '\u00CD': // Í
                case '\u00CE': // Î
                case '\u00CF': // Ï
                    return "I";

                case '\u0132': // Ĳ (digraph IJ)
                    return "IJ";

                case '\u00D0': // Ð
                    return "D";

                case '\u00D1': // Ñ
                    return "N";

                case '\u00D2': // Ò
                case '\u00D3': // Ó
                case '\u00D4': // Ô
                case '\u00D5': // Õ
                case '\u00D6': // Ö
                case '\u00D8': // Ø
                    return "O";

                case '\u0152': // Œ
                    return "OE";

                case '\u00DE': // Þ
                    return "TH";

                case '\u00D9': // Ù
                case '\u00DA': // Ú
                case '\u00DB': // Û
                case '\u00DC': // Ü
                    return "U";

                case '\u00DD': // Ý
                case '\u0178': // Ÿ
                    return "Y";

                case '\u00E0': // à
                case '\u00E1': // á
                case '\u00E2': // â
                case '\u00E3': // ã
                case '\u00E4': // ä
                case '\u00E5': // å
                    return "a";

                case '\u00E6': // æ
                    return "ae";

                case '\u00E7': // ç
                    return "c";

                case '\u00E8': // è
                case '\u00E9': // é
                case '\u00EA': // ê
                case '\u00EB': // ë
                    return "e";

                case '\u00EC': // ì
                case '\u00ED': // í
                case '\u00EE': // î
                case '\u00EF': // ï
                    return "i";

                case '\u0133': // ĳ (digraph ij)
                    return "ij";

                case '\u00F0': // ð
                    return "d";

                case '\u00F1': // ñ
                    return "n";

                case '\u00F2': // ò
                case '\u00F3': // ó
                case '\u00F4': // ô
                case '\u00F5': // õ
                case '\u00F6': // ö
                case '\u00F8': // ø
                    return "o";

                case '\u0153': // œ
                    return "oe";

                case '\u00DF': // ß
                    return "ss";

                case '\u00FE': // þ
                    return "th";

                case '\u00F9': // ù
                case '\u00FA': // ú
                case '\u00FB': // û
                case '\u00FC': // ü
                    return "u";

                case '\u00FD': // ý
                case '\u00FF': // ÿ
                    return "y";

                case '\uFB00': // ﬀ (ligature ff)
                    return "ff";

                case '\uFB01': // ﬁ (ligature fi)
                    return "fi";

                case '\uFB02': // ﬂ (ligature fl)
                    return "fl";

                // U+FB03 (ligature ffi) and U+FB04 (ligature ffl) are left
                // unmapped on purpose: each would expand to three characters,
                // which breaks the 2x worst-case sizing in RemoveAccents, and
                // sizing for 3x could be expensive.

                case '\uFB05': // ﬅ (ligature long s + t)
                    return "ft";

                case '\uFB06': // ﬆ (ligature st)
                    return "st";

                default:
                    return null;
            }
        }
    }

}