// d1/dd8/_analyzers_2_de_2_german_stemmer_8cs_source.html

/*

 *

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 *

*/


using System;

using System.IO;

using System.Text;

using System.Collections;


namespace Lucene.Net.Analysis.De

{

    /// <summary>

    /// A stemmer for German words. The algorithm is based on the report

    /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg

    /// Caumanns (joerg.caumanns@isst.fhg.de).

    /// </summary>

    public class GermanStemmer
    {
        /// <summary>
        /// Buffer for the terms while stemming them. It is reused between calls
        /// to <c>Stem()</c>, so one instance must not be used concurrently.
        /// </summary>
        private readonly StringBuilder sb = new StringBuilder();

        /// <summary>
        /// Amount of characters that are removed with <c>Substitute()</c> while stemming.
        /// <c>Strip()</c> adds it to the buffer length so that its length
        /// restrictions refer to the unsubstituted term.
        /// </summary>
        protected int substCount = 0;

        /// <summary>
        /// Stems the given term to a unique <c>discriminator</c>.
        /// </summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>
        /// Discriminator for <c>term</c>. The lowercased term is returned unchanged
        /// when it is empty or contains any non-letter character.
        /// </returns>
        /// <exception cref="ArgumentNullException"><c>term</c> is null.</exception>
        internal String Stem( String term )
        {
            if ( term == null )
                throw new ArgumentNullException( "term" );

            // Use lowercase for medium stemming. The invariant culture keeps the
            // result independent of the current thread culture (e.g. Turkish
            // casing rules would turn 'I' into a dotless i).
            term = term.ToLowerInvariant();

            // An empty term has nothing to stem; without this guard the
            // last-character checks further down would index an empty buffer.
            if ( term.Length == 0 || !IsStemmable( term ) )
                return term;

            // Reset the StringBuilder.
            sb.Length = 0;
            sb.Append( term );

            // Stemming starts here...
            Substitute( sb );
            Strip( sb );
            Optimize( sb );
            Resubstitute( sb );
            RemoveParticleDenotion( sb );
            return sb.ToString();
        }

        /// <summary>
        /// Checks if a term could be stemmed.
        /// </summary>
        /// <param name="term">The (already lowercased) term to check.</param>
        /// <returns>true if, and only if, the given term consists of letters only.</returns>
        private bool IsStemmable( String term )
        {
            for ( int c = 0; c < term.Length; c++ )
            {
                if ( !Char.IsLetter( term[c] ) )
                    return false;
            }
            return true;
        }

        /// <summary>
        /// Tells whether the buffer ends with the given suffix (ordinal comparison),
        /// without materializing the buffer as a string.
        /// </summary>
        /// <param name="buffer">The buffer to inspect.</param>
        /// <param name="suffix">The suffix to look for.</param>
        /// <returns>true if the last characters of <c>buffer</c> equal <c>suffix</c>.</returns>
        private static bool EndsWith( StringBuilder buffer, String suffix )
        {
            int offset = buffer.Length - suffix.Length;
            if ( offset < 0 )
                return false;
            for ( int i = 0; i < suffix.Length; i++ )
            {
                if ( buffer[offset + i] != suffix[i] )
                    return false;
            }
            return true;
        }

        /// <summary>
        /// Suffix stripping (stemming) on the current term. The stripping is reduced
        /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
        /// from which all regular suffixes are built. The simplification causes
        /// some overstemming, and way more irregular stems, but still provides unique
        /// discriminators in most of those cases.
        /// The algorithm is context free, except for the length restrictions.
        /// </summary>
        /// <param name="buffer">The substituted term; modified in place.</param>
        private void Strip( StringBuilder buffer )
        {
            bool doMore = true;
            // Never strip a term down to fewer than three characters.
            while ( doMore && buffer.Length > 3 )
            {
                // Length the term had before Substitute() shortened it.
                int effectiveLength = buffer.Length + substCount;

                if ( effectiveLength > 5 && EndsWith( buffer, "nd" ) )
                {
                    buffer.Remove( buffer.Length - 2, 2 );
                }
                else if ( effectiveLength > 4 &&
                    ( EndsWith( buffer, "em" ) || EndsWith( buffer, "er" ) ) )
                {
                    buffer.Remove( buffer.Length - 2, 2 );
                }
                else
                {
                    char last = buffer[buffer.Length - 1];
                    // "t" occurs only as suffix of verbs.
                    if ( last == 'e' || last == 's' || last == 'n' || last == 't' )
                    {
                        buffer.Remove( buffer.Length - 1, 1 );
                    }
                    else
                    {
                        doMore = false;
                    }
                }
            }
        }

        /// <summary>
        /// Does some optimizations on the term. These optimizations are contextual.
        /// </summary>
        /// <param name="buffer">The stripped term; modified in place.</param>
        private void Optimize( StringBuilder buffer )
        {
            // Additional step for female plurals of professions and inhabitants.
            // The trailing '*' is the masked second 'n' of "...erinn(en)".
            if ( buffer.Length > 5 && EndsWith( buffer, "erin*" ) )
            {
                buffer.Remove( buffer.Length - 1, 1 );
                Strip( buffer );
            }
            // Additional step for irregular plural nouns like "Matrizen -> Matrix".
            // The length guard protects against an empty buffer.
            if ( buffer.Length > 0 && buffer[buffer.Length - 1] == 'z' )
            {
                buffer[buffer.Length - 1] = 'x';
            }
        }

        /// <summary>
        /// Removes a particle denotion ("ge") from a term: the first occurrence of
        /// "gege" is shortened to "ge".
        /// </summary>
        /// <param name="buffer">The resubstituted term; modified in place.</param>
        private void RemoveParticleDenotion( StringBuilder buffer )
        {
            if ( buffer.Length > 4 )
            {
                for ( int c = 0; c < buffer.Length - 3; c++ )
                {
                    if ( buffer[c] == 'g' && buffer[c + 1] == 'e' &&
                        buffer[c + 2] == 'g' && buffer[c + 3] == 'e' )
                    {
                        buffer.Remove( c, 2 );
                        return;
                    }
                }
            }
        }

        /// <summary>
        /// Do some substitutions for the term to reduce overstemming:
        ///
        /// - Substitute Umlauts with their corresponding vowel: &#228;&#246;&#252; -&gt; aou,
        ///   "&#223;" is substituted by "ss"
        /// - Substitute a second char of a pair of equal characters with
        ///   an asterisk: ?? -&gt; ?*
        /// - Substitute some common character combinations with a token:
        ///   sch/ch/ei/ie/ig/st -&gt; $/&#167;/%/&amp;/#/!
        ///
        /// Resets <c>substCount</c> and then counts the substitutions made.
        /// </summary>
        /// <param name="buffer">The lowercased term; modified in place.</param>
        protected virtual void Substitute( StringBuilder buffer )
        {
            substCount = 0;
            for ( int c = 0; c < buffer.Length; c++ )
            {
                // Replace the second char of a pair of equal characters with an asterisk.
                if ( c > 0 && buffer[c] == buffer[c - 1] )
                {
                    buffer[c] = '*';
                }
                // Substitute Umlauts (written as escapes so the source survives re-encoding).
                else if ( buffer[c] == '\u00E4' ) // a-umlaut
                {
                    buffer[c] = 'a';
                }
                else if ( buffer[c] == '\u00F6' ) // o-umlaut
                {
                    buffer[c] = 'o';
                }
                else if ( buffer[c] == '\u00FC' ) // u-umlaut
                {
                    buffer[c] = 'u';
                }
                // Sharp s becomes "ss"; handled here so that it is also replaced
                // at the end of a word. The inserted 's' is masked as '*' by the
                // pair rule on the next iteration.
                else if ( buffer[c] == '\u00DF' ) // sharp s
                {
                    buffer[c] = 's';
                    buffer.Insert( c + 1, 's' );
                    substCount++;
                }

                // Take care that at least one character is left on the right side of the current one.
                if ( c < buffer.Length - 1 )
                {
                    // Masking several common character combinations with a token.
                    if ( ( c < buffer.Length - 2 ) && buffer[c] == 's' &&
                        buffer[c + 1] == 'c' && buffer[c + 2] == 'h' )
                    {
                        buffer[c] = '$';
                        buffer.Remove( c + 1, 2 );
                        // Two characters were removed. This used to read "=+ 2",
                        // which overwrote the running count instead of adding to it.
                        substCount += 2;
                    }
                    else if ( buffer[c] == 'c' && buffer[c + 1] == 'h' )
                    {
                        buffer[c] = '\u00A7'; // section sign, token for "ch"
                        buffer.Remove( c + 1, 1 );
                        substCount++;
                    }
                    else if ( buffer[c] == 'e' && buffer[c + 1] == 'i' )
                    {
                        buffer[c] = '%';
                        buffer.Remove( c + 1, 1 );
                        substCount++;
                    }
                    else if ( buffer[c] == 'i' && buffer[c + 1] == 'e' )
                    {
                        buffer[c] = '&';
                        buffer.Remove( c + 1, 1 );
                        substCount++;
                    }
                    else if ( buffer[c] == 'i' && buffer[c + 1] == 'g' )
                    {
                        buffer[c] = '#';
                        buffer.Remove( c + 1, 1 );
                        substCount++;
                    }
                    else if ( buffer[c] == 's' && buffer[c + 1] == 't' )
                    {
                        buffer[c] = '!';
                        buffer.Remove( c + 1, 1 );
                        substCount++;
                    }
                }
            }
        }

        /// <summary>
        /// Undoes the changes made by Substitute(). That are character pairs and
        /// character combinations. Umlauts will remain as their corresponding vowel,
        /// as "&#223;" remains as "ss".
        /// </summary>
        /// <param name="buffer">The stemmed term; modified in place.</param>
        private void Resubstitute( StringBuilder buffer )
        {
            for ( int c = 0; c < buffer.Length; c++ )
            {
                char current = buffer[c];
                if ( current == '*' )
                {
                    // An asterisk repeats its predecessor; a leading asterisk has
                    // none and is left untouched instead of indexing before the buffer.
                    if ( c > 0 )
                    {
                        buffer[c] = buffer[c - 1];
                    }
                }
                else if ( current == '$' )
                {
                    buffer[c] = 's';
                    buffer.Insert( c + 1, "ch" );
                }
                else if ( current == '\u00A7' ) // section sign, token for "ch"
                {
                    buffer[c] = 'c';
                    buffer.Insert( c + 1, 'h' );
                }
                else if ( current == '%' )
                {
                    buffer[c] = 'e';
                    buffer.Insert( c + 1, 'i' );
                }
                else if ( current == '&' )
                {
                    buffer[c] = 'i';
                    buffer.Insert( c + 1, 'e' );
                }
                else if ( current == '#' )
                {
                    buffer[c] = 'i';
                    buffer.Insert( c + 1, 'g' );
                }
                else if ( current == '!' )
                {
                    buffer[c] = 's';
                    buffer.Insert( c + 1, 't' );
                }
            }
        }
    }

}