// Extracted from: d5/d07/_analyzers_2_fr_2_french_stemmer_8cs_source.html

/*

 *

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 *

*/


using System;

using System.Text;


namespace Lucene.Net.Analysis.Fr

{


/*

 * A stemmer for French words.

 * <p>

 * The algorithm is based on the work of

 * Dr Martin Porter on his snowball project<br>

 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>

 * (French stemming algorithm) for details

 * </p>

 */


public class FrenchStemmer
{
    // NOTE: an instance keeps mutable working state (the buffers and region
    // strings below) between the steps of a single Stem call, so one instance
    // must not be used by several threads at the same time.

    // ---------------------------------------------------------------------
    // Suffix tables. They are never modified, so they are allocated once
    // instead of on every call. The order inside each table matters: the
    // first entry that matches wins, so longer suffixes come first.
    // Upper-case 'I', 'U' and 'Y' are the markers written by TreatVowels.
    // ---------------------------------------------------------------------

    private static readonly string[] Step1PlainSuffixes =
        { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };

    private static readonly string[] LogieSuffixes = { "logies", "logie" };

    private static readonly string[] UsionSuffixes = { "usions", "utions", "usion", "ution" };

    private static readonly string[] EnceSuffixes = { "ences", "ence" };

    private static readonly string[] AtionSuffixes =
        { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };

    private static readonly string[] EmentSuffixes = { "ements", "ement" };

    private static readonly string[] IssementSuffixes = { "issements", "issement" };

    // "ités", "ité"
    private static readonly string[] IteSuffixes = { "it\u00e9s", "it\u00e9" };

    private static readonly string[] IfSuffixes = { "ifs", "ives", "if", "ive" };

    private static readonly string[] EauxSuffixes = { "eaux" };

    private static readonly string[] AuxSuffixes = { "aux" };

    private static readonly string[] EuseSuffixes = { "euses", "euse" };

    private static readonly string[] EuxSuffixes = { "eux" };

    private static readonly string[] AmmentSuffixes = { "amment" };

    private static readonly string[] EmmentSuffixes = { "emment" };

    private static readonly string[] MentSuffixes = { "ments", "ment" };

    // "îmes", "îtes", ... , "ît", ...
    private static readonly string[] Step2ASuffixes =
    {
        "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira",
        "irent", "iriez", "irez", "irions", "irons", "iront",
        "issaIent", "issais", "issantes", "issante", "issants", "issant",
        "issait", "issions", "issons", "issiez", "issez", "issent",
        "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i"
    };

    // "...", "èrent", "era", "ées", "iez", "ée", "és", "er", "ez", "é"
    private static readonly string[] Step2BVerbSuffixes =
    {
        "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
        "erons", "eront", "erez", "\u00e8rent", "era", "\u00e9es", "iez",
        "\u00e9e", "\u00e9s", "er", "ez", "\u00e9"
    };

    // "...", "âmes", "âtes", ..., "aît", ..., "Aît", ..., "ât", ...
    private static readonly string[] Step2BASuffixes =
    {
        "assions", "assiez", "assent", "asses", "asse", "aIent",
        "antes", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant",
        "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a"
    };

    private static readonly string[] IonsSuffixes = { "ions" };

    private static readonly string[] IonSuffixes = { "ion" };

    // "Ière", "ière", "Ier", "ier"
    private static readonly string[] IereSuffixes = { "I\u00e8re", "i\u00e8re", "Ier", "ier" };

    private static readonly string[] ESuffixes = { "e" };

    // "ë"
    private static readonly string[] ETremaSuffixes = { "\u00eb" };

    private static readonly string[] DoubledEndings = { "enn", "onn", "ett", "ell", "eill" };

    /// <summary>
    /// Buffer holding the term while it is being stemmed.
    /// </summary>
    private readonly StringBuilder sb = new StringBuilder();

    /// <summary>
    /// Temporary buffer used to compute R2 from R1.
    /// </summary>
    private readonly StringBuilder tb = new StringBuilder();

    /// <summary>
    /// Region R0: the whole buffer.
    /// </summary>
    private string R0;

    /// <summary>
    /// Region RV: "If the word begins with two vowels, RV is the region after the third letter,
    /// otherwise the region after the first vowel not at the beginning of the word,
    /// or the end of the word if these positions cannot be found."
    /// </summary>
    private string RV;

    /// <summary>
    /// Region R1: "R1 is the region after the first non-vowel following a vowel
    /// or is the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    private string R1;

    /// <summary>
    /// Region R2: "R2 is the region after the first non-vowel in R1 following a vowel
    /// or is the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    private string R2;

    /// <summary>
    /// Set to true when step 2 has to be performed even though step 1 changed the buffer.
    /// </summary>
    private bool suite;

    /// <summary>
    /// Set to true as soon as the buffer has been modified.
    /// </summary>
    private bool modified;

    /// <summary>
    /// Stems the given term to a unique discriminator.
    /// </summary>
    /// <param name="term">The term that should be stemmed.</param>
    /// <returns>
    /// The stem of <paramref name="term"/>, or <paramref name="term"/> itself (unchanged)
    /// when it is null or is rejected by <c>IsStemmable</c>.
    /// </returns>
    protected internal String Stem(String term)
    {
        if (term == null || !IsStemmable(term))
        {
            return term;
        }

        // Lower-case independently of the current culture: with a culture-sensitive
        // conversion a leading 'I' would become a dotless 'ı' under Turkish rules
        // and the suffix tables would no longer match.
        term = term.ToLowerInvariant();

        sb.Length = 0;
        sb.Append(term);

        modified = false;
        suite = false;

        TreatVowels(sb);
        SetStrings();

        Step1();

        // Step 2 runs when step 1 changed nothing, or when it explicitly asked for it.
        if ((!modified || suite) && RV != null)
        {
            suite = Step2A();
            if (!suite)
            {
                Step2B();
            }
        }

        if (modified || suite)
        {
            Step3();
        }
        else
        {
            Step4();
        }

        Step5();
        Step6();

        return sb.ToString();
    }

    /// <summary>
    /// Recomputes the region strings R0, RV, R1 and R2 from the buffer.
    /// Must be called every time the buffer has been modified.
    /// </summary>
    private void SetStrings()
    {
        R0 = sb.ToString();
        RV = RetrieveRV(sb);
        R1 = RetrieveR(sb);

        if (R1 == null)
        {
            R2 = null;
            return;
        }

        // R2 is obtained by applying the R1 rule to R1 itself.
        tb.Length = 0;
        tb.Append(R1);
        R2 = RetrieveR(tb);
    }

    /// <summary>
    /// First step of the algorithm (standard suffix removal).
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    /// <remarks>
    /// The region fields are re-read for every call on purpose: each helper refreshes
    /// them when it changes the buffer, so later rules see the updated regions.
    /// </remarks>
    private void Step1()
    {
        DeleteFrom(R2, Step1PlainSuffixes);

        ReplaceFrom(R2, LogieSuffixes, "log");
        ReplaceFrom(R2, UsionSuffixes, "u");
        ReplaceFrom(R2, EnceSuffixes, "ent");

        DeleteButSuffixFromElseReplace(R2, AtionSuffixes, "ic", true, R0, "iqU");

        DeleteButSuffixFromElseReplace(R2, EmentSuffixes, "eus", false, R0, "eux");
        DeleteButSuffixFrom(R2, EmentSuffixes, "ativ", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "iv", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "abl", false);
        DeleteButSuffixFrom(R2, EmentSuffixes, "iqU", false);

        DeleteFromIfTestVowelBeforeIn(R1, IssementSuffixes, false, R0);
        DeleteFrom(RV, EmentSuffixes);

        DeleteButSuffixFromElseReplace(R2, IteSuffixes, "abil", false, R0, "abl");
        DeleteButSuffixFromElseReplace(R2, IteSuffixes, "ic", false, R0, "iqU");
        DeleteButSuffixFrom(R2, IteSuffixes, "iv", true);

        DeleteButSuffixFromElseReplace(R2, IfSuffixes, "icat", false, R0, "iqU");
        DeleteButSuffixFromElseReplace(R2, IfSuffixes, "at", true, R2, "iqU");

        ReplaceFrom(R0, EauxSuffixes, "eau");

        ReplaceFrom(R1, AuxSuffixes, "al");

        DeleteButSuffixFromElseReplace(R2, EuseSuffixes, "", true, R1, "eux");

        DeleteFrom(R2, EuxSuffixes);

        // If one of the next rules fires, step 2A has to run as well.
        if (ReplaceFrom(RV, AmmentSuffixes, "ant"))
        {
            suite = true;
        }

        if (ReplaceFrom(RV, EmmentSuffixes, "ent"))
        {
            suite = true;
        }

        if (DeleteFromIfTestVowelBeforeIn(RV, MentSuffixes, true, RV))
        {
            suite = true;
        }
    }

    /// <summary>
    /// Second step (A) of the algorithm. It is performed if nothing changed in the first
    /// step, or if the change concerned the amment, emment, ments or ment suffixes.
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    /// <returns>true if the buffer was changed.</returns>
    private bool Step2A()
    {
        return DeleteFromIfTestVowelBeforeIn(RV, Step2ASuffixes, false, RV);
    }

    /// <summary>
    /// Second step (B) of the algorithm. It is performed when step 2A changed nothing.
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    private void Step2B()
    {
        DeleteFrom(RV, Step2BVerbSuffixes);

        DeleteButSuffixFrom(RV, Step2BASuffixes, "e", true);

        DeleteFrom(R2, IonsSuffixes);
    }

    /// <summary>
    /// Third step of the algorithm: a final 'Y' becomes 'i' and a final 'ç' becomes 'c'.
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    private void Step3()
    {
        if (sb.Length == 0)
        {
            return;
        }

        int last = sb.Length - 1;
        char ch = sb[last];

        if (ch == 'Y')
        {
            sb[last] = 'i';
            SetStrings();
        }
        else if (ch == '\u00e7') // 'ç'
        {
            sb[last] = 'c';
            SetStrings();
        }
    }

    /// <summary>
    /// Fourth step of the algorithm (residual suffixes).
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    private void Step4()
    {
        // Drop a final 's' unless it is preceded by a, i, o, u, è or s.
        if (sb.Length > 1 && sb[sb.Length - 1] == 's')
        {
            char before = sb[sb.Length - 2];
            if (before != 'a' && before != 'i' && before != 'o' && before != 'u'
                && before != '\u00e8' && before != 's')
            {
                sb.Length -= 1;
                SetStrings();
            }
        }

        // Delete "ion" when it lies in R2 and is preceded by 's' or 't' in RV.
        if (!DeleteFromIfPrecededIn(R2, IonSuffixes, RV, "s"))
        {
            DeleteFromIfPrecededIn(R2, IonSuffixes, RV, "t");
        }

        ReplaceFrom(RV, IereSuffixes, "i");
        DeleteFrom(RV, ESuffixes);
        DeleteFromIfPrecededIn(RV, ETremaSuffixes, R0, "gu");
    }

    /// <summary>
    /// Fifth step of the algorithm: if the word ends with enn, onn, ett, ell or eill,
    /// the last letter is removed.
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    private void Step5()
    {
        if (R0 == null)
        {
            return;
        }

        foreach (string ending in DoubledEndings)
        {
            if (EndsWithOrdinal(R0, ending))
            {
                sb.Length -= 1;
                SetStrings();
                return;
            }
        }
    }

    /// <summary>
    /// Sixth and last step of the algorithm: if the last vowel of the word is 'é' or 'è'
    /// and it is followed by at least one non-vowel, it is replaced by 'e'.
    /// Refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation.
    /// </summary>
    private void Step6()
    {
        if (R0 == null || R0.Length == 0)
        {
            return;
        }

        bool seenConsonant = false;
        for (int i = R0.Length - 1; i >= 0; i--)
        {
            char ch = R0[i];
            if (!IsVowel(ch))
            {
                seenConsonant = true;
                continue;
            }

            // Only the vowel closest to the end of the word is ever considered.
            if (seenConsonant && (ch == '\u00e9' || ch == '\u00e8')) // 'é' or 'è'
            {
                sb[i] = 'e';
            }

            return;
        }
    }

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/> if zone
    /// <paramref name="from"/> ends with <paramref name="prefix"/> followed by that suffix.
    /// Does not touch the <c>modified</c> flag.
    /// </summary>
    /// <param name="source">The primary zone searched for the suffix (may be null).</param>
    /// <param name="search">The candidate suffixes, tried in order.</param>
    /// <param name="from">The secondary zone in which the prefix is looked for (may be null).</param>
    /// <param name="prefix">The text that must precede the suffix in <paramref name="from"/>.</param>
    /// <returns>true if the buffer was changed.</returns>
    private bool DeleteFromIfPrecededIn(string source, string[] search, string from, string prefix)
    {
        if (source == null || from == null)
        {
            return false;
        }

        foreach (string suffix in search)
        {
            if (EndsWithOrdinal(source, suffix) && EndsWithOrdinal(from, prefix + suffix))
            {
                sb.Length -= suffix.Length;
                SetStrings();
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/> if the letter preceding it
    /// is (or is not) a vowel.
    /// </summary>
    /// <param name="source">The primary zone searched for the suffix (may be null).</param>
    /// <param name="search">The candidate suffixes, tried in order.</param>
    /// <param name="vowel">true if a vowel is required before the suffix, false if a non-vowel is.</param>
    /// <param name="from">The zone that must contain the suffix and the preceding letter (may be null).</param>
    /// <returns>true if the buffer was changed.</returns>
    private bool DeleteFromIfTestVowelBeforeIn(string source, string[] search, bool vowel, string from)
    {
        if (source == null || from == null)
        {
            return false;
        }

        foreach (string suffix in search)
        {
            // The zone must be long enough to hold the suffix plus the letter before it.
            if (!EndsWithOrdinal(source, suffix) || suffix.Length + 1 > from.Length)
            {
                continue;
            }

            bool precededByVowel = IsVowel(sb[sb.Length - (suffix.Length + 1)]);
            if (precededByVowel == vowel)
            {
                sb.Length -= suffix.Length;
                modified = true;
                SetStrings();
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/> together with the
    /// <paramref name="prefix"/> preceding it; optionally deletes the bare suffix
    /// when the prefix is absent.
    /// </summary>
    /// <param name="source">The zone searched for the suffix (may be null).</param>
    /// <param name="search">The candidate suffixes, tried in order.</param>
    /// <param name="prefix">The text expected just before the suffix.</param>
    /// <param name="without">true if the suffix is deleted even when the prefix is not found.</param>
    private void DeleteButSuffixFrom(string source, string[] search, string prefix, bool without)
    {
        if (source == null)
        {
            return;
        }

        foreach (string suffix in search)
        {
            int removeCount;
            if (EndsWithOrdinal(source, prefix + suffix))
            {
                removeCount = prefix.Length + suffix.Length;
            }
            else if (without && EndsWithOrdinal(source, suffix))
            {
                removeCount = suffix.Length;
            }
            else
            {
                continue;
            }

            sb.Length -= removeCount;
            modified = true;
            SetStrings();
            return;
        }
    }

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/> together with the
    /// <paramref name="prefix"/> preceding it; otherwise, if zone <paramref name="from"/>
    /// ends with prefix + suffix, replaces both with <paramref name="replace"/>;
    /// otherwise optionally deletes the bare suffix.
    /// </summary>
    /// <param name="source">The primary zone searched for the suffix (may be null).</param>
    /// <param name="search">The candidate suffixes, tried in order.</param>
    /// <param name="prefix">The text expected just before the suffix.</param>
    /// <param name="without">true if the suffix is deleted even when the prefix is not found.</param>
    /// <param name="from">The secondary zone used for the replacement rule (may be null).</param>
    /// <param name="replace">The text written in place of prefix + suffix.</param>
    private void DeleteButSuffixFromElseReplace(string source, string[] search, string prefix, bool without, string from, string replace)
    {
        if (source == null)
        {
            return;
        }

        foreach (string suffix in search)
        {
            string prefixed = prefix + suffix;

            if (EndsWithOrdinal(source, prefixed))
            {
                sb.Length -= prefixed.Length;
            }
            else if (from != null && EndsWithOrdinal(from, prefixed))
            {
                sb.Length -= prefixed.Length;
                sb.Append(replace);
            }
            else if (without && EndsWithOrdinal(source, suffix))
            {
                sb.Length -= suffix.Length;
            }
            else
            {
                continue;
            }

            modified = true;
            SetStrings();
            return;
        }
    }

    /// <summary>
    /// Replaces a suffix found in zone <paramref name="source"/> with another string.
    /// </summary>
    /// <param name="source">The zone searched for the suffix (may be null).</param>
    /// <param name="search">The candidate suffixes, tried in order.</param>
    /// <param name="replace">The replacement string.</param>
    /// <returns>true if the buffer was changed.</returns>
    private bool ReplaceFrom(string source, string[] search, string replace)
    {
        if (source == null)
        {
            return false;
        }

        foreach (string suffix in search)
        {
            if (EndsWithOrdinal(source, suffix))
            {
                sb.Length -= suffix.Length;
                sb.Append(replace);
                modified = true;
                SetStrings();
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Deletes a suffix found in zone <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The zone searched for the suffix (may be null).</param>
    /// <param name="suffix">The candidate suffixes, tried in order.</param>
    private void DeleteFrom(string source, string[] suffix)
    {
        if (source == null)
        {
            return;
        }

        foreach (string candidate in suffix)
        {
            if (EndsWithOrdinal(source, candidate))
            {
                sb.Length -= candidate.Length;
                modified = true;
                SetStrings();
                return;
            }
        }
    }

    /// <summary>
    /// Tests whether <paramref name="value"/> ends with <paramref name="suffix"/> by comparing
    /// characters one by one. The single-argument String.EndsWith overload performs a
    /// culture-sensitive comparison, which is not wanted when matching fixed suffix tables.
    /// </summary>
    private static bool EndsWithOrdinal(string value, string suffix)
    {
        return value.EndsWith(suffix, StringComparison.Ordinal);
    }

    /// <summary>
    /// Tests whether a char is a (lower-case) French vowel, including accented ones.
    /// The upper-case markers 'I', 'U' and 'Y' are deliberately not vowels.
    /// </summary>
    /// <param name="ch">The char to test.</param>
    /// <returns>true if the char is a vowel.</returns>
    private static bool IsVowel(char ch)
    {
        switch (ch)
        {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case '\u00e2': // â
            case '\u00e0': // à
            case '\u00eb': // ë
            case '\u00e9': // é
            case '\u00ea': // ê
            case '\u00e8': // è
            case '\u00ef': // ï
            case '\u00ee': // î
            case '\u00f4': // ô
            case '\u00fc': // ü
            case '\u00f9': // ù
            case '\u00fb': // û
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Retrieves the "R zone" (R1 or R2 depending on the buffer passed in):
    /// "R is the region after the first non-vowel following a vowel
    /// or is the null region at the end of the word if there is no such non-vowel".
    /// </summary>
    /// <param name="buffer">The buffer to analyse.</param>
    /// <returns>The region, or null when it does not exist or would be empty.</returns>
    private static string RetrieveR(StringBuilder buffer)
    {
        int len = buffer.Length;

        int firstVowel = -1;
        for (int c = 0; c < len; c++)
        {
            if (IsVowel(buffer[c]))
            {
                firstVowel = c;
                break;
            }
        }

        if (firstVowel < 0)
        {
            return null;
        }

        int consonant = -1;
        for (int c = firstVowel; c < len; c++)
        {
            if (!IsVowel(buffer[c]))
            {
                consonant = c;
                break;
            }
        }

        if (consonant > -1 && consonant + 1 < len)
        {
            return buffer.ToString(consonant + 1, len - (consonant + 1));
        }

        return null;
    }

    /// <summary>
    /// Retrieves the "RV zone" from a buffer:
    /// "If the word begins with two vowels, RV is the region after the third letter,
    /// otherwise the region after the first vowel not at the beginning of the word,
    /// or the end of the word if these positions cannot be found."
    /// </summary>
    /// <param name="buffer">The buffer to analyse.</param>
    /// <returns>The region, or null for buffers of three chars or fewer or when it would be empty.</returns>
    private static string RetrieveRV(StringBuilder buffer)
    {
        int len = buffer.Length;
        if (len <= 3)
        {
            return null;
        }

        if (IsVowel(buffer[0]) && IsVowel(buffer[1]))
        {
            return buffer.ToString(3, len - 3);
        }

        // NOTE(review): when no vowel exists past the first letter, pos stays 0 and the
        // region after the first letter is returned rather than the empty region the
        // quoted rule describes. Kept as-is so that existing stems do not change.
        int pos = 0;
        for (int c = 1; c < len; c++)
        {
            if (IsVowel(buffer[c]))
            {
                pos = c;
                break;
            }
        }

        if (pos + 1 < len)
        {
            return buffer.ToString(pos + 1, len - (pos + 1));
        }

        return null;
    }

    /// <summary>
    /// Marks the vowels that act as consonants by turning them to upper case, in place:
    /// 'u' and 'i' preceded AND followed by a vowel, 'y' preceded OR followed by a vowel,
    /// and 'u' preceded by 'q'.
    /// </summary>
    /// <remarks>
    /// The scan runs left to right on the buffer being modified, so a letter already
    /// marked (upper case) no longer counts as a vowel for the letters after it.
    /// </remarks>
    /// <param name="buffer">The buffer to treat.</param>
    /// <returns>The same buffer instance, after treatment.</returns>
    private static StringBuilder TreatVowels(StringBuilder buffer)
    {
        int lastIndex = buffer.Length - 1;

        for (int c = 0; c <= lastIndex; c++)
        {
            char ch = buffer[c];

            if (c == 0)
            {
                // First char: only 'y' followed by a vowel is marked.
                if (lastIndex > 0 && ch == 'y' && IsVowel(buffer[1]))
                {
                    buffer[c] = 'Y';
                }
            }
            else if (c == lastIndex)
            {
                // Last char: only the preceding letter can be inspected.
                if (ch == 'u' && buffer[c - 1] == 'q')
                {
                    buffer[c] = 'U';
                }
                else if (ch == 'y' && IsVowel(buffer[c - 1]))
                {
                    buffer[c] = 'Y';
                }
            }
            else
            {
                switch (ch)
                {
                    case 'u':
                        if (buffer[c - 1] == 'q'
                            || (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1])))
                        {
                            buffer[c] = 'U';
                        }
                        break;

                    case 'i':
                        if (IsVowel(buffer[c - 1]) && IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'I';
                        }
                        break;

                    case 'y':
                        if (IsVowel(buffer[c - 1]) || IsVowel(buffer[c + 1]))
                        {
                            buffer[c] = 'Y';
                        }
                        break;
                }
            }
        }

        return buffer;
    }

    /// <summary>
    /// Checks whether a term can be processed correctly.
    /// </summary>
    /// <param name="term">The term to check; must not be null.</param>
    /// <returns>
    /// true if, and only if, the term consists of letters only and contains no
    /// upper-case letter except possibly as its first char.
    /// </returns>
    private static bool IsStemmable(string term)
    {
        for (int c = 0; c < term.Length; c++)
        {
            char ch = term[c];

            // Discard terms that contain non-letter chars.
            if (!char.IsLetter(ch))
            {
                return false;
            }

            // An upper-case letter anywhere but at the start disqualifies the term;
            // this also covers terms with several upper-case letters.
            if (c > 0 && char.IsUpper(ch))
            {
                return false;
            }
        }

        return true;
    }
}


}