// d7/dd3/core_2_analysis_2_wordlist_loader_8cs_source.html

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 * http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


using System.Collections.Generic;


namespace Lucene.Net.Analysis

{


    /// <summary> Loader for text files that represent a list of stopwords.</summary>

    public class WordlistLoader

    {


        /// <summary>
        /// Builds a word set from a text file: every line of the file becomes one entry,
        /// with leading and trailing whitespace removed. The file is decoded with
        /// <c>System.Text.Encoding.Default</c> and is closed before this method returns.
        /// Entries are stored as written apart from the trimming, so a list intended for
        /// an Analyzer that uses LowerCaseFilter (like StandardAnalyzer) should already
        /// be in lowercase.
        /// </summary>
        /// <param name="wordfile">File containing the wordlist, one word per line</param>
        /// <returns>A set with the file's words</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile)
        {
            string path = wordfile.FullName;
            System.Text.Encoding encoding = System.Text.Encoding.Default;

            // The actual line handling lives in the TextReader overload; this method
            // only owns opening and disposing the file.
            using (System.IO.TextReader input = new System.IO.StreamReader(path, encoding))
            {
                ISet<string> words = GetWordSet(input);
                return words;
            }
        }


        /// <summary>
        /// Builds a word set from a text file while leaving out comment lines: every line
        /// that does not begin with <paramref name="comment"/> becomes one entry, with
        /// leading and trailing whitespace removed. The file is decoded with
        /// <c>System.Text.Encoding.Default</c> and is closed before this method returns.
        /// Entries are stored as written apart from the trimming, so a list intended for
        /// an Analyzer that uses LowerCaseFilter (like StandardAnalyzer) should already
        /// be in lowercase.
        /// </summary>
        /// <param name="wordfile">File containing the wordlist, one word per line</param>
        /// <param name="comment">The comment string; lines starting with it are ignored</param>
        /// <returns>A set with the file's words</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, System.String comment)
        {
            string path = wordfile.FullName;
            System.Text.Encoding encoding = System.Text.Encoding.Default;

            // Comment filtering and trimming are done by the TextReader overload; this
            // method only owns opening and disposing the file.
            using (System.IO.TextReader input = new System.IO.StreamReader(path, encoding))
            {
                ISet<string> words = GetWordSet(input, comment);
                return words;
            }
        }


        /// <summary>
        /// Consumes a reader line by line and collects each line, stripped of leading and
        /// trailing whitespace, into a set. Every line is added, so a blank line
        /// contributes an empty-string entry. The reader is read until it is exhausted
        /// and is not closed by this method. Entries are stored as written apart from
        /// the trimming, so a list intended for an Analyzer that uses LowerCaseFilter
        /// (like StandardAnalyzer) should already be in lowercase.
        /// </summary>
        /// <param name="reader">Reader containing the wordlist, one word per line</param>
        /// <returns>A set with the reader's words</returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader)
        {
            var words = Support.Compatibility.SetFactory.CreateHashSet<string>();

            // ReadLine returns null once the reader has no more lines.
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                words.Add(line.Trim());
            }

            return words;
        }


        /// <summary>
        /// Reads lines from a reader and adds every non-comment line as an entry to a set
        /// (omitting leading and trailing whitespace). A line counts as a comment when
        /// its raw text, before trimming, begins with <paramref name="comment"/>; the
        /// match is an ordinal (character-by-character) comparison, so it does not vary
        /// with the current culture. The reader is read until it is exhausted and is not
        /// closed by this method. Entries are stored as written apart from the trimming,
        /// so a list intended for an Analyzer that uses LowerCaseFilter (like
        /// StandardAnalyzer) should already be in lowercase.
        /// </summary>
        /// <remarks>
        /// Because the marker is tested against the untrimmed line, a comment marker
        /// preceded by whitespace is not recognized and the line is added as a word.
        /// An empty <paramref name="comment"/> matches every line, so nothing is added.
        /// </remarks>
        /// <param name="reader">Reader containing the wordlist, one word per line</param>
        /// <param name="comment">The string representing a comment.</param>
        /// <returns>A set with the reader's non-comment words</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="comment"/> is null and the reader yields at least one line.
        /// </exception>
        public static ISet<string> GetWordSet(System.IO.TextReader reader, System.String comment)
        {
            var result = Support.Compatibility.SetFactory.CreateHashSet<string>();

            System.String word;
            while ((word = reader.ReadLine()) != null)
            {
                // String.StartsWith(string) without a comparison argument is
                // culture-sensitive; a comment marker is a literal prefix, so compare
                // ordinally to get the same answer under every culture.
                if (!word.StartsWith(comment, System.StringComparison.Ordinal))
                {
                    result.Add(word.Trim());
                }
            }

            return result;
        }


        /// <summary>
        /// Reads a stem dictionary from a text file. Each entry line contains:
        /// <c>word<b>\t</b>stem</c>
        /// (i.e. two tab separated words). The line is split at its first tab only, so
        /// any further tabs stay part of the stem. If a word occurs more than once, the
        /// last line for it wins. Blank (empty or whitespace-only) lines without a tab
        /// are skipped. The file is decoded with <c>System.Text.Encoding.Default</c> and
        /// is closed before this method returns.
        /// </summary>
        /// <param name="wordstemfile">File containing the tab separated word/stem pairs</param>
        /// <returns>Stem dictionary that overrules the stemming algorithm</returns>
        /// <exception cref="System.NullReferenceException">
        /// <paramref name="wordstemfile"/> is null.
        /// </exception>
        /// <exception cref="System.FormatException">
        /// A non-blank line does not contain a tab character.
        /// </exception>
        /// <exception cref="System.IO.IOException">The file could not be opened or read.</exception>
        public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile)
        {
            // NullReferenceException (not ArgumentNullException) is thrown on purpose:
            // it is the exception type this method has always raised for a null file,
            // and callers may be catching it.
            if (wordstemfile == null)
            {
                throw new System.NullReferenceException("wordstemfile may not be null");
            }

            var result = new Dictionary<string, string>();

            // A single reader over the file is sufficient; the using block guarantees
            // it is closed exactly once, even if reading or parsing throws.
            using (var reader = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
            {
                System.String line;
                while ((line = reader.ReadLine()) != null)
                {
                    int tabIndex = line.IndexOf('\t');
                    if (tabIndex < 0)
                    {
                        // Blank lines (for example a trailing empty line) carry no
                        // mapping and are ignored.
                        if (line.Trim().Length == 0)
                        {
                            continue;
                        }

                        // Anything else without a tab is a malformed entry; report it
                        // with the offending text instead of failing on an array index.
                        throw new System.FormatException(
                            "Stem dictionary line is not of the form word<TAB>stem: '" + line + "'");
                    }

                    // Split at the first tab only: everything after it is the stem.
                    result[line.Substring(0, tabIndex)] = line.Substring(tabIndex + 1);
                }
            }

            return result;
        }

    }

}