Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
WordlistLoader.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System.Collections.Generic;
19 
20 namespace Lucene.Net.Analysis
21 {
22 
23  /// <summary> Loader for text files that represent a list of stopwords.</summary>
24  public class WordlistLoader
25  {
26 
/// <summary>
/// Opens <paramref name="wordfile"/> as text (decoded with
/// <c>System.Text.Encoding.Default</c>) and hands the reader to
/// <c>GetWordSet(System.IO.TextReader)</c>, so every line of the file becomes one
/// entry of the returned set with leading and trailing whitespace removed.
/// Each line is expected to hold exactly one word. The words need to be in
/// lowercase if you make use of an Analyzer which uses LowerCaseFilter
/// (like StandardAnalyzer).
/// </summary>
/// <param name="wordfile">File containing the wordlist, one word per line</param>
/// <returns>A set with the file's words</returns>
public static ISet<string> GetWordSet(System.IO.FileInfo wordfile)
{
    ISet<string> words;

    // The reader (and the underlying file) is released as soon as the
    // whole list has been read.
    using (var input = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
    {
        words = GetWordSet(input);
    }

    return words;
}
41 
/// <summary>
/// Opens <paramref name="wordfile"/> as text (decoded with
/// <c>System.Text.Encoding.Default</c>) and hands the reader, together with
/// <paramref name="comment"/>, to <c>GetWordSet(System.IO.TextReader, System.String)</c>.
/// Every non-comment line of the file becomes one entry of the returned set
/// with leading and trailing whitespace removed. Each line is expected to
/// hold exactly one word. The words need to be in lowercase if you make use
/// of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="wordfile">File containing the wordlist, one word per line</param>
/// <param name="comment">The comment string; lines starting with it are ignored</param>
/// <returns>A set with the file's words</returns>
public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, System.String comment)
{
    ISet<string> words;

    // The reader (and the underlying file) is released as soon as the
    // whole list has been read.
    using (var input = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
    {
        words = GetWordSet(input, comment);
    }

    return words;
}
57 
58 
/// <summary>
/// Reads <paramref name="reader"/> line by line until it is exhausted and
/// adds every line, with leading and trailing whitespace removed, to a set
/// obtained from <c>SetFactory.CreateHashSet</c>. Blank lines are not
/// filtered out: they contribute the empty string. Each line is expected to
/// hold exactly one word. The words need to be in lowercase if you make use
/// of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
/// The reader is not closed by this method.
/// </summary>
/// <param name="reader">Reader containing the wordlist, one word per line</param>
/// <returns>A set with the reader's words</returns>
public static ISet<string> GetWordSet(System.IO.TextReader reader)
{
    var words = Support.Compatibility.SetFactory.CreateHashSet<string>();

    // ReadLine() returns null once the end of the input has been reached.
    for (System.String line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
        words.Add(line.Trim());
    }

    return words;
}
78 
/// <summary>
/// Reads <paramref name="reader"/> line by line until it is exhausted and
/// adds every non-comment line, with leading and trailing whitespace
/// removed, to a set obtained from <c>SetFactory.CreateHashSet</c>.
/// A line is a comment when it starts with <paramref name="comment"/>; the
/// check is an ordinal (culture-independent) comparison against the
/// untrimmed line. Each line is expected to hold exactly one word. The words
/// need to be in lowercase if you make use of an Analyzer which uses
/// LowerCaseFilter (like StandardAnalyzer).
/// The reader is not closed by this method.
/// </summary>
/// <param name="reader">Reader containing the wordlist, one word per line</param>
/// <param name="comment">
/// The string that marks a comment line. When null or empty, no line is
/// treated as a comment and every line is added.
/// </param>
/// <returns>A set with the reader's non-comment words</returns>
public static ISet<string> GetWordSet(System.IO.TextReader reader, System.String comment)
{
    var result = Support.Compatibility.SetFactory.CreateHashSet<string>();

    // Every string starts with the empty string, so an empty marker would
    // discard the whole list; a null marker would throw inside the loop.
    // Both are interpreted as "no comment syntax".
    bool hasComment = !System.String.IsNullOrEmpty(comment);

    System.String word;
    while ((word = reader.ReadLine()) != null)
    {
        // Ordinal comparison keeps the result independent of the current
        // culture (the single-argument StartsWith is culture-sensitive).
        if (hasComment && word.StartsWith(comment, System.StringComparison.Ordinal))
        {
            continue;
        }

        result.Add(word.Trim());
    }

    return result;
}
106 
107 
108 
/// <summary>
/// Reads a stem dictionary. Each line contains:
/// <c>word<b>\t</b>stem</c>
/// (i.e. two tab separated words). The line is split at the first tab only,
/// so everything after it (including further tabs) is the stem. Lines that
/// contain no tab at all, such as blank lines, are skipped. When a word
/// occurs more than once, the last occurrence wins. The file is decoded
/// with <c>System.Text.Encoding.Default</c>.
/// </summary>
/// <param name="wordstemfile">File containing the tab separated word/stem pairs</param>
/// <returns> stem dictionary that overrules the stemming algorithm
/// </returns>
/// <exception cref="System.NullReferenceException">
/// If <paramref name="wordstemfile"/> is null
/// </exception>
/// <exception cref="System.IO.IOException">If the file cannot be opened or read</exception>
public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile)
{
    // NOTE: NullReferenceException (rather than the more idiomatic
    // ArgumentNullException) is kept so existing callers that catch it
    // continue to work.
    if (wordstemfile == null)
        throw new System.NullReferenceException("wordstemfile may not be null");

    var result = new Dictionary<string, string>();
    char[] tab = {'\t'};

    // A single reader is enough; disposing it also closes the file, even
    // when reading fails part-way through.
    using (var reader = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
    {
        System.String line;
        while ((line = reader.ReadLine()) != null)
        {
            System.String[] wordstem = line.Split(tab, 2);

            // Without a tab there is no stem part; indexing wordstem[1]
            // would throw, so such lines (e.g. blank ones) are ignored.
            if (wordstem.Length < 2)
            {
                continue;
            }

            result[wordstem[0]] = wordstem[1];
        }
    }

    return result;
}
145  }
146 }