Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
BrazilianAnalyzer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections;
20 using System.Collections.Generic;
21 using System.Linq;
22 using Lucene.Net.Analysis;
23 using Lucene.Net.Analysis.Standard;
24 using System.IO;
25 using Version = Lucene.Net.Util.Version;
26 
27 /*
28  * Analyzer for the Brazilian Portuguese language. Supports an external list of stopwords
29  * (words that will not be indexed at all) and an external list of exclusions (words that
30  * will be indexed, but not stemmed).
31  *
32  */
namespace Lucene.Net.Analysis.BR
{
    /// <summary>
    /// Analyzer for Brazilian Portuguese text. Supports a caller-supplied set of stopwords
    /// (words that are not indexed at all) and a caller-supplied set of stem exclusions
    /// (words that are indexed but not stemmed).
    /// </summary>
    public sealed class BrazilianAnalyzer : Analyzer
    {
        //TODO: Make this private in 3.1
        /// <summary>
        /// List of typical Brazilian stopwords.
        /// </summary>
        /// <remarks>
        /// NOTE(review): "nas" and "pelas" each appear twice, and "propios" looks like a
        /// misspelling of "proprios". The entries are deliberately left untouched so the
        /// default stop set stays exactly as it was; confirm before correcting them.
        /// </remarks>
        public static string[] BRAZILIAN_STOP_WORDS = {
            "a", "ainda", "alem", "ambas", "ambos", "antes",
            "ao", "aonde", "aos", "apos", "aquele", "aqueles",
            "as", "assim", "com", "como", "contra", "contudo",
            "cuja", "cujas", "cujo", "cujos", "da", "das", "de",
            "dela", "dele", "deles", "demais", "depois", "desde",
            "desta", "deste", "dispoe", "dispoem", "diversa",
            "diversas", "diversos", "do", "dos", "durante", "e",
            "ela", "elas", "ele", "eles", "em", "entao", "entre",
            "essa", "essas", "esse", "esses", "esta", "estas",
            "este", "estes", "ha", "isso", "isto", "logo", "mais",
            "mas", "mediante", "menos", "mesma", "mesmas", "mesmo",
            "mesmos", "na", "nas", "nao", "nas", "nem", "nesse", "neste",
            "nos", "o", "os", "ou", "outra", "outras", "outro", "outros",
            "pelas", "pelas", "pelo", "pelos", "perante", "pois", "por",
            "porque", "portanto", "proprio", "propios", "quais", "qual",
            "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
            "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
            "suas", "tal", "tambem", "teu", "teus", "toda", "todas",
            "todo",
            "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
        };

        /// <summary>
        /// Returns the default stop word set, an unmodifiable set built from
        /// <see cref="BRAZILIAN_STOP_WORDS"/>.
        /// </summary>
        /// <returns>The shared default stop word set.</returns>
        public static ISet<string> GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_STOP_SET;
        }

        /// <summary>
        /// Holds the default stop set in a nested class so it is built separately from
        /// the analyzer's own static state.
        /// </summary>
        private static class DefaultSetHolder
        {
            // The explicit IEnumerable<string> cast is kept from the original code;
            // presumably it selects the intended CharArraySet constructor overload.
            internal static readonly ISet<string> DEFAULT_STOP_SET =
                CharArraySet.UnmodifiableSet(
                    new CharArraySet((IEnumerable<string>)BRAZILIAN_STOP_WORDS, false));
        }

        /// <summary>
        /// Words removed by the stop filter. Assigned exactly once, by the
        /// (Version, ISet) constructor that every other constructor chains to.
        /// </summary>
        private readonly ISet<string> stoptable;

        /// <summary>
        /// Compatibility version handed to the tokenizer and used to pick the stop
        /// filter's position-increment default.
        /// </summary>
        private readonly Version matchVersion;

        /// <summary>
        /// Words that are passed to the stem filter as exclusions. Empty unless set via
        /// the three-argument constructor or one of the SetStemExclusionTable overloads.
        /// </summary>
        private ISet<string> excltable = Support.Compatibility.SetFactory.CreateHashSet<string>();

        /// <summary>
        /// Builds an analyzer with the default stop words (<see cref="BRAZILIAN_STOP_WORDS"/>).
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        public BrazilianAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">A stopword set; it is copied, so later changes to the
        /// caller's set do not affect this analyzer.</param>
        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords)
        {
            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Builds an analyzer with the given stop words and stemming exclusion words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">A stopword set.</param>
        /// <param name="stemExclusionSet">A set of words that must not be stemmed; it is
        /// copied in the same way as <paramref name="stopwords"/>.</param>
        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords,
                                 ISet<string> stemExclusionSet)
            : this(matchVersion, stopwords)
        {
            excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// Deprecated: use <see cref="BrazilianAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">The stop words.</param>
        public BrazilianAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer using the keys of the given dictionary as stop words.
        /// Deprecated: use <see cref="BrazilianAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">Dictionary whose keys are the stop words; the values are ignored.</param>
        public BrazilianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
            : this(matchVersion, stopwords.Keys.ToArray())
        {
        }

        /// <summary>
        /// Builds an analyzer with the stop words contained in the given file.
        /// Deprecated: use <see cref="BrazilianAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">File that is handed to <c>WordlistLoader.GetWordSet</c>.</param>
        public BrazilianAnalyzer(Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>
        /// Builds the stem exclusion list from an array of strings.
        /// Deprecated: use <see cref="BrazilianAnalyzer(Version, ISet{string}, ISet{string})"/> instead.
        /// </summary>
        /// <param name="exclusionlist">Words that must not be stemmed.</param>
        public void SetStemExclusionTable(params string[] exclusionlist)
        {
            excltable = StopFilter.MakeStopSet(exclusionlist);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Builds the stem exclusion list from the keys of a dictionary.
        /// Deprecated: use <see cref="BrazilianAnalyzer(Version, ISet{string}, ISet{string})"/> instead.
        /// </summary>
        /// <param name="exclusionlist">Dictionary whose keys must not be stemmed; the values are ignored.</param>
        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
        {
            excltable = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Builds the stem exclusion list from the words contained in the given file.
        /// Deprecated: use <see cref="BrazilianAnalyzer(Version, ISet{string}, ISet{string})"/> instead.
        /// </summary>
        /// <param name="exclusionlist">File that is handed to <c>WordlistLoader.GetWordSet</c>.</param>
        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            excltable = WordlistLoader.GetWordSet(exclusionlist);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Creates a token stream which tokenizes all the text in the provided reader.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// A stream built from a <see cref="StandardTokenizer"/> filtered with
        /// <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>,
        /// <see cref="StopFilter"/> and <see cref="BrazilianStemFilter"/>.
        /// </returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            return BuildFilterChain(new StandardTokenizer(matchVersion, reader));
        }

        /// <summary>
        /// Tokenizer plus the filter chain wrapped around it, cached between calls to
        /// <see cref="ReusableTokenStream"/>.
        /// </summary>
        private sealed class SavedStreams
        {
            internal Tokenizer source;
            internal TokenStream result;
        }

        /// <summary>
        /// Returns a (possibly reused) token stream which tokenizes all the text in the
        /// provided reader.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// A stream built from a <see cref="StandardTokenizer"/> filtered with
        /// <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>,
        /// <see cref="StopFilter"/> and <see cref="BrazilianStemFilter"/>.
        /// </returns>
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            SavedStreams streams = (SavedStreams) PreviousTokenStream;
            if (streams != null)
            {
                // A chain is already cached: only re-point its tokenizer at the new reader.
                streams.source.Reset(reader);
                return streams.result;
            }

            streams = new SavedStreams();
            streams.source = new StandardTokenizer(matchVersion, reader);
            streams.result = BuildFilterChain(streams.source);
            PreviousTokenStream = streams;
            return streams.result;
        }

        /// <summary>
        /// Wraps <paramref name="source"/> in the analyzer's filter chain: lower-casing,
        /// standard filtering, stop word removal, then Brazilian stemming with the
        /// current exclusion set.
        /// </summary>
        /// <param name="source">Token stream to filter, normally a freshly created tokenizer.</param>
        /// <returns>The outermost filter of the chain.</returns>
        private TokenStream BuildFilterChain(TokenStream source)
        {
            TokenStream result = new LowerCaseFilter(source);
            result = new StandardFilter(result);
            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    result, stoptable);
            return new BrazilianStemFilter(result, excltable);
        }
    }
}