Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
PersianAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Linq;
26 using Lucene.Net.Analysis.AR;
27 using Version = Lucene.Net.Util.Version;
28 
29 namespace Lucene.Net.Analysis.Fa
30 {
/// <summary>
/// <see cref="Analyzer"/> for Persian.
/// <para>
/// This analyzer uses <see cref="ArabicLetterTokenizer"/>, which implies tokenizing around
/// zero-width non-joiner in addition to whitespace. Some Persian-specific variant forms
/// (such as farsi yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
/// </para>
/// </summary>
public sealed class PersianAnalyzer : Analyzer
{
    /// <summary>
    /// File containing default Persian stopwords.
    /// </summary>
    /// <remarks>
    /// Default stopword list is from
    /// http://members.unine.ch/jacques.savoy/clef/index.html. The stopword list is
    /// BSD-licensed.
    /// </remarks>
    public static readonly string DEFAULT_STOPWORD_FILE = "stopwords.txt";

    /// <summary>
    /// The comment character in the stopwords file. All lines prefixed with this
    /// will be ignored.
    /// </summary>
    public static readonly string STOPWORDS_COMMENT = "#";

    /// <summary>
    /// Contains the stopwords used with the <see cref="StopFilter"/>.
    /// </summary>
    private readonly ISet<string> stoptable;

    /// <summary>
    /// Compatibility version handed to
    /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> when the filter chain is built.
    /// </summary>
    private readonly Version matchVersion;

    /// <summary>
    /// Returns an unmodifiable instance of the default stop-words set.
    /// </summary>
    /// <returns>An unmodifiable instance of the default stop-words set.</returns>
    public static ISet<string> getDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holds the default stop set. The set is loaded by this holder's static constructor,
    /// i.e. lazily, the first time the outer class touches <see cref="DEFAULT_STOP_SET"/>.
    /// </summary>
    private static class DefaultSetHolder
    {
        internal static readonly ISet<string> DEFAULT_STOP_SET;

        static DefaultSetHolder()
        {
            try
            {
                DEFAULT_STOP_SET = LoadDefaultStopWordSet();
            }
            catch (IOException ex)
            {
                // The default set should always be present, as it ships inside the assembly
                // as an embedded resource. Keep the original failure as the inner exception
                // so the root cause is not lost.
                throw new InvalidOperationException("Unable to load default stopword set", ex);
            }
        }

        /// <summary>
        /// Reads the embedded stopword resource from the assembly that declares
        /// <see cref="PersianAnalyzer"/> and returns it as an unmodifiable set.
        /// </summary>
        /// <returns>The unmodifiable default stopword set.</returns>
        /// <exception cref="FileNotFoundException">
        /// The embedded resource could not be found in the assembly.
        /// </exception>
        static ISet<string> LoadDefaultStopWordSet()
        {
            string resourceName = "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE;
            Stream stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream(resourceName);

            // GetManifestResourceStream yields null for a missing resource; report that as an
            // IOException subtype so the static constructor's handler can deal with it.
            if (stream == null)
            {
                throw new FileNotFoundException("Embedded stopword resource not found: " + resourceName, resourceName);
            }

            using (stream)
            using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8))
            {
                // make sure it is unmodifiable as we expose it in the outer class
                return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
            }
        }
    }

    /// <summary>
    /// Builds an analyzer with the default stop words:
    /// <see cref="DEFAULT_STOPWORD_FILE"/>.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    public PersianAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words. The set is copied and the copy is
    /// wrapped as unmodifiable before it is stored.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">A stopword set.</param>
    public PersianAnalyzer(Version matchVersion, ISet<string> stopwords)
    {
        stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <remarks>
    /// Deprecated: use <see cref="PersianAnalyzer(Version, ISet{string})"/> instead.
    /// </remarks>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">The stop words.</param>
    public PersianAnalyzer(Version matchVersion, params string[] stopwords)
        : this(matchVersion, StopFilter.MakeStopSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words; only the dictionary keys are used.
    /// </summary>
    /// <remarks>
    /// Deprecated: use <see cref="PersianAnalyzer(Version, ISet{string})"/> instead.
    /// </remarks>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">Dictionary whose keys are the stop words.</param>
    public PersianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
        : this(matchVersion, stopwords.Keys.ToArray())
    {
    }

    /// <summary>
    /// Builds an analyzer with the stop words read from the given file. Lines can be
    /// commented out using <see cref="STOPWORDS_COMMENT"/>.
    /// </summary>
    /// <remarks>
    /// Deprecated: use <see cref="PersianAnalyzer(Version, ISet{string})"/> instead.
    /// </remarks>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">File containing the stop words.</param>
    public PersianAnalyzer(Version matchVersion, FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
    {
    }

    /// <summary>
    /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the provided
    /// <see cref="TextReader"/>.
    /// </summary>
    /// <param name="fieldName">Field name; not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A <see cref="TokenStream"/> built from an <see cref="ArabicLetterTokenizer"/>
    /// filtered with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
    /// <see cref="PersianNormalizationFilter"/> and Persian stop words.
    /// </returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return BuildFilterChain(new ArabicLetterTokenizer(reader));
    }

    /// <summary>
    /// Pairs a tokenizer with the filter chain built on top of it, so that both can be
    /// stored in <c>PreviousTokenStream</c> and reused.
    /// </summary>
    private sealed class SavedStreams
    {
        internal Tokenizer source;
        internal TokenStream result;
    }

    /// <summary>
    /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text
    /// in the provided <see cref="TextReader"/>.
    /// </summary>
    /// <param name="fieldName">Field name; not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A <see cref="TokenStream"/> built from an <see cref="ArabicLetterTokenizer"/>
    /// filtered with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
    /// <see cref="PersianNormalizationFilter"/> and Persian stop words.
    /// </returns>
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        SavedStreams streams = (SavedStreams)PreviousTokenStream;
        if (streams != null)
        {
            // A chain was saved earlier: only re-point its tokenizer at the new reader.
            streams.source.Reset(reader);
            return streams.result;
        }

        streams = new SavedStreams();
        streams.source = new ArabicLetterTokenizer(reader);
        streams.result = BuildFilterChain(streams.source);
        PreviousTokenStream = streams;
        return streams.result;
    }

    /// <summary>
    /// Wraps <paramref name="source"/> in the analyzer's filter chain. Shared by
    /// <see cref="TokenStream"/> and <see cref="ReusableTokenStream"/> so both always
    /// produce the same chain.
    /// </summary>
    /// <param name="source">The tokenizer output to filter.</param>
    /// <returns>The outermost filter of the chain.</returns>
    private TokenStream BuildFilterChain(TokenStream source)
    {
        TokenStream result = new LowerCaseFilter(source);
        result = new ArabicNormalizationFilter(result);
        /* additional persian-specific normalization */
        result = new PersianNormalizationFilter(result);
        /*
         * the order here is important: the stopword list is normalized with the
         * above!
         */
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                result, stoptable);
        return result;
    }
}
234 }