Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
ArabicAnalyzer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.IO;
21 using System.Collections;
22 using System.Linq;
23 using Lucene.Net.Analysis;
24 using Version = Lucene.Net.Util.Version;
25 using Lucene.Net.Support.Compatibility;
26 
27 namespace Lucene.Net.Analysis.AR
28 {
/// <summary>
/// <see cref="Analyzer"/> for Arabic.
/// <para>
/// This analyzer implements light-stemming as specified by:
/// <i>Light Stemming for Arabic Information Retrieval</i>
/// http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
/// </para>
/// <para>
/// The analysis package contains three primary components:
/// <list type="bullet">
/// <item><description><see cref="ArabicNormalizationFilter"/>: Arabic orthographic normalization.</description></item>
/// <item><description><see cref="ArabicStemFilter"/>: Arabic light stemming.</description></item>
/// <item><description>Arabic stop words file: a set of default Arabic stop words.</description></item>
/// </list>
/// </para>
/// </summary>
public class ArabicAnalyzer : Analyzer
{
    /// <summary>
    /// Name of the file containing the default Arabic stopwords. It is appended to
    /// the "Lucene.Net.Analysis.AR." prefix to form the embedded resource name.
    /// <para>
    /// Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
    /// The stopword list is BSD-Licensed.
    /// </para>
    /// </summary>
    public static string DEFAULT_STOPWORD_FILE = "ArabicStopWords.txt";

    /// <summary>
    /// Contains the stopwords used with the <see cref="StopFilter"/>.
    /// </summary>
    private readonly ISet<string> stoptable;

    /// <summary>
    /// The comment character in the stopwords file. All lines prefixed with this will be ignored.
    /// </summary>
    [Obsolete("Use WordListLoader.GetWordSet(FileInfo, string) directly")]
    public static string STOPWORDS_COMMENT = "#";

    /// <summary>
    /// Returns an unmodifiable instance of the default stop-words set.
    /// </summary>
    /// <returns>An unmodifiable instance of the default stop-words set.</returns>
    public static ISet<string> GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holds the default stop set in a nested type so that it is loaded by this
    /// type's static constructor rather than by <see cref="ArabicAnalyzer"/>'s.
    /// </summary>
    private static class DefaultSetHolder
    {
        internal static readonly ISet<string> DEFAULT_STOP_SET;

        static DefaultSetHolder()
        {
            try
            {
                DEFAULT_STOP_SET = LoadDefaultStopWordSet();
            }
            catch (System.IO.IOException ex)
            {
                // The default set should always be present because it is embedded in
                // the assembly. Keep the original exception as the inner exception so
                // the real cause is not lost.
                throw new Exception("Unable to load default stopword set", ex);
            }
        }

        /// <summary>
        /// Reads the default stopword list from the resource embedded in the assembly
        /// that contains <see cref="ArabicAnalyzer"/>.
        /// </summary>
        /// <returns>An unmodifiable copy of the loaded word set.</returns>
        /// <exception cref="FileNotFoundException">The embedded resource is missing.</exception>
        internal static ISet<string> LoadDefaultStopWordSet()
        {
            string resourceName = "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE;
            Stream stream = System.Reflection.Assembly.GetAssembly(typeof(ArabicAnalyzer)).GetManifestResourceStream(resourceName);
            if (stream == null)
            {
                // GetManifestResourceStream returns null for an unknown resource; raise an
                // IOException subtype so the static constructor reports it consistently.
                throw new FileNotFoundException("Embedded stopword resource not found: " + resourceName, resourceName);
            }

            // Disposing the reader also disposes the underlying resource stream.
            using (StreamReader reader = new StreamReader(stream))
            {
                return CharArraySet.UnmodifiableSet(CharArraySet.Copy(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT)));
            }
        }
    }

    /// <summary>
    /// Compatibility version passed to
    /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> when building the filter chain.
    /// </summary>
    private readonly Version matchVersion;

    /// <summary>
    /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    public ArabicAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">a stopword set</param>
    public ArabicAnalyzer(Version matchVersion, ISet<string> stopwords)
    {
        // Keep an unmodifiable copy of the supplied set.
        stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">the stop words</param>
    [Obsolete("Use ArabicAnalyzer(Version, Set) instead")]
    public ArabicAnalyzer(Version matchVersion, params string[] stopwords)
        : this(matchVersion, StopFilter.MakeStopSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words. Only the keys of the
    /// dictionary are used.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">dictionary whose keys are the stop words</param>
    [Obsolete("Use ArabicAnalyzer(Version, Set) instead")]
    public ArabicAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
        : this(matchVersion, stopwords.Keys.ToArray())
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words. Lines can be commented out
    /// using <see cref="STOPWORDS_COMMENT"/>.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopwords">file to read the stop words from</param>
    public ArabicAnalyzer(Version matchVersion, FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
    {
    }

    /// <summary>
    /// Wraps <paramref name="source"/> in the analyzer's filter chain:
    /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
    /// <see cref="ArabicNormalizationFilter"/> and <see cref="ArabicStemFilter"/>.
    /// </summary>
    /// <param name="source">the token stream to filter</param>
    /// <returns>The outermost filter of the chain.</returns>
    private TokenStream CreateFilterChain(TokenStream source)
    {
        TokenStream result = new LowerCaseFilter(source);
        // the order here is important: the stopword list is not normalized!
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
        result = new ArabicNormalizationFilter(result);
        result = new ArabicStemFilter(result);
        return result;
    }

    /// <summary>
    /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the provided <see cref="TextReader"/>.
    /// </summary>
    /// <param name="fieldName">name of the field being analyzed (not used by this analyzer)</param>
    /// <param name="reader">source of the text to tokenize</param>
    /// <returns>A <see cref="TokenStream"/> built from an <see cref="ArabicLetterTokenizer"/> filtered with
    /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="ArabicNormalizationFilter"/>
    /// and <see cref="ArabicStemFilter"/>.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return CreateFilterChain(new ArabicLetterTokenizer(reader));
    }

    /// <summary>
    /// Tokenizer/filter-chain pair stored in <c>PreviousTokenStream</c> for reuse.
    /// </summary>
    private class SavedStreams
    {
        internal Tokenizer Source;
        internal TokenStream Result;
    }

    /// <summary>
    /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text
    /// in the provided <see cref="TextReader"/>.
    /// </summary>
    /// <param name="fieldName">name of the field being analyzed (not used by this analyzer)</param>
    /// <param name="reader">source of the text to tokenize</param>
    /// <returns>A <see cref="TokenStream"/> built from an <see cref="ArabicLetterTokenizer"/> filtered with
    /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="ArabicNormalizationFilter"/>
    /// and <see cref="ArabicStemFilter"/>.</returns>
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        SavedStreams streams = (SavedStreams)PreviousTokenStream;
        if (streams == null)
        {
            // First use: build the chain once and store it for later calls.
            streams = new SavedStreams();
            streams.Source = new ArabicLetterTokenizer(reader);
            streams.Result = CreateFilterChain(streams.Source);
            PreviousTokenStream = streams;
        }
        else
        {
            // Reuse the stored chain; only the tokenizer is pointed at the new reader.
            streams.Source.Reset(reader);
        }
        return streams.Result;
    }
}
207 }