Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
CzechAnalyzer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.IO;
21 using System.Text;
22 using System.Collections;
23 
24 using Lucene.Net.Analysis;
25 using Lucene.Net.Analysis.De;
26 using Lucene.Net.Analysis.Standard;
27 using Version = Lucene.Net.Util.Version;
28 
29 namespace Lucene.Net.Analysis.Cz
30 {
/// <summary>
/// <see cref="Analyzer"/> for the Czech language.
/// <para>
/// Supports an external list of stopwords (words that will not be indexed at all).
/// A default set of stopwords is used unless an alternative list is specified.
/// </para>
/// <para>
/// <b>NOTE</b>: This class uses the same <see cref="Version"/>
/// dependent settings as <see cref="StandardAnalyzer"/>.
/// </para>
/// </summary>
public sealed class CzechAnalyzer : Analyzer
{
    /// <summary>
    /// List of typical stopwords.
    /// </summary>
    /// <remarks>Deprecated: use <see cref="getDefaultStopSet"/> instead.</remarks>
    // TODO make this private in 3.1
    public static readonly string[] CZECH_STOP_WORDS = {
        "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
        "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
        "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
        "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
        "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
        "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
        "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
        "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
        "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
        "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
        "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
        "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
        "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
        "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
        "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
        "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
        "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
        "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
    };

    /// <summary>
    /// Returns the set of default Czech stopwords.
    /// </summary>
    /// <returns>The shared set built from <see cref="CZECH_STOP_WORDS"/>.</returns>
    public static ISet<string> getDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_SET;
    }

    /// <summary>
    /// Holder for the default stop set, so the set is only built when this
    /// nested class is first accessed.
    /// </summary>
    private static class DefaultSetHolder
    {
        // Case-sensitive (ignoreCase == false) set, wrapped so it cannot be modified.
        internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(
            new CharArraySet((IEnumerable<string>)CZECH_STOP_WORDS, false));
    }

    /// <summary>
    /// Contains the stopwords used with the <see cref="StopFilter"/>.
    /// </summary>
    // TODO make this final in 3.1 (LoadStopWords still reassigns it)
    private ISet<string> stoptable;

    /// <summary>
    /// Compatibility version handed to the tokenizer and used to choose the
    /// stop filter's position-increment default.
    /// </summary>
    private readonly Version matchVersion;

    /// <summary>
    /// Builds an analyzer with the default stop words (<see cref="CZECH_STOP_WORDS"/>).
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    public CzechAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">A stopword set; it is copied and the copy is made unmodifiable.</param>
    public CzechAnalyzer(Version matchVersion, ISet<string> stopwords)
    {
        this.matchVersion = matchVersion;
        this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <remarks>Deprecated: use <c>CzechAnalyzer(Version, ISet&lt;string&gt;)</c> instead.</remarks>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">The stop words.</param>
    public CzechAnalyzer(Version matchVersion, params string[] stopwords)
        : this(matchVersion, StopFilter.MakeStopSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <remarks>Deprecated: use <c>CzechAnalyzer(Version, ISet&lt;string&gt;)</c> instead.</remarks>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">The stop words.</param>
    public CzechAnalyzer(Version matchVersion, HashSet<string> stopwords)
        // The cast selects the ISet<string> overload instead of recursing into this one.
        : this(matchVersion, (ISet<string>)stopwords)
    {
    }

    /// <summary>
    /// Builds an analyzer with the stop words read from the given file.
    /// </summary>
    /// <remarks>Deprecated: use <c>CzechAnalyzer(Version, ISet&lt;string&gt;)</c> instead.</remarks>
    /// <param name="matchVersion">Lucene compatibility version.</param>
    /// <param name="stopwords">File containing the word list.</param>
    public CzechAnalyzer(Version matchVersion, FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
    {
    }

    /// <summary>
    /// Loads the stopwords from a resource stream (file, database...).
    /// A <c>null</c> stream, or an <see cref="IOException"/> while reading,
    /// leaves the analyzer with an empty stopword set.
    /// </summary>
    /// <remarks>
    /// Deprecated: use <c>WordlistLoader.GetWordSet</c> and
    /// <c>CzechAnalyzer(Version, ISet&lt;string&gt;)</c> instead.
    /// </remarks>
    /// <param name="wordfile">Stream containing the word list.</param>
    /// <param name="encoding">
    /// Encoding used (win-1250, iso-8859-2, ...); <c>null</c> lets
    /// <see cref="StreamReader"/> use its default encoding.
    /// </param>
    public void LoadStopWords(Stream wordfile, System.Text.Encoding encoding)
    {
        PreviousTokenStream = null; // force a new stopfilter to be created

        // Clear any previous table first, so every exit path below
        // (null stream, read failure) leaves no stale stop words behind.
        stoptable = Support.Compatibility.SetFactory.CreateHashSet<string>();
        if (wordfile == null)
        {
            return;
        }

        try
        {
            // The reader is not disposed by this method: disposing a StreamReader
            // also closes the underlying stream, which belongs to the caller.
            StreamReader isr = encoding == null
                ? new StreamReader(wordfile)
                : new StreamReader(wordfile, encoding);

            stoptable = WordlistLoader.GetWordSet(isr);
        }
        catch (IOException)
        {
            // Best effort: fall back to an empty table.
            // TODO: throw IOException
            stoptable = Support.Compatibility.SetFactory.CreateHashSet<string>();
        }
    }

    /// <summary>
    /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
    /// provided <see cref="TextReader"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/> filtered with
    /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, and <see cref="StopFilter"/>.
    /// </returns>
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        TokenStream result = new StandardTokenizer(matchVersion, reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        // Same stop-filter setup as ReusableTokenStream, so both entry points
        // produce an identically configured chain.
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                result, stoptable);
        return result;
    }

    /// <summary>
    /// Tokenizer/filter-chain pair cached in <c>PreviousTokenStream</c> for reuse.
    /// </summary>
    private sealed class SavedStreams
    {
        internal Tokenizer source;
        internal TokenStream result;
    }

    /// <summary>
    /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in
    /// the provided <see cref="TextReader"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/> filtered with
    /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, and <see cref="StopFilter"/>.
    /// </returns>
    public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
    {
        SavedStreams streams = (SavedStreams)PreviousTokenStream;
        if (streams == null)
        {
            // First use (or the cache was cleared by LoadStopWords): build the chain and cache it.
            streams = new SavedStreams();
            streams.source = new StandardTokenizer(matchVersion, reader);
            streams.result = new StandardFilter(streams.source);
            streams.result = new LowerCaseFilter(streams.result);
            streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                            streams.result, stoptable);
            PreviousTokenStream = streams;
        }
        else
        {
            // Reuse the cached chain; only the tokenizer needs to be pointed at the new reader.
            streams.source.Reset(reader);
        }
        return streams.result;
    }
}
220 
221 
222 }