Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
StandardAnalyzer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections;
20 using System.Collections.Generic;
21 using Lucene.Net.Analysis;
22 using Lucene.Net.Util;
23 using Version = Lucene.Net.Util.Version;
24 
namespace Lucene.Net.Analysis.Standard
{

    /// <summary> Filters <see cref="StandardTokenizer" /> with <see cref="StandardFilter" />,
    /// <see cref="LowerCaseFilter" /> and <see cref="StopFilter" />, using a list of English stop
    /// words.
    ///
    /// <a name="version"/>
    /// <p/>
    /// You must specify the required <see cref="Version" /> compatibility when creating
    /// StandardAnalyzer:
    /// <list type="bullet">
    /// <item>As of 2.9, StopFilter preserves position increments</item>
    /// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
    /// </list>
    /// </summary>
    public class StandardAnalyzer : Analyzer
    {
        /// <summary>Default maximum allowed token length </summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

        /// <summary>An unmodifiable set containing some common English words that are usually not
        /// useful for searching.
        /// </summary>
        public static readonly ISet<string> STOP_WORDS_SET;

        /// <summary>Stop words handed to every <see cref="StopFilter" /> this analyzer creates.</summary>
        private readonly ISet<string> stopSet;

        /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
        /// Set to <c>true</c> when the requested version is on or after <c>Version.LUCENE_24</c>.
        /// See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">https://issues.apache.org/jira/browse/LUCENE-1068</a>
        /// </summary>
        private readonly bool replaceInvalidAcronym;

        /// <summary>Position-increment flag passed to <see cref="StopFilter" />; taken from
        /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> for the requested version.
        /// </summary>
        private readonly bool enableStopPositionIncrements;

        /// <summary>Compatibility version supplied at construction; forwarded to each
        /// <see cref="StandardTokenizer" /> that is created.
        /// </summary>
        private readonly Version matchVersion;

        /// <summary>Backing field for <see cref="MaxTokenLength" />.</summary>
        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        static StandardAnalyzer()
        {
            STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        }

        /// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET" />).
        /// </summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary</param>
        public StandardAnalyzer(Version matchVersion)
            : this(matchVersion, STOP_WORDS_SET)
        {
        }

        /// <summary>Builds an analyzer with the given stop words.</summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary</param>
        /// <param name="stopWords">stop words</param>
        public StandardAnalyzer(Version matchVersion, ISet<string> stopWords)
        {
            stopSet = stopWords;
            SetOverridesTokenStreamMethod<StandardAnalyzer>();
            enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
            this.matchVersion = matchVersion;
        }

        /// <summary>Builds an analyzer with the stop words from the given file.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)" />
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary</param>
        /// <param name="stopwords">File to read stop words from</param>
        public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)" />
        /// <param name="matchVersion">Lucene version to match; see the version notes in the class summary</param>
        /// <param name="stopwords">Reader to read stop words from</param>
        public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary> Set maximum allowed token length. If a token is seen
        /// that exceeds this length then it is discarded. This
        /// setting only takes effect the next time <c>TokenStream</c> or
        /// <c>ReusableTokenStream</c> is called.
        /// </summary>
        public virtual int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        /// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />,
        /// a <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used by this implementation)</param>
        /// <param name="reader">Source of the text to tokenize</param>
        /// <returns>A newly built filter chain reading from <paramref name="reader" /></returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            tokenizer.MaxTokenLength = maxTokenLength;
            return BuildFilterChain(tokenizer);
        }

        /// <summary>Returns a filter chain equivalent to the one built by <c>TokenStream</c>, but
        /// keeps the chain in the inherited <c>PreviousTokenStream</c> slot and, on later calls,
        /// resets the saved tokenizer onto the new reader instead of rebuilding the chain.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (only used on the fallback path)</param>
        /// <param name="reader">Source of the text to tokenize</param>
        /// <returns>The saved (or newly built) filter chain, now reading from <paramref name="reader" /></returns>
        public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return TokenStream(fieldName, reader);
            }

            SavedStreams streams = (SavedStreams) PreviousTokenStream;
            if (streams == null)
            {
                // Build the holder completely before publishing it: if a constructor
                // below throws, no half-initialized SavedStreams (null tokenStream)
                // is left behind for the next call to trip over.
                SavedStreams created = new SavedStreams();
                created.tokenStream = new StandardTokenizer(matchVersion, reader);
                created.filteredTokenStream = BuildFilterChain(created.tokenStream);
                PreviousTokenStream = created;
                streams = created;
            }
            else
            {
                streams.tokenStream.Reset(reader);
            }

            // Re-applied on every call so that changes to MaxTokenLength reach
            // an already-saved tokenizer.
            streams.tokenStream.MaxTokenLength = maxTokenLength;
            streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);

            return streams.filteredTokenStream;
        }

        /// <summary>Wraps <paramref name="tokenizer" /> in the analyzer's filter chain:
        /// <see cref="StandardFilter" />, then <see cref="LowerCaseFilter" />, then
        /// <see cref="StopFilter" /> using this analyzer's stop set.
        /// </summary>
        private TokenStream BuildFilterChain(StandardTokenizer tokenizer)
        {
            TokenStream chain = new StandardFilter(tokenizer);
            chain = new LowerCaseFilter(chain);
            return new StopFilter(enableStopPositionIncrements, chain, stopSet);
        }

        /// <summary>Holder for the tokenizer and the end of its filter chain that
        /// <c>ReusableTokenStream</c> saves between calls.
        /// </summary>
        private sealed class SavedStreams
        {
            internal StandardTokenizer tokenStream;
            internal TokenStream filteredTokenStream;
        }
    }
}