Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
CJKAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Collections;
26 using Lucene.Net.Analysis;
27 using Version = Lucene.Net.Util.Version;
28 
29 namespace Lucene.Net.Analysis.CJK
30 {
/// <summary>
/// An <see cref="Analyzer"/> that tokenizes text with <see cref="CJKTokenizer"/> and
/// passes the resulting stream through a <see cref="StopFilter"/>.
/// </summary>
/// <remarks>Original author: Che, Dong.</remarks>
public class CJKAnalyzer : Analyzer
{
    //~ Static fields/initializers ---------------------------------------------

    /// <summary>
    /// An array containing some common English words that are not usually
    /// useful for searching, plus a few extra entries (the empty string and "www").
    /// </summary>
    // TODO make this final in 3.1 -
    // this might be revised and merged with StopFilter stop words too
    [Obsolete("use GetDefaultStopSet() instead")] public static String[] STOP_WORDS =
        {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

    //~ Instance fields --------------------------------------------------------

    /// <summary>
    /// Returns an unmodifiable instance of the default stop-words set.
    /// </summary>
    /// <returns>The shared set built from <see cref="STOP_WORDS"/>.</returns>
    public static ISet<string> GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holder for the default stop set; the set is created by this nested
    /// class's static initializer rather than by the analyzer's own.
    /// </summary>
    private static class DefaultSetHolder
    {
        // The explicit IEnumerable<string> cast selects the string-collection
        // constructor of CharArraySet; "false" keeps matching case-sensitive.
        internal static readonly ISet<string> DEFAULT_STOP_SET =
            CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)STOP_WORDS, false));
    }

    /// <summary>
    /// Stop word list applied by the <see cref="StopFilter"/> stage.
    /// </summary>
    private readonly ISet<string> stopTable;

    /// <summary>
    /// Version passed to
    /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> to pick the
    /// position-increment setting of the stop filter.
    /// </summary>
    private readonly Version matchVersion;

    //~ Constructors -----------------------------------------------------------

    /// <summary>
    /// Builds an analyzer which removes the words of the default stop set.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    public CJKAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer which removes the words in the provided set.
    /// The set is copied and wrapped as unmodifiable before it is stored.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopWords">stop word set</param>
    public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
    {
        stopTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopWords));
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// Builds an analyzer which removes words in the provided array.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version</param>
    /// <param name="stopWords">stop word array</param>
    public CJKAnalyzer(Version matchVersion, params string[] stopWords)
    {
        stopTable = StopFilter.MakeStopSet(stopWords);
        this.matchVersion = matchVersion;
    }

    //~ Methods ----------------------------------------------------------------

    /// <summary>
    /// Creates a new token stream for the given input: a
    /// <see cref="CJKTokenizer"/> over <paramref name="reader"/>, filtered with
    /// a <see cref="StopFilter"/> using this analyzer's stop words.
    /// </summary>
    /// <param name="fieldName">lucene field name</param>
    /// <param name="reader">input reader</param>
    /// <returns>Token Stream</returns>
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        // Same chain as the one cached by ReusableTokenStream, built fresh per call.
        return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                              new CJKTokenizer(reader), stopTable);
    }

    /// <summary>
    /// Pair of objects cached between calls to <see cref="ReusableTokenStream"/>:
    /// the tokenizer that is re-pointed at new input, and the end of the filter
    /// chain handed back to the caller.
    /// </summary>
    private sealed class SavedStreams
    {
        internal Tokenizer source;
        internal TokenStream result;
    }

    /// <summary>
    /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text
    /// in the provided reader.
    /// </summary>
    /// <param name="fieldName">lucene field name</param>
    /// <param name="reader">input reader</param>
    /// <returns>
    /// A <see cref="TokenStream"/> built from <see cref="CJKTokenizer"/>, filtered with
    /// <see cref="StopFilter"/>
    /// </returns>
    public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
    {
        /* TokenStream() is sealed, no back compat issue */
        SavedStreams streams = (SavedStreams) PreviousTokenStream;
        if (streams == null)
        {
            // First use: build the tokenizer/filter chain and cache it.
            streams = new SavedStreams();
            streams.source = new CJKTokenizer(reader);
            streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                            streams.source, stopTable);
            PreviousTokenStream = streams;
        }
        else
        {
            // Cached chain exists: only re-point the tokenizer at the new input.
            streams.source.Reset(reader);
        }
        return streams.result;
    }
}
154 }