Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
PatternAnalyzer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Text.RegularExpressions;
26 using Lucene.Net.Analysis.Tokenattributes;
27 using Version = Lucene.Net.Util.Version;
28 
29 namespace Lucene.Net.Analysis.Miscellaneous
30 {
31  /*
32  * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
33  * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Regex}
34  * (with behaviour identical to {@link String#split(String)}),
35  * and that combines the functionality of
36  * {@link org.apache.lucene.analysis.LetterTokenizer},
37  * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
38  * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
39  * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
40  * multi-purpose class.
41  * <p>
42  * If you are unsure what exactly a regular expression should look like, consider
43  * prototyping by simply trying various expressions on some test texts via
44  * {@link String#split(String)}. Once you are satisfied, give that regex to
45  * PatternAnalyzer. Also see <a target="_blank"
46  * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
47  * <p>
48  * This class can be considerably faster than the "normal" Lucene tokenizers.
49  * It can also serve as a building block in a compound Lucene
50  * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
51  * stemming example:
52  * <pre>
53  * PatternAnalyzer pat = ...
54  * TokenStream tokenStream = new SnowballFilter(
55  * pat.TokenStream("content", "James is running round in the woods"),
56  * "English");
57  * </pre>
58  *
59  */
60  public class PatternAnalyzer : Analyzer
61  {
62 
/// <summary>
/// The pattern <c>"\\W+"</c>, compiled. An analyzer constructed with an
/// equal pattern is tokenized by <c>FastStringTokenizer</c>, which treats
/// <c>char.IsLetter(c)</c> characters as token characters.
/// </summary>
public static readonly Regex NON_WORD_PATTERN =
    new Regex("\\W+", RegexOptions.Compiled);

/// <summary>
/// The pattern <c>"\\s+"</c>, compiled. An analyzer constructed with an
/// equal pattern is tokenized by <c>FastStringTokenizer</c>, which treats
/// every character for which <c>char.IsWhiteSpace(c)</c> is false as a
/// token character.
/// </summary>
public static readonly Regex WHITESPACE_PATTERN =
    new Regex("\\s+", RegexOptions.Compiled);
68 
/// <summary>
/// Extended English stop word list used by <c>EXTENDED_ANALYZER</c>,
/// wrapped so that it cannot be modified.
/// </summary>
/// <remarks>
/// Every entry must be a single bare word: a tokenizer never emits a term
/// containing a separator character, so an entry with an embedded or
/// trailing space can never match. Two such entries were repaired here:
/// <c>"once one"</c> is now <c>"once", "one"</c>, and <c>"xother "</c> is
/// now <c>"xother"</c>.
/// NOTE(review): the trailing <c>true</c> is presumably the set's
/// ignore-case flag - confirm against CharArraySet.
/// </remarks>
private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
    CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)new[]{
        "a", "about", "above", "across", "adj", "after", "afterwards",
        "again", "against", "albeit", "all", "almost", "alone", "along",
        "already", "also", "although", "always", "among", "amongst", "an",
        "and", "another", "any", "anyhow", "anyone", "anything",
        "anywhere", "are", "around", "as", "at", "be", "became", "because",
        "become", "becomes", "becoming", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between",
        "beyond", "both", "but", "by", "can", "cannot", "co", "could",
        "down", "during", "each", "eg", "either", "else", "elsewhere",
        "enough", "etc", "even", "ever", "every", "everyone", "everything",
        "everywhere", "except", "few", "first", "for", "former",
        "formerly", "from", "further", "had", "has", "have", "he", "hence",
        "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
        "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
        "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
        "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
        "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
        "must", "my", "myself", "namely", "neither", "never",
        "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
        "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
        "once", "one", "only", "onto", "or", "other", "others", "otherwise",
        "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
        "rather", "s", "same", "seem", "seemed", "seeming", "seems",
        "several", "she", "should", "since", "so", "some", "somehow",
        "someone", "something", "sometime", "sometimes", "somewhere",
        "still", "such", "t", "than", "that", "the", "their", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefor", "therein", "thereupon", "these", "they", "this",
        "those", "though", "through", "throughout", "thru", "thus", "to",
        "together", "too", "toward", "towards", "under", "until", "up",
        "upon", "us", "very", "via", "was", "we", "well", "were", "what",
        "whatever", "whatsoever", "when", "whence", "whenever",
        "whensoever", "where", "whereafter", "whereas", "whereat",
        "whereby", "wherefrom", "wherein", "whereinto", "whereof",
        "whereon", "whereto", "whereunto", "whereupon", "wherever",
        "wherewith", "whether", "which", "whichever", "whichsoever",
        "while", "whilst", "whither", "who", "whoever", "whole", "whom",
        "whomever", "whomsoever", "whose", "whosoever", "why", "will",
        "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
        "xother", "xnote", "yet", "you", "your", "yours", "yourself",
        "yourselves"
    }, true));
113 
/// <summary>
/// Shared analyzer built from <c>NON_WORD_PATTERN</c> with lower-casing
/// enabled and <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c> as its stop set.
/// The original notes describe it as safe to share across threads.
/// </summary>
public static readonly PatternAnalyzer DEFAULT_ANALYZER =
    new PatternAnalyzer(
        Version.LUCENE_CURRENT,
        NON_WORD_PATTERN,
        true,
        StopAnalyzer.ENGLISH_STOP_WORDS_SET);

/// <summary>
/// Shared analyzer built from <c>NON_WORD_PATTERN</c> with lower-casing
/// enabled and the extended English stop set. The original notes describe
/// it as safe to share across threads and credit the word list to
/// http://thomas.loc.gov/home/stopwords.html (see also
/// http://thomas.loc.gov/home/all.about.inquery.html).
/// </summary>
public static readonly PatternAnalyzer EXTENDED_ANALYZER =
    new PatternAnalyzer(
        Version.LUCENE_CURRENT,
        NON_WORD_PATTERN,
        true,
        EXTENDED_ENGLISH_STOP_WORDS);
130 
// Version handed to StopFilter.GetEnablePositionIncrementsVersionDefault
// when a StopFilter is created.
private readonly Version matchVersion;

// Delimiter pattern; the constructor replaces it with NON_WORD_PATTERN or
// WHITESPACE_PATTERN when it is equal to one of them.
private readonly Regex Regex;

// Whether emitted terms are lower-cased.
private readonly bool toLowerCase;

// Terms to drop; null when no filtering is wanted (an empty set passed to
// the constructor is stored as null).
private readonly ISet<string> stopWords;
136 
/// <summary>
/// Constructs a new instance with the given parameters.
/// </summary>
/// <param name="matchVersion">
/// Version forwarded to
/// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> whenever a
/// <c>StopFilter</c> is created for this analyzer.
/// </param>
/// <param name="Regex">A regular expression delimiting tokens; must not be null.</param>
/// <param name="toLowerCase">If <c>true</c>, terms are lower-cased before being emitted.</param>
/// <param name="stopWords">
/// If non-null and non-empty, terms contained in this set are dropped
/// (after lower-casing, when enabled). A null or empty set disables
/// stop word removal.
/// </param>
/// <exception cref="ArgumentException">If <c>Regex</c> is null.</exception>
public PatternAnalyzer(Version matchVersion, Regex Regex, bool toLowerCase, ISet<string> stopWords)
{
    if (Regex == null)
    {
        throw new ArgumentException("Regex must not be null");
    }

    this.matchVersion = matchVersion;
    this.toLowerCase = toLowerCase;

    // Swap an equal pattern for the shared instance so that TokenStream
    // can pick its fast path with a plain reference comparison.
    this.Regex = EqRegex(NON_WORD_PATTERN, Regex)
        ? NON_WORD_PATTERN
        : (EqRegex(WHITESPACE_PATTERN, Regex) ? WHITESPACE_PATTERN : Regex);

    // An empty stop set filters nothing, so it is stored as null.
    this.stopWords = (stopWords == null || stopWords.Count == 0) ? null : stopWords;
}
171 
/// <summary>
/// Creates a token stream that splits the given string into terms.
/// </summary>
/// <param name="fieldName">The name of the field to tokenize (not used by this method).</param>
/// <param name="text">The string to tokenize; must not be null.</param>
/// <returns>A new token stream over <paramref name="text"/>.</returns>
/// <exception cref="ArgumentException">If <paramref name="text"/> is null.</exception>
public TokenStream TokenStream(String fieldName, String text)
{
    if (text == null)
    {
        throw new ArgumentException("text must not be null");
    }

    // Fast paths: the constructor canonicalized the two well-known
    // patterns, so a reference comparison is enough here. The fast
    // tokenizer applies the stop set itself.
    if (Regex == NON_WORD_PATTERN)
    {
        return new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    if (Regex == WHITESPACE_PATTERN)
    {
        return new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }

    // General path: regex tokenizer, wrapped in a StopFilter when a stop
    // set is configured.
    TokenStream tokens = new RegexTokenizer(text, Regex, toLowerCase);
    if (stopWords == null)
    {
        return tokens;
    }
    return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), tokens, stopWords);
}
206 
/// <summary>
/// Creates a token stream over all the text delivered by the given reader.
/// A <c>FastStringReader</c> hands over its string directly; any other
/// reader is read to its end (and disposed by <c>ToString(TextReader)</c>)
/// before tokenizing, which is less efficient than calling
/// <c>TokenStream(String, String)</c> directly.
/// </summary>
/// <param name="fieldName">The name of the field to tokenize (passed through unchanged).</param>
/// <param name="reader">The reader delivering the text.</param>
/// <returns>A new token stream.</returns>
/// <exception cref="Exception">
/// Wraps any <c>IOException</c> raised while reading or tokenizing.
/// </exception>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    FastStringReader direct = reader as FastStringReader;
    if (direct != null)
    {
        // fast path: no copying needed
        return TokenStream(fieldName, direct.GetString());
    }

    try
    {
        return TokenStream(fieldName, ToString(reader));
    }
    catch (IOException e)
    {
        throw new Exception("Wrapped Exception", e);
    }
}
235 
/// <summary>
/// Indicates whether some other object is "equal to" this one. Two
/// analyzers are equal when their lower-casing flag, delimiter pattern
/// (pattern text and options) and stop set all match.
/// </summary>
/// <param name="other">The reference object with which to compare.</param>
/// <returns><c>true</c> if equal, <c>false</c> otherwise.</returns>
public override bool Equals(Object other)
{
    if (Object.ReferenceEquals(this, other))
    {
        return true;
    }

    // Shortcut: the two shared instances are never equal to each other.
    bool defaultVsExtended = this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER;
    bool extendedVsDefault = other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER;
    if (defaultVsExtended || extendedVsDefault)
    {
        return false;
    }

    PatternAnalyzer that = other as PatternAnalyzer;
    if (that == null)
    {
        return false;
    }

    if (toLowerCase != that.toLowerCase)
    {
        return false;
    }
    return EqRegex(Regex, that.Regex) && Eq(stopWords, that.stopWords);
}
259 
/// <summary>
/// Returns a hash code value for the object, computed from the same
/// state that <c>Equals</c> compares: the pattern text, the pattern
/// options, the lower-casing flag and the stop set.
/// </summary>
/// <remarks>
/// The pattern contributes the hash of its text rather than
/// <c>Regex.GetHashCode()</c>: <c>Equals</c> treats two distinct
/// <c>Regex</c> objects with the same text and options as equal, so a
/// reference-based hash would give equal analyzers different hash codes.
/// For the same reason the shared instances no longer return hard-coded
/// constants; an analyzer that is equal to a shared instance must hash
/// the same way.
/// NOTE(review): consistency for the stop set depends on that set's own
/// Equals/GetHashCode pair - confirm against the set implementation.
/// </remarks>
/// <returns>The hash code.</returns>
public override int GetHashCode()
{
    // Wrap-around is intended; stay correct in a checked build.
    unchecked
    {
        int h = 1;
        h = 31 * h + Regex.ToString().GetHashCode();
        h = 31 * h + (int)Regex.Options;
        h = 31 * h + (toLowerCase ? 1231 : 1237);
        h = 31 * h + (stopWords != null ? stopWords.GetHashCode() : 0);
        return h;
    }
}
277 
/// <summary>
/// Null-tolerant equality: true when both arguments are the same
/// reference (including both null), otherwise the result of
/// <c>o1.Equals(o2)</c> when <paramref name="o1"/> is non-null.
/// </summary>
private static bool Eq(Object o1, Object o2)
{
    if (Object.ReferenceEquals(o1, o2))
    {
        return true;
    }
    if (o1 == null)
    {
        return false;
    }
    return o1.Equals(o2);
}
283 
/// <summary>
/// Compares two patterns: equal when they are the same object, or when
/// both their options and their pattern text match. Neither argument
/// may be null.
/// </summary>
private static bool EqRegex(Regex p1, Regex p2)
{
    if (p1 == p2)
    {
        return true;
    }
    if (p1.Options != p2.Options)
    {
        return false;
    }
    return p1.ToString() == p2.ToString();
}
289 
/// <summary>
/// Reads until end-of-stream and returns all characters read; the reader
/// is always disposed afterwards, whether or not reading succeeded.
/// </summary>
/// <remarks>
/// Uses <c>TextReader.ReadToEnd</c> instead of a hand-rolled
/// buffer-doubling loop; the returned text is the same.
/// </remarks>
/// <param name="input">The reader to drain.</param>
/// <returns>Everything the reader delivered, as a single string.</returns>
/// <exception cref="IOException">If an I/O error occurs while reading.</exception>
private static String ToString(TextReader input)
{
    // 'using' disposes on every exit path and tolerates a null resource,
    // matching the null check the original finally block performed.
    using (input)
    {
        return input.ReadToEnd();
    }
}
330 
331 
333  // Nested classes:
/// <summary>
/// General-purpose tokenizer: treats every match of the supplied regex as
/// a delimiter and emits the non-empty pieces of text between matches.
/// </summary>
private sealed class RegexTokenizer : TokenStream
{
    private readonly String str;
    private readonly bool toLowerCase;
    // Current delimiter match; null once the trailing piece has been consumed.
    private Match matcher;
    // Index just past the previous delimiter, i.e. where the next piece starts.
    private int pos = 0;
    private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;

    /// <summary>
    /// Starts matching <paramref name="regex"/> against <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The text to tokenize.</param>
    /// <param name="regex">The delimiter pattern.</param>
    /// <param name="toLowerCase">Whether emitted terms are lower-cased.</param>
    public RegexTokenizer(String str, Regex regex, bool toLowerCase)
    {
        this.str = str;
        this.matcher = regex.Match(str);
        this.toLowerCase = toLowerCase;
        this.termAtt = AddAttribute<ITermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Advances to the next non-empty piece of text between delimiters.
    /// </summary>
    /// <returns>
    /// <c>true</c> when a term and its offsets were set; <c>false</c> once
    /// the input is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        if (matcher == null) return false;
        ClearAttributes();
        while (true)
        { // loop takes care of leading and trailing boundary cases
            int start = pos;
            int end;
            bool isMatch = matcher.Success;
            if (isMatch)
            {
                // the piece runs up to the delimiter; resume after it
                end = matcher.Index;
                pos = matcher.Index + matcher.Length;
                matcher = matcher.NextMatch();
            }
            else
            {
                // no delimiter left: the piece is the rest of the text
                end = str.Length;
                matcher = null; // we're finished
            }

            if (start != end)
            { // non-empty piece (header/trailer)
                String text = str.Substring(start, end - start);
                if (toLowerCase) text = text.ToLower(locale);
                termAtt.SetTermBuffer(text);
                offsetAtt.SetOffset(start, end);
                return true;
            }

            // An empty piece (text starting with a delimiter, or two
            // adjacent delimiters) must be skipped, not treated as end of
            // input. Stop only when there was no further delimiter.
            if (!isMatch) return false;
        }
    }

    /// <summary>
    /// Sets the final offset to the length of the input text.
    /// </summary>
    public override void End()
    {
        // set final offset
        int finalOffset = str.Length;
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    protected override void Dispose(bool disposing)
    {
        // Do Nothing
    }
}
404 
405 
407  // Nested classes:
/// <summary>
/// Hand-written scanner used instead of the regex tokenizer for the two
/// well-known patterns. Token characters are either letters
/// (<c>char.IsLetter</c>) or non-whitespace (<c>!char.IsWhiteSpace</c>),
/// depending on the <c>isLetter</c> flag. Stop words are skipped inline.
/// </summary>
private sealed class FastStringTokenizer : TokenStream
{
    private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;

    private readonly String str;
    private readonly bool isLetter;
    private readonly bool toLowerCase;
    private readonly ISet<string> stopWords;
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    // Index at which the next scan resumes.
    private int pos;

    /// <summary>
    /// Creates a tokenizer over <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The text to tokenize.</param>
    /// <param name="isLetter">
    /// <c>true</c> to treat letters as token characters, <c>false</c> to
    /// treat every non-whitespace character as a token character.
    /// </param>
    /// <param name="toLowerCase">Whether emitted terms are lower-cased.</param>
    /// <param name="stopWords">Terms to skip, or null for none.</param>
    public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords)
    {
        this.str = str;
        this.isLetter = isLetter;
        this.toLowerCase = toLowerCase;
        this.stopWords = stopWords;
        this.termAtt = AddAttribute<ITermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Advances to the next term that is not a stop word.
    /// </summary>
    /// <returns>
    /// <c>true</c> when a term and its offsets were set; <c>false</c> once
    /// the input is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        ClearAttributes();

        // local copies of the instance state used by the scan loop
        String source = str;
        int length = source.Length;
        int cursor = pos;
        bool lettersOnly = isLetter;

        while (true)
        {
            // skip separator characters
            while (cursor < length && !IsTokenChar(source[cursor], lettersOnly))
            {
                cursor++;
            }

            if (cursor >= length)
            {
                // nothing but separators remained
                pos = cursor;
                return false;
            }

            // source[cursor] is a token character: consume the whole run
            int tokenStart = cursor;
            do
            {
                cursor++;
            } while (cursor < length && IsTokenChar(source[cursor], lettersOnly));

            String term = source.Substring(tokenStart, cursor - tokenStart);
            if (toLowerCase)
            {
                term = term.ToLower(locale);
            }

            if (IsStopWord(term))
            {
                continue; // dropped; look for the next term
            }

            pos = cursor;
            termAtt.SetTermBuffer(term);
            offsetAtt.SetOffset(tokenStart, cursor);
            return true;
        }
    }

    /// <summary>
    /// Sets the final offset to the length of the input text.
    /// </summary>
    public override void End()
    {
        int finalOffset = str.Length;
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    protected override void Dispose(bool disposing)
    {
        // Do Nothing
    }

    // Decides whether c belongs to a term under the selected mode.
    private static bool IsTokenChar(char c, bool isLetter)
    {
        if (isLetter)
        {
            return char.IsLetter(c);
        }
        return !char.IsWhiteSpace(c);
    }

    // True when a stop set is configured and it contains the term.
    private bool IsStopWord(string text)
    {
        if (stopWords == null)
        {
            return false;
        }
        return stopWords.Contains(text);
    }
}
511 
512 
514  // Nested classes:
/// <summary>
/// A <c>StringReader</c> that exposes its contained string for fast direct
/// access, so that the analyzer can skip copying the text out of the
/// reader.
/// </summary>
internal sealed class FastStringReader : StringReader
{
    private readonly string s;

    /// <summary>
    /// Creates a reader over <paramref name="s"/> and remembers the string.
    /// </summary>
    /// <param name="s">The text to read.</param>
    // 'internal' rather than 'protected internal': the class is sealed, so
    // the 'protected' part could never apply and only produced compiler
    // warning CS0628. Accessibility within the assembly is unchanged.
    internal FastStringReader(string s)
        : base(s)
    {
        this.s = s;
    }

    /// <summary>
    /// Returns the string this reader was created with, regardless of how
    /// much of it has already been read.
    /// </summary>
    internal string GetString()
    {
        return s;
    }
}
536 
537  }
538 }