23 using System.Collections.Generic;
25 using System.Text.RegularExpressions;
26 using Lucene.Net.Analysis.Tokenattributes;
29 namespace Lucene.Net.Analysis.Miscellaneous
// Shared compiled pattern "\W+": one or more consecutive non-word characters.
// TokenStream compares against this exact instance (reference comparison) to
// select the FastStringTokenizer with isLetter = true.
64 public static readonly Regex NON_WORD_PATTERN =
new Regex(
"\\W+", RegexOptions.Compiled);
// Shared compiled pattern "\s+": one or more consecutive whitespace characters.
// TokenStream compares against this exact instance (reference comparison) to
// select the FastStringTokenizer with isLetter = false.
67 public static readonly Regex WHITESPACE_PATTERN =
new Regex(
"\\s+", RegexOptions.Compiled);
// Extended English stop word list.
// Every entry must be exactly one word with no embedded or trailing whitespace:
// FastStringTokenizer only emits runs of letters (or of non-whitespace
// characters) and looks each one up with stopWords.Contains(text), so an entry
// containing a space can never match a token. "once" and "one" are therefore
// listed separately, and "xother" carries no trailing blank.
69 private static readonly
CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
71 "a",
"about",
"above",
"across",
"adj",
"after",
"afterwards",
72 "again",
"against",
"albeit",
"all",
"almost",
"alone",
"along",
73 "already",
"also",
"although",
"always",
"among",
"amongst",
"an",
74 "and",
"another",
"any",
"anyhow",
"anyone",
"anything",
75 "anywhere",
"are",
"around",
"as",
"at",
"be",
"became",
"because",
76 "become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
77 "behind",
"being",
"below",
"beside",
"besides",
"between",
78 "beyond",
"both",
"but",
"by",
"can",
"cannot",
"co",
"could",
79 "down",
"during",
"each",
"eg",
"either",
"else",
"elsewhere",
80 "enough",
"etc",
"even",
"ever",
"every",
"everyone",
"everything",
81 "everywhere",
"except",
"few",
"first",
"for",
"former",
82 "formerly",
"from",
"further",
"had",
"has",
"have",
"he",
"hence",
83 "her",
"here",
"hereafter",
"hereby",
"herein",
"hereupon",
"hers",
84 "herself",
"him",
"himself",
"his",
"how",
"however",
"i",
"ie",
"if",
85 "in",
"inc",
"indeed",
"into",
"is",
"it",
"its",
"itself",
"last",
86 "latter",
"latterly",
"least",
"less",
"ltd",
"many",
"may",
"me",
87 "meanwhile",
"might",
"more",
"moreover",
"most",
"mostly",
"much",
88 "must",
"my",
"myself",
"namely",
"neither",
"never",
89 "nevertheless",
"next",
"no",
"nobody",
"none",
"noone",
"nor",
90 "not",
"nothing",
"now",
"nowhere",
"of",
"off",
"often",
"on",
91 "once",
"one",
"only",
"onto",
"or",
"other",
"others",
"otherwise",
92 "our",
"ours",
"ourselves",
"out",
"over",
"own",
"per",
"perhaps",
93 "rather",
"s",
"same",
"seem",
"seemed",
"seeming",
"seems",
94 "several",
"she",
"should",
"since",
"so",
"some",
"somehow",
95 "someone",
"something",
"sometime",
"sometimes",
"somewhere",
96 "still",
"such",
"t",
"than",
"that",
"the",
"their",
"them",
97 "themselves",
"then",
"thence",
"there",
"thereafter",
"thereby",
98 "therefor",
"therein",
"thereupon",
"these",
"they",
"this",
99 "those",
"though",
"through",
"throughout",
"thru",
"thus",
"to",
100 "together",
"too",
"toward",
"towards",
"under",
"until",
"up",
101 "upon",
"us",
"very",
"via",
"was",
"we",
"well",
"were",
"what",
102 "whatever",
"whatsoever",
"when",
"whence",
"whenever",
103 "whensoever",
"where",
"whereafter",
"whereas",
"whereat",
104 "whereby",
"wherefrom",
"wherein",
"whereinto",
"whereof",
105 "whereon",
"whereto",
"whereunto",
"whereupon",
"wherever",
106 "wherewith",
"whether",
"which",
"whichever",
"whichsoever",
107 "while",
"whilst",
"whither",
"who",
"whoever",
"whole",
"whom",
108 "whomever",
"whomsoever",
"whose",
"whosoever",
"why",
"will",
109 "with",
"within",
"without",
"would",
"xsubj",
"xcal",
"xauthor",
110 "xother",
"xnote",
"yet",
"you",
"your",
"yours",
"yourself",
129 Version.LUCENE_CURRENT, NON_WORD_PATTERN,
true, EXTENDED_ENGLISH_STOP_WORDS);
// Pattern used to tokenize; TokenStream compares it by reference against
// NON_WORD_PATTERN / WHITESPACE_PATTERN to pick a tokenizer.
131 private readonly Regex Regex;
// When true, each token text is lower-cased before it is emitted.
132 private readonly
bool toLowerCase;
// Stop words handed to FastStringTokenizer; the constructor stores null
// instead of an empty set.
133 private readonly ISet<string> stopWords;
// Version value supplied to the constructor and stored as-is.
135 private readonly
Version matchVersion;
// Rejects a missing pattern.
// NOTE(review): the guarding condition is not part of the visible lines.
159 throw new ArgumentException(
"Regex must not be null");
// Canonicalise the pattern: if it has the same options and pattern text as one
// of the two shared static patterns, swap in that shared instance so that
// TokenStream can recognise it with a plain reference comparison.
161 if (EqRegex(NON_WORD_PATTERN, Regex)) Regex = NON_WORD_PATTERN;
162 else if (EqRegex(WHITESPACE_PATTERN, Regex)) Regex = WHITESPACE_PATTERN;
// An empty stop word set is normalised to null ("no stop words").
164 if (stopWords != null && stopWords.Count == 0) stopWords = null;
// Store the (possibly normalised) arguments in the instance fields.
167 this.toLowerCase = toLowerCase;
168 this.stopWords = stopWords;
169 this.matchVersion = matchVersion;
// Rejects a missing text.
// NOTE(review): the guarding condition is not part of the visible lines.
187 throw new ArgumentException(
"text must not be null");
// Tokenizer selection by reference comparison against the shared patterns:
// NON_WORD_PATTERN -> FastStringTokenizer splitting on non-letters (isLetter = true).
190 if (Regex == NON_WORD_PATTERN)
192 stream =
new FastStringTokenizer(text,
true, toLowerCase, stopWords);
// WHITESPACE_PATTERN -> FastStringTokenizer splitting on whitespace (isLetter = false).
194 else if (Regex == WHITESPACE_PATTERN)
196 stream =
new FastStringTokenizer(text,
false, toLowerCase, stopWords);
// Any other pattern -> general RegexTokenizer. stopWords is not passed to its
// constructor; NOTE(review): confirm how stop words are applied on this path.
200 stream =
new RegexTokenizer(text, Regex, toLowerCase);
// Fast path: a FastStringReader exposes its string through GetString(), so the
// string-based TokenStream overload is called directly with it.
220 if (reader is FastStringReader)
222 return TokenStream(fieldName, ((FastStringReader)reader).GetString());
// Otherwise obtain the text through the ToString(TextReader) helper.
227 String text = ToString(reader);
// An IOException is rethrown wrapped in a plain Exception; the original
// exception is kept as the inner exception.
230 catch (IOException e)
232 throw new Exception(
"Wrapped Exception", e);
/// <summary>
/// Compares this analyzer with <paramref name="other"/>: same instance, or equal
/// lower-casing flag, pattern (options and pattern text) and stop word set.
/// </summary>
243 public override bool Equals(Object other)
// Same instance is trivially equal.
245 if (
this == other)
return true;
// The two predefined analyzers are never equal to each other; decided here
// without comparing fields.
246 if (
this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER)
return false;
247 if (other == DEFAULT_ANALYZER &&
this == EXTENDED_ANALYZER)
return false;
// NOTE(review): the type check that guards this cast is not in the visible lines.
251 PatternAnalyzer p2 = (PatternAnalyzer)other;
// Field-wise comparison: flag, pattern via EqRegex, stop words via Eq.
253 toLowerCase == p2.toLowerCase &&
254 EqRegex(Regex, p2.Regex) &&
255 Eq(stopWords, p2.stopWords);
/// <summary>
/// Returns a hash code that is consistent with <c>Equals</c>: analyzers whose
/// patterns have the same text and options hash the pattern part identically.
/// </summary>
265 public override int GetHashCode()
// Fixed hash codes are returned for the two predefined analyzers.
267 if (
this == DEFAULT_ANALYZER)
return -1218418418;
268 if (
this == EXTENDED_ANALYZER)
return 1303507063;
// Hash the pattern TEXT rather than the Regex object. Regex does not override
// GetHashCode, so hashing the instance would give different values for two
// Regex objects that EqRegex (and therefore Equals) treats as equal.
271 h = 31 * h + Regex.ToString().GetHashCode();
272 h = 31 * h + (int)Regex.Options;
// 1231 / 1237 distinguish the two values of the lower-casing flag.
273 h = 31 * h + (toLowerCase ? 1231 : 1237);
274 h = 31 * h + (stopWords != null ? stopWords.GetHashCode() : 0);
/// <summary>
/// Null-safe equality: true when both references are the same object (including
/// both null), otherwise the result of <c>o1.Equals(o2)</c>; false when only
/// <paramref name="o1"/> is null.
/// </summary>
279 private static bool Eq(Object o1, Object o2)
281 return (o1 == o2) || (o1 != null ? o1.Equals(o2) :
false);
/// <summary>
/// Two patterns are considered equal when they are the same instance, or when
/// both their options and their pattern text (<c>ToString()</c>) match.
/// </summary>
// NOTE(review): neither argument is null-checked here; the visible callers pass
// the static patterns and the analyzer's own Regex field.
285 private static bool EqRegex(Regex p1, Regex p2)
287 return p1 == p2 || (p1.Options == p2.Options && p1.ToString() == p2.ToString());
/// <summary>
/// Reads the content of <paramref name="input"/> into a string by accumulating
/// successive Read calls in a growing char array, then disposes the reader.
/// </summary>
296 private static String ToString(TextReader input)
// Two arrays of the same initial size: "buffer" receives each Read call,
// "output" accumulates everything read so far.
// NOTE(review): "len" is used both as the initial size here and as the running
// character count below; its declaration and reset are not in the visible lines.
301 char[] buffer =
new char[len];
302 char[] output =
new char[len];
// Keep reading until Read reports that 0 characters were read.
306 while ((n = input.Read(buffer, 0, buffer.Length)) != 0)
// Not enough room for the new chunk: allocate at least double the current
// capacity (or exactly the needed size if that is larger), then copy the
// accumulated content followed by the new chunk.
308 if (len + n > output.Length)
310 char[] tmp =
new char[Math.Max(output.Length << 1, len + n)];
311 Array.Copy(output, 0, tmp, 0, len);
312 Array.Copy(buffer, 0, tmp, len, n);
// Enough room: append the chunk in place.
318 Array.Copy(buffer, 0, output, len, n);
// Only the first "len" characters of the accumulator are valid.
323 return new String(output, 0, len);
// Disposes the reader when it is non-null.
// NOTE(review): presumably inside a finally block; the try/finally is not visible here.
327 if (input != null) input.Dispose();
// Text being tokenized; token substrings and the final offset are taken from it.
342 private readonly String str;
// When true, token text is lower-cased before being stored in the term attribute.
343 private readonly
bool toLowerCase;
// Current regex match; replaced by NextMatch() as tokenization advances.
344 private Match matcher;
// Culture used for lower-casing. It is read from CurrentCulture once, when this
// static field is initialised, and reused afterwards.
// NOTE(review): lower-casing is therefore culture-sensitive - confirm this is intended.
346 private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
/// <summary>
/// Creates a tokenizer over <paramref name="str"/> driven by matches of
/// <paramref name="regex"/>; token text is lower-cased when
/// <paramref name="toLowerCase"/> is true.
/// </summary>
350 public RegexTokenizer(String str, Regex regex,
bool toLowerCase)
// Run the first match up front; IncrementToken advances from it.
353 this.matcher = regex.Match(str);
354 this.toLowerCase = toLowerCase;
// Register the attributes that IncrementToken and End fill in.
355 this.termAtt = AddAttribute<ITermAttribute>();
356 this.offsetAtt = AddAttribute<IOffsetAttribute>();
/// <summary>
/// Advances to the next token, storing its text in the term attribute and its
/// [start, end) character range in the offset attribute.
/// </summary>
359 public sealed
override bool IncrementToken()
// A null matcher means there is nothing (more) to produce.
361 if (matcher == null)
return false;
367 bool isMatch = matcher.Success;
// Move the scan position just past the current match and fetch the next one.
// NOTE(review): this suggests matches act as separators between tokens; the
// lines computing start/end are not visible - confirm.
371 pos = matcher.Index + matcher.Length;
372 matcher = matcher.NextMatch();
// Extract the token text, optionally lower-case it with the cached culture,
// then publish text and offsets.
382 String text = str.Substring(start, end - start);
383 if (toLowerCase) text = text.ToLower(locale);
384 termAtt.SetTermBuffer(text);
385 offsetAtt.SetOffset(start, end);
/// <summary>
/// Sets both offsets of the offset attribute to the length of the input string.
/// </summary>
392 public override sealed
void End()
395 int finalOffset = str.Length;
396 this.offsetAtt.SetOffset(finalOffset, finalOffset);
399 protected override void Dispose(
bool disposing)
// Hand-written tokenizer used by TokenStream in place of regex matching for the
// two shared patterns. Tokens are runs of letters (isLetter = true) or runs of
// non-whitespace characters (isLetter = false); stop words are skipped.
413 private sealed
class FastStringTokenizer :
TokenStream
// Text being tokenized.
416 private readonly String str;
// Selects the token character test used by IsTokenChar.
418 private readonly
bool isLetter;
// When true, token text is lower-cased before the stop word check.
419 private readonly
bool toLowerCase;
// Stop word set; null means no token is filtered.
420 private readonly ISet<string> stopWords;
// Culture used for lower-casing; read from CurrentCulture once, when this
// static field is initialised.
421 private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
// Stores the settings and registers the term and offset attributes.
425 public FastStringTokenizer(String str,
bool isLetter,
bool toLowerCase, ISet<string> stopWords)
428 this.isLetter = isLetter;
429 this.toLowerCase = toLowerCase;
430 this.stopWords = stopWords;
431 this.termAtt = AddAttribute<ITermAttribute>();
432 this.offsetAtt = AddAttribute<IOffsetAttribute>();
// Produces the next token that is not a stop word.
435 public override bool IncrementToken()
// Local copy of the field, used by both scanning loops below.
442 bool letter = isLetter;
// Skip characters that are not part of a token.
450 while (i < len && !IsTokenChar(s[i], letter))
// Consume the run of token characters.
458 while (i < len && IsTokenChar(s[i], letter))
// Extract the run and optionally lower-case it with the cached culture.
463 text = s.Substring(start, i - start);
464 if (toLowerCase) text = text.ToLower(locale);
// Repeat the scan while the extracted text is a stop word.
476 }
while (text != null && IsStopWord(text));
// Publish the token text and its [start, i) character range.
483 termAtt.SetTermBuffer(text);
484 offsetAtt.SetOffset(start, i);
// Sets both offsets of the offset attribute to the length of the input string.
488 public override sealed
void End()
491 int finalOffset = str.Length;
492 this.offsetAtt.SetOffset(finalOffset, finalOffset);
495 protected override void Dispose(
bool disposing)
// Token character test: a letter when isLetter is true, otherwise any
// non-whitespace character.
500 private bool IsTokenChar(
char c,
bool isLetter)
502 return isLetter ?
char.IsLetter(c) : !
char.IsWhiteSpace(c);
// True only when a stop word set is present and contains the text.
505 private bool IsStopWord(
string text)
507 return stopWords != null && stopWords.Contains(text);
// StringReader subclass that also exposes a string through GetString().
// TokenStream(fieldName, reader) checks for this type and, when it matches,
// tokenizes GetString() directly.
520 internal sealed
class FastStringReader : StringReader
// NOTE(review): presumably the string passed to the constructor; the
// assignment is not in the visible lines.
523 private readonly
string s;
525 protected internal FastStringReader(
string s)
// Accessor used by TokenStream's fast path.
531 internal string GetString()