Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
StandardTokenizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Analysis.Tokenattributes;
20 using Lucene.Net.Util;
21 using CharReader = Lucene.Net.Analysis.CharReader;
22 using Token = Lucene.Net.Analysis.Token;
23 using Tokenizer = Lucene.Net.Analysis.Tokenizer;
24 using AttributeSource = Lucene.Net.Util.AttributeSource;
25 using Version = Lucene.Net.Util.Version;
26 
27 namespace Lucene.Net.Analysis.Standard
28 {
29 
/// <summary>A grammar-based tokenizer constructed with JFlex
///
/// <p/> This should be a good tokenizer for most European-language documents:
///
/// <list type="bullet">
/// <item>Splits words at punctuation characters, removing punctuation. However, a
/// dot that's not followed by whitespace is considered part of a token.</item>
/// <item>Splits words at hyphens, unless there's a number in the token, in which case
/// the whole token is interpreted as a product number and is not split.</item>
/// <item>Recognizes email addresses and internet hostnames as one token.</item>
/// </list>
///
/// <p/>Many applications have specific tokenizer needs. If this tokenizer does
/// not suit your application, please consider copying this source code
/// directory to your project and maintaining your own grammar-based tokenizer.
///
/// <a name="version"/>
/// <p/>
/// You must specify the required <see cref="Version" /> compatibility when creating
/// StandardTokenizer:
/// <list type="bullet">
/// <item>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
/// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</item>
/// </list>
/// </summary>
public sealed class StandardTokenizer : Tokenizer
{
    /// <summary>A private instance of the JFlex-constructed scanner </summary>
    private StandardTokenizerImpl scanner;

    public const int ALPHANUM = 0;
    public const int APOSTROPHE = 1;
    public const int ACRONYM = 2;
    public const int COMPANY = 3;
    public const int EMAIL = 4;
    public const int HOST = 5;
    public const int NUM = 6;
    public const int CJ = 7;

    /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
    /// as ACRONYMs.
    /// </deprecated>
    [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
    public const int ACRONYM_DEP = 8;

    /// <summary>String token types that correspond to token type int constants </summary>
    public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};

    /// <summary>When true, tokens the scanner reports as ACRONYM_DEP are re-typed
    /// as HOST and lose their trailing '.'; when false they are typed as ACRONYM.
    /// </summary>
    private bool replaceInvalidAcronym;

    // Must start at the analyzer-wide default. If left at 0, the length check in
    // IncrementToken() rejects every non-empty token and the stream appears empty.
    private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

    /// <summary>Gets or sets the max allowed token length. Any token longer
    /// than this is skipped (its position is still counted in the position
    /// increment of the next emitted token).
    /// </summary>
    public int MaxTokenLength
    {
        get { return maxTokenLength; }
        set { this.maxTokenLength = value; }
    }

    /// <summary> Creates a new instance of the
    /// <see cref="Lucene.Net.Analysis.Standard.StandardTokenizer" />. Attaches
    /// the <c>input</c> to the newly created JFlex scanner.
    /// </summary>
    /// <param name="matchVersion">Compatibility version; on or after 2.4 enables
    /// replacement of invalid acronyms.</param>
    /// <param name="input">The input reader
    ///
    /// See http://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    public StandardTokenizer(Version matchVersion, System.IO.TextReader input) : base()
    {
        this.scanner = new StandardTokenizerImpl(input);
        Init(input, matchVersion);
    }

    /// <summary> Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.</summary>
    /// <param name="matchVersion">Compatibility version; on or after 2.4 enables
    /// replacement of invalid acronyms.</param>
    /// <param name="source">The attribute source passed to the base tokenizer.</param>
    /// <param name="input">The input reader.</param>
    public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input) : base(source)
    {
        this.scanner = new StandardTokenizerImpl(input);
        Init(input, matchVersion);
    }

    /// <summary> Creates a new StandardTokenizer with a given
    /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />
    /// </summary>
    /// <param name="matchVersion">Compatibility version; on or after 2.4 enables
    /// replacement of invalid acronyms.</param>
    /// <param name="factory">The attribute factory passed to the base tokenizer.</param>
    /// <param name="input">The input reader.</param>
    public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input) : base(factory)
    {
        this.scanner = new StandardTokenizerImpl(input);
        Init(input, matchVersion);
    }

    /// <summary>Shared constructor tail: derives the acronym-replacement flag from
    /// <paramref name="matchVersion"/>, stores the reader and registers the
    /// attributes this tokenizer populates.
    /// </summary>
    private void Init(System.IO.TextReader input, Version matchVersion)
    {
        replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
        this.input = input;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
    }

    // this tokenizer generates four attributes:
    // term, offset, positionIncrement and type
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private IPositionIncrementAttribute posIncrAtt;
    private ITypeAttribute typeAtt;

    ///<summary>
    /// Advances to the next token whose length does not exceed
    /// <see cref="MaxTokenLength" />, filling in the term, offset, position
    /// increment and type attributes.
    /// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
    ///</summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> once the scanner
    /// reports end of input.</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        int posIncr = 1;

        while (true)
        {
            int tokenType = scanner.GetNextToken();

            if (tokenType == StandardTokenizerImpl.YYEOF)
            {
                return false;
            }

            if (scanner.Yylength() <= maxTokenLength)
            {
                posIncrAtt.PositionIncrement = posIncr;
                scanner.GetText(termAtt);
                int start = scanner.Yychar();
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
                // This 'if' should be removed in the next release. For now, it converts
                // invalid acronyms to HOST. When removed, only the 'else' part should
                // remain.
                if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                {
                    if (replaceInvalidAcronym)
                    {
                        typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                        termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                    }
                    else
                    {
                        typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                    }
                }
                else
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
                }
                return true;
            }
            else
            {
                // When we skip a too-long term, we still increment the
                // position increment
                posIncr++;
            }
        }
    }

    /// <summary>Sets the final offset (start == end) to the corrected position just
    /// past the last text matched by the scanner.
    /// </summary>
    public override void End()
    {
        // set final offset
        int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>Re-targets both the base tokenizer and the JFlex scanner at a new reader.</summary>
    /// <param name="reader">The new input reader.</param>
    public override void Reset(System.IO.TextReader reader)
    {
        base.Reset(reader);
        scanner.Reset(reader);
    }

    /// <summary>
    /// Remove in 3.X and make true the only valid value
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </summary>
    /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
    /// </param>
    [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
    public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
    {
        this.replaceInvalidAcronym = replaceInvalidAcronym;
    }
}
232 }