Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
StandardTokenizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Analysis.Tokenattributes;
20 using Lucene.Net.Util;
21 using CharReader = Lucene.Net.Analysis.CharReader;
22 using Token = Lucene.Net.Analysis.Token;
23 using Tokenizer = Lucene.Net.Analysis.Tokenizer;
24 using AttributeSource = Lucene.Net.Util.AttributeSource;
25 using Version = Lucene.Net.Util.Version;
26 
27 namespace Lucene.Net.Analysis.Standard
28 {
29 
55 
/// <summary>
/// Tokenizer that drives a <see cref="StandardTokenizerImpl"/> scanner over a
/// <see cref="System.IO.TextReader"/> and publishes each scanned token through
/// the term, offset, position-increment and type attributes.
/// </summary>
/// <remarks>
/// Tokens longer than <see cref="MaxTokenLength"/> are not emitted; they only
/// bump the position increment of the next emitted token.
/// </remarks>
public sealed class StandardTokenizer:Tokenizer
{
    /// <summary>
    /// Applies field defaults shared by every constructor.
    /// </summary>
    private void InitBlock()
    {
        // Without this default maxTokenLength stays 0 and IncrementToken would
        // discard every non-empty token.
        maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
    }

    /// <summary>Scanner that produces the raw tokens and their type codes.</summary>
    private StandardTokenizerImpl scanner;

    // Token type codes; each one is the index of its label in TOKEN_TYPES.
    public const int ALPHANUM = 0;
    public const int APOSTROPHE = 1;
    public const int ACRONYM = 2;
    public const int COMPANY = 3;
    public const int EMAIL = 4;
    public const int HOST = 5;
    public const int NUM = 6;
    public const int CJ = 7;

    /// <summary>Type code for the deprecated acronym form (see the obsolete message).</summary>
    [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
    public const int ACRONYM_DEP = 8;

    /// <summary>String labels for the token type codes, indexed by the constants above.</summary>
    public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};

    /// <summary>
    /// When true, tokens scanned as ACRONYM_DEP are re-typed as HOST and lose
    /// their trailing '.'; when false they are typed as ACRONYM.
    /// </summary>
    private bool replaceInvalidAcronym;

    /// <summary>Backing field for <see cref="MaxTokenLength"/>.</summary>
    private int maxTokenLength;

    /// <summary>
    /// Maximum length of a token that will be emitted. Longer tokens are
    /// skipped by <see cref="IncrementToken"/>.
    /// </summary>
    public int MaxTokenLength
    {
        get { return maxTokenLength; }
        set { this.maxTokenLength = value; }
    }

    /// <summary>
    /// Creates a tokenizer reading from <paramref name="input"/>.
    /// </summary>
    /// <param name="matchVersion">Version used to choose the invalid-acronym handling (see <c>Init</c>).</param>
    /// <param name="input">Reader supplying the text to tokenize.</param>
    public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
    {
        InitBlock();
        this.scanner = new StandardTokenizerImpl(input);
        Init(input, matchVersion);
    }

    /// <summary>
    /// Creates a tokenizer reading from <paramref name="input"/>, passing
    /// <paramref name="source"/> to the base constructor.
    /// </summary>
    /// <param name="matchVersion">Version used to choose the invalid-acronym handling (see <c>Init</c>).</param>
    /// <param name="source">Attribute source forwarded to the base class.</param>
    /// <param name="input">Reader supplying the text to tokenize.</param>
    public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
    {
        InitBlock();
        this.scanner = new StandardTokenizerImpl(input);
        Init(input, matchVersion);
    }

    /// <summary>
    /// Creates a tokenizer reading from <paramref name="input"/>, passing
    /// <paramref name="factory"/> to the base constructor.
    /// </summary>
    /// <param name="matchVersion">Version used to choose the invalid-acronym handling (see <c>Init</c>).</param>
    /// <param name="factory">Attribute factory forwarded to the base class.</param>
    /// <param name="input">Reader supplying the text to tokenize.</param>
    public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
    {
        InitBlock();
        this.scanner = new StandardTokenizerImpl(input);
        Init(input, matchVersion);
    }

    /// <summary>
    /// Shared constructor tail: selects the acronym handling from
    /// <paramref name="matchVersion"/>, stores the reader and registers the
    /// attributes this tokenizer fills in.
    /// </summary>
    private void Init(System.IO.TextReader input, Version matchVersion)
    {
        // Invalid acronyms are replaced from LUCENE_24 onwards.
        replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
        this.input = input;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
    }

    // This tokenizer populates four attributes:
    // term, offset, positionIncrement and type
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private IPositionIncrementAttribute posIncrAtt;
    private ITypeAttribute typeAtt;

    /// <summary>
    /// Advances to the next token whose length does not exceed
    /// <see cref="MaxTokenLength"/> and fills in its attributes.
    /// </summary>
    /// <returns>
    /// <c>true</c> when a token was produced; <c>false</c> once the scanner
    /// reports <c>StandardTokenizerImpl.YYEOF</c>.
    /// </returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        int posIncr = 1;

        while (true)
        {
            int tokenType = scanner.GetNextToken();

            if (tokenType == StandardTokenizerImpl.YYEOF)
            {
                return false;
            }

            if (scanner.Yylength() > maxTokenLength)
            {
                // When we skip a too-long term, we still increment the
                // position increment
                posIncr++;
                continue;
            }

            posIncrAtt.PositionIncrement = posIncr;
            scanner.GetText(termAtt);
            int start = scanner.Yychar();
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should
            // remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
            {
                if (replaceInvalidAcronym)
                {
                    // Re-type as HOST; the offsets set above still cover the
                    // text including the '.' that is dropped from the term.
                    typeAtt.Type = TOKEN_TYPES[HOST];
                    termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    typeAtt.Type = TOKEN_TYPES[ACRONYM];
                }
            }
            else
            {
                typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
            }
            return true;
        }
    }

    /// <summary>
    /// Sets both offsets to the corrected position just past the last scanned
    /// token.
    /// </summary>
    public override void End()
    {
        // set final offset
        int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Resets the base tokenizer and then the scanner to read from
    /// <paramref name="reader"/>.
    /// </summary>
    /// <param name="reader">New reader to tokenize.</param>
    public override void Reset(System.IO.TextReader reader)
    {
        base.Reset(reader);
        scanner.Reset(reader);
    }

    /// <summary>
    /// Overrides the version-derived choice of how ACRONYM_DEP tokens are typed.
    /// </summary>
    /// <param name="replaceInvalidAcronym">
    /// <c>true</c> to emit such tokens as HOST without the trailing '.';
    /// <c>false</c> to emit them as ACRONYM.
    /// </param>
    [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
    public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
    {
        this.replaceInvalidAcronym = replaceInvalidAcronym;
    }
}
232 }