Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
CharTokenizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using Lucene.Net.Analysis.Tokenattributes;
19 using AttributeSource = Lucene.Net.Util.AttributeSource;
20 
namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Abstract base for tokenizers that decide, one character at a time, whether
    /// a character belongs to a token. Maximal runs of accepted characters are
    /// emitted as tokens; a run is cut once it reaches <c>MAX_WORD_LEN</c> characters.
    /// </summary>
    public abstract class CharTokenizer : Tokenizer
    {
        // Longest token emitted in one piece; a longer run is split at this length.
        private const int MAX_WORD_LEN = 255;

        // Number of characters requested from the reader per refill.
        private const int IO_BUFFER_SIZE = 4096;

        // Read-ahead buffer filled from the underlying reader.
        private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

        private readonly ITermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;

        // offset:      number of characters consumed in buffers before the current one
        // bufferIndex: position of the next unread character inside ioBuffer
        // dataLen:     number of valid characters currently held in ioBuffer
        private int offset = 0;
        private int bufferIndex = 0;
        private int dataLen = 0;

        /// <summary>Creates a tokenizer that reads from <paramref name="input"/>.</summary>
        protected CharTokenizer(System.IO.TextReader input)
            : base(input)
        {
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Creates a tokenizer that reads from <paramref name="input"/> and passes
        /// <paramref name="source"/> on to the base constructor.
        /// </summary>
        protected CharTokenizer(AttributeSource source, System.IO.TextReader input)
            : base(source, input)
        {
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Creates a tokenizer that reads from <paramref name="input"/> and passes
        /// <paramref name="factory"/> on to the base constructor.
        /// </summary>
        protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input)
            : base(factory, input)
        {
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Decides whether <paramref name="c"/> is part of a token. Characters for
        /// which this returns false act as token boundaries and never appear in a token.
        /// </summary>
        protected internal abstract bool IsTokenChar(char c);

        /// <summary>
        /// Hook applied to every token character before it is stored in the term
        /// buffer. The base implementation returns the character unchanged; a
        /// subclass may override it, for example to lowercase tokens.
        /// </summary>
        protected internal virtual char Normalize(char c)
        {
            return c;
        }

        /// <summary>
        /// Advances to the next token, filling the term and offset attributes.
        /// </summary>
        /// <returns>
        /// true when a token was produced; false when the reader has no more
        /// characters and no token characters are pending.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            int termLength = 0;
            int tokenStart = bufferIndex;
            char[] termChars = termAtt.TermBuffer();

            for (;;)
            {
                if (bufferIndex >= dataLen)
                {
                    // Current buffer is used up: account for it, then refill.
                    offset += dataLen;
                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                    if (dataLen <= 0)
                    {
                        // Clamp to zero so a later "offset += dataLen" cannot move offset backwards.
                        dataLen = 0;
                        if (termLength == 0)
                        {
                            return false;
                        }

                        // Input ended in the middle of a token: emit what was collected.
                        break;
                    }

                    bufferIndex = 0;
                }

                char current = ioBuffer[bufferIndex++];

                if (!IsTokenChar(current))
                {
                    if (termLength > 0)
                    {
                        // A boundary character closes the token collected so far.
                        break;
                    }

                    // Still in front of the first token character; keep scanning.
                    continue;
                }

                if (termLength == 0)
                {
                    // First character of a token; bufferIndex already points past it.
                    tokenStart = offset + bufferIndex - 1;
                }
                else if (termLength == termChars.Length)
                {
                    termChars = termAtt.ResizeTermBuffer(termLength + 1);
                }

                termChars[termLength++] = Normalize(current);

                if (termLength == MAX_WORD_LEN)
                {
                    // Length cap reached: emit the token now.
                    break;
                }
            }

            termAtt.SetTermLength(termLength);
            int correctedStart = CorrectOffset(tokenStart);
            int correctedEnd = CorrectOffset(tokenStart + termLength);
            offsetAtt.SetOffset(correctedStart, correctedEnd);
            return true;
        }

        /// <summary>
        /// Sets both the start and the end of the offset attribute to the corrected
        /// value of the current <c>offset</c> counter.
        /// </summary>
        public override void End()
        {
            int corrected = CorrectOffset(offset);
            offsetAtt.SetOffset(corrected, corrected);
        }

        /// <summary>
        /// Calls the base <c>Reset</c> with <paramref name="input"/> and clears the
        /// buffer position, buffered length and offset counter.
        /// </summary>
        public override void Reset(System.IO.TextReader input)
        {
            base.Reset(input);
            offset = 0;
            bufferIndex = 0;
            dataLen = 0;
        }
    }
}