Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
ChineseTokenizer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.IO;
24 using System.Text;
25 using System.Collections;
26 using System.Globalization;
27 
28 using Lucene.Net.Analysis;
29 using Lucene.Net.Analysis.Tokenattributes;
30 using Lucene.Net.Util;
31 
32 namespace Lucene.Net.Analysis.Cn
33 {
/// <summary>
/// Tokenizes Chinese text into individual Chinese characters.
/// <para>
/// ChineseTokenizer and CJKTokenizer differ in their token parsing logic.
/// </para>
/// <para>
/// For example, if the Chinese text "C1C2C3C4" is to be indexed:
/// <list type="bullet">
/// <item><description>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.</description></item>
/// <item><description>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</description></item>
/// </list>
/// </para>
/// <para>
/// Therefore the index created by CJKTokenizer is much larger.
/// </para>
/// <para>
/// The problem is that when searching for C1, C1C2, C1C3,
/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
/// CJKTokenizer will not work.
/// </para>
/// </summary>
public sealed class ChineseTokenizer : Tokenizer
{
    /// <summary>Maximum number of chars collected into a single token.</summary>
    private const int MaxWordLength = 255;

    /// <summary>Number of chars requested from the reader per read.</summary>
    private const int IoBufferSize = 1024;

    /// <summary>Chars of the token currently being assembled.</summary>
    private readonly char[] buffer = new char[MaxWordLength];

    /// <summary>Raw chars most recently read from the input reader.</summary>
    private readonly char[] ioBuffer = new char[IoBufferSize];

    // offset:      number of chars consumed from the input so far.
    // bufferIndex: next unread position in ioBuffer.
    // dataLen:     number of valid chars currently held in ioBuffer.
    private int offset = 0, bufferIndex = 0, dataLen = 0;

    /// <summary>Number of chars currently stored in <see cref="buffer"/>.</summary>
    private int length;

    /// <summary>Input offset at which the current token starts.</summary>
    private int start;

    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;

    /// <summary>
    /// Creates a tokenizer that reads from <paramref name="_in"/>.
    /// </summary>
    /// <param name="_in">Reader supplying the text to tokenize.</param>
    public ChineseTokenizer(TextReader _in)
        : base(_in)
    {
        Init();
    }

    /// <summary>
    /// Creates a tokenizer that reads from <paramref name="_in"/>, passing
    /// <paramref name="source"/> through to the base class constructor.
    /// </summary>
    /// <param name="source">Attribute source handed to the base class.</param>
    /// <param name="_in">Reader supplying the text to tokenize.</param>
    public ChineseTokenizer(AttributeSource source, TextReader _in)
        : base(source, _in)
    {
        Init();
    }

    /// <summary>
    /// Creates a tokenizer that reads from <paramref name="_in"/>, passing
    /// <paramref name="factory"/> through to the base class constructor.
    /// </summary>
    /// <param name="factory">Attribute factory handed to the base class.</param>
    /// <param name="_in">Reader supplying the text to tokenize.</param>
    public ChineseTokenizer(AttributeFactory factory, TextReader _in)
        : base(factory, _in)
    {
        Init();
    }

    /// <summary>
    /// Registers the term and offset attributes that this tokenizer fills in.
    /// </summary>
    private void Init()
    {
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Appends the lower-cased form of <paramref name="c"/> to the token buffer.
    /// </summary>
    /// <param name="c">The char that was just consumed from the input.</param>
    private void Push(char c)
    {
        if (length == 0)
        {
            // offset was already advanced past c, so the token starts one char back.
            start = offset - 1;
        }

        // Invariant lower-casing keeps produced terms independent of the current
        // thread culture (e.g. 'I' must not become a dotless 'i' under tr-TR).
        buffer[length++] = Char.ToLowerInvariant(c);
    }

    /// <summary>
    /// Publishes the buffered chars, if any, through the term and offset attributes.
    /// </summary>
    /// <returns><c>true</c> if a token was emitted; <c>false</c> if the buffer was empty.</returns>
    private bool Flush()
    {
        if (length <= 0)
        {
            return false;
        }

        termAtt.SetTermBuffer(buffer, 0, length);
        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
        return true;
    }

    /// <summary>
    /// Advances to the next token.
    /// <para>
    /// Decimal digits and upper/lower-case letters are accumulated into one token of at
    /// most <see cref="MaxWordLength"/> chars. Each char in the <c>OtherLetter</c>
    /// Unicode category is emitted as a token of its own. Every other char acts as a
    /// separator and is dropped.
    /// </para>
    /// </summary>
    /// <returns>
    /// <c>true</c> if a token was produced; <c>false</c> when the input is exhausted
    /// and nothing is buffered.
    /// </returns>
    public override bool IncrementToken()
    {
        ClearAttributes();

        length = 0;
        start = offset;

        while (true)
        {
            char c;
            offset++;

            // Refill the I/O buffer once every char in it has been consumed.
            if (bufferIndex >= dataLen)
            {
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }

            // TextReader.Read reports end of input by returning 0.
            if (dataLen == 0)
            {
                // Undo the speculative increment above: no char was consumed.
                offset--;
                return Flush();
            }

            c = ioBuffer[bufferIndex++];

            switch (char.GetUnicodeCategory(c))
            {
                case UnicodeCategory.DecimalDigitNumber:
                case UnicodeCategory.LowercaseLetter:
                case UnicodeCategory.UppercaseLetter:
                    Push(c);
                    if (length == MaxWordLength)
                    {
                        return Flush();
                    }
                    break;

                case UnicodeCategory.OtherLetter:
                    if (length > 0)
                    {
                        // A digit/letter run is pending: un-read this char so it is
                        // handled by the next call, then emit the pending run.
                        bufferIndex--;
                        offset--;
                        return Flush();
                    }
                    Push(c);
                    return Flush();

                default:
                    // Separator: ends a pending run, otherwise it is skipped.
                    if (length > 0)
                    {
                        return Flush();
                    }
                    break;
            }
        }
    }

    /// <summary>
    /// Sets both the start and end offset to the corrected number of chars consumed.
    /// </summary>
    public override void End()
    {
        // set final offset
        int finalOffset = CorrectOffset(offset);
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Calls the base reset and zeroes the consumed-offset and I/O buffer counters.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        offset = bufferIndex = dataLen = 0;
    }

    /// <summary>
    /// Passes <paramref name="input"/> to the base class and resets the read state.
    /// </summary>
    /// <param name="input">The new reader to tokenize.</param>
    public override void Reset(TextReader input)
    {
        base.Reset(input);
        Reset();
    }
}
191 }