Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
CJKTokenizer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Globalization;
24 using System.IO;
25 using System.Text;
26 using System.Text.RegularExpressions;
27 using Lucene.Net.Analysis;
28 using Lucene.Net.Analysis.Tokenattributes;
29 using Lucene.Net.Util;
30 
namespace Lucene.Net.Analysis.CJK
{
    /// <summary>
    /// Tokenizer for text that mixes Latin-script words with CJK (Chinese,
    /// Japanese, Korean) characters.
    /// <para>
    /// Characters from the Basic Latin block (and full-width forms folded to
    /// Basic Latin) that are letters, digits, '_', '+' or '#' are collected into
    /// lower-cased word tokens. Letters outside those blocks are emitted as
    /// overlapping two-character tokens.
    /// </para>
    /// <para>
    /// Example: "java C1C2C3C4" is segmented into "java", "C1C2", "C2C3", "C3C4".
    /// Zero-length tokens are never returned.
    /// </para>
    /// <para>
    /// For more information on Asian language (Chinese, Japanese, Korean) text
    /// segmentation, search for
    /// <a href="http://www.google.com/search?q=word+chinese+segment">word chinese segment</a>.
    /// </para>
    /// </summary>
    /// <remarks>
    /// Ported from the Java CJKTokenizer (author: Che, Dong;
    /// version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $),
    /// which was itself modified from StopTokenizer.
    /// </remarks>
    public sealed class CJKTokenizer : Tokenizer
    {
        //~ Static fields/initializers ---------------------------------------------

        /// <summary>
        /// Word token type (index into <see cref="TOKEN_TYPE_NAMES"/>).
        /// </summary>
        internal static readonly int WORD_TYPE = 0;

        /// <summary>
        /// Single byte token type (index into <see cref="TOKEN_TYPE_NAMES"/>).
        /// </summary>
        internal static readonly int SINGLE_TOKEN_TYPE = 1;

        /// <summary>
        /// Double byte token type (index into <see cref="TOKEN_TYPE_NAMES"/>).
        /// </summary>
        internal static readonly int DOUBLE_TOKEN_TYPE = 2;

        /// <summary>
        /// Names for token types, indexed by the token type constants above.
        /// </summary>
        internal static readonly String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

        /// <summary>
        /// Max word length; a token is cut off once it reaches this many characters.
        /// </summary>
        internal static readonly int MAX_WORD_LEN = 255;

        /// <summary>
        /// Size of the I/O buffer used to read from the input.
        /// </summary>
        internal static readonly int IO_BUFFER_SIZE = 256;

        // Code-point ranges of the Unicode blocks that were previously tested
        // with \p{IsBasicLatin} and \p{IsHalfwidthandFullwidthForms}.
        private const char BasicLatinLast = '\u007F';
        private const char HalfwidthFullwidthFirst = '\uFF00';
        private const char HalfwidthFullwidthLast = '\uFFEF';

        // Full-width characters in [65281, 65374] are folded to Basic Latin by
        // subtracting 65248 (same values as the original decimal literals).
        private const int FullwidthAsciiFirst = 0xFF01;
        private const int FullwidthAsciiLast = 0xFF5E;
        private const int FullwidthToAsciiDelta = 0xFEE0;

        //~ Instance fields --------------------------------------------------------

        /// <summary>
        /// Word offset: position in the input of the character being parsed.
        /// </summary>
        private int offset = 0;

        /// <summary>
        /// Index of the next character to consume from <see cref="ioBuffer"/>.
        /// </summary>
        private int bufferIndex = 0;

        /// <summary>
        /// Number of characters placed in <see cref="ioBuffer"/> by the last read;
        /// 0 means the input is exhausted.
        /// </summary>
        private int dataLen = 0;

        /// <summary>
        /// Character buffer holding the characters that make up the token being built.
        /// </summary>
        private readonly char[] buffer = new char[MAX_WORD_LEN];

        /// <summary>
        /// I/O buffer holding the most recently read chunk of the input.
        /// </summary>
        private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

        /// <summary>
        /// Type of the token being built: one of <see cref="WORD_TYPE"/> (default),
        /// <see cref="SINGLE_TOKEN_TYPE"/> (ASCII) or <see cref="DOUBLE_TOKEN_TYPE"/> (non-ASCII).
        /// </summary>
        private int tokenType = WORD_TYPE;

        /// <summary>
        /// True when the character about to be re-read was already returned as the
        /// second character of the previous two-character token. For "C1C2C3C4":
        /// after "C1C2" is returned C2 is flagged, after "C2C3" C3 is flagged, and
        /// so on, yielding "C1C2" "C2C3" "C3C4".
        /// </summary>
        private bool preIsTokened = false;

        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private ITypeAttribute typeAtt;

        //~ Constructors -----------------------------------------------------------

        /// <summary>
        /// Construct a token stream processing the given input.
        /// </summary>
        /// <param name="_in">I/O reader</param>
        public CJKTokenizer(TextReader _in)
            : base(_in)
        {
            Init();
        }

        /// <summary>
        /// Construct a token stream processing the given input, passing
        /// <paramref name="source"/> to the base tokenizer.
        /// </summary>
        /// <param name="source">attribute source handed to the base class</param>
        /// <param name="_in">I/O reader</param>
        public CJKTokenizer(AttributeSource source, TextReader _in)
            : base(source, _in)
        {
            Init();
        }

        /// <summary>
        /// Construct a token stream processing the given input, passing
        /// <paramref name="factory"/> to the base tokenizer.
        /// </summary>
        /// <param name="factory">attribute factory handed to the base class</param>
        /// <param name="_in">I/O reader</param>
        public CJKTokenizer(AttributeFactory factory, TextReader _in)
            : base(factory, _in)
        {
            Init();
        }

        /// <summary>
        /// Registers the term, offset and type attributes this tokenizer fills in.
        /// </summary>
        private void Init()
        {
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }

        //~ Methods ----------------------------------------------------------------

        /// <summary>
        /// Returns true if <paramref name="c"/> lies in the Unicode Basic Latin
        /// block (U+0000 to U+007F).
        /// </summary>
        private static bool IsBasicLatin(char c)
        {
            return c <= BasicLatinLast;
        }

        /// <summary>
        /// Returns true if <paramref name="c"/> lies in the Unicode Halfwidth and
        /// Fullwidth Forms block (U+FF00 to U+FFEF).
        /// </summary>
        private static bool IsHalfwidthOrFullwidthForm(char c)
        {
            return c >= HalfwidthFullwidthFirst && c <= HalfwidthFullwidthLast;
        }

        /// <summary>
        /// Advances to the next token in the stream and fills in the term,
        /// offset and type attributes.
        /// </summary>
        /// <returns>false for end of stream, true otherwise</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            while (true)
            {
                // loop until we find a non-empty token

                /* how many character(s) have been stored in buffer */
                int length = 0;

                /* the position used to create the token */
                int start = offset;

                while (true)
                {
                    // loop until we've found a full token

                    /* current character */
                    char c;

                    offset++;

                    if (bufferIndex >= dataLen)
                    {
                        // the I/O buffer is used up: refill it from the input
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        bufferIndex = 0;
                    }

                    if (dataLen == 0) // input.Read returns 0 when it is empty, not -1 as in Java
                    {
                        if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                // the only buffered character was already returned
                                // as part of the previous token: drop it
                                length = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                offset--;
                            }

                            break;
                        }
                        else
                        {
                            offset--;
                            return false;
                        }
                    }
                    else
                    {
                        // get current character
                        c = ioBuffer[bufferIndex++];
                    }

                    // Plain range checks replace the former per-character regex
                    // matches on c.ToString(); the ranges are exactly the Unicode
                    // blocks the regexes named.
                    bool isHalfFullForm = IsHalfwidthOrFullwidthForm(c);

                    // if the current character is ASCII or Extend ASCII
                    if (IsBasicLatin(c) || isHalfFullForm)
                    {
                        if (isHalfFullForm)
                        {
                            int i = (int) c;
                            if (i >= FullwidthAsciiFirst && i <= FullwidthAsciiLast)
                            {
                                // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                                i = i - FullwidthToAsciiDelta;
                                c = (char) i;
                            }
                        }

                        // if the current character is a letter, a digit or "_" "+" "#"
                        if (char.IsLetterOrDigit(c) || c == '_' || c == '+' || c == '#')
                        {
                            if (length == 0)
                            {
                                // "javaC1C2C3C4linux"
                                //  ^--: the current character begins an ASCII token
                                start = offset - 1;
                            }
                            else if (tokenType == DOUBLE_TOKEN_TYPE)
                            {
                                // "javaC1C2C3C4linux"
                                //             ^--: the previous character was non-ASCII,
                                //                  the current one is ASCII.
                                // Push the current character back so the next call
                                // starts with it.
                                offset--;
                                bufferIndex--;

                                if (preIsTokened)
                                {
                                    // only one non-ASCII character has been stored,
                                    // and it was already returned: drop it
                                    length = 0;
                                    preIsTokened = false;
                                }

                                break;
                            }

                            // Store the lower-cased character. Invariant lower-casing
                            // keeps the produced terms independent of the current
                            // thread culture (e.g. 'I' must not become a dotless i
                            // under a Turkish culture).
                            buffer[length++] = char.ToLowerInvariant(c);
                            tokenType = SINGLE_TOKEN_TYPE;

                            // stop collecting once the token buffer is full
                            if (length == MAX_WORD_LEN)
                            {
                                break;
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                // the buffered character was already returned: drop
                                // it and keep scanning
                                length = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // non-ASCII letter, e.g. "C1C2C3C4"
                        if (char.IsLetter(c))
                        {
                            if (length == 0)
                            {
                                start = offset - 1;
                                buffer[length++] = c;
                                tokenType = DOUBLE_TOKEN_TYPE;
                            }
                            else
                            {
                                if (tokenType == SINGLE_TOKEN_TYPE)
                                {
                                    // push the current character back and
                                    // return the previous ASCII characters
                                    offset--;
                                    bufferIndex--;

                                    break;
                                }
                                else
                                {
                                    buffer[length++] = c;
                                    tokenType = DOUBLE_TOKEN_TYPE;

                                    if (length == 2)
                                    {
                                        // step back one character so the second
                                        // character of this token is read again as
                                        // the first character of the next one
                                        offset--;
                                        bufferIndex--;
                                        preIsTokened = true;

                                        break;
                                    }
                                }
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                // empty the buffer
                                length = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                }

                if (length > 0)
                {
                    termAtt.SetTermBuffer(buffer, 0, length);
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
                    typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
                    return true;
                }
                else if (dataLen == 0)
                {
                    offset--;
                    return false;
                }

                // Cycle back and try for the next token (don't
                // return an empty string)
            }
        }

        /// <summary>
        /// Sets the offset attribute's start and end to the corrected final offset.
        /// </summary>
        public override void End()
        {
            // set final offset
            int finalOffset = CorrectOffset(offset);
            this.offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>
        /// Resets the base tokenizer and clears this tokenizer's position and
        /// token state.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            offset = bufferIndex = dataLen = 0;
            preIsTokened = false;
            tokenType = WORD_TYPE;
        }

        /// <summary>
        /// Resets the base tokenizer with <paramref name="reader"/> and then
        /// clears this tokenizer's state via <see cref="Reset()"/>.
        /// </summary>
        /// <param name="reader">reader passed to the base class reset</param>
        public override void Reset(TextReader reader)
        {
            base.Reset(reader);
            Reset();
        }
    }
}