23 using System.Globalization;
26 using System.Text.RegularExpressions;
27 using Lucene.Net.Analysis;
28 using Lucene.Net.Analysis.Tokenattributes;
29 using Lucene.Net.Util;
31 namespace Lucene.Net.Analysis.CJK
/// <summary>
/// Token type index for a plain word; also the value <c>tokenType</c> starts from and is reset to.
/// </summary>
internal static readonly int WORD_TYPE = 0;

/// <summary>
/// Token type index assigned when a character from the Basic Latin / half- and full-width
/// branch is stored in the term buffer.
/// </summary>
internal static readonly int SINGLE_TOKEN_TYPE = 1;

/// <summary>
/// Token type index assigned when a letter outside that branch is stored in the term buffer.
/// </summary>
internal static readonly int DOUBLE_TOKEN_TYPE = 2;

/// <summary>
/// Type names published through the type attribute, indexed by the token type constants above.
/// </summary>
internal static readonly string[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

/// <summary>
/// Capacity of the term buffer; a token is cut once it reaches this many characters.
/// </summary>
internal static readonly int MAX_WORD_LEN = 255;

/// <summary>
/// Capacity of the buffer that receives characters read from the input reader.
/// </summary>
internal static readonly int IO_BUFFER_SIZE = 256;
/// <summary>Running character offset; End() reports its corrected value as the final offset.</summary>
private int offset = 0;

/// <summary>Index in <c>ioBuffer</c> of the next character to consume.</summary>
private int bufferIndex = 0;

/// <summary>Number of characters the most recent read placed in <c>ioBuffer</c>.</summary>
private int dataLen = 0;

/// <summary>Characters of the token being assembled; handed to the term attribute when emitted.</summary>
private char[] buffer = new char[MAX_WORD_LEN];

/// <summary>Staging area filled from the input reader.</summary>
private char[] ioBuffer = new char[IO_BUFFER_SIZE];

/// <summary>Type index of the token in progress; selects an entry of <c>TOKEN_TYPE_NAMES</c>.</summary>
private int tokenType = WORD_TYPE;

/// <summary>
/// Flag that IncrementToken tests and clears at several points while scanning.
/// NOTE(review): presumably marks that the previous character was already emitted as part
/// of a token; confirm against the full method body.
/// </summary>
private bool preIsTokened = false;
// Register the term, offset and type attributes on this stream and keep the returned
// instances; IncrementToken and End publish token data through these fields.
// NOTE(review): the enclosing member's header is not part of this excerpt.
153 termAtt = AddAttribute<ITermAttribute>();
154 offsetAtt = AddAttribute<IOffsetAttribute>();
155 typeAtt = AddAttribute<ITypeAttribute>();
// Unicode-block matchers that IncrementToken applies to one character at a time.
// They are static readonly because building a RegexOptions.Compiled regex is expensive and
// Regex matching is thread-safe: a single shared instance per pattern serves every
// tokenizer instead of recompiling both patterns for each new tokenizer.

/// <summary>Matches any character in the Unicode Basic Latin block.</summary>
private static readonly Regex isBasicLatin =
    new Regex(@"\p{IsBasicLatin}", RegexOptions.Compiled);

/// <summary>Matches any character in the Unicode Halfwidth and Fullwidth Forms block.</summary>
private static readonly Regex isHalfWidthAndFullWidthForms =
    new Regex(@"\p{IsHalfwidthandFullwidthForms}", RegexOptions.Compiled);
/// <summary>
/// Advances to the next token. The statements shown here pull characters from
/// <c>input</c> through <c>ioBuffer</c>, collect them in <c>buffer</c>, and publish the
/// term text, offsets and type name through the term, offset and type attributes.
/// </summary>
/// <remarks>
/// NOTE(review): loop structure, braces and return statements are not part of this excerpt,
/// so the comments below describe individual statements only.
/// </remarks>
175 public override bool IncrementToken()
// Every buffered character has been consumed: read the next chunk from the reader.
197 if (bufferIndex >= dataLen)
199 dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
// The flag is tested and then cleared; NOTE(review): the enclosing condition is not visible here.
207 if (preIsTokened ==
true)
210 preIsTokened =
false;
// Take the next buffered character and advance the read position.
228 c = ioBuffer[bufferIndex++];
// Classify c by Unicode block: Halfwidth and Fullwidth Forms, or Basic Latin.
236 bool isHalfFullForm = isHalfWidthAndFullWidthForms.Match(c.ToString()).Success;
238 if ((isBasicLatin.Match(c.ToString()).Success) || (isHalfFullForm))
// 65281..65374 is U+FF01..U+FF5E.
// NOTE(review): i is assigned on a line not shown, presumably the code point of c; confirm.
243 if (i >= 65281 && i <= 65374)
// Tests whether c is a letter, a digit, or one of '_', '+', '#'.
252 if (
char.IsLetterOrDigit(c)
253 || ((c ==
'_') || (c ==
'+') || (c ==
'#'))
// The token in progress carries the double type.
// NOTE(review): presumably a switch from non-Latin to Latin text; confirm.
263 else if (tokenType == DOUBLE_TOKEN_TYPE)
271 if (preIsTokened ==
true)
275 preIsTokened =
false;
// Store c lower-cased and mark the token as single type.
// NOTE(review): char.ToLower(char) applies the current culture's casing rules;
// char.ToLowerInvariant would make the emitted terms independent of the thread culture.
285 buffer[length++] =
char.ToLower(c);
286 tokenType = SINGLE_TOKEN_TYPE;
// buffer was allocated with MAX_WORD_LEN slots, so this means it is full.
289 if (length == MAX_WORD_LEN)
299 preIsTokened =
false;
// Letters that reach this test are stored unchanged and mark the token as double type.
310 if (
char.IsLetter(c))
315 buffer[length++] = c;
316 tokenType = DOUBLE_TOKEN_TYPE;
// The token in progress carries the single type.
// NOTE(review): the statements between this test and the store below are not shown.
320 if (tokenType == SINGLE_TOKEN_TYPE)
330 buffer[length++] = c;
331 tokenType = DOUBLE_TOKEN_TYPE;
346 if (preIsTokened ==
true)
350 preIsTokened =
false;
// Publish the token: the first `length` chars of buffer, corrected start/end offsets,
// and the type name selected by tokenType.
362 termAtt.SetTermBuffer(buffer, 0, length);
363 offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
364 typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
// dataLen is 0 when the last input.Read returned no characters.
367 else if (dataLen == 0)
/// <summary>
/// Sets both the start and end of the offset attribute to the corrected final offset.
/// </summary>
378 public override void End()
// Pass the running offset through CorrectOffset before publishing it.
381 int finalOffset = CorrectOffset(offset);
// Start and end are the same value: the final offset.
382 this.offsetAtt.SetOffset(finalOffset, finalOffset);
/// <summary>
/// Returns the scanning state (offset, buffer position, fill level, flag and token type)
/// to the values the fields are initialized with.
/// </summary>
385 public override void Reset()
// Zero the running offset and discard the I/O buffer position and fill level.
388 offset = bufferIndex = dataLen = 0;
389 preIsTokened =
false;
390 tokenType = WORD_TYPE;
393 public override void Reset(TextReader reader)