Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
ThaiWordFilter.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.Linq;
25 using System.Text;
26 using System.Text.RegularExpressions;
27 using Lucene.Net.Analysis.Tokenattributes;
28 
namespace Lucene.Net.Analysis.Th
{
    /// <summary>
    /// <see cref="TokenFilter"/> meant to break each token that is Thai into separate
    /// tokens, one per Thai word. The Java original does this with
    /// <c>java.text.BreakIterator</c>.
    /// </summary>
    /// <remarks>
    /// <para>
    /// PORT STATUS: the word-breaking logic is commented out in this port. The constructor
    /// always throws <see cref="NotSupportedException"/>, and <see cref="IncrementToken"/>
    /// always returns <c>false</c>. The commented-out code is kept as a reference for
    /// completing the port.
    /// </para>
    /// <para>
    /// WARNING (carried over from the Java original): this filter may not work correctly
    /// with all JREs. It is known to work with Sun/Oracle and Harmony JREs.
    /// </para>
    /// </remarks>
    public sealed class ThaiWordFilter : TokenFilter
    {
        //private BreakIterator breaker = null;

        // Only assigned by the commented-out constructor code below; never set at present.
        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;

        // Only captured by the commented-out IncrementToken logic; cleared in Reset().
        private State thaiState = null;

        // Matches a single character from the Unicode "Thai" block.
        // This is probably far slower than a simple UnicodeBlock helper would be: the
        // commented-out logic tests one char at a time, so it has to build a new string
        // for that char and then run a (possibly costly) regex on it.
        // Shared across instances so the compiled regex is built once rather than on
        // every construction attempt.
        private static readonly Regex _isThaiRegex = new Regex(@"\p{IsThai}", RegexOptions.Compiled);

        /// <summary>
        /// Passes <paramref name="input"/> to the base <see cref="TokenFilter"/> constructor
        /// and then throws, because the port of this filter is incomplete.
        /// </summary>
        /// <param name="input">The token stream handed to the base constructor.</param>
        /// <exception cref="NotSupportedException">Always thrown.</exception>
        public ThaiWordFilter(TokenStream input)
            : base(input)
        {
            throw new NotSupportedException("PORT ISSUES");
            //breaker = BreakIterator.getWordInstance(new Locale("th"));
            //termAtt = AddAttribute<TermAttribute>();
            //offsetAtt = AddAttribute<OffsetAttribute>();
        }

        /// <summary>
        /// Currently always reports that no token is available; the word-breaking logic
        /// from the Java original is retained below as commented-out reference code.
        /// </summary>
        /// <returns>Always <c>false</c>.</returns>
        public sealed override bool IncrementToken()
        {
            //int end;
            //if (thaiState != null)
            //{
            //    int start = breaker.Current();
            //    end = breaker.next();
            //    if (end != BreakIterator.DONE)
            //    {
            //        RestoreState(thaiState);
            //        termAtt.SetTermBuffer(termAtt.TermBuffer(), start, end - start);
            //        offsetAtt.SetOffset(offsetAtt.StartOffset() + start, offsetAtt.StartOffset() + end);
            //        return true;
            //    }
            //    thaiState = null;
            //}

            //if (input.IncrementToken() == false || termAtt.TermLength() == 0)
            //    return false;

            //String text = termAtt.Term();
            //if (!_isThaiRegex.Match(new string(new[]{text[0]})).Success)
            //{
            //    termAtt.SetTermBuffer(text.ToLower());
            //    return true;
            //}

            //thaiState = CaptureState();

            //breaker.SetText(text);
            //end = breaker.next();
            //if (end != BreakIterator.DONE)
            //{
            //    termAtt.SetTermBuffer(text, 0, end);
            //    offsetAtt.SetOffset(offsetAtt.StartOffset(), offsetAtt.StartOffset() + end);
            //    return true;
            //}
            return false;
        }

        /// <summary>
        /// Resets the base filter and discards any captured Thai token state.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            thaiState = null;
        }
    }
}