Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
ChineseFilter.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Collections;
26 using System.Globalization;
27 
28 using Lucene.Net.Analysis;
29 using Lucene.Net.Analysis.Tokenattributes;
30 
namespace Lucene.Net.Analysis.Cn
{
    /// <summary>
    /// A <see cref="TokenFilter"/> with a stop word table.
    /// <list type="bullet">
    /// <item><description>Tokens found in the stop word table are removed.</description></item>
    /// <item><description>Tokens starting with a lowercase or uppercase letter (English) must be longer than 1 char.</description></item>
    /// <item><description>Tokens starting with a character of category <see cref="UnicodeCategory.OtherLetter"/>
    /// (one Chinese char as one Chinese word) are always kept.</description></item>
    /// <item><description>All other tokens, e.g. numeric tokens, are removed.</description></item>
    /// </list>
    /// TO DO:
    /// <list type="number">
    /// <item><description>Add Chinese stop words, such as \ue400</description></item>
    /// <item><description>Dictionary based Chinese word extraction</description></item>
    /// <item><description>Intelligent Chinese word extraction</description></item>
    /// </list>
    /// </summary>
    public sealed class ChineseFilter : TokenFilter
    {
        /// <summary>
        /// Default stop words. Only English now, Chinese to be added later.
        /// </summary>
        public static String[] STOP_WORDS =
        {
            "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
        };

        // Lookup table built from STOP_WORDS once per filter instance.
        private readonly CharArraySet stopTable;

        // Term attribute shared with the wrapped stream; exposes the current token text.
        private readonly ITermAttribute termAtt;

        /// <summary>
        /// Creates a filter that consumes tokens from <paramref name="_in"/>.
        /// </summary>
        /// <param name="_in">The token stream to filter.</param>
        public ChineseFilter(TokenStream _in)
            : base(_in)
        {
            // NOTE(review): the second argument is presumably "ignoreCase"; false would make
            // stop word matching case-sensitive - confirm against CharArraySet.
            stopTable = new CharArraySet((IEnumerable<string>)STOP_WORDS, false);
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Advances the wrapped stream to the next token that passes the filter.
        /// </summary>
        /// <returns>
        /// <c>true</c> when an accepted token is available through the term attribute;
        /// <c>false</c> once the wrapped stream has no more tokens.
        /// </returns>
        public override bool IncrementToken()
        {
            while (input.IncrementToken())
            {
                char[] text = termAtt.TermBuffer();
                int termLength = termAtt.TermLength();

                // An empty term has no first character to classify; text[0] would only
                // expose whatever the buffer happens to hold past the term, so skip it.
                if (termLength == 0)
                {
                    continue;
                }

                // why not key off token type here assuming ChineseTokenizer comes first?
                if (stopTable.Contains(text, 0, termLength))
                {
                    continue;
                }

                // The token is classified by its first character only.
                switch (char.GetUnicodeCategory(text[0]))
                {
                    case UnicodeCategory.LowercaseLetter:
                    case UnicodeCategory.UppercaseLetter:
                        // English word/token should be larger than 1 char.
                        if (termLength > 1)
                        {
                            return true;
                        }
                        break;

                    case UnicodeCategory.OtherLetter:
                        // One Chinese char as one Chinese word.
                        // Chinese word extraction to be added later here.
                        return true;
                }

                // Any other category (digits, punctuation, ...) falls through and is dropped.
            }

            return false;
        }
    }
}