Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
GreekLowerCaseFilter.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using Lucene.Net.Analysis.Tokenattributes;
23 
namespace Lucene.Net.Analysis.El
{
    /// <summary>
    /// Normalizes token text to lower case, removes some Greek diacritics,
    /// and standardizes final sigma to sigma.
    /// </summary>
    public sealed class GreekLowerCaseFilter : TokenFilter
    {
        // Term attribute registered on this filter; its buffer is rewritten in place.
        private readonly ITermAttribute termAtt;

        /// <summary>
        /// Creates a filter that lower-cases and normalizes the terms of the wrapped stream.
        /// </summary>
        /// <param name="_in">The upstream token stream, handed to the base <see cref="TokenFilter"/>.</param>
        // NOTE(review): the declaration line was absent from the extracted listing and has been
        // reconstructed from the class name and the base(_in) initializer; confirm the parameter type.
        public GreekLowerCaseFilter(TokenStream _in)
            : base(_in)
        {
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Advances the wrapped stream by one token and normalizes its term text in place.
        /// </summary>
        /// <returns>
        /// <c>true</c> if the wrapped stream produced a token (its term buffer has been
        /// normalized); <c>false</c> if the wrapped stream reported no further token.
        /// </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chArray = termAtt.TermBuffer();
            int chLen = termAtt.TermLength();

            // Only the first chLen entries of the buffer belong to the current term.
            // TODO: iterate codepoints to support supp. characters
            for (int i = 0; i < chLen; i++)
            {
                chArray[i] = (char)lowerCase(chArray[i]);
            }

            return true;
        }

        /// <summary>
        /// Maps a single UTF-16 code unit to its normalized lower-case form.
        /// </summary>
        /// <param name="codepoint">The character value to normalize.</param>
        /// <returns>
        /// The mapped character for the Greek special cases listed below; otherwise the
        /// culture-invariant lower-case form of <paramref name="codepoint"/>.
        /// </returns>
        private int lowerCase(int codepoint)
        {
            switch (codepoint)
            {
                /* There are two lowercase forms of sigma:
                 *   U+03C2: small final sigma (end of word)
                 *   U+03C3: small sigma (otherwise)
                 *
                 * Standardize both to U+03C3
                 */
                case '\u03C2': /* small final sigma */
                    return '\u03C3'; /* small sigma */

                /* Some greek characters contain diacritics.
                 * This filter removes these, converting to the lowercase base form.
                 */

                case '\u0386': /* capital alpha with tonos */
                case '\u03AC': /* small alpha with tonos */
                    return '\u03B1'; /* small alpha */

                case '\u0388': /* capital epsilon with tonos */
                case '\u03AD': /* small epsilon with tonos */
                    return '\u03B5'; /* small epsilon */

                case '\u0389': /* capital eta with tonos */
                case '\u03AE': /* small eta with tonos */
                    return '\u03B7'; /* small eta */

                case '\u038A': /* capital iota with tonos */
                case '\u03AA': /* capital iota with dialytika */
                case '\u03AF': /* small iota with tonos */
                case '\u03CA': /* small iota with dialytika */
                case '\u0390': /* small iota with dialytika and tonos */
                    return '\u03B9'; /* small iota */

                case '\u038E': /* capital upsilon with tonos */
                case '\u03AB': /* capital upsilon with dialytika */
                case '\u03CD': /* small upsilon with tonos */
                case '\u03CB': /* small upsilon with dialytika */
                case '\u03B0': /* small upsilon with dialytika and tonos */
                    return '\u03C5'; /* small upsilon */

                case '\u038C': /* capital omicron with tonos */
                case '\u03CC': /* small omicron with tonos */
                    return '\u03BF'; /* small omicron */

                case '\u038F': /* capital omega with tonos */
                case '\u03CE': /* small omega with tonos */
                    return '\u03C9'; /* small omega */

                /* The previous implementation did the conversion below.
                 * Only implemented for backwards compatibility with old indexes.
                 */

                case '\u03A2': /* reserved */
                    return '\u03C2'; /* small final sigma */

                default:
                    // Invariant lower-casing keeps indexed terms independent of the thread's
                    // current culture (e.g. Turkish would otherwise map 'I' to dotless 'ı').
                    return char.ToLowerInvariant((char)codepoint);
            }
        }
    }
}