Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
ArabicNormalizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.IO;
20 using System.Collections;
21 
22 using Lucene.Net.Analysis;
23 using Lucene.Net.Analysis.Tokenattributes;
24 using Lucene.Net.Util;
25 
26 
27 namespace Lucene.Net.Analysis.AR
28 {
/// <summary>
/// Normalizer for Arabic text that rewrites a character buffer in place.
/// </summary>
/// <remarks>
/// The following transformations are applied to the first <c>len</c> characters
/// of the buffer:
/// <list type="bullet">
/// <item><description>Alef with madda, alef with hamza above and alef with hamza below become a bare alef.</description></item>
/// <item><description>Teh marbuta becomes heh.</description></item>
/// <item><description>Dotless yeh (alef maksura) becomes yeh.</description></item>
/// <item><description>The Arabic diacritics (harakat) declared below are removed.</description></item>
/// <item><description>Tatweel (the stretching character) is removed.</description></item>
/// </list>
/// </remarks>
public class ArabicNormalizer
{
    // Alef and the variants that are folded into it.
    public const char ALEF = '\u0627';
    public const char ALEF_MADDA = '\u0622';
    public const char ALEF_HAMZA_ABOVE = '\u0623';
    public const char ALEF_HAMZA_BELOW = '\u0625';

    // Yeh and its dotless form.
    public const char YEH = '\u064A';
    public const char DOTLESS_YEH = '\u0649';

    // Teh marbuta and the heh it is replaced with.
    public const char TEH_MARBUTA = '\u0629';
    public const char HEH = '\u0647';

    // Stretching character; removed during normalization.
    public const char TATWEEL = '\u0640';

    // Diacritics; all of them are removed during normalization.
    public const char FATHATAN = '\u064B';
    public const char DAMMATAN = '\u064C';
    public const char KASRATAN = '\u064D';
    public const char FATHA = '\u064E';
    public const char DAMMA = '\u064F';
    public const char KASRA = '\u0650';
    public const char SHADDA = '\u0651';
    public const char SUKUN = '\u0652';

    /// <summary>
    /// Normalizes the first <paramref name="len"/> characters of <paramref name="s"/> in place.
    /// </summary>
    /// <param name="s">Buffer holding the text to normalize; it is modified directly.</param>
    /// <param name="len">Number of characters in <paramref name="s"/> that hold text.</param>
    /// <returns>The number of characters that hold text once normalization is complete.</returns>
    public int Normalize(char[] s, int len)
    {
        int index = 0;

        while (index < len)
        {
            char current = s[index];

            if (IsStrippedMark(current))
            {
                // Deleting shifts the next character into this slot, so the
                // same index has to be examined again before moving on.
                len = Delete(s, index, len);
                continue;
            }

            if (current == ALEF_MADDA
                || current == ALEF_HAMZA_ABOVE
                || current == ALEF_HAMZA_BELOW)
            {
                s[index] = ALEF;
            }
            else if (current == DOTLESS_YEH)
            {
                s[index] = YEH;
            }
            else if (current == TEH_MARBUTA)
            {
                s[index] = HEH;
            }

            index++;
        }

        return len;
    }

    /// <summary>
    /// Removes the character at <paramref name="pos"/> by shifting the characters
    /// that follow it one slot to the left.
    /// </summary>
    /// <param name="s">Buffer to modify.</param>
    /// <param name="pos">Index of the character to remove.</param>
    /// <param name="len">Number of characters in <paramref name="s"/> that hold text.</param>
    /// <returns><paramref name="len"/> minus one.</returns>
    protected int Delete(char[] s, int pos, int len)
    {
        if (pos < len)
        {
            // Number of characters located after the removed one.
            int tailLength = len - pos - 1;
            Array.Copy(s, pos + 1, s, pos, tailLength);
        }

        return len - 1;
    }

    /// <summary>
    /// Tells whether <paramref name="c"/> is tatweel or one of the diacritics
    /// that normalization removes.
    /// </summary>
    private static bool IsStrippedMark(char c)
    {
        switch (c)
        {
            case TATWEEL:
            case FATHATAN:
            case DAMMATAN:
            case KASRATAN:
            case FATHA:
            case DAMMA:
            case KASRA:
            case SHADDA:
            case SUKUN:
                return true;
            default:
                return false;
        }
    }
}
130 }