Lucene.Net
3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
Main Page
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Properties
Pages
contrib
Analyzers
AR
ArabicNormalizer.cs
Go to the documentation of this file.
1
/*
2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*/
17
18
using
System;
19
using
System.IO;
20
using
System.Collections;
21
22
using
Lucene.Net.Analysis;
23
using
Lucene.Net.Analysis.Tokenattributes;
24
using
Lucene.Net.Util;
25
26
27
namespace
Lucene.Net.Analysis.AR
28
{
29
/// <summary>
/// Normalizer for Arabic text.
/// <para>
/// The work is done in place on a caller-supplied term buffer, so no new
/// buffer is allocated.
/// </para>
/// <para>Normalization consists of:</para>
/// <list type="bullet">
/// <item><description>Folding alef with madda / hamza above / hamza below to a bare alef.</description></item>
/// <item><description>Folding teh marbuta to heh.</description></item>
/// <item><description>Folding dotless yeh (alef maksura) to yeh.</description></item>
/// <item><description>Removing the Arabic diacritics (the harakat).</description></item>
/// <item><description>Removing tatweel (the stretching character).</description></item>
/// </list>
/// </summary>
public class ArabicNormalizer
{
    // --- Alef forms: the three decorated variants are folded to ALEF. ---

    /// <summary>Bare alef (U+0627); replacement for the decorated alef forms.</summary>
    public const char ALEF = '\u0627';
    /// <summary>Alef with madda above (U+0622).</summary>
    public const char ALEF_MADDA = '\u0622';
    /// <summary>Alef with hamza above (U+0623).</summary>
    public const char ALEF_HAMZA_ABOVE = '\u0623';
    /// <summary>Alef with hamza below (U+0625).</summary>
    public const char ALEF_HAMZA_BELOW = '\u0625';

    // --- Yeh forms: DOTLESS_YEH is folded to YEH. ---

    /// <summary>Yeh (U+064A); replacement for dotless yeh.</summary>
    public const char YEH = '\u064A';
    /// <summary>Dotless yeh / alef maksura (U+0649).</summary>
    public const char DOTLESS_YEH = '\u0649';

    // --- Teh marbuta is folded to HEH. ---

    /// <summary>Teh marbuta (U+0629).</summary>
    public const char TEH_MARBUTA = '\u0629';
    /// <summary>Heh (U+0647); replacement for teh marbuta.</summary>
    public const char HEH = '\u0647';

    // --- Characters that are removed outright. ---

    /// <summary>Tatweel, the stretching character (U+0640).</summary>
    public const char TATWEEL = '\u0640';

    /// <summary>Fathatan (U+064B).</summary>
    public const char FATHATAN = '\u064B';
    /// <summary>Dammatan (U+064C).</summary>
    public const char DAMMATAN = '\u064C';
    /// <summary>Kasratan (U+064D).</summary>
    public const char KASRATAN = '\u064D';
    /// <summary>Fatha (U+064E).</summary>
    public const char FATHA = '\u064E';
    /// <summary>Damma (U+064F).</summary>
    public const char DAMMA = '\u064F';
    /// <summary>Kasra (U+0650).</summary>
    public const char KASRA = '\u0650';
    /// <summary>Shadda (U+0651).</summary>
    public const char SHADDA = '\u0651';
    /// <summary>Sukun (U+0652).</summary>
    public const char SUKUN = '\u0652';

    /// <summary>
    /// Normalizes the first <paramref name="len"/> characters of
    /// <paramref name="s"/> in place.
    /// </summary>
    /// <param name="s">Input buffer; it is modified by this call.</param>
    /// <param name="len">Number of characters of <paramref name="s"/> to process.</param>
    /// <returns>Number of characters that remain after normalization.</returns>
    public int Normalize(char[] s, int len)
    {
        int index = 0;

        while (index < len)
        {
            char current = s[index];

            if (IsRemovable(current))
            {
                // The tail has been shifted left onto this slot, so the same
                // index must be examined again: do not advance.
                len = Delete(s, index, len);
                continue;
            }

            if (current == ALEF_MADDA
                || current == ALEF_HAMZA_ABOVE
                || current == ALEF_HAMZA_BELOW)
            {
                s[index] = ALEF;
            }
            else if (current == DOTLESS_YEH)
            {
                s[index] = YEH;
            }
            else if (current == TEH_MARBUTA)
            {
                s[index] = HEH;
            }

            index++;
        }

        return len;
    }

    /// <summary>
    /// Removes the character at <paramref name="pos"/> by shifting the
    /// characters that follow it (up to <paramref name="len"/>) one slot left.
    /// </summary>
    /// <param name="s">Input buffer; it is modified by this call.</param>
    /// <param name="pos">Position of the character to delete.</param>
    /// <param name="len">Length of the used part of the buffer.</param>
    /// <returns>
    /// <paramref name="len"/> minus one. Note that the length is decremented
    /// even when <paramref name="pos"/> is not below <paramref name="len"/>
    /// and nothing was moved.
    /// </returns>
    protected int Delete(char[] s, int pos, int len)
    {
        int shortened = len - 1;

        if (pos >= len)
        {
            return shortened;
        }

        // Source and destination overlap; Array.Copy handles that correctly.
        Array.Copy(s, pos + 1, s, pos, len - pos - 1);
        return shortened;
    }

    // True for tatweel and for the eight diacritics, which occupy the
    // contiguous code-point range FATHATAN (U+064B) .. SUKUN (U+0652).
    private static bool IsRemovable(char c)
    {
        return c == TATWEEL || (c >= FATHATAN && c <= SUKUN);
    }
}
130
}
Generated on Thu Jan 3 2013 02:34:08 for Lucene.Net by Doxygen 1.8.3