Lucene.Net
3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
Main Page
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Properties
Pages
contrib
Analyzers
Fa
PersianNormalizer.cs
Go to the documentation of this file.
1
/*
2
*
3
* Licensed to the Apache Software Foundation (ASF) under one
4
* or more contributor license agreements. See the NOTICE file
5
* distributed with this work for additional information
6
* regarding copyright ownership. The ASF licenses this file
7
* to you under the Apache License, Version 2.0 (the
8
* "License"); you may not use this file except in compliance
9
* with the License. You may obtain a copy of the License at
10
*
11
* http://www.apache.org/licenses/LICENSE-2.0
12
*
13
* Unless required by applicable law or agreed to in writing,
14
* software distributed under the License is distributed on an
15
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
* KIND, either express or implied. See the License for the
17
* specific language governing permissions and limitations
18
* under the License.
19
*
20
*/
21
22
using
System;
23
24
namespace
Lucene.Net.Analysis.Fa
25
{
26
/// <summary>
/// Normalizer for Persian.
/// <para>
/// Normalization is done in-place for efficiency, operating on a term buffer.
/// </para>
/// <para>Normalization is defined as:</para>
/// <list type="bullet">
/// <item><description>Normalization of various heh + hamza forms and heh goal to heh.</description></item>
/// <item><description>Normalization of Farsi yeh and yeh barree to Arabic yeh.</description></item>
/// <item><description>Normalization of Persian keheh to Arabic kaf.</description></item>
/// </list>
/// </summary>
public class PersianNormalizer
{
    /// <summary>U+064A; the character that <see cref="FARSI_YEH"/> and <see cref="YEH_BARREE"/> are rewritten to.</summary>
    public const char YEH = '\u064A';

    /// <summary>U+06CC; rewritten to <see cref="YEH"/> by <see cref="Normalize"/>.</summary>
    public const char FARSI_YEH = '\u06CC';

    /// <summary>U+06D2; rewritten to <see cref="YEH"/> by <see cref="Normalize"/>.</summary>
    public const char YEH_BARREE = '\u06D2';

    /// <summary>U+06A9; rewritten to <see cref="KAF"/> by <see cref="Normalize"/>.</summary>
    public const char KEHEH = '\u06A9';

    /// <summary>U+0643; the character that <see cref="KEHEH"/> is rewritten to.</summary>
    public const char KAF = '\u0643';

    /// <summary>U+0654; removed from the buffer by <see cref="Normalize"/>.</summary>
    public const char HAMZA_ABOVE = '\u0654';

    /// <summary>U+06C0; rewritten to <see cref="HEH"/> by <see cref="Normalize"/>.</summary>
    public const char HEH_YEH = '\u06C0';

    /// <summary>U+06C1; rewritten to <see cref="HEH"/> by <see cref="Normalize"/>.</summary>
    public const char HEH_GOAL = '\u06C1';

    /// <summary>U+0647; the character that <see cref="HEH_YEH"/> and <see cref="HEH_GOAL"/> are rewritten to.</summary>
    public const char HEH = '\u0647';

    /// <summary>
    /// Normalize an input buffer of Persian text, modifying it in place.
    /// Only the first <paramref name="len"/> characters of <paramref name="s"/> are examined.
    /// </summary>
    /// <param name="s">Input buffer.</param>
    /// <param name="len">Length of input buffer.</param>
    /// <returns>Length of input buffer after normalization.</returns>
    public int Normalize(char[] s, int len)
    {
        for (int i = 0; i < len; i++)
        {
            switch (s[i])
            {
                case FARSI_YEH:
                case YEH_BARREE:
                    s[i] = YEH;
                    break;
                case KEHEH:
                    s[i] = KAF;
                    break;
                case HEH_YEH:
                case HEH_GOAL:
                    s[i] = HEH;
                    break;
                case HAMZA_ABOVE: // necessary for HEH + HAMZA
                    len = Delete(s, i, len);
                    // The following character has been shifted into slot i;
                    // step back so the loop increment re-examines this slot.
                    i--;
                    break;
                default:
                    break;
            }
        }

        return len;
    }

    /// <summary>
    /// Delete a character in-place by shifting the characters that follow it
    /// (within the first <paramref name="len"/> characters) one slot to the left.
    /// </summary>
    /// <remarks>
    /// The returned length is always <c>len - 1</c>, including when
    /// <paramref name="pos"/> is not less than <paramref name="len"/> and no
    /// characters are moved.
    /// </remarks>
    /// <param name="s">Input buffer.</param>
    /// <param name="pos">Position of character to delete.</param>
    /// <param name="len">Length of input buffer.</param>
    /// <returns>Length of input buffer after deletion.</returns>
    protected int Delete(char[] s, int pos, int len)
    {
        if (pos < len)
        {
            Array.Copy(s, pos + 1, s, pos, len - pos - 1);
        }

        return len - 1;
    }
}
108
}
Generated on Thu Jan 3 2013 02:34:08 for Lucene.Net by Doxygen 1.8.3