Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
PersianNormalizer.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 
24 namespace Lucene.Net.Analysis.Fa
25 {
/// <summary>
/// Normalizer for Persian text.
/// <para>
/// The normalization is performed in place on the caller's term buffer, so no
/// new array is allocated.
/// </para>
/// <para>The following rewrites are applied:</para>
/// <list type="bullet">
/// <item><description>heh with yeh above and heh goal become plain heh; a standalone
/// hamza-above mark is removed (this covers the heh + hamza sequence).</description></item>
/// <item><description>Farsi yeh and yeh barree become Arabic yeh.</description></item>
/// <item><description>Persian keheh becomes Arabic kaf.</description></item>
/// </list>
/// </summary>
public class PersianNormalizer
{
    /// <summary>Arabic yeh (U+064A); replacement for <see cref="FARSI_YEH"/> and <see cref="YEH_BARREE"/>.</summary>
    public const char YEH = '\u064A';

    /// <summary>Farsi yeh (U+06CC); rewritten to <see cref="YEH"/>.</summary>
    public const char FARSI_YEH = '\u06CC';

    /// <summary>Yeh barree (U+06D2); rewritten to <see cref="YEH"/>.</summary>
    public const char YEH_BARREE = '\u06D2';

    /// <summary>Keheh (U+06A9); rewritten to <see cref="KAF"/>.</summary>
    public const char KEHEH = '\u06A9';

    /// <summary>Arabic kaf (U+0643); replacement for <see cref="KEHEH"/>.</summary>
    public const char KAF = '\u0643';

    /// <summary>Hamza above (U+0654); removed from the buffer.</summary>
    public const char HAMZA_ABOVE = '\u0654';

    /// <summary>Heh with yeh above (U+06C0); rewritten to <see cref="HEH"/>.</summary>
    public const char HEH_YEH = '\u06C0';

    /// <summary>Heh goal (U+06C1); rewritten to <see cref="HEH"/>.</summary>
    public const char HEH_GOAL = '\u06C1';

    /// <summary>Arabic heh (U+0647); replacement for <see cref="HEH_YEH"/> and <see cref="HEH_GOAL"/>.</summary>
    public const char HEH = '\u0647';

    /// <summary>
    /// Normalizes the first <paramref name="len"/> characters of a buffer of Persian text in place.
    /// </summary>
    /// <param name="s">Input buffer; its contents are modified.</param>
    /// <param name="len">Number of characters of <paramref name="s"/> to process.</param>
    /// <returns>
    /// The length of the normalized text, which is <paramref name="len"/> minus the
    /// number of hamza-above characters that were removed.
    /// </returns>
    public int Normalize(char[] s, int len)
    {
        int index = 0;

        while (index < len)
        {
            char current = s[index];

            if (current == HAMZA_ABOVE)
            {
                // Necessary for HEH + HAMZA. Removing the mark shifts the next
                // character into this slot, so the same index is examined again.
                len = Delete(s, index, len);
                continue;
            }

            if (current == FARSI_YEH || current == YEH_BARREE)
            {
                s[index] = YEH;
            }
            else if (current == KEHEH)
            {
                s[index] = KAF;
            }
            else if (current == HEH_YEH || current == HEH_GOAL)
            {
                s[index] = HEH;
            }

            index++;
        }

        return len;
    }

    /// <summary>
    /// Deletes one character in place by shifting the characters that follow it
    /// one position towards the start of the buffer.
    /// </summary>
    /// <param name="s">Input buffer.</param>
    /// <param name="pos">Position of the character to delete.</param>
    /// <param name="len">Length of the used portion of the buffer.</param>
    /// <returns>
    /// <paramref name="len"/> minus one. Note that this value is returned even when
    /// <paramref name="pos"/> is not below <paramref name="len"/> and nothing was shifted.
    /// </returns>
    protected int Delete(char[] s, int pos, int len)
    {
        int shortened = len - 1;

        if (pos < len)
        {
            // Move the tail (everything after pos) left by one slot.
            Array.Copy(s, pos + 1, s, pos, shortened - pos);
        }

        return shortened;
    }
}
108 }