Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
ArabicStemmer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.IO;
20 using System.Collections;
21 
22 using Lucene.Net.Analysis;
23 using Lucene.Net.Analysis.Tokenattributes;
24 using Lucene.Net.Util;
25 
26 
27 namespace Lucene.Net.Analysis.AR
28 {
29 
30 
/// <summary>
/// Stemmer for Arabic.
/// <para>
/// Stemming is done in-place for efficiency, operating on a term buffer.
/// </para>
/// <para>
/// Stemming is defined as:
/// <list type="bullet">
/// <item><description>Removal of attached definite article, conjunction, and prepositions.</description></item>
/// <item><description>Stemming of common suffixes.</description></item>
/// </list>
/// </para>
/// </summary>
public class ArabicStemmer
{
    // Arabic letters (by Unicode code point) used to build the affix tables below.
    public const char ALEF = '\u0627';
    public const char BEH = '\u0628';
    public const char TEH_MARBUTA = '\u0629';
    public const char TEH = '\u062A';
    public const char FEH = '\u0641';
    public const char KAF = '\u0643';
    public const char LAM = '\u0644';
    public const char NOON = '\u0646';
    public const char HEH = '\u0647';
    public const char WAW = '\u0648';
    public const char YEH = '\u064A';

    /// <summary>
    /// Prefixes tried, in this order, by <see cref="StemPrefix"/>; only the first
    /// one that matches is removed. The single-letter WAW entry is listed last so
    /// that the longer WAW+ALEF+LAM entry is tested before it.
    /// </summary>
    public static readonly char[][] prefixes = {
        new[] { ALEF, LAM },
        new[] { WAW, ALEF, LAM },
        new[] { BEH, ALEF, LAM },
        new[] { KAF, ALEF, LAM },
        new[] { FEH, ALEF, LAM },
        new[] { LAM, LAM },
        new[] { WAW },
    };

    /// <summary>
    /// Suffixes tested, in this order, by <see cref="StemSuffix"/>. Every entry
    /// that matches the (possibly already shortened) buffer is removed in turn.
    /// </summary>
    public static readonly char[][] suffixes = {
        new[] { HEH, ALEF },
        new[] { ALEF, NOON },
        new[] { ALEF, TEH },
        new[] { WAW, NOON },
        new[] { YEH, NOON },
        new[] { YEH, HEH },
        new[] { YEH, TEH_MARBUTA },
        new[] { HEH },
        new[] { TEH_MARBUTA },
        new[] { YEH },
    };

    /// <summary>
    /// Stem an input buffer of Arabic text: strips one prefix, then suffixes.
    /// </summary>
    /// <param name="s">input buffer; modified in place</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>length of input buffer after stemming</returns>
    public int Stem(char[] s, int len)
    {
        int afterPrefix = StemPrefix(s, len);
        return StemSuffix(s, afterPrefix);
    }

    /// <summary>
    /// Stem a prefix off an Arabic word. At most one prefix is removed.
    /// </summary>
    /// <param name="s">input buffer; modified in place</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>new length of input buffer after stemming</returns>
    public int StemPrefix(char[] s, int len)
    {
        foreach (char[] prefix in prefixes)
        {
            if (StartsWith(s, len, prefix))
            {
                return DeleteN(s, 0, len, prefix.Length);
            }
        }

        return len;
    }

    /// <summary>
    /// Stem suffix(es) off an Arabic word. The whole suffix table is walked once,
    /// so more than one suffix may be removed from the same word.
    /// </summary>
    /// <param name="s">input buffer; modified in place</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>new length of input buffer after stemming</returns>
    public int StemSuffix(char[] s, int len)
    {
        foreach (char[] suffix in suffixes)
        {
            if (EndsWith(s, len, suffix))
            {
                len = DeleteN(s, len - suffix.Length, len, suffix.Length);
            }
        }

        return len;
    }

    /// <summary>
    /// Returns true if the prefix matches and can be stemmed.
    /// </summary>
    /// <param name="s">input buffer</param>
    /// <param name="len">length of input buffer</param>
    /// <param name="prefix">prefix to check</param>
    /// <returns>true if the prefix matches and can be stemmed</returns>
    private bool StartsWith(char[] s, int len, char[] prefix)
    {
        // A one-character prefix (wa-) must leave at least 3 characters behind;
        // every other prefix must leave at least 2.
        int minRemaining = prefix.Length == 1 ? 3 : 2;
        if (len < prefix.Length + minRemaining)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (s[i] != prefix[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns true if the suffix matches and can be stemmed.
    /// </summary>
    /// <param name="s">input buffer</param>
    /// <param name="len">length of input buffer</param>
    /// <param name="suffix">suffix to check</param>
    /// <returns>true if the suffix matches and can be stemmed</returns>
    private bool EndsWith(char[] s, int len, char[] suffix)
    {
        // All suffixes require at least 2 characters to remain after stemming.
        if (len < suffix.Length + 2)
        {
            return false;
        }

        int start = len - suffix.Length;
        for (int i = 0; i < suffix.Length; i++)
        {
            if (s[start + i] != suffix[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Delete n characters in-place.
    /// </summary>
    /// <remarks>
    /// The surviving tail is moved with a single copy. Only the first
    /// (returned length) characters of <paramref name="s"/> are meaningful
    /// afterwards; whatever is stored beyond them is unspecified.
    /// </remarks>
    /// <param name="s">input buffer; modified in place</param>
    /// <param name="pos">position of the first character to delete</param>
    /// <param name="len">length of input buffer</param>
    /// <param name="nChars">number of characters to delete</param>
    /// <returns>length of input buffer after deletion</returns>
    protected int DeleteN(char[] s, int pos, int len, int nChars)
    {
        if (nChars <= 0)
        {
            return len;
        }

        // Nothing needs to move when the deleted range reaches (or passes) the
        // end of the logical buffer; Array.Copy handles the overlapping ranges.
        int tail = len - pos - nChars;
        if (tail > 0)
        {
            Array.Copy(s, pos + nChars, s, pos, tail);
        }

        return len - nChars;
    }

    /// <summary>
    /// Delete a character in-place.
    /// </summary>
    /// <param name="s">input buffer; modified in place</param>
    /// <param name="pos">position of character to delete</param>
    /// <param name="len">length of input buffer</param>
    /// <returns>length of input buffer after deletion</returns>
    protected int Delete(char[] s, int pos, int len)
    {
        if (pos < len)
        {
            // Shift everything after pos one slot to the left.
            Array.Copy(s, pos + 1, s, pos, len - pos - 1);
        }

        return len - 1;
    }
}
208 }