Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
ElisionFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using Lucene.Net.Analysis.Tokenattributes;
21 
22 namespace Lucene.Net.Analysis.Fr
23 {
/// <summary>
/// Removes elisions from a <see cref="TokenStream"/>. For example, "l'avion" (the plane) will be
/// tokenized as "avion" (plane).
/// <para>
/// Note that <c>StandardTokenizer</c> sees " ' " as a space, and cuts it out.
/// </para>
/// </summary>
/// <remarks>
/// See <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>.
/// </remarks>
public sealed class ElisionFilter : TokenFilter
{
    // Characters recognised as the elision mark: the ASCII apostrophe and the
    // typographic (curly) apostrophe.
    private static readonly char[] apostrophes = { '\'', '’' };

    // Article prefixes used by the single-argument constructor.
    private static readonly string[] DefaultArticles = { "l", "m", "t", "qu", "n", "s", "j" };

    // Prefixes that are stripped when they appear immediately before an apostrophe.
    private CharArraySet articles = null;

    // Term attribute of this stream; read and rewritten by IncrementToken.
    private readonly ITermAttribute termAtt;

    /// <summary>
    /// Replaces the set of articles that this filter strips.
    /// </summary>
    /// <param name="articles">
    /// The article prefixes. A <see cref="CharArraySet"/> is stored as-is; any other set is
    /// copied into a new <see cref="CharArraySet"/>.
    /// </param>
    public void SetArticles(ISet<string> articles)
    {
        // Single type test instead of "is" followed by a cast.
        CharArraySet charArraySet = articles as CharArraySet;

        // NOTE(review): the "true" flag is presumably CharArraySet's ignore-case switch - confirm.
        this.articles = charArraySet ?? new CharArraySet(articles, true);
    }

    /// <summary>
    /// Constructs an elision filter with standard stop words.
    /// </summary>
    /// <param name="input">The token stream to filter.</param>
    internal ElisionFilter(TokenStream input)
        : this(input, DefaultArticles)
    { }

    /// <summary>
    /// Constructs an elision filter with a set of stop words.
    /// </summary>
    /// <param name="input">The token stream to filter.</param>
    /// <param name="articles">The article prefixes to strip; see <see cref="SetArticles"/>.</param>
    public ElisionFilter(TokenStream input, ISet<string> articles)
        : base(input)
    {
        SetArticles(articles);
        termAtt = AddAttribute<ITermAttribute>();
    }

    /// <summary>
    /// Constructs an elision filter with an array (or any sequence) of stop words.
    /// </summary>
    /// <param name="input">The token stream to filter.</param>
    /// <param name="articles">The article prefixes to strip; always copied into a new set.</param>
    public ElisionFilter(TokenStream input, IEnumerable<string> articles)
        : base(input)
    {
        this.articles = new CharArraySet(articles, true);
        termAtt = AddAttribute<ITermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped <see cref="TokenStream"/> and, if the current term starts with a
    /// known article followed by an apostrophe, removes that article and the apostrophe.
    /// </summary>
    /// <returns>
    /// <c>true</c> if the wrapped stream produced a token; <c>false</c> when it is exhausted.
    /// </returns>
    public override sealed bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }

        char[] termBuffer = termAtt.TermBuffer();
        int termLength = termAtt.TermLength();

        // Position of the earliest apostrophe (of either kind), or -1 if there is none.
        int aposIndex = IndexOfApostrophe(termBuffer, termLength);

        // An apostrophe has been found. If the prefix is an article strip it off.
        if (aposIndex >= 0 && articles.Contains(termBuffer, 0, aposIndex))
        {
            int start = aposIndex + 1;
            termAtt.SetTermBuffer(termBuffer, start, termLength - start);
        }

        return true;
    }

    /// <summary>
    /// Finds the first position in <paramref name="buffer"/> that holds any of the
    /// recognised apostrophe characters.
    /// </summary>
    /// <param name="buffer">The term characters.</param>
    /// <param name="length">Number of leading characters of <paramref name="buffer"/> to scan.</param>
    /// <returns>The zero-based index of the first apostrophe, or -1 if none is present.</returns>
    private static int IndexOfApostrophe(char[] buffer, int length)
    {
        for (int i = 0; i < length; i++)
        {
            if (Array.IndexOf(apostrophes, buffer[i]) >= 0)
            {
                return i;
            }
        }

        return -1;
    }
}
114 }