Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
HunspellStemFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using Lucene.Net.Analysis.Tokenattributes;
22 
23 namespace Lucene.Net.Analysis.Hunspell {
24  /// <summary>
25  /// TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a
26  /// word having multiple stems, this filter can emit multiple tokens for each consumed token.
27  /// </summary>
// Stemmer used by IncrementToken to turn a term into its stems.
private readonly HunspellStemmer _stemmer;

// When true, IncrementToken asks the stemmer for unique stems only.
private readonly bool _dedup;

// Term text of the current token; read for stemming and overwritten with each emitted stem.
private readonly ITermAttribute _termAtt;

// Position increment of the current token; set to 0 for every stem emitted from the buffer.
private readonly IPositionIncrementAttribute _posIncAtt;

// Stems of the most recently consumed token that have not been emitted yet.
private readonly Queue<HunspellStem> _buffer = new Queue<HunspellStem>();

// State captured after the first stem was written; restored before each buffered stem is emitted.
private State _savedState;
37 
/// <summary>
/// Builds a stemming filter on top of <paramref name="input"/>. The stems are produced by a
/// <see cref="HunspellStemmer"/> created from <paramref name="dictionary"/>.
/// </summary>
/// <param name="input">TokenStream supplying the tokens that will be stemmed.</param>
/// <param name="dictionary">HunspellDictionary with the affix rules and words; it is handed to the stemmer.</param>
/// <param name="dedup">true (the default) to output only unique stems for each token.</param>
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true)
    : base(input) {
    _dedup = dedup;

    // Register the attributes this filter reads and writes while stemming.
    _posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    _termAtt = AddAttribute<ITermAttribute>();

    _stemmer = new HunspellStemmer(dictionary);
}
53 
/// <summary>
/// Advances to the next output token. Stems still queued from the previously consumed token
/// are emitted first, each with a position increment of 0. Otherwise the next token is pulled
/// from the wrapped stream and its term is replaced by its first stem; any further stems are
/// queued for the following calls.
/// </summary>
/// <returns>true if a token was produced; false once the wrapped stream has no more tokens.</returns>
public override Boolean IncrementToken() {
    if (_buffer.Count > 0) {
        var pending = _buffer.Dequeue();

        // Start from the state captured for the first stem, then stack this stem
        // on the same position by zeroing the position increment.
        RestoreState(_savedState);
        _posIncAtt.PositionIncrement = 0;
        _termAtt.SetTermBuffer(pending.Stem, 0, pending.StemLength);
        return true;
    }

    if (!input.IncrementToken()) {
        return false;
    }

    var term = _termAtt.Term;
    var stems = _dedup
        ? _stemmer.UniqueStems(term)
        : _stemmer.Stem(term);
    foreach (var candidate in stems) {
        _buffer.Enqueue(candidate);
    }

    // An empty buffer means the word is unknown: the token is passed through unchanged.
    if (_buffer.Count > 0) {
        var first = _buffer.Dequeue();
        _termAtt.SetTermBuffer(first.Stem, 0, first.StemLength);

        // Only capture state when more stems remain to be emitted later.
        if (_buffer.Count > 0) {
            _savedState = CaptureState();
        }
    }

    return true;
}
85 
/// <summary>
/// Resets the filter: runs the base class reset and then discards every stem still waiting
/// in the buffer, so the next call to IncrementToken reads from the wrapped stream.
/// </summary>
public override void Reset() {
    base.Reset();
    _buffer.Clear();
}
91  }
92 }