Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
CompoundWordTokenFilterBase.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using Lucene.Net.Analysis.Tokenattributes;
21 
22 namespace Lucene.Net.Analysis.Compound
23 {
24 
/// <summary>
/// Base class for decomposition token filters.
/// </summary>
/// <remarks>
/// For every token read from the wrapped stream, a clone of that token is queued in
/// <see cref="tokens"/> and, when the term is at least <see cref="minWordSize"/> characters
/// long, <see cref="DecomposeInternal"/> is invoked. Implementations of
/// <see cref="DecomposeInternal"/> are expected to append the sub-word tokens they find to
/// <see cref="tokens"/>; queued tokens are then emitted one per call to
/// <see cref="IncrementToken"/>.
/// </remarks>
public abstract class CompoundWordTokenFilterBase : TokenFilter
{
    /// <summary>
    /// The default for minimal word length that gets decomposed.
    /// </summary>
    public static readonly int DEFAULT_MIN_WORD_SIZE = 5;

    /// <summary>
    /// The default for minimal length of subwords that get propagated to the output of this filter.
    /// </summary>
    public static readonly int DEFAULT_MIN_SUBWORD_SIZE = 2;

    /// <summary>
    /// The default for maximal length of subwords that get propagated to the output of this filter.
    /// </summary>
    public static readonly int DEFAULT_MAX_SUBWORD_SIZE = 15;

    /// <summary>
    /// Word dictionary. Entries are lowercased by this class unless the caller supplied a
    /// <c>CharArraySet</c> directly, in which case that instance is used as-is.
    /// </summary>
    protected readonly CharArraySet dictionary;

    /// <summary>Tokens queued for output; drained front-to-back by <see cref="IncrementToken"/>.</summary>
    protected readonly LinkedList<Token> tokens;

    /// <summary>Terms shorter than this are emitted unchanged and never passed to <see cref="DecomposeInternal"/>.</summary>
    protected readonly int minWordSize;

    /// <summary>Minimal sub-word length. Stored for subclasses; not interpreted by this base class.</summary>
    protected readonly int minSubwordSize;

    /// <summary>Maximal sub-word length. Stored for subclasses; not interpreted by this base class.</summary>
    protected readonly int maxSubwordSize;

    /// <summary>Longest-match flag. Stored for subclasses; not interpreted by this base class.</summary>
    protected readonly bool onlyLongestMatch;

    // Attributes of the shared attribute source; registered once in the constructor.
    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private readonly IFlagsAttribute flagsAtt;
    private readonly IPositionIncrementAttribute posIncAtt;
    private readonly ITypeAttribute typeAtt;
    private readonly IPayloadAttribute payloadAtt;

    // Reusable scratch token that receives a snapshot of the current attribute state
    // before it is handed to Decompose.
    private readonly Token wrapper = new Token();

    /// <summary>
    /// Creates a filter from a word array with explicit size limits.
    /// The array is converted with <see cref="MakeDictionary"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
        : this(input, MakeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
    {
    }

    /// <summary>
    /// Creates a filter from a word array using the default size limits.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, bool onlyLongestMatch)
        : this(input, MakeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
    {
    }

    /// <summary>
    /// Creates a filter from a word set using the default size limits.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, bool onlyLongestMatch)
        : this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
    {
    }

    /// <summary>
    /// Creates a filter from a word array using the default size limits and
    /// <c>onlyLongestMatch = false</c>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary)
        : this(input, MakeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
    {
    }

    /// <summary>
    /// Creates a filter from a word set using the default size limits and
    /// <c>onlyLongestMatch = false</c>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary)
        : this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
    {
    }

    /// <summary>
    /// Creates a filter from a word set with explicit size limits. All other constructors
    /// delegate to this one.
    /// </summary>
    /// <param name="input">The token stream to filter.</param>
    /// <param name="dictionary">
    /// The word set. A <c>CharArraySet</c> is used directly (it is neither copied nor
    /// lowercased); any other set is copied into a new <c>CharArraySet</c> with every entry
    /// lowercased.
    /// </param>
    /// <param name="minWordSize">Terms shorter than this are not decomposed.</param>
    /// <param name="minSubwordSize">Minimal sub-word length, made available to subclasses.</param>
    /// <param name="maxSubwordSize">Maximal sub-word length, made available to subclasses.</param>
    /// <param name="onlyLongestMatch">Longest-match flag, made available to subclasses.</param>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
        : base(input)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException("dictionary");
        }

        this.tokens = new LinkedList<Token>();
        this.minWordSize = minWordSize;
        this.minSubwordSize = minSubwordSize;
        this.maxSubwordSize = maxSubwordSize;
        this.onlyLongestMatch = onlyLongestMatch;

        var charArraySet = dictionary as CharArraySet;
        if (charArraySet != null)
        {
            this.dictionary = charArraySet;
        }
        else
        {
            this.dictionary = new CharArraySet(dictionary.Count, false);
            AddAllLowerCase(this.dictionary, dictionary);
        }

        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        flagsAtt = AddAttribute<IFlagsAttribute>();
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        payloadAtt = AddAttribute<IPayloadAttribute>();
    }

    /// <summary>
    /// Creates a set of lowercased words from an array.
    /// </summary>
    /// <remarks>
    /// The returned set is constructed with its second constructor argument set to
    /// <c>false</c>; case-insensitive matching therefore relies on the entries being
    /// lowercased here and on callers lowercasing the text they look up
    /// (see <see cref="MakeLowerCaseCopy"/>).
    /// TODO: We should look for a faster dictionary lookup approach.
    /// </remarks>
    /// <param name="dictionary">The words to add.</param>
    /// <returns>A set containing the lowercased words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    public static ISet<string> MakeDictionary(String[] dictionary)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException("dictionary");
        }

        CharArraySet dict = new CharArraySet(dictionary.Length, false);
        AddAllLowerCase(dict, dictionary);
        return dict;
    }

    /// <summary>
    /// Clears all attributes and then copies the state of <paramref name="token"/>
    /// (term, flags, type, offsets, position increment, payload) into them.
    /// </summary>
    private void SetToken(Token token)
    {
        ClearAttributes();
        termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
        flagsAtt.Flags = token.Flags;
        typeAtt.Type = token.Type;
        offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
        posIncAtt.PositionIncrement = token.PositionIncrement;
        payloadAtt.Payload = token.Payload;
    }

    /// <summary>
    /// Publishes the first queued token through the attributes and removes it from the queue.
    /// Callers must ensure the queue is not empty.
    /// </summary>
    private void EmitNextQueuedToken()
    {
        SetToken(tokens.First.Value);
        tokens.RemoveFirst();
    }

    /// <summary>
    /// Advances to the next output token: first any token still queued from a previous
    /// decomposition, otherwise the next input token followed by whatever
    /// <see cref="Decompose"/> queues for it.
    /// </summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> otherwise.</returns>
    public sealed override bool IncrementToken()
    {
        if (tokens.Count > 0)
        {
            EmitNextQueuedToken();
            return true;
        }

        if (!input.IncrementToken())
        {
            return false;
        }

        // Snapshot the current attribute state into the reusable wrapper token.
        wrapper.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength());
        wrapper.StartOffset = offsetAtt.StartOffset;
        wrapper.EndOffset = offsetAtt.EndOffset;
        wrapper.Flags = flagsAtt.Flags;
        wrapper.Type = typeAtt.Type;
        wrapper.PositionIncrement = posIncAtt.PositionIncrement;
        wrapper.Payload = payloadAtt.Payload;

        Decompose(wrapper);

        // Decompose always queues the original token first, so the queue is only empty
        // here if a DecomposeInternal implementation removed entries from it.
        if (tokens.Count == 0)
        {
            return false;
        }

        EmitNextQueuedToken();
        return true;
    }

    /// <summary>
    /// Adds every string of <paramref name="col"/> to <paramref name="target"/> in lowercase.
    /// </summary>
    /// <remarks>
    /// Lowercasing uses invariant-culture casing, the same rules as
    /// <see cref="MakeLowerCaseCopy"/>, so dictionary entries and lowercased token text
    /// agree regardless of the current thread culture.
    /// </remarks>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="target"/> or <paramref name="col"/> is null.
    /// </exception>
    protected static void AddAllLowerCase(ISet<string> target, ICollection<string> col)
    {
        if (target == null)
        {
            throw new ArgumentNullException("target");
        }
        if (col == null)
        {
            throw new ArgumentNullException("col");
        }

        foreach (var str in col)
        {
            target.Add(str.ToLowerInvariant());
        }
    }

    /// <summary>
    /// Returns a new array of the same length as <paramref name="buffer"/> holding the
    /// lowercase form of each character. The input array is not modified.
    /// </summary>
    /// <remarks>
    /// Uses invariant-culture casing so the result does not depend on the current thread
    /// culture (a Turkish culture, for example, would otherwise map 'I' to a dotless 'ı'
    /// and no longer match the entries produced by <see cref="AddAllLowerCase"/>).
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    protected static char[] MakeLowerCaseCopy(char[] buffer)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException("buffer");
        }

        char[] result = new char[buffer.Length];
        for (int i = 0; i < buffer.Length; ++i)
        {
            result[i] = char.ToLowerInvariant(buffer[i]);
        }

        return result;
    }

    /// <summary>
    /// Creates a sub-word token from a slice of <paramref name="prototype"/>'s term buffer.
    /// </summary>
    /// <param name="offset">Start of the slice within the prototype's term buffer.</param>
    /// <param name="length">Number of characters in the slice.</param>
    /// <param name="prototype">Token that is cloned with the new term and offsets.</param>
    /// <returns>
    /// A token whose start offset is the prototype's start offset plus
    /// <paramref name="offset"/>, whose end offset is that start plus
    /// <paramref name="length"/>, and whose position increment is 0.
    /// </returns>
    protected Token CreateToken(int offset, int length,
        Token prototype)
    {
        int newStart = prototype.StartOffset + offset;
        Token t = prototype.Clone(prototype.TermBuffer(), offset, length, newStart, newStart + length);
        t.PositionIncrement = 0;
        return t;
    }

    /// <summary>
    /// Queues a clone of <paramref name="token"/> and, if its term is at least
    /// <see cref="minWordSize"/> characters long, hands it to <see cref="DecomposeInternal"/>.
    /// </summary>
    protected void Decompose(Token token)
    {
        // In any case we give the original token back.
        tokens.AddLast((Token)token.Clone());

        // Only words of at least minWordSize characters get processed.
        if (token.TermLength() < this.minWordSize)
        {
            return;
        }

        DecomposeInternal(token);
    }

    /// <summary>
    /// Implemented by subclasses to perform the actual decomposition of
    /// <paramref name="token"/>. Called by <see cref="Decompose"/> after the original
    /// token has been queued in <see cref="tokens"/>.
    /// </summary>
    protected abstract void DecomposeInternal(Token token);

    /// <summary>
    /// Resets the underlying stream and discards any tokens still queued for output.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        tokens.Clear();
    }
}
230 }