Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
ShingleFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using System.Text;
22 using Lucene.Net.Analysis.Tokenattributes;
23 using Lucene.Net.Util;
24 
25 namespace Lucene.Net.Analysis.Shingle
26 {
27  /*
28  * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
29  * In other words, it creates combinations of tokens as a single token.
30  *
31  * <p>For example, the sentence "please divide this sentence into shingles"
32  * might be tokenized into shingles "please divide", "divide this",
33  * "this sentence", "sentence into", and "into shingles".
34  *
35  * <p>This filter handles position increments > 1 by inserting filler tokens
36  * (tokens with termtext "_"). It does not handle a position increment of 0.
37  */
38  public sealed class ShingleFilter : TokenFilter
39  {
40 
/// <summary>
/// Window of captured token states that the current shingles are built from.
/// FillShingleBuffer appends to the tail and drops from the head; the list
/// instance itself is never replaced, hence readonly.
/// </summary>
private readonly LinkedList<State> shingleBuf = new LinkedList<State>();

/// <summary>
/// One builder per shingle length; allocated by SetMaxShingleSize and
/// refilled by FillShingleBuffer.
/// </summary>
private StringBuilder[] shingles;

/// <summary>
/// Value written to the type attribute of emitted shingle tokens.
/// </summary>
private String tokenType = "shingle";

/// <summary>
/// Filler token for when positionIncrement is more than 1.
/// </summary>
public static readonly char[] FILLER_TOKEN = { '_' };

/// <summary>
/// Default maximum shingle size is 2.
/// </summary>
public const int DEFAULT_MAX_SHINGLE_SIZE = 2;

/// <summary>
/// The string to use when joining adjacent tokens to form a shingle.
/// </summary>
public const String TOKEN_SEPARATOR = " ";

/// <summary>
/// By default, we output unigrams (individual tokens) as well as shingles
/// (token n-grams).
/// </summary>
private bool outputUnigrams = true;

/// <summary>
/// Maximum shingle size (number of tokens).
/// </summary>
private int maxShingleSize;
71 
/// <summary>
/// Constructs a ShingleFilter with the specified shingle size from the
/// <see cref="TokenStream"/> <c>input</c>.
/// </summary>
/// <param name="input">input stream</param>
/// <param name="maxShingleSize">maximum shingle size produced by the filter.</param>
/// <exception cref="ArgumentException">
/// Thrown by <c>SetMaxShingleSize</c> when <c>maxShingleSize</c> is less than 2.
/// </exception>
public ShingleFilter(TokenStream input, int maxShingleSize)
    : base(input)
{
    // Size validation runs first, so an invalid size throws before any
    // attribute is registered.
    SetMaxShingleSize(maxShingleSize);

    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
}
88 
/// <summary>
/// Constructs a ShingleFilter that uses <c>DEFAULT_MAX_SHINGLE_SIZE</c>
/// as its maximum shingle size.
/// </summary>
/// <param name="input">input stream</param>
public ShingleFilter(TokenStream input)
    : this(input, DEFAULT_MAX_SHINGLE_SIZE)
{
    // Nothing further: the two-argument constructor does all the setup.
}
98 
/// <summary>
/// Constructs a ShingleFilter with the default maximum shingle size and
/// the specified token type for shingle tokens.
/// </summary>
/// <param name="input">input stream</param>
/// <param name="tokenType">token type for shingle tokens</param>
public ShingleFilter(TokenStream input, String tokenType)
    : this(input, DEFAULT_MAX_SHINGLE_SIZE)
{
    // Same effect as calling setTokenType(tokenType): overrides the
    // "shingle" default assigned by the field initializer.
    this.tokenType = tokenType;
}
110 
/// <summary>
/// Set the type of the shingle tokens produced by this filter.
/// (default: "shingle")
/// </summary>
/// <remarks>
/// Kept for source compatibility with existing callers; forwards to
/// <c>SetTokenType</c>, whose casing matches the other setters on this class.
/// </remarks>
/// <param name="tokenType">token type for shingle tokens</param>
public void setTokenType(String tokenType)
{
    SetTokenType(tokenType);
}

/// <summary>
/// Set the type of the shingle tokens produced by this filter.
/// (default: "shingle")
/// </summary>
/// <param name="tokenType">token type for shingle tokens</param>
public void SetTokenType(String tokenType)
{
    this.tokenType = tokenType;
}
121 
/// <summary>
/// Chooses whether the output stream contains the input tokens (unigrams)
/// in addition to the shingles. The default is <c>true</c>.
/// </summary>
/// <param name="outputUnigrams">
/// <c>true</c> to emit the input tokens as well as shingles;
/// <c>false</c> to emit shingles only.
/// </param>
public void SetOutputUnigrams(bool outputUnigrams)
{
    this.outputUnigrams = outputUnigrams;
}
133 
/// <summary>
/// Set the max shingle size (default: 2) and allocate one empty
/// <see cref="StringBuilder"/> per shingle length.
/// </summary>
/// <param name="maxShingleSize">max size of output shingles</param>
/// <exception cref="ArgumentException">
/// Thrown when <c>maxShingleSize</c> is less than 2.
/// </exception>
public void SetMaxShingleSize(int maxShingleSize)
{
    if (maxShingleSize < 2)
    {
        throw new ArgumentException("Max shingle size must be >= 2");
    }

    // Build the replacement array completely before publishing it.
    StringBuilder[] builders = new StringBuilder[maxShingleSize];
    for (int slot = 0; slot < builders.Length; slot++)
    {
        builders[slot] = new StringBuilder();
    }

    shingles = builders;
    this.maxShingleSize = maxShingleSize;
}
152 
/// <summary>
/// Empties every StringBuilder used for storing the output shingles,
/// keeping the builder instances for reuse.
/// </summary>
private void ClearShingles()
{
    foreach (StringBuilder shingle in shingles)
    {
        shingle.Length = 0;
    }
}
163 
/// <summary>
/// State of the first buffered token while its outputs are being emitted;
/// <c>null</c> means the shingle buffer has to be refilled first.
/// </summary>
private AttributeSource.State nextToken;

/// <summary>
/// Index of the next output for the current buffer: 0 is the unigram,
/// k is the shingle held in <c>shingles[k]</c>.
/// </summary>
private int shingleBufferPosition;

/// <summary>
/// End offset of each buffered token, recorded by FillShingleBuffer.
/// </summary>
private int[] endOffsets;

/// <summary>
/// Advances to the next output token: either the unigram for the first
/// buffered token (when unigram output is enabled) or one of the shingles
/// starting at that token.
/// </summary>
/// <returns>
/// <c>true</c> if a token was produced; <c>false</c> once
/// FillShingleBuffer reports that nothing is left.
/// </returns>
public sealed override bool IncrementToken()
{
    for (;;)
    {
        // Only refill when the previous buffer has been fully emitted.
        if (nextToken == null && !FillShingleBuffer())
        {
            return false;
        }

        nextToken = shingleBuf.First.Value;

        if (outputUnigrams)
        {
            if (shingleBufferPosition == 0)
            {
                // Position 0 is the plain input token.
                RestoreState(nextToken);
                posIncrAtt.PositionIncrement = 1;
                shingleBufferPosition++;
                return true;
            }
        }
        else if (shingleBufferPosition % maxShingleSize == 0)
        {
            // Unigrams are suppressed: step over position 0.
            shingleBufferPosition++;
        }

        if (shingleBufferPosition >= shingleBuf.Count)
        {
            // Nothing (more) to emit for this buffer; go around and refill.
            nextToken = null;
            shingleBufferPosition = 0;
            continue;
        }

        // Start from the first token's attributes, then overwrite type,
        // end offset and term text with those of the shingle.
        RestoreState(nextToken);
        typeAtt.Type = tokenType;
        offsetAtt.SetOffset(offsetAtt.StartOffset, endOffsets[shingleBufferPosition]);

        StringBuilder shingleText = shingles[shingleBufferPosition];
        int shingleLength = shingleText.Length;
        char[] target = termAtt.TermBuffer();
        if (target.Length < shingleLength)
        {
            target = termAtt.ResizeTermBuffer(shingleLength);
        }
        shingleText.CopyTo(0, target, 0, shingleLength);
        termAtt.SetTermLength(shingleLength);

        // Without unigrams, the first shingle of a buffer is the one that
        // advances the position; every other shingle stacks on it.
        bool advancesPosition = !outputUnigrams && shingleBufferPosition % maxShingleSize == 1;
        posIncrAtt.PositionIncrement = advancesPosition ? 1 : 0;

        shingleBufferPosition++;
        if (shingleBufferPosition == shingleBuf.Count)
        {
            nextToken = null;
            shingleBufferPosition = 0;
        }
        return true;
    }
}
235 
/// <summary>
/// Number of filler tokens still to be delivered before the pending real token.
/// </summary>
private int numFillerTokensToInsert;

/// <summary>
/// Snapshot of the pending real token, taken before the first filler
/// overwrites the shared attributes; restored when the real token is delivered.
/// </summary>
private AttributeSource.State currentToken;

/// <summary>
/// True when a token has been read from the input but not yet delivered.
/// </summary>
private bool hasCurrentToken;

// Attribute handles are obtained once in the constructor and never
// reassigned, hence readonly.
private readonly ITermAttribute termAtt;
private readonly IOffsetAttribute offsetAtt;
private readonly IPositionIncrementAttribute posIncrAtt;
private readonly ITypeAttribute typeAtt;

/// <summary>
/// Makes the next token current on this stream's attributes, reading from
/// the input stream when nothing is pending. If the input token has a
/// position increment &gt; 1, filler tokens (term text "_") are delivered
/// first, one per call, followed by the real token.
/// </summary>
/// <returns>
/// <c>true</c> when a token (filler or real) is available in the
/// attributes; <c>false</c> when the input stream is exhausted.
/// </returns>
private bool GetNextToken()
{
    while (true)
    {
        if (numFillerTokensToInsert > 0)
        {
            if (currentToken == null)
            {
                // First filler for this gap: remember the real token so it
                // can be restored after the fillers have overwritten it.
                currentToken = CaptureState();
            }
            else
            {
                RestoreState(currentToken);
            }
            numFillerTokensToInsert--;
            // A filler token occupies no space
            offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
            termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
            return true;
        }

        if (hasCurrentToken)
        {
            if (currentToken != null)
            {
                // Fillers were emitted; bring back the real token's attributes.
                RestoreState(currentToken);
                currentToken = null;
            }
            hasCurrentToken = false;
            return true;
        }

        if (!input.IncrementToken())
        {
            return false;
        }
        hasCurrentToken = true;

        if (posIncrAtt.PositionIncrement > 1)
        {
            // One filler per skipped position; they are delivered on the
            // following loop iterations / calls.
            numFillerTokensToInsert = posIncrAtt.PositionIncrement - 1;
        }
    }
}
296 
/// <summary>
/// Fill the output buffer with new shingles: pull tokens into the shingle
/// buffer, then rebuild the shingle text and end offsets for the tokens
/// currently buffered.
/// </summary>
/// <returns>
/// <c>true</c> if the buffer holds at least one token to emit from;
/// <c>false</c> if it is empty.
/// </returns>
private bool FillShingleBuffer()
{
    bool addedToken = false;

    // Try to fill the shingle buffer.
    do
    {
        if (!GetNextToken())
        {
            break;
        }

        shingleBuf.AddLast(CaptureState());
        if (shingleBuf.Count > maxShingleSize)
        {
            shingleBuf.RemoveFirst();
        }
        addedToken = true;
    } while (shingleBuf.Count < maxShingleSize);

    if (shingleBuf.Count == 0)
    {
        return false;
    }

    // If no new token could be added to the shingle buffer, we have reached
    // the end of the input stream and have to discard the least recent token.
    if (!addedToken)
    {
        shingleBuf.RemoveFirst();
    }

    if (shingleBuf.Count == 0)
    {
        return false;
    }

    ClearShingles();

    // A freshly allocated int[] is already zero-filled.
    endOffsets = new int[shingleBuf.Count];

    int i = 0;
    foreach (State buffered in shingleBuf)
    {
        RestoreState(buffered);

        // Read the term once per token instead of once per shingle, and
        // append straight from the term buffer without an intermediate copy.
        char[] term = termAtt.TermBuffer();
        int termLength = termAtt.TermLength();

        // Token i contributes to every shingle that spans at least i + 1 tokens.
        for (int j = i; j < shingles.Length; j++)
        {
            StringBuilder shingle = shingles[j];
            if (shingle.Length != 0)
            {
                shingle.Append(TOKEN_SEPARATOR);
            }
            shingle.Append(term, 0, termLength);
        }

        endOffsets[i] = offsetAtt.EndOffset;
        i++;
    }

    return true;
}
369 
/// <summary>
/// Resets the base stream and discards all buffered shingle and
/// filler-token state held by this filter.
/// </summary>
public override void Reset()
{
    base.Reset();

    // Shingle emission state.
    shingleBuf.Clear();
    shingleBufferPosition = 0;
    nextToken = null;

    // Pending-token / filler bookkeeping.
    hasCurrentToken = false;
    currentToken = null;
    numFillerTokensToInsert = 0;
}
380  }
381 }