Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
NGramTokenFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System.IO;
19 using System.Collections;
20 
21 using Lucene.Net.Analysis;
22 using Lucene.Net.Analysis.Tokenattributes;
23 using Lucene.Net.Util;
24 
25 namespace Lucene.Net.Analysis.NGram
26 {
/// <summary>
/// Token filter that replaces every token of the wrapped stream with the
/// n-grams (contiguous character runs) of that token, for every size from
/// <c>minGram</c> up to <c>maxGram</c> inclusive.
/// </summary>
/// <remarks>
/// For each input token the grams are emitted grouped by size, smallest size
/// first, and within one size from left to right. Tokens shorter than
/// <c>minGram</c> produce no output at all.
/// </remarks>
public sealed class NGramTokenFilter : TokenFilter
{
    /// <summary>Smallest n-gram size used by the single-argument constructor.</summary>
    public static int DEFAULT_MIN_NGRAM_SIZE = 1;

    /// <summary>Largest n-gram size used by the single-argument constructor.</summary>
    public static int DEFAULT_MAX_NGRAM_SIZE = 2;

    // Inclusive bounds on the gram size; fixed at construction time.
    private readonly int minGram;
    private readonly int maxGram;

    // State for the input token currently being split.
    // curTermBuffer == null means "no token in progress, pull the next one".
    private char[] curTermBuffer;
    private int curTermLength;
    private int curGramSize;   // size of the grams currently being emitted
    private int curPos;        // start index (within the token) of the next gram
    private int tokStart;      // start offset of the input token in the original text

    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;

    /// <summary>
    /// Creates an NGramTokenFilter with the given minimum and maximum n-gram sizes.
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate; must be at least 1</param>
    /// <param name="maxGram">the largest n-gram to generate; must not be smaller than <paramref name="minGram"/></param>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="minGram"/> is less than 1, or greater than <paramref name="maxGram"/>.
    /// </exception>
    public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
        : base(input)
    {
        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }

        this.minGram = minGram;
        this.maxGram = maxGram;

        this.termAtt = AddAttribute<ITermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Creates an NGramTokenFilter using <see cref="DEFAULT_MIN_NGRAM_SIZE"/> and
    /// <see cref="DEFAULT_MAX_NGRAM_SIZE"/> as the gram size bounds.
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    public NGramTokenFilter(TokenStream input)
        : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
    {
    }

    /// <summary>
    /// Advances to the next n-gram, pulling a new token from the wrapped
    /// stream whenever the current one has been exhausted.
    /// </summary>
    /// <returns>
    /// <c>true</c> if the term and offset attributes now describe a new n-gram;
    /// <c>false</c> once the wrapped stream has no more tokens.
    /// </returns>
    public override bool IncrementToken()
    {
        while (true)
        {
            if (curTermBuffer == null)
            {
                if (!input.IncrementToken())
                {
                    return false;
                }

                // Snapshot the term: the attribute's buffer is overwritten
                // below each time a gram is written into termAtt.
                curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                curTermLength = termAtt.TermLength();
                curGramSize = minGram;
                curPos = 0;
                tokStart = offsetAtt.StartOffset;
            }

            while (curGramSize <= maxGram)
            {
                if (curPos + curGramSize <= curTermLength)
                {
                    // A gram of the current size still fits at curPos: emit it.
                    ClearAttributes();
                    termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
                    offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    curPos++;
                    return true;
                }

                // Ran off the end of the token: restart from the left with
                // the next larger gram size.
                curGramSize++;
                curPos = 0;
            }

            // Every size has been emitted for this token; fetch the next one.
            curTermBuffer = null;
        }
    }

    /// <summary>
    /// Resets the wrapped stream and discards any partially processed token.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        curTermBuffer = null;
    }
}
124 }