Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
EdgeNGramTokenizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System.IO;
19 using System.Collections;
20 
21 using Lucene.Net.Analysis;
22 using Lucene.Net.Analysis.Tokenattributes;
23 using Lucene.Net.Util;
24 
25 namespace Lucene.Net.Analysis.NGram
26 {
27 
/// <summary>
/// Tokenizes the input from an edge into n-grams of given size(s).
/// <para>
/// This <see cref="Tokenizer"/> creates n-grams from the beginning edge or the ending edge of an input token.
/// The input is consumed in full (and trimmed of leading/trailing white space) on the first call to
/// <see cref="IncrementToken"/>; this implementation does not truncate it to a fixed-size buffer.
/// </para>
/// </summary>
public sealed class EdgeNGramTokenizer : Tokenizer
{
    // NOTE(review): these defaults are public mutable statics. They are left as-is so that
    // existing callers that read (or assign) them keep compiling; nothing in this class reads them.
    public static Side DEFAULT_SIDE = Side.FRONT;
    public static int DEFAULT_MAX_GRAM_SIZE = 1;
    public static int DEFAULT_MIN_GRAM_SIZE = 1;

    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;

    // The Side enum (which edge of the input the n-grams are taken from) used to be nested
    // in this class; it is now defined externally.

    private int minGram;
    private int maxGram;
    private int gramSize;   // size of the next n-gram to emit
    private Side side;
    private bool started;   // false until the input has been read for the current pass
    private int inLen;      // length of the trimmed input
    private string inStr;   // trimmed input text

    /// <summary>
    /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="side"/> is not a defined value, <paramref name="minGram"/> is less than 1,
    /// or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>
    /// </exception>
    public EdgeNGramTokenizer(TextReader input, Side side, int minGram, int maxGram)
        : base(input)
    {
        Init(side, minGram, maxGram);
    }

    /// <summary>
    /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="source"><see cref="AttributeSource"/> to use</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="side"/> is not a defined value, <paramref name="minGram"/> is less than 1,
    /// or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>
    /// </exception>
    public EdgeNGramTokenizer(AttributeSource source, TextReader input, Side side, int minGram, int maxGram)
        : base(source, input)
    {
        Init(side, minGram, maxGram);
    }

    /// <summary>
    /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="factory"><see cref="AttributeSource.AttributeFactory"/> to use</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="side"/> is not a defined value, <paramref name="minGram"/> is less than 1,
    /// or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>
    /// </exception>
    public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
        : base(factory, input)
    {
        Init(side, minGram, maxGram);
    }

    /// <summary>
    /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public EdgeNGramTokenizer(TextReader input, string sideLabel, int minGram, int maxGram)
        : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
    {
    }

    /// <summary>
    /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="source"><see cref="AttributeSource"/> to use</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public EdgeNGramTokenizer(AttributeSource source, TextReader input, string sideLabel, int minGram, int maxGram)
        : this(source, input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
    {
    }

    /// <summary>
    /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="factory"><see cref="AttributeSource.AttributeFactory"/> to use</param>
    /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
    /// <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
        : this(factory, input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
    {
    }

    /// <summary>
    /// Validates the constructor arguments, stores them and registers the term and offset attributes.
    /// Shared by all constructors.
    /// </summary>
    private void Init(Side side, int minGram, int maxGram)
    {
        // Side is an enum, so a comparison against null can never be true. Check that the value
        // is one of the declared members instead; otherwise an undefined value would fall through
        // and be handled as "not FRONT" when the grams are cut.
        if (!System.Enum.IsDefined(typeof(Side), side))
        {
            throw new System.ArgumentException("sideLabel must be either front or back");
        }

        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }

        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }

        this.minGram = minGram;
        this.maxGram = maxGram;
        this.side = side;

        this.termAtt = AddAttribute<ITermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Advances to the next n-gram, filling the term and offset attributes.
    /// </summary>
    /// <returns>
    /// <c>true</c> if an n-gram was produced; <c>false</c> once the gram size exceeds either the
    /// (trimmed) input length or <c>maxGram</c>.
    /// </returns>
    public override bool IncrementToken()
    {
        ClearAttributes();

        // On the first call of a pass, read the whole input and start with the smallest gram.
        if (!started)
        {
            started = true;
            inStr = input.ReadToEnd().Trim(); // remove any leading or trailing spaces
            inLen = inStr.Length;
            gramSize = minGram;
        }

        // Stop when the input is too short for the next gram, or the size range is exhausted.
        if (gramSize > inLen || gramSize > maxGram)
        {
            return false;
        }

        // Grab gramSize chars from the front or the back of the trimmed input.
        int start = side == Side.FRONT ? 0 : inLen - gramSize;
        int end = start + gramSize;
        termAtt.SetTermBuffer(inStr, start, gramSize);
        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
        gramSize++;
        return true;
    }

    /// <summary>
    /// Sets the final offset to the length of the (trimmed) input, passed through
    /// <c>CorrectOffset</c> in the same way as the token offsets set by <see cref="IncrementToken"/>.
    /// </summary>
    public override void End()
    {
        int finalOffset = CorrectOffset(inLen);
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Switches to a new input and restarts tokenization from the smallest gram size.
    /// </summary>
    /// <param name="input">the new <see cref="TextReader"/> to tokenize</param>
    public override void Reset(TextReader input)
    {
        base.Reset(input);
        Reset();
    }

    /// <summary>
    /// Marks the tokenizer as not started, so the next <see cref="IncrementToken"/> call re-reads the input.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        started = false;
    }
}
225 }