Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
NGramTokenizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System.IO;
19 using System.Collections;
20 
21 using Lucene.Net.Analysis;
22 using Lucene.Net.Analysis.Tokenattributes;
23 using Lucene.Net.Util;
24 
namespace Lucene.Net.Analysis.NGram
{

    /// <summary>
    /// Tokenizes the input into n-grams of the given size(s).
    /// </summary>
    /// <remarks>
    /// The complete input is read into memory on the first call to
    /// <see cref="IncrementToken"/>. Grams are emitted grouped by size: every
    /// gram of the minimum size in position order, then every gram of the next
    /// size, and so on up to the maximum size.
    /// </remarks>
    public sealed class NGramTokenizer : Tokenizer
    {
        /// <summary>Minimum gram size used by the constructor that takes only a reader.</summary>
        public static int DEFAULT_MIN_NGRAM_SIZE = 1;

        /// <summary>Maximum gram size used by the constructor that takes only a reader.</summary>
        public static int DEFAULT_MAX_NGRAM_SIZE = 2;

        private int minGram, maxGram;   // inclusive bounds on the gram length
        private int gramSize;           // length of the grams currently being emitted
        private int pos = 0;            // start index of the next gram inside inStr
        private int inLen;              // length of inStr
        private string inStr;           // entire input, buffered on first use
        private bool started = false;   // true once the input has been buffered

        private ITermAttribute termAtt;
        private IOffsetAttribute offsetAtt;

        /// <summary>
        /// Creates NGramTokenizer with given min and max n-grams.
        /// </summary>
        /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="minGram"/> is less than 1 or greater than <paramref name="maxGram"/>.
        /// </exception>
        public NGramTokenizer(TextReader input, int minGram, int maxGram)
            : base(input)
        {
            init(minGram, maxGram);
        }

        /// <summary>
        /// Creates NGramTokenizer with given min and max n-grams.
        /// </summary>
        /// <param name="source"><see cref="AttributeSource"/> to use</param>
        /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="minGram"/> is less than 1 or greater than <paramref name="maxGram"/>.
        /// </exception>
        public NGramTokenizer(AttributeSource source, TextReader input, int minGram, int maxGram)
            : base(source, input)
        {
            init(minGram, maxGram);
        }

        /// <summary>
        /// Creates NGramTokenizer with given min and max n-grams.
        /// </summary>
        /// <param name="factory"><see cref="AttributeSource.AttributeFactory"/> to use</param>
        /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="minGram"/> is less than 1 or greater than <paramref name="maxGram"/>.
        /// </exception>
        public NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
            : base(factory, input)
        {
            init(minGram, maxGram);
        }

        /// <summary>
        /// Creates NGramTokenizer with default min and max n-grams
        /// (<see cref="DEFAULT_MIN_NGRAM_SIZE"/> and <see cref="DEFAULT_MAX_NGRAM_SIZE"/>).
        /// </summary>
        /// <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param>
        public NGramTokenizer(TextReader input)
            : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
        {

        }

        /// <summary>
        /// Shared constructor logic: validates the gram bounds, stores them and
        /// registers the term and offset attributes this tokenizer fills in.
        /// </summary>
        private void init(int minGram, int maxGram)
        {
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;

            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        /// <summary>
        /// Advances to the next n-gram, storing its text in the term attribute
        /// and its (corrected) start/end offsets in the offset attribute.
        /// </summary>
        /// <returns>
        /// <c>true</c> if a gram was produced; <c>false</c> once all grams of
        /// every size from minGram to maxGram have been emitted, or when the
        /// input is shorter than the current gram size.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            if (!started)
            {
                started = true;
                gramSize = minGram;
                // Buffer the whole input; grams are sliced out of this string as-is
                // (no trimming is applied).
                inStr = input.ReadToEnd();
                inLen = inStr.Length;
            }

            // Stay exhausted: without this guard a call made after the stream
            // has already returned false would find pos == 0 with
            // gramSize == maxGram + 1 and emit a gram larger than maxGram.
            if (gramSize > maxGram)
            {
                return false;
            }

            if (pos + gramSize > inLen)
            {
                // The current size has run off the end of the string:
                // restart at the beginning with the next larger size.
                pos = 0;
                gramSize++;
                if (gramSize > maxGram || gramSize > inLen)
                {
                    // Either every size was emitted, or the input is too short
                    // to hold even one gram of the new size.
                    return false;
                }
            }

            int oldPos = pos;
            pos++;
            termAtt.SetTermBuffer(inStr, oldPos, gramSize);
            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
            return true;
        }

        /// <summary>
        /// Sets the final offset to the end of the buffered input. The value is
        /// passed through <c>CorrectOffset</c> so that it is expressed in the
        /// same coordinates as the token offsets set by <see cref="IncrementToken"/>.
        /// </summary>
        public override void End()
        {
            // set final offset
            int finalOffset = CorrectOffset(inLen);
            this.offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>
        /// Switches this tokenizer to a new reader and rewinds its state so the
        /// next call to <see cref="IncrementToken"/> buffers the new input.
        /// </summary>
        /// <param name="input"><see cref="TextReader"/> holding the new input to be tokenized</param>
        public override void Reset(TextReader input)
        {
            base.Reset(input);
            Reset();
        }

        /// <summary>
        /// Rewinds the tokenizer state. The input is read again on the next call
        /// to <see cref="IncrementToken"/>, which also restores the gram size
        /// to the minimum.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            started = false;
            pos = 0;
        }
    }
}