Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
EdgeNGramTokenFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.IO;
20 using System.Collections;
21 
22 using Lucene.Net.Analysis;
23 using Lucene.Net.Analysis.Tokenattributes;
24 using Lucene.Net.Util;
25 
26 namespace Lucene.Net.Analysis.NGram
27 {
/// <summary>
/// Conversion helpers between <see cref="Side"/> values and their textual labels
/// ("front" / "back").
/// </summary>
public static class SideExtensions
{
    /// <summary>
    /// Gets the textual label of a <see cref="Side"/> value.
    /// </summary>
    /// <param name="theSide">the side whose label is wanted</param>
    /// <returns>"front" for <see cref="Side.FRONT"/>, "back" for <see cref="Side.BACK"/></returns>
    /// <exception cref="ArgumentException">if <paramref name="theSide"/> is neither of the two defined values</exception>
    public static string GetLabel(this Side theSide)
    {
        if (theSide == Side.FRONT)
        {
            return "front";
        }

        if (theSide == Side.BACK)
        {
            return "back";
        }

        throw new ArgumentException(string.Format("{0} is not a valid value for EdgeNGramTokenFilter.Side", theSide));
    }

    /// <summary>
    /// Looks up the <see cref="Side"/> whose label equals <paramref name="sideName"/>.
    /// The comparison is exact (case-sensitive).
    /// </summary>
    /// <param name="sideName">the label to resolve; may be null</param>
    /// <returns>
    /// the matching <see cref="Side"/>, or the undefined value <c>(Side)(-1)</c> when no label
    /// matches. No exception is thrown here; callers must check for the sentinel themselves.
    /// </returns>
    public static Side GetSide(string sideName)
    {
        // Checked in the same order as before: FRONT first, then BACK.
        Side[] knownSides = { Side.FRONT, Side.BACK };
        foreach (Side candidate in knownSides)
        {
            if (candidate.GetLabel() == sideName)
            {
                return candidate;
            }
        }

        // Sentinel for "unknown label": a value outside the defined enum members.
        return (Side)(-1);
    }
}
58 
/// <summary>
/// Identifies the edge of the input token from which n-grams are taken.
/// </summary>
public enum Side
{
    /// <summary>Take n-grams starting at the beginning of the token.</summary>
    FRONT = 0,

    /// <summary>Take n-grams ending at the end of the token.</summary>
    BACK = 1
}
67 
/// <summary>
/// Tokenizes the given token into n-grams of given size(s).
/// <para>
/// This <see cref="TokenFilter"/> creates n-grams from the beginning edge or ending edge of an input token.
/// </para>
/// </summary>
public sealed class EdgeNGramTokenFilter : TokenFilter
{
    /// <summary>Default side from which n-grams are taken.</summary>
    public static Side DEFAULT_SIDE = Side.FRONT;

    /// <summary>Default largest n-gram size.</summary>
    public static int DEFAULT_MAX_GRAM_SIZE = 1;

    /// <summary>Default smallest n-gram size.</summary>
    public static int DEFAULT_MIN_GRAM_SIZE = 1;

    // Configuration, fixed at construction time.
    private readonly int minGram;
    private readonly int maxGram;
    private readonly Side side;

    // State of the input token currently being expanded; curTermBuffer == null means
    // "no token in progress, pull the next one from the input".
    private char[] curTermBuffer;
    private int curTermLength;
    private int curGramSize;
    private int tokStart;

    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;

    /// <summary>
    /// Creates an EdgeNGramTokenFilter using <see cref="DEFAULT_SIDE"/>,
    /// <see cref="DEFAULT_MIN_GRAM_SIZE"/> and <see cref="DEFAULT_MAX_GRAM_SIZE"/>.
    /// </summary>
    /// <remarks>
    /// Private because the class is sealed (a protected member can never be reached by a
    /// subclass). It delegates to the validating constructor so the gram range is never
    /// left at its zero defaults.
    /// </remarks>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    private EdgeNGramTokenFilter(TokenStream input)
        : this(input, DEFAULT_SIDE, DEFAULT_MIN_GRAM_SIZE, DEFAULT_MAX_GRAM_SIZE)
    {
    }

    /// <summary>
    /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    /// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    /// <exception cref="System.ArgumentException">
    /// if <paramref name="side"/> is not a defined <see cref="Side"/> value, if
    /// <paramref name="minGram"/> is less than 1, or if <paramref name="minGram"/> is
    /// greater than <paramref name="maxGram"/>
    /// </exception>
    public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
        : base(input)
    {
        // Also rejects the (Side)(-1) sentinel that SideExtensions.GetSide returns for an
        // unknown label, which is how the string-based overload reports a bad sideLabel.
        if (side != Side.FRONT && side != Side.BACK)
        {
            throw new System.ArgumentException("sideLabel must be either front or back");
        }

        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }

        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }

        this.minGram = minGram;
        this.maxGram = maxGram;
        this.side = side;
        this.termAtt = AddAttribute<ITermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    /// <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param>
    /// <param name="minGram">the smallest n-gram to generate</param>
    /// <param name="maxGram">the largest n-gram to generate</param>
    /// <exception cref="System.ArgumentException">
    /// if <paramref name="sideLabel"/> does not name a <see cref="Side"/>, or if the gram
    /// range is invalid (see the <see cref="Side"/>-based overload)
    /// </exception>
    public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
        : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
    {
    }

    /// <summary>
    /// Advances to the next edge n-gram. For each input token, grams of size
    /// minGram, minGram + 1, ... are emitted until the size exceeds maxGram or the
    /// token's length; then the next input token is pulled.
    /// </summary>
    /// <returns>true if a gram was produced; false once the input stream is exhausted</returns>
    public override bool IncrementToken()
    {
        while (true)
        {
            if (curTermBuffer == null)
            {
                if (!input.IncrementToken())
                {
                    return false;
                }

                // Work on a copy of the term: termAtt is overwritten with each gram below.
                curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                curTermLength = termAtt.TermLength();
                curGramSize = minGram;
                tokStart = offsetAtt.StartOffset;
            }

            // Emit a gram only while the size is within the configured range and the
            // token is long enough to supply that many characters.
            if (curGramSize <= maxGram && curGramSize <= curTermLength)
            {
                // grab gramSize chars from front or back
                int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
                int end = start + curGramSize;
                ClearAttributes();
                offsetAtt.SetOffset(tokStart + start, tokStart + end);
                termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
                curGramSize++;
                return true;
            }

            // Finished with this token; fetch the next one on the next loop iteration.
            curTermBuffer = null;
        }
    }

    /// <summary>
    /// Resets the base filter and discards any partially expanded token.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        curTermBuffer = null;
    }
}
189 }