19 using System.Collections.Generic;
22 using Lucene.Net.Analysis.Tokenattributes;
23 using Lucene.Net.Util;
25 namespace Lucene.Net.Analysis.Shingle
// Buffered attribute states (added via CaptureState() in FillShingleBuffer) from which shingles are built.
41 private LinkedList<State> shingleBuf =
new LinkedList<State>();
// One text builder per slot; the array is (re)allocated with maxShingleSize entries in SetMaxShingleSize.
42 private StringBuilder[] shingles;
// Value that IncrementToken writes to the type attribute of each shingle token it emits.
43 private String tokenType =
"shingle";
// Term text given to the placeholder tokens that GetNextToken synthesizes while numFillerTokensToInsert > 0.
48 public static readonly
char[] FILLER_TOKEN = {
'_' };
// Size passed along by the constructor overloads that chain to this(input, DEFAULT_MAX_SHINGLE_SIZE).
54 public const int DEFAULT_MAX_SHINGLE_SIZE = 2;
// Appended between consecutive terms when FillShingleBuffer concatenates them into a shingle.
59 public const String TOKEN_SEPARATOR =
" ";
// Defaults to true; changed through SetOutputUnigrams and read by IncrementToken when it chooses
// the position increment of a shingle token.
65 private bool outputUnigrams =
true;
// Set only through SetMaxShingleSize, which rejects values below 2.
70 private int maxShingleSize;
// Validate the requested size and allocate the shingle builders before anything else.
82 SetMaxShingleSize(maxShingleSize);
// Register the term, offset, position-increment and type attributes used by the rest of this class.
// NOTE(review): each statement below ends with a second, stray ';' (an empty statement) - harmless,
// but it can be dropped.
83 this.termAtt = AddAttribute<ITermAttribute>(); ;
84 this.offsetAtt = AddAttribute<IOffsetAttribute>(); ;
85 this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); ;
86 this.typeAtt = AddAttribute<ITypeAttribute>(); ;
95 : this(input, DEFAULT_MAX_SHINGLE_SIZE)
106 : this(input, DEFAULT_MAX_SHINGLE_SIZE)
// Replace the default "shingle" token type; presumably tokenType here is this constructor's own
// argument (the constructor header is not visible in this excerpt - confirm).
108 setTokenType(tokenType);
/// <summary>
/// Sets the string that IncrementToken assigns to the type attribute of shingle tokens.
/// The value is stored as given; no validation is performed here.
/// </summary>
/// <param name="tokenType">new token type value</param>
117 public void setTokenType(String tokenType)
119 this.tokenType = tokenType;
/// <summary>
/// Stores the outputUnigrams flag (the field's initial value is true). IncrementToken reads this
/// flag when it decides the position increment of a shingle token.
/// </summary>
/// <param name="outputUnigrams">new flag value</param>
129 public void SetOutputUnigrams(
bool outputUnigrams)
131 this.outputUnigrams = outputUnigrams;
/// <summary>
/// Sets the maximum shingle size and allocates a fresh, empty StringBuilder for each slot.
/// </summary>
/// <param name="maxShingleSize">maximum shingle size; must be at least 2</param>
/// <exception cref="ArgumentException">thrown when maxShingleSize is less than 2</exception>
139 public void SetMaxShingleSize(
int maxShingleSize)
// Reject invalid sizes before any state is touched.
141 if (maxShingleSize < 2)
143 throw new ArgumentException(
"Max shingle size must be >= 2");
// Replace the builder array; any text held by the previous builders is discarded with them.
145 shingles =
new StringBuilder[maxShingleSize];
146 for (
int i = 0; i < shingles.Length; i++)
148 shingles[i] =
new StringBuilder();
150 this.maxShingleSize = maxShingleSize;
/// <summary>
/// Empties every shingle builder by setting its Length to 0; the builder instances are kept and reused.
/// </summary>
156 private void ClearShingles()
158 for (
int i = 0; i < shingles.Length; i++)
160 shingles[i].Length = 0;
// Cursor used by IncrementToken to index shingles[] and endOffsets[]; it wraps back to 0 there
// and is also zeroed by Reset.
165 private int shingleBufferPosition;
// End offsets of the buffered tokens, one entry per element of shingleBuf; rebuilt in FillShingleBuffer.
166 private int[] endOffsets;
/// <summary>
/// Produces the next output token: either the first buffered token restored unchanged, or a
/// shingle whose text and end offset come from shingles[] / endOffsets[] at the current
/// buffer position.
/// NOTE(review): several lines of this method (braces, else keywords, return statements) are
/// not visible in this excerpt; the comments below describe the visible statements only.
/// </summary>
171 public sealed
override bool IncrementToken()
// No base token pending: refill the shingle buffer, then use its first captured state as the base.
175 if (nextToken == null)
177 if (!FillShingleBuffer())
183 nextToken = shingleBuf.First.Value;
// Position 0: emit the base token itself with a position increment of 1.
187 if (shingleBufferPosition == 0)
189 RestoreState(nextToken);
190 posIncrAtt.PositionIncrement = 1;
191 shingleBufferPosition++;
// Positions that are a multiple of maxShingleSize are stepped over.
195 else if (shingleBufferPosition % this.maxShingleSize == 0)
197 shingleBufferPosition++;
// Emit a shingle: start from the base token's attributes, then overwrite type, end offset and term.
200 if (shingleBufferPosition < shingleBuf.Count)
202 RestoreState(nextToken);
203 typeAtt.Type = tokenType;
// Keep the base token's start offset; the end offset is the one recorded for this buffer position.
204 offsetAtt.SetOffset(offsetAtt.StartOffset, endOffsets[shingleBufferPosition]);
205 StringBuilder buf = shingles[shingleBufferPosition];
206 int termLength = buf.Length;
// Copy the builder's characters into the term buffer, growing the buffer first when it is too small.
207 char[] TermBuffer = termAtt.TermBuffer();
208 if (TermBuffer.Length < termLength)
209 TermBuffer = termAtt.ResizeTermBuffer(termLength);
210 buf.CopyTo(0, TermBuffer, 0, termLength);
211 termAtt.SetTermLength(termLength);
// With unigrams disabled, the shingle at position 1 (mod maxShingleSize) gets increment 1.
212 if ((!outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1)
214 posIncrAtt.PositionIncrement = 1;
// Presumably the else branch (the 'else' line is not visible here): increment 0 for the shingle.
218 posIncrAtt.PositionIncrement = 0;
220 shingleBufferPosition++;
// Every buffer position has been consumed: wrap the cursor back to 0.
221 if (shingleBufferPosition == shingleBuf.Count)
224 shingleBufferPosition = 0;
// Presumably the branch taken when the position is not below shingleBuf.Count (not visible - confirm).
231 shingleBufferPosition = 0;
// Number of FILLER_TOKEN placeholders GetNextToken still has to emit; set there to
// (position increment - 1) when an input token arrives with an increment above 1.
236 private int numFillerTokensToInsert;
// Cleared by GetNextToken just before it pulls from the input and set to true once the input
// has delivered a token; also cleared by Reset.
238 private bool hasCurrentToken;
/// <summary>
/// Makes the next token current: a synthesized filler token while fillers are pending,
/// otherwise the next token from the wrapped input.
/// NOTE(review): braces, else keywords and most return statements of this method are not
/// visible in this excerpt; the comments below describe the visible statements only.
/// </summary>
/// <returns>false when the input has no more tokens (the only return visible here)</returns>
254 private bool GetNextToken()
// Fillers pending: make sure the real token's state is saved in currentToken, or re-applied from it.
259 if (numFillerTokensToInsert > 0)
261 if (currentToken == null)
263 currentToken = CaptureState();
// Presumably the else branch of the null check above (the 'else' line is not visible - confirm).
267 RestoreState(currentToken);
269 numFillerTokensToInsert--;
// The filler is zero-width (start and end both at the start offset) and carries the FILLER_TOKEN text.
271 offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
272 termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
// A token saved earlier is re-applied to the attributes.
278 if (currentToken != null)
280 RestoreState(currentToken);
// Pull the next token from the input; hasCurrentToken stays false when the input is exhausted.
283 hasCurrentToken =
false;
287 if (!input.IncrementToken())
return false;
288 hasCurrentToken =
true;
// A position increment above 1 means positions were skipped: schedule one filler per skipped position.
290 if (posIncrAtt.PositionIncrement > 1)
292 numFillerTokensToInsert = posIncrAtt.PositionIncrement - 1;
/// <summary>
/// Tops up shingleBuf with captured token states, then rebuilds the shingle texts in shingles[]
/// and the per-token end offsets in endOffsets[].
/// NOTE(review): the 'do' line, the declaration and increment of the index 'i', the return
/// statements and several braces are not visible in this excerpt; the comments below describe
/// the visible statements only.
/// </summary>
302 private bool FillShingleBuffer()
304 bool addedToken =
false;
// Append the current attribute state; never keep more than maxShingleSize states (oldest is dropped).
312 shingleBuf.AddLast(CaptureState());
313 if (shingleBuf.Count > maxShingleSize)
315 shingleBuf.RemoveFirst();
// End of a do-while loop: keep going until the buffer holds maxShingleSize states.
323 }
while (shingleBuf.Count < maxShingleSize);
325 if (shingleBuf.Count == 0)
// Drop the oldest buffered state (the condition guarding this statement is not visible here).
336 shingleBuf.RemoveFirst();
339 if (shingleBuf.Count == 0)
// One end offset per buffered token.
346 endOffsets =
new int[shingleBuf.Count];
// NOTE(review): Array.Initialize only runs parameterless constructors of value-type elements; a
// freshly allocated int[] is already all zeros, so this call looks redundant.
348 endOffsets.Initialize();
// Walk the buffered states in order; 'i' is presumably the index of the current state (confirm).
351 for (IEnumerator<State> it = shingleBuf.GetEnumerator(); it.MoveNext(); )
353 RestoreState(it.Current);
// Append this token's term to every builder from index i upwards, separated from earlier text.
354 for (
int j = i; j < shingles.Length; j++)
356 if (shingles[j].Length != 0)
358 shingles[j].Append(TOKEN_SEPARATOR);
// NOTE(review): Take(...).ToArray() allocates a temporary array on every append;
// StringBuilder.Append(char[], int, int) would copy the same range without it.
360 shingles[j].Append(termAtt.TermBuffer().Take(termAtt.TermLength()).ToArray());
// Remember where token i ends; IncrementToken uses it as the end offset of the shingle at position i.
363 endOffsets[i] = offsetAtt.EndOffset;
/// <summary>
/// Returns the filter's bookkeeping to its initial values: buffer cursor at 0, no pending
/// filler tokens, no current token flagged.
/// NOTE(review): other statements of this method may exist on lines not visible in this excerpt.
/// </summary>
370 public override void Reset()
374 shingleBufferPosition = 0;
376 numFillerTokensToInsert = 0;
378 hasCurrentToken =
false;