19 using System.Collections.Generic;
22 using Lucene.Net.Analysis.Miscellaneous;
23 using Lucene.Net.Analysis.Shingle.Codec;
24 using Lucene.Net.Analysis.Shingle.Matrix;
25 using Lucene.Net.Analysis.Tokenattributes;
26 using Lucene.Net.Support;
28 namespace Lucene.Net.Analysis.Shingle
// Spacer used when a caller does not supply one: the constructor overload that
// takes no spacer argument forwards this value.
// NOTE(review): public, static and not readonly, so any code can reassign it at runtime — confirm intended.
107 public static Char DefaultSpacerCharacter =
'_';
// Sentinel instance: ProduceNextToken returns it to ask for another call, and
// IncrementToken keeps looping while the produced token is this exact instance.
123 private readonly
Token _requestNextToken =
new Token();
// Scratch token that IncrementToken hands to ProduceNextToken on every call.
124 private readonly
Token _reusableToken =
new Token();
// Shingles already produced; ProduceNextToken returns the sentinel instead of
// emitting when Add() reports the shingle is already in this set. Cleared by Reset().
137 private readonly HashSet<EquatableList<Token>> _shinglesSeen =
138 new HashSet<EquatableList<Token>>();
// Tokens of the current permutation, flattened row by row (filled by NextTokensPermutation).
142 private List<Token> _currentPermuationTokens;
// Parallel to _currentPermuationTokens: entry i is the row that token i came from.
145 private List<Row> _currentPermutationRows;
// Index into _currentPermuationTokens at which the current shingle starts.
147 private int _currentPermutationTokensStartOffset;
// Number of tokens in the shingle being built; reset to MinimumShingleSize - 1 and
// incremented by ProduceNextToken before each attempt.
148 private int _currentShingleLength;
// One-token look-ahead: ReadColumn leaves the token it read past the end of a column
// here and consumes it at the start of its next call.
150 private Token _readColumnBuf;
// Initialization sequence — presumably a constructor body; TODO confirm against the enclosing header.
// Stores the shingle configuration in the public properties and keeps the settings codec.
// NOTE(review): the same eleven statements appear a second time further down in this file;
// a shared private helper would remove the duplication.
169 MinimumShingleSize = minimumShingleSize;
170 MaximumShingleSize = maximumShingleSize;
171 SpacerCharacter = spacerCharacter;
172 IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
173 _settingsCodec = settingsCodec;
// Obtain (via AddAttribute) the attribute instances this filter works with;
// IncrementToken assigns flags, type and payload through them.
176 _termAtt = AddAttribute<ITermAttribute>();
177 _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
178 _payloadAtt = AddAttribute<IPayloadAttribute>();
179 _offsetAtt = AddAttribute<IOffsetAttribute>();
180 _typeAtt = AddAttribute<ITypeAttribute>();
181 _flagsAtt = AddAttribute<IFlagsAttribute>();
// Overload chain: each initializer forwards to the next, wider overload and fills in one default.
// Fills in DefaultSpacerCharacter as the spacer.
206 : this(input, minimumShingleSize, maximumShingleSize, DefaultSpacerCharacter) { }
// Fills in IgnoringSinglePrefixOrSuffixShingleByDefault.
218 : this( input, minimumShingleSize, maximumShingleSize, spacerCharacter, IgnoringSinglePrefixOrSuffixShingleByDefault) { }
// Fills in DefaultSettingsCodec.
231 : this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, DefaultSettingsCodec) { }
// Initialization sequence — presumably the body of the widest constructor overload; TODO confirm.
// Stores the shingle configuration in the public properties and keeps the settings codec.
// NOTE(review): statement-for-statement identical to the earlier initialization sequence in this file.
245 MinimumShingleSize = minimumShingleSize;
246 MaximumShingleSize = maximumShingleSize;
247 SpacerCharacter = spacerCharacter;
248 IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
249 _settingsCodec = settingsCodec;
// Obtain (via AddAttribute) the attribute instances this filter works with.
252 _termAtt = AddAttribute<ITermAttribute>();
253 _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
254 _payloadAtt = AddAttribute<IPayloadAttribute>();
255 _offsetAtt = AddAttribute<IOffsetAttribute>();
256 _typeAtt = AddAttribute<ITypeAttribute>();
257 _flagsAtt = AddAttribute<IFlagsAttribute>();
// Smallest shingle length: ProduceNextToken resets its length counter to
// MinimumShingleSize - 1 and increments it before building each shingle.
268 public int MinimumShingleSize {
get;
set; }
// Largest shingle length; also the number of matrix columns IncrementToken
// tries to have read before producing tokens.
270 public int MaximumShingleSize {
get;
set; }
// Optional separator. When non-null it is appended between the token terms of a
// shingle and counted in the term length; when null, terms are concatenated directly.
274 public Char? SpacerCharacter {
get;
set; }
// When true, a one-token shingle whose row belongs to the first or last column
// is skipped by ProduceNextToken instead of being emitted.
276 public bool IsIgnoringSinglePrefixOrSuffixShingle {
get;
set; }
// Clears iteration state: drops the current permutation source (_permutations)
// and forgets every shingle recorded in _shinglesSeen.
278 public override void Reset()
280 _permutations = null;
281 _shinglesSeen.Clear();
// Override of the base class's protected Dispose(bool) hook.
285 protected override void Dispose(
bool disposing)
// Advances the filter to its next output token. Sealed, so subclasses cannot override it.
290 public override sealed
bool IncrementToken()
// Read ahead: keep pulling columns into the matrix until it holds
// MaximumShingleSize columns or ReadColumn() reports false.
297 while (
Matrix.
Columns.Count < MaximumShingleSize && ReadColumn())
// Call ProduceNextToken again for as long as it hands back the
// _requestNextToken sentinel (compared by reference).
309 token = ProduceNextToken(_reusableToken);
310 }
while (token == _requestNextToken);
// Publish the produced token's flags, type and payload through the output attributes.
319 _flagsAtt.Flags = token.
Flags;
321 _typeAtt.Type = token.
Type;
322 _payloadAtt.Payload = token.
Payload;
// Presumably the body of GetNextInputToken(Token) — TODO confirm against the enclosing header.
// Advances the wrapped input stream; yields null once the input has no more tokens.
329 if (!_input.IncrementToken())
return null;
// Copy the input attributes' current values (term text, flags, offsets, type, payload) into the token.
331 token.
SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
333 token.
Flags = _inFlagsAtt.Flags;
334 token.
SetOffset(_inOffsetAtt.StartOffset, _inOffsetAtt.EndOffset);
335 token.
Type = _inTypeAtt.Type;
336 token.
Payload = _inPayloadAtt.Payload;
// Presumably the body of GetNextToken(Token) — TODO confirm against the enclosing header.
// Advances this filter itself (not the wrapped input); yields null when it is exhausted.
342 if (!this.IncrementToken())
return null;
// Copy this filter's own output attributes (term text, flags, offsets, type, payload) into the token.
343 token.
SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
345 token.
Flags = _flagsAtt.Flags;
346 token.
SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset);
347 token.
Type = _typeAtt.Type;
348 token.
Payload = _payloadAtt.Payload;
// Produces the next shingle into reusableToken, or returns the _requestNextToken
// sentinel when the caller should simply call again (duplicate shingle, start
// offset advanced, or a new permutation was just loaded).
360 private Token ProduceNextToken(
Token reusableToken)
// A permutation is currently loaded: try the next longer shingle at the current start offset.
362 if (_currentPermuationTokens != null)
364 _currentShingleLength++;
// The shingle must fit inside the permutation's token list and respect MaximumShingleSize.
366 if (_currentShingleLength + _currentPermutationTokensStartOffset <= _currentPermuationTokens.Count
367 && _currentShingleLength <= MaximumShingleSize)
// Optionally skip a one-token shingle whose row sits in the first or last column.
371 if (IsIgnoringSinglePrefixOrSuffixShingle &&
372 _currentShingleLength == 1 &&
373 (_currentPermutationRows[_currentPermutationTokensStartOffset].
Column.
IsFirst || _currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsLast))
375 return GetNextToken(reusableToken);
// Gather the shingle's tokens and add up their term lengths.
380 var shingle =
new EquatableList<Token>();
382 for (
int i = 0; i < _currentShingleLength; i++)
384 var shingleToken = _currentPermuationTokens[i + _currentPermutationTokensStartOffset];
385 termLength += shingleToken.TermLength();
386 shingle.Add(shingleToken);
// One spacer goes between each pair of adjacent tokens.
388 if (SpacerCharacter != null)
389 termLength += _currentShingleLength - 1;
// Add() returns false when this shingle was already recorded: do not emit it twice.
392 if (!_shinglesSeen.Add(shingle))
393 return _requestNextToken;
// Build the shingle text: token terms joined by the spacer, if one is configured.
396 var sb =
new StringBuilder(termLength + 10);
397 foreach (var shingleToken
in shingle)
399 if (SpacerCharacter != null && sb.Length > 0)
400 sb.Append(SpacerCharacter);
402 sb.Append(shingleToken.TermBuffer(), 0, shingleToken.TermLength());
405 reusableToken.SetTermBuffer(sb.ToString());
406 UpdateToken(reusableToken, shingle, _currentPermutationTokensStartOffset, _currentPermutationRows,
407 _currentPermuationTokens);
409 return reusableToken;
// No further shingle fits at this start offset: move the start one token forward,
// restart the length counter, and ask to be called again.
413 if (_currentPermutationTokensStartOffset < _currentPermuationTokens.Count - 1)
416 _currentPermutationTokensStartOffset++;
417 _currentShingleLength = MinimumShingleSize - 1;
418 return _requestNextToken;
423 if (_permutations == null)
426 if (!_permutations.HasNext())
// Collect every token held by the deleted column's rows, then forget each recorded
// shingle that contains at least one of them (Find returns null when none matches).
441 var deletedColumnTokens = deletedColumn.Rows.SelectMany(row => row.Tokens).ToList();
451 _shinglesSeen.RemoveWhere(
452 shingle => (shingle.Find(deletedColumnTokens.Contains) !=
default(
Token)));
// Load the next permutation and ask to be called again.
476 NextTokensPermutation();
477 return _requestNextToken;
480 if (_permutations == null)
483 if (!_permutations.HasNext())
486 NextTokensPermutation();
488 return _requestNextToken;
// Loads the next rows permutation from _permutations and flattens it into two
// parallel lists: the tokens in order, and for each token the row it came from.
// Then restarts shingle iteration at offset 0.
497 private void NextTokensPermutation()
499 var rowsPermutation = _permutations.Next();
500 var currentPermutationRows =
new List<Row>();
501 var currentPermuationTokens =
new List<Token>();
503 foreach (var row
in rowsPermutation)
505 foreach (var token
in row.Tokens)
// The row is added once per token so both lists stay index-aligned.
507 currentPermuationTokens.Add(token);
508 currentPermutationRows.Add(row);
511 _currentPermuationTokens = currentPermuationTokens;
512 _currentPermutationRows = currentPermutationRows;
// Length starts one below the minimum because ProduceNextToken increments it before use.
514 _currentPermutationTokensStartOffset = 0;
515 _currentShingleLength = MinimumShingleSize - 1;
// Finalizes a freshly built shingle token: sets its end offset from the last token
// in the shingle and passes the computed shingle weight to the settings codec.
// NOTE(review): shingle[shingle.Count - 1] throws on an empty shingle — confirm callers never pass one.
528 public void UpdateToken(
Token token, List<Token> shingle,
int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
534 token.
EndOffset = shingle[shingle.Count - 1].EndOffset;
536 _settingsCodec.SetWeight(
538 CalculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)
// Computes a single weight for a shingle from the per-token weights that the
// settings codec reports: each token weight is multiplied by 1/sqrt(total) and the
// products are summed, then narrowed to float.
557 public float CalculateShingleWeight(
Token shingleToken, List<Token> shingle,
int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
559 var weights =
new double[shingle.Count];
// Fetch each part's weight from the codec.
564 for (
int i = 0; i < weights.Length; i++)
566 weights[i] = _settingsCodec.GetWeight(shingle[i]);
568 double tmp = weights[i];
// NOTE(review): if total can be 0 this yields Infinity (and NaN after the sum) — confirm how total is accumulated.
576 double factor = 1d/Math.Sqrt(total);
578 double weight = weights.Sum(partWeight => partWeight*factor);
580 return (
float) weight;
// Reads the tokens of one column from the input into rows of that column.
// IncrementToken calls this repeatedly while it returns true.
589 private bool ReadColumn()
// Start from the look-ahead token left by the previous call if there is one;
// the GetNextInputToken call below is presumably the fallback when there is none — TODO confirm.
593 if (_readColumnBuf != null)
595 token = _readColumnBuf;
596 _readColumnBuf = null;
600 token = GetNextInputToken(
new Token());
// The first token opens the column's first row.
607 var currentReaderRow =
new Row(currentReaderColumn);
609 currentReaderRow.Tokens.AddLast(token);
// Keep consuming input until it ends or the codec says the token starts a new column;
// in the latter case the token stays in _readColumnBuf for the next call.
611 TokenPositioner tokenPositioner;
612 while ((_readColumnBuf = GetNextInputToken(
new Token())) != null &&
613 (tokenPositioner = _settingsCodec.GetTokenPositioner(_readColumnBuf)) != TokenPositioner.NewColumn)
// SameRow appends to the current row; the other case opens a new row in the same column.
615 if (tokenPositioner == TokenPositioner.SameRow)
617 currentReaderRow.Tokens.AddLast(_readColumnBuf);
621 currentReaderRow =
new Row(currentReaderColumn);
622 currentReaderRow.Tokens.AddLast(_readColumnBuf);
// The buffered token has been placed in a row, so clear the buffer.
624 _readColumnBuf = null;
// With nothing buffered, probe the input once more; if it is exhausted,
// flag this column as the last one.
627 if (_readColumnBuf == null)
629 _readColumnBuf = GetNextInputToken(
new Token());
631 if (_readColumnBuf == null)
632 currentReaderColumn.IsLast =
true;