Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
ShingleMatrixFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using System.Text;
22 using Lucene.Net.Analysis.Miscellaneous;
23 using Lucene.Net.Analysis.Shingle.Codec;
24 using Lucene.Net.Analysis.Shingle.Matrix;
25 using Lucene.Net.Analysis.Tokenattributes;
26 using Lucene.Net.Support;
27 
28 namespace Lucene.Net.Analysis.Shingle
29 {
105  public sealed class ShingleMatrixFilter : TokenStream
106  {
107  public static Char DefaultSpacerCharacter = '_';
108  public static TokenSettingsCodec DefaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
110 
111  private readonly IFlagsAttribute _flagsAtt;
112  private readonly IFlagsAttribute _inFlagsAtt;
113 
114  private readonly IOffsetAttribute _inOffsetAtt;
115  private readonly IPayloadAttribute _inPayloadAtt;
116  private readonly IPositionIncrementAttribute _inPosIncrAtt;
117  private readonly ITermAttribute _inTermAtt;
118  private readonly ITypeAttribute _inTypeAtt;
119  private readonly TokenStream _input;
120  private readonly IOffsetAttribute _offsetAtt;
121  private readonly IPayloadAttribute _payloadAtt;
122  private readonly IPositionIncrementAttribute _posIncrAtt;
123  private readonly Token _requestNextToken = new Token();
124  private readonly Token _reusableToken = new Token();
125  private readonly TokenSettingsCodec _settingsCodec;
126 
137  private readonly HashSet<EquatableList<Token>> _shinglesSeen =
138  new HashSet<EquatableList<Token>>();
139 
140  private readonly ITermAttribute _termAtt;
141  private readonly ITypeAttribute _typeAtt;
142  private List<Token> _currentPermuationTokens;
143 
144  // Index to what row a token in currentShingleTokens represents
145  private List<Row> _currentPermutationRows;
146 
147  private int _currentPermutationTokensStartOffset;
148  private int _currentShingleLength;
149  private MatrixPermutationIterator _permutations;
150  private Token _readColumnBuf;
151 
152 
166  public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
167  {
168  Matrix = matrix;
169  MinimumShingleSize = minimumShingleSize;
170  MaximumShingleSize = maximumShingleSize;
171  SpacerCharacter = spacerCharacter;
172  IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
173  _settingsCodec = settingsCodec;
174 
175  // ReSharper disable DoNotCallOverridableMethodsInConstructor
176  _termAtt = AddAttribute<ITermAttribute>();
177  _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
178  _payloadAtt = AddAttribute<IPayloadAttribute>();
179  _offsetAtt = AddAttribute<IOffsetAttribute>();
180  _typeAtt = AddAttribute<ITypeAttribute>();
181  _flagsAtt = AddAttribute<IFlagsAttribute>();
182  // ReSharper restore DoNotCallOverridableMethodsInConstructor
183 
184  // set the input to be an empty token stream, we already have the data.
185  _input = new EmptyTokenStream();
186 
187  _inTermAtt = _input.AddAttribute<ITermAttribute>();
188  _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
189  _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
190  _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
191  _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
192  _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
193  }
194 
205  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize)
206  : this(input, minimumShingleSize, maximumShingleSize, DefaultSpacerCharacter) { }
207 
217  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter)
218  : this( input, minimumShingleSize, maximumShingleSize, spacerCharacter, IgnoringSinglePrefixOrSuffixShingleByDefault) { }
219 
230  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle)
231  : this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, DefaultSettingsCodec) { }
232 
242  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
243  {
244  _input = input;
245  MinimumShingleSize = minimumShingleSize;
246  MaximumShingleSize = maximumShingleSize;
247  SpacerCharacter = spacerCharacter;
248  IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
249  _settingsCodec = settingsCodec;
250 
251  // ReSharper disable DoNotCallOverridableMethodsInConstructor
252  _termAtt = AddAttribute<ITermAttribute>();
253  _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
254  _payloadAtt = AddAttribute<IPayloadAttribute>();
255  _offsetAtt = AddAttribute<IOffsetAttribute>();
256  _typeAtt = AddAttribute<ITypeAttribute>();
257  _flagsAtt = AddAttribute<IFlagsAttribute>();
258  // ReSharper restore DoNotCallOverridableMethodsInConstructor
259 
260  _inTermAtt = input.AddAttribute<ITermAttribute>();
261  _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
262  _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
263  _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
264  _inTypeAtt = input.AddAttribute<ITypeAttribute>();
265  _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
266  }
267 
268  public int MinimumShingleSize { get; set; }
269 
270  public int MaximumShingleSize { get; set; }
271 
272  public Matrix.Matrix Matrix { get; set; }
273 
274  public Char? SpacerCharacter { get; set; }
275 
276  public bool IsIgnoringSinglePrefixOrSuffixShingle { get; set; }
277 
278  public override void Reset()
279  {
280  _permutations = null;
281  _shinglesSeen.Clear();
282  _input.Reset();
283  }
284 
285  protected override void Dispose(bool disposing)
286  {
287  // Do nothing
288  }
289 
290  public override sealed bool IncrementToken()
291  {
292  if (Matrix == null)
293  {
294  Matrix = new Matrix.Matrix();
295 
296  // fill matrix with maximumShingleSize columns
297  while (Matrix.Columns.Count < MaximumShingleSize && ReadColumn())
298  {
299  // this loop looks ugly
300  }
301  }
302 
303  // This loop exists in order to avoid recursive calls to the next method
304  // as the complexity of a large matrix
305  // then would require a multi gigabyte sized stack.
306  Token token;
307  do
308  {
309  token = ProduceNextToken(_reusableToken);
310  } while (token == _requestNextToken);
311 
312  if (token == null)
313  return false;
314 
315  ClearAttributes();
316 
317  _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
318  _posIncrAtt.PositionIncrement = token.PositionIncrement;
319  _flagsAtt.Flags = token.Flags;
320  _offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
321  _typeAtt.Type = token.Type;
322  _payloadAtt.Payload = token.Payload;
323 
324  return true;
325  }
326 
327  private Token GetNextInputToken(Token token)
328  {
329  if (!_input.IncrementToken()) return null;
330 
331  token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
332  token.PositionIncrement = _inPosIncrAtt.PositionIncrement;
333  token.Flags = _inFlagsAtt.Flags;
334  token.SetOffset(_inOffsetAtt.StartOffset, _inOffsetAtt.EndOffset);
335  token.Type = _inTypeAtt.Type;
336  token.Payload = _inPayloadAtt.Payload;
337  return token;
338  }
339 
340  private Token GetNextToken(Token token)
341  {
342  if (!this.IncrementToken()) return null;
343  token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
344  token.PositionIncrement = _posIncrAtt.PositionIncrement;
345  token.Flags = _flagsAtt.Flags;
346  token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset);
347  token.Type = _typeAtt.Type;
348  token.Payload = _payloadAtt.Payload;
349  return token;
350  }
351 
360  private Token ProduceNextToken(Token reusableToken)
361  {
362  if (_currentPermuationTokens != null)
363  {
364  _currentShingleLength++;
365 
366  if (_currentShingleLength + _currentPermutationTokensStartOffset <= _currentPermuationTokens.Count
367  && _currentShingleLength <= MaximumShingleSize)
368  {
369  // it is possible to create at least one more shingle of the current matrix permutation
370 
371  if (IsIgnoringSinglePrefixOrSuffixShingle &&
372  _currentShingleLength == 1 &&
373  (_currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsFirst || _currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsLast))
374  {
375  return GetNextToken(reusableToken);
376  }
377 
378  var termLength = 0;
379 
380  var shingle = new EquatableList<Token>();
381 
382  for (int i = 0; i < _currentShingleLength; i++)
383  {
384  var shingleToken = _currentPermuationTokens[i + _currentPermutationTokensStartOffset];
385  termLength += shingleToken.TermLength();
386  shingle.Add(shingleToken);
387  }
388  if (SpacerCharacter != null)
389  termLength += _currentShingleLength - 1;
390 
391  // only produce shingles that not already has been created
392  if (!_shinglesSeen.Add(shingle))
393  return _requestNextToken;
394 
395  // shingle token factory
396  var sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. ;)
397  foreach (var shingleToken in shingle)
398  {
399  if (SpacerCharacter != null && sb.Length > 0)
400  sb.Append(SpacerCharacter);
401 
402  sb.Append(shingleToken.TermBuffer(), 0, shingleToken.TermLength());
403  }
404 
405  reusableToken.SetTermBuffer(sb.ToString());
406  UpdateToken(reusableToken, shingle, _currentPermutationTokensStartOffset, _currentPermutationRows,
407  _currentPermuationTokens);
408 
409  return reusableToken;
410  }
411 
412  // it is NOT possible to create one more shingles of the current matrix permutation
413  if (_currentPermutationTokensStartOffset < _currentPermuationTokens.Count - 1)
414  {
415  // reset shingle size and move one step to the right in the current tokens permutation
416  _currentPermutationTokensStartOffset++;
417  _currentShingleLength = MinimumShingleSize - 1;
418  return _requestNextToken;
419  }
420 
421 
422  // todo does this ever occur?
423  if (_permutations == null)
424  return null;
425 
426  if (!_permutations.HasNext())
427  {
428  // load more data (if available) to the matrix
429 
430  // don't really care, we just read it.
431  if (_input != null)
432  ReadColumn();
433 
434  // get rid of resources
435 
436  // delete the first column in the matrix
437  var deletedColumn = Matrix.Columns[0];
438  Matrix.Columns.RemoveAt(0);
439 
440  // remove all shingles seen that include any of the tokens from the deleted column.
441  var deletedColumnTokens = deletedColumn.Rows.SelectMany(row => row.Tokens).ToList();
442 
443  // I'm a little concerned about this part of the code, because the unit tests currently
444  // don't cover this scenario. (I put a break point here, and ran the unit tests in debug mode
445  // and this code block was never hit... I also changed it significatly from the Java version
446  // to use RemoveWhere and LINQ.
447  //
448  // TODO: Write a unit test to cover this and make sure this is a good port! -thoward
449 
450  // linq version
451  _shinglesSeen.RemoveWhere(
452  shingle => (shingle.Find(deletedColumnTokens.Contains) != default(Token)));
453 
455  //var shinglesSeenIterator = _shinglesSeen.ToList();
456  //foreach (var shingle in shinglesSeenIterator)
457  //{
458  // foreach (var deletedColumnToken in deletedColumnTokens)
459  // {
460  // if (shingle.Contains(deletedColumnToken))
461  // {
462  // _shinglesSeen.Remove(shingle);
463  // break;
464  // }
465  // }
466  //}
467 
468  // exhausted
469  if (Matrix.Columns.Count < MinimumShingleSize)
470  return null;
471 
472  // create permutations of the matrix it now looks
473  _permutations = Matrix.PermutationIterator();
474  }
475 
476  NextTokensPermutation();
477  return _requestNextToken;
478  }
479 
480  if (_permutations == null)
481  _permutations = Matrix.PermutationIterator();
482 
483  if (!_permutations.HasNext())
484  return null;
485 
486  NextTokensPermutation();
487 
488  return _requestNextToken;
489  }
490 
497  private void NextTokensPermutation()
498  {
499  var rowsPermutation = _permutations.Next();
500  var currentPermutationRows = new List<Row>();
501  var currentPermuationTokens = new List<Token>();
502 
503  foreach (var row in rowsPermutation)
504  {
505  foreach (var token in row.Tokens)
506  {
507  currentPermuationTokens.Add(token);
508  currentPermutationRows.Add(row);
509  }
510  }
511  _currentPermuationTokens = currentPermuationTokens;
512  _currentPermutationRows = currentPermutationRows;
513 
514  _currentPermutationTokensStartOffset = 0;
515  _currentShingleLength = MinimumShingleSize - 1;
516  }
517 
528  public void UpdateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
529  {
530  token.Type = typeof(ShingleMatrixFilter).Name;
531  token.Flags = 0;
532  token.PositionIncrement = 1;
533  token.StartOffset = (shingle[0]).StartOffset;
534  token.EndOffset = shingle[shingle.Count - 1].EndOffset;
535 
536  _settingsCodec.SetWeight(
537  token,
538  CalculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)
539  );
540  }
541 
557  public float CalculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
558  {
559  var weights = new double[shingle.Count];
560 
561  double total = 0f;
562  double top = 0d;
563 
564  for (int i = 0; i < weights.Length; i++)
565  {
566  weights[i] = _settingsCodec.GetWeight(shingle[i]);
567 
568  double tmp = weights[i];
569 
570  if (tmp > top)
571  top = tmp;
572 
573  total += tmp;
574  }
575 
576  double factor = 1d/Math.Sqrt(total);
577 
578  double weight = weights.Sum(partWeight => partWeight*factor);
579 
580  return (float) weight;
581  }
582 
589  private bool ReadColumn()
590  {
591  Token token;
592 
593  if (_readColumnBuf != null)
594  {
595  token = _readColumnBuf;
596  _readColumnBuf = null;
597  }
598  else
599  {
600  token = GetNextInputToken(new Token());
601  }
602 
603  if (token == null)
604  return false;
605 
606  var currentReaderColumn = new Column(Matrix);
607  var currentReaderRow = new Row(currentReaderColumn);
608 
609  currentReaderRow.Tokens.AddLast(token);
610 
611  TokenPositioner tokenPositioner;
612  while ((_readColumnBuf = GetNextInputToken(new Token())) != null &&
613  (tokenPositioner = _settingsCodec.GetTokenPositioner(_readColumnBuf)) != TokenPositioner.NewColumn)
614  {
615  if (tokenPositioner == TokenPositioner.SameRow)
616  {
617  currentReaderRow.Tokens.AddLast(_readColumnBuf);
618  }
619  else
620  {
621  currentReaderRow = new Row(currentReaderColumn);
622  currentReaderRow.Tokens.AddLast(_readColumnBuf);
623  }
624  _readColumnBuf = null;
625  }
626 
627  if (_readColumnBuf == null)
628  {
629  _readColumnBuf = GetNextInputToken(new Token());
630 
631  if (_readColumnBuf == null)
632  currentReaderColumn.IsLast = true;
633  }
634 
635  return true;
636  }
637  }
638 }