Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
ShingleMatrixFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using System.Text;
22 using Lucene.Net.Analysis.Miscellaneous;
23 using Lucene.Net.Analysis.Shingle.Codec;
24 using Lucene.Net.Analysis.Shingle.Matrix;
25 using Lucene.Net.Analysis.Tokenattributes;
26 using Lucene.Net.Support;
27 
28 namespace Lucene.Net.Analysis.Shingle
29 {
30  /// <summary>
31  /// <p>A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream.
32  /// In other words, it creates combinations of tokens as a single token.</p>
33  ///
34  /// <p>For example, the sentence "please divide this sentence into shingles"
35  /// might be tokenized into shingles "please divide", "divide this",
36  /// "this sentence", "sentence into", and "into shingles".</p>
37  ///
38  /// <p>Using a shingle filter at index and query time can in some instances
39  /// be used to replace phrase queries, especially them with 0 slop.</p>
40  ///
41  /// <p>Without a spacer character
42  /// it can be used to handle composition and decomposition of words
43  /// such as searching for "multi dimensional" instead of "multidimensional".
44  /// It is a rather common human problem at query time
45  /// in several languages, notably the northern Germanic branch.</p>
46  ///
47  /// <p>Shingles are amongst many things also known to solve problems
48  /// in spell checking, language detection and document clustering.</p>
49  ///
50  /// <p>This filter is backed by a three dimensional column oriented matrix
51  /// used to create permutations of the second dimension, the rows,
52  /// and leaves the third, the z-axis, for for multi token synonyms.</p>
53  ///
54  /// <p>In order to use this filter you need to define a way of positioning
55  /// the input stream tokens in the matrix. This is done using a
56  /// ShingleMatrixFilter.TokenSettingsCodec.
57  /// There are three simple implementations for demonstrational purposes,
58  /// see ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec,
59  /// ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec
60  /// and ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec.</p>
61  ///
62  /// <p>Consider this token matrix:</p>
63  /// <pre>
64  /// Token[column][row][z-axis]{
65  /// {{hello}, {greetings, and, salutations}},
66  /// {{world}, {earth}, {tellus}}
67  /// };
68  /// </pre>
69  ///
70  /// It would produce the following 2-3 gram sized shingles:
71  ///
72  /// <pre>
73  /// "hello_world"
74  /// "greetings_and"
75  /// "greetings_and_salutations"
76  /// "and_salutations"
77  /// "and_salutations_world"
78  /// "salutations_world"
79  /// "hello_earth"
80  /// "and_salutations_earth"
81  /// "salutations_earth"
82  /// "hello_tellus"
83  /// "and_salutations_tellus"
84  /// "salutations_tellus"
85  /// </pre>
86  ///
87  /// <p>This implementation can be rather heap demanding
88  /// if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns,
89  /// or if each column contains a great number of rows.</p>
90  ///
91  /// <p>The problem is that in order avoid producing duplicates
92  /// the filter needs to keep track of any shingle already produced and returned to the consumer.</p>
93  ///
94  /// <p>There is a bit of resource management to handle this
95  /// but it would of course be much better if the filter was written
96  /// so it never created the same shingle more than once in the first place.</p>
97  ///
98  /// <p>The filter also has basic support for calculating weights for the shingles
99  /// based on the weights of the tokens from the input stream, output shingle size, etc.
100  /// See CalculateShingleWeight.
101  /// <p/>
102  /// <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
103  /// the ones located in org.apache.lucene.analysis.tokenattributes.</p>
104  /// </summary>
105  public sealed class ShingleMatrixFilter : TokenStream
106  {
107  public static Char DefaultSpacerCharacter = '_';
108  public static TokenSettingsCodec DefaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
110 
111  private readonly IFlagsAttribute _flagsAtt;
112  private readonly IFlagsAttribute _inFlagsAtt;
113 
114  private readonly IOffsetAttribute _inOffsetAtt;
115  private readonly IPayloadAttribute _inPayloadAtt;
116  private readonly IPositionIncrementAttribute _inPosIncrAtt;
117  private readonly ITermAttribute _inTermAtt;
118  private readonly ITypeAttribute _inTypeAtt;
119  private readonly TokenStream _input;
120  private readonly IOffsetAttribute _offsetAtt;
121  private readonly IPayloadAttribute _payloadAtt;
122  private readonly IPositionIncrementAttribute _posIncrAtt;
123  private readonly Token _requestNextToken = new Token();
124  private readonly Token _reusableToken = new Token();
125  private readonly TokenSettingsCodec _settingsCodec;
126 
127  /// <summary>
128  /// A set containing shingles that has been the result of a call to Next(Token),
129  /// used to avoid producing the same shingle more than once.
130  ///
131  /// <p>
132  /// NOTE: The Java List implementation uses a different equality comparison scheme
133  /// than .NET's Generic List. So We have to use a custom IEqualityComparer implementation
134  /// to get the same behaviour.
135  /// </p>
136  /// </summary>
137  private readonly HashSet<EquatableList<Token>> _shinglesSeen =
138  new HashSet<EquatableList<Token>>();
139 
140  private readonly ITermAttribute _termAtt;
141  private readonly ITypeAttribute _typeAtt;
142  private List<Token> _currentPermuationTokens;
143 
144  // Index to what row a token in currentShingleTokens represents
145  private List<Row> _currentPermutationRows;
146 
147  private int _currentPermutationTokensStartOffset;
148  private int _currentShingleLength;
149  private MatrixPermutationIterator _permutations;
150  private Token _readColumnBuf;
151 
152 
153  /// <summary>
154  /// Creates a shingle filter based on a user defined matrix.
155  ///
156  /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
157  /// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
158  ///
159  /// </summary>
160  /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
161  /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
162  /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
163  /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
164  /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
165  /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
166  public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
167  {
168  Matrix = matrix;
169  MinimumShingleSize = minimumShingleSize;
170  MaximumShingleSize = maximumShingleSize;
171  SpacerCharacter = spacerCharacter;
172  IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
173  _settingsCodec = settingsCodec;
174 
175  // ReSharper disable DoNotCallOverridableMethodsInConstructor
176  _termAtt = AddAttribute<ITermAttribute>();
177  _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
178  _payloadAtt = AddAttribute<IPayloadAttribute>();
179  _offsetAtt = AddAttribute<IOffsetAttribute>();
180  _typeAtt = AddAttribute<ITypeAttribute>();
181  _flagsAtt = AddAttribute<IFlagsAttribute>();
182  // ReSharper restore DoNotCallOverridableMethodsInConstructor
183 
184  // set the input to be an empty token stream, we already have the data.
185  _input = new EmptyTokenStream();
186 
187  _inTermAtt = _input.AddAttribute<ITermAttribute>();
188  _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
189  _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
190  _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
191  _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
192  _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
193  }
194 
195  /// <summary>
196  /// Creates a shingle filter using default settings.
197  ///
198  /// See ShingleMatrixFilter.DefaultSpacerCharacter,
199  /// ShingleMatrixFilter.IgnoringSinglePrefixOrSuffixShingleByDefault,
200  /// and ShingleMatrixFilter.DefaultSettingsCodec
201  /// </summary>
202  /// <param name="input">stream from which to construct the matrix</param>
203  /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
204  /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
205  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize)
206  : this(input, minimumShingleSize, maximumShingleSize, DefaultSpacerCharacter) { }
207 
208  /// <summary>
209  /// Creates a shingle filter using default settings.
210  ///
211  /// See IgnoringSinglePrefixOrSuffixShingleByDefault, and DefaultSettingsCodec
212  /// </summary>
213  /// <param name="input">stream from which to construct the matrix</param>
214  /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
215  /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
216  /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none. </param>
217  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter)
218  : this( input, minimumShingleSize, maximumShingleSize, spacerCharacter, IgnoringSinglePrefixOrSuffixShingleByDefault) { }
219 
220  /// <summary>
221  /// Creates a shingle filter using the default <see cref="TokenSettingsCodec"/>.
222  ///
223  /// See DefaultSettingsCodec
224  /// </summary>
225  /// <param name="input">stream from which to construct the matrix</param>
226  /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
227  /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
228  /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
229  /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
230  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle)
231  : this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, DefaultSettingsCodec) { }
232 
233  /// <summary>
234  /// Creates a shingle filter with ad hoc parameter settings.
235  /// </summary>
236  /// <param name="input">stream from which to construct the matrix</param>
237  /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
238  /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
239  /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
240  /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
241  /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
242  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
243  {
244  _input = input;
245  MinimumShingleSize = minimumShingleSize;
246  MaximumShingleSize = maximumShingleSize;
247  SpacerCharacter = spacerCharacter;
248  IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
249  _settingsCodec = settingsCodec;
250 
251  // ReSharper disable DoNotCallOverridableMethodsInConstructor
252  _termAtt = AddAttribute<ITermAttribute>();
253  _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
254  _payloadAtt = AddAttribute<IPayloadAttribute>();
255  _offsetAtt = AddAttribute<IOffsetAttribute>();
256  _typeAtt = AddAttribute<ITypeAttribute>();
257  _flagsAtt = AddAttribute<IFlagsAttribute>();
258  // ReSharper restore DoNotCallOverridableMethodsInConstructor
259 
260  _inTermAtt = input.AddAttribute<ITermAttribute>();
261  _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
262  _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
263  _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
264  _inTypeAtt = input.AddAttribute<ITypeAttribute>();
265  _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
266  }
267 
268  public int MinimumShingleSize { get; set; }
269 
270  public int MaximumShingleSize { get; set; }
271 
272  public Matrix.Matrix Matrix { get; set; }
273 
274  public Char? SpacerCharacter { get; set; }
275 
276  public bool IsIgnoringSinglePrefixOrSuffixShingle { get; set; }
277 
278  public override void Reset()
279  {
280  _permutations = null;
281  _shinglesSeen.Clear();
282  _input.Reset();
283  }
284 
285  protected override void Dispose(bool disposing)
286  {
287  // Do nothing
288  }
289 
290  public override sealed bool IncrementToken()
291  {
292  if (Matrix == null)
293  {
294  Matrix = new Matrix.Matrix();
295 
296  // fill matrix with maximumShingleSize columns
297  while (Matrix.Columns.Count < MaximumShingleSize && ReadColumn())
298  {
299  // this loop looks ugly
300  }
301  }
302 
303  // This loop exists in order to avoid recursive calls to the next method
304  // as the complexity of a large matrix
305  // then would require a multi gigabyte sized stack.
306  Token token;
307  do
308  {
309  token = ProduceNextToken(_reusableToken);
310  } while (token == _requestNextToken);
311 
312  if (token == null)
313  return false;
314 
315  ClearAttributes();
316 
317  _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
318  _posIncrAtt.PositionIncrement = token.PositionIncrement;
319  _flagsAtt.Flags = token.Flags;
320  _offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
321  _typeAtt.Type = token.Type;
322  _payloadAtt.Payload = token.Payload;
323 
324  return true;
325  }
326 
327  private Token GetNextInputToken(Token token)
328  {
329  if (!_input.IncrementToken()) return null;
330 
331  token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
332  token.PositionIncrement = _inPosIncrAtt.PositionIncrement;
333  token.Flags = _inFlagsAtt.Flags;
334  token.SetOffset(_inOffsetAtt.StartOffset, _inOffsetAtt.EndOffset);
335  token.Type = _inTypeAtt.Type;
336  token.Payload = _inPayloadAtt.Payload;
337  return token;
338  }
339 
340  private Token GetNextToken(Token token)
341  {
342  if (!this.IncrementToken()) return null;
343  token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
344  token.PositionIncrement = _posIncrAtt.PositionIncrement;
345  token.Flags = _flagsAtt.Flags;
346  token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset);
347  token.Type = _typeAtt.Type;
348  token.Payload = _payloadAtt.Payload;
349  return token;
350  }
351 
352  /// <summary>
353  /// This method exists in order to avoid recursive calls to the method
354  /// as the complexity of a fairly small matrix then easily would require
355  /// a gigabyte sized stack per thread.
356  /// </summary>
357  /// <param name="reusableToken"></param>
358  /// <returns>null if exhausted, instance request_next_token if one more call is required for an answer,
359  /// or instance parameter resuableToken.</returns>
360  private Token ProduceNextToken(Token reusableToken)
361  {
362  if (_currentPermuationTokens != null)
363  {
364  _currentShingleLength++;
365 
366  if (_currentShingleLength + _currentPermutationTokensStartOffset <= _currentPermuationTokens.Count
367  && _currentShingleLength <= MaximumShingleSize)
368  {
369  // it is possible to create at least one more shingle of the current matrix permutation
370 
371  if (IsIgnoringSinglePrefixOrSuffixShingle &&
372  _currentShingleLength == 1 &&
373  (_currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsFirst || _currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsLast))
374  {
375  return GetNextToken(reusableToken);
376  }
377 
378  var termLength = 0;
379 
380  var shingle = new EquatableList<Token>();
381 
382  for (int i = 0; i < _currentShingleLength; i++)
383  {
384  var shingleToken = _currentPermuationTokens[i + _currentPermutationTokensStartOffset];
385  termLength += shingleToken.TermLength();
386  shingle.Add(shingleToken);
387  }
388  if (SpacerCharacter != null)
389  termLength += _currentShingleLength - 1;
390 
391  // only produce shingles that not already has been created
392  if (!_shinglesSeen.Add(shingle))
393  return _requestNextToken;
394 
395  // shingle token factory
396  var sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. ;)
397  foreach (var shingleToken in shingle)
398  {
399  if (SpacerCharacter != null && sb.Length > 0)
400  sb.Append(SpacerCharacter);
401 
402  sb.Append(shingleToken.TermBuffer(), 0, shingleToken.TermLength());
403  }
404 
405  reusableToken.SetTermBuffer(sb.ToString());
406  UpdateToken(reusableToken, shingle, _currentPermutationTokensStartOffset, _currentPermutationRows,
407  _currentPermuationTokens);
408 
409  return reusableToken;
410  }
411 
412  // it is NOT possible to create one more shingles of the current matrix permutation
413  if (_currentPermutationTokensStartOffset < _currentPermuationTokens.Count - 1)
414  {
415  // reset shingle size and move one step to the right in the current tokens permutation
416  _currentPermutationTokensStartOffset++;
417  _currentShingleLength = MinimumShingleSize - 1;
418  return _requestNextToken;
419  }
420 
421 
422  // todo does this ever occur?
423  if (_permutations == null)
424  return null;
425 
426  if (!_permutations.HasNext())
427  {
428  // load more data (if available) to the matrix
429 
430  // don't really care, we just read it.
431  if (_input != null)
432  ReadColumn();
433 
434  // get rid of resources
435 
436  // delete the first column in the matrix
437  var deletedColumn = Matrix.Columns[0];
438  Matrix.Columns.RemoveAt(0);
439 
440  // remove all shingles seen that include any of the tokens from the deleted column.
441  var deletedColumnTokens = deletedColumn.Rows.SelectMany(row => row.Tokens).ToList();
442 
443  // I'm a little concerned about this part of the code, because the unit tests currently
444  // don't cover this scenario. (I put a break point here, and ran the unit tests in debug mode
445  // and this code block was never hit... I also changed it significatly from the Java version
446  // to use RemoveWhere and LINQ.
447  //
448  // TODO: Write a unit test to cover this and make sure this is a good port! -thoward
449 
450  // linq version
451  _shinglesSeen.RemoveWhere(
452  shingle => (shingle.Find(deletedColumnTokens.Contains) != default(Token)));
453 
454  //// initial conversion
455  //var shinglesSeenIterator = _shinglesSeen.ToList();
456  //foreach (var shingle in shinglesSeenIterator)
457  //{
458  // foreach (var deletedColumnToken in deletedColumnTokens)
459  // {
460  // if (shingle.Contains(deletedColumnToken))
461  // {
462  // _shinglesSeen.Remove(shingle);
463  // break;
464  // }
465  // }
466  //}
467 
468  // exhausted
469  if (Matrix.Columns.Count < MinimumShingleSize)
470  return null;
471 
472  // create permutations of the matrix it now looks
473  _permutations = Matrix.PermutationIterator();
474  }
475 
476  NextTokensPermutation();
477  return _requestNextToken;
478  }
479 
480  if (_permutations == null)
481  _permutations = Matrix.PermutationIterator();
482 
483  if (!_permutations.HasNext())
484  return null;
485 
486  NextTokensPermutation();
487 
488  return _requestNextToken;
489  }
490 
491  /// <summary>
492  /// Get next permutation of row combinations,
493  /// creates list of all tokens in the row and
494  /// an index from each such token to what row they exist in.
495  /// finally resets the current (next) shingle size and offset.
496  /// </summary>
497  private void NextTokensPermutation()
498  {
499  var rowsPermutation = _permutations.Next();
500  var currentPermutationRows = new List<Row>();
501  var currentPermuationTokens = new List<Token>();
502 
503  foreach (var row in rowsPermutation)
504  {
505  foreach (var token in row.Tokens)
506  {
507  currentPermuationTokens.Add(token);
508  currentPermutationRows.Add(row);
509  }
510  }
511  _currentPermuationTokens = currentPermuationTokens;
512  _currentPermutationRows = currentPermutationRows;
513 
514  _currentPermutationTokensStartOffset = 0;
515  _currentShingleLength = MinimumShingleSize - 1;
516  }
517 
518  /// <summary>
519  /// Final touch of a shingle token before it is passed on to the consumer from method <see cref="IncrementToken()"/>.
520  ///
521  /// Calculates and sets type, flags, position increment, start/end offsets and weight.
522  /// </summary>
523  /// <param name="token">Shingle Token</param>
524  /// <param name="shingle">Tokens used to produce the shingle token.</param>
525  /// <param name="currentPermutationStartOffset">Start offset in parameter currentPermutationTokens</param>
526  /// <param name="currentPermutationRows">index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens</param>
527  /// <param name="currentPermuationTokens">tokens of the current permutation of rows in the matrix. </param>
528  public void UpdateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
529  {
530  token.Type = typeof(ShingleMatrixFilter).Name;
531  token.Flags = 0;
532  token.PositionIncrement = 1;
533  token.StartOffset = (shingle[0]).StartOffset;
534  token.EndOffset = shingle[shingle.Count - 1].EndOffset;
535 
536  _settingsCodec.SetWeight(
537  token,
538  CalculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)
539  );
540  }
541 
542  /// <summary>
543  /// Evaluates the new shingle token weight.
544  ///
545  /// for (shingle part token in shingle)
546  /// weight += shingle part token weight * (1 / sqrt(all shingle part token weights summed))
547  ///
548  /// This algorithm gives a slightly greater score for longer shingles
549  /// and is rather penalising to great shingle token part weights.
550  /// </summary>
551  /// <param name="shingleToken">token returned to consumer</param>
552  /// <param name="shingle">tokens the tokens used to produce the shingle token.</param>
553  /// <param name="currentPermutationStartOffset">start offset in parameter currentPermutationRows and currentPermutationTokens.</param>
554  /// <param name="currentPermutationRows">an index to what matrix row a token in parameter currentPermutationTokens exist.</param>
555  /// <param name="currentPermuationTokens">all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle.</param>
556  /// <returns>weight to be set for parameter shingleToken </returns>
557  public float CalculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
558  {
559  var weights = new double[shingle.Count];
560 
561  double total = 0f;
562  double top = 0d;
563 
564  for (int i = 0; i < weights.Length; i++)
565  {
566  weights[i] = _settingsCodec.GetWeight(shingle[i]);
567 
568  double tmp = weights[i];
569 
570  if (tmp > top)
571  top = tmp;
572 
573  total += tmp;
574  }
575 
576  double factor = 1d/Math.Sqrt(total);
577 
578  double weight = weights.Sum(partWeight => partWeight*factor);
579 
580  return (float) weight;
581  }
582 
583  /// <summary>
584  /// Loads one column from the token stream.
585  ///
586  /// When the last token is read from the token stream it will column.setLast(true);
587  /// </summary>
588  /// <returns>true if it manage to read one more column from the input token stream</returns>
589  private bool ReadColumn()
590  {
591  Token token;
592 
593  if (_readColumnBuf != null)
594  {
595  token = _readColumnBuf;
596  _readColumnBuf = null;
597  }
598  else
599  {
600  token = GetNextInputToken(new Token());
601  }
602 
603  if (token == null)
604  return false;
605 
606  var currentReaderColumn = new Column(Matrix);
607  var currentReaderRow = new Row(currentReaderColumn);
608 
609  currentReaderRow.Tokens.AddLast(token);
610 
611  TokenPositioner tokenPositioner;
612  while ((_readColumnBuf = GetNextInputToken(new Token())) != null &&
613  (tokenPositioner = _settingsCodec.GetTokenPositioner(_readColumnBuf)) != TokenPositioner.NewColumn)
614  {
615  if (tokenPositioner == TokenPositioner.SameRow)
616  {
617  currentReaderRow.Tokens.AddLast(_readColumnBuf);
618  }
619  else
620  {
621  currentReaderRow = new Row(currentReaderColumn);
622  currentReaderRow.Tokens.AddLast(_readColumnBuf);
623  }
624  _readColumnBuf = null;
625  }
626 
627  if (_readColumnBuf == null)
628  {
629  _readColumnBuf = GetNextInputToken(new Token());
630 
631  if (_readColumnBuf == null)
632  currentReaderColumn.IsLast = true;
633  }
634 
635  return true;
636  }
637  }
638 }