Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
ShingleAnalyzerWrapper.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.IO;
20 using Lucene.Net.Analysis.Standard;
21 using Version = Lucene.Net.Util.Version;
22 
23 namespace Lucene.Net.Analysis.Shingle
24 {
25  /*
26  * A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}.
27  * <p>
28  * A shingle is another name for a token-based n-gram.
29  * </p>
30  */
32  {
33 
/// <summary>
/// Largest shingle (n-gram) size passed to the shingle filter; starts at 2.
/// </summary>
protected int maxShingleSize = 2;

/// <summary>
/// Whether the shingle filter is told to also emit the original single tokens; starts as true.
/// </summary>
protected bool outputUnigrams = true;
37 
/// <summary>
/// Creates a wrapper that applies shingling on top of the given analyzer,
/// keeping the default maximum shingle size.
/// </summary>
/// <param name="defaultAnalyzer">Analyzer whose token streams are wrapped.</param>
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer)
{
    this.defaultAnalyzer = defaultAnalyzer;

    // Defined outside this listing; presumably records whether a subclass
    // overrides TokenStream (see overridesTokenStreamMethod in ReusableTokenStream).
    SetOverridesTokenStreamMethod<ShingleAnalyzerWrapper>();
}
43 
/// <summary>
/// Creates a wrapper around the given analyzer with an explicit maximum shingle size.
/// </summary>
/// <param name="defaultAnalyzer">Analyzer whose token streams are wrapped.</param>
/// <param name="maxShingleSize">Maximum shingle (n-gram) size to configure.</param>
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize)
    : this(defaultAnalyzer)
{
    // The chained constructor has already stored the analyzer; only the size remains.
    this.maxShingleSize = maxShingleSize;
}
50 
/// <summary>
/// Creates a wrapper around a new <see cref="StandardAnalyzer"/>.
/// </summary>
/// <param name="matchVersion">Version passed to the <see cref="StandardAnalyzer"/> constructor.</param>
public ShingleAnalyzerWrapper(Version matchVersion)
{
    defaultAnalyzer = new StandardAnalyzer(matchVersion);

    // Defined outside this listing; presumably records whether a subclass
    // overrides TokenStream (see overridesTokenStreamMethod in ReusableTokenStream).
    SetOverridesTokenStreamMethod<ShingleAnalyzerWrapper>();
}
59 
/// <summary>
/// Creates a wrapper around a new <see cref="StandardAnalyzer"/> with an explicit
/// maximum shingle size.
/// </summary>
/// <param name="matchVersion">Version passed to the <see cref="StandardAnalyzer"/> constructor.</param>
/// <param name="nGramSize">Maximum shingle (n-gram) size to configure.</param>
public ShingleAnalyzerWrapper(Version matchVersion, int nGramSize)
    : this(matchVersion)
{
    maxShingleSize = nGramSize;
}
68 
/// <summary>
/// Gets or sets the maximum shingle (n-gram) size applied to the shingle filter.
/// </summary>
public int MaxShingleSize
{
    get
    {
        return maxShingleSize;
    }
    set
    {
        maxShingleSize = value;
    }
}
/// <summary>
/// Gets or sets whether the filter passes the original tokens (the "unigrams")
/// through to the output stream in addition to the shingles.
/// </summary>
public bool IsOutputUnigrams
{
    get
    {
        return outputUnigrams;
    }
    set
    {
        outputUnigrams = value;
    }
}
86 
/// <summary>
/// Builds a new <see cref="ShingleFilter"/> over a token stream obtained from the wrapped analyzer.
/// </summary>
/// <param name="fieldName">Field name forwarded to the wrapped analyzer.</param>
/// <param name="reader">Text source forwarded to the wrapped analyzer.</param>
/// <returns>
/// A shingle filter configured with the current maximum shingle size and unigram setting.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream source;
    try
    {
        // First choice: the wrapped analyzer's reusable stream.
        source = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
    }
    catch (IOException)
    {
        // Obtaining the reusable stream failed with an I/O error; ask for a regular stream instead.
        source = defaultAnalyzer.TokenStream(fieldName, reader);
    }

    ShingleFilter shingles = new ShingleFilter(source);
    shingles.SetMaxShingleSize(maxShingleSize);
    shingles.SetOutputUnigrams(outputUnigrams);
    return shingles;
}
103 
/// <summary>
/// Holder for the pair of streams that <c>ReusableTokenStream</c> stores in
/// <c>PreviousTokenStream</c> between calls.
/// </summary>
private sealed class SavedStreams
{
    /// <summary>Stream most recently obtained from the wrapped analyzer.</summary>
    internal TokenStream wrapped;

    /// <summary>Shingle filter constructed around <see cref="wrapped"/>.</summary>
    internal ShingleFilter shingle;
}
109 
/// <summary>
/// Returns a shingle filter over the wrapped analyzer's reusable stream, reusing the
/// previously saved filter when the wrapped analyzer hands back the same stream instance.
/// </summary>
/// <param name="fieldName">Field name forwarded to the wrapped analyzer.</param>
/// <param name="reader">Text source forwarded to the wrapped analyzer.</param>
/// <returns>
/// A shingle filter configured with the current maximum shingle size and unigram setting.
/// </returns>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    // LUCENE-1678: force fallback to TokenStream() if we have been subclassed
    // and that subclass overrides TokenStream but not ReusableTokenStream.
    if (overridesTokenStreamMethod)
    {
        return TokenStream(fieldName, reader);
    }

    // Read the saved state first, then ask the wrapped analyzer for its stream.
    SavedStreams cached = (SavedStreams)PreviousTokenStream;
    TokenStream result = defaultAnalyzer.ReusableTokenStream(fieldName, reader);

    if (cached == null)
    {
        // Nothing saved yet: build the holder and remember it for later calls.
        cached = new SavedStreams();
        cached.wrapped = result;
        cached.shingle = new ShingleFilter(cached.wrapped);
        PreviousTokenStream = cached;
    }
    else if (result == cached.wrapped)
    {
        // The wrapped analyzer reused its stream, so the existing filter only needs a reset.
        cached.shingle.Reset();
    }
    else
    {
        // The wrapped analyzer produced a different stream; wrap a new filter around it.
        cached.wrapped = result;
        cached.shingle = new ShingleFilter(cached.wrapped);
    }

    // Re-apply the settings on every call, since the properties may have changed.
    cached.shingle.SetMaxShingleSize(maxShingleSize);
    cached.shingle.SetOutputUnigrams(outputUnigrams);
    return cached.shingle;
}
147  }
148 }