Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TeeSinkTokenFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using Attribute = Lucene.Net.Util.Attribute;
21 using AttributeSource = Lucene.Net.Util.AttributeSource;
22 
23 namespace Lucene.Net.Analysis
24 {
25 
26  /// <summary> This TokenFilter provides the ability to set aside attribute states
27  /// that have already been analyzed. This is useful in situations where multiple fields share
28  /// many common analysis steps and then go their separate ways.
29  /// <p/>
30  /// It is also useful for doing things like entity extraction or proper noun analysis as
31  /// part of the analysis workflow and saving off those tokens for use in another field.
32  ///
33  /// <code>
34  /// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
35  /// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
36  /// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
37  /// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
38  /// source2.addSinkTokenStream(sink1);
39  /// source2.addSinkTokenStream(sink2);
40  /// TokenStream final1 = new LowerCaseFilter(source1);
41  /// TokenStream final2 = source2;
42  /// TokenStream final3 = new EntityDetect(sink1);
43  /// TokenStream final4 = new URLDetect(sink2);
44  /// d.add(new Field("f1", final1));
45  /// d.add(new Field("f2", final2));
46  /// d.add(new Field("f3", final3));
47  /// d.add(new Field("f4", final4));
48  /// </code>
49  /// In this example, <c>sink1</c> and <c>sink2</c> will both get tokens from both
50  /// <c>reader1</c> and <c>reader2</c> after whitespace tokenizer
51  /// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
 52  /// It is important that tees are consumed before sinks (in the above example, the tee field names must
 53  /// sort before the sink field names, so the tees are consumed first). If you are not sure which stream is consumed first, you can simply
 54  /// add another sink and then pass all tokens to the sinks at once using <see cref="ConsumeAllTokens" />.
 55  /// This TokenFilter is exhausted after this. To do so, change
 56  /// the example above to:
57  /// <code>
58  /// ...
59  /// TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
60  /// TokenStream final2 = source2.newSinkTokenStream();
61  /// sink1.consumeAllTokens();
62  /// sink2.consumeAllTokens();
63  /// ...
64  /// </code>
65  /// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
66  /// <p/>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
67  /// </summary>
68  public sealed class TeeSinkTokenFilter:TokenFilter
69  {
71  {
72  public override bool Accept(AttributeSource source)
73  {
74  return true;
75  }
76  }
77  private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>();
78 
79  /// <summary> Instantiates a new TeeSinkTokenFilter.</summary>
80  public TeeSinkTokenFilter(TokenStream input):base(input)
81  {
82  }
83 
84  /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream.</summary>
85  public SinkTokenStream NewSinkTokenStream()
86  {
87  return NewSinkTokenStream(ACCEPT_ALL_FILTER);
88  }
89 
90  /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream
91  /// that pass the supplied filter.
92  /// </summary>
93  /// <seealso cref="SinkFilter">
94  /// </seealso>
95  public SinkTokenStream NewSinkTokenStream(SinkFilter filter)
96  {
97  var sink = new SinkTokenStream(this.CloneAttributes(), filter);
98  sinks.AddLast(new WeakReference(sink));
99  return sink;
100  }
101 
102  /// <summary> Adds a <see cref="SinkTokenStream" /> created by another <c>TeeSinkTokenFilter</c>
103  /// to this one. The supplied stream will also receive all consumed tokens.
104  /// This method can be used to pass tokens from two different tees to one sink.
105  /// </summary>
106  public void AddSinkTokenStream(SinkTokenStream sink)
107  {
108  // check that sink has correct factory
109  if (!this.Factory.Equals(sink.Factory))
110  {
111  throw new System.ArgumentException("The supplied sink is not compatible to this tee");
112  }
113  // add eventually missing attribute impls to the existing sink
114  foreach (var impl in this.CloneAttributes().GetAttributeImplsIterator())
115  {
116  sink.AddAttributeImpl(impl);
117  }
118  sinks.AddLast(new WeakReference(sink));
119  }
120 
121  /// <summary> <c>TeeSinkTokenFilter</c> passes all tokens to the added sinks
122  /// when itself is consumed. To be sure, that all tokens from the input
123  /// stream are passed to the sinks, you can call this methods.
124  /// This instance is exhausted after this, but all sinks are instant available.
125  /// </summary>
126  public void ConsumeAllTokens()
127  {
128  while (IncrementToken())
129  {
130  }
131  }
132 
133  public override bool IncrementToken()
134  {
135  if (input.IncrementToken())
136  {
137  // capture state lazily - maybe no SinkFilter accepts this state
138  State state = null;
139  foreach(WeakReference wr in sinks)
140  {
141  var sink = (SinkTokenStream)wr.Target;
142  if (sink != null)
143  {
144  if (sink.Accept(this))
145  {
146  if (state == null)
147  {
148  state = this.CaptureState();
149  }
150  sink.AddState(state);
151  }
152  }
153  }
154  return true;
155  }
156 
157  return false;
158  }
159 
160  public override void End()
161  {
162  base.End();
163  State finalState = CaptureState();
164  foreach(WeakReference wr in sinks)
165  {
166  var sink = (SinkTokenStream)wr.Target;
167  if (sink != null)
168  {
169  sink.SetFinalState(finalState);
170  }
171  }
172  }
173 
174  /// <summary> A filter that decides which <see cref="AttributeSource" /> states to store in the sink.</summary>
175  public abstract class SinkFilter
176  {
177  /// <summary> Returns true, iff the current state of the passed-in <see cref="AttributeSource" /> shall be stored
178  /// in the sink.
179  /// </summary>
180  public abstract bool Accept(AttributeSource source);
181 
182  /// <summary> Called by <see cref="SinkTokenStream.Reset()" />. This method does nothing by default
183  /// and can optionally be overridden.
184  /// </summary>
185  public virtual void Reset()
186  {
187  // nothing to do; can be overridden
188  }
189  }
190 
191  public sealed class SinkTokenStream : TokenStream
192  {
193  private readonly LinkedList<State> cachedStates = new LinkedList<State>();
194  private State finalState;
195  private IEnumerator<AttributeSource.State> it = null;
196  private readonly SinkFilter filter;
197 
198  internal SinkTokenStream(AttributeSource source, SinkFilter filter)
199  : base(source)
200  {
201  this.filter = filter;
202  }
203 
204  internal /*private*/ bool Accept(AttributeSource source)
205  {
206  return filter.Accept(source);
207  }
208 
209  internal /*private*/ void AddState(AttributeSource.State state)
210  {
211  if (it != null)
212  {
213  throw new System.SystemException("The tee must be consumed before sinks are consumed.");
214  }
215  cachedStates.AddLast(state);
216  }
217 
218  internal /*private*/ void SetFinalState(AttributeSource.State finalState)
219  {
220  this.finalState = finalState;
221  }
222 
223  public override bool IncrementToken()
224  {
225  // lazy init the iterator
226  if (it == null)
227  {
228  it = cachedStates.GetEnumerator();
229  }
230 
231  if (!it.MoveNext())
232  {
233  return false;
234  }
235 
236  State state = it.Current;
237  RestoreState(state);
238  return true;
239  }
240 
241  public override void End()
242  {
243  if (finalState != null)
244  {
245  RestoreState(finalState);
246  }
247  }
248 
249  public override void Reset()
250  {
251  it = cachedStates.GetEnumerator();
252  }
253 
254  protected override void Dispose(bool disposing)
255  {
256  // Do nothing.
257  }
258  }
259 
260  private static readonly SinkFilter ACCEPT_ALL_FILTER;
261  static TeeSinkTokenFilter()
262  {
263  ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter();
264  }
265  }
266 }