Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
TokenStream.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using Lucene.Net.Util;
20 using Document = Lucene.Net.Documents.Document;
21 using Field = Lucene.Net.Documents.Field;
22 using IndexWriter = Lucene.Net.Index.IndexWriter;
23 using AttributeSource = Lucene.Net.Util.AttributeSource;
24 
namespace Lucene.Net.Analysis
{

    /// <summary> A <c>TokenStream</c> enumerates the sequence of tokens, either from
    /// <see cref="Field" />s of a <see cref="Document" /> or from query text.
    /// <p/>
    /// This is an abstract class. Concrete subclasses are:
    /// <list type="bullet">
    /// <item><see cref="Tokenizer" />, a <c>TokenStream</c> whose input is a Reader; and</item>
    /// <item><see cref="TokenFilter" />, a <c>TokenStream</c> whose input is another
    /// <c>TokenStream</c>.</item>
    /// </list>
    /// A new <c>TokenStream</c> API has been introduced with Lucene 2.9. This API
    /// has moved from being <see cref="Token" /> based to <see cref="IAttribute" /> based. While
    /// <see cref="Token" /> still exists in 2.9 as a convenience class, the preferred way
    /// to store the information of a <see cref="Token" /> is to use <see cref="Util.Attribute" />s.
    /// <p/>
    /// <c>TokenStream</c> now extends <see cref="AttributeSource" />, which provides
    /// access to all of the token <see cref="IAttribute" />s for the <c>TokenStream</c>.
    /// Note that only one instance per <see cref="Util.Attribute" /> is created and reused
    /// for every token. This approach reduces object creation and allows local
    /// caching of references to the <see cref="Util.Attribute" />s. See
    /// <see cref="IncrementToken()" /> for further details.
    /// <p/>
    /// <b>The workflow of the new <c>TokenStream</c> API is as follows:</b>
    /// <list type="bullet">
    /// <item>Instantiation of <c>TokenStream</c>/<see cref="TokenFilter" />s which add/get
    /// attributes to/from the <see cref="AttributeSource" />.</item>
    /// <item>The consumer calls <see cref="TokenStream.Reset()" />.</item>
    /// <item>The consumer retrieves attributes from the stream and stores local
    /// references to all attributes it wants to access.</item>
    /// <item>The consumer calls <see cref="IncrementToken()" /> until it returns false and
    /// consumes the attributes after each call.</item>
    /// <item>The consumer calls <see cref="End()" /> so that any end-of-stream operations
    /// can be performed.</item>
    /// <item>The consumer calls <see cref="Dispose()" /> (the replacement for the obsolete
    /// <c>Close()</c>) to release any resources when finished using the <c>TokenStream</c>.</item>
    /// </list>
    /// To make sure that filters and consumers know which attributes are available,
    /// the attributes must be added during instantiation. Filters and consumers are
    /// not required to check for availability of attributes in
    /// <see cref="IncrementToken()" />.
    /// <p/>
    /// You can find some example code for the new API in the analysis package level
    /// documentation.
    /// <p/>
    /// Sometimes it is desirable to capture the current state of a <c>TokenStream</c>,
    /// e.g. for buffering purposes (see <see cref="CachingTokenFilter" />,
    /// <see cref="TeeSinkTokenFilter" />). For this use case
    /// <see cref="AttributeSource.CaptureState" /> and <see cref="AttributeSource.RestoreState" />
    /// can be used.
    /// </summary>
    public abstract class TokenStream : AttributeSource, IDisposable
    {
        /// <summary> A TokenStream using the default attribute factory.</summary>
        protected internal TokenStream()
        {
        }

        /// <summary> A TokenStream that uses the same attributes as the supplied one.</summary>
        /// <param name="input">The attribute source that is handed to the base constructor.</param>
        protected internal TokenStream(AttributeSource input)
            : base(input)
        {
        }

        /// <summary> A TokenStream using the supplied AttributeFactory for creating new <see cref="IAttribute" /> instances.</summary>
        /// <param name="factory">The attribute factory that is handed to the base constructor.</param>
        protected internal TokenStream(AttributeFactory factory)
            : base(factory)
        {
        }

        /// <summary> Consumers (i.e., <see cref="IndexWriter" />) use this method to advance the stream to
        /// the next token. Implementing classes must implement this method and update
        /// the appropriate <see cref="Util.Attribute" />s with the attributes of the next
        /// token.
        ///
        /// The producer must make no assumptions about the attributes after the
        /// method has returned: the caller may arbitrarily change them. If the
        /// producer needs to preserve the state for subsequent calls, it can use
        /// <see cref="AttributeSource.CaptureState" /> to create a copy of the current attribute state.
        ///
        /// This method is called for every token of a document, so an efficient
        /// implementation is crucial for good performance. To avoid calls to
        /// <see cref="AttributeSource.AddAttribute{T}()" /> and <see cref="AttributeSource.GetAttribute{T}()" />,
        /// references to all <see cref="Util.Attribute" />s that this stream uses should be
        /// retrieved during instantiation.
        ///
        /// To ensure that filters and consumers know which attributes are available,
        /// the attributes must be added during instantiation. Filters and consumers
        /// are not required to check for availability of attributes in
        /// <see cref="IncrementToken()" />.
        ///
        /// </summary>
        /// <returns> false for end of stream; true otherwise</returns>
        public abstract bool IncrementToken();

        /// <summary> This method is called by the consumer after the last token has been
        /// consumed, after <see cref="IncrementToken" /> returned <c>false</c>
        /// (using the new <c>TokenStream</c> API). Streams implementing the old API
        /// should upgrade to use this feature.
        /// <p/>
        /// This method can be used to perform any end-of-stream operations, such as
        /// setting the final offset of a stream. The final offset of a stream might
        /// differ from the offset of the last token, e.g. when one or more whitespace
        /// characters follow the last token but a <see cref="WhitespaceTokenizer" /> was used.
        /// <p/>
        /// The default implementation does nothing.
        /// </summary>
        /// <exception cref="System.IO.IOException">Part of the documented contract of this
        /// method; the default implementation never throws, but overrides may.</exception>
        public virtual void End()
        {
            // do nothing by default
        }

        /// <summary> Resets this stream to the beginning. This is an optional operation, so
        /// subclasses may or may not implement this method. <see cref="Reset()" /> is not needed for
        /// the standard indexing process. However, if the tokens of a
        /// <c>TokenStream</c> are intended to be consumed more than once, it is
        /// necessary to implement <see cref="Reset()" />. Note that if your TokenStream
        /// caches tokens and feeds them back again after a reset, it is imperative
        /// that you clone the tokens when you store them away (on the first pass) as
        /// well as when you return them (on future passes after <see cref="Reset()" />).
        /// <p/>
        /// The default implementation does nothing.
        /// </summary>
        public virtual void Reset()
        {
            // intentionally empty: resetting is optional for subclasses
        }

        /// <summary>Releases resources associated with this stream by forwarding to
        /// <see cref="Dispose()" />. Kept only for backward compatibility.</summary>
        [Obsolete("Use Dispose() instead")]
        public void Close()
        {
            Dispose();
        }

        /// <summary>Releases resources associated with this stream by invoking
        /// <see cref="Dispose(bool)" /> with <c>true</c>.</summary>
        public void Dispose()
        {
            Dispose(true);

            // Standard dispose pattern (CA1816): once resources have been released
            // explicitly, a finalizer declared by a subclass no longer needs to run.
            GC.SuppressFinalize(this);
        }

        /// <summary>Performs the actual resource clean-up; every concrete stream must
        /// implement it.</summary>
        /// <param name="disposing"><c>true</c> when invoked through <see cref="Dispose()" />,
        /// which is the only call made by this class. By the usual .NET convention
        /// <c>false</c> is reserved for calls from a finalizer, should a subclass declare one.</param>
        protected abstract void Dispose(bool disposing);
    }
}