Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
Tokenizer.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using AttributeSource = Lucene.Net.Util.AttributeSource;
19 
20 namespace Lucene.Net.Analysis
21 {
22 
23  /// <summary> A Tokenizer is a TokenStream whose input is a Reader.
24  /// <p/>
25  /// This is an abstract class; subclasses must override <see cref="TokenStream.IncrementToken()" />
26  /// <p/>
27  /// NOTE: Subclasses overriding <see cref="TokenStream.IncrementToken()" /> must call
28  /// <see cref="AttributeSource.ClearAttributes()" /> before setting attributes.
29  /// </summary>
30 
31  public abstract class Tokenizer:TokenStream
32  {
33  /// <summary>The text source for this Tokenizer. </summary>
34  protected internal System.IO.TextReader input;
35 
36  private bool isDisposed;
37 
38  /// <summary>Construct a tokenizer with null input. </summary>
39  protected internal Tokenizer()
40  {
41  }
42 
43  /// <summary>Construct a token stream processing the given input. </summary>
44  protected internal Tokenizer(System.IO.TextReader input)
45  {
46  this.input = CharReader.Get(input);
47  }
48 
49  /// <summary>Construct a tokenizer with null input using the given AttributeFactory. </summary>
50  protected internal Tokenizer(AttributeFactory factory):base(factory)
51  {
52  }
53 
54  /// <summary>Construct a token stream processing the given input using the given AttributeFactory. </summary>
55  protected internal Tokenizer(AttributeFactory factory, System.IO.TextReader input):base(factory)
56  {
57  this.input = CharReader.Get(input);
58  }
59 
60  /// <summary>Construct a token stream processing the given input using the given AttributeSource. </summary>
61  protected internal Tokenizer(AttributeSource source):base(source)
62  {
63  }
64 
65  /// <summary>Construct a token stream processing the given input using the given AttributeSource. </summary>
66  protected internal Tokenizer(AttributeSource source, System.IO.TextReader input):base(source)
67  {
68  this.input = CharReader.Get(input);
69  }
70 
71  protected override void Dispose(bool disposing)
72  {
73  if (isDisposed) return;
74 
75  if (disposing)
76  {
77  if (input != null)
78  {
79  input.Close();
80  }
81  }
82 
83  // LUCENE-2387: don't hold onto Reader after close, so
84  // GC can reclaim
85  input = null;
86  isDisposed = true;
87  }
88 
89  /// <summary>Return the corrected offset. If <see cref="input" /> is a <see cref="CharStream" /> subclass
90  /// this method calls <see cref="CharStream.CorrectOffset" />, else returns <c>currentOff</c>.
91  /// </summary>
92  /// <param name="currentOff">offset as seen in the output
93  /// </param>
94  /// <returns> corrected offset based on the input
95  /// </returns>
96  /// <seealso cref="CharStream.CorrectOffset">
97  /// </seealso>
98  protected internal int CorrectOffset(int currentOff)
99  {
100  return (input is CharStream)?((CharStream) input).CorrectOffset(currentOff):currentOff;
101  }
102 
103  /// <summary>Expert: Reset the tokenizer to a new reader. Typically, an
104  /// analyzer (in its reusableTokenStream method) will use
105  /// this to re-use a previously created tokenizer.
106  /// </summary>
107  public virtual void Reset(System.IO.TextReader input)
108  {
109  this.input = input;
110  }
111  }
112 }