Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
Analysis.Ext.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections;
20 using System.Collections.Generic;
21 using System.Text;
22 using System.IO;
23 
24 using Lucene.Net.Analysis;
25 using Lucene.Net.Analysis.Tokenattributes;
26 using Lucene.Net.Util;
27 
28 
29 namespace Lucene.Net.Analysis.Ext
30 {
31  /// <summary>
32  /// This analyzer targets short fields where substring searches (in the style of *word*) are required.
33  /// [SomeUser@GMAIL.com 1234567890] will be tokenized as
34  /// [s.o.m.e.u.s.e.r..g.m.a.i.l..com..1.2.3.4.5.6.7.8.9.0] (read .'s as blank)
35  ///
36  /// Usage:
37  /// QueryParser p = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "fieldName", new SingleCharTokenAnalyzer());
38  /// p.SetDefaultOperator(QueryParser.Operator.AND);
39  /// p.SetEnablePositionIncrements(true);
40  ///
41  /// TopDocs td = src.Search(p.Parse("678"), 10);
42  /// or
43  /// TopDocs td = src.Search(p.Parse("\"gmail.com 1234\""), 10);
44  /// </summary>
46  {
/// <summary>
/// Builds the analysis chain for this analyzer: a <c>LetterOrDigitTokenizer</c> over
/// <paramref name="reader"/>, followed by <c>LowerCaseFilter</c>, <c>ASCIIFoldingFilter</c>
/// and finally <c>SingleCharTokenizer</c>, which emits one token per character.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this implementation.</param>
/// <param name="reader">Source of the text to analyze.</param>
/// <returns>The outermost stream of the chain.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Each stage wraps the previous one; the order below is the order in which text is processed.
    TokenStream words = new LetterOrDigitTokenizer(reader);
    TokenStream lowered = new LowerCaseFilter(words);
    TokenStream folded = new ASCIIFoldingFilter(lowered);
    TokenStream singleChars = new SingleCharTokenizer(folded);

    return singleChars;
}
59 
/// <summary>
/// Re-emits every term produced by a wrapped <see cref="TokenStream"/> as a sequence of
/// single-character tokens.
///
/// The first character of each upstream term gets a position increment of 2 and every
/// following character an increment of 1, so a one-position gap separates the characters
/// of consecutive upstream terms.
/// </summary>
class SingleCharTokenizer : Tokenizer
{
    // Upstream stream whose terms are split. It is also passed to the base constructor;
    // IncrementToken reads the upstream term through this instance's own attributes,
    // i.e. the code relies on both streams sharing attribute instances.
    readonly TokenStream _input;

    readonly ITermAttribute _termAttribute;
    readonly IOffsetAttribute _offsetAttribute;
    readonly IPositionIncrementAttribute _positionIncrementAttribute;

    // Characters of the upstream term currently being split, the index of the next
    // character to emit, and the number of valid characters in the buffer.
    char[] _buffer = null;
    int _offset = -1;
    int _length = -1;

    // Running start offset of the next emitted character.
    // NOTE(review): this is a synthetic counter (one slot per emitted character plus one
    // per upstream term), not the upstream token's real offset; it only matches the
    // original text when terms are separated by exactly one character. Confirm whether
    // consumers such as highlighting depend on exact offsets.
    int _offsetInStream = -1;

    /// <summary>
    /// Wraps <paramref name="input"/> and registers the term, offset and
    /// position-increment attributes this tokenizer writes.
    /// </summary>
    /// <param name="input">Stream whose terms are split into single characters.</param>
    public SingleCharTokenizer(TokenStream input): base(input)
    {
        _input = input;
        _termAttribute = AddAttribute<ITermAttribute>();
        _offsetAttribute = AddAttribute<IOffsetAttribute>();
        _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
    }

    /// <summary>
    /// Emits the next single-character token, pulling a new term from the wrapped
    /// stream whenever the current one is used up.
    /// </summary>
    /// <returns>false once the wrapped stream is exhausted; true otherwise.</returns>
    public override bool IncrementToken()
    {
        int positionIncrement = 1;

        if (_buffer == null || _offset >= _length)
        {
            // Pull upstream terms until one actually contains characters; an empty
            // term has nothing to emit and must not surface stale buffer content.
            do
            {
                if (!_input.IncrementToken())
                {
                    return false;
                }
                _length = _termAttribute.TermLength();
            }
            while (_length == 0);

            _buffer = _termAttribute.TermBuffer();
            _offset = 0;

            // Starting a new upstream term: leave a one-position gap and skip one
            // offset slot for the separator.
            positionIncrement++;
            _offsetInStream++;
        }

        _offsetAttribute.SetOffset(_offsetInStream, _offsetInStream + 1);
        _offsetInStream++;

        _positionIncrementAttribute.PositionIncrement = positionIncrement;

        // Copy the current character to the front of the term buffer and set the term
        // length to 1. Only index 0 is ever overwritten while later indices are read,
        // so the still-unread characters of the term are left intact.
        _termAttribute.SetTermBuffer(_buffer, _offset, 1);
        _offset++;

        return true;
    }

    /// <summary>
    /// Clears the splitting state and resets the wrapped stream so the chain can be
    /// consumed again from the start.
    /// </summary>
    public override void Reset()
    {
        _buffer = null;
        _offset = -1;
        _length = -1;
        _offsetInStream = -1;

        // This class is the only consumer of the wrapped stream, so the reset has to
        // be forwarded from here.
        _input.Reset();
        base.Reset();
    }

    /// <summary>
    /// Closes the wrapped stream when disposing deterministically, then lets the base
    /// class release its own resources.
    /// </summary>
    /// <param name="disposing">true when called from an explicit dispose/close.</param>
    protected override void Dispose(bool disposing)
    {
        // Other managed objects are only touched on the explicit-dispose path.
        if (disposing && _input != null)
        {
            _input.Close();
        }
        base.Dispose(disposing);
    }
}
123  }
124 
125  /// <summary>
126  /// Another Analyzer. Every char which is not a letter or digit is treated as a word separator.
127  /// [Name.Surname@gmail.com 123.456 ğüşıöç%ĞÜŞİÖÇ$ΑΒΓΔΕΖ#АБВГДЕ SSß] will be tokenized as
128  /// [name surname gmail com 123 456 gusioc gusioc αβγδεζ абвгде ssss]
129  ///
130  /// No problem with searches like someuser@gmail or 123.456 since they are
131  /// converted to phrase-query as "someuser gmail" or "123 456".
132  /// </summary>
134  {
/// <summary>
/// Builds the analysis chain for this analyzer: a <c>LetterOrDigitTokenizer</c> over
/// <paramref name="reader"/>, followed by <c>LowerCaseFilter</c> and <c>ASCIIFoldingFilter</c>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this implementation.</param>
/// <param name="reader">Source of the text to analyze.</param>
/// <returns>The outermost stream of the chain.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Each stage wraps the previous one; the order below is the order in which text is processed.
    TokenStream words = new LetterOrDigitTokenizer(reader);
    TokenStream lowered = new LowerCaseFilter(words);
    TokenStream folded = new ASCIIFoldingFilter(lowered);

    return folded;
}
146  }
147 
148  /// <summary>
149  /// A tokenizer that treats every character that is not a letter or digit as a word separator.
150  /// </summary>
152  {
/// <summary>
/// Creates a tokenizer that reads from <paramref name="reader"/>; the reader is passed
/// unchanged to the base-class constructor.
/// </summary>
/// <param name="reader">Source of the text to tokenize.</param>
public LetterOrDigitTokenizer(TextReader reader)
    : base(reader)
{
}
158 
/// <summary>
/// Decides whether <paramref name="c"/> is part of a token.
/// </summary>
/// <param name="c">Character to classify.</param>
/// <returns>
/// true when <see cref="char.IsLetterOrDigit(char)"/> accepts the character; false otherwise.
/// </returns>
protected override bool IsTokenChar(char c)
{
    bool partOfToken = char.IsLetterOrDigit(c);
    return partOfToken;
}
165  }
166 }