Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
DictionaryCompoundWordTokenFilter.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 
25 namespace Lucene.Net.Analysis.Compound
26 {
27  /*
28  * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
29  * <p>
30  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
31  * "Donaudampfschiff" even when you only enter "schiff".
32  * It uses a brute-force algorithm to achieve this.
33  * </p>
34  */
36  {
        /// <summary>
        /// Creates a fully-configured <c>DictionaryCompoundWordTokenFilter</c>
        /// backed by a plain string-array dictionary.
        /// </summary>
        /// <param name="input">the TokenStream to process</param>
        /// <param name="dictionary">the word dictionary to match against</param>
        /// <param name="minWordSize">only words longer than this get processed</param>
        /// <param name="minSubwordSize">only subwords longer than this get to the output stream</param>
        /// <param name="maxSubwordSize">only subwords shorter than this get to the output stream</param>
        /// <param name="onlyLongestMatch">add only the longest matching subword to the stream</param>
        public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
            int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
        {
            // All state is held by the CompoundWordTokenFilterBase base class.
        }
        /// <summary>
        /// Creates a <c>DictionaryCompoundWordTokenFilter</c> with the base class's
        /// default sizing parameters and a plain string-array dictionary.
        /// </summary>
        /// <param name="input">the TokenStream to process</param>
        /// <param name="dictionary">the word dictionary to match against</param>
        public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary)
            : base(input, dictionary)
        {
            // All state is held by the CompoundWordTokenFilterBase base class.
        }
        /// <summary>
        /// Creates a <c>DictionaryCompoundWordTokenFilter</c> with the base class's
        /// default sizing parameters and a set-based dictionary.
        /// </summary>
        /// <param name="input">the TokenStream to process</param>
        /// <param name="dictionary">
        /// the word dictionary to match against. If this is a CharArraySet it must
        /// have ignoreCase=false and contain only lower-case strings.
        /// </param>
        public DictionaryCompoundWordTokenFilter(TokenStream input, ISet<string> dictionary)
            : base(input, dictionary)
        {
            // All state is held by the CompoundWordTokenFilterBase base class.
        }
        /// <summary>
        /// Creates a fully-configured <c>DictionaryCompoundWordTokenFilter</c>
        /// backed by a set-based dictionary.
        /// </summary>
        /// <param name="input">the TokenStream to process</param>
        /// <param name="dictionary">
        /// the word dictionary to match against. If this is a CharArraySet it must
        /// have ignoreCase=false and contain only lower-case strings.
        /// </param>
        /// <param name="minWordSize">only words longer than this get processed</param>
        /// <param name="minSubwordSize">only subwords longer than this get to the output stream</param>
        /// <param name="maxSubwordSize">only subwords shorter than this get to the output stream</param>
        /// <param name="onlyLongestMatch">add only the longest matching subword to the stream</param>
        public DictionaryCompoundWordTokenFilter(TokenStream input, ISet<string> dictionary,
            int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
        {
            // All state is held by the CompoundWordTokenFilterBase base class.
        }
92 
93  protected override void DecomposeInternal(Token token)
94  {
95  // Only words longer than minWordSize get processed
96  if (token.TermLength() < this.minWordSize)
97  {
98  return;
99  }
100 
101  char[] lowerCaseTermBuffer = MakeLowerCaseCopy(token.TermBuffer());
102 
103  for (int i = 0; i < token.TermLength() - this.minSubwordSize; ++i)
104  {
105  Token longestMatchToken = null;
106  for (int j = this.minSubwordSize - 1; j < this.maxSubwordSize; ++j)
107  {
108  if (i + j > token.TermLength())
109  {
110  break;
111  }
112  if (dictionary.Contains(lowerCaseTermBuffer, i, j))
113  {
114  if (this.onlyLongestMatch)
115  {
116  if (longestMatchToken != null)
117  {
118  if (longestMatchToken.TermLength() < j)
119  {
120  longestMatchToken = CreateToken(i, j, token);
121  }
122  }
123  else
124  {
125  longestMatchToken = CreateToken(i, j, token);
126  }
127  }
128  else
129  {
130  tokens.AddLast(CreateToken(i, j, token));
131  }
132  }
133  }
134  if (this.onlyLongestMatch && longestMatchToken != null)
135  {
136  tokens.AddLast(longestMatchToken);
137  }
138  }
139  }
140  }
141 }