Lucene.Net 3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
HyphenationCompoundWordTokenFilter.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//using System;
//using System.Collections.Generic;
//using System.IO;
//using System.Linq;
//using Lucene.Net.Analysis.Compound.Hyphenation;

//namespace Lucene.Net.Analysis.Compound
//{
//    /*
//     * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
//     * <p>
//     * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
//     * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
//     * grammar and a word dictionary to achieve this.
//     * </p>
//     */
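//    // The class below is still commented out in this release. As a rough sketch of
//    // the intended usage, assuming the class were enabled and a HyphenationTree had
//    // been loaded from a hyphenation grammar (the file name and dictionary entries
//    // here are illustrative only), a caller might wire it up like this:
//    //
//    //   HyphenationTree hyphenator =
//    //       HyphenationCompoundWordTokenFilter.GetHyphenationTree("de_DR.xml");
//    //   ISet<string> dictionary = new HashSet<string> { "donau", "dampf", "schiff" };
//    //   TokenStream stream = new HyphenationCompoundWordTokenFilter(
//    //       new WhitespaceTokenizer(new StringReader("Donaudampfschiff")),
//    //       hyphenator, dictionary);
//    //   // The stream then yields the original token followed by the matching subwords.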
//    public class HyphenationCompoundWordTokenFilter : CompoundWordTokenFilterBase
//    {
//        private HyphenationTree hyphenator;

//        /*
//         * @param input the {@link TokenStream} to process
//         * @param hyphenator the hyphenation pattern tree to use for hyphenation
//         * @param dictionary the word dictionary to match against
//         * @param minWordSize only words longer than this get processed
//         * @param minSubwordSize only subwords longer than this get to the output
//         *        stream
//         * @param maxSubwordSize only subwords shorter than this get to the output
//         *        stream
//         * @param onlyLongestMatch Add only the longest matching subword to the stream
//         */
//        public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
//            : this(input, hyphenator, MakeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
//        {
//        }

//        /*
//         * @param input the {@link TokenStream} to process
//         * @param hyphenator the hyphenation pattern tree to use for hyphenation
//         * @param dictionary the word dictionary to match against
//         */
//        public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator, String[] dictionary)
//            : this(input, hyphenator, MakeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
//                DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
//        {
//        }

//        /*
//         * @param input the {@link TokenStream} to process
//         * @param hyphenator the hyphenation pattern tree to use for hyphenation
//         * @param dictionary the word dictionary to match against. If this is a
//         *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
//         *        have set ignoreCase=false and only contain lower case strings.
//         */
//        public HyphenationCompoundWordTokenFilter(TokenStream input,
//            HyphenationTree hyphenator, ISet<string> dictionary)
//            : this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
//        {
//        }

//        /*
//         * @param input the {@link TokenStream} to process
//         * @param hyphenator the hyphenation pattern tree to use for hyphenation
//         * @param dictionary the word dictionary to match against. If this is a
//         *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
//         *        have set ignoreCase=false and only contain lower case strings.
//         * @param minWordSize only words longer than this get processed
//         * @param minSubwordSize only subwords longer than this get to the output
//         *        stream
//         * @param maxSubwordSize only subwords shorter than this get to the output
//         *        stream
//         * @param onlyLongestMatch Add only the longest matching subword to the stream
//         */
//        public HyphenationCompoundWordTokenFilter(TokenStream input,
//            HyphenationTree hyphenator, ISet<string> dictionary, int minWordSize,
//            int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
//            : base(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
//                onlyLongestMatch)
//        {
//            this.hyphenator = hyphenator;
//        }

//        /*
//         * Create a hyphenator tree
//         *
//         * @param hyphenationFilename the filename of the XML grammar to load
//         * @return An object representing the hyphenation patterns
//         * @throws Exception
//         */
//        public static HyphenationTree GetHyphenationTree(String hyphenationFilename)
//        {
//            return GetHyphenationTree(new InputSource(hyphenationFilename));
//        }

//        /*
//         * Create a hyphenator tree
//         *
//         * @param hyphenationFile the file of the XML grammar to load
//         * @return An object representing the hyphenation patterns
//         * @throws Exception
//         */
//        public static HyphenationTree GetHyphenationTree(FileInfo hyphenationFile)
//        {
//            return GetHyphenationTree(new InputSource(hyphenationFile.FullName));
//        }

//        /*
//         * Create a hyphenator tree
//         *
//         * @param hyphenationReader the reader of the XML grammar to load from
//         * @return An object representing the hyphenation patterns
//         * @throws Exception
//         */
//        public static HyphenationTree GetHyphenationTree(TextReader hyphenationReader)
//        {
//            InputSource _is = new InputSource(hyphenationReader);
//            // Very old XML parsers (like the JDK 1.4-era Crimson parser in the original
//            // Java code) need a base URL to resolve the DTD, even though the DTD itself
//            // is provided via an EntityResolver, so set a system id here.
//            _is.SetSystemId("urn:java:" + typeof(HyphenationTree).FullName);
//            return GetHyphenationTree(_is);
//        }

//        /*
//         * Create a hyphenator tree
//         *
//         * @param hyphenationSource the InputSource pointing to the XML grammar
//         * @return An object representing the hyphenation patterns
//         * @throws Exception
//         */
//        public static HyphenationTree GetHyphenationTree(InputSource hyphenationSource)
//        {
//            HyphenationTree tree = new HyphenationTree();
//            tree.LoadPatterns(hyphenationSource);
//            return tree;
//        }
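//        // The XML grammar expected by these overloads is a hyphenation pattern file
//        // in the Apache FOP format (for example the per-language pattern files
//        // published by the OFFO project). A minimal sketch, assuming such a file is
//        // available on disk (the path is illustrative only):
//        //
//        //   HyphenationTree tree = GetHyphenationTree(new FileInfo("hyphenation/de_DR.xml"));
//        //   // 'tree' can then be passed to any of the constructors above.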

//        protected override void DecomposeInternal(Token token)
//        {
//            // get the hyphenation points
//            Hyphenation.Hyphenation hyphens = hyphenator.Hyphenate(token.TermBuffer(), 0, token.TermLength(), 1, 1);
//            // No hyphenation points found -> exit
//            if (hyphens == null) {
//                return;
//            }

//            int[] hyp = hyphens.GetHyphenationPoints();
//            char[] lowerCaseTermBuffer = MakeLowerCaseCopy(token.TermBuffer());

//            for (int i = 0; i < hyp.Length; ++i) {
//                int remaining = hyp.Length - i;
//                int start = hyp[i];
//                Token longestMatchToken = null;
//                for (int j = 1; j < remaining; j++) {
//                    int partLength = hyp[i + j] - start;

//                    // if the part is longer than maxSubwordSize we
//                    // are done with this round
//                    if (partLength > this.maxSubwordSize) {
//                        break;
//                    }

//                    // we only put subwords to the token stream
//                    // that are longer than minSubwordSize
//                    if (partLength < this.minSubwordSize) {
//                        continue;
//                    }

//                    // check the dictionary
//                    if (dictionary.Contains(lowerCaseTermBuffer, start, partLength)) {
//                        if (this.onlyLongestMatch) {
//                            if (longestMatchToken != null) {
//                                if (longestMatchToken.TermLength() < partLength) {
//                                    longestMatchToken = CreateToken(start, partLength, token);
//                                }
//                            } else {
//                                longestMatchToken = CreateToken(start, partLength, token);
//                            }
//                        } else {
//                            tokens.AddLast(CreateToken(start, partLength, token));
//                        }
//                    } else if (dictionary.Contains(lowerCaseTermBuffer, start, partLength - 1)) {
//                        // check the dictionary again with a word that is one character
//                        // shorter, to avoid problems with genitive 's characters and
//                        // other binding characters
//                        if (this.onlyLongestMatch) {
//                            if (longestMatchToken != null) {
//                                if (longestMatchToken.TermLength() < partLength - 1) {
//                                    longestMatchToken = CreateToken(start, partLength - 1, token);
//                                }
//                            } else {
//                                longestMatchToken = CreateToken(start, partLength - 1, token);
//                            }
//                        } else {
//                            tokens.AddLast(CreateToken(start, partLength - 1, token));
//                        }
//                    }
//                }
//                if (this.onlyLongestMatch && longestMatchToken != null) {
//                    tokens.AddLast(longestMatchToken);
//                }
//            }
//        }
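//        // Worked example (illustrative values only): if the hyphenator reported
//        // hyphenation points {0, 5, 10, 16} for "donaudampfschiff" and the dictionary
//        // contained "donau", "dampf" and "schiff", the inner loop would try the parts
//        // starting at each point ("donau", "donaudampf", "dampf", "dampfschiff",
//        // "schiff"), skip those outside the min/max subword bounds or missing from
//        // the dictionary, and add "donau", "dampf" and "schiff" to the token queue.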
//    }
//}