Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
FieldCacheTermsFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 
20 using IndexReader = Lucene.Net.Index.IndexReader;
21 using TermDocs = Lucene.Net.Index.TermDocs;
22 using OpenBitSet = Lucene.Net.Util.OpenBitSet;
23 
24 namespace Lucene.Net.Search
25 {
26 
27  /// <summary> A <see cref="Filter" /> that only accepts documents whose single
28  /// term value in the specified field is contained in the
29  /// provided set of allowed terms.
30  ///
31  /// <p/>
32  ///
33  /// This is the same functionality as TermsFilter (from
34  /// contrib/queries), except this filter requires that the
35  /// field contains only a single term for all documents.
36  /// Because of drastically different implementations, they
37  /// also have different performance characteristics, as
38  /// described below.
39  ///
40  /// <p/>
41  ///
42  /// The first invocation of this filter on a given field will
43  /// be slower, since a <see cref="StringIndex" /> must be
44  /// created. Subsequent invocations using the same field
45  /// will re-use this cache. However, as with all
46  /// functionality based on <see cref="FieldCache" />, persistent RAM
47  /// is consumed to hold the cache, and is not freed until the
48  /// <see cref="IndexReader" /> is closed. In contrast, TermsFilter
49  /// has no persistent RAM consumption.
50  ///
51  ///
52  /// <p/>
53  ///
54  /// With each search, this filter translates the specified
55  /// set of Terms into a private <see cref="OpenBitSet" /> keyed by
56  /// term number per unique <see cref="IndexReader" /> (normally one
57  /// reader per segment). Then, during matching, the term
58  /// number for each docID is retrieved from the cache and
59  /// then checked for inclusion using the <see cref="OpenBitSet" />.
60  /// Since all testing is done using RAM resident data
61  /// structures, performance should be very fast, most likely
62  /// fast enough to not require further caching of the
63  /// DocIdSet for each possible combination of terms.
64  /// However, because docIDs are simply scanned linearly, an
65  /// index with a great many small documents may find this
66  /// linear scan too costly.
67  ///
68  /// <p/>
69  ///
70  /// In contrast, TermsFilter builds up an <see cref="OpenBitSet" />,
71  /// keyed by docID, every time it's created, by enumerating
72  /// through all matching docs using <see cref="TermDocs" /> to seek
73  /// and scan through each term's docID list. While there is
74  /// no linear scan of all docIDs, besides the allocation of
75  /// the underlying array in the <see cref="OpenBitSet" />, this
76  /// approach requires a number of "disk seeks" in proportion
77  /// to the number of terms, which can be exceptionally costly
78  /// when there are cache misses in the OS's IO cache.
79  ///
80  /// <p/>
81  ///
82  /// Generally, this filter will be slower on the first
83  /// invocation for a given field, but subsequent invocations,
84  /// even if you change the allowed set of Terms, should be
85  /// faster than TermsFilter, especially as the number of
86  /// Terms being matched increases. If you are matching only
87  /// a very small number of terms, and those terms in turn
88  /// match a very small number of documents, TermsFilter may
89  /// perform faster.
90  ///
91  /// <p/>
92  ///
93  /// Which filter is best is very application dependent.
94  /// </summary>
95 
96  [Serializable]
98  {
99  private readonly string field;
100  private readonly string[] terms;
101 
/// <summary>
/// Creates a filter over <paramref name="field" /> that uses
/// <paramref name="terms" /> as the set of allowed term values.
/// </summary>
/// <param name="field">Name of the field handed to the field cache when a doc id set is built.</param>
/// <param name="terms">Allowed term values; the array is stored as passed, not copied.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="field" /> or <paramref name="terms" /> is <c>null</c>.
/// </exception>
public FieldCacheTermsFilter(string field, params string[] terms)
{
    // Fail at the call site instead of later, when the doc id set
    // enumerates the terms and would hit a NullReferenceException.
    if (field == null)
    {
        throw new ArgumentNullException("field");
    }

    // A params array is still null when the caller passes an explicit null.
    if (terms == null)
    {
        throw new ArgumentNullException("terms");
    }

    this.field = field;
    this.terms = terms;
}
107 
/// <summary>
/// Gets the field cache this filter reads its string index from.
/// This implementation returns <c>FieldCache_Fields.DEFAULT</c>; the
/// property is virtual, so a subclass may supply a different cache.
/// </summary>
public virtual FieldCache FieldCache
{
    get
    {
        return FieldCache_Fields.DEFAULT;
    }
}
112 
/// <summary>
/// Builds the doc id set for <paramref name="reader" />: fetches the string
/// index of this filter's field from <see cref="FieldCache" /> and wraps it,
/// together with this filter, in a <see cref="FieldCacheTermsFilterDocIdSet" />.
/// </summary>
/// <param name="reader">Reader whose string index is requested from the cache.</param>
/// <returns>A new <see cref="FieldCacheTermsFilterDocIdSet" /> for the reader.</returns>
public override DocIdSet GetDocIdSet(IndexReader reader)
{
    StringIndex stringIndex = this.FieldCache.GetStringIndex(reader, this.field);
    return new FieldCacheTermsFilterDocIdSet(this, stringIndex);
}
117 
/// <summary>
/// Doc id set backed by a <see cref="StringIndex" /> and a bit set of accepted
/// term numbers. A document matches when the bit for its entry in the index's
/// <c>order</c> array is set.
/// </summary>
protected internal class FieldCacheTermsFilterDocIdSet : DocIdSet
{
    private readonly FieldCacheTermsFilter enclosingInstance;
    private readonly Lucene.Net.Search.StringIndex fcsi;
    private readonly OpenBitSet openBitSet;

    /// <summary>
    /// Looks up every term of the owning filter in <paramref name="fcsi" /> and
    /// records the term numbers that were found in a bit set sized to the
    /// index's <c>lookup</c> array.
    /// </summary>
    /// <param name="enclosingInstance">The filter that supplies the allowed terms.</param>
    /// <param name="fcsi">String index used to resolve terms and, later, documents.</param>
    public FieldCacheTermsFilterDocIdSet(FieldCacheTermsFilter enclosingInstance, StringIndex fcsi)
    {
        this.enclosingInstance = enclosingInstance;
        this.fcsi = fcsi;
        openBitSet = new OpenBitSet(this.fcsi.lookup.Length);
        foreach (string t in enclosingInstance.terms)
        {
            int termNumber = this.fcsi.BinarySearchLookup(t);

            // Only strictly positive term numbers are recorded.
            // NOTE(review): presumably BinarySearchLookup returns a negative
            // value for an unknown term and slot 0 of lookup stands for
            // "document has no term" - confirm against StringIndex.
            if (termNumber > 0)
            {
                openBitSet.FastSet(termNumber);
            }
        }
    }

    /// <summary>The filter this doc id set was created for.</summary>
    public FieldCacheTermsFilter Enclosing_Instance
    {
        get { return enclosingInstance; }
    }

    /// <summary>Creates a new iterator positioned before the first document.</summary>
    public override DocIdSetIterator Iterator()
    {
        return new FieldCacheTermsFilterDocIdSetIterator(this);
    }

    /// <summary>This DocIdSet implementation is cacheable. </summary>
    public override bool IsCacheable
    {
        get { return true; }
    }

    /// <summary>
    /// Iterator that scans the <c>order</c> array linearly and stops on each
    /// document whose term number is set in the owning doc id set's bit set.
    /// </summary>
    protected internal class FieldCacheTermsFilterDocIdSetIterator : DocIdSetIterator
    {
        private readonly FieldCacheTermsFilterDocIdSet enclosingInstance;

        // -1 until the first NextDoc/Advance call, NO_MORE_DOCS once exhausted.
        private int doc = -1;

        /// <summary>Creates an iterator over <paramref name="enclosingInstance" />.</summary>
        /// <param name="enclosingInstance">Doc id set providing the index and the bit set.</param>
        public FieldCacheTermsFilterDocIdSetIterator(FieldCacheTermsFilterDocIdSet enclosingInstance)
        {
            this.enclosingInstance = enclosingInstance;
        }

        /// <summary>The doc id set this iterator walks.</summary>
        public FieldCacheTermsFilterDocIdSet Enclosing_Instance
        {
            get { return enclosingInstance; }
        }

        /// <summary>Returns the current position without moving the iterator.</summary>
        public override int DocID()
        {
            return doc;
        }

        /// <summary>
        /// Moves to the next matching document after the current position.
        /// </summary>
        /// <returns>The matching doc id, or <c>NO_MORE_DOCS</c> when none is left.</returns>
        public override int NextDoc()
        {
            // An exhausted iterator stays exhausted; returning early also
            // keeps doc + 1 from overflowing when doc is NO_MORE_DOCS.
            if (doc == NO_MORE_DOCS)
            {
                return doc;
            }

            doc = FirstMatchAtOrAfter(doc + 1);
            return doc;
        }

        /// <summary>
        /// Moves to the first matching document at or after <paramref name="target" />.
        /// </summary>
        /// <param name="target">Doc id to start scanning from.</param>
        /// <returns>The matching doc id, or <c>NO_MORE_DOCS</c> when none is left.</returns>
        public override int Advance(int target)
        {
            doc = FirstMatchAtOrAfter(target);
            return doc;
        }

        /// <summary>
        /// Scans forward from <paramref name="start" /> for a document whose term
        /// number is set in the bit set. The end of the array is detected with
        /// an explicit bounds check rather than by catching an exception.
        /// </summary>
        private int FirstMatchAtOrAfter(int start)
        {
            int[] order = enclosingInstance.fcsi.order;
            OpenBitSet acceptedTerms = enclosingInstance.openBitSet;

            // A negative start is outside the array; like a start past the
            // end, it yields NO_MORE_DOCS.
            if (start >= 0)
            {
                for (int candidate = start; candidate < order.Length; candidate++)
                {
                    if (acceptedTerms.FastGet(order[candidate]))
                    {
                        return candidate;
                    }
                }
            }

            return NO_MORE_DOCS;
        }
    }
}
222  }
223 }