Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
DuplicateFilter.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.Linq;
21 using System.Text;
22 
23 using Lucene.Net.Search;
24 using Lucene.Net.Index;
25 using Lucene.Net.Util;
26 
27 namespace Lucene.Net.Search
28 {
/// <summary>
/// Filter that, for every term of a given field, lets through only one of the
/// documents containing that term and treats the remaining ones as duplicates.
/// </summary>
public class DuplicateFilter : Filter
{
    /*
     * KeepMode determines which document id to consider as the master, all others being
     * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
     *
     * NOTE: these are mutable public statics rather than constants; they are left that
     * way on purpose so existing callers keep compiling and linking unchanged.
     */
    public static int KM_USE_FIRST_OCCURRENCE = 1;
    public static int KM_USE_LAST_OCCURRENCE = 2;

    /*
     * "Full" processing mode starts by setting all bits to false and only setting bits
     * for documents that contain the given field and are identified as non-duplicates.
     *
     * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
     * given field. This approach avoids the need to read TermDocs for terms that are seen
     * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
     * faster approach, the downside is that bitsets produced will include bits set for
     * documents that do not actually contain the field given.
     */
    public static int PM_FULL_VALIDATION = 1;
    public static int PM_FAST_INVALIDATION = 2;

    private string fieldName;
    private int keepMode = KM_USE_FIRST_OCCURRENCE;
    private int processingMode = PM_FULL_VALIDATION;

    /// <summary>
    /// Creates a filter on <paramref name="fieldName"/> that keeps the last occurrence
    /// of each term and uses full validation.
    /// </summary>
    /// <param name="fieldName">Name of the field whose terms identify duplicates.</param>
    public DuplicateFilter(String fieldName) : this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION)
    {
    }

    /// <summary>
    /// Creates a filter with an explicit keep mode and processing mode.
    /// </summary>
    /// <param name="fieldName">Name of the field whose terms identify duplicates.</param>
    /// <param name="keepMode">
    /// <see cref="KM_USE_FIRST_OCCURRENCE"/> or <see cref="KM_USE_LAST_OCCURRENCE"/>.
    /// </param>
    /// <param name="processingMode">
    /// <see cref="PM_FULL_VALIDATION"/> or <see cref="PM_FAST_INVALIDATION"/>.
    /// </param>
    public DuplicateFilter(String fieldName, int keepMode, int processingMode)
    {
        this.fieldName = fieldName;
        this.keepMode = keepMode;
        this.processingMode = processingMode;
    }

    /// <summary>
    /// Builds the set of accepted documents for <paramref name="reader"/>, using the
    /// fast path when the processing mode is <see cref="PM_FAST_INVALIDATION"/> and
    /// the full path for any other value.
    /// </summary>
    /// <param name="reader">Index reader to evaluate.</param>
    /// <returns>A bit set with one bit set per accepted document.</returns>
    public override DocIdSet GetDocIdSet(IndexReader reader)
    {
        if (processingMode == PM_FAST_INVALIDATION)
        {
            return FastBits(reader);
        }

        return CorrectBits(reader);
    }

    /// <summary>
    /// Full validation: starts from an empty bit set and, for each term of the field,
    /// sets the bit of the first or last document listed for it (per the keep mode).
    /// </summary>
    private OpenBitSet CorrectBits(IndexReader reader)
    {
        OpenBitSet bits = new OpenBitSet(reader.MaxDoc); // assume all are INvalid
        Term startTerm = new Term(fieldName);
        TermEnum te = reader.Terms(startTerm);
        if (te == null)
        {
            return bits;
        }

        // Dispose the enumerator even if reading the index throws.
        using (te)
        {
            Term currTerm = te.Term;
            while ((currTerm != null) && (currTerm.Field == startTerm.Field))
            {
                // set non duplicates
                using (TermDocs td = reader.TermDocs(currTerm))
                {
                    if (td.Next())
                    {
                        if (keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            bits.Set(td.Doc);
                        }
                        else
                        {
                            // Walk to the end of the postings; the last doc seen wins.
                            int lastDoc;
                            do
                            {
                                lastDoc = td.Doc;
                            } while (td.Next());
                            bits.Set(lastDoc);
                        }
                    }
                }

                if (!te.Next())
                {
                    break;
                }
                currTerm = te.Term;
            }
        }

        return bits;
    }

    /// <summary>
    /// Fast invalidation: starts with every bit set and only reads postings for terms
    /// whose document frequency is greater than one, clearing the duplicates.
    /// </summary>
    private OpenBitSet FastBits(IndexReader reader)
    {
        OpenBitSet bits = new OpenBitSet(reader.MaxDoc);
        bits.Set(0, reader.MaxDoc); // assume all are valid
        Term startTerm = new Term(fieldName);
        TermEnum te = reader.Terms(startTerm);
        if (te == null)
        {
            return bits;
        }

        // Dispose the enumerator even if reading the index throws.
        using (te)
        {
            Term currTerm = te.Term;
            while ((currTerm != null) && (currTerm.Field == startTerm.Field))
            {
                if (te.DocFreq() > 1)
                {
                    // unset potential duplicates
                    using (TermDocs td = reader.TermDocs(currTerm))
                    {
                        // DocFreq() > 1 is not treated as a guarantee that the postings
                        // yield that many documents (presumably deleted docs are still
                        // counted), so every Next() result is checked before Doc is read.
                        bool positioned = td.Next();
                        if (positioned && keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            // Step past the first occurrence so its bit stays set.
                            positioned = td.Next();
                        }

                        if (positioned)
                        {
                            int lastDoc;
                            do
                            {
                                lastDoc = td.Doc;
                                bits.Clear(lastDoc);
                            } while (td.Next());

                            if (keepMode == KM_USE_LAST_OCCURRENCE)
                            {
                                // restore the last bit
                                bits.Set(lastDoc);
                            }
                        }
                    }
                }

                if (!te.Next())
                {
                    break;
                }
                currTerm = te.Term;
            }
        }

        return bits;
    }

    /// <summary>Name of the field whose terms identify duplicates.</summary>
    public string FieldName
    {
        get { return fieldName; }
        set { this.fieldName = value; }
    }

    /// <summary>
    /// Which occurrence to keep: <see cref="KM_USE_FIRST_OCCURRENCE"/> or
    /// <see cref="KM_USE_LAST_OCCURRENCE"/>.
    /// </summary>
    public int KeepMode
    {
        get { return keepMode; }
        set { this.keepMode = value; }
    }

    /// <summary>
    /// Two filters are equal when they have the same runtime type, keep mode,
    /// processing mode and field name (a null field name only equals null).
    /// </summary>
    public override bool Equals(Object obj)
    {
        if (this == obj)
        {
            return true;
        }
        if ((obj == null) || (obj.GetType() != this.GetType()))
        {
            return false;
        }

        DuplicateFilter other = (DuplicateFilter)obj;
        return keepMode == other.keepMode &&
               processingMode == other.processingMode &&
               string.Equals(fieldName, other.fieldName);
    }

    /// <summary>
    /// Hash code combining keep mode, processing mode and field name, consistent
    /// with <see cref="Equals(Object)"/>. A null field name contributes zero.
    /// </summary>
    public override int GetHashCode()
    {
        // unchecked: wrap-around is intended; avoids OverflowException in checked builds.
        unchecked
        {
            int hash = 217;
            hash = 31 * hash + keepMode;
            hash = 31 * hash + processingMode;
            hash = 31 * hash + (fieldName == null ? 0 : fieldName.GetHashCode());
            return hash;
        }
    }

    /// <summary>
    /// How the bit set is built: <see cref="PM_FULL_VALIDATION"/> or
    /// <see cref="PM_FAST_INVALIDATION"/>.
    /// </summary>
    public int ProcessingMode
    {
        get { return processingMode; }
        set { this.processingMode = value; }
    }
}
202 }