docs/3.0.3/_duplicate_filter_8cs_source.html

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *     http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;


using Lucene.Net.Search;

using Lucene.Net.Index;

using Lucene.Net.Util;


namespace Lucene.Net.Search

{

    public class DuplicateFilter : Filter

    {

        String fieldName;


        /*

         * KeepMode determines which document id to consider as the master, all others being

         * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.

         */

        int keepMode = KM_USE_FIRST_OCCURRENCE;

        public static int KM_USE_FIRST_OCCURRENCE = 1;

        public static int KM_USE_LAST_OCCURRENCE = 2;


        /*

         * "Full" processing mode starts by setting all bits to false and only setting bits

         * for documents that contain the given field and are identified as none-duplicates.


         * "Fast" processing sets all bits to true then unsets all duplicate docs found for the

         * given field. This approach avoids the need to read TermDocs for terms that are seen

         * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially

         * faster approach , the downside is that bitsets produced will include bits set for

         * documents that do not actually contain the field given.

         *

         */

        int processingMode = PM_FULL_VALIDATION;

        public static int PM_FULL_VALIDATION = 1;

        public static int PM_FAST_INVALIDATION = 2;


        public DuplicateFilter(String fieldName) :  this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION)

        {

        }


        public DuplicateFilter(String fieldName, int keepMode, int processingMode)

        {

            this.fieldName = fieldName;

            this.keepMode = keepMode;

            this.processingMode = processingMode;

        }


        public override DocIdSet GetDocIdSet(IndexReader reader)

        {

            if (processingMode == PM_FAST_INVALIDATION)

            {

                return FastBits(reader);

            }

            else

            {

                return CorrectBits(reader);

            }

        }


        private OpenBitSet CorrectBits(IndexReader reader)

        {

            OpenBitSet bits = new OpenBitSet(reader.MaxDoc); //assume all are INvalid

            Term startTerm = new Term(fieldName);

            TermEnum te = reader.Terms(startTerm);

            if (te != null)

            {

                Term currTerm = te.Term;

                while ((currTerm != null) && (currTerm.Field == startTerm.Field)) //term fieldnames are interned

                {

                    int lastDoc = -1;

                    //set non duplicates

                    TermDocs td = reader.TermDocs(currTerm);

                    if (td.Next())

                    {

                        if (keepMode == KM_USE_FIRST_OCCURRENCE)

                        {

                            bits.Set(td.Doc);

                        }

                        else

                        {

                            do

                            {

                                lastDoc = td.Doc;

                            } while (td.Next());

                            bits.Set(lastDoc);

                        }

                    }

                    if (!te.Next())

                    {

                        break;

                    }

                    currTerm = te.Term;

                }

            }

            return bits;

        }


        private OpenBitSet FastBits(IndexReader reader)

        {

            OpenBitSet bits = new OpenBitSet(reader.MaxDoc);

            bits.Set(0, reader.MaxDoc); //assume all are valid

            Term startTerm = new Term(fieldName);

            TermEnum te = reader.Terms(startTerm);

            if (te != null)

            {

                Term currTerm = te.Term;


                while ((currTerm != null) && (currTerm.Field == startTerm.Field)) //term fieldnames are interned

                {

                    if (te.DocFreq() > 1)

                    {

                        int lastDoc = -1;

                        //unset potential duplicates

                        TermDocs td = reader.TermDocs(currTerm);

                        td.Next();

                        if (keepMode == KM_USE_FIRST_OCCURRENCE)

                        {

                            td.Next();

                        }

                        do

                        {

                            lastDoc = td.Doc;

                            bits.Clear(lastDoc);

                        } while (td.Next());

                        if (keepMode == KM_USE_LAST_OCCURRENCE)

                        {

                            //restore the last bit

                            bits.Set(lastDoc);

                        }

                    }

                    if (!te.Next())

                    {

                        break;

                    }

                    currTerm = te.Term;

                }

            }

            return bits;

        }


        public string FieldName

        {

            get { return fieldName; }

            set { this.fieldName = value; }

        }


        public int KeepMode

        {

            get { return keepMode; }

            set { this.keepMode = value; }

        }


        public override bool Equals(Object obj)

        {

            if (this == obj)

                return true;

            if ((obj == null) || (obj.GetType()!= this.GetType()))

                return false;

            DuplicateFilter other = (DuplicateFilter)obj;

            return keepMode == other.keepMode &&

            processingMode == other.processingMode &&

                (fieldName == other.fieldName || (fieldName != null && fieldName.Equals(other.fieldName)));

        }


        public override int GetHashCode()

        {

            int hash = 217;

            hash = 31 * hash + keepMode;

            hash = 31 * hash + processingMode;

            hash = 31 * hash + fieldName.GetHashCode();

            return hash;

        }


        public int ProcessingMode

        {

            get { return processingMode; }

            set { this.processingMode = value; }

        }

    }

}