Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
WeightedSpanTermExtractor.cs
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Index.Memory;
using Lucene.Net.Search.Spans;
using Lucene.Net.Store;
using Lucene.Net.Support;
using Lucene.Net.Util;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Class used to extract <see cref="WeightedSpanTerm"/>s from a <see cref="Query"/> based on whether
    /// <see cref="Term"/>s from the <see cref="Query"/> are contained in a supplied <see cref="Analysis.TokenStream"/>.
    /// </summary>
    public class WeightedSpanTermExtractor
    {
        private String fieldName;
        private TokenStream tokenStream;
        private IDictionary<String, IndexReader> readers = new HashMap<String, IndexReader>(10);
        private String defaultField;
        private bool expandMultiTermQuery;
        private bool cachedTokenStream;
        private bool wrapToCaching = true;

        public WeightedSpanTermExtractor()
        {
        }

        public WeightedSpanTermExtractor(String defaultField)
        {
            if (defaultField != null)
            {
                this.defaultField = StringHelper.Intern(defaultField);
            }
        }

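        /// <summary>
        /// Closes the per-field readers created by <see cref="GetReaderForField"/>;
        /// IO errors raised while closing are ignored.
        /// </summary>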
        private void CloseReaders()
        {
            ICollection<IndexReader> readerSet = readers.Values;

            foreach (IndexReader reader in readerSet)
            {
                try
                {
                    reader.Close();
                }
                catch (IOException)
                {
                    // alert?
                }
            }
        }

        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>.
        /// </summary>
        /// <param name="query">Query to extract Terms from</param>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        private void Extract(Query query, IDictionary<String, WeightedSpanTerm> terms)
        {
            if (query is BooleanQuery)
            {
                BooleanClause[] queryClauses = ((BooleanQuery) query).GetClauses();

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        Extract(queryClauses[i].Query, terms);
                    }
                }
            }
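            // A PhraseQuery has no span semantics of its own, so it is translated into an
            // equivalent SpanNearQuery; the slop is widened by the largest position increment
            // so phrases that contain position gaps can still be matched.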
            else if (query is PhraseQuery)
            {
                PhraseQuery phraseQuery = ((PhraseQuery) query);
                Term[] phraseQueryTerms = phraseQuery.GetTerms();
                SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }
                int slop = phraseQuery.Slop;
                int[] positions = phraseQuery.GetPositions();
                // add largest position increment to slop
                if (positions.Length > 0)
                {
                    int lastPos = positions[0];
                    int largestInc = 0;
                    int sz = positions.Length;
                    for (int i = 1; i < sz; i++)
                    {
                        int pos = positions[i];
                        int inc = pos - lastPos;
                        if (inc > largestInc)
                        {
                            largestInc = inc;
                        }
                        lastPos = pos;
                    }
                    if (largestInc > 1)
                    {
                        slop += largestInc;
                    }
                }

                bool inorder = slop == 0;

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
                sp.Boost = query.Boost;
                ExtractWeightedSpanTerms(terms, sp);
            }
            else if (query is TermQuery)
            {
                ExtractWeightedTerms(terms, query);
            }
            else if (query is SpanQuery)
            {
                ExtractWeightedSpanTerms(terms, (SpanQuery) query);
            }
            else if (query is FilteredQuery)
            {
                Extract(((FilteredQuery) query).Query, terms);
            }
            else if (query is DisjunctionMaxQuery)
            {
                foreach (var q in ((DisjunctionMaxQuery) query))
                {
                    Extract(q, terms);
                }
            }
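            // Expanded multi-term queries (wildcard, prefix, fuzzy, ...) are first rewritten into a
            // boolean query of concrete terms; the FakeReader is only used to discover which field the
            // query targets so the real rewrite can run against that field's in-memory reader.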
            else if (query is MultiTermQuery && expandMultiTermQuery)
            {
                MultiTermQuery mtq = ((MultiTermQuery) query);
                if (mtq.RewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
                {
                    mtq = (MultiTermQuery) mtq.Clone();
                    mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                    query = mtq;
                }
                FakeReader fReader = new FakeReader();
                MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq);
                if (fReader.Field != null)
                {
                    IndexReader ir = GetReaderForField(fReader.Field);
                    Extract(query.Rewrite(ir), terms);
                }
            }
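            // A MultiPhraseQuery is flattened into a SpanNearQuery whose clauses are SpanOrQuerys,
            // one per distinct position; positions with no terms at all are counted as gaps and
            // added to the slop.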
            else if (query is MultiPhraseQuery)
            {
                MultiPhraseQuery mpq = (MultiPhraseQuery) query;
                IList<Term[]> termArrays = mpq.GetTermArrays();
                int[] positions = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    var disjunctLists = new List<SpanQuery>[maxPosition + 1];
                    int distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[] termArray = termArrays[i];
                        List<SpanQuery> disjuncts = disjunctLists[positions[i]];
                        if (disjuncts == null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length));
                            ++distinctPositions;
                        }
                        for (int j = 0; j < termArray.Length; ++j)
                        {
                            disjuncts.Add(new SpanTermQuery(termArray[j]));
                        }
                    }

                    int positionGaps = 0;
                    int position = 0;
                    SpanQuery[] clauses = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        List<SpanQuery> disjuncts = disjunctLists[i];
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int slop = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    ExtractWeightedSpanTerms(terms, sp);
                }
            }
        }

        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>SpanQuery</c>.
        /// </summary>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        /// <param name="spanQuery">SpanQuery to extract Terms from</param>
        private void ExtractWeightedSpanTerms(IDictionary<String, WeightedSpanTerm> terms, SpanQuery spanQuery)
        {
            HashSet<String> fieldNames;

            if (fieldName == null)
            {
                fieldNames = new HashSet<String>();
                CollectSpanQueryFields(spanQuery, fieldNames);
            }
            else
            {
                fieldNames = new HashSet<String>();
                fieldNames.Add(fieldName);
            }
            // To support the use of the default field name
            if (defaultField != null)
            {
                fieldNames.Add(defaultField);
            }

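            // Span queries that wrap multi-term queries must be rewritten against each field's
            // reader before their concrete terms can be extracted.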
            IDictionary<String, SpanQuery> queries = new HashMap<String, SpanQuery>();

            var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
            bool mustRewriteQuery = MustRewriteQuery(spanQuery);
            if (mustRewriteQuery)
            {
                foreach (String field in fieldNames)
                {
                    SpanQuery rewrittenQuery = (SpanQuery) spanQuery.Rewrite(GetReaderForField(field));
                    queries[field] = rewrittenQuery;
                    rewrittenQuery.ExtractTerms(nonWeightedTerms);
                }
            }
            else
            {
                spanQuery.ExtractTerms(nonWeightedTerms);
            }

            List<PositionSpan> spanPositions = new List<PositionSpan>();

            foreach (String field in fieldNames)
            {
                IndexReader reader = GetReaderForField(field);
                Spans.Spans spans;
                if (mustRewriteQuery)
                {
                    spans = queries[field].GetSpans(reader);
                }
                else
                {
                    spans = spanQuery.GetSpans(reader);
                }

                // collect span positions
                while (spans.Next())
                {
                    spanPositions.Add(new PositionSpan(spans.Start(), spans.End() - 1));
                }
            }

            if (spanPositions.Count == 0)
            {
                // no spans found
                return;
            }

            foreach (Term queryTerm in nonWeightedTerms)
            {
                if (FieldNameComparator(queryTerm.Field))
                {
                    WeightedSpanTerm weightedSpanTerm = terms[queryTerm.Text];

                    if (weightedSpanTerm == null)
                    {
                        weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text);
                        weightedSpanTerm.AddPositionSpans(spanPositions);
                        weightedSpanTerm.SetPositionSensitive(true);
                        terms[queryTerm.Text] = weightedSpanTerm;
                    }
                    else
                    {
                        if (spanPositions.Count > 0)
                        {
                            weightedSpanTerm.AddPositionSpans(spanPositions);
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>.
        /// </summary>
        /// <param name="terms">Map to place created WeightedSpanTerms in</param>
        /// <param name="query">Query to extract Terms from</param>
        private void ExtractWeightedTerms(IDictionary<String, WeightedSpanTerm> terms, Query query)
        {
            var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
            query.ExtractTerms(nonWeightedTerms);

            foreach (Term queryTerm in nonWeightedTerms)
            {
                if (FieldNameComparator(queryTerm.Field))
                {
                    WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.Boost, queryTerm.Text);
                    terms[queryTerm.Text] = weightedSpanTerm;
                }
            }
        }

        /// <summary>
        /// Necessary to implement matches for queries against <c>defaultField</c>.
        /// </summary>
        private bool FieldNameComparator(String fieldNameToCheck)
        {
            bool rv = fieldName == null || fieldNameToCheck == fieldName
                      || fieldNameToCheck == defaultField;
            return rv;
        }

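        /// <summary>
        /// Returns an <c>IndexReader</c> over a single-document <see cref="MemoryIndex"/> built from the
        /// analyzed token stream for the given field, so span and positional queries can be run directly
        /// against the text being highlighted. Unless disabled via <see cref="SetWrapIfNotCachingTokenFilter"/>,
        /// the stream is wrapped in a <see cref="CachingTokenFilter"/> so it can be replayed for each field.
        /// </summary>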
        private IndexReader GetReaderForField(String field)
        {
            if (wrapToCaching && !cachedTokenStream && !(tokenStream is CachingTokenFilter))
            {
                tokenStream = new CachingTokenFilter(tokenStream);
                cachedTokenStream = true;
            }
            IndexReader reader = readers[field];
            if (reader == null)
            {
                MemoryIndex indexer = new MemoryIndex();
                indexer.AddField(field, tokenStream);
                tokenStream.Reset();
                IndexSearcher searcher = indexer.CreateSearcher();
                reader = searcher.IndexReader;
                readers[field] = reader;
            }

            return reader;
        }

        /// <summary>
        /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>.
        /// </summary>
        /// <param name="query">query that caused hit</param>
        /// <param name="tokenStream">TokenStream of text to be highlighted</param>
        /// <returns>Map containing WeightedSpanTerms</returns>
        public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream)
        {
            return GetWeightedSpanTerms(query, tokenStream, null);
        }

        /// <summary>
        /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>.
        /// </summary>
        /// <param name="query">query that caused hit</param>
        /// <param name="tokenStream">TokenStream of text to be highlighted</param>
        /// <param name="fieldName">restricts Terms used based on field name</param>
        /// <returns>Map containing WeightedSpanTerms</returns>
        public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream,
                                                                          String fieldName)
        {
            if (fieldName != null)
            {
                this.fieldName = StringHelper.Intern(fieldName);
            }
            else
            {
                this.fieldName = null;
            }

            IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
            this.tokenStream = tokenStream;
            try
            {
                Extract(query, terms);
            }
            finally
            {
                CloseReaders();
            }

            return terms;
        }

        /// <summary>
        /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>. Uses a supplied
        /// <c>IndexReader</c> to properly weight terms (for gradient highlighting).
        /// </summary>
        /// <param name="query">Query that caused hit</param>
        /// <param name="tokenStream">TokenStream of text to be highlighted</param>
        /// <param name="fieldName">restricts Terms used based on field name</param>
        /// <param name="reader">to use for scoring</param>
        /// <returns>Map of WeightedSpanTerms with quasi tf/idf scores</returns>
        public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
                                                                                    String fieldName, IndexReader reader)
        {
            if (fieldName != null)
            {
                this.fieldName = StringHelper.Intern(fieldName);
            }
            else
            {
                this.fieldName = null;
            }
            this.tokenStream = tokenStream;

            IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
            Extract(query, terms);

            int totalNumDocs = reader.NumDocs();
            var weightedTerms = terms.Keys;

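            // Scale each term's weight by an idf factor computed as in DefaultSimilarity:
            // idf = log(numDocs / (docFreq + 1)) + 1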
            try
            {
                foreach (var wt in weightedTerms)
                {
                    WeightedSpanTerm weightedSpanTerm = terms[wt];
                    int docFreq = reader.DocFreq(new Term(fieldName, weightedSpanTerm.Term));
                    // docFreq counts deletes
                    if (totalNumDocs < docFreq)
                    {
                        docFreq = totalNumDocs;
                    }
                    // IDF algorithm taken from DefaultSimilarity class
                    float idf = (float) (Math.Log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
                    weightedSpanTerm.Weight *= idf;
                }
            }
            finally
            {
                CloseReaders();
            }

            return terms;
        }

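        /// <summary>
        /// Recursively collects the field names referenced by the clauses of a (possibly nested) <c>SpanQuery</c>.
        /// </summary>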
        private void CollectSpanQueryFields(SpanQuery spanQuery, HashSet<String> fieldNames)
        {
            if (spanQuery is FieldMaskingSpanQuery)
            {
                CollectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).MaskedQuery, fieldNames);
            }
            else if (spanQuery is SpanFirstQuery)
            {
                CollectSpanQueryFields(((SpanFirstQuery) spanQuery).Match, fieldNames);
            }
            else if (spanQuery is SpanNearQuery)
            {
                foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
                {
                    CollectSpanQueryFields(clause, fieldNames);
                }
            }
            else if (spanQuery is SpanNotQuery)
            {
                CollectSpanQueryFields(((SpanNotQuery) spanQuery).Include, fieldNames);
            }
            else if (spanQuery is SpanOrQuery)
            {
                foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
                {
                    CollectSpanQueryFields(clause, fieldNames);
                }
            }
            else
            {
                fieldNames.Add(spanQuery.Field);
            }
        }

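        /// <summary>
        /// Returns <c>true</c> if the given (possibly nested) <c>SpanQuery</c> contains a clause that must be
        /// rewritten (e.g. a wrapped multi-term query such as a SpanRegexQuery) before its terms can be extracted.
        /// </summary>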
        private bool MustRewriteQuery(SpanQuery spanQuery)
        {
            if (!expandMultiTermQuery)
            {
                return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
            }
            else if (spanQuery is FieldMaskingSpanQuery)
            {
                return MustRewriteQuery(((FieldMaskingSpanQuery) spanQuery).MaskedQuery);
            }
            else if (spanQuery is SpanFirstQuery)
            {
                return MustRewriteQuery(((SpanFirstQuery) spanQuery).Match);
            }
            else if (spanQuery is SpanNearQuery)
            {
                foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
                {
                    if (MustRewriteQuery(clause))
                    {
                        return true;
                    }
                }
                return false;
            }
            else if (spanQuery is SpanNotQuery)
            {
                SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery;
                return MustRewriteQuery(spanNotQuery.Include) || MustRewriteQuery(spanNotQuery.Exclude);
            }
            else if (spanQuery is SpanOrQuery)
            {
                foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
                {
                    if (MustRewriteQuery(clause))
                    {
                        return true;
                    }
                }
                return false;
            }
            else if (spanQuery is SpanTermQuery)
            {
                return false;
            }
            else
            {
                return true;
            }
        }

        /// <summary>
        /// This class makes sure that if both position sensitive and insensitive
        /// versions of the same term are added, the position insensitive one wins.
        /// </summary>
        /// <typeparam name="K">Type of the map key</typeparam>
        private class PositionCheckingMap<K> : HashMap<K, WeightedSpanTerm>
        {
            public PositionCheckingMap()
            {
            }

            public PositionCheckingMap(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m)
            {
                PutAll(m);
            }

            public void PutAll(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m)
            {
                foreach (var entry in m)
                {
                    Add(entry.Key, entry.Value);
                }
            }

            public override void Add(K key, WeightedSpanTerm value)
            {
                // Capture any existing entry before it is replaced, so a previously
                // position-insensitive term makes the new entry position insensitive as well.
                WeightedSpanTerm prev = this[key];
                base.Add(key, value);

                if (prev == null) return;

                if (!prev.IsPositionSensitive())
                {
                    value.SetPositionSensitive(false);
                }
            }
        }

        public bool ExpandMultiTermQuery
        {
            set { this.expandMultiTermQuery = value; }
            get { return expandMultiTermQuery; }
        }

        public bool IsCachedTokenStream
        {
            get { return cachedTokenStream; }
        }

        public TokenStream TokenStream
        {
            get { return tokenStream; }
        }

        /// <summary>
        /// By default, <see cref="Analysis.TokenStream"/>s that are not of the type
        /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to
        /// ensure an efficient reset - if you are already using a different caching
        /// <see cref="Analysis.TokenStream"/> impl and you don't want it to be wrapped, set this to
        /// false.
        /// </summary>
        public void SetWrapIfNotCachingTokenFilter(bool wrap)
        {
            this.wrapToCaching = wrap;
        }

        /// <summary>
        /// A fake IndexReader class to extract the field from a MultiTermQuery.
        /// </summary>
        protected internal sealed class FakeReader : FilterIndexReader
        {
            private static readonly IndexReader EMPTY_MEMORY_INDEX_READER = new MemoryIndex().CreateSearcher().IndexReader;

            public String Field { get; private set; }

            protected internal FakeReader()
                : base(EMPTY_MEMORY_INDEX_READER)
            {
            }

            public override TermEnum Terms(Term t)
            {
                // only set first fieldname, maybe use a Set?
                if (t != null && Field == null)
                    Field = t.Field;
                return base.Terms(t);
            }
        }
    }
}
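
A minimal usage sketch (not part of the file above): it assumes a hypothetical "contents" field, some sample text, and a StandardAnalyzer, and shows how GetWeightedSpanTerms is typically combined with a TokenStream over the text being highlighted. Field name, text, and query are illustrative only.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Highlight;

class WeightedSpanTermExtractorExample
{
    static void Main()
    {
        // Hypothetical field name and document text, for illustration only.
        const string field = "contents";
        const string text = "Lucene is a full-text search library; Lucene.Net is its C# port.";

        // The query whose terms should be highlighted.
        Query query = new TermQuery(new Term(field, "lucene"));

        // Analyze the text the same way it will be consumed during highlighting.
        Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        TokenStream stream = analyzer.TokenStream(field, new StringReader(text));

        var extractor = new WeightedSpanTermExtractor();
        var terms = extractor.GetWeightedSpanTerms(query, stream, field);

        foreach (var entry in terms)
        {
            Console.WriteLine("{0} (weight {1})", entry.Key, entry.Value.Weight);
        }
    }
}

The returned map is keyed by term text, and each WeightedSpanTerm carries the boost-derived weight the extractor assigned to it.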