Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
QueryTermExtractor.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.IO;
21 using System.Linq;
22 using Lucene.Net.Index;
23 using Lucene.Net.Util;
24 
namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Utility class used to extract the terms used in a query, plus any weights.
    /// This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes,
    /// so the caller must pass a rewritten query (see Query.Rewrite) to obtain a list of
    /// expanded terms.
    /// </summary>
    public static class QueryTermExtractor
    {
        /// <summary>
        /// Extracts all term texts of a given Query into an array of WeightedTerms.
        /// Terms from all fields are returned; terms that only occur in prohibited
        /// (<c>MUST_NOT</c>) clauses are skipped.
        /// </summary>
        /// <param name="query">Query to extract term texts from</param>
        /// <returns>an array of the terms used in a query, plus their weights.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="query"/> is <c>null</c>.</exception>
        public static WeightedTerm[] GetTerms(Query query)
        {
            return GetTerms(query, false);
        }

        /// <summary>
        /// Extracts all term texts of a given Query into an array of WeightedTerms and
        /// multiplies each term's weight by its Inverse Document Frequency (IDF),
        /// computed as <c>log(numDocs / (docFreq + 1)) + 1</c>.
        /// </summary>
        /// <param name="query">Query to extract term texts from</param>
        /// <param name="reader">used to compute IDF which can be used to a) score selected fragments better
        /// b) use graded highlights, e.g. changing intensity of font color</param>
        /// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based</param>
        /// <returns>an array of the terms used in a query, plus their weights.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="query"/> or <paramref name="reader"/> is <c>null</c>.
        /// </exception>
        public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, string fieldName)
        {
            if (query == null)
            {
                throw new ArgumentNullException("query");
            }
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            WeightedTerm[] terms = GetTerms(query, false, fieldName);
            int totalNumDocs = reader.NumDocs();
            foreach (WeightedTerm t in terms)
            {
                try
                {
                    // docFreq counts deletes, so it can exceed the live document count; clamp it.
                    int docFreq = Math.Min(reader.DocFreq(new Term(fieldName, t.Term)), totalNumDocs);

                    // IDF algorithm taken from DefaultSimilarity class. The division is carried
                    // out in double so that large document counts are not rounded through float.
                    var idf = (float)(Math.Log(totalNumDocs / (double)(docFreq + 1)) + 1.0);
                    t.Weight *= idf;
                }
                catch (IOException)
                {
                    // Deliberately best-effort: if the document frequency cannot be read,
                    // the term simply keeps its un-scaled weight.
                }
            }
            return terms;
        }

        /// <summary>Extracts all term texts of a given Query into an array of WeightedTerms.</summary>
        /// <param name="query">Query to extract term texts from</param>
        /// <param name="prohibited"><c>true</c> to extract "prohibited" terms, too</param>
        /// <param name="fieldName">The fieldName used to filter query terms; <c>null</c> disables the filter</param>
        /// <returns>an array of the terms used in a query, plus their weights.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="query"/> is <c>null</c>.</exception>
        public static WeightedTerm[] GetTerms(Query query, bool prohibited, string fieldName)
        {
            if (query == null)
            {
                throw new ArgumentNullException("query");
            }

            var terms = new HashSet<WeightedTerm>();
            if (fieldName != null)
            {
                // The recursive helper expects an interned field name.
                fieldName = StringHelper.Intern(fieldName);
            }
            GetTerms(query, terms, prohibited, fieldName);
            return terms.ToArray();
        }

        /// <summary>
        /// Extracts all term texts of a given Query into an array of WeightedTerms,
        /// without filtering by field.
        /// </summary>
        /// <param name="query">Query to extract term texts from</param>
        /// <param name="prohibited"><c>true</c> to extract "prohibited" terms, too</param>
        /// <returns>an array of the terms used in a query, plus their weights.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="query"/> is <c>null</c>.</exception>
        public static WeightedTerm[] GetTerms(Query query, bool prohibited)
        {
            return GetTerms(query, prohibited, null);
        }

        /// <summary>
        /// Recursively collects the weighted terms of <paramref name="query"/> into
        /// <paramref name="terms"/>. BooleanQuery and FilteredQuery are unwrapped so that each
        /// leaf query contributes its own boost; any other query is asked for its terms
        /// via ExtractTerms and every term receives that query's boost.
        /// </summary>
        /// <param name="query">Query to inspect</param>
        /// <param name="terms">Set that receives the extracted terms</param>
        /// <param name="prohibited"><c>true</c> to descend into prohibited clauses, too</param>
        /// <param name="fieldName">Field to keep terms for, or <c>null</c> to keep all fields.
        /// The fieldName MUST be interned prior to this call.</param>
        private static void GetTerms(Query query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName)
        {
            try
            {
                var booleanQuery = query as BooleanQuery;
                if (booleanQuery != null)
                {
                    GetTermsFromBooleanQuery(booleanQuery, terms, prohibited, fieldName);
                    return;
                }

                var filteredQuery = query as FilteredQuery;
                if (filteredQuery != null)
                {
                    GetTermsFromFilteredQuery(filteredQuery, terms, prohibited, fieldName);
                    return;
                }

                var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>();
                query.ExtractTerms(nonWeightedTerms);
                foreach (var term in nonWeightedTerms)
                {
                    if ((fieldName == null) || (term.Field == fieldName))
                    {
                        terms.Add(new WeightedTerm(query.Boost, term.Text));
                    }
                }
            }
            catch (NotSupportedException)
            {
                // This is non-fatal for our purposes: a query that cannot report its
                // terms simply contributes nothing.
            }
        }

        /// <summary>
        /// ExtractTerms is currently the only query-independent means of introspecting queries but it only reveals
        /// a list of terms for that query - not the boosts each individual term in that query may or may not have.
        /// "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
        /// in each child element.
        /// Some discussion around this topic here:
        /// http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
        /// Unfortunately there seemed to be limited interest in requiring all Query objects to implement
        /// something common which would allow access to child queries so what follows here are query-specific
        /// implementations for accessing embedded query elements.
        /// </summary>
        /// <param name="query">Boolean query whose clauses are visited</param>
        /// <param name="terms">Set that receives the extracted terms</param>
        /// <param name="prohibited"><c>true</c> to also visit <c>MUST_NOT</c> clauses</param>
        /// <param name="fieldName">Interned field name used to filter terms, or <c>null</c></param>
        private static void GetTermsFromBooleanQuery(BooleanQuery query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName)
        {
            foreach (BooleanClause clause in query.GetClauses())
            {
                if (prohibited || clause.Occur != Occur.MUST_NOT)
                {
                    GetTerms(clause.Query, terms, prohibited, fieldName);
                }
            }
        }

        /// <summary>
        /// Collects the terms of the query wrapped by a FilteredQuery; the filter itself
        /// is not inspected.
        /// </summary>
        /// <param name="query">Filtered query to unwrap</param>
        /// <param name="terms">Set that receives the extracted terms</param>
        /// <param name="prohibited"><c>true</c> to extract "prohibited" terms, too</param>
        /// <param name="fieldName">Interned field name used to filter terms, or <c>null</c></param>
        private static void GetTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName)
        {
            GetTerms(query.Query, terms, prohibited, fieldName);
        }
    }
}