Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
MemoryIndex.cs
Go to the documentation of this file.
1 /*
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements. See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership. The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License. You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied. See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Linq;
26 using System.Text;
27 using Lucene.Net.Analysis;
28 using Lucene.Net.Analysis.Tokenattributes;
29 using Lucene.Net.Documents;
30 using Lucene.Net.Search;
31 using Lucene.Net.Support;
32 
33 namespace Lucene.Net.Index.Memory
34 {
35  /// <summary>
36  /// High-performance single-document main memory Apache Lucene fulltext search index.
37  ///
38  /// <h4>Overview</h4>
39  ///
40  /// This class is a replacement/substitute for a large subset of
41  /// {@link RAMDirectory} functionality. It is designed to
42  /// enable maximum efficiency for on-the-fly matchmaking combining structured and
43  /// fuzzy fulltext search in realtime streaming applications such as Nux XQuery based XML
44  /// message queues, publish-subscribe systems for Blogs/newsfeeds, text chat, data acquisition and
45  /// distribution systems, application level routers, firewalls, classifiers, etc.
46  /// Rather than targeting fulltext search of infrequent queries over huge persistent
47  /// data archives (historic search), this class targets fulltext search of huge
48  /// numbers of queries over comparatively small transient realtime data (prospective
49  /// search).
50  /// For example as in
51  /// <pre>
52  /// float score = search(String text, Query query)
53  /// </pre>
54  /// <p/>
55  /// Each instance can hold at most one Lucene "document", with a document containing
56  /// zero or more "fields", each field having a name and a fulltext value. The
57  /// fulltext value is tokenized (split and transformed) into zero or more index terms
58  /// (aka words) on <c>addField()</c>, according to the policy implemented by an
59  /// Analyzer. For example, Lucene analyzers can split on whitespace, normalize to lower case
60  /// for case insensitivity, ignore common terms with little discriminatory value such as "he", "in", "and" (stop
61  /// words), reduce the terms to their natural linguistic root form such as "fishing"
62  /// being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri
63  /// (upon indexing and/or querying), etc. For details, see
64  /// <a target="_blank" href="http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html">Lucene Analyzer Intro</a>.
65  /// <p/>
66  /// Arbitrary Lucene queries can be run against this class - see <a target="_blank"
67  /// href="../../../../../../../queryparsersyntax.html">Lucene Query Syntax</a>
68  /// as well as <a target="_blank"
69  /// href="http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html">Query Parser Rules</a>.
70  /// Note that a Lucene query selects on the field names and associated (indexed)
71  /// tokenized terms, not on the original fulltext(s) - the latter are not stored
72  /// but rather thrown away immediately after tokenization.
73  /// <p/>
74  /// For some interesting background information on search technology, see Bob Wyman's
75  /// <a target="_blank"
76  /// href="http://bobwyman.pubsub.com/main/2005/05/mary_hodder_poi.html">Prospective Search</a>,
77  /// Jim Gray's
78  /// <a target="_blank" href="http://www.acmqueue.org/modules.php?name=Content&amp;pa=showpage&amp;pid=293&amp;page=4">
79  /// A Call to Arms - Custom subscriptions</a>, and Tim Bray's
80  /// <a target="_blank"
81  /// href="http://www.tbray.org/ongoing/When/200x/2003/07/30/OnSearchTOC">On Search, the Series</a>.
82  ///
83  ///
84  /// <h4>Example Usage</h4>
85  ///
86  /// <pre>
87  /// Analyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER;
88  /// //Analyzer analyzer = new SimpleAnalyzer();
89  /// MemoryIndex index = new MemoryIndex();
90  /// index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals", analyzer);
91  /// index.addField("author", "Tales of James", analyzer);
92  /// QueryParser parser = new QueryParser("content", analyzer);
93  /// float score = index.search(parser.parse("+author:james +salmon~ +fish* manual~"));
94  /// if (score &gt; 0.0f) {
95  /// System.out.println("it's a match");
96  /// } else {
97  /// System.out.println("no match found");
98  /// }
99  /// System.out.println("indexData=" + index.toString());
100  /// </pre>
101  ///
102  ///
103  /// <h4>Example XQuery Usage</h4>
104  ///
105  /// <pre>
106  /// (: An XQuery that finds all books authored by James that have something to do with "salmon fishing manuals", sorted by relevance :)
107  /// declare namespace lucene = "java:nux.xom.pool.FullTextUtil";
108  /// declare variable $query := "+salmon~ +fish* manual~"; (: any arbitrary Lucene query can go here :)
109  ///
110  /// for $book in /books/book[author="James" and lucene:match(abstract, $query) > 0.0]
111  /// let $score := lucene:match($book/abstract, $query)
112  /// order by $score descending
113  /// return $book
114  /// </pre>
115  ///
116  ///
117  /// <h4>No thread safety guarantees</h4>
118  ///
119  /// An instance can be queried multiple times with the same or different queries,
120  /// but an instance is not thread-safe. If desired use idioms such as:
121  /// <pre>
122  /// MemoryIndex index = ...
123  /// synchronized (index) {
124  /// // read and/or write index (i.e. add fields and/or query)
125  /// }
126  /// </pre>
127  ///
128  ///
129  /// <h4>Performance Notes</h4>
130  ///
131  /// Internally there's a new data structure geared towards efficient indexing
132  /// and searching, plus the necessary support code to seamlessly plug into the Lucene
133  /// framework.
134  /// <p/>
135  /// This class performs very well for very small texts (e.g. 10 chars)
136  /// as well as for large texts (e.g. 10 MB) and everything in between.
137  /// Typically, it is about 10-100 times faster than <c>RAMDirectory</c>.
138  /// Note that <c>RAMDirectory</c> has particularly
139  /// large efficiency overheads for small to medium sized texts, both in time and space.
140  /// Indexing a field with N tokens takes O(N) in the best case, and O(N logN) in the worst
141  /// case. Memory consumption is probably larger than for <c>RAMDirectory</c>.
142  /// <p/>
143  /// Example throughput of many simple term queries over a single MemoryIndex:
144  /// ~500000 queries/sec on a MacBook Pro, jdk 1.5.0_06, server VM.
145  /// As always, your mileage may vary.
146  /// <p/>
147  /// If you're curious about
148  /// the whereabouts of bottlenecks, run java 1.5 with the non-perturbing '-server
149  /// -agentlib:hprof=cpu=samples,depth=10' flags, then study the trace log and
150  /// correlate its hotspot trailer with its call stack headers (see <a
151  /// target="_blank" href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html">
152  /// hprof tracing </a>).
153  ///
154  ///</summary>
155  [Serializable]
156  public partial class MemoryIndex
157  {
/* Info for each field: Map<String fieldName, Info field> */
private HashMap<String, Info> fields = new HashMap<String, Info>();

/* Fields sorted ascending by fieldName; lazily computed on demand (invalidated by AddField) */
[NonSerialized] private KeyValuePair<String, Info>[] sortedFields;

/* Entry layout in position lists: pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2].
 * stride is 3 when offsets are stored, otherwise 1 (positions only). */
private int stride;

/* Document-level boost multiplied into each field boost on AddField.
 * Could be made configurable; See {@link Document#setBoost(float)} */
private static float docBoost = 1.0f;

// Kept for serialization compatibility with the Java original.
private static long serialVersionUID = 2782195016849084649L;

// Enables verbose Debug.WriteLine tracing in the nested reader classes.
private static bool DEBUG = false;
173 
/*
 * Constructs an empty instance. Token offsets are not stored (stride == 1).
 */
public MemoryIndex()
    : this(false)
{
}
181 
/*
 * Constructs an empty instance that can optionally store the start and end
 * character offset of each token term in the text. This can be useful for
 * highlighting of hit locations with the Lucene highlighter package.
 * Private until the highlighter package matures, so that this can actually
 * be meaningfully integrated.
 *
 * @param storeOffsets
 *     whether or not to store the start and end character offset of
 *     each token term in the text
 */
private MemoryIndex(bool storeOffsets)
{
    // stride 3 packs (position, startOffset, endOffset) triples; stride 1 stores bare positions.
    this.stride = storeOffsets ? 3 : 1;
}
198 
/// <summary>
/// Convenience method; tokenizes the given field text and adds the resulting
/// terms to the index; equivalent to adding an indexed non-keyword Lucene
/// field that is tokenized, not stored, and carries term vectors with
/// positions (or positions and offsets).
/// </summary>
/// <param name="fieldName">a name to be associated with the text</param>
/// <param name="text">the text to tokenize and index</param>
/// <param name="analyzer">the analyzer to use for tokenization</param>
/// <exception cref="ArgumentNullException">if any argument is null</exception>
public void AddField(String fieldName, String text, Analyzer analyzer)
{
    // ArgumentNullException is the idiomatic type for null arguments; it
    // derives from ArgumentException, so existing catch blocks still work.
    if (fieldName == null)
        throw new ArgumentNullException("fieldName", "fieldName must not be null");
    if (text == null)
        throw new ArgumentNullException("text", "text must not be null");
    if (analyzer == null)
        throw new ArgumentNullException("analyzer", "analyzer must not be null");

    TokenStream stream = analyzer.TokenStream(fieldName, new StringReader(text));

    // the TokenStream overload takes ownership of the stream and closes it
    AddField(fieldName, stream);
}
229 
/// <summary>
/// Convenience method; creates and returns a token stream that generates a
/// token for each keyword in the given collection, "as is", without any
/// transforming text analysis. The resulting token stream can be fed into
/// <c>AddField(String, TokenStream)</c>, perhaps wrapped into another
/// TokenFilter, as desired.
/// </summary>
/// <param name="keywords">the keywords to generate tokens for</param>
/// <returns>the corresponding token stream</returns>
/// <exception cref="ArgumentNullException">if keywords is null</exception>
public TokenStream CreateKeywordTokenStream<T>(ICollection<T> keywords)
{
    // TODO: deprecate & move this method into AnalyzerUtil?
    // ArgumentNullException derives from ArgumentException, so callers
    // catching the old type are unaffected.
    if (keywords == null)
        throw new ArgumentNullException("keywords", "keywords must not be null");

    return new KeywordTokenStream<T>(keywords);
}
250 
/*
 * Equivalent to <c>AddField(fieldName, stream, 1.0f)</c>.
 *
 * @param fieldName
 *     a name to be associated with the text
 * @param stream
 *     the token stream to retrieve tokens from (closed by the callee)
 */
public void AddField(String fieldName, TokenStream stream)
{
    AddField(fieldName, stream, 1.0f);
}
263 
/*
 * Iterates over the given token stream and adds the resulting terms to the index;
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream (even on failure). Note that untokenized keywords can
 * be added with this method via {@link #CreateKeywordTokenStream(Collection)}, the
 * Lucene contrib <c>KeywordTokenizer</c> or similar utilities.
 *
 * @param fieldName
 *     a name to be associated with the text; must not already be present
 * @param stream
 *     the token stream to retrieve tokens from.
 * @param boost
 *     the boost factor for hits for this field; must be > 0
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public void AddField(String fieldName, TokenStream stream, float boost)
{
    try
    {
        if (fieldName == null)
            throw new ArgumentException("fieldName must not be null");
        if (stream == null)
            throw new ArgumentException("token stream must not be null");
        if (boost <= 0.0f)
            throw new ArgumentException("boost factor must be greater than 0.0");
        if (fields[fieldName] != null)
            throw new ArgumentException("field must not be added more than once");

        // per-term position lists accumulated for this field
        var terms = new HashMap<String, ArrayIntList>();
        int numTokens = 0;
        int numOverlapTokens = 0; // tokens with positionIncrement == 0 (stacked tokens)
        int pos = -1;             // running position; first increment brings it to >= 0

        var termAtt = stream.AddAttribute<ITermAttribute>();
        var posIncrAttribute = stream.AddAttribute<IPositionIncrementAttribute>();
        var offsetAtt = stream.AddAttribute<IOffsetAttribute>();

        stream.Reset();
        while (stream.IncrementToken())
        {
            String term = termAtt.Term;
            if (term.Length == 0) continue; // nothing to do
            // if (DEBUG) System.Diagnostics.Debug.WriteLine("token='" + term + "'");
            numTokens++;
            int posIncr = posIncrAttribute.PositionIncrement;
            if (posIncr == 0)
                numOverlapTokens++;
            pos += posIncr;

            ArrayIntList positions = terms[term];
            if (positions == null)
            {
                // term not seen before
                positions = new ArrayIntList(stride);
                terms[term] = positions;
            }
            if (stride == 1)
            {
                // positions only
                positions.Add(pos);
            }
            else
            {
                // position plus start/end character offsets
                positions.Add(pos, offsetAtt.StartOffset, offsetAtt.EndOffset);
            }
        }
        stream.End();

        // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
        if (numTokens > 0)
        {
            boost = boost*docBoost; // see DocumentWriter.addDocument(...)
            fields[fieldName] = new Info(terms, numTokens, numOverlapTokens, boost);
            sortedFields = null; // invalidate sorted view, if any
        }
    }
    catch (IOException e)
    {
        // can never happen
        throw new SystemException(string.Empty, e);
    }
    finally
    {
        // always release the stream, mirroring the Java original's close semantics
        try
        {
            if (stream != null) stream.Close();
        }
        catch (IOException e2)
        {
            throw new SystemException(string.Empty, e2);
        }
    }
}
356 
/*
 * Creates and returns a searcher that can be used to execute arbitrary
 * Lucene queries and to collect the resulting query results as hits.
 *
 * @return a searcher
 */
public IndexSearcher CreateSearcher()
{
    MemoryIndexReader reader = new MemoryIndexReader(this);
    IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!
    reader.SetSearcher(searcher); // to later get hold of searcher.getSimilarity()
    return searcher;
}
371 
/*
 * Convenience method that efficiently returns the relevance score by
 * matching this index against the given Lucene query expression.
 *
 * @param query
 *     an arbitrary Lucene query to run against this index
 * @return the relevance score of the matchmaking; A number in the range
 *     [0.0 .. 1.0], with 0.0 indicating no match. The higher the number
 *     the better the match.
 */
public float Search(Query query)
{
    if (query == null)
        throw new ArgumentException("query must not be null");

    Searcher searcher = CreateSearcher();
    try
    {
        float[] scores = new float[1]; // inits to 0.0f (no match)
        searcher.Search(query, new FillingCollector(scores));
        float score = scores[0];
        return score;
    }
    catch (IOException e)
    {
        // can never happen (RAMDirectory)
        throw new SystemException(string.Empty, e);
    }
    finally
    {
        // searcher.close();
        /*
         * Note that it is harmless and important for good performance to
         * NOT close the index reader!!! This avoids all sorts of
         * unnecessary baggage and locking in the Lucene IndexReader
         * superclass, all of which is completely unnecessary for this main
         * memory index data structure without thread-safety claims.
         *
         * Wishing IndexReader would be an interface...
         *
         * Actually with the new tight createSearcher() API auto-closing is now
         * made impossible, hence searcher.close() would be harmless and also
         * would not degrade performance...
         */
    }
}
420 
/*
 * Returns a reasonable approximation of the main memory [bytes] consumed by
 * this instance. Useful for smart memory-sensitive caches/pools. Assumes
 * fieldNames are interned, whereas tokenized terms are memory-overlaid.
 *
 * @return the main memory consumption
 */
public int GetMemorySize()
{
    // for example usage in a smart cache see nux.xom.pool.Pool
    int PTR = VM.PTR; // platform reference size per the VM helper
    int INT = VM.INT; // platform int size per the VM helper
    int size = 0;
    size += VM.SizeOfObject(2*PTR + INT); // memory index
    if (sortedFields != null) size += VM.SizeOfObjectArray(sortedFields.Length);

    size += VM.SizeOfHashMap(fields.Count);
    foreach (var entry in fields)
    {
        // for each Field Info
        Info info = entry.Value;
        size += VM.SizeOfObject(2*INT + 3*PTR); // Info instance vars
        if (info.SortedTerms != null) size += VM.SizeOfObjectArray(info.SortedTerms.Length);

        int len = info.Terms.Count;
        size += VM.SizeOfHashMap(len);

        // walk the term map manually; len counts down the remaining entries
        var iter2 = info.Terms.GetEnumerator();
        while (--len >= 0)
        {
            iter2.MoveNext();
            // for each term
            KeyValuePair<String, ArrayIntList> e = iter2.Current;
            size += VM.SizeOfObject(PTR + 3*INT); // assumes substring() memory overlay
// size += STR + 2 * ((String) e.getKey()).length();
            ArrayIntList positions = e.Value;
            size += VM.SizeOfArrayIntList(positions.Size());
        }
    }
    return size;
}
462 
/* Number of distinct positions in the list; each entry occupies 'stride' ints. */
private int NumPositions(ArrayIntList positions)
{
    return positions.Size()/stride;
}
467 
/* sorts into ascending order (on demand), reusing memory along the way */
private void SortFields()
{
    // cached until AddField invalidates it by nulling sortedFields
    if (sortedFields == null) sortedFields = Sort(fields);
}
474 
/*
 * Materializes the map's entries into an array and sorts it ascending by
 * key; returns the sorted array (a snapshot view, not live).
 */
private static KeyValuePair<TKey, TValue>[] Sort<TKey, TValue>(HashMap<TKey, TValue> map)
    where TKey : class, IComparable<TKey>
{
    var entries = map.ToArray();

    // single-element (or empty) arrays are already sorted
    if (entries.Length > 1)
    {
        Array.Sort(entries, TermComparer.KeyComparer);
    }

    return entries;
}
487 
/*
 * Returns a String representation of the index data for debugging purposes:
 * per field, each term with its position list, followed by per-field and
 * grand totals (terms, positions, Kchars).
 *
 * @return the string representation
 */
public override String ToString()
{
    StringBuilder result = new StringBuilder(256);
    SortFields();
    int sumChars = 0;
    int sumPositions = 0;
    int sumTerms = 0;

    for (int i = 0; i < sortedFields.Length; i++)
    {
        KeyValuePair<String, Info> entry = sortedFields[i];
        String fieldName = entry.Key;
        Info info = entry.Value;
        info.SortTerms();
        // chained Append calls avoid the intermediate strings the original
        // built via '+' concatenation inside each Append; output is identical
        result.Append(fieldName).Append(":\n");

        int numChars = 0;
        int numPos = 0;
        for (int j = 0; j < info.SortedTerms.Length; j++)
        {
            KeyValuePair<String, ArrayIntList> e = info.SortedTerms[j];
            String term = e.Key;
            ArrayIntList positions = e.Value;
            result.Append("\t'").Append(term).Append("':").Append(NumPositions(positions)).Append(":");
            result.Append(positions.ToString(stride)); // ignore offsets
            result.Append("\n");
            numPos += NumPositions(positions);
            numChars += term.Length;
        }

        result.Append("\tterms=").Append(info.SortedTerms.Length);
        result.Append(", positions=").Append(numPos);
        result.Append(", Kchars=").Append(numChars/1000.0f);
        result.Append("\n");
        sumPositions += numPos;
        sumChars += numChars;
        sumTerms += info.SortedTerms.Length;
    }

    result.Append("\nfields=").Append(sortedFields.Length);
    result.Append(", terms=").Append(sumTerms);
    result.Append(", positions=").Append(sumPositions);
    result.Append(", Kchars=").Append(sumChars/1000.0f);
    return result.ToString();
}
539 
540 
541  ///////////////////////////////////////////////////////////////////////////////
542  // Nested classes:
543  ///////////////////////////////////////////////////////////////////////////////
/*
 * Index data structure for a field; Contains the tokenized term texts and
 * their positions.
 */
[Serializable]
private sealed class Info
{
    // Comparers used for binary searches over the sorted entry arrays.
    public static readonly IComparer<KeyValuePair<string, Info>> InfoComparer = new TermComparer<Info>();
    public static readonly IComparer<KeyValuePair<string, ArrayIntList>> ArrayIntListComparer = new TermComparer<ArrayIntList>();

    /*
     * Term strings and their positions for this field: Map <String
     * termText, ArrayIntList positions>
     */
    private HashMap<String, ArrayIntList> terms;

    /* Terms sorted ascending by term text; computed on demand */
    [NonSerialized] private KeyValuePair<String, ArrayIntList>[] sortedTerms;

    /* Number of added tokens for this field */
    private int numTokens;

    /* Number of overlapping tokens for this field */
    private int numOverlapTokens;

    /* Boost factor for hits for this field */
    private float boost;

    /* Term for this field's fieldName, lazily computed on demand */
    [NonSerialized] public Term template;

    // Kept for serialization compatibility with the Java original.
    private static long serialVersionUID = 2882195016849084649L;

    public Info(HashMap<String, ArrayIntList> terms, int numTokens, int numOverlapTokens, float boost)
    {
        this.terms = terms;
        this.numTokens = numTokens;
        // assign the field directly, consistent with the other members
        // (the original routed this one through the property setter)
        this.numOverlapTokens = numOverlapTokens;
        this.boost = boost;
    }

    public HashMap<string, ArrayIntList> Terms
    {
        get { return terms; }
    }

    public int NumTokens
    {
        get { return numTokens; }
    }

    public int NumOverlapTokens
    {
        get { return numOverlapTokens; }
        set { numOverlapTokens = value; }
    }

    public float Boost
    {
        get { return boost; }
    }

    /* Sorted snapshot of the term map; null until SortTerms() has run. */
    public KeyValuePair<string, ArrayIntList>[] SortedTerms
    {
        get { return sortedTerms; }
    }

    /*
     * Sorts hashed terms into ascending order, reusing memory along the
     * way. Note that sorting is lazily delayed until required (often it's
     * not required at all). If a sorted view is required then hashing +
     * sort + binary search is still faster and smaller than TreeMap usage
     * (which would be an alternative and somewhat more elegant approach,
     * apart from more sophisticated Tries / prefix trees).
     */
    public void SortTerms()
    {
        if (SortedTerms == null) sortedTerms = Sort(Terms);
    }

    /* note that the frequency can be calculated as numPosition(getPositions(x)) */
    public ArrayIntList GetPositions(String term)
    {
        return Terms[term];
    }

    /* note that the frequency can be calculated as numPosition(getPositions(x)) */
    public ArrayIntList GetPositions(int pos)
    {
        return SortedTerms[pos].Value;
    }
}
639 
640 
641  ///////////////////////////////////////////////////////////////////////////////
642  // Nested classes:
643  ///////////////////////////////////////////////////////////////////////////////
/*
 * Efficient resizable auto-expanding list holding <c>int</c> elements;
 * implemented with arrays.
 */
[Serializable]
private sealed class ArrayIntList
{
    private int[] elements;
    private int size = 0;

    // Kept for serialization compatibility with the Java original.
    private static long serialVersionUID = 2282195016849084649L;

    private ArrayIntList()
        : this(10)
    {
    }

    public ArrayIntList(int initialCapacity)
    {
        elements = new int[initialCapacity];
    }

    /* Appends a single element (position-only storage, stride == 1). */
    public void Add(int elem)
    {
        if (size == elements.Length) EnsureCapacity(size + 1);
        elements[size++] = elem;
    }

    /* Appends a (position, startOffset, endOffset) triple (stride == 3). */
    public void Add(int pos, int start, int end)
    {
        if (size + 3 > elements.Length) EnsureCapacity(size + 3);
        elements[size] = pos;
        elements[size + 1] = start;
        elements[size + 2] = end;
        size += 3;
    }

    public int Get(int index)
    {
        // Also reject negative indexes so callers get the descriptive
        // message rather than the bare array IndexOutOfRangeException.
        if (index < 0 || index >= size) ThrowIndex(index);
        return elements[index];
    }

    public int Size()
    {
        return size;
    }

    /* Returns every stride-th element, i.e. the positions without offsets. */
    public int[] ToArray(int stride)
    {
        int[] arr = new int[Size()/stride];
        if (stride == 1)
        {
            Array.Copy(elements, 0, arr, 0, size);
        }
        else
        {
            for (int i = 0, j = 0; j < size; i++, j += stride) arr[i] = elements[j];
        }
        return arr;
    }

    /* Grows the backing array to ~1.5x, or minCapacity if that is larger. */
    private void EnsureCapacity(int minCapacity)
    {
        int newCapacity = Math.Max(minCapacity, (elements.Length*3)/2 + 1);
        int[] newElements = new int[newCapacity];
        Array.Copy(elements, 0, newElements, 0, size);
        elements = newElements;
    }

    private void ThrowIndex(int index)
    {
        throw new IndexOutOfRangeException("index: " + index
            + ", size: " + size);
    }

    /* returns the first few positions (without offsets); debug only */
    public string ToString(int stride)
    {
        int s = Size()/stride;
        int len = Math.Min(10, s); // avoid printing huge lists
        StringBuilder buf = new StringBuilder(4*len);
        buf.Append("[");
        for (int i = 0; i < len; i++)
        {
            buf.Append(Get(i*stride));
            if (i < len - 1) buf.Append(", ");
        }
        if (len != s) buf.Append(", ..."); // and some more...
        buf.Append("]");
        return buf.ToString();
    }
}
741 
742 
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////

/* Sentinel term used by Terms() to begin enumeration at the first term of the first field. */
private static readonly Term MATCH_ALL_TERM = new Term("");

/*
 * Search support for Lucene framework integration; implements all methods
 * required by the Lucene IndexReader contracts.
 */
752 
753  private sealed partial class MemoryIndexReader : IndexReader
754  {
private readonly MemoryIndex _index; // the single-document index this reader exposes

private Searcher searcher; // needed to find searcher.getSimilarity()

internal MemoryIndexReader(MemoryIndex index)
{
    _index = index;
}

/* Looks up the Info for a field name; null if the field was never added. */
private Info GetInfo(String fieldName)
{
    return _index.fields[fieldName];
}

/* Looks up the Info at the given index of the sorted-field view. */
private Info GetInfo(int pos)
{
    return _index.sortedFields[pos].Value;
}
773 
/* With at most one document, document frequency is 0 or 1 depending on whether the term occurs. */
public override int DocFreq(Term term)
{
    Info info = GetInfo(term.Field);
    int freq = 0;
    if (info != null) freq = info.GetPositions(term.Text) != null ? 1 : 0;
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.docFreq: " + term + ", freq:" + freq);
    return freq;
}

/* Enumerates all terms, starting from the very first field/term (see MATCH_ALL_TERM). */
public override TermEnum Terms()
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.terms()");
    return Terms(MATCH_ALL_TERM);
}
788 
/*
 * Returns an enumerator positioned at the given term, or — when the exact
 * term is absent — at its successor (next greater field/term), matching the
 * IndexReader.Terms(Term) contract.
 */
public override TermEnum Terms(Term term)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.terms: " + term);

    int i; // index into info.sortedTerms
    int j; // index into sortedFields

    _index.SortFields();
    if (_index.sortedFields.Length == 1 && _index.sortedFields[0].Key == term.Field)
    {
        j = 0; // fast path
    }
    else
    {
        // Array.BinarySearch returns the bitwise complement of the insertion
        // point when the key is not found, hence the j < 0 handling below
        j = Array.BinarySearch(_index.sortedFields, new KeyValuePair<string, Info>(term.Field, null), Info.InfoComparer);
    }

    if (j < 0)
    {
        // not found; choose successor
        j = -j - 1;
        i = 0;
        if (j < _index.sortedFields.Length) GetInfo(j).SortTerms();
    }
    else
    {
        // found
        Info info = GetInfo(j);
        info.SortTerms();
        i = Array.BinarySearch(info.SortedTerms, new KeyValuePair<string, ArrayIntList>(term.Text, null), Info.ArrayIntListComparer);
        if (i < 0)
        {
            // not found; choose successor
            i = -i - 1;
            if (i >= info.SortedTerms.Length)
            {
                // move to next successor
                j++;
                i = 0;
                if (j < _index.sortedFields.Length) GetInfo(j).SortTerms();
            }
        }
    }
    int ix = i;
    int jx = j;

    return new MemoryTermEnum(_index, this, ix, jx);
}
837 
public override TermPositions TermPositions()
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.termPositions");

    return new MemoryTermPositions(_index, this);
}

/* Served by the positions implementation — a TermPositions is a TermDocs. */
public override TermDocs TermDocs()
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.termDocs");
    return TermPositions();
}
851 
/* One term-frequency vector per field of the (single) document. */
public override ITermFreqVector[] GetTermFreqVectors(int docNumber)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.getTermFreqVectors");
    // This is okay, ToArray() is as optimized as writing it by hand
    return _index.fields.Keys.Select(k => GetTermFreqVector(docNumber, k)).ToArray();
}

/* Feeds every field's term vector through the given mapper. */
public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.getTermFreqVectors");

    // if (vectors.length == 0) return null;
    foreach (String fieldName in _index.fields.Keys)
    {
        GetTermFreqVector(docNumber, fieldName, mapper);
    }
}
869 
/* Feeds one field's term vector (terms, frequencies, offsets, positions) through the mapper. */
public override void GetTermFreqVector(int docNumber, String field, TermVectorMapper mapper)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.getTermFreqVector");
    Info info = GetInfo(field);
    if (info == null)
    {
        // unknown field: nothing to map
        return;
    }
    info.SortTerms();
    mapper.SetExpectations(field, info.SortedTerms.Length, _index.stride != 1, true);
    for (int i = info.SortedTerms.Length; --i >= 0;)
    {

        ArrayIntList positions = info.SortedTerms[i].Value;
        int size = positions.Size();
        var offsets = new TermVectorOffsetInfo[size/_index.stride];

        // NOTE(review): offsets exist only when stride == 3; with stride == 1 this
        // loop reads positions.Get(j + 1) past the last element for multi-position
        // terms — it appears to assume stride == 3 here. TODO confirm with callers.
        for (int k = 0, j = 1; j < size; k++, j += _index.stride)
        {
            int start = positions.Get(j);
            int end = positions.Get(j + 1);
            offsets[k] = new TermVectorOffsetInfo(start, end);
        }
        mapper.Map(info.SortedTerms[i].Key, _index.NumPositions(info.SortedTerms[i].Value), offsets,
            (info.SortedTerms[i].Value).ToArray(_index.stride));
    }
}
897 
public override ITermFreqVector GetTermFreqVector(int docNumber, String fieldName)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.getTermFreqVector");
    Info info = GetInfo(fieldName);
    if (info == null) return null; // TODO: or return empty vector impl???
    info.SortTerms();

    return new MemoryTermPositionVector(_index, info, fieldName);
}

/* Similarity from the attached searcher if present, otherwise the global default. */
private Similarity GetSimilarity()
{
    if (searcher != null) return searcher.Similarity;
    return Similarity.Default;
}

/* Wired up by MemoryIndex.CreateSearcher so GetSimilarity can see the searcher's Similarity. */
internal void SetSearcher(Searcher searcher)
{
    this.searcher = searcher;
}
918 
/* performance hack: cache norms to avoid repeated expensive calculations */
private byte[] cachedNorms;
private String cachedFieldName;
private Similarity cachedSimilarity;

/* Computes (and caches) the single-entry norm array for the given field. */
public override byte[] Norms(String fieldName)
{
    byte[] norms = cachedNorms;
    Similarity sim = GetSimilarity();
    // recompute when either the field or the effective Similarity changed
    if (fieldName != cachedFieldName || sim != cachedSimilarity)
    {
        // not cached?
        Info info = GetInfo(fieldName);
        // a field that was never added contributes neutral stats
        int numTokens = info != null ? info.NumTokens : 0;
        int numOverlapTokens = info != null ? info.NumOverlapTokens : 0;
        float boost = info != null ? info.Boost : 1.0f;
        FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
        float n = sim.ComputeNorm(fieldName, invertState);
        byte norm = Similarity.EncodeNorm(n);
        norms = new byte[] {norm};

        // cache it for future reuse
        cachedNorms = norms;
        cachedFieldName = fieldName;
        cachedSimilarity = sim;
        if (DEBUG)
            System.Diagnostics.Debug.WriteLine("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" +
                norm + ":" + numTokens);
    }
    return norms;
}
950 
/// <summary>
/// Copies the norms for <c>fieldName</c> into the caller-supplied buffer,
/// starting at <c>offset</c>. Delegates to the single-argument overload,
/// which serves cached values when possible.
/// </summary>
public override void Norms(String fieldName, byte[] bytes, int offset)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.norms*: " + fieldName);

    byte[] norms = Norms(fieldName);
    Array.Copy(norms, 0, bytes, offset, norms.Length);
}
957 
// This reader is a read-only snapshot; mutating norms is not supported.
protected override void DoSetNorm(int doc, String fieldName, byte value)
{
    throw new NotSupportedException();
}
962 
// The index models at most one document; it counts as present as soon as
// any field has been added.
public override int NumDocs()
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.numDocs");

    if (_index.fields.Count > 0)
    {
        return 1;
    }
    return 0;
}
968 
// One past the largest possible doc number: always 1, since a MemoryIndex
// holds exactly one (virtual) document.
public override int MaxDoc
{
    get
    {
        if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.maxDoc");
        return 1;
    }
}
977 
// MemoryIndex does not store fields, so document retrieval always yields a
// fresh, empty Document.
public override Document Document(int n)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.document");
    return new Document(); // there are no stored fields
}
983 
// The fieldSelector is irrelevant here: with no stored fields, there is
// nothing for the selector to choose from.
public override Document Document(int n, FieldSelector fieldSelector)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.document");
    return new Document(); // there are no stored fields
}
990 
// Deletions are unsupported on this reader, so no doc number is ever
// reported as deleted.
public override bool IsDeleted(int n)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.isDeleted");
    return false;
}
996 
// Deletions are unsupported, hence there can never be any.
public override bool HasDeletions
{
    get
    {
        if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.hasDeletions");
        return false;
    }
}
1005 
// This reader is read-only; deleting its single document is not supported.
protected override void DoDelete(int docNum)
{
    throw new NotSupportedException();
}
1010 
// Deletion is unsupported here, so undeleting is as well.
protected override void DoUndeleteAll()
{
    throw new NotSupportedException();
}
1015 
// Nothing is buffered by this reader, so commit is a debug-logged no-op.
protected override void DoCommit(IDictionary<String, String> commitUserData)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.doCommit");

}
1021 
// Holds no closable resources; closing is a debug-logged no-op.
protected override void DoClose()
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.doClose");
}
1026 
// lucene >= 1.9 (remove this method for lucene-1.4.3)
/// <summary>
/// Returns the field names matching the given option. Every field in a
/// MemoryIndex is indexed and carries term vectors with positions, and
/// offsets are recorded only when stride != 1 — so the unindexed,
/// no-term-vector, and (at stride 1) offset-bearing categories are empty,
/// while any other option yields all field names.
/// </summary>
public override ICollection<String> GetFieldNames(FieldOption fieldOption)
{
    if (DEBUG) System.Diagnostics.Debug.WriteLine("MemoryIndexReader.getFieldNamesOption");

    bool noOffsets = _index.stride == 1;
    bool empty =
        fieldOption == FieldOption.UNINDEXED
        || fieldOption == FieldOption.INDEXED_NO_TERMVECTOR
        || (noOffsets && fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET)
        || (noOffsets && fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET);

    if (empty)
    {
        return CollectionsHelper<string>.EmptyList();
    }

    return _index.fields.Keys.AsReadOnly();
}
1042  }
1043 
1044 
1045  ///////////////////////////////////////////////////////////////////////////////
1046  // Nested classes:
1047  ///////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Rough memory-footprint estimation helpers: computes the approximate number
/// of bytes that common heap objects (arrays, strings, hash maps, ...) occupy,
/// by rounding payload sizes up to pointer-size boundaries. The object layout
/// model is carried over from the Java original of this class.
/// </summary>
static class VM
{
    // Machine pointer width in bytes on the current runtime.
    public static readonly int PTR = Is64BitVM() ? 8 : 4;

    // Bytes occupied by primitive data types.
    public static readonly int BOOLEAN = 1;
    public static readonly int BYTE = 1;
    public static readonly int CHAR = 2;
    public static readonly int SHORT = 2;
    public static readonly int INT = 4;
    public static readonly int LONG = 8;
    public static readonly int FLOAT = 4;
    public static readonly int DOUBLE = 8;

    // log2(PTR); used to round sizes up to the next pointer boundary.
    // NOTE: must stay declared after PTR — static field initializers run
    // in textual order.
    private static readonly int LOG_PTR = (int) Math.Round(Log2(PTR));

    /*
     * Header prepended to any heap-allocated object:
     * pointer to type info plus monitor/GC/hash bookkeeping
     * (layout model inherited from the Java implementation).
     */
    private static readonly int OBJECT_HEADER = 2*PTR;

    // Rounds n up to the next multiple of the pointer size; assumes n > 0.
    // On a 64 bit VM: 0 --> 0*PTR, 1..8 --> 1*PTR, 9..16 --> 2*PTR.
    private static int SizeOf(int n)
    {
        return (((n - 1) >> LOG_PTR) + 1) << LOG_PTR;
    }

    // Estimated footprint of a heap object carrying n bytes of payload.
    public static int SizeOfObject(int n)
    {
        return SizeOf(OBJECT_HEADER + n);
    }

    // Estimated footprint of a reference array of the given length.
    public static int SizeOfObjectArray(int len)
    {
        return SizeOfObject(INT + PTR*len);
    }

    // Estimated footprint of a char array of the given length.
    public static int SizeOfCharArray(int len)
    {
        return SizeOfObject(INT + CHAR*len);
    }

    // Estimated footprint of an int array of the given length.
    public static int SizeOfIntArray(int len)
    {
        return SizeOfObject(INT + INT*len);
    }

    // Estimated footprint of a string of the given length, including its
    // backing char array.
    public static int SizeOfString(int len)
    {
        return SizeOfObject(3*INT + PTR) + SizeOfCharArray(len);
    }

    // Estimated footprint of a hash map with len entries; the keys and
    // values themselves are not included.
    public static int SizeOfHashMap(int len)
    {
        return SizeOfObject(4*PTR + 4*INT) + SizeOfObjectArray(len)
               + len*SizeOfObject(3*PTR + INT); // entries
    }

    // Estimated footprint of an array list backbone; does not include
    // referenced objects.
    public static int SizeOfArrayList(int len)
    {
        return SizeOfObject(PTR + 2*INT) + SizeOfIntArray(0) - SizeOfIntArray(0) + SizeOfObjectArray(len);
    }

    // Estimated footprint of an ArrayIntList with capacity len.
    public static int SizeOfArrayIntList(int len)
    {
        return SizeOfObject(PTR + INT) + SizeOfIntArray(len);
    }

    // True when running on a 64 bit CLR.
    private static bool Is64BitVM()
    {
        return IntPtr.Size == 8;
    }

    // Logarithm to the base 2. Example: Log2(4) == 2, Log2(8) == 3.
    private static double Log2(double value)
    {
        return Math.Log(value, 2);
    }
}
1136 
1137  }
1138 }