Lucene.Net 3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
SegmentMerger.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using Document = Lucene.Net.Documents.Document;
using FieldSelector = Lucene.Net.Documents.FieldSelector;
using FieldSelectorResult = Lucene.Net.Documents.FieldSelectorResult;
using FieldOption = Lucene.Net.Index.IndexReader.FieldOption;
using MergeAbortedException = Lucene.Net.Index.MergePolicy.MergeAbortedException;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;
using IndexOutput = Lucene.Net.Store.IndexOutput;

namespace Lucene.Net.Index
{
    /// <summary> The SegmentMerger class combines two or more Segments, each represented by an
    /// IndexReader (see <see cref="Add" />), into a single Segment. After adding the appropriate
    /// readers, call the merge method to combine the segments.
    /// <p/>
    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
    /// </summary>
    /// <seealso cref="Merge()">
    /// </seealso>
    /// <seealso cref="Add">
    /// </seealso>
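    /// <example>
    /// A minimal usage sketch (an illustrative addition, not part of the original source; it assumes
    /// an existing destination <c>Directory</c>, two already-open <c>IndexReader</c>s, and
    /// same-assembly access for internal members):
    /// <code>
    /// SegmentMerger merger = new SegmentMerger(destDir, "newSegmentName");
    /// merger.Add(readerA);                 // register the segments to merge
    /// merger.Add(readerB);
    /// int mergedDocCount = merger.Merge(); // writes the merged segment files into destDir
    /// merger.CloseReaders();               // dispose the source readers once merging is done
    /// </code>
    /// </example>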
    public sealed class SegmentMerger
    {
        private class AnonymousClassCheckAbort : CheckAbort
        {
            private void InitBlock(SegmentMerger enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }
            private SegmentMerger enclosingInstance;
            public SegmentMerger Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassCheckAbort(SegmentMerger enclosingInstance, Lucene.Net.Index.MergePolicy.OneMerge Param1, Lucene.Net.Store.Directory Param2) : base(Param1, Param2)
            {
                InitBlock(enclosingInstance);
            }
            public override void Work(double units)
            {
                // do nothing
            }
        }
        private class AnonymousClassCheckAbort1 : CheckAbort
        {
            private void InitBlock(SegmentMerger enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }
            private SegmentMerger enclosingInstance;
            public SegmentMerger Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassCheckAbort1(SegmentMerger enclosingInstance, Lucene.Net.Index.MergePolicy.OneMerge Param1, Lucene.Net.Store.Directory Param2) : base(Param1, Param2)
            {
                InitBlock(enclosingInstance);
            }
            public override void Work(double units)
            {
                // do nothing
            }
        }

        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }

        /// <summary>norms header placeholder </summary>
        internal static readonly byte[] NORMS_HEADER = new byte[] { (byte) 'N', (byte) 'R', (byte) 'M', unchecked((byte) - 1) };

        private Directory directory;
        private System.String segment;
        private int termIndexInterval;

        private IList<IndexReader> readers = new List<IndexReader>();
        private FieldInfos fieldInfos;

        private int mergedDocs;

        private CheckAbort checkAbort;

        // Whether we should merge doc stores (stored fields and
        // vectors files). When all segments we are merging
        // already share the same doc store files, we don't need
        // to merge the doc stores.
        private bool mergeDocStores;

        /// <summary>Maximum number of contiguous documents to bulk-copy
        /// when merging stored fields
        /// </summary>
        private const int MAX_RAW_MERGE_DOCS = 4192;

        /// <summary>This ctor is used only by test code.
        /// </summary>
        /// <param name="dir">The Directory to merge the other segments into
        /// </param>
        /// <param name="name">The name of the new segment
        /// </param>
        public /*internal*/ SegmentMerger(Directory dir, System.String name)
        {
            InitBlock();
            directory = dir;
            segment = name;
            checkAbort = new AnonymousClassCheckAbort(this, null, null);
        }

        internal SegmentMerger(IndexWriter writer, System.String name, MergePolicy.OneMerge merge)
        {
            InitBlock();
            directory = writer.Directory;
            segment = name;
            if (merge != null)
            {
                checkAbort = new CheckAbort(merge, directory);
            }
            else
            {
                checkAbort = new AnonymousClassCheckAbort1(this, null, null);
            }
            termIndexInterval = writer.TermIndexInterval;
        }

        internal bool HasProx()
        {
            return fieldInfos.HasProx();
        }

        /// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
        /// <param name="reader">
        /// </param>
        public /*internal*/ void Add(IndexReader reader)
        {
            readers.Add(reader);
        }

        /// <summary> </summary>
        /// <param name="i">The index of the reader to return
        /// </param>
        /// <returns> The ith reader to be merged
        /// </returns>
        internal IndexReader SegmentReader(int i)
        {
            return readers[i];
        }

        /// <summary> Merges the readers specified by the <see cref="Add" /> method into the directory passed to the constructor</summary>
        /// <returns> The number of documents that were merged
        /// </returns>
        /// <throws> CorruptIndexException if the index is corrupt </throws>
        /// <throws> IOException if there is a low-level IO error </throws>
        public /*internal*/ int Merge()
        {
            return Merge(true);
        }

        /// <summary> Merges the readers specified by the <see cref="Add" /> method
        /// into the directory passed to the constructor.
        /// </summary>
        /// <param name="mergeDocStores">if false, we will not merge the
        /// stored fields nor vectors files
        /// </param>
        /// <returns> The number of documents that were merged
        /// </returns>
        /// <throws> CorruptIndexException if the index is corrupt </throws>
        /// <throws> IOException if there is a low-level IO error </throws>
        internal int Merge(bool mergeDocStores)
        {
            this.mergeDocStores = mergeDocStores;

            // NOTE: it's important to add calls to
            // checkAbort.work(...) if you make any changes to this
            // method that will spend a lot of time. The frequency
            // of this check impacts how long
            // IndexWriter.close(false) takes to actually stop the
            // threads.

            mergedDocs = MergeFields();
            MergeTerms();
            MergeNorms();

            if (mergeDocStores && fieldInfos.HasVectors())
                MergeVectors();

            return mergedDocs;
        }

        /// <summary> Close all IndexReaders that have been added.
        /// Should not be called before merge().
        /// </summary>
        /// <throws> IOException </throws>
        internal void CloseReaders()
        {
            foreach (IndexReader reader in readers)
            {
                reader.Dispose();
            }
        }

        internal ICollection<string> GetMergedFiles()
        {
            ISet<string> fileSet = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>();

            // Basic files
            for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.Length; i++)
            {
                System.String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];

                if (ext.Equals(IndexFileNames.PROX_EXTENSION) && !HasProx())
                    continue;

                if (mergeDocStores || (!ext.Equals(IndexFileNames.FIELDS_EXTENSION) && !ext.Equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
                    fileSet.Add(segment + "." + ext);
            }

            // Fieldable norm files
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    fileSet.Add(segment + "." + IndexFileNames.NORMS_EXTENSION);
                    break;
                }
            }

            // Vector files
            if (fieldInfos.HasVectors() && mergeDocStores)
            {
                for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.Length; i++)
                {
                    fileSet.Add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
                }
            }

            return fileSet;
        }

        public /*internal*/ ICollection<string> CreateCompoundFile(System.String fileName)
        {
            ICollection<string> files = GetMergedFiles();
            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort);

            // Now merge all added files
            foreach (var file in files)
            {
                cfsWriter.AddFile(file);
            }

            // Perform the merge
            cfsWriter.Close();

            return files;
        }

        private void AddIndexed(IndexReader reader, FieldInfos fInfos, ICollection<string> names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool storePayloads, bool omitTFAndPositions)
        {
            foreach (var field in names)
            {
                fInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector,
                    !reader.HasNorms(field), storePayloads, omitTFAndPositions);
            }
        }

        private SegmentReader[] matchingSegmentReaders;
        private int[] rawDocLengths;
        private int[] rawDocLengths2;

        private void SetMatchingSegmentReaders()
        {
            // If the i'th reader is a SegmentReader and has
            // identical fieldName -> number mapping, then this
            // array will be non-null at position i:
            int numReaders = readers.Count;
            matchingSegmentReaders = new SegmentReader[numReaders];

            // If this reader is a SegmentReader, and all of its
            // field name -> number mappings match the "merged"
            // FieldInfos, then we can do a bulk copy of the
            // stored fields:
            for (int i = 0; i < numReaders; i++)
            {
                IndexReader reader = readers[i];
                if (reader is SegmentReader)
                {
                    SegmentReader segmentReader = (SegmentReader) reader;
                    bool same = true;
                    FieldInfos segmentFieldInfos = segmentReader.FieldInfos();
                    int numFieldInfos = segmentFieldInfos.Size();
                    for (int j = 0; same && j < numFieldInfos; j++)
                    {
                        same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j));
                    }
                    if (same)
                    {
                        matchingSegmentReaders[i] = segmentReader;
                    }
                }
            }

            // Used for bulk-reading raw bytes for stored fields
            rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
            rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
        }

        /// <summary> </summary>
        /// <returns> The number of documents in all of the readers
        /// </returns>
        /// <throws> CorruptIndexException if the index is corrupt </throws>
        /// <throws> IOException if there is a low-level IO error </throws>
        private int MergeFields()
        {
            if (!mergeDocStores)
            {
                // When we are not merging the doc stores, the field
                // name -> number mappings are the same. So, we start
                // with the fieldInfos of the last segment in this
                // case, to keep that numbering.
                SegmentReader sr = (SegmentReader) readers[readers.Count - 1];
                fieldInfos = (FieldInfos) sr.core.fieldInfos.Clone();
            }
            else
            {
                fieldInfos = new FieldInfos(); // merge field names
            }

            foreach (IndexReader reader in readers)
            {
                if (reader is SegmentReader)
                {
                    SegmentReader segmentReader = (SegmentReader) reader;
                    FieldInfos readerFieldInfos = segmentReader.FieldInfos();
                    int numReaderFieldInfos = readerFieldInfos.Size();
                    for (int j = 0; j < numReaderFieldInfos; j++)
                    {
                        FieldInfo fi = readerFieldInfos.FieldInfo(j);
                        fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads, fi.omitTermFreqAndPositions);
                    }
                }
                else
                {
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(FieldOption.INDEXED), false, false, false, false, false);
                    fieldInfos.Add(reader.GetFieldNames(FieldOption.UNINDEXED), false);
                }
            }
            fieldInfos.Write(directory, segment + ".fnm");

            int docCount = 0;

            SetMatchingSegmentReaders();

            if (mergeDocStores)
            {
                // merge field values
                FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);

                try
                {
                    int idx = 0;
                    foreach (IndexReader reader in readers)
                    {
                        SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
                        FieldsReader matchingFieldsReader = null;
                        if (matchingSegmentReader != null)
                        {
                            FieldsReader fieldsReader = matchingSegmentReader.GetFieldsReader();
                            if (fieldsReader != null && fieldsReader.CanReadRawDocs())
                            {
                                matchingFieldsReader = fieldsReader;
                            }
                        }
                        if (reader.HasDeletions)
                        {
                            docCount += CopyFieldsWithDeletions(fieldsWriter, reader, matchingFieldsReader);
                        }
                        else
                        {
                            docCount += CopyFieldsNoDeletions(fieldsWriter, reader, matchingFieldsReader);
                        }
                    }
                }
                finally
                {
                    fieldsWriter.Dispose();
                }

                System.String fileName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
                long fdxFileLength = directory.FileLength(fileName);

                if (4 + ((long) docCount) * 8 != fdxFileLength)
                    // This is most likely a bug in Sun JRE 1.6.0_04/_05;
                    // we detect that the bug has struck, here, and
                    // throw an exception to prevent the corruption from
                    // entering the index. See LUCENE-1282 for
                    // details.
                    throw new System.SystemException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.FileExists(fileName) + "; now aborting this merge to prevent index corruption");
            }
            // If we are skipping the doc stores, that means there
            // are no deletions in any of these segments, so we
            // just sum numDocs() of each segment to get total docCount
            else
            {
                foreach (IndexReader reader in readers)
                {
                    docCount += reader.NumDocs();
                }
            }

            return docCount;
        }

        private int CopyFieldsWithDeletions(FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader)
        {
            int docCount = 0;
            int maxDoc = reader.MaxDoc;
            if (matchingFieldsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                for (int j = 0; j < maxDoc; )
                {
                    if (reader.IsDeleted(j))
                    {
                        // skip deleted docs
                        ++j;
                        continue;
                    }
                    // We can optimize this case (doing a bulk byte copy) since the field
                    // numbers are identical
                    int start = j, numDocs = 0;
                    do
                    {
                        j++;
                        numDocs++;
                        if (j >= maxDoc)
                            break;
                        if (reader.IsDeleted(j))
                        {
                            j++;
                            break;
                        }
                    }
                    while (numDocs < MAX_RAW_MERGE_DOCS);

                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
                    fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs);
                    docCount += numDocs;
                    checkAbort.Work(300 * numDocs);
                }
            }
            else
            {
                for (int j = 0; j < maxDoc; j++)
                {
                    if (reader.IsDeleted(j))
                    {
                        // skip deleted docs
                        continue;
                    }
                    // NOTE: it's very important to first assign to doc then pass it to
                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
                    Document doc = reader.Document(j);
                    fieldsWriter.AddDocument(doc);
                    docCount++;
                    checkAbort.Work(300);
                }
            }
            return docCount;
        }

        private int CopyFieldsNoDeletions(FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader)
        {
            int maxDoc = reader.MaxDoc;
            int docCount = 0;
            if (matchingFieldsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                while (docCount < maxDoc)
                {
                    int len = System.Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, docCount, len);
                    fieldsWriter.AddRawDocuments(stream, rawDocLengths, len);
                    docCount += len;
                    checkAbort.Work(300 * len);
                }
            }
            else
            {
                for (; docCount < maxDoc; docCount++)
                {
                    // NOTE: it's very important to first assign to doc then pass it to
                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
                    Document doc = reader.Document(docCount);
                    fieldsWriter.AddDocument(doc);
                    checkAbort.Work(300);
                }
            }
            return docCount;
        }

        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws> IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                int idx = 0;
                foreach (IndexReader reader in readers)
                {
                    SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
                    TermVectorsReader matchingVectorsReader = null;
                    if (matchingSegmentReader != null)
                    {
                        TermVectorsReader vectorsReader = matchingSegmentReader.GetTermVectorsReaderOrig();

                        // If the TV* files are an older format then they cannot read raw docs:
                        if (vectorsReader != null && vectorsReader.CanReadRawDocs())
                        {
                            matchingVectorsReader = vectorsReader;
                        }
                    }
                    if (reader.HasDeletions)
                    {
                        CopyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader);
                    }
                    else
                    {
                        CopyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader);
                    }
                }
            }
            finally
            {
                termVectorsWriter.Dispose();
            }

            System.String fileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
            long tvxSize = directory.FileLength(fileName);

            if (4 + ((long) mergedDocs) * 16 != tvxSize)
                // This is most likely a bug in Sun JRE 1.6.0_04/_05;
                // we detect that the bug has struck, here, and
                // throw an exception to prevent the corruption from
                // entering the index. See LUCENE-1282 for
                // details.
                throw new System.SystemException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + " file=" + fileName + " file exists?=" + directory.FileExists(fileName) + "; now aborting this merge to prevent index corruption");
        }

        private void CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
        {
            int maxDoc = reader.MaxDoc;
            if (matchingVectorsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                for (int docNum = 0; docNum < maxDoc; )
                {
                    if (reader.IsDeleted(docNum))
                    {
                        // skip deleted docs
                        ++docNum;
                        continue;
                    }
                    // We can optimize this case (doing a bulk byte copy) since the field
                    // numbers are identical
                    int start = docNum, numDocs = 0;
                    do
                    {
                        docNum++;
                        numDocs++;
                        if (docNum >= maxDoc)
                            break;
                        if (reader.IsDeleted(docNum))
                        {
                            docNum++;
                            break;
                        }
                    }
                    while (numDocs < MAX_RAW_MERGE_DOCS);

                    matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
                    termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
                    checkAbort.Work(300 * numDocs);
                }
            }
            else
            {
                for (int docNum = 0; docNum < maxDoc; docNum++)
                {
                    if (reader.IsDeleted(docNum))
                    {
                        // skip deleted docs
                        continue;
                    }

                    // NOTE: it's very important to first assign to vectors then pass it to
                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
                    ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                    termVectorsWriter.AddAllDocVectors(vectors);
                    checkAbort.Work(300);
                }
            }
        }

        private void CopyVectorsNoDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader)
        {
            int maxDoc = reader.MaxDoc;
            if (matchingVectorsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                int docCount = 0;
                while (docCount < maxDoc)
                {
                    int len = System.Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
                    matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, docCount, len);
                    termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
                    docCount += len;
                    checkAbort.Work(300 * len);
                }
            }
            else
            {
                for (int docNum = 0; docNum < maxDoc; docNum++)
                {
                    // NOTE: it's very important to first assign to vectors then pass it to
                    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
                    ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
                    termVectorsWriter.AddAllDocVectors(vectors);
                    checkAbort.Work(300);
                }
            }
        }

        private SegmentMergeQueue queue = null;

        private void MergeTerms()
        {
            SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval);

            FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);

            try
            {
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos(consumer);
            }
            finally
            {
                consumer.Finish();
                if (queue != null)
                    queue.Dispose();
            }
        }

        internal bool omitTermFreqAndPositions;

        private void MergeTermInfos(FormatPostingsFieldsConsumer consumer)
        {
            int base_Renamed = 0;
            int readerCount = readers.Count;
            for (int i = 0; i < readerCount; i++)
            {
                IndexReader reader = readers[i];
                TermEnum termEnum = reader.Terms();
                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                int[] docMap = smi.GetDocMap();
                if (docMap != null)
                {
                    if (docMaps == null)
                    {
                        docMaps = new int[readerCount][];
                        delCounts = new int[readerCount];
                    }
                    docMaps[i] = docMap;
                    delCounts[i] = smi.reader.MaxDoc - smi.reader.NumDocs();
                }

                base_Renamed += reader.NumDocs();

                System.Diagnostics.Debug.Assert(reader.NumDocs() == reader.MaxDoc - smi.delCount);

                if (smi.Next())
                    queue.Add(smi); // initialize queue
                else
                    smi.Dispose();
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            System.String currentField = null;
            FormatPostingsTermsConsumer termsConsumer = null;

            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = queue.Pop();
                Term term = match[0].term;
                SegmentMergeInfo top = queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = queue.Pop();
                    top = queue.Top();
                }

                if ((System.Object) currentField != (System.Object) term.Field)
                {
                    currentField = term.Field;
                    if (termsConsumer != null)
                        termsConsumer.Finish();
                    FieldInfo fieldInfo = fieldInfos.FieldInfo(currentField);
                    termsConsumer = consumer.AddField(fieldInfo);
                    omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
                }

                int df = AppendPostings(termsConsumer, match, matchSize); // add new TermInfo

                checkAbort.Work(df / 3.0);

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                        queue.Add(smi); // restore queue
                    else
                        smi.Dispose(); // done with a segment
                }
            }
        }

        private byte[] payloadBuffer;
        private int[][] docMaps;
        internal int[][] GetDocMaps()
        {
            return docMaps;
        }
        private int[] delCounts;
        internal int[] GetDelCounts()
        {
            return delCounts;
        }

        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into the freqOutput and
        /// proxOutput streams.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
        /// <throws> CorruptIndexException if the index is corrupt </throws>
        /// <throws> IOException if there is a low-level IO error </throws>
        private int AppendPostings(FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
        {
            FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(smis[0].term.Text);
            int df = 0;
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi = smis[i];
                TermPositions postings = smi.GetPositions();
                System.Diagnostics.Debug.Assert(postings != null);
                int base_Renamed = smi.base_Renamed;
                int[] docMap = smi.GetDocMap();
                postings.Seek(smi.termEnum);

                while (postings.Next())
                {
                    df++;
                    int doc = postings.Doc;
                    if (docMap != null)
                        doc = docMap[doc]; // map around deletions
                    doc += base_Renamed; // convert to merged space

                    int freq = postings.Freq;
                    FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(doc, freq);

                    if (!omitTermFreqAndPositions)
                    {
                        for (int j = 0; j < freq; j++)
                        {
                            int position = postings.NextPosition();
                            int payloadLength = postings.PayloadLength;
                            if (payloadLength > 0)
                            {
                                if (payloadBuffer == null || payloadBuffer.Length < payloadLength)
                                    payloadBuffer = new byte[payloadLength];
                                postings.GetPayload(payloadBuffer, 0);
                            }
                            posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength);
                        }
                        posConsumer.Finish();
                    }
                }
            }
            docConsumer.Finish();

            return df;
        }

        private void MergeNorms()
        {
            byte[] normBuffer = null;
            IndexOutput output = null;
            try
            {
                int numFieldInfos = fieldInfos.Size();
                for (int i = 0; i < numFieldInfos; i++)
                {
                    FieldInfo fi = fieldInfos.FieldInfo(i);
                    if (fi.isIndexed && !fi.omitNorms)
                    {
                        if (output == null)
                        {
                            output = directory.CreateOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
                            output.WriteBytes(NORMS_HEADER, NORMS_HEADER.Length);
                        }
                        foreach (IndexReader reader in readers)
                        {
                            int maxDoc = reader.MaxDoc;
                            if (normBuffer == null || normBuffer.Length < maxDoc)
                            {
                                // the buffer is too small for the current segment
                                normBuffer = new byte[maxDoc];
                            }
                            reader.Norms(fi.name, normBuffer, 0);
                            if (!reader.HasDeletions)
                            {
                                // optimized case for segments without deleted docs
                                output.WriteBytes(normBuffer, maxDoc);
                            }
                            else
                            {
                                // this segment has deleted docs, so we have to
                                // check for every doc if it is deleted or not
                                for (int k = 0; k < maxDoc; k++)
                                {
                                    if (!reader.IsDeleted(k))
                                    {
                                        output.WriteByte(normBuffer[k]);
                                    }
                                }
                            }
                            checkAbort.Work(maxDoc);
                        }
                    }
                }
            }
            finally
            {
                if (output != null)
                {
                    output.Close();
                }
            }
        }

        internal class CheckAbort
        {
            private double workCount;
            private MergePolicy.OneMerge merge;
            private Directory dir;
            public CheckAbort(MergePolicy.OneMerge merge, Directory dir)
            {
                this.merge = merge;
                this.dir = dir;
            }

            /// <summary> Records that roughly <c>units</c> amount of work has been done
            /// since this method was last called.
            /// When adding time-consuming code into SegmentMerger,
            /// you should test different values for units to ensure
            /// that the time in between calls to merge.CheckAborted
            /// is up to ~ 1 second.
            /// </summary>
            public virtual void Work(double units)
            {
                workCount += units;
                if (workCount >= 10000.0)
                {
                    merge.CheckAborted(dir);
                    workCount = 0;
                }
            }
        }
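
        // Illustrative note (an addition, not part of the original source): per the
        // comment in Merge(bool), any new time-consuming loop added to SegmentMerger
        // should report progress through checkAbort so that an aborted merge stops
        // promptly. A hypothetical per-document loop might look like:
        //
        //     for (int docNum = 0; docNum < maxDoc; docNum++)
        //     {
        //         // ... per-document merge work ...
        //         checkAbort.Work(300); // same rough per-document cost used elsewhere in this file
        //     }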
    }
}