Lucene.Net 3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
LogMergePolicy.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;

namespace Lucene.Net.Index
{
    /// <summary><p/>This class implements a <see cref="MergePolicy" /> that tries
    /// to merge segments into levels of exponentially
    /// increasing size, where each level has fewer segments than
    /// the value of the merge factor. Whenever extra segments
    /// (beyond the merge factor upper bound) are encountered,
    /// all segments within the level are merged. You can get or
    /// set the merge factor using the <see cref="MergeFactor" />
    /// property.<p/>
    ///
    /// <p/>This class is abstract and requires a subclass to
    /// define the <see cref="Size" /> method which specifies how a
    /// segment's size is determined. <see cref="LogDocMergePolicy" />
    /// is one subclass that measures size by document count in
    /// the segment. <see cref="LogByteSizeMergePolicy" /> is another
    /// subclass that measures size as the total byte size of the
    /// file(s) for the segment.<p/>
    /// </summary>
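    /// <example>
    /// A minimal configuration sketch (illustrative only; it assumes <c>writer</c> is an
    /// already-open <see cref="IndexWriter" /> and uses <see cref="LogDocMergePolicy" />
    /// as the concrete subclass):
    /// <code>
    /// var policy = new LogDocMergePolicy(writer);
    /// policy.MergeFactor = 10;        // merge 10 segments at a time (the default)
    /// policy.MaxMergeDocs = 100000;   // segments at or above 100k docs are left out of merges
    /// writer.SetMergePolicy(policy);  // assumed wiring; check IndexWriter for the exact member
    /// </code>
    /// </example>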
    public abstract class LogMergePolicy : MergePolicy
    {
        /// <summary>Defines the allowed range of log(size) for each
        /// level. A level is computed by taking the max segment
        /// log size, minus LEVEL_LOG_SPAN, and finding all
        /// segments falling within that range.
        /// </summary>
        public const double LEVEL_LOG_SPAN = 0.75;

        /// <summary>Default merge factor, which is how many segments are
        /// merged at a time.
        /// </summary>
        public const int DEFAULT_MERGE_FACTOR = 10;

        /// <summary>Default maximum segment size. A segment of this size
        /// or larger will never be merged.
        /// </summary>
        /// <seealso cref="MaxMergeDocs">
        /// </seealso>
        public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;

        /// <summary> Default noCFSRatio. If a merge's size is >= 10% of
        /// the index, then we disable compound file for it.
        /// See <see cref="NoCFSRatio"/>.
        /// </summary>
        public static double DEFAULT_NO_CFS_RATIO = 0.1;

        private int mergeFactor = DEFAULT_MERGE_FACTOR;

        internal long minMergeSize;
        internal long maxMergeSize;
        internal int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

        protected double internalNoCFSRatio = DEFAULT_NO_CFS_RATIO;

        /* TODO 3.0: change this default to true */
        protected internal bool internalCalibrateSizeByDeletes = true;

        private bool useCompoundFile = true;
        private bool useCompoundDocStore = true;

        protected LogMergePolicy(IndexWriter writer) : base(writer)
        {
        }

        protected internal virtual bool Verbose()
        {
            return writer != null && writer.Verbose;
        }

        public double NoCFSRatio
        {
            get { return internalNoCFSRatio; }
            set
            {
                if (value < 0.0 || value > 1.0)
                {
                    throw new ArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + value);
                }
                this.internalNoCFSRatio = value;
            }
        }

        /* If a merged segment will be more than this percentage
         * of the total size of the index, leave the segment as
         * non-compound file even if compound file is enabled.
         * Set to 1.0 to always use CFS regardless of merge
         * size. */
        private void Message(System.String message)
        {
            if (Verbose())
                writer.Message("LMP: " + message);
        }

        /// <summary>Gets or sets how often segment indices are merged by
        /// addDocument(). With smaller values, less RAM is used
        /// while indexing, and searches on unoptimized indices are
        /// faster, but indexing speed is slower. With larger
        /// values, more RAM is used during indexing, and while
        /// searches on unoptimized indices are slower, indexing is
        /// faster. Thus larger values (&gt; 10) are best for batch
        /// index creation, and smaller values (&lt; 10) for indices
        /// that are interactively maintained.
        /// </summary>
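        /// <example>
        /// Illustrative values only (not prescriptive): a batch build might use
        /// <c>policy.MergeFactor = 30</c> to favor indexing throughput, while an
        /// interactively maintained index might use <c>policy.MergeFactor = 5</c>
        /// to keep searches on the unoptimized index faster.
        /// </example>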
        public virtual int MergeFactor
        {
            get { return mergeFactor; }
            set
            {
                if (value < 2)
                    throw new System.ArgumentException("mergeFactor cannot be less than 2");
                this.mergeFactor = value;
            }
        }

        public override bool UseCompoundFile(SegmentInfos infos, SegmentInfo info)
        {
            return useCompoundFile;
        }

        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged segments.
        /// </summary>
        public virtual void SetUseCompoundFile(bool useCompoundFile)
        {
            this.useCompoundFile = useCompoundFile;
        }

        /// <summary>Returns true if newly flushed and newly merged segments
        /// are written in compound file format.
        /// </summary>
        /// <seealso cref="SetUseCompoundFile">
        /// </seealso>
        [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1024:UsePropertiesWhereAppropriate")]
        public virtual bool GetUseCompoundFile()
        {
            return useCompoundFile;
        }

        // Javadoc inherited
        public override bool UseCompoundDocStore(SegmentInfos infos)
        {
            return useCompoundDocStore;
        }

        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged doc store
        /// segment files (term vectors and stored fields).
        /// </summary>
        public virtual void SetUseCompoundDocStore(bool useCompoundDocStore)
        {
            this.useCompoundDocStore = useCompoundDocStore;
        }

        /// <summary>Returns true if newly flushed and newly merged doc
        /// store segment files (term vectors and stored fields)
        /// are written in compound file format.
        /// </summary>
        /// <seealso cref="SetUseCompoundDocStore">
        /// </seealso>
        [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1024:UsePropertiesWhereAppropriate")]
        public virtual bool GetUseCompoundDocStore()
        {
            return useCompoundDocStore;
        }

        /// <summary>Gets or sets whether the segment size should be calibrated by
        /// the number of deletes when choosing segments for merge.
        /// </summary>
        public virtual bool CalibrateSizeByDeletes
        {
            set { this.internalCalibrateSizeByDeletes = value; }
            get { return internalCalibrateSizeByDeletes; }
        }

        protected internal abstract long Size(SegmentInfo info);

        protected internal virtual long SizeDocs(SegmentInfo info)
        {
            if (internalCalibrateSizeByDeletes)
            {
                int delCount = writer.NumDeletedDocs(info);
                return (info.docCount - (long) delCount);
            }
            else
            {
                return info.docCount;
            }
        }

        protected internal virtual long SizeBytes(SegmentInfo info)
        {
            long byteSize = info.SizeInBytes();
            if (internalCalibrateSizeByDeletes)
            {
                int delCount = writer.NumDeletedDocs(info);
                float delRatio = (info.docCount <= 0 ? 0.0f : ((float) delCount / (float) info.docCount));
                return (info.docCount <= 0 ? byteSize : (long) (byteSize * (1.0f - delRatio)));
            }
            else
            {
                return byteSize;
            }
        }
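        // Worked example (hypothetical numbers): with CalibrateSizeByDeletes enabled, a
        // 10,000,000-byte segment holding 1,000 docs of which 250 are deleted has
        // delRatio = 250/1000 = 0.25, so SizeBytes reports 10,000,000 * (1 - 0.25) =
        // 7,500,000 bytes and SizeDocs reports 1000 - 250 = 750 docs.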
        private bool IsOptimized(SegmentInfos infos, int maxNumSegments, ISet<SegmentInfo> segmentsToOptimize)
        {
            int numSegments = infos.Count;
            int numToOptimize = 0;
            SegmentInfo optimizeInfo = null;
            for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                if (segmentsToOptimize.Contains(info))
                {
                    numToOptimize++;
                    optimizeInfo = info;
                }
            }

            return numToOptimize <= maxNumSegments && (numToOptimize != 1 || IsOptimized(optimizeInfo));
        }

        /// <summary>Returns true if this single info is optimized (has no
        /// pending norms or deletes, is in the same dir as the
        /// writer, and matches the current compound file setting).
        /// </summary>
        private bool IsOptimized(SegmentInfo info)
        {
            bool hasDeletions = writer.NumDeletedDocs(info) > 0;
            return !hasDeletions && !info.HasSeparateNorms() && info.dir == writer.Directory &&
                (info.GetUseCompoundFile() == useCompoundFile || internalNoCFSRatio < 1.0);
        }

        /// <summary>Returns the merges necessary to optimize the index.
        /// This merge policy defines "optimized" to mean only one
        /// segment in the index, where that segment has no
        /// deletions pending nor separate norms, and it is in
        /// compound file format if the current useCompoundFile
        /// setting is true. This method returns multiple merges
        /// (mergeFactor at a time) so the <see cref="MergeScheduler" />
        /// in use may make use of concurrency.
        /// </summary>
        public override MergeSpecification FindMergesForOptimize(SegmentInfos infos, int maxNumSegments, ISet<SegmentInfo> segmentsToOptimize)
        {
            MergeSpecification spec;

            System.Diagnostics.Debug.Assert(maxNumSegments > 0);

            if (!IsOptimized(infos, maxNumSegments, segmentsToOptimize))
            {
                // Find the newest (rightmost) segment that needs to
                // be optimized (other segments may have been flushed
                // since optimize started):
                int last = infos.Count;
                while (last > 0)
                {
                    SegmentInfo info = infos.Info(--last);
                    if (segmentsToOptimize.Contains(info))
                    {
                        last++;
                        break;
                    }
                }

                if (last > 0)
                {
                    spec = new MergeSpecification();

                    // First, enroll all "full" merges (size
                    // mergeFactor) to potentially be run concurrently:
                    while (last - maxNumSegments + 1 >= mergeFactor)
                    {
                        spec.Add(MakeOneMerge(infos, infos.Range(last - mergeFactor, last)));
                        last -= mergeFactor;
                    }

                    // Only if there are no full merges pending do we
                    // add a final partial (< mergeFactor segments) merge:
                    if (0 == spec.merges.Count)
                    {
                        if (maxNumSegments == 1)
                        {
                            // Since we must optimize down to 1 segment, the
                            // choice is simple:
                            if (last > 1 || !IsOptimized(infos.Info(0)))
                                spec.Add(MakeOneMerge(infos, infos.Range(0, last)));
                        }
                        else if (last > maxNumSegments)
                        {
                            // Take care to pick a partial merge that is
                            // least cost, but does not make the index too
                            // lopsided. If we always just picked the
                            // partial tail then we could produce a highly
                            // lopsided index over time:

                            // We must merge this many segments to leave
                            // maxNumSegments in the index (from when
                            // optimize was first kicked off):
                            int finalMergeSize = last - maxNumSegments + 1;

                            // Consider all possible starting points:
                            long bestSize = 0;
                            int bestStart = 0;

                            for (int i = 0; i < last - finalMergeSize + 1; i++)
                            {
                                long sumSize = 0;
                                for (int j = 0; j < finalMergeSize; j++)
                                    sumSize += Size(infos.Info(j + i));
                                if (i == 0 || (sumSize < 2 * Size(infos.Info(i - 1)) && sumSize < bestSize))
                                {
                                    bestStart = i;
                                    bestSize = sumSize;
                                }
                            }

                            spec.Add(MakeOneMerge(infos, infos.Range(bestStart, bestStart + finalMergeSize)));
                        }
                    }
                }
                else
                    spec = null;
            }
            else
                spec = null;

            return spec;
        }
        /// <summary> Finds merges necessary to expunge all deletes from the
        /// index. We simply merge adjacent segments that have
        /// deletes, up to mergeFactor at a time.
        /// </summary>
        public override MergeSpecification FindMergesToExpungeDeletes(SegmentInfos segmentInfos)
        {
            int numSegments = segmentInfos.Count;

            if (Verbose())
                Message("findMergesToExpungeDeletes: " + numSegments + " segments");

            MergeSpecification spec = new MergeSpecification();
            int firstSegmentWithDeletions = -1;
            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = segmentInfos.Info(i);
                int delCount = writer.NumDeletedDocs(info);
                if (delCount > 0)
                {
                    if (Verbose())
                        Message(" segment " + info.name + " has deletions");
                    if (firstSegmentWithDeletions == -1)
                        firstSegmentWithDeletions = i;
                    else if (i - firstSegmentWithDeletions == mergeFactor)
                    {
                        // We've seen mergeFactor segments in a row with
                        // deletions, so force a merge now:
                        if (Verbose())
                            Message(" add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                        spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                        firstSegmentWithDeletions = i;
                    }
                }
                else if (firstSegmentWithDeletions != -1)
                {
                    // End of a sequence of segments with deletions, so,
                    // merge those past segments even if it's fewer than
                    // mergeFactor segments
                    if (Verbose())
                        Message(" add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                    spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                    firstSegmentWithDeletions = -1;
                }
            }

            if (firstSegmentWithDeletions != -1)
            {
                if (Verbose())
                    Message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments - 1) + " inclusive");
                spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, numSegments)));
            }

            return spec;
        }

        /// <summary>Checks if any merges are now necessary and returns a
        /// <see cref="MergePolicy.MergeSpecification" /> if so. A merge
        /// is necessary when there are more than <see cref="MergeFactor" />
        /// segments at a given level. When
        /// multiple levels have too many segments, this method
        /// will return multiple merges, allowing the <see cref="MergeScheduler" />
        /// to use concurrency.
        /// </summary>
        public override MergeSpecification FindMerges(SegmentInfos infos)
        {
            int numSegments = infos.Count;
            if (Verbose())
                Message("findMerges: " + numSegments + " segments");

            // Compute levels, which is just log (base mergeFactor)
            // of the size of each segment
            float[] levels = new float[numSegments];
            float norm = (float) System.Math.Log(mergeFactor);

            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                long size = Size(info);

                // Floor tiny segments
                if (size < 1)
                    size = 1;
                levels[i] = (float) System.Math.Log(size) / norm;
            }
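            // Worked example (hypothetical sizes): with mergeFactor = 10, a segment whose
            // Size() is 1,000,000 gets level log(1e6)/log(10) = 6.0; since LEVEL_LOG_SPAN
            // is 0.75, any segment down to about 10^5.25 ~= 178,000 falls into that same
            // top level and is considered together for merging.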
            float levelFloor;
            if (minMergeSize <= 0)
                levelFloor = (float) 0.0;
            else
            {
                levelFloor = (float) (System.Math.Log(minMergeSize) / norm);
            }

            // Now, we quantize the log values into levels. The
            // first level is any segment whose log size is within
            // LEVEL_LOG_SPAN of the max size, or, who has such a
            // segment "to the right". Then, we find the max of all
            // other segments and use that to define the next level
            // segment, etc.

            MergeSpecification spec = null;

            int start = 0;
            while (start < numSegments)
            {
                // Find max level of all segments not already
                // quantized.
                float maxLevel = levels[start];
                for (int i = 1 + start; i < numSegments; i++)
                {
                    float level = levels[i];
                    if (level > maxLevel)
                        maxLevel = level;
                }

                // Now search backwards for the rightmost segment that
                // falls into this level:
                float levelBottom;
                if (maxLevel < levelFloor)
                    // All remaining segments fall into the min level
                    levelBottom = -1.0F;
                else
                {
                    levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);

                    // Force a boundary at the level floor
                    if (levelBottom < levelFloor && maxLevel >= levelFloor)
                        levelBottom = levelFloor;
                }

                int upto = numSegments - 1;
                while (upto >= start)
                {
                    if (levels[upto] >= levelBottom)
                    {
                        break;
                    }
                    upto--;
                }
                if (Verbose())
                    Message(" level " + levelBottom + " to " + maxLevel + ": " + (1 + upto - start) + " segments");

                // Finally, record all merges that are viable at this level:
                int end = start + mergeFactor;
                while (end <= 1 + upto)
                {
                    bool anyTooLarge = false;
                    for (int i = start; i < end; i++)
                    {
                        SegmentInfo info = infos.Info(i);
                        anyTooLarge |= (Size(info) >= maxMergeSize || SizeDocs(info) >= maxMergeDocs);
                    }

                    if (!anyTooLarge)
                    {
                        if (spec == null)
                            spec = new MergeSpecification();
                        if (Verbose())
                            Message(" " + start + " to " + end + ": add this merge");
                        spec.Add(MakeOneMerge(infos, infos.Range(start, end)));
                    }
                    else if (Verbose())
                        Message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");

                    start = end;
                    end = start + mergeFactor;
                }

                start = 1 + upto;
            }

            return spec;
        }

        protected OneMerge MakeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge)
        {
            bool doCFS;
            if (!useCompoundFile)
            {
                doCFS = false;
            }
            else if (internalNoCFSRatio == 1.0)
            {
                doCFS = true;
            }
            else
            {
                long totSize = 0;
                foreach (SegmentInfo info in infos)
                {
                    totSize += Size(info);
                }
                long mergeSize = 0;
                foreach (SegmentInfo info in infosToMerge)
                {
                    mergeSize += Size(info);
                }

                doCFS = mergeSize <= internalNoCFSRatio * totSize;
            }

            return new OneMerge(infosToMerge, doCFS);
        }
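        // Worked example (hypothetical sizes): with internalNoCFSRatio = 0.1 and a total
        // index Size() of 1,000 units, a merge totalling 80 units gets doCFS = true
        // (80 <= 0.1 * 1000), while a merge totalling 150 units stays non-compound.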
        /// <summary>
        /// Gets or sets the largest segment (measured by document
        /// count) that may be merged with other segments.
        /// <p/>Small values (e.g., less than 10,000) are best for
        /// interactive indexing, as this limits the length of
        /// pauses while indexing to a few seconds. Larger values
        /// are best for batched indexing and speedier
        /// searches.<p/>
        ///
        /// <p/>The default value is <see cref="int.MaxValue" />.<p/>
        ///
        /// <p/>The default merge policy (<see cref="LogByteSizeMergePolicy" />)
        /// also allows you to set this
        /// limit by net size (in MB) of the segment, using
        /// <see cref="LogByteSizeMergePolicy.MaxMergeMB" />.<p/>
        /// </summary>
        public virtual int MaxMergeDocs
        {
            set { this.maxMergeDocs = value; }
            get { return maxMergeDocs; }
        }
    }
}