Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
IndexHtml.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Diagnostics;
20 using System.IO;
21 using Lucene.Net.Analysis.Standard;
22 using Lucene.Net.Index;
23 
24 using FSDirectory = Lucene.Net.Store.FSDirectory;
25 using Version = Lucene.Net.Util.Version;
26 
namespace Lucene.Net.Demo
{

    /// <summary>
    /// Command-line indexer for HTML and text files. Walks a directory tree and
    /// either builds a fresh index or incrementally updates an existing one.
    /// </summary>
    public static class IndexHTML
    {
        /// <summary>Name of the index field holding each document's unique id.</summary>
        private const string UidField = "uid";

        /// <summary>Command-line synopsis printed when the arguments are invalid.</summary>
        private const string Usage = "IndexHTML [-create] [-index <index>] <root_directory>";

        /// <summary>
        /// Program entry point. Parses the command line, then indexes the given
        /// root directory, optimizes and commits the index, and reports the
        /// elapsed time.
        /// </summary>
        /// <param name="argv">
        /// Command-line arguments: optional <c>-create</c>, optional
        /// <c>-index</c> followed by the index directory, and the root directory
        /// to index as the last argument.
        /// </param>
        [STAThread]
        public static void Main(System.String[] argv)
        {
            try
            {
                var index = new DirectoryInfo("index");
                bool create = false;
                DirectoryInfo root = null;

                if (argv.Length == 0)
                {
                    Console.Error.WriteLine("Usage: " + Usage);
                    return;
                }

                for (int i = 0; i < argv.Length; i++)
                {
                    if (argv[i].Equals("-index"))
                    {
                        // parse -index option; it needs a value after it, so a
                        // trailing "-index" is a usage error rather than an
                        // out-of-range array access.
                        if (i + 1 >= argv.Length)
                        {
                            Console.Error.WriteLine("Usage: " + Usage);
                            return;
                        }
                        index = new DirectoryInfo(argv[++i]);
                    }
                    else if (argv[i].Equals("-create"))
                    {
                        // parse -create option
                        create = true;
                    }
                    else if (i != argv.Length - 1)
                    {
                        // Anything else is only accepted as the final argument.
                        Console.Error.WriteLine("Usage: " + Usage);
                        return;
                    }
                    else
                    {
                        root = new DirectoryInfo(argv[i]);
                    }
                }

                if (root == null)
                {
                    Console.Error.WriteLine("Specify directory to index");
                    Console.Error.WriteLine("Usage: " + Usage);
                    return;
                }

                // A Stopwatch measures the real elapsed time. Subtracting
                // DateTime.Millisecond values only compares the 0-999
                // millisecond components and can even go negative.
                var stopwatch = Stopwatch.StartNew();

                using (var writer = new IndexWriter(FSDirectory.Open(index), new StandardAnalyzer(Version.LUCENE_30), create, new IndexWriter.MaxFieldLength(1000000)))
                {
                    if (!create)
                    {
                        // We're not creating a new index, iterate our index and remove
                        // any stale documents.
                        IndexDocs(writer, root, index, Operation.RemoveStale);
                    }

                    var operation = create
                        ? Operation.CompleteReindex
                        : Operation.IncrementalReindex;
                    IndexDocs(writer, root, index, operation); // add new docs

                    Console.Out.WriteLine("Optimizing index...");
                    writer.Optimize();
                    writer.Commit();
                }

                stopwatch.Stop();

                Console.Out.Write(stopwatch.ElapsedMilliseconds);
                Console.Out.WriteLine(" total milliseconds");
            }
            catch (Exception e)
            {
                // Print the whole exception (type, message and stack trace);
                // the stack trace alone hides what actually went wrong.
                Console.Error.WriteLine(e);
            }
        }

        // Walk directory hierarchy in uid order, while keeping uid iterator from
        // existing index in sync. Mismatches indicate one of: (a) old documents to
        // be deleted; (b) unchanged documents, to be left alone; or (c) new
        // documents, to be indexed.

        /// <summary>
        /// Runs one pass over the directory tree for the given operation. A
        /// complete reindex walks the tree without consulting the existing
        /// index; the other operations open the existing index and walk its
        /// uid terms alongside the files.
        /// </summary>
        /// <param name="writer">Writer that receives additions and deletions.</param>
        /// <param name="file">Root directory to walk.</param>
        /// <param name="index">Location of the existing index to read uids from.</param>
        /// <param name="operation">The kind of pass to perform.</param>
        private static void IndexDocs(IndexWriter writer, DirectoryInfo file, DirectoryInfo index, Operation operation)
        {
            if (operation == Operation.CompleteReindex)
            {
                // Perform a full reindexing.
                IndexDirectory(writer, null, file, operation);
                return;
            }

            // Perform an incremental reindexing.
            using (var reader = IndexReader.Open(FSDirectory.Open(index), true)) // open existing index
            using (var uidIter = reader.Terms(new Term(UidField, ""))) // init uid iterator
            {
                IndexDirectory(writer, uidIter, file, operation);

                if (operation == Operation.RemoveStale)
                {
                    // Delete remaining, presumed stale, documents. This works since
                    // the above call to IndexDirectory should have positioned the
                    // uidIter after any uids matching existing documents. Any
                    // remaining uid belongs to a document that has been deleted
                    // since it was indexed.
                    while (IsAtUidTerm(uidIter))
                    {
                        DeleteCurrentTerm(writer, uidIter);
                        uidIter.Next();
                    }
                }
            }
        }

        /// <summary>
        /// Recursively visits every entry below <paramref name="dir"/> in
        /// ascending ordinal order, handing each file to IndexFile.
        /// </summary>
        /// <param name="writer">Writer that receives additions and deletions.</param>
        /// <param name="uidIter">Uid term iterator over the existing index, or null for a complete reindex.</param>
        /// <param name="dir">Directory whose entries are visited.</param>
        /// <param name="operation">The kind of pass being performed.</param>
        private static void IndexDirectory(IndexWriter writer, TermEnum uidIter, DirectoryInfo dir, Operation operation)
        {
            var entries = Directory.GetFileSystemEntries(dir.FullName);

            // Sort the entries. This is important, the uidIter TermEnum is
            // iterated in a forward-only fashion, requiring all files to be
            // passed in ascending order. The sort is ordinal so that it agrees
            // with the String.CompareOrdinal uid comparisons in IndexFile; the
            // default string sort is culture-sensitive and may order differently.
            Array.Sort(entries, StringComparer.Ordinal);

            foreach (var path in entries)
            {
                // The entries are already full paths because a full path was
                // passed to GetFileSystemEntries; no Path.Combine is needed.
                if (Directory.Exists(path))
                {
                    IndexDirectory(writer, uidIter, new DirectoryInfo(path), operation);
                }
                else if (File.Exists(path))
                {
                    IndexFile(writer, uidIter, new FileInfo(path), operation);
                }
            }
        }

        /// <summary>
        /// Handles a single file. Files whose names do not end in .html, .htm or
        /// .txt are ignored. For a complete reindex the file is always added.
        /// Otherwise the uid iterator is advanced up to this file's uid, stale
        /// terms passed on the way are deleted when removing stale documents,
        /// and the file is added during an incremental reindex if its uid is
        /// not in the index.
        /// </summary>
        /// <param name="writer">Writer that receives additions and deletions.</param>
        /// <param name="uidIter">Uid term iterator over the existing index, or null for a complete reindex.</param>
        /// <param name="file">The file to consider.</param>
        /// <param name="operation">The kind of pass being performed.</param>
        private static void IndexFile(IndexWriter writer, TermEnum uidIter, FileInfo file, Operation operation)
        {
            if (!HasIndexableExtension(file))
            {
                return;
            }

            // We've found a file we should index.

            if (operation != Operation.IncrementalReindex &&
                operation != Operation.RemoveStale)
            {
                // We're doing a complete reindexing. We aren't using uidIter,
                // but for completeness we assert that it's null (as expected).
                Debug.Assert(uidIter == null, "Expected uidIter == null for operation == " + operation);

                AddFile(writer, file);
                return;
            }

            // We should only get here with an open uidIter.
            Debug.Assert(uidIter != null, "Expected uidIter != null for operation " + operation);

            var uid = HTMLDocument.Uid(file); // construct uid for doc

            // Skip over (and, when removing stale documents, delete) every
            // indexed uid that sorts before the current file's uid.
            while (IsAtUidTerm(uidIter) && String.CompareOrdinal(uidIter.Term.Text, uid) < 0)
            {
                if (operation == Operation.RemoveStale)
                {
                    DeleteCurrentTerm(writer, uidIter);
                }
                uidIter.Next();
            }

            // The uidIter TermEnum should now be pointing at either
            // 1) a null term, meaning there are no more uids to check.
            // 2) a term matching the current file.
            // 3) a term not matching us.
            if (IsAtUidTerm(uidIter) && String.CompareOrdinal(uidIter.Term.Text, uid) == 0)
            {
                // uidIter points to the current document, we should move one
                // step ahead to keep state consistent, and carry on.
                uidIter.Next();
            }
            else if (operation == Operation.IncrementalReindex)
            {
                // uidIter does not point to the current document, and we're
                // currently indexing documents.
                AddFile(writer, file);
            }
        }

        /// <summary>Returns true when the file name ends in .html, .htm or .txt (case-sensitive).</summary>
        private static bool HasIndexableExtension(FileInfo file)
        {
            var name = file.FullName;
            return name.EndsWith(".html", StringComparison.Ordinal)
                || name.EndsWith(".htm", StringComparison.Ordinal)
                || name.EndsWith(".txt", StringComparison.Ordinal);
        }

        /// <summary>Returns true while the iterator is positioned on a term of the uid field.</summary>
        private static bool IsAtUidTerm(TermEnum uidIter)
        {
            return uidIter.Term != null && uidIter.Term.Field == UidField;
        }

        /// <summary>Logs and deletes the documents matching the iterator's current uid term.</summary>
        private static void DeleteCurrentTerm(IndexWriter writer, TermEnum uidIter)
        {
            Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
            writer.DeleteDocuments(uidIter.Term);
        }

        /// <summary>Builds a document for the file, logs its path and adds it to the index.</summary>
        private static void AddFile(IndexWriter writer, FileInfo file)
        {
            var doc = HTMLDocument.Document(file);
            Console.Out.WriteLine("adding " + doc.Get("path"));
            writer.AddDocument(doc);
        }

        /// <summary>The kind of pass performed over the directory tree.</summary>
        private enum Operation
        {
            /// <summary>
            /// Indicates an incremental indexing.
            /// </summary>
            IncrementalReindex,

            /// <summary>
            /// Indicates that stale entries in the index should be removed.
            /// </summary>
            RemoveStale,

            /// <summary>
            /// Indicates a complete reindexing.
            /// </summary>
            CompleteReindex
        }
    }
}