Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
HTMLDocument.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.IO;
20 using Lucene.Net.Documents;
21 
22 using HTMLParser = Lucene.Net.Demo.Html.HTMLParser;
23 
namespace Lucene.Net.Demo
{

    /// <summary>A utility for making Lucene Documents for HTML documents.</summary>
    public static class HTMLDocument
    {
        // Platform directory separator; swapped out when building uids and "path" values.
        internal static char dirSep = Path.DirectorySeparatorChar;

        /// <summary>
        /// Builds the unique id of a file: its full path with every directory
        /// separator replaced by '\u0000', followed by '\u0000' and the file's
        /// last-write time (UTC) formatted at second resolution.
        /// </summary>
        /// <param name="f">The file to identify.</param>
        /// <returns>The uid string for <paramref name="f"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="f"/> is null.</exception>
        public static String Uid(FileInfo f)
        {
            if (f == null)
            {
                throw new ArgumentNullException("f");
            }

            // Append path and date into a string in such a way that lexicographic
            // sorting gives the same results as a walk of the file hierarchy. Thus
            // null (\u0000) is used both to separate directory components and to
            // separate the path from the date.
            //
            // The full timestamp is formatted here. DateTime.Millisecond is only the
            // 0-999 millisecond component of the time, so passing it as the time value
            // made the date part identical for every file and every modification.
            var path = f.FullName.Replace(dirSep, '\u0000');
            var stamp = DateTools.DateToString(f.LastWriteTimeUtc, DateTools.Resolution.SECOND);
            return path + "\u0000" + stamp;
        }

        /// <summary>
        /// Converts a uid produced by <see cref="Uid"/> back into a slash-separated
        /// path by replacing every '\u0000' with '/' and dropping everything from the
        /// last separator on (the date suffix).
        /// </summary>
        /// <param name="uid">The uid to convert.</param>
        /// <returns>The path portion of the uid, using '/' as separator.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uid"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="uid"/> contains neither '\u0000' nor '/', so there is no
        /// date suffix to strip.
        /// </exception>
        public static String Uid2url(String uid)
        {
            if (uid == null)
            {
                throw new ArgumentNullException("uid");
            }

            var url = uid.Replace('\u0000', '/'); // replace nulls with slashes
            return url.Substring(0, url.LastIndexOf('/')); // remove date from end
        }

        /// <summary>
        /// Creates a Lucene document for an HTML file with the fields "path",
        /// "modified", "uid", "contents", "summary" and "title".
        /// </summary>
        /// <param name="f">The HTML file to index.</param>
        /// <returns>The populated document.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="f"/> is null.</exception>
        public static Document Document(FileInfo f)
        {
            if (f == null)
            {
                throw new ArgumentNullException("f");
            }

            // make a new, empty document
            Document doc = new Document();

            // Add the url as a field named "path". Use a field that is
            // indexed (i.e. searchable), but don't tokenize the field into words.
            doc.Add(new Field("path", f.FullName.Replace(dirSep, '/'), Field.Store.YES, Field.Index.NOT_ANALYZED));

            // Add the last modified date of the file a field named "modified".
            // Use a field that is indexed (i.e. searchable), but don't tokenize
            // the field into words. The full UTC timestamp is formatted, not just
            // its 0-999 millisecond component.
            doc.Add(new Field("modified", DateTools.DateToString(f.LastWriteTimeUtc, DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.NOT_ANALYZED));

            // Add the uid as a field, so that index can be incrementally maintained.
            // This field is not stored with document, it is indexed, but it is not
            // tokenized prior to indexing.
            doc.Add(new Field("uid", Uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));

            using (var fileStream = f.OpenRead())
            {
                var parser = new HTMLParser(fileStream);

                // Add the tag-stripped contents as a Reader-valued Text field so it will
                // get tokenized and indexed.
                // NOTE(review): this reader is presumably consumed only when the document
                // is indexed, i.e. after the using block has closed fileStream - confirm
                // that HTMLParser no longer needs the stream by then.
                doc.Add(new Field("contents", parser.GetReader()));

                // Add the summary as a field that is stored and returned with
                // hit documents for display.
                doc.Add(new Field("summary", parser.GetSummary(), Field.Store.YES, Field.Index.NO));

                // Add the title as a field that it can be searched and that is stored.
                doc.Add(new Field("title", parser.GetTitle(), Field.Store.YES, Field.Index.ANALYZED));

                // return the document
                return doc;
            }
        }
    }
}