Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
Syns2Index.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.Collections.Generic;
20 using System.IO;
21 using System.Linq;
22 using Lucene.Net.Store;
23 using Analyzer = Lucene.Net.Analysis.Analyzer;
24 using Directory = System.IO.Directory;
25 using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
26 using Document = Lucene.Net.Documents.Document;
27 using Field = Lucene.Net.Documents.Field;
28 using IndexWriter = Lucene.Net.Index.IndexWriter;
29 
30 namespace WorldNet.Net
31 {
32 
/// <summary>
/// Converts the prolog file wn_s.pl from the
/// <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
/// into a Lucene index suitable for looking up synonyms and performing query
/// expansion (<see cref="SynExpand.Expand"/>).
///
/// This has been tested with WordNet 2.0.
///
/// The index has fields named "word" (<see cref="F_WORD"/>)
/// and "syn" (<see cref="F_SYN"/>).
/// <para>
/// The source word (such as 'big') can be looked up in the "word" field, and if
/// present there will be one "syn" field for every synonym. A document can
/// therefore carry <b>multiple</b> fields with the same name; read them all with
/// <see cref="Document.GetValues"/>.
/// </para>
/// <para>
/// While the WordNet file distinguishes groups of synonyms with related
/// meanings, that distinction is not kept here: all groups a word belongs to
/// are merged into a single synonym list.
/// </para>
/// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
/// </summary>
///
/// <seealso href="http://www.cogsci.princeton.edu/~wn/"></seealso>
/// <seealso href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html"></seealso>
/// <seealso href="http://www.hostmon.com/rfc/advanced.jsp"></seealso>
public class Syns2Index
{
    /// <summary>Sink for progress and usage messages (standard output).</summary>
    private static readonly TextWriter o = Console.Out;

    /// <summary>Sink for error messages (standard error).</summary>
    private static readonly TextWriter err = Console.Error;

    /// <summary>Name of the stored field that holds one synonym of the word.</summary>
    public const System.String F_SYN = "syn";

    /// <summary>Name of the indexed field that holds the source word.</summary>
    public const System.String F_WORD = "word";

    /// <summary>Analyzer handed to the <see cref="IndexWriter"/>.</summary>
    private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);

    /// <summary>
    /// Entry point. Takes the prolog file name and the index directory as its
    /// two arguments, parses the prolog file and builds the synonym index.
    /// Terminates the process with exit code 1 on a usage error, an unreadable
    /// prolog file, an already existing index directory, or a malformed line.
    /// </summary>
    /// <param name="args">[0] = path of "wn_s.pl", [1] = directory to create the index in</param>
    [STAThread]
    public static void Main(System.String[] args)
    {
        // get command line arguments
        if (args == null || args.Length != 2)
        {
            Usage();
            Environment.Exit(1);
            return;
        }

        String prologFilename = args[0]; // name of file "wn_s.pl"
        String indexDir = args[1];

        // ensure that the prolog file is readable
        if (!File.Exists(prologFilename))
        {
            err.WriteLine("Error: cannot read Prolog file: " + prologFilename);
            Environment.Exit(1);
        }

        // exit if the target index directory already exists
        if (Directory.Exists(indexDir))
        {
            err.WriteLine("Error: index directory already exists: " + indexDir);
            err.WriteLine("Please specify a name of a non-existent directory");
            Environment.Exit(1);
        }

        // Maps a word to all the "groups" it's in. Sorted with an ordinal
        // comparer so that documents are written in the same order on every
        // machine, whatever the current culture is.
        var word2Nums = new SortedDictionary<string, List<string>>(StringComparer.Ordinal);
        // Maps a group to all the words in it. Only looked up by key, so no ordering is needed.
        var num2Words = new Dictionary<string, List<string>>(StringComparer.Ordinal);

        o.WriteLine("Opening Prolog file " + prologFilename);
        ParseProlog(prologFilename, word2Nums, num2Words);

        // create the index
        o.WriteLine("[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.Count + " and " + num2Words.Count);
        Index(indexDir, word2Nums, num2Words);
    }

    /// <summary>
    /// Reads the prolog file line by line and fills both maps. Words that are
    /// not <see cref="IsDecent">decent</see> are counted and skipped. A line
    /// that is not a well-formed <c>s(...)</c> fact ends the process with exit code 1.
    /// </summary>
    /// <param name="prologFilename">path of the prolog file to read</param>
    /// <param name="word2Nums">receives word -> group ids</param>
    /// <param name="num2Words">receives group id -> words</param>
    private static void ParseProlog(String prologFilename, IDictionary<string, List<string>> word2Nums, IDictionary<string, List<string>> num2Words)
    {
        // number of rejected words
        var ndecent = 0;

        // status output: report at rows 2, 4, 8, ... so the log stays short
        var mod = 1;
        var row = 1;

        // 'using' guarantees both streams are released even when parsing throws
        using (var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read))
        using (var br = new StreamReader(fis, System.Text.Encoding.Default))
        {
            o.WriteLine("[1/2] Parsing " + prologFilename);

            String line;
            while ((line = br.ReadLine()) != null)
            {
                // occasional progress
                if ((++row) % mod == 0) // periodically print out line we read in
                {
                    mod *= 2;
                    o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent);
                }

                // syntax check and parse
                String num;
                String word;
                if (!TryParseSynonymLine(line, out num, out word))
                {
                    err.WriteLine("OUCH: " + line);
                    Environment.Exit(1);
                }

                // make sure is a normal word
                if (!IsDecent(word))
                {
                    ndecent++;
                    continue; // don't store words w/ spaces
                }

                AddToList(word2Nums, word, num); // 1/2: word2Nums map
                AddToList(num2Words, num, word); // 2/2: num2Words map
            }
        }
    }

    /// <summary>
    /// Extracts the group id and the word from one fact of the form
    /// <c>s(group_id,w_num,'word',...)</c>.
    /// </summary>
    /// <param name="line">raw line from the prolog file</param>
    /// <param name="num">the text between "s(" and the first comma; <c>null</c> on failure</param>
    /// <param name="word">the unescaped, lower-cased quoted word; <c>null</c> on failure</param>
    /// <returns><c>true</c> if the line starts with "s(" and contains a comma and a quoted word</returns>
    private static bool TryParseSynonymLine(String line, out String num, out String word)
    {
        num = null;
        word = null;

        if (!line.StartsWith("s(", StringComparison.Ordinal))
        {
            return false;
        }

        var comma = line.IndexOf(',', 2);
        if (comma < 0)
        {
            return false;
        }

        var openQuote = line.IndexOf('\'', comma + 1);
        if (openQuote < 0)
        {
            return false;
        }

        var closeQuote = IndexOfClosingQuote(line, openQuote + 1);
        if (closeQuote < 0)
        {
            return false;
        }

        num = line.Substring(2, comma - 2);
        // Invariant lower-casing: the result must not depend on the current culture
        // (e.g. the Turkish dotless i). A doubled quote is Prolog's escape for one quote.
        word = line.Substring(openQuote + 1, closeQuote - openQuote - 1)
                   .Replace("''", "'")
                   .ToLowerInvariant();
        return true;
    }

    /// <summary>
    /// Finds the quote that terminates a Prolog quoted atom, skipping doubled
    /// quotes (<c>''</c>) which stand for a literal apostrophe inside the atom.
    /// </summary>
    /// <param name="text">text containing the atom</param>
    /// <param name="start">index of the first character after the opening quote</param>
    /// <returns>index of the closing quote, or -1 if the atom is unterminated</returns>
    private static int IndexOfClosingQuote(String text, int start)
    {
        for (var i = start; i < text.Length; i++)
        {
            if (text[i] != '\'')
            {
                continue;
            }
            if (i + 1 < text.Length && text[i + 1] == '\'')
            {
                i++; // escaped apostrophe: step over the second quote as well
                continue;
            }
            return i;
        }
        return -1;
    }

    /// <summary>
    /// Appends <paramref name="value"/> to the list stored under
    /// <paramref name="key"/>, creating the list on first use.
    /// </summary>
    private static void AddToList(IDictionary<string, List<string>> map, String key, String value)
    {
        List<string> list;
        if (!map.TryGetValue(key, out list))
        {
            list = new List<string>();
            map[key] = list;
        }
        list.Add(value);
    }

    /// <summary>
    /// Checks to see if a word contains only alphabetic characters by
    /// checking it one character at a time.
    /// </summary>
    /// <param name="s">string to check </param>
    /// <returns> <c>true</c> if every character is a letter (also true for the empty string)</returns>
    private static bool IsDecent(String s)
    {
        foreach (var c in s)
        {
            if (!Char.IsLetter(c))
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Forms a Lucene index based on the 2 maps. One document is written per
    /// word that has at least one synonym, in the enumeration order of
    /// <paramref name="word2Nums"/>.
    /// </summary>
    /// <param name="indexDir">the directory where the index should be created</param>
    /// <param name="word2Nums">word -> group ids</param>
    /// <param name="num2Words">group id -> words</param>
    private static void Index(String indexDir, IDictionary<string, List<string>> word2Nums, IDictionary<string, List<string>> num2Words)
    {
        var row = 0;
        var mod = 1;

        using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir)))
        {
            var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                writer.UseCompoundFile = true; // why?

                foreach (var g in word2Nums.Keys)
                {
                    var doc = new Document();

                    var n = Index(word2Nums, num2Words, g, doc);
                    if (n > 0)
                    {
                        doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
                        if ((++row % mod) == 0)
                        {
                            o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc);
                            mod *= 2;
                        }
                        writer.AddDocument(doc);
                    }
                }
                o.WriteLine("Optimizing..");
                writer.Optimize();
            }
            finally
            {
                // always close the writer, even when indexing fails part-way
                writer.Close();
            }
        }
    }

    /// <summary>
    /// Given the 2 maps fills a document for 1 word: adds one stored,
    /// non-indexed <see cref="F_SYN"/> field per distinct synonym, in ordinal
    /// sort order. The word itself is never listed as its own synonym.
    /// </summary>
    /// <param name="word2Nums">word -> group ids</param>
    /// <param name="num2Words">group id -> words</param>
    /// <param name="g">the word whose synonyms are collected; must be a key of <paramref name="word2Nums"/></param>
    /// <param name="doc">document that receives the synonym fields</param>
    /// <returns>number of synonym fields added to <paramref name="doc"/></returns>
    private static int Index(IDictionary<string, List<string>> word2Nums, IDictionary<string, List<string>> num2Words, System.String g, Document doc)
    {
        // pass 1: collect every word of every group the word belongs to, without duplicates
        var synonyms = new HashSet<string>(StringComparer.Ordinal);
        foreach (var groupId in word2Nums[g]) // for each key#
        {
            foreach (var member in num2Words[groupId])
            {
                synonyms.Add(member);
            }
        }
        synonyms.Remove(g); // of course a word is its own syn

        // keep them sorted
        var sorted = new List<string>(synonyms);
        sorted.Sort(StringComparer.Ordinal);

        // pass 2: store the synonyms
        var num = 0;
        foreach (var cur in sorted)
        {
            // don't store things like 'pit bull' -> 'american pit bull'
            if (!IsDecent(cur))
            {
                continue;
            }
            num++;
            doc.Add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
        }
        return num;
    }

    /// <summary>Prints the command line syntax to standard output.</summary>
    private static void Usage()
    {
        o.WriteLine("\n\n" + typeof(Syns2Index) + " <prolog file> <index dir>\n\n");
    }

}
292 }