Lucene.Net  3.0.3
Lucene.Net is a port of the Lucene search engine library, written in C# and targeted at .NET runtime users.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties Pages
Field.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 using System;
19 using System.IO;
20 using TokenStream = Lucene.Net.Analysis.TokenStream;
21 using IndexWriter = Lucene.Net.Index.IndexWriter;
22 using StringHelper = Lucene.Net.Util.StringHelper;
23 
24 namespace Lucene.Net.Documents
25 {
26 
27  /// <summary>A field is a section of a Document. Each field has two parts, a name and a
28  /// value. Values may be free text, provided as a String or as a Reader, or they
29  /// may be atomic keywords, which are not further processed. Such keywords may
30  /// be used to represent dates, urls, etc. Fields are optionally stored in the
31  /// index, so that they may be returned with hits on the document.
32  /// </summary>
33 
34  [Serializable]
35  public sealed class Field:AbstractField, IFieldable
36  {
/// <summary>Specifies whether a field's value is stored in the index.</summary>
public enum Store
{
    /// <summary>Store the original field value in the index. This suits short texts,
    /// such as a document's title, that should be displayed with the results. The value
    /// is kept in its original form, i.e. no analyzer is applied before it is stored.
    /// </summary>
    YES,

    /// <summary>Do not store the field value in the index.</summary>
    NO
}
50 
/// <summary>Specifies whether and how a field's value is indexed.</summary>
public enum Index
{
    /// <summary>Do not index the field value. The field therefore cannot be searched,
    /// but its contents can still be accessed provided it is
    /// <see cref="Field.Store">stored</see>.
    /// </summary>
    NO,

    /// <summary>Index the tokens produced by running the field's value through an
    /// Analyzer. This is useful for common text.
    /// </summary>
    ANALYZED,

    /// <summary>Index the field's value without using an Analyzer, so it can be searched.
    /// Because no analyzer is used, the value is kept as a single term. This is useful
    /// for unique ids such as product numbers.
    /// </summary>
    NOT_ANALYZED,

    /// <summary>Expert: index the field's value without an Analyzer and also disable
    /// the storing of norms. Norms can also be enabled or disabled separately through
    /// <see cref="AbstractField.OmitNorms" />. Without norms, index-time field and
    /// document boosting and field length normalization are disabled. The benefit is
    /// lower memory usage, since norms take up one byte of RAM per indexed field for
    /// every document in the index during searching. Note that once a given field has
    /// been indexed <i>with</i> norms enabled, disabling norms has no effect: for the
    /// saving to apply, every instance of that field must be indexed with
    /// NOT_ANALYZED_NO_NORMS from the beginning.
    /// </summary>
    NOT_ANALYZED_NO_NORMS,

    /// <summary>Expert: index the tokens produced by running the field's value through
    /// an Analyzer, and also disable the storing of norms. See
    /// <see cref="NOT_ANALYZED_NO_NORMS" /> for what norms are and why you may want to
    /// disable them.
    /// </summary>
    ANALYZED_NO_NORMS
}
98 
/// <summary>Specifies whether and how term vectors are kept for a field.</summary>
public enum TermVector
{
    /// <summary>Do not store term vectors.</summary>
    NO,

    /// <summary>Store the term vectors of each document. A term vector is a list of
    /// the document's terms and their number of occurrences in that document.
    /// </summary>
    YES,

    /// <summary>Store the term vector plus token position information.</summary>
    /// <seealso cref="YES" />
    WITH_POSITIONS,

    /// <summary>Store the term vector plus token offset information.</summary>
    /// <seealso cref="YES" />
    WITH_OFFSETS,

    /// <summary>Store the term vector plus both token position and offset information.</summary>
    /// <seealso cref="YES" />
    /// <seealso cref="WITH_POSITIONS" />
    /// <seealso cref="WITH_OFFSETS" />
    WITH_POSITIONS_OFFSETS
}
135 
136 
/// <summary>Gets the value of the field as a string, or <c>null</c> when the
/// underlying field data is not a string (in that case the Reader value or the
/// binary value is used instead).
/// </summary>
public override string StringValue
{
    // "as" yields null for any non-string payload, matching an "is" test followed by a cast.
    get { return fieldsData as string; }
}
145 
/// <summary>Gets the value of the field as a <see cref="TextReader" />, or <c>null</c>
/// when the underlying field data is not a reader (in that case the String value or
/// the binary value is used instead).
/// </summary>
public override TextReader ReaderValue
{
    // "as" yields null for any non-reader payload, matching an "is" test followed by a cast.
    get { return fieldsData as TextReader; }
}
154 
/// <summary>Gets the TokenStream to be used for this field when indexing, or
/// <c>null</c>. When it is <c>null</c>, the Reader value or String value is analyzed
/// to produce the indexed tokens.
/// </summary>
public override TokenStream TokenStreamValue
{
    get
    {
        return this.tokenStream;
    }
}
162 
163 
/// <summary><p/>Expert: replaces the value of this field with a string. This allows a
/// single Field instance to be re-used during indexing, which improves indexing speed
/// by avoiding the GC cost of creating and reclaiming Field instances. Typically a
/// single <see cref="Document" /> instance is re-used as well. This helps most on
/// small documents.<p/>
///
/// <p/>Each Field instance should only be used once within a single
/// <see cref="Document" /> instance. See <a
/// href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
/// for details.<p/>
/// </summary>
/// <param name="value">The new string value.</param>
/// <exception cref="System.ArgumentException">if this field is a binary field</exception>
public void SetValue(string value)
{
    if (!internalIsBinary)
    {
        fieldsData = value;
        return;
    }

    throw new ArgumentException("cannot set a String value on a binary field");
}
184 
/// <summary>Expert: replaces the value of this field with a reader, so a single Field
/// instance can be re-used while indexing.
/// </summary>
/// <param name="value">The new reader value.</param>
/// <exception cref="System.ArgumentException">if this field is a binary field or a stored field</exception>
public void SetValue(TextReader value)
{
    // The binary check takes precedence over the stored check.
    string problem = null;
    if (internalIsBinary)
    {
        problem = "cannot set a Reader value on a binary field";
    }
    else if (internalIsStored)
    {
        problem = "cannot set a Reader value on a stored field";
    }

    if (problem != null)
    {
        throw new ArgumentException(problem);
    }

    fieldsData = value;
}
198 
/// <summary>Expert: replaces the value of this field with a whole byte array, so a
/// single Field instance can be re-used while indexing. The binary offset is reset
/// to 0 and the binary length is set to the length of the array.
/// </summary>
/// <param name="value">The new binary value; must not be <c>null</c>.</param>
/// <exception cref="System.ArgumentException">if this field is not a binary field</exception>
/// <exception cref="System.NullReferenceException">if <c>value</c> is <c>null</c>;
/// the field is left unchanged in that case</exception>
public void SetValue(byte[] value)
{
    if (!internalIsBinary)
    {
        throw new System.ArgumentException("cannot set a byte[] value on a non-binary field");
    }

    // Read the length BEFORE touching any state. If value is null this throws here,
    // so the field never ends up with null data paired with a stale length/offset.
    int newLength = value.Length;

    fieldsData = value;
    internalBinaryLength = newLength;
    internalbinaryOffset = 0;
}
210 
/// <summary>Expert: replaces the value of this field with a slice of a byte array, so
/// a single Field instance can be re-used while indexing. The offset and length are
/// recorded as given; this method does not check them against the array.
/// </summary>
/// <param name="value">The array holding the new binary value.</param>
/// <param name="offset">Starting offset of this field's bytes within <c>value</c>.</param>
/// <param name="length">Number of bytes to use, starting at <c>offset</c>.</param>
/// <exception cref="System.ArgumentException">if this field is not a binary field</exception>
public void SetValue(byte[] value, int offset, int length)
{
    if (internalIsBinary)
    {
        internalbinaryOffset = offset;
        internalBinaryLength = length;
        fieldsData = value;
        return;
    }

    throw new ArgumentException("cannot set a byte[] value on a non-binary field");
}
222 
/// <summary>Expert: sets the token stream to be used for indexing and marks this
/// field as both indexed and tokenized. May be combined with stored values from
/// the String value or the binary value.
/// </summary>
/// <param name="tokenStream">The token stream to use when indexing this field.</param>
public void SetTokenStream(TokenStream tokenStream)
{
    this.tokenStream = tokenStream;
    internalIsTokenized = true;
    internalIsIndexed = true;
}
232 
/// <summary>Creates a field from its name, its string value and the way it is saved
/// in the index. Term vectors are not stored (<c>TermVector.NO</c> is used).
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value">The string to process.</param>
/// <param name="store">Whether <c>value</c> should be stored in the index.</param>
/// <param name="index">Whether the field should be indexed and, if so, whether it
/// should be tokenized before indexing.</param>
/// <exception cref="System.NullReferenceException">if name or value is <c>null</c></exception>
/// <exception cref="System.ArgumentException">if the field is neither stored nor
/// indexed, or if name and value are both empty</exception>
public Field(string name, string value, Store store, Index index)
    : this(name, value, store, index, TermVector.NO)
{
}
252 
/// <summary>Creates a field from its name, its string value and the way it is saved
/// in the index, including the term vector setting. The field name is interned.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value">The string to process.</param>
/// <param name="store">Whether <c>value</c> should be stored in the index.</param>
/// <param name="index">Whether the field should be indexed and, if so, whether it
/// should be tokenized before indexing.</param>
/// <param name="termVector">Whether term vectors should be stored.</param>
/// <exception cref="System.NullReferenceException">if name or value is <c>null</c></exception>
/// <exception cref="System.ArgumentException">in any of the following situations:
/// <list>
/// <item>name and value are both empty</item>
/// <item>the field is neither stored nor indexed</item>
/// <item>the field is not indexed but termVector is not <c>TermVector.NO</c></item>
/// </list>
/// </exception>
public Field(string name, string value, Store store, Index index, TermVector termVector)
    : this(name, true, value, store, index, termVector)
{
}
279 
/// <summary>Creates a field from its name, its string value and the way it is saved
/// in the index, optionally interning the field name.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="internName">Whether to intern <c>name</c> or not.</param>
/// <param name="value">The string to process.</param>
/// <param name="store">Whether <c>value</c> should be stored in the index.</param>
/// <param name="index">Whether the field should be indexed and, if so, whether it
/// should be tokenized before indexing.</param>
/// <param name="termVector">Whether term vectors should be stored.</param>
/// <exception cref="System.NullReferenceException">if name or value is <c>null</c></exception>
/// <exception cref="System.ArgumentException">in any of the following situations:
/// <list>
/// <item>name and value are both empty</item>
/// <item>the field is neither stored nor indexed</item>
/// <item>the field is not indexed but termVector is not <c>TermVector.NO</c></item>
/// </list>
/// </exception>
public Field(string name, bool internName, string value, Store store, Index index, TermVector termVector)
{
    // Null arguments are reported with NullReferenceException; callers may rely on that type.
    if (name == null)
    {
        throw new NullReferenceException("name cannot be null");
    }
    if (value == null)
    {
        throw new NullReferenceException("value cannot be null");
    }
    if (name.Length == 0 && value.Length == 0)
    {
        throw new ArgumentException("name and value cannot both be empty");
    }

    bool notIndexed = index == Index.NO;
    if (notIndexed && store == Store.NO)
    {
        throw new ArgumentException("it doesn't make sense to have a field that " + "is neither indexed nor stored");
    }
    if (notIndexed && termVector != TermVector.NO)
    {
        throw new ArgumentException("cannot store term vector information " + "for a field that is not indexed");
    }

    // Field names are optionally interned.
    internalName = internName ? StringHelper.Intern(name) : name;
    fieldsData = value;

    // Translate the enum settings into the individual flags.
    internalIsStored = store.IsStored();
    internalIsIndexed = index.IsIndexed();
    internalIsTokenized = index.IsAnalyzed();
    internalOmitNorms = index.OmitNorms();

    if (notIndexed)
    {
        internalOmitTermFreqAndPositions = false;
    }

    internalIsBinary = false;

    SetStoreTermVector(termVector);
}
340 
/// <summary>Creates a tokenized and indexed field that is not stored. Term vectors
/// are not stored. The Reader is read only when the Document is added to the index,
/// i.e. you may not close the Reader until
/// <see cref="IndexWriter.AddDocument(Document)" /> has been called.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="reader">The reader with the content.</param>
/// <exception cref="System.NullReferenceException">if name or reader is <c>null</c></exception>
public Field(string name, TextReader reader)
    : this(name, reader, TermVector.NO)
{
}
355 
/// <summary>Creates a tokenized and indexed field that is not stored, optionally
/// storing term vectors. The Reader is read only when the Document is added to the
/// index, i.e. you may not close the Reader until
/// <see cref="IndexWriter.AddDocument(Document)" /> has been called.
/// </summary>
/// <param name="name">The name of the field; it is interned.</param>
/// <param name="reader">The reader with the content.</param>
/// <param name="termVector">Whether term vectors should be stored.</param>
/// <exception cref="System.NullReferenceException">if name or reader is <c>null</c></exception>
public Field(string name, TextReader reader, TermVector termVector)
{
    if (name == null)
    {
        throw new NullReferenceException("name cannot be null");
    }
    if (reader == null)
    {
        throw new NullReferenceException("reader cannot be null");
    }

    // Field names are interned.
    internalName = StringHelper.Intern(name);
    fieldsData = reader;

    // A reader-backed field is indexed and tokenized, but neither stored nor binary.
    internalIsBinary = false;
    internalIsStored = false;
    internalIsIndexed = true;
    internalIsTokenized = true;

    SetStoreTermVector(termVector);
}
388 
/// <summary>Creates a tokenized and indexed field that is not stored. Term vectors
/// are not stored. This is useful for pre-analyzed fields. The TokenStream is read
/// only when the Document is added to the index, i.e. you may not close the
/// TokenStream until <see cref="IndexWriter.AddDocument(Document)" /> has been called.
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="tokenStream">The TokenStream with the content.</param>
/// <exception cref="System.NullReferenceException">if name or tokenStream is <c>null</c></exception>
public Field(string name, TokenStream tokenStream)
    : this(name, tokenStream, TermVector.NO)
{
}
404 
/// <summary>Creates a tokenized and indexed field that is not stored, optionally
/// storing term vectors. This is useful for pre-analyzed fields. The TokenStream is
/// read only when the Document is added to the index, i.e. you may not close the
/// TokenStream until <see cref="IndexWriter.AddDocument(Document)" /> has been called.
/// </summary>
/// <param name="name">The name of the field; it is interned.</param>
/// <param name="tokenStream">The TokenStream with the content.</param>
/// <param name="termVector">Whether term vectors should be stored.</param>
/// <exception cref="System.NullReferenceException">if name or tokenStream is <c>null</c></exception>
public Field(string name, TokenStream tokenStream, TermVector termVector)
{
    if (name == null)
    {
        throw new NullReferenceException("name cannot be null");
    }
    if (tokenStream == null)
    {
        throw new NullReferenceException("tokenStream cannot be null");
    }

    // Field names are interned.
    internalName = StringHelper.Intern(name);

    // The content comes from the token stream, so there is no field data payload.
    this.tokenStream = tokenStream;
    fieldsData = null;

    // A stream-backed field is indexed and tokenized, but neither stored nor binary.
    internalIsBinary = false;
    internalIsStored = false;
    internalIsIndexed = true;
    internalIsTokenized = true;

    SetStoreTermVector(termVector);
}
439 
440 
/// <summary>Creates a stored field with a binary value, using the whole array
/// (offset 0, length equal to the array length).
/// </summary>
/// <param name="name">The name of the field.</param>
/// <param name="value_Renamed">The binary value.</param>
/// <param name="store">How <c>value_Renamed</c> should be stored; must not be <c>Store.NO</c>.</param>
/// <exception cref="System.ArgumentException">if name or value_Renamed is <c>null</c>,
/// or if store is <c>Store.NO</c></exception>
public Field(System.String name, byte[] value_Renamed, Store store)
    // The length is computed null-safely: dereferencing a null array here would throw
    // NullReferenceException before the target constructor could report
    // "value cannot be null" through its own argument check.
    : this(name, value_Renamed, 0, value_Renamed == null ? 0 : value_Renamed.Length, store)
{
}
454 
/// <summary>Creates a stored field with a binary value taken from a slice of a byte
/// array. The field is neither indexed nor tokenized, and no term vectors are stored.
/// The offset and length are recorded as given; this constructor does not check them
/// against the array.
/// </summary>
/// <param name="name">The name of the field; it is interned.</param>
/// <param name="value_Renamed">The array holding the binary value.</param>
/// <param name="offset">Starting offset in <c>value_Renamed</c> where this Field's bytes are.</param>
/// <param name="length">Number of bytes to use for this Field, starting at <c>offset</c>.</param>
/// <param name="store">How the value should be stored; must not be <c>Store.NO</c>.</param>
/// <exception cref="System.ArgumentException">if name or value_Renamed is <c>null</c>,
/// or if store is <c>Store.NO</c></exception>
public Field(string name, byte[] value_Renamed, int offset, int length, Store store)
{
    if (name == null)
    {
        throw new ArgumentException("name cannot be null");
    }
    if (value_Renamed == null)
    {
        throw new ArgumentException("value cannot be null");
    }

    // Field names are interned.
    internalName = StringHelper.Intern(name);
    fieldsData = value_Renamed;

    // A binary field only makes sense when it is stored.
    if (store == Store.NO)
    {
        throw new ArgumentException("binary values can't be unstored");
    }

    internalIsStored = store.IsStored();

    // Binary fields are never indexed or tokenized.
    internalIsIndexed = false;
    internalIsTokenized = false;
    internalOmitTermFreqAndPositions = false;
    internalOmitNorms = true;

    internalIsBinary = true;
    internalbinaryOffset = offset;
    internalBinaryLength = length;

    SetStoreTermVector(TermVector.NO);
}
495  }
496 
/// <summary>Helpers that translate between the <see cref="Field.Store" />,
/// <see cref="Field.Index" /> and <see cref="Field.TermVector" /> enums and the
/// individual boolean flags they stand for.
/// </summary>
public static class FieldExtensions
{
    /// <summary>Tells whether the given store setting means the value is stored.</summary>
    /// <param name="store">The store setting to inspect.</param>
    /// <returns><c>true</c> for <c>YES</c>, <c>false</c> for <c>NO</c>.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>store</c> is not a defined value</exception>
    public static bool IsStored(this Field.Store store)
    {
        if (store == Field.Store.YES)
        {
            return true;
        }
        if (store == Field.Store.NO)
        {
            return false;
        }
        throw new ArgumentOutOfRangeException("store", "Invalid value for Field.Store");
    }

    /// <summary>Tells whether the given index setting means the field is indexed.</summary>
    /// <param name="index">The index setting to inspect.</param>
    /// <returns><c>false</c> for <c>NO</c>, <c>true</c> for every other defined value.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>index</c> is not a defined value</exception>
    public static bool IsIndexed(this Field.Index index)
    {
        if (index == Field.Index.NO)
        {
            return false;
        }
        if (index == Field.Index.ANALYZED
            || index == Field.Index.NOT_ANALYZED
            || index == Field.Index.NOT_ANALYZED_NO_NORMS
            || index == Field.Index.ANALYZED_NO_NORMS)
        {
            return true;
        }
        throw new ArgumentOutOfRangeException("index", "Invalid value for Field.Index");
    }

    /// <summary>Tells whether the given index setting means the value is analyzed.</summary>
    /// <param name="index">The index setting to inspect.</param>
    /// <returns><c>true</c> for <c>ANALYZED</c> and <c>ANALYZED_NO_NORMS</c>,
    /// <c>false</c> for every other defined value.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>index</c> is not a defined value</exception>
    public static bool IsAnalyzed(this Field.Index index)
    {
        if (index == Field.Index.NO
            || index == Field.Index.NOT_ANALYZED
            || index == Field.Index.NOT_ANALYZED_NO_NORMS)
        {
            return false;
        }
        if (index == Field.Index.ANALYZED || index == Field.Index.ANALYZED_NO_NORMS)
        {
            return true;
        }
        throw new ArgumentOutOfRangeException("index", "Invalid value for Field.Index");
    }

    /// <summary>Tells whether the given index setting means norms are omitted.</summary>
    /// <param name="index">The index setting to inspect.</param>
    /// <returns><c>false</c> for <c>ANALYZED</c> and <c>NOT_ANALYZED</c>,
    /// <c>true</c> for every other defined value (including <c>NO</c>).</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>index</c> is not a defined value</exception>
    public static bool OmitNorms(this Field.Index index)
    {
        if (index == Field.Index.ANALYZED || index == Field.Index.NOT_ANALYZED)
        {
            return false;
        }
        if (index == Field.Index.NO
            || index == Field.Index.NOT_ANALYZED_NO_NORMS
            || index == Field.Index.ANALYZED_NO_NORMS)
        {
            return true;
        }
        throw new ArgumentOutOfRangeException("index", "Invalid value for Field.Index");
    }

    /// <summary>Tells whether the given term vector setting means term vectors are stored.</summary>
    /// <param name="tv">The term vector setting to inspect.</param>
    /// <returns><c>false</c> for <c>NO</c>, <c>true</c> for every other defined value.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>tv</c> is not a defined value</exception>
    public static bool IsStored(this Field.TermVector tv)
    {
        if (tv == Field.TermVector.NO)
        {
            return false;
        }
        if (tv == Field.TermVector.YES
            || tv == Field.TermVector.WITH_OFFSETS
            || tv == Field.TermVector.WITH_POSITIONS
            || tv == Field.TermVector.WITH_POSITIONS_OFFSETS)
        {
            return true;
        }
        throw new ArgumentOutOfRangeException("tv", "Invalid value for Field.TermVector");
    }

    /// <summary>Tells whether the given term vector setting includes token positions.</summary>
    /// <param name="tv">The term vector setting to inspect.</param>
    /// <returns><c>true</c> for <c>WITH_POSITIONS</c> and <c>WITH_POSITIONS_OFFSETS</c>,
    /// <c>false</c> for every other defined value.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>tv</c> is not a defined value</exception>
    public static bool WithPositions(this Field.TermVector tv)
    {
        if (tv == Field.TermVector.NO
            || tv == Field.TermVector.YES
            || tv == Field.TermVector.WITH_OFFSETS)
        {
            return false;
        }
        if (tv == Field.TermVector.WITH_POSITIONS || tv == Field.TermVector.WITH_POSITIONS_OFFSETS)
        {
            return true;
        }
        throw new ArgumentOutOfRangeException("tv", "Invalid value for Field.TermVector");
    }

    /// <summary>Tells whether the given term vector setting includes token offsets.</summary>
    /// <param name="tv">The term vector setting to inspect.</param>
    /// <returns><c>true</c> for <c>WITH_OFFSETS</c> and <c>WITH_POSITIONS_OFFSETS</c>,
    /// <c>false</c> for every other defined value.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">if <c>tv</c> is not a defined value</exception>
    public static bool WithOffsets(this Field.TermVector tv)
    {
        if (tv == Field.TermVector.NO
            || tv == Field.TermVector.YES
            || tv == Field.TermVector.WITH_POSITIONS)
        {
            return false;
        }
        if (tv == Field.TermVector.WITH_OFFSETS || tv == Field.TermVector.WITH_POSITIONS_OFFSETS)
        {
            return true;
        }
        throw new ArgumentOutOfRangeException("tv", "Invalid value for Field.TermVector");
    }

    /// <summary>Builds the <see cref="Field.Index" /> value matching the given flags,
    /// with norms kept (delegates with <c>omitNorms</c> set to <c>false</c>).
    /// </summary>
    /// <param name="indexed">Whether the field is indexed.</param>
    /// <param name="analyed">Whether the field is analyzed. The spelling of the
    /// parameter name is kept as-is so existing named-argument callers keep compiling.</param>
    /// <returns>The matching <see cref="Field.Index" /> value.</returns>
    public static Field.Index ToIndex(bool indexed, bool analyed)
    {
        return ToIndex(indexed, analyed, false);
    }

    /// <summary>Builds the <see cref="Field.Index" /> value matching the given flags.</summary>
    /// <param name="indexed">Whether the field is indexed; when <c>false</c> the other
    /// flags are ignored and <c>NO</c> is returned.</param>
    /// <param name="analyzed">Whether the field is analyzed.</param>
    /// <param name="omitNorms">Whether norms are omitted.</param>
    /// <returns>The matching <see cref="Field.Index" /> value.</returns>
    public static Field.Index ToIndex(bool indexed, bool analyzed, bool omitNorms)
    {
        // If it is not indexed nothing else matters.
        if (!indexed)
        {
            return Field.Index.NO;
        }

        // Expert: norms omitted.
        if (omitNorms)
        {
            return analyzed ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NOT_ANALYZED_NO_NORMS;
        }

        // Typical, non-expert settings.
        return analyzed ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED;
    }

    /// <summary>Gets the best representation of a TermVector given the flags.</summary>
    /// <param name="stored">Whether term vectors are stored; when <c>false</c> the other
    /// flags are ignored and <c>NO</c> is returned.</param>
    /// <param name="withOffsets">Whether token offsets are included.</param>
    /// <param name="withPositions">Whether token positions are included.</param>
    /// <returns>The matching <see cref="Field.TermVector" /> value.</returns>
    public static Field.TermVector ToTermVector(bool stored, bool withOffsets, bool withPositions)
    {
        // If it is not stored, nothing else matters.
        if (!stored)
        {
            return Field.TermVector.NO;
        }

        if (withOffsets)
        {
            return withPositions ? Field.TermVector.WITH_POSITIONS_OFFSETS : Field.TermVector.WITH_OFFSETS;
        }

        return withPositions ? Field.TermVector.WITH_POSITIONS : Field.TermVector.YES;
    }
}
667 }