Lucene.Net  3.0.3
Lucene.Net is a .NET port of the Java Lucene Indexing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Properties
TokenSources.cs
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /*
19 * Created on 28-Oct-2004
20 */
21 
22 using System;
23 using System.Collections.Generic;
24 using System.IO;
25 using System.Linq;
26 using Lucene.Net.Analysis;
27 using Lucene.Net.Analysis.Tokenattributes;
28 using Lucene.Net.Documents;
29 using Lucene.Net.Index;
30 
31 namespace Lucene.Net.Search.Highlight
32 {
33 
/// <summary>
/// Convenience methods for obtaining a <c>TokenStream</c> for use with the
/// highlighter - can obtain it from a stored term vector or by re-analyzing
/// the stored field content.
/// </summary>
public class TokenSources
{
    /// <summary>
    /// A <c>TokenStream</c> that replays a fixed, pre-built array of
    /// <c>Token</c>s, exposing each token's term text and offsets through the
    /// attribute API.
    /// NOTE(review): the declaration line of this nested class was lost in
    /// extraction; restored as deriving from <c>TokenStream</c> - the body
    /// overrides <c>IncrementToken</c>/<c>Dispose(bool)</c> and calls
    /// <c>AddAttribute</c>/<c>ClearAttributes</c>, all TokenStream members.
    /// </summary>
    public class StoredTokenStream : TokenStream
    {
        protected internal Token[] tokens;       // tokens to replay, in emission order
        protected internal int currentToken = 0; // index of the next token to emit
        protected internal ITermAttribute termAtt;
        protected internal IOffsetAttribute offsetAtt;

        /// <summary>Wraps the given token array; does not copy it.</summary>
        protected internal StoredTokenStream(Token[] tokens)
        {
            this.tokens = tokens;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        /// <summary>
        /// Advances to the next stored token, copying its term text and
        /// start/end offsets into the attributes.
        /// </summary>
        /// <returns>false once all tokens have been emitted</returns>
        public override bool IncrementToken()
        {
            if (currentToken >= tokens.Length)
            {
                return false;
            }
            ClearAttributes();
            Token token = tokens[currentToken++];
            termAtt.SetTermBuffer(token.Term);
            offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
            return true;
        }

        protected override void Dispose(bool disposing)
        {
            // No resources to release; tokens are plain in-memory objects.
        }
    }
72 
85  public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Document doc,
86  Analyzer analyzer)
87  {
88  TokenStream ts = null;
89 
90  var tfv = reader.GetTermFreqVector(docId, field);
91  if (tfv != null)
92  {
93  var termPositionVector = tfv as TermPositionVector;
94  if (termPositionVector != null)
95  {
96  ts = GetTokenStream(termPositionVector);
97  }
98  }
99  //No token info stored so fall back to analyzing raw content
100  return ts ?? GetTokenStream(doc, field, analyzer);
101  }
102 
109  public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
110  {
111  TokenStream ts = null;
112 
113  var tfv = reader.GetTermFreqVector(docId, field);
114  if (tfv != null)
115  {
116  var termPositionVector = tfv as TermPositionVector;
117  if (termPositionVector != null)
118  {
119  ts = GetTokenStream(termPositionVector);
120  }
121  }
122  //No token info stored so fall back to analyzing raw content
123  return ts ?? GetTokenStream(reader, docId, field, analyzer);
124  }
125 
126  public static TokenStream GetTokenStream(TermPositionVector tpv)
127  {
128  //assumes the worst and makes no assumptions about token position sequences.
129  return GetTokenStream(tpv, false);
130  }
131 
157  public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
158  {
159  //code to reconstruct the original sequence of Tokens
160  String[] terms = tpv.GetTerms();
161  int[] freq = tpv.GetTermFrequencies();
162 
163  int totalTokens = freq.Sum();
164 
165  var tokensInOriginalOrder = new Token[totalTokens];
166  List<Token> unsortedTokens = null;
167  for (int t = 0; t < freq.Length; t++)
168  {
169  TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
170  if (offsets == null)
171  {
172  return null;
173  }
174 
175  int[] pos = null;
176  if (tokenPositionsGuaranteedContiguous)
177  {
178  //try get the token position info to speed up assembly of tokens into sorted sequence
179  pos = tpv.GetTermPositions(t);
180  }
181  if (pos == null)
182  {
183  //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
184  if (unsortedTokens == null)
185  {
186  unsortedTokens = new List<Token>();
187  }
188 
189  foreach (TermVectorOffsetInfo t1 in offsets)
190  {
191  var token = new Token(t1.StartOffset, t1.EndOffset);
192  token.SetTermBuffer(terms[t]);
193  unsortedTokens.Add(token);
194  }
195  }
196  else
197  {
198  //We have positions stored and a guarantee that the token position information is contiguous
199 
200  // This may be fast BUT wont work if Tokenizers used which create >1 token in same position or
201  // creates jumps in position numbers - this code would fail under those circumstances
202 
203  //tokens stored with positions - can use this to index straight into sorted array
204  for (int tp = 0; tp < pos.Length; tp++)
205  {
206  var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
207  tokensInOriginalOrder[pos[tp]] = token;
208  }
209  }
210  }
211  //If the field has been stored without position data we must perform a sort
212  if (unsortedTokens != null)
213  {
214  tokensInOriginalOrder = unsortedTokens.ToArray();
215  Array.Sort(tokensInOriginalOrder, (t1, t2) =>
216  {
217  if (t1.StartOffset > t2.EndOffset)
218  return 1;
219  if (t1.StartOffset < t2.StartOffset)
220  return -1;
221  return 0;
222  });
223  }
224  return new StoredTokenStream(tokensInOriginalOrder);
225  }
226 
227  public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
228  {
229  var tfv = reader.GetTermFreqVector(docId, field);
230  if (tfv == null)
231  {
232  throw new ArgumentException(field + " in doc #" + docId
233  + "does not have any term position data stored");
234  }
235  if (tfv is TermPositionVector)
236  {
237  var tpv = (TermPositionVector) reader.GetTermFreqVector(docId, field);
238  return GetTokenStream(tpv);
239  }
240  throw new ArgumentException(field + " in doc #" + docId
241  + "does not have any term position data stored");
242  }
243 
244  //convenience method
245  public static TokenStream GetTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
246  {
247  Document doc = reader.Document(docId);
248  return GetTokenStream(doc, field, analyzer);
249  }
250 
251  public static TokenStream GetTokenStream(Document doc, String field, Analyzer analyzer)
252  {
253  String contents = doc.Get(field);
254  if (contents == null)
255  {
256  throw new ArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
257  }
258  return GetTokenStream(field, contents, analyzer);
259  }
260 
261  //convenience method
262  public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
263  {
264  return analyzer.TokenStream(field, new StringReader(contents));
265  }
266  }
267 }