21 using Lucene.Net.Support;
23 namespace Lucene.Net.Demo.Html
25 #pragma warning disable 162,164
// Allocates the lookahead bookkeeping objects: a two-slot jj_2_rtns array
// and the jj_ls LookaheadSuccess instance.
29 private void InitBlock()
31 jj_2_rtns =
new JJCalls[2];
32 jj_ls =
new LookaheadSuccess();
// Target summary length in characters; also used to presize the title and
// summary buffers below.
34 public static int SUMMARY_LENGTH = 200;
// Accumulated title text (initial capacity SUMMARY_LENGTH).
36 internal System.Text.StringBuilder title =
new System.Text.StringBuilder(SUMMARY_LENGTH);
// Accumulated summary text (initial capacity twice SUMMARY_LENGTH).
37 internal System.Text.StringBuilder summary =
new System.Text.StringBuilder(SUMMARY_LENGTH * 2);
// Meta tag name -> content pairs; AddMetaTag() stores entries here.
38 internal System.Collections.Specialized.NameValueCollection metaTags =
new System.Collections.Specialized.NameValueCollection();
// Pending meta tag name and content; AddMetaTag() stores them and resets both to null.
39 internal System.String currentMetaTag = null;
40 internal System.String currentMetaContent = null;
// Running total of the lengths of text and spacing passed through AddText/AddSpace.
41 internal int length = 0;
// Set to true by AddText once a non-empty title has been collected.
42 internal bool titleComplete =
false;
// Set to true by AddToSummary once the summary reaches SUMMARY_LENGTH.
43 internal bool summaryComplete =
false;
// State flags for the tag currently being parsed.
// NOTE(review): where these are assigned is only partly visible in this extract.
44 internal bool inTitle =
false;
45 internal bool inMetaTag =
false;
46 internal bool inStyle =
false;
// When true, AddSpace() emits eol instead of a single blank.
47 internal bool afterTag =
false;
// NOTE(review): no use of afterSpace is visible in this extract.
48 internal bool afterSpace =
false;
// Platform line terminator, emitted by AddSpace() directly after a tag.
49 internal System.String eol = System.Environment.NewLine;
// Reader/writer pair created by GetReader() with the UTF-16BE encoding.
50 internal System.IO.StreamReader pipeIn = null;
51 internal System.IO.StreamWriter pipeOut;
// Backing in-memory pipe and the raw writer over it; both are created by GetReader().
52 private MyPipedInputStream pipeInStream = null;
53 private System.IO.StreamWriter pipeOutStream = null;
// MemoryStream subclass that GetReader() uses as the shared buffer behind
// both the pipe writer and the pipe reader.
// NOTE(review): member bodies are only partly visible in this extract; how
// _readPtr and _writePtr are advanced is not shown.
55 private class MyPipedInputStream : System.IO.MemoryStream
60 public System.IO.Stream BaseStream
// Seeks to the position held in _readPtr, then delegates to MemoryStream.Read.
68 public override int Read(byte[] buffer,
int offset,
int count)
72 base.Seek(_readPtr, System.IO.SeekOrigin.Begin);
73 int x = base.Read(buffer, offset, count);
// Seeks to the position held in _writePtr, then delegates to MemoryStream.Write.
79 public override void Write(byte[] buffer,
int offset,
int count)
83 base.Seek(_writePtr, System.IO.SeekOrigin.Begin);
84 base.Write(buffer, offset, count);
89 public override void Close()
// Polled by GetTitle/GetMetaTags/GetSummary as an alternative condition to stop waiting.
94 public virtual bool Full()
// Creates a parser for a file: opens it read-only as a FileStream and
// delegates to the Stream-based constructor.
// NOTE(review): nothing visible here disposes the FileStream - confirm who owns it.
102 public HTMLParser(System.IO.FileInfo file):this(new System.IO.FileStream(file.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read))
// Returns the collected title with leading and trailing whitespace trimmed.
// NOTE(review): the enclosing loop/lock lines are not visible in this extract.
106 public virtual System.String GetTitle()
// Condition checked before waiting: title finished, or the pipe reports Full().
114 if (titleComplete || pipeInStream.Full())
// Waits on this object's monitor for at most 10 ms per call.
116 System.Threading.Monitor.Wait(
this, TimeSpan.FromMilliseconds(10));
119 return title.ToString().Trim();
// Accessor for the collected meta tags (return statement not visible in this extract).
// NOTE(review): the visible wait condition tests titleComplete, the same flag
// GetTitle uses - confirm that gating meta tags on title completion is intended.
122 public virtual System.Collections.Specialized.NameValueCollection GetMetaTags()
130 if (titleComplete || pipeInStream.Full())
// Waits on this object's monitor for at most 10 ms per call.
132 System.Threading.Monitor.Wait(
this, TimeSpan.FromMilliseconds(10));
// Builds the document summary from the collected summary text.
// NOTE(review): the enclosing loop/lock and the final return lines are not
// visible in this extract.
139 public virtual System.String GetSummary()
// Condition checked before waiting: enough summary text, or the pipe reports Full().
147 if (summary.Length >= SUMMARY_LENGTH || pipeInStream.Full())
// Waits on this object's monitor for at most 10 ms per call.
149 System.Threading.Monitor.Wait(
this, TimeSpan.FromMilliseconds(10));
// Truncates the buffer in place so the summary never exceeds SUMMARY_LENGTH characters.
152 if (summary.Length > SUMMARY_LENGTH)
153 summary.Length = SUMMARY_LENGTH;
155 System.String sum = summary.ToString().Trim();
156 System.String tit = GetTitle();
// Special case: the summary merely starts with the title, or is empty.
// The branch taken for this case is not visible in this extract.
157 if (sum.StartsWith(tit) || sum.Equals(
""))
// Sets up the in-memory pipe through which parsed text is exposed.
// NOTE(review): only the stream set-up is visible in this extract; the
// statements before and after it, including the return, are not shown.
163 public virtual System.IO.StreamReader GetReader()
// Shared in-memory buffer behind both ends of the pipe.
167 pipeInStream =
new MyPipedInputStream();
// Raw writer over the buffer; its BaseStream is reused for pipeOut below.
168 pipeOutStream =
new System.IO.StreamWriter(pipeInStream.BaseStream);
// Reader and writer use the same UTF-16BE encoding.
169 pipeIn =
new System.IO.StreamReader(pipeInStream.BaseStream, System.Text.Encoding.GetEncoding(
"UTF-16BE"));
170 pipeOut =
new System.IO.StreamWriter(pipeOutStream.BaseStream, System.Text.Encoding.GetEncoding(
"UTF-16BE"));
// Appends text to the summary while it is still shorter than SUMMARY_LENGTH.
// Once the limit is reached, marks the summary complete and wakes any waiters.
179 internal virtual void AddToSummary(System.String text)
181 if (summary.Length < SUMMARY_LENGTH)
// The whole text is appended, so the buffer may end up longer than SUMMARY_LENGTH.
183 summary.Append(text);
184 if (summary.Length >= SUMMARY_LENGTH)
188 summaryComplete =
true;
// Wakes every thread waiting on this object's monitor.
189 System.Threading.Monitor.PulseAll(
this);
// Handles a run of document text.
// NOTE(review): several lines of this method are not visible in this extract;
// only the title-completion check and the length update are documented here.
195 internal virtual void AddText(System.String text)
// A non-empty title that is not yet flagged complete gets flagged now.
204 if (!titleComplete && !(title.Length == 0))
209 titleComplete =
true;
// Wakes every thread waiting on this object's monitor.
210 System.Threading.Monitor.PulseAll(
this);
215 length += text.Length;
// Stores the pending meta tag (currentMetaTag -> currentMetaContent) in
// metaTags, then clears both pending values.
221 internal virtual void AddMetaTag()
223 metaTags[currentMetaTag] = currentMetaContent;
224 currentMetaTag = null;
225 currentMetaContent = null;
// Emits whitespace to the pipe and counts it in length.
// NOTE(review): the guard lines before this point are not visible in this extract.
229 internal virtual void AddSpace()
// Directly after a tag the platform newline is used, otherwise a single blank.
238 System.String space = afterTag?eol:
" ";
239 length += space.Length;
240 pipeOut.Write(space);
// Top-level production: consumes the document's elements and forwards text
// and spacing to AddText/AddSpace.
// NOTE(review): loop structure, the sub-production calls and the default
// labels are not visible in this extract.
245 public void HTMLDocument()
// Uses the cached next-token kind; -1 means it has not been computed yet, so Jj_ntk() is called.
250 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
// Token kinds listed together for this switch.
253 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptStart:
254 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagName:
255 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.DeclName:
256 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment1:
257 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment2:
258 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Word:
259 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Entity:
260 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Space:
261 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Punct:
// Second switch on the same token kind: one case per element type.
270 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
273 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagName:
278 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.DeclName:
283 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment1:
284 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment2:
289 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptStart:
// Word: pass the token text to AddText and clear afterTag.
294 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Word:
295 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Word);
296 AddText(t.
image); afterTag =
false;
// Entity: the token is consumed here; what is done with it is not visible in this extract.
299 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Entity:
300 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Entity);
// Punct: handled the same way as Word.
304 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Punct:
305 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Punct);
306 AddText(t.
image); afterTag =
false;
// Space: emit whitespace through AddSpace and clear afterTag.
309 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Space:
310 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Space);
311 AddSpace(); afterTag =
false;
// NOTE(review): presumably the no-match path, forcing a parse error by
// requesting kind -1 - confirm against the full source.
316 Jj_consume_token(- 1);
331 t1 = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagName);
332 System.String tagName = t1.
image.ToLower();
337 inTitle = tagName.ToUpper().Equals(
"<title".ToUpper());
338 inMetaTag = tagName.ToUpper().Equals(
"<META".ToUpper());
339 inStyle = tagName.ToUpper().Equals(
"<STYLE".ToUpper());
340 inImg = tagName.ToUpper().Equals(
"<img".ToUpper());
344 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
347 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgName:
356 t1 = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgName);
357 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
360 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgEquals:
361 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgEquals);
362 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
365 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgValue:
366 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1:
367 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2:
369 if (inImg && t1.
image.ToUpper().Equals(
"alt".ToUpper()) && t2 != null)
370 AddText(
"[" + t2.
image +
"]");
372 if (inMetaTag && (t1.
image.ToUpper().Equals(
"name".ToUpper()) || t1.
image.ToUpper().Equals(
"HTTP-EQUIV".ToUpper())) && t2 != null)
374 currentMetaTag = t2.
image.ToLower();
375 if (currentMetaTag != null && currentMetaContent != null)
380 if (inMetaTag && t1.
image.ToUpper().Equals(
"content".ToUpper()) && t2 != null)
382 currentMetaContent = t2.
image.ToLower();
383 if (currentMetaTag != null && currentMetaContent != null)
408 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagEnd);
414 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
417 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgValue:
418 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgValue);
429 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1);
430 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CloseQuote1);
438 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
441 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1:
442 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1);
443 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Quote1Text);
444 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CloseQuote1);
455 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2);
456 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CloseQuote2);
464 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
467 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2:
468 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2);
469 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Quote2Text);
470 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CloseQuote2);
479 Jj_consume_token(- 1);
491 throw new System.ApplicationException(
"Missing return statement in function");
497 t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.DeclName);
500 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
503 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgName:
504 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgEquals:
505 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgValue:
506 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1:
507 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2:
516 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
519 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgName:
520 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgName);
523 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgValue:
524 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1:
525 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2:
529 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgEquals:
530 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgEquals);
535 Jj_consume_token(- 1);
543 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagEnd);
548 throw new System.ApplicationException(
"Missing return statement in function");
// Production for the two comment forms: an opening token (Comment1 or
// Comment2), the matching comment-text tokens, then the matching end token.
// NOTE(review): loop structure and default labels are not visible in this extract.
551 public void CommentTag()
// Uses the cached next-token kind; -1 means it has not been computed yet, so Jj_ntk() is called.
553 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
// First comment form: Comment1, CommentText1 tokens, CommentEnd1.
556 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment1:
557 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment1);
560 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
563 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CommentText1:
572 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CommentText1);
577 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CommentEnd1);
// Second comment form: Comment2, CommentText2 tokens, CommentEnd2.
580 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment2:
581 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment2);
584 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
587 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CommentText2:
596 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CommentText2);
601 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CommentEnd2);
// NOTE(review): presumably the no-match path, forcing a parse error by
// requesting kind -1 - confirm against the full source.
606 Jj_consume_token(- 1);
// Production for a script element: ScriptStart, ScriptText tokens, then ScriptEnd.
// None of the consumed tokens is assigned to a variable in the visible lines.
// NOTE(review): loop structure is not visible in this extract.
612 public void ScriptTag()
614 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptStart);
// Uses the cached next-token kind; -1 means it has not been computed yet, so Jj_ntk() is called.
617 switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
620 case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptText:
629 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptText);
634 Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptEnd);
// Lookahead entry point number 1: starts a scan of up to xla tokens from the
// current token.
// NOTE(review): the try body and the return lines are not visible in this extract.
637 private bool Jj_2_1(
int xla)
// Sets the lookahead budget and starts both scan cursors at the current token.
639 jj_la = xla; jj_lastpos = jj_scanpos = token;
// LookaheadSuccess is caught here; the handler body is not visible.
644 catch (LookaheadSuccess ls)
// Lookahead entry point number 2: starts a scan of up to xla tokens from the
// current token.
// NOTE(review): the try body and the return lines are not visible in this extract.
654 private bool Jj_2_2(
int xla)
// Sets the lookahead budget and starts both scan cursors at the current token.
656 jj_la = xla; jj_lastpos = jj_scanpos = token;
// LookaheadSuccess is caught here; the handler body is not visible.
661 catch (LookaheadSuccess ls)
// Lookahead check for the token sequence ArgQuote1 followed by CloseQuote1.
// NOTE(review): the return statements are not visible in this extract.
671 private bool Jj_3_1()
673 if (Jj_scan_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote1))
675 if (Jj_scan_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CloseQuote1))
// Lookahead check for the token sequence ArgQuote2 followed by CloseQuote2.
// NOTE(review): the return statements are not visible in this extract.
680 private bool Jj_3_2()
682 if (Jj_scan_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ArgQuote2))
684 if (Jj_scan_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.CloseQuote2))
// Scan cursors used while looking ahead; both are reset to the current token
// by Jj_2_1/Jj_2_2 and advanced by Jj_scan_token.
693 private Token jj_scanpos, jj_lastpos;
// When true, the token-fetching code starts from jj_scanpos instead of token.
695 public bool lookingAhead =
false;
// NOTE(review): no use of jj_semLA is visible in this extract.
696 private bool jj_semLA;
// One slot per entry of jj_la1_0 (14); compared against jj_gen when the
// expected-token set is built for error reporting.
698 private int[] jj_la1 =
new int[14];
// Table of 14 bitmasks. The error-reporting code tests bit j of an entry
// with (jj_la1_0[i] & (1 << j)).
699 private static int[] jj_la1_0;
// Fills jj_la1_0 with its fixed 14-entry table.
700 private static void Jj_la1_0()
702 jj_la1_0 =
new int[]{0x2c7e, 0x2c7e, 0x10000, 0x380000, 0x20000, 0x80000, 0x100000, 0x200000, 0x3b0000, 0x3b0000, 0x8000000, 0x20000000, 0x30, 0x4000};
// Saved lookahead calls, one slot per lookahead routine; allocated with two
// slots in InitBlock() and refilled with fresh JJCalls by the constructors
// and ReInit overloads.
704 private JJCalls[] jj_2_rtns;
// NOTE(review): no use of jj_rescan or jj_gc is visible in this extract.
705 private bool jj_rescan =
false;
706 private int jj_gc = 0;
// Creates a parser over a stream, delegating to the (stream, encoding)
// constructor with a null encoding.
708 public HTMLParser(System.IO.Stream stream):this(stream, null)
// Creates a parser over a stream with an optional encoding name.
// NOTE(review): the try body and several assignments are not visible in this extract.
711 public HTMLParser(System.IO.Stream stream, System.String encoding)
// NOTE(review): an IOException is rethrown as a bare System.Exception
// (same message, original kept as InnerException).
718 catch (System.IO.IOException e)
720 throw new System.Exception(e.Message, e);
// Loops over 14 slots, the length of jj_la1; the loop body is not visible.
726 for (
int i = 0; i < 14; i++)
// Gives every lookahead slot a fresh JJCalls record.
728 for (
int i = 0; i < jj_2_rtns.Length; i++)
729 jj_2_rtns[i] =
new JJCalls();
// Re-initializes the parser on a new stream, delegating to the
// (stream, encoding) overload with a null encoding.
732 public virtual void ReInit(System.IO.Stream stream)
734 ReInit(stream, null);
// Re-initializes the parser on a new stream with an optional encoding name.
// NOTE(review): some assignments are not visible in this extract.
736 public virtual void ReInit(System.IO.Stream stream, System.String encoding)
// Restarts the character stream on the new input; the two trailing arguments are both 1.
740 jj_input_stream.ReInit(stream, encoding, 1, 1);
// NOTE(review): an IOException is rethrown as a bare System.Exception
// (same message, original kept as InnerException).
742 catch (System.IO.IOException e)
744 throw new System.Exception(e.Message, e);
// Re-attaches the token source to the re-initialized character stream.
746 token_source.ReInit(jj_input_stream);
// Loops over 14 slots, the length of jj_la1; the loop body is not visible.
750 for (
int i = 0; i < 14; i++)
// Gives every lookahead slot a fresh JJCalls record.
752 for (
int i = 0; i < jj_2_rtns.Length; i++)
753 jj_2_rtns[i] =
new JJCalls();
764 for (
int i = 0; i < 14; i++)
766 for (
int i = 0; i < jj_2_rtns.Length; i++)
767 jj_2_rtns[i] =
new JJCalls();
// Re-initializes the parser on a new StreamReader.
// NOTE(review): some assignments are not visible in this extract.
770 public virtual void ReInit(System.IO.StreamReader stream)
// Restarts the character stream on the new reader; the two trailing arguments are both 1.
772 jj_input_stream.ReInit(stream, 1, 1);
// Re-attaches the token source to the re-initialized character stream.
773 token_source.ReInit(jj_input_stream);
// Loops over 14 slots, the length of jj_la1; the loop body is not visible.
777 for (
int i = 0; i < 14; i++)
// Gives every lookahead slot a fresh JJCalls record.
779 for (
int i = 0; i < jj_2_rtns.Length; i++)
780 jj_2_rtns[i] =
new JJCalls();
790 for (
int i = 0; i < 14; i++)
792 for (
int i = 0; i < jj_2_rtns.Length; i++)
793 jj_2_rtns[i] =
new JJCalls();
802 for (
int i = 0; i < 14; i++)
804 for (
int i = 0; i < jj_2_rtns.Length; i++)
805 jj_2_rtns[i] =
new JJCalls();
// Advances to the next token and checks that it has the requested kind.
// NOTE(review): the success return, the loop body and the bookkeeping before
// the throw are not visible in this extract.
808 private Token Jj_consume_token(
int kind)
// Remembers the current token in oldToken and checks whether a following
// token is already linked.
811 if ((oldToken = token).next != null)
// Fetches a new token from the token source, links it as next and makes it current.
814 token = token.
next = token_source.GetNextToken();
816 if (token.kind == kind)
// Walks the saved lookahead call records in jj_2_rtns.
822 for (
int i = 0; i < jj_2_rtns.Length; i++)
824 JJCalls c = jj_2_rtns[i];
// NOTE(review): presumably reached only when the kinds differ - confirm.
837 throw GenerateParseException();
// Exception type caught by the lookahead routines (Jj_2_1, Jj_2_2,
// Jj_rescan_token). The class body is not visible in this extract.
841 private sealed
class LookaheadSuccess:System.ApplicationException
// Single instance, created in InitBlock().
845 private LookaheadSuccess jj_ls;
// Advances the lookahead scan cursor by one token and compares that token's
// kind with the requested kind.
// NOTE(review): the return/throw statements and some bookkeeping lines are
// not visible in this extract.
846 private bool Jj_scan_token(
int kind)
// Case: the scan cursor is at the furthest position looked at so far.
848 if (jj_scanpos == jj_lastpos)
// No following token linked yet: fetch one from the token source and link it.
851 if (jj_scanpos.next == null)
853 jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.GetNextToken();
// Otherwise step both cursors to the token that is already linked.
857 jj_lastpos = jj_scanpos = jj_scanpos.next;
// Only the scan cursor is advanced here.
862 jj_scanpos = jj_scanpos.next;
// Walks from the current token towards the scan cursor, then records
// (kind, i) as an expected token for error reporting.
866 int i = 0;
Token tok = token;
867 while (tok != null && tok != jj_scanpos)
872 Jj_add_error_token(kind, i);
874 if (jj_scanpos.kind != kind)
// Lookahead budget is zero and the cursor sits at the furthest scanned token.
876 if (jj_la == 0 && jj_scanpos == jj_lastpos)
883 if (token.next != null)
886 token = token.
next = token_source.GetNextToken();
894 Token t = lookingAhead?jj_scanpos:token;
895 for (
int i = 0; i < index; i++)
900 t = t.
next = token_source.GetNextToken();
907 if ((jj_nt = token.next) == null)
908 return (jj_ntk = (token.next = token_source.GetNextToken()).kind);
910 return (jj_ntk = jj_nt.kind);
// Expected-token sequences (int[] entries) gathered for error reporting;
// wrapped with ArrayList.Synchronized.
913 private System.Collections.ArrayList jj_expentries = System.Collections.ArrayList.Synchronized(
new System.Collections.ArrayList(10));
// Scratch entry that is built and then added to jj_expentries.
914 private int[] jj_expentry;
// Used as an index into the la1tokens array when the expected-token set is built.
915 private int jj_kind = - 1;
// Buffer of token kinds recorded by Jj_add_error_token; jj_endpos is the
// number of valid entries in it.
916 private int[] jj_lasttokens =
new int[100];
917 private int jj_endpos;
// Records a token kind seen at lookahead position pos, and moves completed
// sequences from jj_lasttokens into jj_expentries.
// NOTE(review): guard lines and the duplicate-detection flags are not
// visible in this extract.
919 private void Jj_add_error_token(
int kind,
int pos)
// pos directly extends the current sequence: append the kind.
923 if (pos == jj_endpos + 1)
925 jj_lasttokens[jj_endpos++] = kind;
// Otherwise, if a sequence has been collected, copy it into a new entry.
927 else if (jj_endpos != 0)
929 jj_expentry =
new int[jj_endpos];
930 for (
int i = 0; i < jj_endpos; i++)
932 jj_expentry[i] = jj_lasttokens[i];
// Compares the new entry element by element against each stored entry of the same length.
935 for (System.Collections.IEnumerator e = jj_expentries.GetEnumerator(); e.MoveNext(); )
937 int[] oldentry = (
int[]) (e.Current);
938 if (oldentry.Length == jj_expentry.Length)
941 for (
int i = 0; i < jj_expentry.Length; i++)
943 if (oldentry[i] != jj_expentry[i])
// NOTE(review): presumably added only when no identical entry exists - confirm.
954 jj_expentries.Add(jj_expentry);
// Sets jj_endpos to pos and stores kind at index pos - 1.
956 jj_lasttokens[(jj_endpos = pos) - 1] = kind;
962 jj_expentries.Clear();
963 bool[] la1tokens =
new bool[31];
964 for (
int i = 0; i < 31; i++)
966 la1tokens[i] =
false;
970 la1tokens[jj_kind] =
true;
973 for (
int i = 0; i < 14; i++)
975 if (jj_la1[i] == jj_gen)
977 for (
int j = 0; j < 32; j++)
979 if ((jj_la1_0[i] & (1 << j)) != 0)
986 for (
int i = 0; i < 31; i++)
990 jj_expentry =
new int[1];
992 jj_expentries.Add(jj_expentry);
997 Jj_add_error_token(0, 0);
998 int[][] exptokseq =
new int[jj_expentries.Count][];
999 for (
int i = 0; i < jj_expentries.Count; i++)
1001 exptokseq[i] = (
int[]) jj_expentries[i];
1003 return new ParseException(token, exptokseq, Lucene.Net.Demo.Html.HTMLParserConstants_Fields.tokenImage);
// Public tracing switch; the body is not visible in this extract.
1006 public void Enable_tracing()
// Public tracing switch; the body is not visible in this extract.
1010 public void Disable_tracing()
// Replays the saved lookahead calls for both slots of jj_2_rtns.
// NOTE(review): loop and guard lines are only partly visible in this extract.
1014 private void Jj_rescan_token()
// Two slots, matching the two lookahead routines.
1017 for (
int i = 0; i < 2; i++)
1021 JJCalls p = jj_2_rtns[i];
// Restores the lookahead budget and both scan cursors from the saved call.
1026 jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
// Slot 0 re-runs Jj_3_1, slot 1 re-runs Jj_3_2; their results are discarded.
1030 case 0: Jj_3_1();
break;
1032 case 1: Jj_3_2();
break;
// LookaheadSuccess is caught here; the handler body is not visible.
1039 catch (LookaheadSuccess ls)
// Saves the state of lookahead call number index so that it can be replayed later.
// NOTE(review): the loop body is only partly visible in this extract.
1046 private void Jj_save(
int index,
int xla)
1048 JJCalls p = jj_2_rtns[index];
// Walks the slot's chain while the record's gen is greater than jj_gen.
1049 while (p.gen > jj_gen)
// Appends a new record to the chain, makes it current and leaves the loop.
1053 p = p.next =
new JJCalls();
break;
// Stores the generation stamp, the current token and the lookahead limit.
1057 p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
1060 internal sealed
class JJCalls
1063 internal Token first;
1065 internal JJCalls next;