📄 htmlparser.java
字号:
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package org.apache.lucene.demo.html;
import java.io.*;
public class HTMLParser implements HTMLParserConstants {
public static int SUMMARY_LENGTH = 200;
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
int length = 0;
boolean titleComplete = false;
boolean inTitle = false;
boolean inScript = false;
boolean afterTag = false;
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
PipedReader pipeIn = null;
PipedWriter pipeOut;
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (titleComplete || (length > SUMMARY_LENGTH))
break;
wait(10);
}
}
return title.toString().trim();
}
public String getSummary() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
while (true) {
synchronized(this) {
if (summary.length() >= SUMMARY_LENGTH)
break;
wait(10);
}
}
if (summary.length() > SUMMARY_LENGTH)
summary.setLength(SUMMARY_LENGTH);
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.startsWith(tit))
return sum.substring(tit.length());
else
return sum;
}
public Reader getReader() throws IOException {
if (pipeIn == null) {
pipeIn = new PipedReader();
pipeOut = new PipedWriter(pipeIn);
Thread thread = new ParserThread(this);
thread.start(); // start parsing
}
return pipeIn;
}
void addToSummary(String text) {
if (summary.length() < SUMMARY_LENGTH) {
summary.append(text);
if (summary.length() >= SUMMARY_LENGTH) {
synchronized(this) {
notifyAll();
}
}
}
}
void addText(String text) throws IOException {
if (inScript)
return;
if (inTitle)
title.append(text);
else {
addToSummary(text);
if (!titleComplete && !title.equals("")) { // finished title
synchronized(this) {
titleComplete = true; // tell waiting threads
notifyAll();
}
}
}
length += text.length();
pipeOut.write(text);
afterSpace = false;
}
void addSpace() throws IOException {
if (inScript)
return;
if (!afterSpace) {
if (inTitle)
title.append(" ");
else
addToSummary(" ");
String space = afterTag ? eol : " ";
length += space.length();
pipeOut.write(space);
afterSpace = true;
}
}
final public void HTMLDocument() throws ParseException, IOException {
Token t;
label_1:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case TagName:
case DeclName:
case Comment1:
case Comment2:
case Word:
case Entity:
case Space:
case Punct:
;
break;
default:
jj_la1[0] = jj_gen;
break label_1;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case TagName:
Tag();
afterTag = true;
break;
case DeclName:
t = Decl();
afterTag = true;
break;
case Comment1:
case Comment2:
CommentTag();
afterTag = true;
break;
case Word:
t = jj_consume_token(Word);
addText(t.image); afterTag = false;
break;
case Entity:
t = jj_consume_token(Entity);
addText(Entities.decode(t.image)); afterTag = false;
break;
case Punct:
t = jj_consume_token(Punct);
addText(t.image); afterTag = false;
break;
case Space:
jj_consume_token(Space);
addSpace(); afterTag = false;
break;
default:
jj_la1[1] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(0);
}
final public void Tag() throws ParseException, IOException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
} else {
inScript = t1.image.equalsIgnoreCase("<script");
}
label_2:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
;
break;
default:
jj_la1[2] = jj_gen;
break label_2;
}
t1 = jj_consume_token(ArgName);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgEquals:
jj_consume_token(ArgEquals);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgValue:
case ArgQuote1:
case ArgQuote2:
t2 = ArgValue();
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
addText("[" + t2.image + "]");
break;
default:
jj_la1[3] = jj_gen;
;
}
break;
default:
jj_la1[4] = jj_gen;
;
}
}
jj_consume_token(TagEnd);
}
final public Token ArgValue() throws ParseException {
Token t = null;
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgValue:
t = jj_consume_token(ArgValue);
{if (true) return t;}
break;
default:
jj_la1[5] = jj_gen;
if (jj_2_1(2)) {
jj_consume_token(ArgQuote1);
jj_consume_token(CloseQuote1);
{if (true) return t;}
} else {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgQuote1:
jj_consume_token(ArgQuote1);
t = jj_consume_token(Quote1Text);
jj_consume_token(CloseQuote1);
{if (true) return t;}
break;
default:
jj_la1[6] = jj_gen;
if (jj_2_2(2)) {
jj_consume_token(ArgQuote2);
jj_consume_token(CloseQuote2);
{if (true) return t;}
} else {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgQuote2:
jj_consume_token(ArgQuote2);
t = jj_consume_token(Quote2Text);
jj_consume_token(CloseQuote2);
{if (true) return t;}
break;
default:
jj_la1[7] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
}
}
}
throw new Error("Missing return statement in function");
}
final public Token Decl() throws ParseException {
Token t;
t = jj_consume_token(DeclName);
label_3:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
case ArgEquals:
case ArgValue:
case ArgQuote1:
case ArgQuote2:
;
break;
default:
jj_la1[8] = jj_gen;
break label_3;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case ArgName:
jj_consume_token(ArgName);
break;
case ArgValue:
case ArgQuote1:
case ArgQuote2:
ArgValue();
break;
case ArgEquals:
jj_consume_token(ArgEquals);
break;
default:
jj_la1[9] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(TagEnd);
{if (true) return t;}
throw new Error("Missing return statement in function");
}
final public void CommentTag() throws ParseException {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case Comment1:
jj_consume_token(Comment1);
label_4:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case CommentText1:
;
break;
default:
jj_la1[10] = jj_gen;
break label_4;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -