📄 htmlparser.java
字号:
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
package com.laoer.bbscs.lucene.html;
import java.io.*;
import java.util.Properties;
public class HTMLParser
implements HTMLParserConstants {
/** Number of summary characters to collect; also used as the initial capacity of the title buffer. */
public static int SUMMARY_LENGTH = 200;
// Text appended by addText/addSpace while inTitle is set.
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
// Text appended by addToSummary until SUMMARY_LENGTH characters have been collected.
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
// Meta tag name -> content pairs stored by addMetaTag().
Properties metaTags = new Properties();
// Name and content of the meta tag currently being assembled by Tag(); both are reset by addMetaTag().
String currentMetaTag = null;
String currentMetaContent = null;
// Running count of characters written to pipeOut (updated in addText and addSpace).
int length = 0;
// Set (under this object's monitor) by addText; polled by getTitle() and getMetaTags().
boolean titleComplete = false;
// The next three flags are recomputed by Tag() from the most recent tag name.
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
// True when the last item handled by HTMLDocument() was a tag, declaration, comment or script.
boolean afterTag = false;
// True when the last output was a space (set by addSpace, cleared by addText).
boolean afterSpace = false;
// Platform line separator; addSpace() writes it instead of " " when afterTag is set.
String eol = System.getProperty("line.separator");
// Reader handed out by getReader(); stays null until getReader() is first called.
Reader pipeIn = null;
// Writer that addText/addSpace write the extracted text to; created in getReader().
Writer pipeOut;
// The two ends of the pipe underlying pipeIn and pipeOut; created in getReader().
private MyPipedInputStream pipeInStream = null;
private PipedOutputStream pipeOutStream = null;
/**
 * Piped input stream that can additionally report whether its buffer
 * currently holds at least {@code PIPE_SIZE} unread bytes.
 */
private class MyPipedInputStream extends PipedInputStream {

  /** Creates a stream that is not yet connected to an output side. */
  public MyPipedInputStream() {
    super();
  }

  /**
   * Creates a stream connected to the given output side.
   *
   * @param src the piped output stream to connect to
   * @throws IOException if the connection cannot be established
   */
  public MyPipedInputStream(PipedOutputStream src) throws IOException {
    super(src);
  }

  /**
   * Tells whether the number of bytes available for reading has reached the pipe size.
   *
   * @return {@code true} when {@code available()} is at least {@code PIPE_SIZE}
   * @throws IOException if {@code available()} fails
   */
  public boolean full() throws IOException {
    final int buffered = available();
    return buffered >= PIPE_SIZE;
  }
}
/**
 * Creates a parser for the contents of the given file.
 * Opens a FileInputStream on the file and delegates to the constructor
 * that accepts that stream (declared outside this section of the file).
 *
 * @param file the file to read
 * @throws FileNotFoundException if the file cannot be opened for reading
 */
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
/**
 * Returns the trimmed title text collected so far.
 * Starts the parser via getReader() if that has not happened yet, then polls
 * every 10 ms until either the title is flagged complete or the pipe is full.
 *
 * @return the title with leading and trailing whitespace removed
 * @throws IOException if creating the pipe or querying it fails
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public String getTitle() throws IOException, InterruptedException {
  if (pipeIn == null) {
    getReader(); // first use: this also starts the parsing thread
  }
  boolean ready = false;
  while (!ready) {
    // The monitor is re-acquired on every pass, exactly one check per acquisition.
    synchronized (this) {
      ready = titleComplete || pipeInStream.full();
      if (!ready) {
        wait(10);
      }
    }
  }
  final String collected = title.toString();
  return collected.trim();
}
/**
 * Returns the meta tags collected so far.
 * Starts the parser via getReader() if needed, then polls every 10 ms until
 * either the title is flagged complete or the pipe is full.
 *
 * @return the live Properties object holding meta tag name/content pairs
 * @throws IOException if creating the pipe or querying it fails
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public Properties getMetaTags() throws IOException, InterruptedException {
  if (pipeIn == null) {
    getReader(); // first use: this also starts the parsing thread
  }
  for (boolean done = false; !done; ) {
    synchronized (this) {
      done = titleComplete || pipeInStream.full();
      if (!done) {
        wait(10);
      }
    }
  }
  return metaTags;
}
/**
 * Returns a short summary of the document text.
 * Starts the parser via getReader() if needed, then polls every 10 ms until the
 * summary buffer holds at least SUMMARY_LENGTH characters or the pipe is full.
 * The buffer is cut to SUMMARY_LENGTH characters and trimmed; if the result is
 * empty or begins with the title, the title is returned instead.
 *
 * @return the trimmed summary, or the title in the cases described above
 * @throws IOException if creating the pipe or querying it fails
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public String getSummary() throws IOException, InterruptedException {
  if (pipeIn == null) {
    getReader(); // first use: this also starts the parsing thread
  }
  boolean enough = false;
  while (!enough) {
    synchronized (this) {
      enough = summary.length() >= SUMMARY_LENGTH || pipeInStream.full();
      if (!enough) {
        wait(10);
      }
    }
  }
  if (summary.length() > SUMMARY_LENGTH) {
    summary.setLength(SUMMARY_LENGTH);
  }
  final String summaryText = summary.toString().trim();
  final String titleText = getTitle();
  final boolean preferTitle = summaryText.startsWith(titleText) || summaryText.equals("");
  return preferTitle ? titleText : summaryText;
}
/**
 * Returns the reader from which the extracted plain text can be read.
 * On the first call this creates the pipe, wraps both ends in character
 * streams and starts a ParserThread for this parser; later calls return the
 * same reader.
 *
 * @return the reader side of the text pipe
 * @throws IOException if the pipe cannot be created or connected
 */
public Reader getReader() throws IOException {
  if (pipeIn == null) {
    pipeInStream = new MyPipedInputStream();
    pipeOutStream = new PipedOutputStream(pipeInStream);
    // Both ends use the same fixed charset. With the platform default, any
    // character the default charset cannot encode was replaced on its way
    // through the pipe. UTF-16BE can carry every Java char unchanged.
    pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
    pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
    Thread thread = new ParserThread(this);
    thread.start(); // start parsing
  }
  return pipeIn;
}
/**
 * Appends text to the summary buffer unless it already holds SUMMARY_LENGTH
 * characters. When the append brings the buffer to SUMMARY_LENGTH or beyond,
 * all threads waiting on this object are notified.
 *
 * @param text the text to append
 */
void addToSummary(String text) {
  if (summary.length() >= SUMMARY_LENGTH) {
    return; // summary already has enough text
  }
  summary.append(text);
  if (summary.length() < SUMMARY_LENGTH) {
    return; // still collecting
  }
  synchronized (this) {
    notifyAll();
  }
}
/**
 * Handles a piece of document text.
 * Text is ignored while inStyle is set. Otherwise it is appended to the title
 * (while inTitle is set) or to the summary, counted in length, and written to
 * pipeOut. The first non-title text seen after some title text has been
 * collected marks the title as complete and wakes waiting threads.
 *
 * @param text the text to add
 * @throws IOException if writing to the pipe fails
 */
void addText(String text) throws IOException {
  if (inStyle) {
    return;
  }
  if (inTitle) {
    title.append(text);
  }
  else {
    addToSummary(text);
    // The previous check, !title.equals(""), compared a StringBuffer with a
    // String and was therefore always true, so the title was declared
    // complete even when nothing had been collected yet.
    // NOTE(review): for documents without any title text this branch no longer
    // fires; waiters then depend on the pipe filling up or on the parser
    // thread marking completion at end of parse -- confirm ParserThread does so.
    if (!titleComplete && title.length() > 0) { // finished title
      synchronized (this) {
        titleComplete = true; // tell waiting threads
        notifyAll();
      }
    }
  }
  length += text.length();
  pipeOut.write(text);
  afterSpace = false;
}
/**
 * Stores the pending meta tag (currentMetaTag -> currentMetaContent) in
 * metaTags and clears both pending fields.
 *
 * @throws IOException declared for callers; nothing in the body throws it
 */
void addMetaTag() throws IOException {
  metaTags.setProperty(currentMetaTag, currentMetaContent);
  // Reset the pending pair so the next meta tag starts clean.
  currentMetaContent = null;
  currentMetaTag = null;
}
/**
 * Emits a single separator unless the previous output was already a space.
 * A blank is appended to the title (while inTitle is set) or to the summary;
 * the pipe receives the platform line separator if the previous item was a
 * tag, otherwise a blank.
 *
 * @throws IOException if writing to the pipe fails
 */
void addSpace() throws IOException {
  if (afterSpace) {
    return; // collapse consecutive whitespace
  }
  if (inTitle) {
    title.append(" ");
  }
  else {
    addToSummary(" ");
  }
  final String separator;
  if (afterTag) {
    separator = eol;
  }
  else {
    separator = " ";
  }
  length += separator.length();
  pipeOut.write(separator);
  afterSpace = true;
}
/**
 * Top-level production (JavaCC-generated): consumes document items one at a
 * time until the next token is none of the kinds listed in the first switch,
 * then consumes token kind 0 (presumably the end-of-file token -- confirm
 * against HTMLParserConstants).
 *
 * Tags, declarations, comments and scripts are delegated to their own
 * productions and set afterTag; words, entities and punctuation are passed
 * to addText, and spaces to addSpace, each clearing afterTag.
 *
 * @throws ParseException if the token stream does not match the grammar
 * @throws IOException if writing extracted text to the pipe fails
 */
final public void HTMLDocument() throws ParseException, IOException {
Token t;
label_1:while (true) {
// Lookahead only: decide whether another document item follows.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ScriptStart:
case TagName:
case DeclName:
case Comment1:
case Comment2:
case Word:
case Entity:
case Space:
case Punct:
;
break;
default:
jj_la1[0] = jj_gen;
break label_1;
}
// Dispatch on the kind of the next token.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case TagName:
Tag();
afterTag = true;
break;
case DeclName:
// The returned token is assigned but not used afterwards.
t = Decl();
afterTag = true;
break;
case Comment1:
case Comment2:
CommentTag();
afterTag = true;
break;
case ScriptStart:
ScriptTag();
afterTag = true;
break;
case Word:
t = jj_consume_token(Word);
addText(t.image);
afterTag = false;
break;
case Entity:
// Entities are decoded before being added as text.
t = jj_consume_token(Entity);
addText(Entities.decode(t.image));
afterTag = false;
break;
case Punct:
t = jj_consume_token(Punct);
addText(t.image);
afterTag = false;
break;
case Space:
jj_consume_token(Space);
addSpace();
afterTag = false;
break;
default:
jj_la1[1] = jj_gen;
jj_consume_token( -1);
throw new ParseException();
}
}
jj_consume_token(0);
}
/**
 * Tag production (JavaCC-generated): consumes a TagName token, any number of
 * arguments of the form ArgName [ArgEquals [value]], and the closing TagEnd.
 *
 * Side effects visible in this method:
 * - calls addSpace() when Tags.WS_ELEMS contains the lower-cased tag name;
 * - recomputes inTitle, inMetaTag and inStyle from the tag name on every
 *   call, so any tag with a different name clears the respective flag;
 * - for an img tag, adds the alt value as text in the form "[value]";
 * - for a meta tag, records the name / http-equiv value and the content
 *   value (both lower-cased) and calls addMetaTag() once both are present.
 *
 * @throws ParseException if the token stream does not match the grammar
 * @throws IOException if writing extracted text to the pipe fails
 */
final public void Tag() throws ParseException, IOException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
// The comparisons below show that the token image includes the leading "<".
String tagName = t1.image.toLowerCase();
if (Tags.WS_ELEMS.contains(tagName)) {
addSpace();
}
inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
label_2:while (true) {
// Lookahead only: continue while another argument name follows.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgName:
;
break;
default:
jj_la1[2] = jj_gen;
break label_2;
}
t1 = jj_consume_token(ArgName);
// Optional "=" followed by an optional value.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgEquals:
jj_consume_token(ArgEquals);
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgValue:
case ArgQuote1:
case ArgQuote2:
// t2 is null when the value is an empty quoted string (see ArgValue()).
t2 = ArgValue();
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) {
addText("[" + t2.image + "]");
}
if (inMetaTag &&
(t1.image.equalsIgnoreCase("name") ||
t1.image.equalsIgnoreCase("HTTP-EQUIV")
)
&& t2 != null) {
currentMetaTag = t2.image.toLowerCase();
// Store the pair as soon as both halves are known, whichever came first.
if (currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
if (inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null) {
// Note: the content value is stored lower-cased.
currentMetaContent = t2.image.toLowerCase();
if (currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
}
break;
default:
jj_la1[3] = jj_gen;
;
}
break;
default:
jj_la1[4] = jj_gen;
;
}
}
jj_consume_token(TagEnd);
}
/**
 * Argument-value production (JavaCC-generated). Accepts one of:
 * an unquoted ArgValue token; an empty quoted value (ArgQuote1 CloseQuote1 or
 * ArgQuote2 CloseQuote2); or a quoted value with text between the quotes
 * (Quote1Text / Quote2Text).
 *
 * @return the token holding the value text, or null for an empty quoted value
 * @throws ParseException if the next tokens form none of the accepted shapes
 */
final public Token ArgValue() throws ParseException {
Token t = null;
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgValue:
// Unquoted value.
t = jj_consume_token(ArgValue);
{
if (true) {
return t;
}
}
break;
default:
jj_la1[5] = jj_gen;
// jj_2_1 is defined outside this view; presumably a 2-token lookahead for
// an opening quote directly followed by its closing quote -- confirm.
if (jj_2_1(2)) {
jj_consume_token(ArgQuote1);
jj_consume_token(CloseQuote1);
{
if (true) {
// t is still null here: empty quoted value.
return t;
}
}
}
else {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgQuote1:
// Quoted value with text (first quote style).
jj_consume_token(ArgQuote1);
t = jj_consume_token(Quote1Text);
jj_consume_token(CloseQuote1);
{
if (true) {
return t;
}
}
break;
default:
jj_la1[6] = jj_gen;
// Same pattern as above for the second quote style.
if (jj_2_2(2)) {
jj_consume_token(ArgQuote2);
jj_consume_token(CloseQuote2);
{
if (true) {
// t is still null here: empty quoted value.
return t;
}
}
}
else {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgQuote2:
// Quoted value with text (second quote style).
jj_consume_token(ArgQuote2);
t = jj_consume_token(Quote2Text);
jj_consume_token(CloseQuote2);
{
if (true) {
return t;
}
}
break;
default:
jj_la1[7] = jj_gen;
jj_consume_token( -1);
throw new ParseException();
}
}
}
}
}
// Every branch above returns or throws; this line only satisfies the compiler.
throw new Error("Missing return statement in function");
}
/**
 * Declaration production (JavaCC-generated): consumes a DeclName token, then
 * any sequence of argument names, "=" tokens and argument values, and finally
 * the closing TagEnd. The arguments are consumed and discarded.
 *
 * @return the DeclName token that opened the declaration
 * @throws ParseException if the token stream does not match the grammar
 */
final public Token Decl() throws ParseException {
Token t;
t = jj_consume_token(DeclName);
label_3:while (true) {
// Lookahead only: continue while another argument-like token follows.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgName:
case ArgEquals:
case ArgValue:
case ArgQuote1:
case ArgQuote2:
;
break;
default:
jj_la1[8] = jj_gen;
break label_3;
}
// Consume the item; its content is not used.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgName:
jj_consume_token(ArgName);
break;
case ArgValue:
case ArgQuote1:
case ArgQuote2:
ArgValue();
break;
case ArgEquals:
jj_consume_token(ArgEquals);
break;
default:
jj_la1[9] = jj_gen;
jj_consume_token( -1);
throw new ParseException();
}
}
jj_consume_token(TagEnd);
{
if (true) {
return t;
}
}
// Not reached (the block above always returns); this line only satisfies the compiler.
throw new Error("Missing return statement in function");
}
final public void CommentTag() throws ParseException {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case Comment1:
jj_consume_token(Comment1);
label_4:
while (true) {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case CommentText1:
;
break;
default:
jj_la1[10] = jj_gen;
break label_4;
}
jj_consume_token(CommentText1);
}
jj_consume_token(CommentEnd1);
break;
case Comment2:
jj_consume_token(Comment2);
label_5:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -