📄 htmlparser.java
字号:
package com.laoer.bbscs.lucene.html;
/**
* <p>Title: 天乙社区V5.0</p>
* <p>Description: BBS-CS天乙社区V5.0</p>
* <p>Copyright: Copyright (c) 2003</p>
* <p>Company: laoer.com</p>
* @author 龚天乙
* @version 5.0
*/
import java.io.*;
public class HTMLParser
implements HTMLParserConstants {
// Target summary size in characters: getSummary() truncates to it, addToSummary()
// stops appending once it is reached, and getTitle() stops waiting once more than
// this many characters have been emitted.
// NOTE(review): public and non-final, so it can be reassigned at runtime.
public static int SUMMARY_LENGTH = 200;
// Text appended by addText()/addSpace() while inTitle is set.
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
// Text appended through addToSummary() while inTitle is not set.
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
// Running count of characters written to pipeOut by addText()/addSpace().
int length = 0;
// Set by addText() under this object's monitor; getTitle() polls it while waiting.
boolean titleComplete = false;
// True while the most recently parsed tag was "<title" (see Tag()).
boolean inTitle = false;
// True between a "<script" tag and the next "</script" tag (see Tag());
// addText() and addSpace() discard their input while it is set.
boolean inScript = false;
// True when the last thing HTMLDocument() consumed was a tag, declaration or comment;
// addSpace() then writes eol instead of a single space.
boolean afterTag = false;
// True when the last output was whitespace; lets addSpace() collapse runs of whitespace.
boolean afterSpace = false;
// Platform line separator, written by addSpace() after a tag.
String eol = System.getProperty("line.separator");
// Read end of the pipe; stays null until getReader() is first called.
PipedReader pipeIn = null;
// Write end of the pipe, created together with pipeIn in getReader().
PipedWriter pipeOut;
/**
 * Creates a parser for the given file by opening a {@link FileInputStream} on it
 * and delegating to another constructor of this class that accepts the stream
 * (that constructor is not visible in this chunk).
 * NOTE(review): nothing visible here closes the opened stream; confirm who owns it.
 *
 * @param file the HTML file to read
 * @throws FileNotFoundException if the file cannot be opened for reading
 */
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
/**
 * Returns the text collected for the document title, trimmed of surrounding
 * whitespace.
 *
 * <p>Starts the parsing thread on first use (via {@link #getReader()}) and then
 * blocks, polling every 10 ms under this object's monitor, until either
 * {@code titleComplete} is set or more than {@code SUMMARY_LENGTH} characters
 * have been emitted.
 *
 * @return the trimmed title text gathered so far; may be empty
 * @throws IOException if the pipe cannot be set up
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public String getTitle() throws IOException, InterruptedException {
  if (pipeIn == null) {
    getReader(); // first use: this starts the parsing thread
  }
  synchronized (this) {
    // wait() releases the monitor while sleeping, so the parser thread can still
    // enter its own synchronized sections and call notifyAll().
    while (!titleComplete && length <= SUMMARY_LENGTH) {
      wait(10);
    }
  }
  return title.toString().trim();
}
/**
 * Returns a summary of the document body, at most {@code SUMMARY_LENGTH}
 * characters before trimming, with the title removed from the front when the
 * summary begins with it.
 *
 * <p>Starts the parsing thread on first use (via {@link #getReader()}) and then
 * blocks, polling every 10 ms under this object's monitor, until the summary
 * buffer holds at least {@code SUMMARY_LENGTH} characters.
 * NOTE(review): nothing in this method ends the wait for documents with less
 * text than that; presumably the parsing thread deals with end of input, confirm.
 *
 * @return the trimmed summary, without a leading copy of the title
 * @throws IOException if the pipe cannot be set up
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public String getSummary() throws IOException, InterruptedException {
  if (pipeIn == null) {
    getReader(); // first use: this starts the parsing thread
  }
  synchronized (this) {
    while (summary.length() < SUMMARY_LENGTH) {
      wait(10);
    }
  }
  // The last append may have overshot the limit; cut back to exactly the limit.
  if (summary.length() > SUMMARY_LENGTH) {
    summary.setLength(SUMMARY_LENGTH);
  }
  final String body = summary.toString().trim();
  final String heading = getTitle();
  return body.startsWith(heading) ? body.substring(heading.length()) : body;
}
/**
 * Returns the reader from which the parser's text output can be read.
 *
 * <p>The first call creates the connected pipe pair and starts a
 * {@code ParserThread} for this parser; later calls return the same reader.
 *
 * @return the read end of the pipe
 * @throws IOException if the pipe ends cannot be connected
 */
public Reader getReader() throws IOException {
  if (pipeIn != null) {
    return pipeIn; // already initialised by an earlier call
  }
  pipeIn = new PipedReader();
  pipeOut = new PipedWriter(pipeIn);
  Thread worker = new ParserThread(this);
  worker.start(); // parsing proceeds on the new thread
  return pipeIn;
}
/**
 * Appends text to the summary buffer unless the buffer already holds
 * {@code SUMMARY_LENGTH} characters or more. When this append brings the buffer
 * up to that size, threads waiting on this object are notified.
 *
 * @param text the text to append
 */
void addToSummary(String text) {
  if (summary.length() >= SUMMARY_LENGTH) {
    return; // summary already full, drop the text
  }
  summary.append(text);
  final boolean reachedLimit = summary.length() >= SUMMARY_LENGTH;
  if (reachedLimit) {
    synchronized (this) {
      notifyAll();
    }
  }
}
/**
 * Handles a run of visible text from the document.
 *
 * <p>Text inside a script is discarded. Text inside the title is appended to the
 * title buffer; any other text goes to the summary. The first non-title text that
 * arrives once the title buffer is non-empty marks the title as complete and
 * wakes threads waiting on this object. In every non-script case the text is
 * also counted in {@code length} and written to the output pipe.
 *
 * <p>NOTE(review): for a document that never yields title text, waiters in
 * getTitle() are released only by the length threshold or by whatever the
 * parsing thread does at end of input (not visible in this chunk); confirm.
 *
 * @param text the text to add
 * @throws IOException if writing to the output pipe fails
 */
void addText(String text) throws IOException {
  if (inScript) {
    return;
  }
  if (inTitle) {
    title.append(text);
  }
  else {
    addToSummary(text);
    // Test the buffer's length: StringBuffer does not override equals(), so
    // comparing it to "" with equals() can never report an empty title.
    if (!titleComplete && title.length() > 0) { // finished title
      synchronized (this) {
        titleComplete = true; // tell waiting threads
        notifyAll();
      }
    }
  }
  length += text.length();
  pipeOut.write(text);
  afterSpace = false;
}
/**
 * Handles a run of whitespace from the document.
 *
 * <p>Does nothing inside a script or directly after previously emitted
 * whitespace. Otherwise a single space is appended to the title or the summary
 * (depending on {@code inTitle}), and the output pipe receives the platform line
 * separator if the whitespace follows a tag, or a single space if it does not.
 *
 * @throws IOException if writing to the output pipe fails
 */
void addSpace() throws IOException {
  if (inScript || afterSpace) {
    return;
  }
  if (inTitle) {
    title.append(" ");
  }
  else {
    addToSummary(" ");
  }
  final String separator;
  if (afterTag) {
    separator = eol;
  }
  else {
    separator = " ";
  }
  length += separator.length();
  pipeOut.write(separator);
  afterSpace = true;
}
/**
 * Top-level grammar production: consumes a sequence of tags, declarations,
 * comments, words, entities, punctuation and whitespace, then requires token
 * kind 0 (end of input by JavaCC convention).
 *
 * <p>Words, decoded entities and punctuation are forwarded to addText();
 * whitespace goes to addSpace(). After each element, afterTag records whether
 * that element was a tag, declaration or comment.
 *
 * @throws ParseException if the next token cannot start any of these elements
 * @throws IOException if writing the extracted text fails
 */
final public void HTMLDocument() throws ParseException, IOException {
Token t;
label_1:while (true) {
// First switch: decide whether another element follows. jj_ntk caches the kind
// of the next token; -1 means it has not been looked up yet.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case TagName:
case DeclName:
case Comment1:
case Comment2:
case Word:
case Entity:
case Space:
case Punct:
;
break;
default:
// No further element: note the generation counter (generated bookkeeping) and leave the loop.
jj_la1[0] = jj_gen;
break label_1;
}
// Second switch: dispatch on the same lookahead token to the matching handler.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case TagName:
Tag();
afterTag = true;
break;
case DeclName:
// The returned token is not used here.
t = Decl();
afterTag = true;
break;
case Comment1:
case Comment2:
CommentTag();
afterTag = true;
break;
case Word:
t = jj_consume_token(Word);
addText(t.image);
afterTag = false;
break;
case Entity:
t = jj_consume_token(Entity);
// Entities.decode is defined elsewhere; its result is emitted as text.
addText(Entities.decode(t.image));
afterTag = false;
break;
case Punct:
t = jj_consume_token(Punct);
addText(t.image);
afterTag = false;
break;
case Space:
jj_consume_token(Space);
addSpace();
afterTag = false;
break;
default:
jj_la1[1] = jj_gen;
jj_consume_token( -1);
throw new ParseException();
}
}
jj_consume_token(0);
}
/**
 * Parses one tag: a TagName token, zero or more attributes (an ArgName
 * optionally followed by ArgEquals and a value), and a closing TagEnd.
 *
 * <p>Side effects: inTitle and inScript are updated from the tag name, and for
 * an {@code <img} tag the value of an "alt" attribute is passed to addText()
 * wrapped in square brackets.
 *
 * @throws ParseException if the token stream does not match this shape
 * @throws IOException if emitting the alt text fails
 */
final public void Tag() throws ParseException, IOException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
// Every tag reassigns inTitle, so any tag other than "<title" clears it.
inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
inImg = t1.image.equalsIgnoreCase("<img"); // keep track if in <IMG>
// Once inside a script, only a "</script" tag switches inScript off again.
if (inScript) { // keep track if in <SCRIPT>
inScript = !t1.image.equalsIgnoreCase("</script");
}
else {
inScript = t1.image.equalsIgnoreCase("<script");
}
// Attribute loop: continues while the next token is an attribute name.
label_2:while (true) {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgName:
;
break;
default:
jj_la1[2] = jj_gen;
break label_2;
}
// From here on t1 holds the attribute name, no longer the tag name.
t1 = jj_consume_token(ArgName);
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgEquals:
jj_consume_token(ArgEquals);
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgValue:
case ArgQuote1:
case ArgQuote2:
t2 = ArgValue();
// ArgValue() returns null for an empty quoted value, hence the null check.
if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) {
addText("[" + t2.image + "]");
}
break;
default:
// "name=" with no value following is accepted.
jj_la1[3] = jj_gen;
;
}
break;
default:
// Attribute without "=" is accepted.
jj_la1[4] = jj_gen;
;
}
}
jj_consume_token(TagEnd);
}
/**
 * Parses an attribute value in one of its forms: an unquoted ArgValue token, or
 * a quoted string delimited by ArgQuote1/CloseQuote1 or ArgQuote2/CloseQuote2.
 *
 * <p>The return statements are wrapped in {@code if (true)} so that the
 * statements following them still compile as reachable code.
 *
 * @return the token holding the value text, or null when a quoted value is empty
 * @throws ParseException if the next token starts none of these forms
 */
final public Token ArgValue() throws ParseException {
Token t = null;
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgValue:
// Unquoted value.
t = jj_consume_token(ArgValue);
{
if (true) {
return t;
}
}
break;
default:
jj_la1[5] = jj_gen;
// jj_2_1 is a lookahead helper defined elsewhere; presumably it succeeds when
// an opening quote is immediately followed by its closing quote, confirm.
if (jj_2_1(2)) {
jj_consume_token(ArgQuote1);
jj_consume_token(CloseQuote1);
{
if (true) {
// Empty quoted value: t is still null here.
return t;
}
}
}
else {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgQuote1:
// Quoted value with text between the first kind of quotes.
jj_consume_token(ArgQuote1);
t = jj_consume_token(Quote1Text);
jj_consume_token(CloseQuote1);
{
if (true) {
return t;
}
}
break;
default:
jj_la1[6] = jj_gen;
// Same two cases again for the second kind of quotes.
if (jj_2_2(2)) {
jj_consume_token(ArgQuote2);
jj_consume_token(CloseQuote2);
{
if (true) {
// Empty quoted value: t is still null here.
return t;
}
}
}
else {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgQuote2:
jj_consume_token(ArgQuote2);
t = jj_consume_token(Quote2Text);
jj_consume_token(CloseQuote2);
{
if (true) {
return t;
}
}
break;
default:
jj_la1[7] = jj_gen;
jj_consume_token( -1);
throw new ParseException();
}
}
}
}
}
// Every path above returns or throws; this line only satisfies the compiler.
throw new Error("Missing return statement in function");
}
/**
 * Parses a declaration: a DeclName token, then any mix of attribute names,
 * equals signs and attribute values, then a closing TagEnd. The contents are
 * consumed and discarded.
 *
 * @return the DeclName token that opened the declaration
 * @throws ParseException if the token stream does not match this shape
 */
final public Token Decl() throws ParseException {
Token t;
t = jj_consume_token(DeclName);
label_3:while (true) {
// Keep going while the next token can be part of the declaration body.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgName:
case ArgEquals:
case ArgValue:
case ArgQuote1:
case ArgQuote2:
;
break;
default:
jj_la1[8] = jj_gen;
break label_3;
}
// Consume one body element; nothing is kept from it.
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case ArgName:
jj_consume_token(ArgName);
break;
case ArgValue:
case ArgQuote1:
case ArgQuote2:
ArgValue();
break;
case ArgEquals:
jj_consume_token(ArgEquals);
break;
default:
jj_la1[9] = jj_gen;
jj_consume_token( -1);
throw new ParseException();
}
}
jj_consume_token(TagEnd);
{
if (true) {
return t;
}
}
// The return above always executes; this line only satisfies the compiler.
throw new Error("Missing return statement in function");
}
final public void CommentTag() throws ParseException {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case Comment1:
jj_consume_token(Comment1);
label_4:
while (true) {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case CommentText1:
;
break;
default:
jj_la1[10] = jj_gen;
break label_4;
}
jj_consume_token(CommentText1);
}
jj_consume_token(CommentEnd1);
break;
case Comment2:
jj_consume_token(Comment2);
label_5:
while (true) {
switch ( (jj_ntk == -1) ? jj_ntk() : jj_ntk) {
case CommentText2:
;
break;
default:
jj_la1[11] = jj_gen;
break label_5;
}
jj_consume_token(CommentText2);
}
jj_consume_token(CommentEnd2);
break;
default:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -