📄 htmlparser.java
字号:
package html_parser;
import html_connection.HtmlConnection;
import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
public class HtmlParser {
// URL of the document being parsed; assigned by parser() and handed to every SuperLink it creates.
URL currentURL;
// Connection opened on the source URL by parser().
HtmlConnection srcConnection;
// Input stream obtained from srcConnection; the parser reads it one byte at a time.
BufferedInputStream srcFile;
// Output file that receives the extracted text.
FileOutputStream dstFile;
// True while text is being collected for the link most recently added to superLink.
boolean isInLink = false;
// Text accumulated for the current link; allocated per link and set to null when parser() finishes.
StringBuffer linkDescribe;
// Links collected by parser(), in the order they were added.
public LinkedList<SuperLink> superLink = new LinkedList<SuperLink>();
/**
 * Demo entry point: parses a fixed local HTML file into a fixed text file and
 * prints every collected link as "url description", draining the list as it goes.
 *
 * @param argv ignored
 */
public static void main(String[] argv) {
    HtmlParser parser = new HtmlParser("E:/test3.html","E:/test.txt");
    // Drain from the head so links are printed in the order they were found.
    while (!parser.superLink.isEmpty()) {
        SuperLink next = parser.superLink.removeFirst();
        System.out.println(next.getURL() + " " + next.describe);
    }
}
/**
 * Thrown by dealExtChar() when the entity scan stopped on a character that the
 * caller still has to process itself; parser() catches it and continues with
 * {@link #ch} as the current character.
 *
 * NOTE(review): this is an inner (non-static) class, so every instance keeps a
 * reference to the enclosing parser; consider declaring it static.
 */
class ExtCharException extends Exception {
    private static final long serialVersionUID = 8271L;

    /** The character on which the entity scan stopped. */
    final char ch;

    /**
     * @param ch the character on which the entity scan stopped
     */
    public ExtCharException ( char ch ) {
        // Carry a message so the exception is diagnosable if it ever escapes the parse loop.
        super("Entity scan stopped at character code " + (int) ch);
        this.ch = ch;
    }
}
/**
 * Decodes the character entity that follows an '&amp;' the caller has already consumed.
 *
 * <p>Characters are read from srcFile until ';', '&lt;', whitespace or end of input.
 * A further '&amp;' met on the way means the text collected so far was not an entity:
 * it is written out literally (with its leading '&amp;') and collection restarts.
 * The collected name is matched case-insensitively against quot, amp, #39, gt, lt
 * and nbsp; a match writes the decoded character, anything else is written
 * literally as '&amp;' + name. Everything written also goes to linkDescribe while
 * isInLink is set.
 *
 * <p>If the scan stopped on a character that is not the ';' of a recognised
 * entity, that character is handed back through ExtCharException so the caller
 * can process it as ordinary input. At end of input nothing is handed back.
 *
 * @throws IOException if reading the source or writing the destination fails
 * @throws ExtCharException carrying the terminating character the caller must still handle
 */
private void dealExtChar() throws IOException, ExtCharException {
    StringBuilder name = new StringBuilder();
    int chi;
    while ((chi = srcFile.read()) != -1) {
        char c = (char) chi;
        if (c == '&') {
            // The pending text was not an entity after all: flush it and start over.
            emitRawEntity(name);
            name.setLength(0);
        } else if (c == ';' || c == '<' || Character.isWhitespace(chi)) {
            break;
        } else {
            name.append(c);
        }
    }

    int decoded = decodeEntity(name.toString());
    if (decoded != -1) {
        emitChar((char) decoded);
    } else {
        emitRawEntity(name);
    }

    if (chi == -1) {
        // End of input: there is no terminator to hand back. Throwing here would
        // make the caller emit the EOF sentinel as a bogus 0xFF byte.
        return;
    }
    if (decoded == -1 || chi != ';') {
        // Only the ';' that closes a recognised entity is consumed; any other
        // terminator ('<', whitespace, or the ';' after unknown text) is still
        // document content and must not be swallowed.
        throw new ExtCharException((char) chi);
    }
}

/** Maps a supported entity name (case-insensitive) to its character, or -1 if unsupported. */
private static int decodeEntity(String name) {
    String key = name.toLowerCase();
    if (key.equals("quot")) return '"';
    if (key.equals("amp")) return '&';
    if (key.equals("#39")) return '\'';
    if (key.equals("gt")) return '>';
    if (key.equals("lt")) return '<';
    if (key.equals("nbsp")) return ' ';
    return -1;
}

/** Writes one character to dstFile and, while inside a link, to linkDescribe as well. */
private void emitChar(char c) throws IOException {
    dstFile.write(c);
    if (isInLink) linkDescribe.append(c);
}

/** Writes '&' followed by the given text verbatim, for input that is not a known entity. */
private void emitRawEntity(CharSequence text) throws IOException {
    emitChar('&');
    for (int i = 0; i < text.length(); i++) {
        emitChar(text.charAt(i));
    }
}
/**
 * Reads the rest of a tag whose opening '&lt;' has already been consumed.
 *
 * <p>Characters are collected up to, but not including, the next '&gt;'. If another
 * '&lt;' appears first, what was collected so far is discarded and collection
 * restarts after it. Reaching end of input also ends the tag, so a truncated
 * document yields the partial text instead of looping forever.
 *
 * @return the tag text without the angle brackets, trimmed of surrounding whitespace
 * @throws IOException if reading the source fails
 */
String readTag() throws IOException {
    int chi;
    StringBuilder tag = new StringBuilder();
    // Stop on '>' or on end of input; the previous condition never exited at EOF.
    while ((chi = srcFile.read()) != -1 && chi != '>') {
        if ((char) chi == '<') {
            tag.setLength(0);
        } else {
            tag.append((char) chi);
        }
    }
    return tag.toString().trim();
}
/**
 * Creates a parser and immediately parses the document at {@code srcURL},
 * writing the extracted text to {@code dstFileName}.
 *
 * @param srcURL location of the HTML document
 * @param dstFileName path of the text file to write
 */
public HtmlParser(URL srcURL, String dstFileName) {
    this.parser(srcURL, dstFileName);
}
/**
 * Creates a parser from a textual address and immediately parses it.
 *
 * <p>An address without "://" gets a scheme prepended: "file://" when it
 * contains ":/" (for example a drive-letter path), otherwise "http://". If the
 * result is not a valid URL, a message is printed and nothing is parsed.
 *
 * @param srcURL address of the HTML document, with or without a scheme
 * @param dstFileName path of the text file to write
 */
public HtmlParser(String srcURL, String dstFileName) {
    String address = srcURL;
    if (!address.contains("://")) {
        String scheme = address.contains(":/") ? "file://" : "http://";
        address = scheme + address;
    }

    URL url;
    try {
        url = new URL(address);
    } catch (MalformedURLException err) {
        System.out.println("URL Not Found!");
        return;
    }
    parser(url, dstFileName);
}
/**
 * Reads the document at {@code srcURL}, writes its text to {@code dstFileName}
 * and records every link it finds in {@link #superLink}.
 *
 * <p>The input is processed one byte at a time: everything between script tags
 * is skipped, '&amp;' sequences are handed to dealExtChar(), tags are pushed to
 * and popped from a TagStack, and runs of whitespace in text are collapsed to a
 * single space. While inside a link the emitted text is also accumulated and
 * stored as that link's description when the link ends (including a link that
 * is still open at end of input). I/O failures are reported on standard output
 * rather than thrown; both streams are always closed.
 *
 * @param srcURL location of the HTML document to read
 * @param dstFileName path of the text file to create
 */
public void parser(URL srcURL, String dstFileName) {
    // A previous run that ended inside an unclosed link would leave this set
    // while linkDescribe has been cleared; start every run from a clean state.
    isInLink = false;
    try {
        currentURL = srcURL;
        srcConnection = new HtmlConnection(srcURL);
        srcFile = srcConnection.getInputStream();
        dstFile = new FileOutputStream(dstFileName);
        TagStack tagStack = new TagStack(dstFile);
        int chi;
        boolean inScript = false, isBlankBefore = false;
        while ((chi = srcFile.read()) != -1) {
            char ch = (char) chi;
            if (inScript) {
                // Script content is dropped; only tags are inspected to find the end.
                if (ch == '<') {
                    Tag tag = new Tag(readTag());
                    if (tag.match("script")) inScript = false;
                }
                continue;
            }
            if (ch == '&') {
                // dealExtChar() always writes at least one character, so whitespace
                // that follows it must not be treated as a repeated blank.
                isBlankBefore = false;
                try {
                    dealExtChar();
                    continue;
                } catch (ExtCharException err) {
                    ch = err.ch;
                    // read() only yields 0..255, so U+FFFF can only be an end-of-input
                    // marker cast to char; stop instead of writing it out.
                    if (ch == (char) -1) break;
                }
            }
            if (ch == '<') {
                Tag tag = new Tag(readTag());
                if (tag.isEndingTag()) {
                    NLineELink ret = tagStack.pop(tag);
                    if (ret.isEndLink) {
                        superLink.getLast().describe = StringTools.stringTransfer(linkDescribe);
                        isInLink = false;
                    }
                    if (ret.isNewLine) {
                        isBlankBefore = true;
                        if (isInLink) linkDescribe.append('\n');
                    }
                } else if (tag.match("script")) {
                    inScript = true;
                } else {
                    if (tag.match("a")) {
                        // A new link implicitly closes one that is still open.
                        if (tagStack.pop(new Tag("/a")).isEndLink) {
                            superLink.getLast().describe = StringTools.stringTransfer(linkDescribe);
                        }
                        isInLink = true;
                        try {
                            superLink.add(new SuperLink(tag, currentURL));
                            linkDescribe = new StringBuffer();
                        } catch (NotLinkException err) {
                            // Not a usable link: do not track it and do not push the tag.
                            isInLink = false;
                            continue;
                        }
                    } else if (tag.match("p")) {
                        dstFile.write('\n');
                        if (isInLink) linkDescribe.append('\n');
                        isBlankBefore = true;
                    }
                    tagStack.push(tag);
                    if (tag.isSelfEndingTag()) {
                        NLineELink ret = tagStack.pop(tag);
                        // NOTE(review): unlike the closing-tag branch, the link description
                        // is not stored here - confirm this is intended.
                        if (ret.isEndLink) {
                            isInLink = false;
                        }
                        if (ret.isNewLine) {
                            isBlankBefore = true;
                            if (isInLink) linkDescribe.append('\n');
                        }
                    }
                }
            } else if (Character.isWhitespace(ch)) {
                // Collapse any run of whitespace into one space.
                if (isBlankBefore) continue;
                dstFile.write(' ');
                if (isInLink) linkDescribe.append(' ');
                isBlankBefore = true;
            } else {
                isBlankBefore = false;
                dstFile.write(ch);
                if (isInLink) linkDescribe.append(ch);
            }
        }
        // A link still open at end of input keeps the text gathered so far, the
        // same way a link implicitly closed by the next <a> does.
        if (isInLink && !superLink.isEmpty()) {
            superLink.getLast().describe = StringTools.stringTransfer(linkDescribe);
        }
    } catch (IOException err) {
        System.out.println("File I/O Error!");
    } finally {
        // Close each stream on its own so a failure on the first cannot leak the second.
        try {
            if (srcFile != null) srcFile.close();
        } catch (IOException err) {
            System.out.println("I/O Error when closing file!");
        }
        srcFile = null;
        try {
            if (dstFile != null) dstFile.close();
        } catch (IOException err) {
            System.out.println("I/O Error when closing file!");
        }
        dstFile = null;
        linkDescribe = null;
        isInLink = false;
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -