⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.java

📁 简单的html解释器
💻 JAVA
字号:
package html_parser;


import html_connection.HtmlConnection;

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;


/**
 * Minimal streaming HTML-to-text converter that also collects the hyperlinks
 * it encounters.
 *
 * <p>The source is read one byte at a time and every byte is treated as a
 * single character. Text outside of tags is written to the destination file
 * with runs of whitespace collapsed to one space, a handful of character
 * entities are decoded, everything between {@code <script>} tags is skipped,
 * and every {@code <a>} tag accepted by {@link SuperLink} is appended to
 * {@link #superLink} together with the text found inside the link.
 *
 * <p>Parsing runs inside the constructors; once a constructor returns, the
 * result is available in {@link #superLink} and in the destination file.
 */
public class HtmlParser {
	URL currentURL;
	HtmlConnection srcConnection;
	BufferedInputStream srcFile;
	FileOutputStream dstFile;
	/** True while the text being read belongs to the most recently added link. */
	boolean isInLink = false;
	/** Text collected for the link currently being read; null outside of a parse. */
	StringBuffer linkDescribe;

	/** Links collected so far, in document order. */
	public LinkedList<SuperLink> superLink = new LinkedList<SuperLink>();

	/**
	 * Command-line entry point: parses a document and prints every collected link.
	 *
	 * @param argv optional {@code [source, destination]}; when an argument is
	 *             missing the original hard-coded path is used instead
	 */
	public static void main(String[] argv) {
		String source = argv.length > 0 ? argv[0] : "E:/test3.html";
		String destination = argv.length > 1 ? argv[1] : "E:/test.txt";
		HtmlParser parser = new HtmlParser(source, destination);
		while (parser.superLink.size() > 0) {
			SuperLink link = parser.superLink.pop();
			System.out.println(link.getURL() + "      " + link.describe);
		}
	}

	/**
	 * Thrown by {@link #dealExtChar()} to hand back a character that was read
	 * while scanning an entity but still has to be processed by the main loop.
	 */
	static class ExtCharException extends Exception {
		private static final long serialVersionUID = 8271L;
		/** The character the caller must process next. */
		final char ch;

		public ExtCharException(char ch) {
			this.ch = ch;
		}
	}

	/**
	 * Writes one character to the destination file and, while inside a link,
	 * also appends it to the link description.
	 */
	private void emit(char c) throws IOException {
		dstFile.write(c);
		if (isInLink) linkDescribe.append(c);
	}

	/** Emits {@code '&'} followed by {@code name} verbatim (an undecoded entity). */
	private void emitRawEntity(StringBuffer name) throws IOException {
		emit('&');
		for (int i = 0; i < name.length(); i++) {
			emit(name.charAt(i));
		}
	}

	/**
	 * Decodes the character entity whose leading {@code '&'} has just been read.
	 *
	 * <p>The entity name runs up to the next {@code ';'}, {@code '<'},
	 * whitespace character or end of stream. Recognised names (compared
	 * case-insensitively: quot, amp, #39, gt, lt, nbsp) are replaced by the
	 * character they denote; anything else is emitted unchanged. A second
	 * {@code '&'} inside the name flushes what was read so far as literal text
	 * and restarts the scan.
	 *
	 * @throws IOException      if reading or writing fails
	 * @throws ExtCharException carrying the terminating character whenever that
	 *                          character was not consumed as part of a recognised
	 *                          entity; the caller must process it as ordinary input
	 */
	private void dealExtChar() throws IOException, ExtCharException {
		StringBuffer name = new StringBuffer();
		int chi;
		while ((chi = srcFile.read()) != -1) {
			char c = (char) chi;
			if (c == '&') {
				// The previous '&...' was not an entity after all: emit it literally.
				emitRawEntity(name);
				name = new StringBuffer();
			} else if (c == ';' || c == '<' || Character.isWhitespace(chi)) {
				break;
			} else {
				name.append(c);
			}
		}

		String key = name.toString().toLowerCase();
		boolean recognised = true;
		if (key.equals("quot")) {
			emit('"');
		} else if (key.equals("amp")) {
			emit('&');
		} else if (key.equals("#39")) {
			emit('\'');
		} else if (key.equals("gt")) {
			emit('>');
		} else if (key.equals("lt")) {
			emit('<');
		} else if (key.equals("nbsp")) {
			emit(' ');
		} else {
			recognised = false;
			emitRawEntity(name);
		}

		// End of stream: there is no terminator to hand back.
		if (chi == -1) return;
		// Only the ';' that closes a recognised entity is consumed. Every other
		// terminator ('<', whitespace, or the ';' of an unknown entity) is
		// returned to the caller so that it is not lost.
		if (!recognised || (char) chi != ';') {
			throw new ExtCharException((char) chi);
		}
	}

	/**
	 * Reads the remainder of a tag whose opening {@code '<'} was already consumed.
	 *
	 * <p>Reading stops at the closing {@code '>'} or at end of stream. If another
	 * {@code '<'} shows up first, the text gathered so far is discarded and
	 * collection restarts after it.
	 *
	 * @return the trimmed text between the last {@code '<'} and the terminator
	 * @throws IOException if reading fails
	 */
	String readTag() throws IOException {
		int chi;
		StringBuffer tag = new StringBuffer();
		// Stop on '>' AND on end of stream; otherwise an unterminated tag would
		// spin forever because read() keeps returning -1.
		while ((chi = srcFile.read()) != -1 && chi != '>') {
			if ((char) chi == '<') tag = new StringBuffer();
			else tag.append((char) chi);
		}
		return tag.toString().trim();
	}

	/**
	 * Parses {@code srcURL} immediately, writing the extracted text to
	 * {@code dstFileName}.
	 */
	public HtmlParser(URL srcURL, String dstFileName) {
		parser(srcURL, dstFileName);
	}

	/**
	 * Parses the document named by {@code srcURL} immediately.
	 *
	 * <p>If the string contains no {@code "://"}, a scheme is prepended:
	 * {@code "file://"} when it contains {@code ":/"}, {@code "http://"}
	 * otherwise. A malformed URL is reported on standard output and nothing is
	 * parsed.
	 */
	public HtmlParser(String srcURL, String dstFileName) {
		if (srcURL.indexOf("://") == -1) {
			if (srcURL.indexOf(":/") == -1) {
				srcURL = "http://" + srcURL;
			} else {
				srcURL = "file://" + srcURL;
			}
		}
		try {
			URL url = new URL(srcURL);
			parser(url, dstFileName);
		} catch (MalformedURLException err) {
			System.out.println("URL Not Found!");
		}
	}

	/**
	 * Stores the collected description on the most recently added link and
	 * leaves link mode.
	 */
	private void finishLink() {
		if (linkDescribe != null && !superLink.isEmpty()) {
			superLink.getLast().describe = StringTools.stringTransfer(linkDescribe);
		}
		isInLink = false;
	}

	/**
	 * Applies the result of popping a tag from the tag stack.
	 *
	 * @return true when the pop reported a line break, in which case the caller
	 *         should treat the output as ending in whitespace
	 */
	private boolean applyPopResult(NLineELink ret) {
		if (ret.isEndLink) {
			finishLink();
		}
		if (ret.isNewLine) {
			if (isInLink) linkDescribe.append('\n');
			return true;
		}
		return false;
	}

	/**
	 * Closes both streams. Each one is closed in its own try block so that a
	 * failure on the source can never leave the destination file open.
	 */
	private void closeStreams() {
		try {
			if (srcFile != null) srcFile.close();
		} catch (IOException err) {
			System.out.println("I/O Error when closing file!");
		} finally {
			srcFile = null;
		}
		try {
			if (dstFile != null) dstFile.close();
		} catch (IOException err) {
			System.out.println("I/O Error when closing file!");
		} finally {
			dstFile = null;
		}
	}

	/**
	 * Converts the document at {@code srcURL} to text in {@code dstFileName} and
	 * appends every link found to {@link #superLink}.
	 *
	 * <p>I/O failures are reported on standard output and end the parse; both
	 * streams are always closed before this method returns.
	 *
	 * @param srcURL      location of the HTML document
	 * @param dstFileName path of the text file to create
	 */
	public void parser(URL srcURL, String dstFileName) {
		try {
			currentURL = srcURL;
			srcConnection = new HtmlConnection(srcURL);
			srcFile = srcConnection.getInputStream();
			dstFile = new FileOutputStream(dstFileName);
			TagStack tagStack = new TagStack(dstFile);
			int chi;
			boolean inScript = false, isBlankBefore = false;
			while ((chi = srcFile.read()) != -1) {
				char ch = (char) chi;
				if (inScript) {
					// Discard everything until a tag matching "script" shows up.
					if (ch == '<') {
						Tag tag = new Tag(readTag());
						if (tag.match("script")) inScript = false;
					}
					continue;
				}
				if (ch == '&') {
					// dealExtChar() always emits at least one character, so the
					// output no longer ends in collapsed whitespace. Without this
					// reset the blank after an entity in "a &amp; b" was dropped.
					isBlankBefore = false;
					try {
						dealExtChar();
						continue;
					} catch (ExtCharException err) {
						// The terminator still has to go through the normal path.
						ch = err.ch;
					}
				}
				if (ch == '<') {
					Tag tag = new Tag(readTag());

					if (tag.isEndingTag()) {
						if (applyPopResult(tagStack.pop(tag))) isBlankBefore = true;
					} else if (tag.match("script")) {
						inScript = true;
					} else {
						if (tag.match("a")) {
							// A new link implicitly closes any link that is still open.
							if (tagStack.pop(new Tag("/a")).isEndLink) finishLink();
							try {
								superLink.add(new SuperLink(tag, currentURL));
								linkDescribe = new StringBuffer();
								isInLink = true;
							} catch (NotLinkException err) {
								// Not a usable link: ignore the tag entirely.
								isInLink = false;
								continue;
							}
						} else if (tag.match("p")) {
							emit('\n');
							isBlankBefore = true;
						}

						tagStack.push(tag);

						if (tag.isSelfEndingTag()) {
							if (applyPopResult(tagStack.pop(tag))) isBlankBefore = true;
						}
					}
				} else if (Character.isWhitespace(ch)) {
					// Collapse any run of whitespace into a single space.
					if (isBlankBefore) continue;
					emit(' ');
					isBlankBefore = true;
				} else {
					isBlankBefore = false;
					emit(ch);
				}
			}
		} catch (IOException err) {
			System.out.println("File I/O Error!");
		} finally {
			closeStreams();
			// Reset the link state together: leaving isInLink set while
			// linkDescribe is null would make the next parse fail on its first
			// character.
			isInLink = false;
			linkDescribe = null;
			// Kept from the original implementation; this is only a hint to the JVM.
			System.gc();
		}
	}


}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -