htmltokenizer.java

来自「java html 解析小程序,文件包很小」· Java 代码 · 共 232 行

JAVA

232 行

/*
 * HTML Parser
 * Copyright (C) 1997 David McNicol
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * file COPYING for more details.
 */

package cvu.html;

import java.util.Vector;
import java.util.Enumeration;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

/**
 * This class tokenizes a stream of HTML tags and blocks of text. After
 * the stream has been tokenized an Enumeration of tokens can be accessed.
 * <p>
 * The input is split on the characters '&lt;' and '&gt;': everything
 * between a '&lt;' and the next '&gt;' becomes a TagToken, and every
 * non-empty run of characters outside such a pair becomes a TextToken.
 * @see TagToken
 * @see TextToken
 * @see java.util.Enumeration
 * @author <a href="http://www.strath.ac.uk/~ras97108/">David McNicol</a>
 */
public class HTMLTokenizer {

	private static final int BUF_LEN = 256; // Maximum length of read buffer.

	private Vector tokens;	// Store for finished tokens.
	private char separator;	// Stores the current separator character.

	/**
	 * Constructs a new HTMLTokenizer using the given filename
	 * to create the input stream. The file is closed again before
	 * the constructor returns.
	 * <p>
	 * An IOException while opening or reading the file is not
	 * reported: the tokenizer then simply holds the tokens that were
	 * produced before the failure (possibly none).
	 * @param file the name of the file to open.
	 */
	public HTMLTokenizer (String file) {

		InputStream is = null; // The new input stream.

		// Initialise the variables.
		tokens = new Vector();

		try {
			// Open an input stream using the file name.
			is = new FileInputStream(file);

			// Parse the input stream.
			parseInputStream(is);
		}
		catch (IOException ioe) {
			// Best-effort by design: keep whatever was tokenized
			// before the failure.
		}
		finally {
			// This constructor opened the stream, so it must also
			// release it, whether or not parsing succeeded.
			if (is != null) {
				try {
					is.close();
				}
				catch (IOException ignored) {
					// Nothing useful can be done about a failed close.
				}
			}
		}
	}

	/**
	 * Constructs a new HTMLTokenizer and tokenizes the given input
	 * stream until end-of-file. The bytes are decoded by an
	 * InputStreamReader created without an explicit charset. The
	 * stream is not closed by this constructor.
	 * @param is the input stream to parse.
	 * @throws IOException if reading from the stream fails.
	 */
	public HTMLTokenizer (InputStream is) throws IOException {
		tokens = new Vector();
		parseInputStream(is);
	}

	/**
	 * Constructs a new HTMLTokenizer and tokenizes the given reader
	 * until end-of-file. The reader is not closed by this constructor.
	 * @param is the reader to parse.
	 * @throws IOException if reading from the reader fails.
	 */
	public HTMLTokenizer (Reader is) throws IOException {
		tokens = new Vector();
		parseInputStream(is);
	}

	/**
	 * Returns an enumeration of the tokens which have been
	 * created by the HTMLTokenizer.
	 */
	public Enumeration getTokens () {
		return tokens.elements();
	}

	/**
	 * Returns the vector in which the tokens are stored. This is the
	 * tokenizer's own vector, not a copy.
	 */
	public Vector getTokenVector () {
		return tokens;
	}

	/**
	 * Parses the input stream given into tokens.
	 * @param is the input stream to parse.
	 * @throws IOException if reading from the stream fails.
	 */
	private void parseInputStream (InputStream is) throws IOException {
		parseInputStream(new InputStreamReader(is));
	}

	/**
	 * Parses the characters supplied by the given reader into tokens,
	 * reading until end-of-file.
	 * @param is the reader to parse.
	 * @throws IOException if reading from the reader fails.
	 */
	private void parseInputStream (Reader is) throws IOException {

		char[] charbuf = new char[BUF_LEN]; // Read buffer.
		StringBuffer unused = null; // Characters still to be processed.

		// Input starts outside a tag, so look for a '<' first.
		separator = '<';

		// Loop round while the end-of-file has not been reached.
		while (true) {

			// Read in the next chunk of data.
			int length = is.read(charbuf);

			// Check for end-of-file.
			if (length < 0) break;

			// Process it, carrying over any incomplete token.
			unused = processBuffer(charbuf, unused, length);
		}

		// Text after the last tag is never followed by a '<', so it
		// is still sitting in the leftover buffer: store it now.
		// Leftovers of a tag that was never closed with '>' are
		// discarded.
		if (unused != null && separator == '<')
			addTextToken(unused);
	}

	/**
	 * Processes the given character array. The token buffer will be
	 * updated to start with the contents of the given StringBuffer.
	 * Any leftover parts of the buffer that have not been processed
	 * are returned in a StringBuffer. The next call to processBuffer
	 * will start where the last one left off by putting the returned
	 * StringBuffer in the argument list of the next call.
	 * @param charbuf the character array to be processed.
	 * @param old the leftovers from the last call, or null if none.
	 * @param len the maximum length of the array to process.
	 * @return the characters of the token that is still incomplete,
	 *         or null if there are none.
	 */
	private StringBuffer processBuffer (char[] charbuf, StringBuffer old,
	  int len) {

		StringBuffer data; // Stores current token's data.
		int start;         // Index of the start of the next token.
		int idx;           // The index of the next separator.

		// Get a buffer for the current token.
		if (old != null)
			data = old;
		else
			data = new StringBuffer(80);

		idx = -1;

		while (true) {

			// The next token starts just after the last separator.
			start = idx + 1;

			// Get the index of the separator.
			idx = indexOf(separator, charbuf, start, len);

			// Check if the separator appears or not.
			if (idx < 0) {

				// The token continues in the next chunk: keep
				// the rest of this chunk for the next call.
				if (len - start > 0)
					data.append(charbuf, start, len - start);

				// If there is data in the buffer, return it.
				if (data.length() > 0)
					return data;
				else
					return null;
			}

			// Append the start of the read buffer onto the
			// data buffer.
			data.append(charbuf, start, idx - start);

			// Check if we should create text or a tag.
			if (separator == '<') {

				// Only store text that has some content.
				if (data.length() > 0)
					addTextToken(data);

			} else {

				// Create a new TagToken from the characters
				// found between '<' and '>' and store it.
				tokens.addElement(new TagToken(data.toString()));
			}

			// Create a new, empty data buffer.
			data = new StringBuffer(BUF_LEN);

			// Swap the separator character.
			if (separator == '<')
				separator = '>';
			else
				separator = '<';
		}
	}

	/**
	 * Stores a new TextToken holding the given text.
	 * @param text the text of the new token.
	 */
	private void addTextToken (StringBuffer text) {
		TextToken tt = new TextToken();
		tt.setText(text);
		tokens.addElement(tt);
	}

	/**
	 * Returns the index of the given character in the given character
	 * array or -1 if the character does not appear there.
	 * @param c the test character.
	 * @param array the character array to search.
	 * @param start the first index to search.
	 * @param len the index at which the search stops (exclusive).
	 */
	private int indexOf (char c, char[] array, int start, int len) {

		for (int i = start; i < len; i++)
			if (array[i] == c) return i;

		return -1;
	}
}

htmltokenizer.java - 源码说明

本页面展示了「java html 解析小程序,文件包很小」中的 htmltokenizer.java 源码文件，采用 Java 编程语言编写，共 232 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?