📄 htmltokenizer.java
字号:
/* * HTML Parser * Copyright (C) 1997 David McNicol * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * file COPYING for more details. */package cvu.html;import java.util.Vector;import java.util.Enumeration;import java.io.FileInputStream;import java.io.InputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.Reader;/** * This class tokenizes a stream of HTML tags and blocks of text. After * the stream has been tokenized an Enumeration of tokens can be accessed. * @see TagToken * @see TextToken * @see java.util.Enumeration * @author <a href="http://www.strath.ac.uk/~ras97108/">David McNicol</a> */public class HTMLTokenizer { private final int BUF_LEN = 256; // Maximum length of read buffer. private Vector tokens; // Store for finished tokens. private char separator; // Stores the current separator character. private int start; // Index of the start of the next token. /** * Constructs a new HTMLTokenizer using the given filename * to create the input stream. * @param file the name of the file to open. */ public HTMLTokenizer (String file) { InputStream is; // The new input stream. // Initialise the variables. tokens = new Vector(); try { // Open an input stream using the file name. is = new FileInputStream(file); // Parse the input stream. parseInputStream(is); } catch (IOException ioe) { return; } } public HTMLTokenizer (InputStream is) throws IOException { tokens = new Vector(); parseInputStream(is); } public HTMLTokenizer (Reader is) throws IOException { tokens = new Vector(); parseInputStream(is); } /** * Returns an enumeration of the tokens which have been * created by the HTMLTokenizer. */ public Enumeration getTokens () { return tokens.elements(); } /** * Returns the vector in which the tokens are stored. */ public Vector getTokenVector () { return tokens; } /** * Parses the input stream given into tokens. * @param is the input stream to parse. */ private void parseInputStream (InputStream is) throws IOException { parseInputStream(new InputStreamReader(is)); } private void parseInputStream (Reader is) throws IOException { char[] charbuf; // Read buffer converted to characters. StringBuffer unused; // Characters still to be processed. int length; // Length of last chunk of read data. int i; // Loop variable. // Create new buffers. charbuf = new char[BUF_LEN]; unused = null; // Set the separator initially. separator = '<'; // Loop round while the end-of-file has not been reached. while (true) { // Read in the first chunk of data. length = is.read(charbuf); // Check for end-of-file. if (length < 0) break; // Process it. unused = processBuffer(charbuf, unused, length); } } /** * Processes the given character array. The token buffer will be * updated to start with the contents of the given StringBuffer. * Any leftover parts of the buffer that have not been processed * are returned in a StringBuffer. The next call to processBuffer * will start where the last one left off by putting the returned * StringBuffer in the argument list of the next call. * @param charbuf the character array to be processed. * @param old the leftovers from the last call. * @param len the maximum length of the array to process. */ private StringBuffer processBuffer (char[] charbuf, StringBuffer old, int len) { StringBuffer data; // Stores current token's data. int idx; // The index of the next separator. int i; // Loop variable. // Get a buffer for the current token. if (old != null) data = old; else data = new StringBuffer(80); // Make sure the start index is initialized properly. start = 0; idx = -1; while (true) { // Set the new start index. start = idx + 1; // Get the index of the separator. idx = indexOf(separator, charbuf, start, len); // Check if the separator appears or not. if (idx < 0) { // Update the data buffer. if (len - start > 0) data.append(charbuf, start, len - start); // If there is data in the buffer, return it. if (data.length() > 0) return data; else return null; } // Append the start of the read buffer onto the // data buffer. data.append(charbuf, start, idx - start); // Check if we should create text or a tag. if (separator == '<') { // Check if there is any content to store. if (data.length() > 0) { // Create a new TextToken. TextToken tt = new TextToken(); // Put the data into the token. tt.setText(data); // Store the new TextToken. tokens.addElement(tt); } } else { // Convert the data to a string. String s = data.toString(); // Create a new TagToken with it. TagToken tt = new TagToken(s); // Store the new TagToken. tokens.addElement(tt); } // Create a new, empty data buffer. data = new StringBuffer(BUF_LEN); // Swap the separator character. if (separator == '<') separator = '>'; else separator = '<'; } } /** * Returns the index of the given character in the given byte * array or -1 if the character does not appear there. * @param c the test character. * @param array the byte array to search. * @param start the first index to search. * @param len the maximum length to search. */ private int indexOf (char c, char[] array, int start, int len) { for (int i = start; i < len; i++) if (array[i] == c) return i; return -1; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -