⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 xmlreader.java

📁 一个很不错的词频统计程序,目前只支持英文,中文的本人正在修改中.改好后上传给大家分享
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package edu.udo.cs.wvtool.external;

import java.io.IOException;
import java.io.Reader;
import java.util.Hashtable;

/**
 * A minimalistic XML pull parser, similar to kXML, but not supporting
 * namespaces or legacy events. If you need support for namespaces, or access to
 * XML comments or processing instructions, please use kXML(2) instead.
 * 
 * @author Stefan Haustein
 */

public class XmlReader {

    /** Return value of getType before first call to next() */

    public final static int START_DOCUMENT = 0;

    /** Signal logical end of xml document */

    public final static int END_DOCUMENT = 1;

    /** Start tag was just read */

    public final static int START_TAG = 2;

    /**
     * End tag was just read
     */
    public final static int END_TAG = 3;

    /** Text was just read */
    public final static int TEXT = 4;

    /** Type code for a CDATA section; peekType() returns it when '&lt;' is directly followed by '['. */
    public final static int CDSECT = 5;

    /** Internal type code; peekType() returns it when the next character is '&amp;'. */
    final static int ENTITY_REF = 6;

    /** Message used when the input ends in the middle of a construct. */
    static final private String UNEXPECTED_EOF = "Unexpected EOF";

    /** Internal type code; peekType() returns it for markup starting with "&lt;?" or "&lt;!". */
    static final private int LEGACY = 999;

    // general

    /**
     * When true, several checks are not enforced: a mismatching expected
     * character in read(char), an invalid first name character in readName()
     * and a mismatching end tag in parseEndTag() do not raise an exception.
     */
    public boolean relaxed;

    // NOTE(review): not used in the visible part of the class; presumably maps
    // entity names to their replacement text - confirm against the rest of the file.
    private Hashtable entityMap;

    /** Number of currently open elements; also the index of the next free slot in elementStack. */
    private int depth;

    /** Names of the currently open elements, outermost first; grown via ensureCapacity(). */
    private String[] elementStack = new String[4];

    // source

    /** Character source; read() refills srcBuf from it. */
    private Reader reader;

    /** Input buffer: 8192 chars when the runtime reports at least 1048576 bytes of free memory, otherwise 128. */
    private char[] srcBuf = new char[Runtime.getRuntime().freeMemory() >= 1048576 ? 8192 : 128];

    /** Index in srcBuf of the next character to move into the lookahead. */
    private int srcPos;

    /** Number of characters delivered by the most recent reader.read() call. */
    private int srcCount;

    /** Set by read() once the first lookahead character (peek0) has become -1. */
    private boolean eof;

    /** Line counter; read() increments it for each '\n' or '\r' it returns. */
    private int line;

    /** Column counter; read() resets it to 0 on a line break and then increments it. */
    private int column;

    /** First lookahead character: the value the next read() call returns; -1 at end of input. */
    private int peek0;

    /** Second lookahead character: the one following peek0; -1 at end of input. */
    private int peek1;

    // txtbuffer

    /** Accumulates characters via push(); pop() turns a tail of it into a String. */
    private char[] txtBuf = new char[128];

    /** Number of characters currently held in txtBuf. */
    private int txtPos;

    // Event-related

    // NOTE(review): type, text, isWhitespace, attributeCount and attributes are
    // not touched by the visible methods; presumably they describe the current
    // event - confirm against the rest of the file.
    private int type;

    private String text;

    private boolean isWhitespace;

    /** Element name set by parseStartTag() and parseEndTag(). */
    private String name;

    /** Set to true by parseStartTag() when the tag is closed with "/&gt;". */
    private boolean degenerated;

    private int attributeCount;

    private String[] attributes = new String[16];

    /** Display names listed in the order of the type codes START_DOCUMENT (0) through TEXT (4). */
    private String[] TYPES = { "Start Document", "End Document", "Start Tag", "End Tag", "Text" };

    /**
     * Returns the next input character and advances the two-character
     * lookahead (peek0, peek1) by one position, refilling the source buffer
     * from the reader when it has been used up. Also maintains the line and
     * column counters and the eof flag.
     *
     * @return the value peek0 held before the call; -1 once the input is exhausted
     * @throws IOException if the underlying reader fails
     */
    private final int read() throws IOException {

        int r = peek0;
        peek0 = peek1;

        if (peek0 == -1) {
            // Nothing is left behind the returned character: flag end of input
            // and return without touching the counters or the buffer.
            eof = true;
            return r;
        } else if (r == '\n' || r == '\r') {
            line++;
            column = 0;
            // For a "\r\n" pair, replace the '\n' in the lookahead with 0 so the
            // pair is counted as a single line break; push() discards the 0.
            if (r == '\r' && peek0 == '\n')
                peek0 = 0;
        }
        column++;

        if (srcPos >= srcCount) {
            // The buffer is used up: fetch the next chunk from the reader.
            srcCount = reader.read(srcBuf, 0, srcBuf.length);
            if (srcCount <= 0) {
                // The reader delivered nothing more: mark the end in the lookahead.
                peek1 = -1;
                return r;
            }
            srcPos = 0;
        }

        peek1 = srcBuf[srcPos++];
        return r;
    }

    /**
     * Raises a parse error. This method never returns normally.
     *
     * @param desc description of the problem; " pos: " and the result of
     *             getPositionDescription() are appended to form the message
     * @throws IOException always
     */
    private final void exception(String desc) throws IOException {
        final String message = desc + " pos: " + getPositionDescription();
        throw new IOException(message);
    }

    /**
     * Appends one character to the text buffer, enlarging the buffer first
     * when it is full. The value 0 is silently dropped.
     *
     * @param c the character to append
     */
    private final void push(int c) {
        if (c != 0) {
            if (txtBuf.length == txtPos) {
                // Buffer is full: move the contents into a larger array.
                final char[] grown = new char[txtPos * 4 / 3 + 4];
                System.arraycopy(txtBuf, 0, grown, 0, txtPos);
                txtBuf = grown;
            }

            txtBuf[txtPos++] = (char) c;
        }
    }

    /**
     * Consumes one character and compares it with the expected one.
     * On a mismatch in strict mode an exception is raised. In relaxed mode
     * the mismatch is tolerated; if the expected character has a code of 32
     * or below, whitespace is skipped and one further character is consumed.
     *
     * @param c the character expected next in the input
     * @throws IOException on a mismatch in strict mode, or if reading fails
     */
    private final void read(char c) throws IOException {
        final int actual = read();
        if (actual == c)
            return;

        if (!relaxed) {
            exception("expected: '" + c + "'");
        } else if (c <= 32) {
            skip();
            read();
        }
    }

    /**
     * Consumes characters for as long as the end of input has not been
     * flagged and the next character has a code no greater than a space.
     *
     * @throws IOException if reading fails
     */
    private final void skip() throws IOException {
        for (;;) {
            if (eof || peek0 > ' ')
                return;
            read();
        }
    }

    /**
     * Removes the tail of the text buffer starting at the given position
     * and returns it as a string.
     *
     * @param pos index of the first buffered character to remove
     * @return the characters from pos up to the current end of the buffer
     */
    private final String pop(int pos) {
        final int length = txtPos - pos;
        final String tail = new String(txtBuf, pos, length);
        txtPos = pos;
        return tail;
    }

    /**
     * Reads a name from the input. In strict mode the first character must be
     * an ASCII letter, '_' or ':'. One character is always consumed; further
     * characters are consumed while they are ASCII letters, digits, '_', '-',
     * ':' or '.'.
     *
     * @return the name that was read
     * @throws IOException if the first character is invalid in strict mode,
     *                     or if reading fails
     */
    private final String readName() throws IOException {

        final int start = txtPos;

        final int first = peek0;
        final boolean lower = first >= 'a' && first <= 'z';
        final boolean upper = first >= 'A' && first <= 'Z';
        final boolean validStart = lower || upper || first == '_' || first == ':';
        if (!validStart && !relaxed)
            exception("name expected");

        boolean more;
        do {
            push(read());
            final int next = peek0;
            final boolean letter = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z');
            final boolean digit = next >= '0' && next <= '9';
            final boolean punctuation = next == '_' || next == '-' || next == ':' || next == '.';
            more = letter || digit || punctuation;
        } while (more);

        return pop(start);
    }

    /**
     * Consumes one construct that starts with '&lt;' but is not an element
     * tag. The character after '&lt;' selects the construct: '?' reads up to
     * "?&gt;", "!-" reads a comment up to "--&gt;", any other '!' construct is
     * handed to parseDoctype(), and '[' reads "CDATA[" and then up to "]]&gt;".
     *
     * NOTE(review): the CDATA branch is only taken when '&lt;' is directly
     * followed by '['; input starting with "&lt;![" takes the '!' branch and is
     * treated as DOCTYPE - confirm that this is intended.
     *
     * @param push whether the characters read are also appended to the text
     *             buffer; for constructs other than '?' the last appended
     *             character (the first terminator character) is removed again
     * @throws IOException on unexpected end of input, on an unexpected
     *                     character, or if reading fails
     */
    private final void parseLegacy(boolean push) throws IOException {

        read(); // the opening '<'
        int c = read(); // the character that selects the construct

        String req;
        int term;

        switch (c) {
        case '?':
            req = "";
            term = '?';
            break;

        case '!':
            if (peek0 == '-') {
                req = "--";
                term = '-';
            } else {
                req = "DOCTYPE";
                term = -1;
            }
            break;

        default:
            if (c != '[')
                exception("cantreachme: " + c);
            req = "CDATA[";
            term = ']';
            break;
        }

        // Consume the fixed introducer that follows the selector character.
        int idx = 0;
        while (idx < req.length()) {
            read(req.charAt(idx));
            idx++;
        }

        if (term == -1) {
            parseDoctype();
            return;
        }

        // Read until the lookahead holds the last two characters of the
        // terminator; for '?' any preceding character is accepted.
        boolean terminatorAhead;
        do {
            if (eof)
                exception(UNEXPECTED_EOF);

            c = read();
            if (push)
                push(c);

            terminatorAhead = (term == '?' || c == term) && peek0 == term && peek1 == '>';
        } while (!terminatorAhead);

        // Consume the two terminator characters still in the lookahead.
        read();
        read();

        // Drop the first terminator character, which was pushed with the content.
        if (push && term != '?')
            pop(txtPos - 1);
    }

    /**
     * Skips the rest of a DOCTYPE declaration by counting angle brackets:
     * every '&lt;' opens a level, every '&gt;' closes one, and the method
     * returns when the level that was open on entry is closed.
     * parseLegacy() calls this after it has read "&lt;!" and the DOCTYPE keyword.
     *
     * @throws IOException if the input ends before the declaration is closed,
     *                     or if reading fails
     */
    private final void parseDoctype() throws IOException {

        int open = 1;

        while (open != 0) {
            final int ch = read();
            if (ch == -1)
                exception(UNEXPECTED_EOF);
            else if (ch == '<')
                open++;
            else if (ch == '>')
                open--;
        }
    }

    /**
     * Parses an end tag and closes the innermost open element when the names
     * match. The leading "&lt;/" is expected to be in the lookahead and is
     * consumed here, followed by the name, optional whitespace and '&gt;'.
     * Sets name to the tag name that was read.
     *
     * In strict mode an end tag without any open element, or one whose name
     * differs from the innermost open element, raises an exception. In relaxed
     * mode both situations are tolerated and the element stack is left unchanged.
     *
     * @throws IOException on a stray or mismatching end tag in strict mode,
     *                     or if reading fails
     */
    private final void parseEndTag() throws IOException {

        read(); // '<'
        read(); // '/'
        name = readName();

        if (depth == 0) {
            // No element is open, so there is no stack entry to compare with;
            // indexing elementStack[depth - 1] here would be out of bounds.
            if (!relaxed)
                exception("element stack empty");
        } else if (name.equals(elementStack[depth - 1])) {
            depth--;
        } else if (!relaxed) {
            // Report the innermost open element, which sits at depth - 1.
            exception("expected: " + elementStack[depth - 1]);
        }
        skip();
        read('>');
    }

    /**
     * Classifies the upcoming input from the two lookahead characters
     * without consuming anything.
     *
     * @return END_DOCUMENT at end of input, ENTITY_REF for '&amp;', TEXT for
     *         anything other than '&lt;'; after '&lt;': END_TAG for '/', CDSECT
     *         for '[', LEGACY for '?' or '!', START_TAG otherwise
     */
    private final int peekType() {
        if (peek0 == -1)
            return END_DOCUMENT;
        if (peek0 == '&')
            return ENTITY_REF;
        if (peek0 != '<')
            return TEXT;

        // Markup: the character after '<' decides the kind.
        if (peek1 == '/')
            return END_TAG;
        if (peek1 == '[')
            return CDSECT;
        if (peek1 == '?' || peek1 == '!')
            return LEGACY;
        return START_TAG;
    }

    /**
     * Makes sure a string array can hold the required number of entries.
     *
     * @param arr      the array to check
     * @param required the minimum length needed
     * @return arr itself when it is long enough; otherwise a new array of
     *         length required + 16 that starts with the contents of arr
     */
    private static final String[] ensureCapacity(String[] arr, int required) {
        final int current = arr.length;
        if (required > current) {
            final String[] grown = new String[required + 16];
            System.arraycopy(arr, 0, grown, 0, current);
            return grown;
        }
        return arr;
    }

    /** Sets name and attributes */

    private final void parseStartTag() throws IOException {

        read(); // <
        name = readName();
        elementStack = ensureCapacity(elementStack, depth + 1);
        elementStack[depth++] = name;

        while (true) {
            skip();

            int c = peek0;

            if (c == '/') {
                degenerated = true;
                read();
                skip();
                read('>');
                break;
            }

            if (c == '>') {
                read();
                break;
            }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -