📄 htmlparser.java
字号:
/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University * * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx;import java.io.InputStream;import java.io.IOException;//#ifdef JDK1.1 import java.io.Reader;import java.io.InputStreamReader;import java.io.StringReader;//#endif JDK1.1/*#ifdef JDK1.0import java.io.StringBufferInputStream;#endif JDK1.0*/import java.util.Hashtable;import java.util.Enumeration;import java.util.Vector;import java.util.Stack;import java.net.URL;import java.net.MalformedURLException;/** * HTML parser. Parses an input stream or String and * converts it to a sequence of Tags and a tree of Elements. * HTMLParser is used by Page to parse pages. */// FIX: make HTMLParser into an interface, and// split this implementation into Tokenizer and TreeBuilderpublic class HTMLParser { static final int BUFFER_SIZE = 10240; int maxBytes = Integer.MAX_VALUE; /** * Make an HTMLParser. */ public HTMLParser () { } /** * Make an HTMLParser which retrieves pages * using the specified download parameters. Pages * larger than dp.getMaxPageSize() are rejected by parse() * with an IOException. * @param dp download parameters used during parsing */ public HTMLParser (DownloadParameters dp) { this.maxBytes = dp.getMaxPageSize () * 1024; } /** * Parse an input stream. * @param page Page to receive parsed HTML * @param input stream containing HTML */ public void parse (Page page, InputStream stream) throws IOException {//#ifdef JDK1.1 Reader r = new InputStreamReader (stream); tokenize (page, r, true);//#endif JDK1.1/*#ifdef JDK1.0 tokenize (page, stream, true);#endif JDK1.0*/ buildParseTree (page); } /** * Parse an input stream. * @param page Page to receive parsed HTML * @param input stream containing HTML *///#ifdef JDK1.1 public void parse (Page page, Reader stream) throws IOException { tokenize (page, stream, true); buildParseTree (page); }//#endif JDK1.1 /** * Parse a string. * @param page Page to receive parsed HTML * @param content String containing HTML */ public void parse (Page page, String content) throws IOException {//#ifdef JDK1.1 Reader r = new StringReader (content); tokenize (page, r, false); r.close ();//#endif JDK1.1/*#ifdef JDK1.0 InputStream stream = new StringBufferInputStream (content); tokenize (page, stream, false); stream.close ();#endif JDK1.0*/ buildParseTree (page); } /** * Download an input stream without parsing it. * @param page Page to receive the downloaded content * @param input stream containing content */ public void dontParse (Page page, InputStream stream) throws IOException {//#ifdef JDK1.1 Reader r = new InputStreamReader (stream); dontParse (page, r);//#endif JDK1.1/*#ifdef JDK1.0 int n; int total = 0; contentBuf.setLength (0); while ((n = stream.read (buf)) != -1) { total += n; if (total > maxBytes) { throw new IOException ("Page greater than " + maxBytes + " bytes"); } contentBuf.append (new String (buf, 0, 0, n)); } page.content = contentBuf.toString (); page.start = 0; page.end = contentBuf.length();#endif JDK1.0*/ } /** * Download an input stream without parsing it. * @param page Page to receive the downloaded content * @param r stream containing content *///#ifdef JDK1.1 public void dontParse (Page page, Reader stream) throws IOException { int n; int total = 0; contentBuf.setLength (0); while ((n = stream.read (buf)) != -1) { total += n; if (total > maxBytes) { throw new IOException ("Page greater than " + maxBytes + " bytes"); } contentBuf.append (buf, 0, n); } page.content = contentBuf.toString (); page.start = 0; page.end = contentBuf.length(); }//#endif JDK1.1 /* * HTML tokenizer state machine */ // state takes on one of the following values: private static final int START = 0; private static final int INWORD = 1; private static final int ENTITY = 2; private static final int LT = 4; private static final int BANG = 5; private static final int BANG_DASH = 6; private static final int CMT = 7; private static final int CMT_DASH = 8; private static final int CMT_DASHDASH = 9; private static final int DIRECTIVE = 10; private static final int STAG = 11; private static final int ETAG = 12; private static final int ATTR = 13; private static final int ATTRNAME = 14; private static final int EQ = 15; private static final int AFTEREQ = 16; private static final int ATTRVAL = 17; private static final int ATTRVAL_SQ = 18; private static final int ATTRVAL_DQ = 19; private static final int DONE = 20; private static final int ENTNUM = 21; private static final int ENTREF = 22;//#ifdef JDK1.1 char[] buf = new char[BUFFER_SIZE];//#endif JDK1.1/*#ifdef JDK1.0 byte[] buf = new byte[BUFFER_SIZE];#endif JDK1.0*/ StringBuffer contentBuf = new StringBuffer (); StringBuffer wordBuf = new StringBuffer (); StringBuffer tagName = new StringBuffer (); StringBuffer attrName = new StringBuffer (); StringBuffer attrVal = new StringBuffer (); Vector attrs = new Vector (); StringBuffer entity = new StringBuffer (); // FIX: should entities in attr names or values be expanded?//#ifdef JDK1.1 private void tokenize (Page page, Reader stream, boolean saveContent) throws IOException {//#endif JDK1.1/*#ifdef JDK1.0 private void tokenize (Page page, InputStream stream, boolean saveContent) throws IOException {#endif JDK1.0*/ int state = START; int bufptr = 0; int buflen = 0; int bufbase = 0; // token list Vector tokens = new Vector(); int wordStart = 0; int nWords = 0; Tag tag = null; int tagStart = 0; int entnum = 0; StringBuffer entityTargetBuf = null; int postEntityState = 0; contentBuf.setLength (0); while (true) { if (bufptr >= buflen) { bufptr = 0; bufbase += buflen; buflen = stream.read (buf); if (buflen == -1) break; if (bufbase + buflen > maxBytes) { throw new IOException ("Page exceeded " + maxBytes + " bytes"); } if (saveContent)//#ifdef JDK1.1 contentBuf.append (buf, 0, buflen);//#endif JDK1.1/*#ifdef JDK1.0 contentBuf.append (new String (buf, 0, 0, buflen));#endif JDK1.0*/ } char c = (char)buf[bufptr]; //System.err.println ("%% state == " + state + ", ptr == " + (bufbase+bufptr) + ", c == " + c); switch (state) { case START: // after whitespace or tag switch (c) { case '<': ++bufptr; state = LT; break; case ' ': case '\t': case '\n': case '\r': ++bufptr; break; default: wordBuf.setLength (0); wordStart = bufbase+bufptr; state = INWORD; break; } break; case INWORD: // Character data switch (c) { case '<': tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ())); ++nWords; state = START; break; case ' ': case '\t': case '\n': case '\r': tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ())); ++nWords; state = START; ++bufptr; break; case '&': ++bufptr; postEntityState = INWORD; entityTargetBuf = wordBuf; state = ENTITY; break; default: wordBuf.append ((char)c); ++bufptr; // state == INWORD; break; } break; // Entities case ENTITY: if (c == '#') { ++bufptr; entnum = 0; state = ENTNUM; } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { entity.setLength (0); state = ENTREF; } else { entityTargetBuf.append ('&'); state = postEntityState; } break; case ENTREF: if (!Character.isLetterOrDigit(c)) { Character ent = lookupEntityRef (entity.toString ()); if (ent != null) { entityTargetBuf.append (ent.charValue()); if (c == ';') ++bufptr; } else { // unrecognized entity -- leave // as-is entityTargetBuf.append ('&'); entityTargetBuf.append (entity.toString ()); } state = postEntityState; } else { ++bufptr; entity.append (c); // state == ENTREF; } break; case ENTNUM: if (c==';' || !Character.isDigit(c)) { entityTargetBuf.append ((char) entnum); if (c == ';') ++bufptr; state = postEntityState; } else { entnum = 10*entnum + (c - '0'); ++bufptr; } break; case LT: tagStart = bufbase+bufptr-1; switch (c) { case '/': ++bufptr; tagName.setLength (0); state = ETAG; break; case '!': ++bufptr; state = BANG; break; default: if (Character.isLetter (c)) { tagName.setLength (0); state = STAG; } else { wordBuf.append ('<'); state = INWORD; } break; } break; // Comments and directives. // Implements the (broken, but easy) Netscape rule: // <!-- starts a comment, --> closes. // All other directives <!foo> are also returned as comments. case BANG: if (c == '-') { ++bufptr; state = BANG_DASH; } else { state = DIRECTIVE; } break; case BANG_DASH: if (c == '-') { ++bufptr; state = CMT; } else { state = DIRECTIVE; } break; case CMT: if (c == '-') { ++bufptr; state = CMT_DASH; } else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -