📄 robotsmetatagparser.java

📁 这是一个用于测试用的搜索引擎的案例
💻 JAVA
字号:
package ir.webutils;

import java.util.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import java.io.*;

/**
 * Parser callback that extracts robots META tag information.
 *
 * <p>Typical use: supply the page text (and its URL), call
 * {@link #parseMetaTags()}, then query {@link #index()}.  An instance may
 * be reused for several pages via {@link #setPage(String)} and
 * {@link #setUrl(URL)}; each call to {@code parseMetaTags()} starts from a
 * clean state.
 *
 * @author Ted Wild
 */
public final class RobotsMetaTagParser extends HTMLEditorKit.ParserCallback {

    /** Directives in a robots META content value are separated by commas, semicolons and/or whitespace. */
    private static final String DIRECTIVE_SEPARATORS = "[,;\\s]+";

    /** HTML text of the page to inspect. */
    private String page;

    /** Swing HTML parser that drives the callbacks of this object. */
    private final HTMLEditorKit.Parser parser;

    /** Lower-cased content of the last robots META tag seen, or {@code null} if none was seen. */
    private String robotRules = null;

    /** URL of the page; used to build the page object handed to the link extractor. */
    private URL url;

    /** Whether the page most recently parsed may be indexed. */
    private boolean index = true;

    /** Creates a parser with no page or URL set; set them before parsing. */
    public RobotsMetaTagParser() {
        HTMLParserMaker kit = new HTMLParserMaker();
        this.parser = kit.getParser();
    }

    /**
     * Creates a parser for the page at the given URL.
     *
     * @param url The URL of the page.
     */
    public RobotsMetaTagParser(URL url) {
        this();
        this.url = url;
    }

    /**
     * Creates a parser for the given page.
     *
     * @param url The URL of the page.
     * @param page The HTML text of the page.
     */
    public RobotsMetaTagParser(URL url, String page) {
        this();
        this.url = url;
        this.page = page;
    }

    /**
     * Sets the HTML text to be examined by the next call to
     * <code>parseMetaTags</code>.
     *
     * @param page The HTML text of the page.
     */
    public void setPage(String page) {
        this.page = page;
    }

    /**
     * Sets the URL associated with the page.
     *
     * @param url The URL of the page.
     */
    public void setUrl(URL url) {
        this.url = url;
    }

    /**
     * Checks for robots META tags.  If a robots META tag is found,
     * then the content (if any) is extracted and stored.  Note that
     * only the last robots META tag will be considered.
     *
     * @param tag Indicates the type of tag that caused this method to
     * be called.  Only META tags are handled, any other kind of tag
     * causes this method to do nothing.
     *
     * @param attributes The attributes of this tag.  If the tag
     * defines the "name" attribute with value "robots" (not case
     * sensitive) then the "content" attribute will be checked, and
     * stored if it exists.
     *
     * @param position The position of the tag in the document.  Not
     * used.
     */
    @Override
    public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
        if (tag != HTML.Tag.META || !attributes.isDefined(HTML.Attribute.NAME)) {
            return;
        }

        Object name = attributes.getAttribute(HTML.Attribute.NAME);
        if (!(name instanceof String) || !((String) name).equalsIgnoreCase("robots")) {
            return;
        }

        if (attributes.isDefined(HTML.Attribute.CONTENT)) {
            Object content = attributes.getAttribute(HTML.Attribute.CONTENT);
            if (content instanceof String) {
                // Locale.ROOT keeps the lowercasing independent of the
                // default locale (e.g. Turkish maps 'I' to a dotless 'i',
                // which would make "NOINDEX" unrecognizable).
                robotRules = ((String) content).toLowerCase(Locale.ROOT);
            }
        }
    }

    /**
     * Parses the document and returns a list of links that can not be
     * followed.  This method also sets a flag that indicates whether
     * or not this page can be indexed.  Clients can then use
     * <code>index</code> to check the value of this flag.
     *
     * <p>Recognized directives (matched as whole tokens, not case
     * sensitive): <code>noindex</code> clears the index flag,
     * <code>nofollow</code> forbids following the page's links, and
     * <code>none</code> does both.  Any state left over from a previous
     * call is discarded first.
     *
     * @return A <code>List</code> of <code>Link</code>s that should
     * not be followed from this page; empty when following is allowed.
     *
     * @throws NullPointerException if no page text has been set.
     */
    public List<Link> parseMetaTags() {
        if (this.page == null) {
            throw new NullPointerException(
                "RobotsMetaTagParser.parseMetaTags(): page has not been set");
        }

        // Start clean so a reused instance does not inherit the rules of
        // the previously parsed page.
        robotRules = null;
        index = true;

        StringReader reader = new StringReader(this.page);
        try {
            parser.parse(reader, this, true);
        }
        catch (ChangedCharSetException ignored) {
            // Not expected: the parser is asked to ignore charset
            // declarations (third argument is true).
        }
        catch (IOException e) {
            System.err.println("RobotsMetaTagParser.parseMetaTags(): " + e);
        }
        finally {
            reader.close();
        }

        boolean follow = true;
        if (robotRules != null) {
            // Compare whole directives rather than substrings so values
            // such as "max-image-preview:none" are not mistaken for "none".
            for (String directive : robotRules.split(DIRECTIVE_SEPARATORS)) {
                if (directive.equals("noindex")) {
                    index = false;
                } else if (directive.equals("nofollow")) {
                    follow = false;
                } else if (directive.equals("none")) {
                    // "none" is shorthand for "noindex, nofollow".
                    index = false;
                    follow = false;
                }
            }
        }

        if (!follow) {
            // Every link on the page is off limits.
            return new LinkExtractor(new HTMLPage(new Link(this.url), this.page)).extractLinks();
        }
        return new LinkedList<Link>();
    }

    /**
     * Indicates whether the page can be indexed.  Call this method
     * only after <code>parseMetaTags</code> has been called.
     *
     * @return <code>true</code> iff. the page can be indexed.
     */
    public boolean index() {
        return index;
    }
}// RobotsMetaTagParser
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -