📄 standardclassifier.java

📁 java写的crawler
💻 JAVA
字号:
/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import java.net.URL;

/**
 * Standard classifier, installed in every crawler by default.
 * <P>On the entire page, this classifier sets the following labels:
 * <UL>
 * <LI><B>root</B>: page is the root page of a Web site.  For instance,
 *     "http://www.digital.com/" and "http://www.digital.com/index.html" are both
 *     marked as root, but "http://www.digital.com/about" is not.
 * </UL>
 * <P>Also sets one or more of the following labels on every link:
 * <UL>
 * <LI><B>hyperlink</B>: link is a hyperlink (A, AREA, or FRAME tags) to another
 *     page on the Web (using http, file, ftp, or gopher protocols)
 * <LI><B>image</B>: link is an inline image (IMG).
 * <LI><B>form</B>: link is a form (FORM tag).  A form generally requires some
 *     parameters to use.
 * <LI><B>code</B>: link points to code (APPLET, EMBED, or SCRIPT).
 * <LI><B>remote</B>: link points to a different Web server.
 * <LI><B>local</B>: link points to the same Web server.
 * <LI><B>same-page</B>: link points to the same page (e.g., by an anchor
 *     reference like "#top")
 * <LI><B>sibling</B>: a local link that points to a page in the same directory
 *     (e.g. "sibling.html")
 * <LI><B>descendent</B>: a local link that points downwards in the directory
 *     structure (e.g., "deep/deeper/deepest.html")
 * <LI><B>ancestor</B>: a link that points upwards in the directory structure
 *     (e.g., "../..")
 * </UL>
 */
public class StandardClassifier implements Classifier {

    /**
     * Make a StandardClassifier.
     */
    public StandardClassifier () {
    }

    /**
     * Classify a page.
     * <P>Labels the page itself as "root" when the filename of its origin is
     * empty or starts with "index.htm", then labels each of the page's links
     * with its location relative to the page (local/remote and, for local
     * links, same-page/sibling/descendent/ancestor) and with its kind
     * (image/code/form/hyperlink).  A page whose link array is null gets only
     * the page-level label.
     *
     * @param page Page to classify
     */
    // FIX: use regular expressions throughout this method
    public void classify (Page page) {
        Link origin = page.getOrigin ();
        String pageHost = origin.getHost ();
        int pagePort = origin.getPort ();
        String pagePath = origin.getFile ();
        String pageFilename = origin.getFilename ();

        URL base = page.getBase ();
        String baseHost = base.getHost ();
        int basePort = base.getPort ();
        String basePath = base.getFile ();

        if (pageFilename.equals ("") || pageFilename.startsWith ("index.htm")) {
            page.setLabel ("root");
        }

        // FIX: Link needs to resolve "foo/bar/.." and "foo/." to "foo" in order for this
        // stuff to work properly
        Link[] links = page.getLinks ();
        if (links == null) {
            return;
        }

        for (int i = 0; i < links.length; ++i) {
            Link link = links[i];

            // A link is local when it matches the host and port of either the
            // page's origin or the page's base URL.
            boolean matchesOrigin = link.getHost ().equals (pageHost)
                                    && link.getPort () == pagePort;
            boolean isLocal = matchesOrigin
                              || (link.getHost ().equals (baseHost)
                                  && link.getPort () == basePort);

            if (isLocal) {
                link.setLabel ("local");
                labelLocalRelation (link, origin, pagePath, basePath);
            } else {
                link.setLabel ("remote");
            }

            labelTagKind (link);
        }
    }

    /**
     * Label a local link by where it points relative to the page:
     * same-page, sibling, descendent, or ancestor (first match wins; a link
     * matching none of them gets no relation label).
     *
     * @param link     local link to label
     * @param origin   origin link of the page being classified
     * @param pagePath file part of the page's origin
     * @param basePath file part of the page's base URL
     */
    private void labelLocalRelation (Link link, Link origin,
                                     String pagePath, String basePath) {
        String linkPath = link.getFile ();

        // NOTE(review): the descendent/ancestor tests below compare whole file
        // paths, not directories; confirm this matches what Link.getFile()
        // returns for non-directory pages.
        if (linkPath.equals (pagePath) || linkPath.equals (basePath)) {
            link.setLabel ("same-page");
        } else if (link.getDirectory ().equals (origin.getDirectory ())) {
            link.setLabel ("sibling");
        } else if (descendsFrom (linkPath, pagePath)
                   || descendsFrom (linkPath, basePath)) {
            link.setLabel ("descendent");
        } else if (descendsFrom (pagePath, linkPath)
                   || descendsFrom (basePath, linkPath)) {
            link.setLabel ("ancestor");
        }
        // NIY: child, parent
    }

    /**
     * Label a link by the kind of tag that produced it.
     * Link tag kinds: resource, form, hyperlink.
     *
     * @param link link to label
     */
    private void labelTagKind (Link link) {
        String tagName = link.getTagName ();

        if (isTag (tagName, Tag.IMG)) {
            link.setLabel ("image");
        } else if (isTag (tagName, Tag.APPLET)
                   || isTag (tagName, Tag.EMBED)
                   || isTag (tagName, Tag.SCRIPT)) {
            link.setLabel ("code");
        } else if (isTag (tagName, Tag.FORM)) {
            link.setLabel ("form");
        } else if (isTag (tagName, Tag.A)
                   || isTag (tagName, Tag.AREA)
                   || isTag (tagName, Tag.FRAME)) {
            String protocol = link.getProtocol ();

            // Only GET links over a browsable protocol count as hyperlinks.
            if ((protocol.equals ("http")
                 || protocol.equals ("ftp")
                 || protocol.equals ("file")
                 || protocol.equals ("gopher"))
                && link.getMethod () == Link.GET) {
                link.setLabel ("hyperlink");
            }
        }
    }

    /**
     * Test whether a tag name denotes the expected tag.
     * Accepts the same instance (the original <code>==</code> test) and also
     * an equal but distinct String, so the result does not depend on both
     * names being the same interned instance.
     *
     * @param tagName  tag name of the link, possibly null
     * @param expected tag name constant to compare against
     * @return true if the two names are the same tag name
     */
    private static boolean isTag (String tagName, String expected) {
        return tagName == expected
               || (tagName != null && tagName.equals (expected));
    }

    /**
     * Test whether one path lies strictly below another, treating the second
     * path as a directory prefix.
     *
     * @param path1 candidate descendant path
     * @param path2 candidate ancestor path; a trailing "/" is appended when
     *              it does not already end with one
     * @return true if path1 starts with path2 followed by "/"
     */
    private boolean descendsFrom (String path1, String path2) {
        String prefix = path2.endsWith ("/") ? path2 : path2 + "/";
        return path1.startsWith (prefix);
    }

    /**
     * Priority of this classifier.
     */
    public static final float priority = 0.0F;

    /**
     * Get priority of this classifier.
     * @return priority.
     */
    public float getPriority () {
        return priority;
    }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -