📄 newsindex.java

📁 一个Web爬虫（机器人
💻 JAVA
字号:
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx.searchengine;

import websphinx.*;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.net.MalformedURLException;

/**
 * <A href="http://www.newsindex.com/">NewsIndex</a> search engine.
 * <P>
 * Recognizes NewsIndex result pages (by page title), labels their result
 * regions, and builds query URLs for keyword searches.
 */
public class NewsIndex implements SearchEngine {

    /** Matches the "Headlines a to b of N" banner; the parenthesized group is N. */
    static Pattern patCount = new Regexp (
        "<center>Headlines\\s+\\d+\\s+to\\s+\\d+\\s+of\\s+(\\d+)</center>"
    );

    /** Matches the message shown when a query has no hits. */
    static Pattern patNoHits = new Regexp (
        "No articles were found matching your search criteria"
    );

    /** Matches one result entry, exposing the fields link, title, and description. */
    static Pattern patResult = new Tagexp (
      "<dd>(?{link}(?{title}<a>.*?</a>))" // title and link
    + "<blockquote><b></b>"               // news source
    + "(?{description})</blockquote>"     // description and index date
    );

    /**
     * Classify a page.  Pages whose title is not exactly
     * "News Index - Results" are left untouched.  Otherwise sets the
     * following labels:
     * <TABLE>
     * <TR><TH>Name</TH> <TH>Type</TH> <TH>Meaning</TH></TR>
     * <TR><TD>searchengine.source</TD> <TD>Page label</TD>
     *     <TD>NewsIndex object that labeled the page</TD></TR>
     * <TR><TD>searchengine.count</TD> <TD>Page field</TD>
     *     <TD>Value captured from the "Headlines a to b of N" banner,
     *     set only when the banner is found</TD></TR>
     * <TR><TD>searchengine.results</TD> <TD>Page fields</TD>
     *     <TD>Array of results.  Each result region contains subfields:
     *     title, description, and link.</TD></TR>
     * <TR><TD>searchengine.more-results</TD> <TD>Link label</TD>
     *     <TD>Link to a page containing more results.</TD></TR>
     * </TABLE>
     *
     * @param page page to classify
     */
    public void classify (Page page) {
        String title = page.getTitle ();
        if (title == null || !title.equals ("News Index - Results")) {
            return;
        }

        page.setObjectLabel ("searchengine.source", this);

        Region count = patCount.oneMatch (page);
        if (count != null) {
            // NOTE(review): field "0" is presumably the captured (\d+) group
            // of patCount -- confirm against Regexp's field numbering.
            page.setField ("searchengine.count", count.getField ("0"));
        }

        // Wrap every matched result region as a SearchEngineResult.
        Region[] results = patResult.allMatches (page);
        SearchEngineResult[] ser = new SearchEngineResult[results.length];
        for (int i = 0; i < results.length; ++i) {
            ser[i] = new SearchEngineResult (results[i]);
        }
        page.setFields ("searchengine.results", ser);

        // Find the "more" link; only the first matching link is labeled.
        Link[] links = page.getLinks ();
        for (int i = 0; i < links.length; ++i) {
            if (links[i].toText ().equals ("Next 10 Headlines")) {
                links[i].setLabel ("searchengine.more-results");
                links[i].setLabel ("hyperlink");
                break;
            }
        }
    }

    /**
     * Priority of this classifier.
     */
    public static final float priority = 0.0F;

    /**
     * Get priority of this classifier.
     * @return priority.
     */
    public float getPriority () {
        return priority;
    }

    /**
     * Make a query URL for NewsIndex.  The keywords are form-encoded as
     * UTF-8 so that the resulting URL does not depend on the platform's
     * default character encoding.
     * @param keywords list of keywords, separated by spaces
     * @return URL that submits the keywords to NewsIndex.
     * @throws RuntimeException if the URL cannot be constructed; the
     * underlying exception is attached as the cause
     */
    public URL makeQuery (String keywords) {
        try {
            return new URL("http://www.newsindex.com/cgi-bin/process.cgi?mode=any&query="
                         + URLEncoder.encode(keywords, "UTF-8"));
        } catch (MalformedURLException e) {
            // Keep the cause so the failure can be diagnosed.
            throw new RuntimeException ("internal error", e);
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform must support.
            throw new RuntimeException ("internal error", e);
        }
    }

    /**
     * Get number of results per page for this search engine.
     * @return typical number of results per page
     */
    public int getResultsPerPage () {
        return 10;
    }

    /**
     * Search NewsIndex.
     * @param keywords list of keywords, separated by spaces
     * @return enumeration of SearchEngineResults returned by a NewsIndex
     * query constructed from the keywords.
     */
    public static Search search (String keywords) {
        return new Search (new NewsIndex(), keywords);
    }

    /**
     * Search NewsIndex.
     * @param keywords list of keywords, separated by spaces
     * @param maxResults maximum number of results to return
     * @return enumeration of SearchEngineResults returned by a NewsIndex
     * query constructed from the keywords.
     * The enumeration yields at most maxResults objects.
     */
    public static Search search (String keywords, int maxResults) {
        return new Search (new NewsIndex(), keywords, maxResults);
    }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -