newsbot.java

来自「一个Web爬虫（机器人）」· Java 代码 · 共 153 行

JAVA

153 行

/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University  *  * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software  * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx.searchengine;import websphinx.*;import java.net.URL;import java.net.URLEncoder;import java.net.MalformedURLException;/** * <A href="http://www.newbot.com/">NewsBot</a> search engine. */public class NewsBot implements SearchEngine {    static Pattern patTitle = new Regexp ("^");    static Pattern patCount = new Regexp (        "Returned <B>(\\d+)</b> results"    );    static Pattern patNoHits = new Regexp (        "Sorry -- your search yielded no results"    );    // FIX: works only for Netscape    static Pattern patResult = new Tagexp (            "<font>"           +"(?{link}<A>(?{title})</A>)"           +"</font>"           +"<br>"           +"<font></font>(?{description})<br>"           +"<font><b></b></font><p>"    );    static Pattern patMoreLink = new Tagexp (        "<input type=image name=act.next>"    );    /**     * Classify a page.  Sets the following labels:     * <TABLE>     * <TR><TH>Name <TH>Type  <TH>Meaning     * <TR><TD>searchengine.source <TD>Page label <TD>NewsBot object that labeled this page     * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page     * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results.  Each result region     * contains subfields: rank, title, description, and link.     * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.     
* </TABLE>     */    public void classify (Page page) {        String title = page.getTitle ();        if (title != null && title.startsWith ("HotBot results:")) {            page.setObjectLabel ("searchengine.source", this);            Region count = patCount.oneMatch (page);            if (count != null)                page.setField ("searchengine.count", count.getField ("0"));                        Region[] results = patResult.allMatches (page);            SearchEngineResult[] ser = new SearchEngineResult[results.length];            for (int i=0; i<results.length; ++i) {                ser[i] = new SearchEngineResult (results[i]);                //System.out.println (ser[i]);            }            page.setFields ("searchengine.results", ser);            PatternMatcher m = patMoreLink.match (page);            while (m.hasMoreElements ()) {                Link link = (Link)m.nextMatch ();                link.setLabel ("searchengine.more-results");                link.setLabel ("hyperlink");                         }        }        else System.err.println ("not a NewsBot page");    }    /**     * Priority of this classifier.     */    public static final float priority = 0.0F;        /**     * Get priority of this classifier.     * @return priority.     */    public float getPriority () {        return priority;    }    /**     * Make a query URL for NewsBot.     * @param keywords list of keywords, separated by spaces     * @return URL that submits the keywords to NewsBot.     
*/    public URL makeQuery (String keywords) {        try {            java.util.StringTokenizer tok = new java.util.StringTokenizer (keywords);            StringBuffer output = new StringBuffer ();            while (tok.hasMoreElements ()) {                String kw = tok.nextToken ();                if (output.length () > 0)                    output.append (" or ");                output.append (kw);            }            return new URL("http://engine.newbot.com/newbot/server/query.fpl?client_id=0sQaJNoAahXc&output=hotbot4&logad=1&client_sw=html&client_vr=0.9&client_last_updated=ignore&T0=hotbot&S0=date&P0=&F0=24&Q0="                           + URLEncoder.encode(output.toString())+ "&max_results=50&S0=rank&Search.x=55&Search.y=4");        } catch (MalformedURLException e) {            throw new RuntimeException ("internal error");        }    }    /**     * Get number of results per page for this search engine.     * @return typical number of results per page     */    public int getResultsPerPage () {        return 10;    }    /**     * Search NewsBot.     * @param keywords list of keywords, separated by spaces     * @return enumeration of SearchEngineResults returned by a NewsBot query constructed from the keywords.     */    public static Search search (String keywords) {        return new Search (new NewsBot(), keywords);    }    /**     * Search NewsBot.     * @param keywords list of keywords, separated by spaces     * @param maxResults maximum number of results to return     * @return enumeration of SearchEngineResults returned by an NewsBot query constructed from the keywords.     * The enumeration yields at most maxResults objects.     */    public static Search search (String keywords, int maxResults) {        return new Search (new NewsBot(), keywords, maxResults);    }}

newsbot.java - 源码说明

本页面展示了「一个Web爬虫（机器人）」中的 newsbot.java 源码文件，采用 Java 编程语言编写，共 153 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与Web相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?