⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spiderworker.java

📁 java版的网络蚂蚁软件
💻 JAVA
字号:
package com.heaton.bot;import com.heaton.bot.*;import java.net.*;/** * The SpiderWorker class performs the actual work of * spidering pages.  It is implemented as a thread * that is created by the spider class. * * Copyright 2001-2003 by Jeff Heaton (http://www.jeffheaton.com) * * @author Jeff Heaton * @version 1.2 */public class SpiderWorker extends Thread {  /**   * The URL that this spider worker   * should be downloading.   */  protected String target;  /**   * The owner of this spider worker class,   * should always be a Spider object.   * This is the class that this spider   * worker will send its data to.   */  protected Spider owner;  /**   * Indicates if the spider is busy or not.   * true = busy   * false = idle   */  protected boolean busy;  /**   * A descendant of the HTTP object that   * this class should be using for HTTP   * communication. This is usually the   * HTTPSocket class.   */  protected HTTP http;  /**   * Constructs a spider worker object.   *   * @param owner The owner of this object, usually   * a Spider object.   * @param http   */  public SpiderWorker(Spider owner,HTTP http)  {    this.http = http;    this.owner = owner;  }  /**   * Returns true of false to indicate if   * the spider is busy or idle.   *   * @return true = busy   * false = idle   */  public boolean isBusy()  {    return this.busy;  }  /**   * The run method causes this thread to go idle   * and wait for a workload. Once a workload is   * received, the processWorkload method is called   * to handle the workload.   */  public void run()  {    for ( ;; ) {      target = this.owner.getWorkload();      if ( target==null )        return;      owner.getSpiderDone().workerBegin();      processWorkload();      owner.getSpiderDone().workerEnd();    }  }  /**   * The run method actually performs the   * the workload assigned to this object.   */  public void processWorkload()  {    try {      busy = true;      Log.log(Log.LOG_LEVEL_NORMAL,"Spidering " + target );      http.send(target,null);      Attribute typeAttribute = http.getServerHeaders().get("Content-Type");      // if no content-type at all, its PROBABLY not HTML      if ( typeAttribute==null )        return;      // now check to see if is HTML, ONLY PARSE text type files(namely HTML)      owner.processPage(http);            if ( !typeAttribute.getValue().startsWith("text/") )        return;      HTMLParser parse = new HTMLParser();      parse.source = new StringBuffer(http.getBody());      // find all the links      while ( !parse.eof() ) {        char ch = parse.get();        if ( ch==0 ) {          HTMLTag tag = parse.getTag();          Attribute link = tag.get("HREF");          if ( link==null )            link = tag.get("SRC");          if ( link==null )            continue;          URL target=null;          try {            target = new URL(new URL(this.target),link.getValue());          } catch ( MalformedURLException e ) {            Log.log(Log.LOG_LEVEL_TRACE,                    "Spider found other link: " + link );            owner.foundOtherLink(link.getValue());            continue;          }          if ( owner.getRemoveQuery() )            target = URLUtility.stripQuery(target);          target = URLUtility.stripAnhcor(target);          if ( target.getHost().equalsIgnoreCase(                                                new URL(this.target).getHost()) ) {            Log.log(Log.LOG_LEVEL_NORMAL,                    "Spider found internal link: " + target.toString() );            owner.foundInternalLink(target.toString());          } else {            Log.log(Log.LOG_LEVEL_NORMAL,                    "Spider found external link: " + target.toString() );            owner.foundExternalLink(target.toString());          }        }      }      owner.completePage(http,false);    } catch ( java.io.IOException e ) {      Log.log(Log.LOG_LEVEL_ERROR,              "Error loading file("+ target +"): " + e );      owner.completePage(http,true);    } catch ( Exception e ) {      Log.logException(                      "Exception while processing file("+ target +"): ", e );      owner.completePage(http,true);    } finally {      busy = false;    }  }  /**   * Returns the HTTP descendant that this   * object should use for all HTTP communication.   *   * @return An HTTP descendant object.   */  public HTTP getHTTP()  {    return http;  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -