📄 linklocalizer.java

📁 真正的网络爬虫的源代码啊,希望大家好好阅读,写出心得体会啊
💻 JAVA
字号:
package net.matuschek.spider.docfilter;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

import net.matuschek.http.HttpDoc;
import net.matuschek.util.NullWriter;

import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;

import org.w3c.tidy.Tidy;


/**
 * Localizer tries to replace absolute links by relative links
 * and should allow offline browsing. 
 *
 * It uses JTidy to parse the file.
 * 
 * @author Daniel Matuschek 
 * @version $Revision: 1.11 $
 */
public class LinkLocalizer implements DocumentFilter
{
  /** File name that a directory URL (path ending in "/") is mapped to. */
  private static final String INDEX_FILE = "index.html";

  /** processing enabled ? */
  protected boolean enabled=true;

  /**
   * Parses an HTML document with JTidy, rewrites the link attributes
   * of its elements to relative URLs and stores the pretty-printed
   * result back into the document.
   *
   * @param input the document to process, may be <code>null</code>
   * @return <code>null</code> if input is <code>null</code>; the
   * unchanged input if it is not HTML, if the filter is disabled or
   * if it has no content; otherwise the same HttpDoc object with its
   * content replaced by the localized HTML
   * @throws FilterException if a null node is found while walking
   * the parsed DOM
   */
  public HttpDoc process(HttpDoc input) 
    throws FilterException 
  {
    if (input == null) { 
      return null;
    }

    if (! input.isHTML()) {
      return input;
    }

    if (! enabled) {
      return input;
    }

    // nothing to parse; ByteArrayInputStream would fail on null
    byte[] content = input.getContent();
    if (content == null) {
      return input;
    }

    // okay, parse the HTML code
    // (tag and attribute names are kept lower case, rewriteDOM
    // compares against lower case names; parser messages are dropped)
    Tidy tidy = new Tidy();
    tidy.setUpperCaseTags(false);
    tidy.setUpperCaseAttrs(false);
    tidy.setErrout(new PrintWriter(new NullWriter()));

    Document doc = tidy.parseDOM(new ByteArrayInputStream(content), null);

    rewriteDOM(doc, input.getURL());

    // serialize the modified DOM and replace the document content
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    tidy.pprint(doc, bos);
    input.setContent(bos.toByteArray());

    return input;
  }


  /**
   * Enable processing, the filter will parse the document and try to
   * replace absolute by relative links.
   */
  public void enable() {
    this.enabled=true;
  }


  /**
   * Disable processing, the filter will not change the document
   * content.
   */
  public void disable() {
    this.enabled=false;
  }


  /**
   * Is the link processing enabled ?
   *
   * @return true, if the filter processes links, false otherwise
   */
  public boolean isEnabled() {
    return this.enabled;
  }


  /**
   * Rewrite this DOM with relative URLs. Will process the whole DOM
   * recursively. The "href" attribute is localized for "a" and "area"
   * elements, the "src" attribute for "img" and "frame" elements.
   *
   * @param node root node of the DOM to modify
   * @param url base URL of the document itself (for relative addressing)
   * @throws FilterException if node (or any child node) is null
   */
  private void rewriteDOM(Node node, URL url) 
    throws FilterException
  {
    // this should not happen !
    if (node == null) {
      throw new FilterException("Got a null node");
    }

    // ELEMENT ?
    if (node instanceof Element) {
      String name = node.getNodeName();
      if (name.equals("a") || name.equals("area")) {
        localizeAttrib(node, "href", url);
      } else if (name.equals("img") || name.equals("frame")) {
        localizeAttrib(node, "src", url);
      }
    }

    // recursive travel through all children
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      rewriteDOM(children.item(i), url);
    }
  }


  /**
   * Localize a given attribute for a Element. <br />
   * Thanks to Paul Tan for the feedback
   *
   * The attribute is only rewritten if it has a non-empty value that
   * contains a "/"; other values are left untouched.
   *
   * @param node an element node that should be localized
   * @param attribute name of the attribute that should be localized
   * @param context an URL that is the context for relative 
   * addressing (base address)
   */
  private void localizeAttrib(Node node,
                              String attribute,
                              URL context) 
  {
    Element el = (Element)node;
    String oldValue = el.getAttribute(attribute);

    // only localize if the attribute exists
    // (the null check is defensive, in case the DOM implementation
    // returns null for an attribute without a value)
    // only localize if the file is in another directory
    if ((oldValue != null)
        && (! oldValue.equals(""))
        && (oldValue.indexOf("/") != -1)) {
      el.setAttribute(attribute, localizeURL(oldValue, context));
    }
  }



  /**
   * Localize a given URL.
   *
   * Thanks to Paul Tan and Laurent Salinas for the feedback.
   *
   * A path ending in "/" (or an empty path) is completed with
   * "index.html" and a reference ("#anchor") is kept in the result.
   * The query part of the URL is not carried over, because only
   * getPath() and getRef() are used.
   *
   * @param urlStr a String containing a URL, can be relative 
   * (e.g. ../index.html) or absolute ("http://myserver/")
   * @param context an URL that is the context URL for relative URLs
   *
   * @return a String containing an URL that will be relative to the given
   * context if it is a "http" URL and both URLs are on the same host, 
   * otherwise it will simply return urlStr
   */
  private String localizeURL(String urlStr, URL context) {
    URL url;
    try {
      url = new URL(context, urlStr);
    } catch (MalformedURLException e) {
      // not a URL we understand, leave the link as it is
      return urlStr;
    }

    // only localize "http:" links
    if (! url.getProtocol().equalsIgnoreCase("http")) {
      return urlStr;
    }

    // only localize if new URL is on the same host !
    if ((context == null) 
        || (! context.getHost().equalsIgnoreCase(url.getHost()))) {
      return urlStr;
    }

    String path = url.getPath();

    // Already relative
    // this should only happen if the context
    // is null
    if (path.startsWith("../")) {
      return urlStr;
    }

    // "http://host" without a path means the root directory
    if (path.length() == 0) {
      path = "/";
    }

    // implied index.html
    // (must be done before the reference is appended, otherwise
    // "/dir/#top" would not be recognized as a directory)
    if (path.charAt(path.length()-1) == '/') {
      path = path + INDEX_FILE;
    }

    // URL references
    String ref = url.getRef();
    if ((ref != null) && (! ref.equals(""))) {
      path = path + "#" + ref;
    }

    // use the completed path, not url.getPath(), so that the
    // index.html and the reference are not lost
    return localizePath(path, context.getPath());
  }


  /** 
   * Localize a given path. Very dumb, but it works ;-)
   *
   * The result goes up one directory ("../") for every directory
   * in the context path and then appends the given path without
   * its leading "/". Common leading directories are not removed.
   *
   * @param path path to localize
   * @param context reference path
   * @return a path that is given relative
   * 
   * Example: <br />
   * path="/images/test.gif" <br />
   * context="/test/index.html"<br />
   * result="../images/test.gif"
   */
  private String localizePath(String path, String context) {
    // number of directories in the context path; the last token is
    // a file name unless the context ends with "/"
    int depth = new StringTokenizer(context, "/").countTokens();
    if (! context.endsWith("/")) {
      depth--;
    }

    // the result is relative, so drop the leading "/" of the path
    String relative = path.startsWith("/") ? path.substring(1) : path;

    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < depth; i++) {
      sb.append("../");
    }
    sb.append(relative);

    return sb.toString();
  }

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -