📄 linktransformer.java
字号:
/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University * * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx;import java.io.*;import java.net.URL;import java.net.MalformedURLException;import java.util.Hashtable;/** * Transformer that remaps URLs in links. * <P> * The default LinkTransformer simply converts all links * to absolute URLs. Other common effects are easy to * achieve: * <UL> * <LI>To make all links relative to a base URL, use * setBase() to set a base URL. * <LI>To replace certain URLs with different ones, * use map() to set up the mappings. * </UL> * The default LinkTransformer strips out <BASE> * elements. Instead, it can output a <BASE> * element with a user-specified URL. Use setBase() to set * the URL and setEmitBaseElement() to indicate that it * should be emitted. */public class LinkTransformer extends HTMLTransformer { protected Hashtable map; protected URL base = null; boolean emitBaseElement = false; boolean needToEmitBase = false; /** * Make a LinkTransformer writing to a file. * @param filename Filename to write to */ public LinkTransformer (String filename) throws IOException { super (filename); } /** * Make a LinkTransformer that writes pages to a * file. * @param filename Name of file to receive HTML output * @param seekable True if file should be opened for random access */ public LinkTransformer (String filename, boolean seekable) throws IOException { super (filename, seekable); } /** * Make a LinkTransformer writing to a stream. * @param out stream to write to */ public LinkTransformer (OutputStream out) { super (out); }//#ifdef JDK1.1 public LinkTransformer (Writer writer) { super (writer); }//#endif JDK1.1 /** * Make a LinkTransformer writing to another HTMLTransformer * @param next next transformer in filter chain */ public LinkTransformer (HTMLTransformer next) { super (next); } /** * Get the base URL used by the LinkTransformer. * A transformed link's URL is written out relative * to this URL. For instance, if the base URL is * http://www.yahoo.com/Entertainment/, then a link * URL http://www.yahoo.com/News/Current/ * would be written out as ../News/Current/. * @return base URL, or null if no base URL is set. Default is null. */ public URL getBase () { return base; } /** * Set the base URL used by the LinkTransformer. * A transformed link's URL is written out relative * to this URL. For instance, if the base URL is * http://www.yahoo.com/Entertainment/, then a link * URL http://www.yahoo.com/News/Current/ * would be written out as ../News/Current/. * @param base base URL, or null if no base URL should be used. */ public synchronized void setBase (URL base) { this.base = base; } /** * Test whether the LinkTransformer should emit a * <BASE> element pointing to the base URL. * @return true if a <BASE> element should be * emitted with each page. */ public boolean getEmitBaseElement () { return emitBaseElement; } /** * Set whether the LinkTransformer should emit a * <BASE> element pointing to the base URL. * @param emitBase true if a <BASE> element should be * emitted with each page. */ public synchronized void setEmitBaseElement (boolean emitBase) { emitBaseElement = emitBase; } /** * Look up the href for a URL, taking any mapping * into account. * @param base base URL (or null if an absolute URL is desired) * @param url URL of interest * @return relative href for url from base */ public String lookup (URL base, URL url) { if (map != null) { Object obj = map.get (url); if (obj instanceof URL) return base != null ? Link.relativeTo (base, (URL)obj) : obj.toString (); else if (obj instanceof String) return base != null ? Link.relativeTo (base, (String)obj) : obj.toString (); } return base != null ? Link.relativeTo (base, url) : url.toString (); } /** * Map a URL to an href. For example, Concatenator * uses this call to map page URLs to their corresponding * anchors in the concatenation. * @param url URL of interest * @param href href which should be returned by lookup (null, url) */ public synchronized void map (URL url, String href) { if (map == null) map = new Hashtable (); map.put (url, href); } /** * Map a URL to a new URL. For example, Mirror * uses this call to map remote URLs to their corresponding * local URLs. * @param url URL of interest * @param newURL URL which should be returned by lookup (null, url) */ public synchronized void map (URL url, URL newURL) { if (map == null) map = new Hashtable (); map.put (url, newURL); } /** * Test whether a URL is mapped. * @param url URL of interest * @return true if map () was called to remap url */ public boolean isMapped (URL url) { return map != null && map.containsKey (url); } /** * Write a page through the transformer. If * getEmitBaseElement() is true and getBase() is * non-null, then the transformer * outputs a <BASE> element either inside the * page's <HEAD> element (if present) or before * the first tag that belongs in <BODY>. * @param page Page to write */ public synchronized void writePage (Page page) throws IOException { needToEmitBase = emitBaseElement && base != null; super.writePage (page); needToEmitBase = false; } /** * Handle an element written through the transformer. * Remaps attributes that contain URLs. * @param elem Element to transform */ protected void handleElement (Element elem) throws IOException { Tag tag = elem.getStartTag (); String tagName = elem.getTagName (); if (needToEmitBase && tag.isBodyTag ()) { emit ("<BASE HREF=\"" + base.toString () + "\">"); needToEmitBase = false; } if (elem instanceof Link) handleLink ((Link)elem); else if (tagName == Tag.BASE) handleBase (elem); else if (needToEmitBase && tagName == Tag.HEAD) { // put BASE at the end of HEAD, if we don't find it earlier emit (elem.getStartTag ()); transformContents (elem); if (needToEmitBase) { emit ("<BASE HREF=\"" + base.toString () + "\">"); needToEmitBase = false; } if (elem.getEndTag () != null) emit (elem.getEndTag ()); } else super.handleElement (elem); } /** * Handle a Link's transformation. * Default implementation replaces the link's URL * with lookup(URL). * @param link Link to transform */ protected void handleLink (Link link) throws IOException { emit ( link.replaceHref (lookup (base, link.getURL())) ); transformContents (link); if (link.getEndTag () != null) emit (link.getEndTag ()); } /** * Handle the BASE element. * Default implementation removes if if EmitBaseElement * is false, or changes its URL to Base if EmitBaseElement * is true. * @param elem BASE element to transform */ protected void handleBase (Element elem) throws IOException { Tag tag = elem.getStartTag (); if (needToEmitBase) { emit (tag.replaceHTMLAttribute ("href", base.toString())); needToEmitBase = false; } else if (tag.hasHTMLAttribute ("href") && tag.countHTMLAttributes () > 1) // tag has other attributes that we want to preserve emit (tag.removeHTMLAttribute ("href")); // otherwise skip the BASE element } /* * Testing * public static void main (String[] args) throws Exception { OutputStream out = (args.length >= 2) ? (OutputStream)new java.io.FileOutputStream (args[1]) : (OutputStream)System.out; HTMLTransformer unparser = new LinkTransformer (out); Link link = new Link (args[0]); Page page = new Page (link); unparser.write (page); unparser.close (); } */}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -