linkfilter.java
来自「kaffe Java 解释器语言,源码,Java的子集系统,开放源代码」· Java 代码 · 共 233 行
JAVA
233 行
/* * Copyright (C) 1999-2001 David Brownell * * This file is part of GNU JAXP, a library. * * GNU JAXP is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GNU JAXP is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * As a special exception, if you link this library with other files to * produce an executable, this library does not by itself cause the * resulting executable to be covered by the GNU General Public License. * This exception does not however invalidate any other reasons why the * executable file might be covered by the GNU General Public License. */package gnu.xml.pipeline;import java.io.IOException;import java.net.URL; import java.util.Enumeration;import java.util.Vector;import org.xml.sax.Attributes;import org.xml.sax.Locator;import org.xml.sax.SAXException;/** * Pipeline filter to remember XHTML links found in a document, * so they can later be crawled. Fragments are not counted, and duplicates * are ignored. Callers are responsible for filtering out URLs they aren't * interested in. Events are passed through unmodified. * * <p> Input MUST include a setDocumentLocator() call, as it's used to * resolve relative links in the absence of a "base" element. Input MUST * also include namespace identifiers, since it is the XHTML namespace * identifier which is used to identify the relevant elements. * * <p><em>FIXME:</em> handle xml:base attribute ... in association with * a stack of base URIs. Similarly, recognize/support XLink data. * * @author David Brownell */public class LinkFilter extends EventFilter{ // for storing URIs private Vector vector = new Vector (); // struct for "full" link record (tbd) // these for troubleshooting original source: // original uri // uri as resolved (base, relative, etc) // URI of originating doc // line # // original element + attrs (img src, desc, etc) // XLink model of the link ... for inter-site pairups ? private String baseURI; private boolean siteRestricted = false; // // XXX leverage blacklist info (like robots.txt) // // XXX constructor w/param ... pipeline for sending link data // probably XHTML --> XLink, providing info as sketched above // /** * Constructs a new event filter, which collects links in private data * structure for later enumeration. */ // constructor used by PipelineFactory public LinkFilter () { super.setContentHandler (this); } /** * Constructs a new event filter, which collects links in private data * structure for later enumeration and passes all events, unmodified, * to the next consumer. */ // constructor used by PipelineFactory public LinkFilter (EventConsumer next) { super (next); super.setContentHandler (this); } /** * Returns an enumeration of the links found since the filter * was constructed, or since removeAllLinks() was called. * * @return enumeration of strings. */ public Enumeration getLinks () { return vector.elements (); } /** * Removes records about all links reported to the event * stream, as if the filter were newly created. */ public void removeAllLinks () { vector = new Vector (); } /** * Collects URIs for (X)HTML content from elements which hold them. */ public void startElement ( String uri, String localName, String qName, Attributes atts ) throws SAXException { String link; // Recognize XHTML links. if ("http://www.w3.org/1999/xhtml".equals (uri)) { if ("a".equals (localName) || "base".equals (localName) || "area".equals (localName)) link = atts.getValue ("href"); else if ("iframe".equals (localName) || "frame".equals (localName)) link = atts.getValue ("src"); else if ("blockquote".equals (localName) || "q".equals (localName) || "ins".equals (localName) || "del".equals (localName)) link = atts.getValue ("cite"); else link = null; link = maybeAddLink (link); // "base" modifies designated baseURI if ("base".equals (localName) && link != null) baseURI = link; if ("iframe".equals (localName) || "img".equals (localName)) maybeAddLink (atts.getValue ("longdesc")); } super.startElement (uri, localName, qName, atts); } private String maybeAddLink (String link) { int index; // ignore empty links and fragments inside docs if (link == null) return null; if ((index = link.indexOf ("#")) >= 0) link = link.substring (0, index); if (link.equals ("")) return null; try { // get the real URI URL base = new URL ((baseURI != null) ? baseURI : getDocumentLocator ().getSystemId ()); URL url = new URL (base, link); link = url.toString (); // ignore duplicates if (vector.contains (link)) return link; // other than what "base" does, stick to original site: if (siteRestricted) { // don't switch protocols if (!base.getProtocol ().equals (url.getProtocol ())) return link; // don't switch servers if (base.getHost () != null && !base.getHost ().equals (url.getHost ())) return link; } vector.addElement (link); return link; } catch (IOException e) { // bad URLs we don't want } return null; } /** * Reports an error if no Locator has been made available. */ public void startDocument () throws SAXException { if (getDocumentLocator () == null) throw new SAXException ("no Locator!"); } /** * Forgets about any base URI information that may be recorded. * Applications will often want to call removeAllLinks(), likely * after examining the links which were reported. */ public void endDocument () throws SAXException { baseURI = null; super.endDocument (); }}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?