📄 candidateuri.java
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * CandidateURI.java * Created on Sep 30, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CandidateURI.java,v 1.49 2006/08/15 00:25:02 paul_jack Exp $ */package org.archive.crawler.datamodel;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.PrintWriter;import java.io.Serializable;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.apache.commons.httpclient.URIException;import org.archive.crawler.extractor.Link;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.ArchiveUtils;import org.archive.util.Reporter;import st.ata.util.AList;import st.ata.util.HashtableAList;/** * A URI, discovered or passed-in, that may be scheduled. * When scheduled, a CandidateURI becomes a {@link CrawlURI} * made with the data contained herein. A CandidateURI * contains just the fields necessary to perform quick in-scope analysis. * * <p>Has a flexible attribute list that will be promoted into * any {@link CrawlURI} created from this CandidateURI. Use it * to add custom data or state needed later doing custom processing. 
* See accessors/setters {@link #putString(String, String)}, * {@link #getString(String)}, etc. * * @author Gordon Mohr */public class CandidateURIimplements Serializable, Reporter, CoreAttributeConstants { private static final long serialVersionUID = -7152937921526560388L; /** Highest scheduling priority. * Before any others of its class. */ public static final int HIGHEST = 0; /** High scheduling priority. * After any {@link #HIGHEST}. */ public static final int HIGH = 1; /** Medium priority. * After any {@link #HIGH}. */ public static final int MEDIUM = 2; /** Normal/low priority. * Whenever/end of queue. */ public static final int NORMAL = 3; private int schedulingDirective = NORMAL; /** * Usuable URI under consideration. Transient to allow * more efficient custom serialization */ private transient UURI uuri; /** Seed status */ private boolean isSeed = false; private boolean forceRevisit = false; // even if already visited /** String of letters indicating how this URI was reached from a seed. * <pre> * P precondition * R redirection * E embedded (as frame, src, link, codebase, etc.) * X speculative embed (as from javascript, some alternate-format extractors * L link</pre> * For example LLLE (an embedded image on a page 3 links from seed). */ private String pathFromSeed; /** * Where this URI was (presently) discovered. . Transient to allow * more efficient custom serialization */ private transient UURI via; /** * Context of URI's discovery, as per the 'context' in Link */ private CharSequence viaContext; /** * Flexible dynamic attributes list. * <p> * The attribute list is a flexible map of key/value pairs for storing * status of this URI for use by other processors. By convention the * attribute list is keyed by constants found in the * {@link CoreAttributeConstants} interface. Use this list to carry * data or state produced by custom processors rather change the * classes {@link CrawlURI} or this class, CandidateURI. 
* * Transient to allow more efficient custom serialization. */ private transient AList alist; /** * Cache of this candidate uuri as a string. * * Profiling shows us spending about 1-2% of total elapsed time in * toString. */ private String cachedCandidateURIString = null; /** * Frontier/Scheduler lifecycle info. * This is an identifier set by the Frontier for its * purposes. Usually its the name of the Frontier queue * this URI gets queued to. Values can be host + port * or IP, etc. */ private String classKey; /** * Constructor. * Protected access to block access to default constructor. */ protected CandidateURI () { super(); } /** * @param u uuri instance this CandidateURI wraps. */ public CandidateURI(UURI u) { this.uuri = u; } /** * @param u uuri instance this CandidateURI wraps. * @param pathFromSeed * @param via * @param viaContext */ public CandidateURI(UURI u, String pathFromSeed, UURI via, CharSequence viaContext) { this.uuri = u; this.pathFromSeed = pathFromSeed; this.via = via; this.viaContext = viaContext; } /** * Set the <tt>isSeed</tt> attribute of this URI. * @param b Is this URI a seed, true or false. */ public void setIsSeed(boolean b) { this.isSeed = b; if (this.isSeed) { if(pathFromSeed==null) { this.pathFromSeed = ""; }// seeds created on redirect must have a via to be recognized; don't clear// setVia(null); } } /** * @return UURI */ public UURI getUURI() { return this.uuri; } /** * @return Whether seeded. */ public boolean isSeed() { return this.isSeed; } /** * @return path (hop-types) from seed */ public String getPathFromSeed() { return this.pathFromSeed; } /** * @return URI via which this one was discovered */ public UURI getVia() { return this.via; } /** * @return CharSequence context in which this one was discovered */ public CharSequence getViaContext() { return this.viaContext; } /** * @param string */ protected void setPathFromSeed(String string) { pathFromSeed = string; } /** * Called when making a copy of another CandidateURI. 
* @param alist AList to use. */ protected void setAList(AList alist) { this.alist = alist; } public void setVia(UURI via) { this.via = via; } /** * @return This candidate URI as a string wrapped with 'CandidateURI(' + * ')'. */ public synchronized String getCandidateURIString() { if (this.cachedCandidateURIString == null) { this.cachedCandidateURIString = "CandidateURI(" + toString() + ")"; } return this.cachedCandidateURIString; } /** * Method returns string version of this URI's referral URI. * @return String version of referral URI */ public String flattenVia() { return (via == null)? "": via.toString(); } /** * @return The UURI this CandidateURI wraps as a string * (We used return what {@link #getCandidateURIString()} * returns on a toString -- use that method if you still need * this functionality). * @see #getCandidateURIString() */ public String toString() { return getURIString(); } /** * @return URI String * @deprecated Use {@link #toString()}. */ public String getURIString() { return getUURI().toString(); } /** * Compares the domain of this CandidateURI with that of another * CandidateURI * * @param other The other CandidateURI * * @return True if both are in the same domain, false otherwise. * @throws URIException */ public boolean sameDomainAs(CandidateURI other) throws URIException { String domain = getUURI().getHost(); if (domain == null) { return false; } while(domain.lastIndexOf('.') > domain.indexOf('.')) { // While has more than one dot, lop off first segment domain = domain.substring(domain.indexOf('.') + 1); } if(other.getUURI().getHost() == null) { return false; } return other.getUURI().getHost().endsWith(domain); } /** * If this method returns true, this URI should be fetched even though * it already has been crawled. This also implies * that this URI will be scheduled for crawl before any other waiting * URIs for the same host. * * This value is used to refetch any expired robots.txt or dns-lookups. 
*
     * @return true if crawling of this URI should be forced
     */
    public boolean forceFetch() {
        return this.forceRevisit;
    }

    /**
     * Signals whether this URI should be fetched even though it has
     * already been crawled. Setting this to true also implies that this
     * URI will be scheduled for crawl before any other waiting URIs for
     * the same host.
     *
     * This value is used to refetch any expired robots.txt or dns-lookups.
     *
     * @param b set to true to enforce the crawling of this URI
     */
    public void setForceFetch(boolean b) {
        this.forceRevisit = b;
    }

    /**
     * @return Returns the schedulingDirective.
     */
    public int getSchedulingDirective() {
        return this.schedulingDirective;
    }

    /**
     * @param schedulingDirective The schedulingDirective to set.
     */
    public void setSchedulingDirective(int schedulingDirective) {
        this.schedulingDirective = schedulingDirective;
    }

    /** * @return True if needs immediate scheduling. */ public boolean needsImmediateScheduling() { return schedulingDirective == HIGH;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -