⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 candidateuri.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
    }

    /**
     * @return True if needs soon but not top scheduling.
     */
    public boolean needsSoonScheduling() {
        return schedulingDirective == MEDIUM;
    }

    /**
     * Tally up the number of transitive (non-simple-link) hops at
     * the end of this CandidateURI's pathFromSeed.
     *
     * In some cases, URIs with greater than zero but less than some
     * threshold such hops are treated specially.
     *
     * <p>The path is scanned from its last character backwards; counting
     * stops at the first {@link Link#NAVLINK_HOP} encountered.
     *
     * <p>TODO: consider moving link-count in here as well, caching
     * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
     *
     * @return Transhop count.
     */
    public int getTransHops() {
        String path = getPathFromSeed();
        int transCount = 0;
        for (int i = path.length() - 1; i >= 0; i--) {
            if (path.charAt(i) == Link.NAVLINK_HOP) {
                break;
            }
            transCount++;
        }
        return transCount;
    }

    /**
     * Given a string containing a URI, then optional whitespace
     * delimited hops-path, via and via-context info, create a CandidateURI
     * instance.
     *
     * <p>Expected layout is <code>uri [hopsPath [via [viaContext]]]</code>.
     * A field that is missing or equal to <code>"-"</code> is treated as
     * absent: the hops-path becomes the empty string, the via and the
     * via-context become <code>null</code>.
     *
     * @param uriHopsViaString String with a URI.
     * @return A CandidateURI made from passed <code>uriHopsViaString</code>.
     * @throws URIException
     */
    public static CandidateURI fromString(String uriHopsViaString)
            throws URIException {
        String[] args = uriHopsViaString.split("\\s+");
        String pathFromSeeds = (args.length > 1 && !args[1].equals("-"))
                ? args[1] : "";
        UURI via = (args.length > 2 && !args[2].equals("-"))
                ? UURIFactory.getInstance(args[2]) : null;
        // The context is the fourth field. Earlier code tested args[3] but
        // then returned args[2] (the via URI string) by mistake.
        CharSequence viaContext = (args.length > 3 && !args[3].equals("-"))
                ? args[3] : null;
        return new CandidateURI(UURIFactory.getInstance(args[0]),
                pathFromSeeds, via, viaContext);
    }

    /**
     * Create a CandidateURI for the given UURI and flag it as a seed.
     *
     * @param uuri UURI to wrap.
     * @return New CandidateURI with its seed flag set to true.
     */
    public static CandidateURI createSeedCandidateURI(UURI uuri) {
        CandidateURI c = new CandidateURI(uuri);
        c.setIsSeed(true);
        return c;
    }

    /**
     * Utility method for creation of CandidateURIs found extracting
     * links from this CrawlURI.
     *
     * <p>The new instance's path-from-seed is this instance's path with the
     * link's hop type appended, its via is this instance's UURI, and it
     * inherits this instance's heritable keys.
     *
     * @param baseUURI BaseUURI for <code>link</code>; only used to resolve
     * the destination when it is not already a UURI.
     * @param link Link to wrap CandidateURI in.
     * @return New candidateURI wrapper around <code>link</code>.
     * @throws URIException
     */
    public CandidateURI createCandidateURI(UURI baseUURI, Link link)
    throws URIException {
        UURI u = (link.getDestination() instanceof UURI)
            ? (UURI) link.getDestination()
            : UURIFactory.getInstance(baseUURI,
                link.getDestination().toString());
        CandidateURI newCaURI = new CandidateURI(u,
                getPathFromSeed() + link.getHopType(),
                getUURI(), link.getContext());
        newCaURI.inheritFrom(this);
        return newCaURI;
    }

    /**
     * Utility method for creation of CandidateURIs found extracting
     * links from this CrawlURI.
     *
     * @param baseUURI BaseUURI for <code>link</code>.
     * @param link Link to wrap CandidateURI in.
     * @param scheduling How new CandidateURI should be scheduled.
     * @param seed True if this CandidateURI is a seed.
     * @return New candidateURI wrapper around <code>link</code>.
     * @throws URIException
     */
    public CandidateURI createCandidateURI(UURI baseUURI, Link link,
        int scheduling, boolean seed)
    throws URIException {
        final CandidateURI caURI = createCandidateURI(baseUURI, link);
        caURI.setSchedulingDirective(scheduling);
        caURI.setIsSeed(seed);
        return caURI;
    }

    /**
     * Inherit (copy) the relevant keys-values from the ancestor.
     *
     * <p>Does nothing when the ancestor has no heritable-keys list.
     *
     * @param ancestor CandidateURI whose heritable entries are copied.
     */
    protected void inheritFrom(CandidateURI ancestor) {
        List heritableKeys = (List) ancestor.getObject(A_HERITABLE_KEYS);
        if (heritableKeys != null) {
            getAList().copyKeysFrom(heritableKeys.iterator(),
                    ancestor.getAList());
        }
    }

    /**
     * Get the token (usually the hostname + port) which indicates
     * what "class" this CrawlURI should be grouped with,
     * for the purposes of ensuring only one item of the
     * class is processed at once, all items of the class
     * are held for a politeness period, etc.
     *
     * @return Token (usually the hostname) which indicates
     * what "class" this CrawlURI should be grouped with.
     */
    public String getClassKey() {
        return classKey;
    }

    /**
     * Set the class key returned by {@link #getClassKey()}.
     *
     * @param key New class key.
     */
    public void setClassKey(String key) {
        classKey = key;
    }

    /**
     * Return the attribute list, creating it on first use.
     *
     * Assumption is that only one thread at a time will ever be accessing
     * a particular CandidateURI.
     *
     * @deprecated Public access will be deprecated. This methods access
     * will change in next release.  Use specialized accessors instead such
     * as {@link #getString(String)}.
     *
     * @return the attribute list.
     */
    public AList getAList() {
        if (this.alist == null) {
            this.alist = new HashtableAList();
        }
        return this.alist;
    }

    /**
     * Drop the attribute list; a later {@link #getAList()} call creates a
     * fresh one.
     */
    protected void clearAList() {
        this.alist = null;
    }

    /** Store <code>value</code> under <code>key</code> in the attribute list. */
    public void putObject(String key, Object value) {
        getAList().putObject(key, value);
    }

    /** @return The object the attribute list holds for <code>key</code>. */
    public Object getObject(String key) {
        return getAList().getObject(key);
    }

    /** @return The String the attribute list holds for <code>key</code>. */
    public String getString(String key) {
        return getAList().getString(key);
    }

    /** Store a String <code>value</code> under <code>key</code>. */
    public void putString(String key, String value) {
        getAList().putString(key, value);
    }

    /** @return The long the attribute list holds for <code>key</code>. */
    public long getLong(String key) {
        return getAList().getLong(key);
    }

    /** Store a long <code>value</code> under <code>key</code>. */
    public void putLong(String key, long value) {
        getAList().putLong(key, value);
    }

    /** @return The int the attribute list holds for <code>key</code>. */
    public int getInt(String key) {
        return getAList().getInt(key);
    }

    /** Store an int <code>value</code> under <code>key</code>. */
    public void putInt(String key, int value) {
        getAList().putInt(key, value);
    }

    /** @return Result of asking the attribute list whether it contains <code>key</code>. */
    public boolean containsKey(String key) {
        return getAList().containsKey(key);
    }

    /** Remove <code>key</code> from the attribute list. */
    public void remove(String key) {
        getAList().remove(key);
    }

    /** @return Iterator over the attribute list's keys. */
    public Iterator keys() {
        return getAList().getKeys();
    }

    /**
     * @return True if this CandidateURI was result of a redirect:
     * i.e. Its parent URI redirected to here, this URI was what was in
     * the 'Location:' or 'Content-Location:' HTTP Header. Determined by
     * checking whether the last hop of pathFromSeed is
     * {@link Link#REFER_HOP}; a null or empty path yields false.
     */
    public boolean isLocation() {
        return this.pathFromSeed != null && this.pathFromSeed.length() > 0
            && this.pathFromSeed.charAt(this.pathFromSeed.length() - 1)
                == Link.REFER_HOP;
    }

    /**
     * Custom serialization writing 'uuri' and 'via' as Strings, rather
     * than the bloated full serialization of their object classes, and
     * an empty alist as 'null'. Shrinks serialized form by 50% or more
     * in short tests.
     *
     * <p>Write order (after the default fields) is: uuri, via, alist.
     * {@link #readObject(ObjectInputStream)} must read in the same order.
     *
     * @param stream
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream stream)
        throws IOException {
        stream.defaultWriteObject();
        stream.writeUTF(uuri.toString());
        stream.writeObject((via == null) ? null : via.getURI());
        stream.writeObject((alist == null) ? null : alist);
    }

    /**
     * Custom deserialization to reconstruct UURI instances from more
     * compact Strings.
     *
     * @param stream
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream)
        throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        uuri = readUuri(stream.readUTF());
        via = readUuri((String) stream.readObject());
        alist = (AList) stream.readObject();
    }

    /**
     * Read a UURI from a String, handling a null or URIException.
     *
     * <p>Three attempts are made in turn: the string as given, the string
     * prefixed with <code>"invalid:"</code>, and finally the bare
     * <code>"invalid:"</code> placeholder.
     *
     * @param u String or null from which to create UURI
     * @return the best UURI instance creatable; null if <code>u</code> is
     * null or if every attempt fails
     */
    protected UURI readUuri(String u) {
        if (u == null) {
            return null;
        }
        try {
            return UURIFactory.getInstance(u);
        } catch (URIException ux) {
            // simply continue to next try
        }
        try {
            // try adding an junk scheme
            return UURIFactory.getInstance("invalid:" + u);
        } catch (URIException ux) {
            ux.printStackTrace();
            // ignored; method continues
        }
        try {
            // return total junk
            return UURIFactory.getInstance("invalid:");
        } catch (URIException e) {
            e.printStackTrace();
            return null;
        }
    }

    //
    // Reporter implementation
    //

    /**
     * @return Single-line report, as produced by
     * <code>ArchiveUtils.singleLineReport</code> for this instance.
     */
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    /**
     * Print the simple class name, URI, hops path and flattened via,
     * separated by single spaces and with no trailing newline.
     *
     * @param w Writer to print to.
     */
    public void singleLineReportTo(PrintWriter w) {
        String className = this.getClass().getName();
        // Strip the package prefix, leaving only the simple class name.
        className = className.substring(className.lastIndexOf(".") + 1);
        w.print(className);
        w.print(" ");
        w.print(getUURI().toString());
        w.print(" ");
        w.print(pathFromSeed);
        w.print(" ");
        w.print(flattenVia());
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineLegend()
     */
    public String singleLineLegend() {
        return "className uri hopsPath viaUri";
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#getReports()
     */
    public String[] getReports() {
        // none but default: empty options
        return new String[] {};
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.lang.String, java.io.Writer)
     */
    public void reportTo(String name, PrintWriter writer) {
        // The report name is not consulted; only the single-line form exists.
        singleLineReportTo(writer);
        writer.print("\n");
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
     */
    public void reportTo(PrintWriter writer) throws IOException {
        reportTo(null, writer);
    }

    /** Make the given key 'heritable', meaning its value will be
     * added to descendant CandidateURIs. Only keys with immutable
     * values should be made heritable -- the value instance may
     * be shared until the AList is serialized/deserialized.
     *
     * @param key to make heritable
     */
    public void makeHeritable(String key) {
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if (heritableKeys == null) {
            heritableKeys = new ArrayList();
            // The list names itself so that it is copied to descendants too.
            heritableKeys.add(A_HERITABLE_KEYS);
            putObject(A_HERITABLE_KEYS, heritableKeys);
        }
        heritableKeys.add(key);
    }

    /** Make the given key non-'heritable', meaning its value will
     * not be added to descendant CandidateURIs. Only meaningful if
     * key was previously made heritable.
     *
     * @param key to make non-heritable
     */
    public void makeNonHeritable(String key) {
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if (heritableKeys == null) {
            return;
        }
        heritableKeys.remove(key);
        if (heritableKeys.size() == 1) {
            // only remaining heritable key is itself; disable completely
            remove(A_HERITABLE_KEYS);
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -