⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawluri.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
    /**     * Set the retained content-digest value (usu. SHA1).      *      * @param digestValue     * @deprecated Use {@link #setContentDigest(String scheme, byte[])}     */    public void setContentDigest(byte[] digestValue) {        setContentDigest("SHA1", digestValue);    }        public void setContentDigest(final String scheme,            final byte [] digestValue) {        this.contentDigest = digestValue;        this.contentDigestScheme = scheme;    }        public String getContentDigestSchemeString() {        if(this.contentDigest==null) {            return null;        }        return this.contentDigestScheme + ":" + getContentDigestString();    }    /**     * Return the retained content-digest value, if any.     *      * @return Digest value.     */    public Object getContentDigest() {        return contentDigest;    }        public String getContentDigestString() {        if(this.contentDigest==null) {            return null;        }        return Base32.encode(this.contentDigest);    }    transient Object holder;    transient Object holderKey;    /**     * Remember a 'holder' to which some enclosing/queueing     * facility has assigned this CrawlURI     * .     * @param obj     */    public void setHolder(Object obj) {        holder=obj;    }    /**     * Return the 'holder' for the convenience of      * an external facility.     *     * @return holder     */    public Object getHolder() {        return holder;    }    /**     * Remember a 'holderKey' which some enclosing/queueing     * facility has assigned this CrawlURI     * .     * @param obj     */    public void setHolderKey(Object obj) {        holderKey=obj;    }    /**     * Return the 'holderKey' for convenience of      * an external facility (Frontier).     *      * @return holderKey      */    public Object getHolderKey() {        return holderKey;    }    /**     * Get the ordinal (serial number) assigned at creation.     *      * @return ordinal     */    public long getOrdinal() {        return ordinal;    }    /** spot for an integer cost to be placed by external facility (frontier).     *  cost is truncated to 8 bits at times, so should not exceed 255 */    int holderCost = UNCALCULATED;    /**     * Return the 'holderCost' for convenience of external facility (frontier)     * @return value of holderCost     */    public int getHolderCost() {        return holderCost;    }    /**     * Remember a 'holderCost' which some enclosing/queueing     * facility has assigned this CrawlURI     * @param cost value to remember     */    public void setHolderCost(int cost) {        holderCost = cost;    }    /**      * All discovered outbound Links (navlinks, embeds, etc.)      * Can either contain Link instances or CandidateURI instances, or both.     * The LinksScoper processor converts Link instances in this collection     * to CandidateURI instances.      */    transient Collection<Object> outLinks = new HashSet<Object>();        /**     * Returns discovered links.  The returned collection might be empty if     * no links were discovered, or if something like LinksScoper promoted     * the links to CandidateURIs.     *      * Elements can be removed from the returned collection, but not added.     * To add a discovered link, use one of the createAndAdd methods or     * {@link #getOutObjects()}.     *      * @return Collection of all discovered outbound Links     */    public Collection<Link> getOutLinks() {        return Transform.subclasses(outLinks, Link.class);    }        /**     * Returns discovered candidate URIs.  The returned collection will be     * emtpy until something like LinksScoper promotes discovered Links     * into CandidateURIs.     *      * Elements can be removed from the returned collection, but not added.     * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or     * {@link #getOutObjects}.     *      * @return  Collection of candidate URIs     */    public Collection<CandidateURI> getOutCandidates() {        return Transform.subclasses(outLinks, CandidateURI.class);    }            /**     * Returns all of the outbound objects.  The returned Collection will     * contain Link instances, or CandidateURI instances, or both.       *      * @return  the collection of Links and/or CandidateURIs     */    public Collection<Object> getOutObjects() {        return outLinks;    }        /**     * Add a discovered Link, unless it would exceed the max number     * to accept. (If so, increment discarded link counter.)      *      * @param link the Link to add     */    public void addOutLink(Link link) {        if (outLinks.size() < MAX_OUTLINKS) {            outLinks.add(link);        } else {            // note & discard            discardedOutlinks++;        }    }        public void clearOutlinks() {        this.outLinks.clear();    }        /**     * Replace current collection of links w/ passed list.     * Used by Scopers adjusting the list of links (removing those     * not in scope and promoting Links to CandidateURIs).     *      * @param a collection of CandidateURIs replacing any previously     *   existing outLinks or outCandidates     */    public void replaceOutlinks(Collection<CandidateURI> links) {        clearOutlinks();        this.outLinks.addAll(links);    }            /**     * @return Count of outlinks.     */    public int outlinksSize() {        return this.outLinks.size();    }    /**     * Convenience method for creating a Link discovered at this URI     * with the given string and context     *      * @param url     *            String to use to create Link     * @param context     *            CharSequence context to use     * @param hopType     * @return Link.     * @throws URIException     *             if Link UURI cannot be constructed     */    public Link createLink(String url, CharSequence context,            char hopType) throws URIException {        return new Link(getUURI(), UURIFactory.getInstance(getUURI(),                url), context, hopType);    }        /**     * Convenience method for creating a Link with the given string and     * context     *      * @param url     *            String to use to create Link     * @param context     *            CharSequence context to use     * @param hopType     * @throws URIException     *             if Link UURI cannot be constructed     */    public void createAndAddLink(String url, CharSequence context,            char hopType) throws URIException {        addOutLink(createLink(url, context, hopType));    }    /**     * Convenience method for creating a Link with the given string and     * context, relative to a previously set base HREF if available (or     * relative to the current CrawlURI if no other base has been set)     *      * @param url String URL to add as destination of link     * @param context String context where link was discovered     * @param hopType char hop-type indicator     * @throws URIException     */    public void createAndAddLinkRelativeToBase(String url,            CharSequence context, char hopType) throws URIException {        addOutLink(new Link(getUURI(), UURIFactory.getInstance(                getBaseURI(), url), context, hopType));    }        /**     * Convenience method for creating a Link with the given string and     * context, relative to this CrawlURI's via UURI if available. (If     * a via is not available, falls back to using      * #createAndAddLinkRelativeToBase.)     *      * @param url String URL to add as destination of link     * @param context String context where link was discovered     * @param hopType char hop-type indicator     * @throws URIException     */    public void createAndAddLinkRelativeToVia(String url,            CharSequence context, char hopType) throws URIException {        if(getVia()!=null) {            addOutLink(new Link(getUURI(), UURIFactory.getInstance(                getVia(), url), context, hopType));        } else {            // if no 'via', fall back to base/self            createAndAddLinkRelativeToBase(url,context,hopType);        }    }        /**     * Set the (HTML) Base URI used for derelativizing internal URIs.      *      * @param baseHref String base href to use     * @throws URIException if supplied string cannot be interpreted as URI     */    public void setBaseURI(String baseHref) throws URIException {        putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));    }          /**     * Get the (HTML) Base URI used for derelativizing internal URIs.      *     * @return UURI base URI previously set      */      public UURI getBaseURI() {        if (!containsKey(A_HTML_BASE)) {            return getUURI();        }        return (UURI)getObject(A_HTML_BASE);    }        /**     * Add the key of alist items you want to persist across     * processings.     * @param key Key to add.     */    public static void addAlistPersistentMember(Object key) {        alistPersistentMember.add(key);    }        /**     * @param key Key to remove.     * @return True if list contained the element.     */    public static boolean removeAlistPersistentMember(Object key) {        return alistPersistentMember.remove(key);    }    /**     * Custom serialization writing an empty 'outLinks' as null. Estimated     * to save ~20 bytes in serialized form.      *      * @param stream     * @throws IOException     */    private void writeObject(ObjectOutputStream stream) throws IOException {        stream.defaultWriteObject();        stream.writeObject((outLinks.isEmpty()) ? null : outLinks);    }    /**     * Custom deserialization recreating empty HashSet from null in 'outLinks'     * slot.      *      * @param stream     * @throws IOException     * @throws ClassNotFoundException     */    private void readObject(ObjectInputStream stream) throws IOException,            ClassNotFoundException {        stream.defaultReadObject();        @SuppressWarnings("unchecked")        HashSet<Object> ol = (HashSet<Object>) stream.readObject();        outLinks = (ol == null) ? new HashSet<Object>() : ol;    }    public long getFetchDuration() {        if(! containsKey(A_FETCH_COMPLETED_TIME)) {            return -1;        }                long completedTime = getLong(A_FETCH_COMPLETED_TIME);        long beganTime = getLong(A_FETCH_BEGAN_TIME);        return completedTime - beganTime;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -