📄 crawluri.java
字号:
* @return True if there are avatars attached to this instance. */ public boolean hasCredentialAvatars() { return getCredentialAvatars() != null && getCredentialAvatars().size() > 0; } /** * Add an avatar. * * We do lazy instantiation. * * @param ca Credential avatar to add to set of avatars. */ public void addCredentialAvatar(CredentialAvatar ca) { Set<CredentialAvatar> avatars = getCredentialAvatars(); if (avatars == null) { avatars = new HashSet<CredentialAvatar>(); setCredentialAvatars(avatars); } avatars.add(ca); } /** * Remove all credential avatars from this crawl uri. */ public void removeCredentialAvatars() { if (hasCredentialAvatars()) { remove(A_CREDENTIAL_AVATARS_KEY); } } /** * Remove all credential avatars from this crawl uri. * @param ca Avatar to remove. * @return True if we removed passed parameter. False if no operation * performed. */ public boolean removeCredentialAvatar(CredentialAvatar ca) { boolean result = false; Set avatars = getCredentialAvatars(); if (avatars != null && avatars.size() > 0) { result = avatars.remove(ca); } return result; } /** * Ask this URI if it was a success or not. * * Only makes sense to call this method after execution of * HttpMethod#execute. Regard any status larger then 0 as success * except for below caveat regarding 401s. Use {@link #is2XXSuccess()} if * looking for a status code in the 200 range. * * <p>401s caveat: If any rfc2617 credential data present and we got a 401 * assume it got loaded in FetchHTTP on expectation that we're to go around * the processing chain again. Report this condition as a failure so we * get another crack at the processing chain only this time we'll be making * use of the loaded credential data. * * @return True if ths URI has been successfully processed. 
* @see #is2XXSuccess() */ public boolean isSuccess() { boolean result = false; int statusCode = this.fetchStatus; if (statusCode == HttpStatus.SC_UNAUTHORIZED && hasRfc2617CredentialAvatar()) { result = false; } else { result = (statusCode > 0); } return result; } /** * @return True if status code is in the 2xx range. * @see #isSuccess() */ public boolean is2XXSuccess() { return this.fetchStatus >= 200 && this.fetchStatus < 300; } /** * @return True if we have an rfc2617 payload. */ public boolean hasRfc2617CredentialAvatar() { boolean result = false; Set avatars = getCredentialAvatars(); if (avatars != null && avatars.size() > 0) { for (Iterator i = avatars.iterator(); i.hasNext();) { if (((CredentialAvatar)i.next()). match(Rfc2617Credential.class)) { result = true; break; } } } return result; } /** * Set whether this URI should be fetched by sending a HTTP POST request. * Else a HTTP GET request will be used. * * @param b Set whether this curi is to be POST'd. Else its to be GET'd. */ public void setPost(boolean b) { this.post = b; } /** * Returns true if this URI should be fetched by sending a HTTP POST request. * * * TODO: Compound this and {@link #isHttpTransaction()} method so that there * is one place to go to find out if get http, post http, ftp, dns. * * @return Returns is this CrawlURI instance is to be posted. */ public boolean isPost() { return this.post; } /** * Set the retained content-digest value (usu. SHA1). 
* * @param digestValue * @deprecated Use {@link #setContentDigest(String scheme, byte[])} */ public void setContentDigest(byte[] digestValue) { setContentDigest("SHA1", digestValue); } public void setContentDigest(final String scheme, final byte [] digestValue) { this.contentDigest = digestValue; this.contentDigestScheme = scheme; } public String getContentDigestSchemeString() { if(this.contentDigest==null) { return null; } return this.contentDigestScheme + ":" + getContentDigestString(); } /** * Return the retained content-digest value, if any. * * @return Digest value. */ public Object getContentDigest() { return contentDigest; } public String getContentDigestString() { if(this.contentDigest==null) { return null; } return Base32.encode(this.contentDigest); } transient Object holder; transient Object holderKey; /** * Remember a 'holder' to which some enclosing/queueing * facility has assigned this CrawlURI * . * @param obj */ public void setHolder(Object obj) { holder=obj; } /** * Return the 'holder' for the convenience of * an external facility. * * @return holder */ public Object getHolder() { return holder; } /** * Remember a 'holderKey' which some enclosing/queueing * facility has assigned this CrawlURI * . * @param obj */ public void setHolderKey(Object obj) { holderKey=obj; } /** * Return the 'holderKey' for convenience of * an external facility (Frontier). * * @return holderKey */ public Object getHolderKey() { return holderKey; } /** * Get the ordinal (serial number) assigned at creation. * * @return ordinal */ public long getOrdinal() { return ordinal; } /** spot for an integer cost to be placed by external facility (frontier). 
* Cost is truncated to 8 bits at times, so should not exceed 255.
     */
    int holderCost = UNCALCULATED;

    /**
     * Return the 'holderCost' for convenience of external facility (frontier).
     *
     * @return value of holderCost
     */
    public int getHolderCost() {
        return this.holderCost;
    }

    /**
     * Remember a 'holderCost' which some enclosing/queueing
     * facility has assigned this CrawlURI.
     *
     * @param cost value to remember
     */
    public void setHolderCost(int cost) {
        this.holderCost = cost;
    }

    /**
     * All discovered outbound Links (navlinks, embeds, etc.)
     * Can either contain Link instances or CandidateURI instances, or both.
     * The LinksScoper processor converts Link instances in this collection
     * to CandidateURI instances.
     */
    transient Collection<Object> outLinks = new HashSet<Object>();

    /**
     * Returns discovered links. The returned collection might be empty if
     * no links were discovered, or if something like LinksScoper promoted
     * the links to CandidateURIs.
     *
     * Elements can be removed from the returned collection, but not added.
     * To add a discovered link, use one of the createAndAdd methods or
     * {@link #getOutObjects()}.
     *
     * @return Collection of all discovered outbound Links
     */
    public Collection<Link> getOutLinks() {
        final Collection<Link> linksOnly =
                Transform.subclasses(outLinks, Link.class);
        return linksOnly;
    }

    /**
     * Returns discovered candidate URIs. The returned collection will be
     * empty until something like LinksScoper promotes discovered Links
     * into CandidateURIs.
     *
     * Elements can be removed from the returned collection, but not added.
     * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
     * {@link #getOutObjects}.
     *
     * @return Collection of candidate URIs
     */
    public Collection<CandidateURI> getOutCandidates() {
        final Collection<CandidateURI> candidatesOnly =
                Transform.subclasses(outLinks, CandidateURI.class);
        return candidatesOnly;
    }

    /**
     * Returns all of the outbound objects. The returned Collection will
     * contain Link instances, or CandidateURI instances, or both.
* * @return the collection of Links and/or CandidateURIs */ public Collection<Object> getOutObjects() { return outLinks; } /** * Add a discovered Link, unless it would exceed the max number * to accept. (If so, increment discarded link counter.) * * @param link the Link to add */ public void addOutLink(Link link) { if (outLinks.size() < MAX_OUTLINKS) { outLinks.add(link); } else { // note & discard discardedOutlinks++; } } public void clearOutlinks() { this.outLinks.clear(); } /** * Replace current collection of links w/ passed list. * Used by Scopers adjusting the list of links (removing those * not in scope and promoting Links to CandidateURIs). * * @param a collection of CandidateURIs replacing any previously * existing outLinks or outCandidates */ public void replaceOutlinks(Collection<CandidateURI> links) { clearOutlinks(); this.outLinks.addAll(links); } /** * @return Count of outlinks. */ public int outlinksSize() { return this.outLinks.size(); } /** * Convenience method for creating a Link discovered at this URI * with the given string and context * * @param url * String to use to create Link * @param context * CharSequence context to use * @param hopType * @return Link. 
* @throws URIException
     *             if Link UURI cannot be constructed
     */
    public Link createLink(String url, CharSequence context,
            char hopType) throws URIException {
        final Link discovered = new Link(
                getUURI(),
                UURIFactory.getInstance(getUURI(), url),
                context,
                hopType);
        return discovered;
    }

    /**
     * Convenience method for creating a Link with the given string and
     * context, and adding it via {@link #addOutLink(Link)}.
     *
     * @param url
     *            String to use to create Link
     * @param context
     *            CharSequence context to use
     * @param hopType
     * @throws URIException
     *             if Link UURI cannot be constructed
     */
    public void createAndAddLink(String url, CharSequence context,
            char hopType) throws URIException {
        final Link created = createLink(url, context, hopType);
        addOutLink(created);
    }

    /**
     * Convenience method for creating a Link with the given string and
     * context, relative to a previously set base HREF if available (or
     * relative to the current CrawlURI if no other base has been set)
     *
     * @param url String URL to add as destination of link
     * @param context String context where link was discovered
     * @param hopType char hop-type indicator
     * @throws URIException
     */
    public void createAndAddLinkRelativeToBase(String url,
            CharSequence context, char hopType) throws URIException {
        final Link baseRelative = new Link(
                getUURI(),
                UURIFactory.getInstance(getBaseURI(), url),
                context,
                hopType);
        addOutLink(baseRelative);
    }

    /**
     * Convenience method for creating a Link with the given string and
     * context, relative to this CrawlURI's via UURI if available. (If
     * a via is not available, falls back to using
     * #createAndAddLinkRelativeToBase.)
* * @param url String URL to add as destination of link * @param context String context where link was discovered * @param hopType char hop-type indicator * @throws URIException */ public void createAndAddLinkRelativeToVia(String url, CharSequence context, char hopType) throws URIException { if(getVia()!=null) { addOutLink(new Link(getUURI(), UURIFactory.getInstance( getVia(), url), context, hopType)); } else { // if no 'via', fall back to base/self createAndAddLinkRelativeToBase(url,context,hopType); } } /** * Set the (HTML) Base URI used for derelativizing internal URIs. * * @param baseHref String base href to use * @throws URIException if supplied string cannot be interpreted as URI */ public void setBaseURI(String baseHref) throws URIException { putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref)); } /** * Get the (HTML) Base URI used for derelativizing internal URIs. * * @return UURI base URI previously set */ public UURI getBaseURI() { if (!containsKey(A_HTML_BASE)) { return getUURI(); } return (UURI)getObject(A_HTML_BASE); } /** * Add the key of alist items you want to persist across * processings. * @param key Key to add. */ public static void addAlistPersistentMember(Object key) { alistPersistentMember.add(key); } /** * @param key Key to remove. * @return True if list contained the element. */ public static boolean removeAlistPersistentMember(Object key) { return alistPersistentMember.remove(key); } /** * Custom serialization writing an empty 'outLinks' as null. Estimated * to save ~20 bytes in serialized form. * * @param stream * @throws IOException */ private void writeObject(ObjectOutputStream stream) throws IOException { stream.defaultWriteObject(); stream.writeObject((outLinks.isEmpty()) ? null : outLinks); } /** * Custom deserialization recreating empty HashSet from null in 'outLinks' * slot. 
*
     * @param stream
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        @SuppressWarnings("unchecked")
        final HashSet<Object> restored = (HashSet<Object>) stream.readObject();
        if (restored == null) {
            // A null slot stands for an empty set of outlinks.
            outLinks = new HashSet<Object>();
        } else {
            outLinks = restored;
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -