📄 candidateuri.java
字号:
} // closes a method whose beginning lies before this excerpt

    /**
     * @return True if needs soon but not top scheduling.
     */
    public boolean needsSoonScheduling() {
        return schedulingDirective == MEDIUM;
    }

    /**
     * Tally up the number of transitive (non-simple-link) hops at
     * the end of this CandidateURI's pathFromSeed.
     *
     * In some cases, URIs with greater than zero but less than some
     * threshold such hops are treated specially.
     *
     * <p>The path is scanned backwards from its last character; counting
     * stops at the first {@code Link.NAVLINK_HOP} encountered. A null
     * path yields zero.
     *
     * <p>TODO: consider moving link-count in here as well, caching
     * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
     *
     * @return Transhop count.
     */
    public int getTransHops() {
        String path = getPathFromSeed();
        if (path == null) {
            // isLocation() guards against a null pathFromSeed; presumably
            // getPathFromSeed() can hand back null as well -- avoid the NPE.
            return 0;
        }
        int transCount = 0;
        for (int i = path.length() - 1; i >= 0; i--) {
            if (path.charAt(i) == Link.NAVLINK_HOP) {
                break;
            }
            transCount++;
        }
        return transCount;
    }

    /**
     * Given a string containing a URI, then optional whitespace
     * delimited hops-path, via and via-context info, create a
     * CandidateURI instance. A field given as "-" is treated as absent.
     *
     * @param uriHopsViaString String with a URI, optionally followed by
     * hops-path, via URI and via context, separated by whitespace.
     * @return A CandidateURI made from passed {@code uriHopsViaString}.
     * @throws URIException
     */
    public static CandidateURI fromString(String uriHopsViaString)
            throws URIException {
        // Trim first: split() keeps a leading empty token when the input
        // starts with whitespace, which would shift every field by one.
        String[] args = uriHopsViaString.trim().split("\\s+");
        String pathFromSeeds = (args.length > 1 && !args[1].equals("-"))
            ? args[1]
            : "";
        UURI via = (args.length > 2 && !args[2].equals("-"))
            ? UURIFactory.getInstance(args[2])
            : null;
        // The context is the fourth field (index 3); the previous code
        // tested args[3] but then returned args[2], i.e. the via URI.
        CharSequence viaContext = (args.length > 3 && !args[3].equals("-"))
            ? args[3]
            : null;
        return new CandidateURI(UURIFactory.getInstance(args[0]),
            pathFromSeeds, via, viaContext);
    }

    /**
     * Create a CandidateURI for the passed UURI and flag it as a seed.
     *
     * @param uuri UURI to wrap.
     * @return New CandidateURI with its seed flag set.
     */
    public static CandidateURI createSeedCandidateURI(UURI uuri) {
        CandidateURI c = new CandidateURI(uuri);
        c.setIsSeed(true);
        return c;
    }

    /**
     * Utility method for creation of CandidateURIs found extracting
     * links from this CrawlURI.
     *
     * <p>The new instance's path is this instance's path-from-seed with
     * the link's hop type appended, its via is this instance's UURI, and
     * it inherits this instance's heritable keys.
     *
     * @param baseUURI BaseUURI for {@code link}; only used when the link
     * destination is not already a UURI.
     * @param link Link to wrap CandidateURI in.
     * @return New candidateURI wrapper around {@code link}.
     * @throws URIException
     */
    public CandidateURI createCandidateURI(UURI baseUURI, Link link)
            throws URIException {
        UURI u = (link.getDestination() instanceof UURI)
            ? (UURI) link.getDestination()
            : UURIFactory.getInstance(baseUURI,
                link.getDestination().toString());
        // NOTE(review): if getPathFromSeed() can return null, the string
        // concatenation below yields a path starting with "null" -- confirm.
        CandidateURI newCaURI = new CandidateURI(u,
            getPathFromSeed() + link.getHopType(),
            getUURI(), link.getContext());
        newCaURI.inheritFrom(this);
        return newCaURI;
    }

    /**
     * Utility method for creation of CandidateURIs found extracting
     * links from this CrawlURI.
     *
     * @param baseUURI BaseUURI for {@code link}.
     * @param link Link to wrap CandidateURI in.
     * @param scheduling How new CandidateURI should be scheduled.
     * @param seed True if this CandidateURI is a seed.
     * @return New candidateURI wrapper around {@code link}.
     * @throws URIException
     */
    public CandidateURI createCandidateURI(UURI baseUURI, Link link,
            int scheduling, boolean seed)
            throws URIException {
        final CandidateURI caURI = createCandidateURI(baseUURI, link);
        caURI.setSchedulingDirective(scheduling);
        caURI.setIsSeed(seed);
        return caURI;
    }

    /**
     * Inherit (copy) the relevant keys-values from the ancestor.
     * Does nothing when the ancestor has no heritable-keys list.
     *
     * @param ancestor CandidateURI to copy heritable entries from.
     */
    protected void inheritFrom(CandidateURI ancestor) {
        List heritableKeys = (List) ancestor.getObject(A_HERITABLE_KEYS);
        if (heritableKeys != null) {
            getAList().copyKeysFrom(heritableKeys.iterator(),
                ancestor.getAList());
        }
    }

    /**
     * Get the token (usually the hostname + port) which indicates
     * what "class" this CrawlURI should be grouped with,
     * for the purposes of ensuring only one item of the
     * class is processed at once, all items of the class
     * are held for a politeness period, etc.
     *
     * @return Token (usually the hostname) which indicates
     * what "class" this CrawlURI should be grouped with.
     */
    public String getClassKey() {
        return classKey;
    }

    /**
     * @param key Class key to assign to this CandidateURI.
     */
    public void setClassKey(String key) {
        classKey = key;
    }

    /**
     * Return the attribute list, creating it on first access.
     *
     * Assumption is that only one thread at a time will ever be accessing
     * a particular CandidateURI.
     *
     * @deprecated Public access will be deprecated. This method's access
     * will change in next release. Use specialized accessors instead such
     * as {@link #getString(String)}.
     *
     * @return the attribute list.
     */
    public AList getAList() {
        if (this.alist == null) {
            this.alist = new HashtableAList();
        }
        return this.alist;
    }

    /**
     * Drop the attribute list; the next {@link #getAList()} call
     * creates a fresh one.
     */
    protected void clearAList() {
        this.alist = null;
    }

    // Typed convenience accessors; each delegates to the attribute list.

    public void putObject(String key, Object value) {
        getAList().putObject(key, value);
    }

    public Object getObject(String key) {
        return getAList().getObject(key);
    }

    public String getString(String key) {
        return getAList().getString(key);
    }

    public void putString(String key, String value) {
        getAList().putString(key, value);
    }

    public long getLong(String key) {
        return getAList().getLong(key);
    }

    public void putLong(String key, long value) {
        getAList().putLong(key, value);
    }

    public int getInt(String key) {
        return getAList().getInt(key);
    }

    public void putInt(String key, int value) {
        getAList().putInt(key, value);
    }

    public boolean containsKey(String key) {
        return getAList().containsKey(key);
    }

    public void remove(String key) {
        getAList().remove(key);
    }

    public Iterator keys() {
        return getAList().getKeys();
    }

    /**
     * @return True if this CandidateURI was result of a redirect:
     * i.e. Its parent URI redirected to here, this URI was what was in
     * the 'Location:' or 'Content-Location:' HTTP Header. Determined by
     * the last hop of pathFromSeed being {@code Link.REFER_HOP}.
     */
    public boolean isLocation() {
        if (this.pathFromSeed == null || this.pathFromSeed.length() == 0) {
            return false;
        }
        char lastHop = this.pathFromSeed.charAt(this.pathFromSeed.length() - 1);
        return lastHop == Link.REFER_HOP;
    }

    /**
     * Custom serialization writing 'uuri' and 'via' as Strings, rather
     * than the bloated full serialization of their object classes.
     * The attribute list is written as-is, which is 'null' when it was
     * never created. Shrinks serialized form by 50% or more
     * in short tests.
     *
     * @param stream Stream to write to.
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream stream)
            throws IOException {
        stream.defaultWriteObject();
        stream.writeUTF(uuri.toString());
        stream.writeObject((via == null) ? null : via.getURI());
        stream.writeObject(alist);
    }

    /**
     * Custom deserialization to reconstruct UURI instances from more
     * compact Strings. Reads the fields in the same order
     * {@code writeObject} emits them: uuri, via, alist.
     *
     * @param stream Stream to read from.
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        uuri = readUuri(stream.readUTF());
        via = readUuri((String) stream.readObject());
        alist = (AList) stream.readObject();
    }

    /**
     * Read a UURI from a String, handling a null or URIException.
     *
     * <p>Falls back in stages: the string as given, then the string with
     * an "invalid:" scheme prepended, then the bare "invalid:" URI.
     *
     * @param u String or null from which to create UURI
     * @return the best UURI instance creatable; null if {@code u} is null
     * or every fallback fails.
     */
    protected UURI readUuri(String u) {
        if (u == null) {
            return null;
        }
        try {
            return UURIFactory.getInstance(u);
        } catch (URIException ux) {
            // simply continue to next try
        }
        try {
            // try adding a junk scheme
            return UURIFactory.getInstance("invalid:" + u);
        } catch (URIException ux) {
            ux.printStackTrace();
            // ignored; method continues
        }
        try {
            // return total junk
            return UURIFactory.getInstance("invalid:");
        } catch (URIException e) {
            e.printStackTrace();
            return null;
        }
    }

    //
    // Reporter implementation
    //

    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    /**
     * Print the simple class name, URI, hops path and flattened via,
     * separated by single spaces, matching {@link #singleLineLegend()}.
     *
     * @param w Writer to print to.
     */
    public void singleLineReportTo(PrintWriter w) {
        String className = this.getClass().getName();
        // strip the package prefix, leaving the simple class name
        className = className.substring(className.lastIndexOf(".") + 1);
        w.print(className);
        w.print(" ");
        w.print(getUURI().toString());
        w.print(" ");
        w.print(pathFromSeed);
        w.print(" ");
        w.print(flattenVia());
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineLegend()
     */
    public String singleLineLegend() {
        return "className uri hopsPath viaUri";
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#getReports()
     */
    public String[] getReports() {
        // none but default: empty options
        return new String[] {};
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.lang.String, java.io.Writer)
     */
    public void reportTo(String name, PrintWriter writer) {
        // the report name is not consulted; only the single-line form exists
        singleLineReportTo(writer);
        writer.print("\n");
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
     */
    public void reportTo(PrintWriter writer) throws IOException {
        reportTo(null, writer);
    }

    /** Make the given key 'heritable', meaning its value will be
     * added to descendant CandidateURIs. Only keys with immutable
     * values should be made heritable -- the value instance may
     * be shared until the AList is serialized/deserialized.
     *
     * @param key to make heritable
     */
    public void makeHeritable(String key) {
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if (heritableKeys == null) {
            heritableKeys = new ArrayList();
            // the list names itself so that it is passed on to descendants too
            heritableKeys.add(A_HERITABLE_KEYS);
            putObject(A_HERITABLE_KEYS, heritableKeys);
        }
        heritableKeys.add(key);
    }

    /** Make the given key non-'heritable', meaning its value will
     * not be added to descendant CandidateURIs. Only meaningful if
     * key was previously made heritable.
     *
     * @param key to make non-heritable
     */
    public void makeNonHeritable(String key) {
        List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
        if (heritableKeys == null) {
            return;
        }
        heritableKeys.remove(key);
        if (heritableKeys.size() == 1) {
            // only remaining heritable key is itself; disable completely
            remove(A_HERITABLE_KEYS);
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -