📄 extractoruniversal.java
字号:
+ "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians + "|(ps(/.*)?)" // ps Palestinian Territories + "|(pt(/.*)?)" // pt Portugal + "|(pw(/.*)?)" // pw Palau + "|(py(/.*)?)" // py Paraguay + "|(qa(/.*)?)" // qa Qatar + "|(re(/.*)?)" // re Reunion Island + "|(ro(/.*)?)" // ro Romania + "|(ru(/.*)?)" // ru Russian Federation + "|(rw(/.*)?)" // rw Rwanda + "|(sa(/.*)?)" // sa Saudi Arabia + "|(sb(/.*)?)" // sb Solomon Islands + "|(sc(/.*)?)" // sc Seychelles + "|(sd(/.*)?)" // sd Sudan + "|(se(/.*)?)" // se Sweden + "|(sg(/.*)?)" // sg Singapore + "|(sh(/.*)?)" // sh St. Helena + "|(si(/.*)?)" // si Slovenia + "|(sj(/.*)?)" // sj Svalbard and Jan Mayen Islands + "|(sk(/.*)?)" // sk Slovak Republic + "|(sl(/.*)?)" // sl Sierra Leone + "|(sm(/.*)?)" // sm San Marino + "|(sn(/.*)?)" // sn Senegal + "|(so(/.*)?)" // so Somalia + "|(sr(/.*)?)" // sr Suriname + "|(sv(/.*)?)" // sv El Salvador + "|(st(/.*)?)" // st Sao Tome and Principe + "|(sy(/.*)?)" // sy Syrian Arab Republic + "|(sz(/.*)?)" // sz Swaziland + "|(tc(/.*)?)" // tc Turks and Caicos Islands + "|(td(/.*)?)" // td Chad + "|(tf(/.*)?)" // tf French Southern Territories + "|(tg(/.*)?)" // tg Togo + "|(th(/.*)?)" // th Thailand + "|(tj(/.*)?)" // tj Tajikistan + "|(tk(/.*)?)" // tk Tokelau + "|(tm(/.*)?)" // tm Turkmenistan + "|(tn(/.*)?)" // tn Tunisia + "|(to(/.*)?)" // to Tonga + "|(tp(/.*)?)" // tp East Timor + "|(tr(/.*)?)" // tr Turkey + "|(tt(/.*)?)" // tt Trinidad and Tobago + "|(tv(/.*)?)" // tv Tuvalu + "|(tw(/.*)?)" // tw Taiwan + "|(tz(/.*)?)" // tz Tanzania + "|(ua(/.*)?)" // ua Ukraine + "|(ug(/.*)?)" // ug Uganda + "|(uk(/.*)?)" // uk United Kingdom + "|(um(/.*)?)" // um US Minor Outlying Islands + "|(us(/.*)?)" // us United States + "|(uy(/.*)?)" // uy Uruguay + "|(uz(/.*)?)" // uz Uzbekistan + "|(va(/.*)?)" // va Holy See (City Vatican State) + "|(vc(/.*)?)" // vc Saint Vincent and the Grenadines + "|(ve(/.*)?)" // ve Venezuela + "|(vg(/.*)?)" // vg Virgin Islands (British) + "|(vi(/.*)?)" 
// vi Virgin Islands (USA)
        + "|(vn(/.*)?)" // vn Vietnam
        + "|(vu(/.*)?)" // vu Vanuatu
        + "|(wf(/.*)?)" // wf Wallis and Futuna Islands
        + "|(ws(/.*)?)" // ws Western Samoa
        + "|(ye(/.*)?)" // ye Yemen
        + "|(yt(/.*)?)" // yt Mayotte
        + "|(yu(/.*)?)" // yu Yugoslavia
        + "|(za(/.*)?)" // za South Africa
        + "|(zm(/.*)?)" // zm Zambia
        + "|(zw(/.*)?)" // zw Zimbabwe
        ;

    /** Number of CrawlURIs that passed the content check in {@link #extract}. */
    protected long numberOfCURIsHandled = 0;

    /** Number of speculative links added to CrawlURIs by this extractor. */
    protected long numberOfLinksExtracted= 0;

    /**
     * Constructor. Registers the two expert settings this extractor reads:
     * the maximum number of bytes to scan and the maximum URI length.
     *
     * @param name The name of the module.
     */
    public ExtractorUniversal(String name) {
        super(name, "Link extraction on unknown file types. A best effort" +
            " extractor that looks at the raw byte code of any file " +
            "that has not been handled by another extractor and tries" +
            " to find URIs. Will only match absolute URIs.");
        Type e;
        e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES,
            "How deep to look into files for URI strings, in bytes",
            new Long(DEFAULT_MAX_DEPTH_BYTES)));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
            "Max length of URIs in bytes",
            new Long(DEFAULT_MAX_URL_LENGTH)));
        e.setExpertSetting(true);
    }

    /**
     * Scans the recorded content of the given CrawlURI byte by byte,
     * collecting runs of 'URLable' characters and adding every run that
     * looks like a URL as a speculative link.
     * <p>
     * At most {@code ATTR_MAX_DEPTH_BYTES} bytes are examined (a value of
     * zero or less means no limit). A run longer than
     * {@code ATTR_MAX_URL_LENGTH} is discarded. The CrawlURI is always
     * flagged as finished, even if reading fails.
     *
     * @param curi The CrawlURI whose recorded content is scanned.
     */
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }

        numberOfCURIsHandled++;

        InputStream instream = null;
        try {
            instream = curi.getHttpRecorder().getRecordedInput().
                getContentReplayInputStream();
            int ch = instream.read();
            StringBuffer lookat = new StringBuffer();
            long counter = 0;
            long maxdepth = ((Long)getAttribute(ATTR_MAX_DEPTH_BYTES,curi)).
                longValue();
            if (maxdepth <= 0) {
                // Non-positive depth means "no limit".
                maxdepth = Long.MAX_VALUE;
            }
            long maxURLLength = ((Long)getAttribute(ATTR_MAX_URL_LENGTH,curi)).
                longValue();
            boolean foundDot = false;

            while (ch != -1 && ++counter <= maxdepth) {
                if (lookat.length() > maxURLLength) {
                    // Exceeded maximum length of a URL. Start fresh.
                    lookat.setLength(0);
                    foundDot = false;
                } else if (isURLableChar(ch)) {
                    // Add to buffer, remembering whether a dot was seen.
                    if (ch == '.') {
                        foundDot = true;
                    }
                    lookat.append((char)ch);
                } else if (lookat.length() > 0) {
                    // A non-URLable character terminates the current run.
                    // It takes a bare minimum of 4 characters, one of them
                    // a dot, to form a URL; shorter runs are just dropped.
                    if (lookat.length() > 3 && foundDot) {
                        addLinkIfURL(curi, lookat.toString());
                    }
                    lookat.setLength(0);
                    foundDot = false;
                }
                ch = instream.read();
            }

            // The content may end in the middle of a run, with no trailing
            // delimiter. Only consider it when the stream truly ended (a
            // run cut off by the depth limit may be a truncated URL) and
            // the run has not outgrown the maximum URL length.
            if (ch == -1 && foundDot && lookat.length() > 3
                    && lookat.length() <= maxURLLength) {
                addLinkIfURL(curi, lookat.toString());
            }
        } catch (IOException e) {
            //TODO: Handle this exception.
            e.printStackTrace();
        } catch (AttributeNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } finally {
            if (instream != null) {
                try {
                    instream.close();
                } catch (IOException ignored) {
                    // Best effort: nothing useful can be done if closing
                    // the replay stream fails.
                }
            }
        }
        // Set flag to indicate that link extraction is completed.
        curi.linkExtractorFinished();
    }

    /**
     * Adds the candidate to the CrawlURI as a speculative link if it looks
     * like a URL, after trimming junk in front of the protocol and any
     * trailing dots.
     *
     * @param curi The CrawlURI to add the link to.
     * @param candidate A run of URLable characters found in the content.
     */
    private void addLinkIfURL(CrawlURI curi, String candidate) {
        if (!looksLikeAnURL(candidate)) {
            return;
        }
        String newURL = candidate;
        // Got garbage in front of the protocol? Remove it.
        int scheme =
            newURL.toLowerCase(java.util.Locale.ENGLISH).indexOf("http");
        if (scheme > 0) {
            newURL = newURL.substring(scheme);
        }
        // URLs can't end with a dot. Strip it off.
        while (newURL.endsWith(".")) {
            newURL = newURL.substring(0, newURL.length() - 1);
        }
        if (newURL.length() == 0) {
            return;
        }
        // And add the URL to speculative embeds.
        numberOfLinksExtracted++;
        curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP);
    }

    /**
     * This method takes a look at a string and determines if it could be a
     * URL. A string qualifies if it begins with "http://" or "https://" and
     * the whole string matches the {@code IP_ADDRESS} pattern, or if it
     * contains (possibly at the end but not at the very beginning) a dot
     * immediately followed by a TLD (Top Level Domain).
     *
     * @param lookat The string to examine in an effort to determine if it
     *               could be a URL
     * @return True if the string matches the above criteria for a URL.
     */
    private boolean looksLikeAnURL(String lookat) {
        if (lookat.startsWith("http://") || lookat.startsWith("https://")) {
            // Check if the string looks like an IP address URL.
            // If so return true. Otherwise continue on.
            Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
            boolean testVal = ip.matches();
            TextUtils.recycleMatcher(ip);
            if (testVal) {
                return true;
            }
        }

        int dot = lookat.indexOf('.');
        if (dot == 0) {
            // An URL can't start with a .tld.
            return false;
        }
        String rest = lookat;
        while (dot != -1) {
            // Examine up to six characters following each dot in turn.
            rest = rest.substring(dot + 1);
            if (isTLD(rest.substring(0, Math.min(rest.length(), 6)))) {
                return true;
            }
            dot = rest.indexOf('.');
        }
        return false;
    }

    /**
     * Checks if a string is equal to known Top Level Domain. The string may
     * contain additional characters <i>after</i> the TLD but not before.
     * The check is case-insensitive: the input is lower-cased before it is
     * matched.
     *
     * @param potentialTLD The string (usually 2-6 chars) to check if it
     *                     starts with a TLD.
     * @return True if the given string starts with the name of a known TLD
     *
     * @see #TLDs
     */
    private boolean isTLD(String potentialTLD) {
        if (potentialTLD.length() < 2) {
            return false;
        }

        // Strings are immutable, so the lower-cased copy must be kept.
        // A fixed locale keeps the result independent of the JVM default.
        String lowered = potentialTLD.toLowerCase(java.util.Locale.ENGLISH);
        Matcher uri = TextUtils.getMatcher(TLDs, lowered);
        boolean ret = uri.matches();
        TextUtils.recycleMatcher(uri);
        return ret;
    }

    /**
     * Determines if a char (as represented by an int in the range of 0-255)
     * is a character (in the Ansi character set) that can be present in a
     * URL. This method takes a <b>strict</b> approach to what characters can
     * be in a URL.
     * <p>
     * The following are considered to be 'URLable'<br>
     * <ul>
     *  <li> <code># $ % & + , - . /</code> values 35-38,43-47
     *  <li> <code>[0-9]</code> values 48-57
     *  <li> <code>: ; = ? @</code> value 58-59,61,63-64
     *  <li> <code>[A-Z]</code> values 65-90
     *  <li> <code>_</code> value 95
     *  <li> <code>[a-z]</code> values 97-122
     *  <li> <code>~</code> value 126
     * </ul>
     * <p>
     * To summarize, the following ranges are considered URLable:<br>
     * 35-38,43-59,61,63-90,95,97-122,126
     *
     * @param ch The character (represented by an int) to test.
     * @return True if it is a URLable character, false otherwise.
     */
    private boolean isURLableChar(int ch) {
        return (ch>=35 && ch<=38)
            || (ch>=43 && ch<=59)
            || (ch==61)
            || (ch>=63 && ch<=90)
            || (ch==95)
            || (ch>=97 && ch<=122)
            || (ch==126);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor." +
            "ExtractorUniversal\n");
        ret.append(" Function: Link extraction on unknown file" +
            " types.\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -