📄 extractoruniversal.java

📁 这是个爬虫和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
        + "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians        + "|(ps(/.*)?)" // ps  Palestinian Territories        + "|(pt(/.*)?)" // pt  Portugal        + "|(pw(/.*)?)" // pw  Palau        + "|(py(/.*)?)" // py  Paraguay        + "|(qa(/.*)?)" // qa  Qatar        + "|(re(/.*)?)" // re  Reunion Island        + "|(ro(/.*)?)" // ro  Romania        + "|(ru(/.*)?)" // ru  Russian Federation        + "|(rw(/.*)?)" // rw  Rwanda        + "|(sa(/.*)?)" // sa  Saudi Arabia        + "|(sb(/.*)?)" // sb  Solomon Islands        + "|(sc(/.*)?)" // sc  Seychelles        + "|(sd(/.*)?)" // sd  Sudan        + "|(se(/.*)?)" // se  Sweden        + "|(sg(/.*)?)" // sg  Singapore        + "|(sh(/.*)?)" // sh  St. Helena        + "|(si(/.*)?)" // si  Slovenia        + "|(sj(/.*)?)" // sj  Svalbard and Jan Mayen Islands        + "|(sk(/.*)?)" // sk  Slovak Republic        + "|(sl(/.*)?)" // sl  Sierra Leone        + "|(sm(/.*)?)" // sm  San Marino        + "|(sn(/.*)?)" // sn  Senegal        + "|(so(/.*)?)" // so  Somalia        + "|(sr(/.*)?)" // sr  Suriname        + "|(sv(/.*)?)" // sv  El Salvador        + "|(st(/.*)?)" // st  Sao Tome and Principe        + "|(sy(/.*)?)" // sy  Syrian Arab Republic        + "|(sz(/.*)?)" // sz  Swaziland        + "|(tc(/.*)?)" // tc  Turks and Caicos Islands        + "|(td(/.*)?)" // td  Chad        + "|(tf(/.*)?)" // tf  French Southern Territories        + "|(tg(/.*)?)" // tg  Togo        + "|(th(/.*)?)" // th  Thailand        + "|(tj(/.*)?)" // tj  Tajikistan        + "|(tk(/.*)?)" // tk  Tokelau        + "|(tm(/.*)?)" // tm  Turkmenistan        + "|(tn(/.*)?)" // tn  Tunisia        + "|(to(/.*)?)" // to  Tonga        + "|(tp(/.*)?)" // tp  East Timor        + "|(tr(/.*)?)" // tr  Turkey        + "|(tt(/.*)?)" // tt  Trinidad and Tobago        + "|(tv(/.*)?)" // tv  Tuvalu        + "|(tw(/.*)?)" // tw  Taiwan        + "|(tz(/.*)?)" // tz  Tanzania        + "|(ua(/.*)?)" // ua  Ukraine        + "|(ug(/.*)?)" // ug  Uganda        + 
"|(uk(/.*)?)" // uk  United Kingdom        + "|(um(/.*)?)" // um  US Minor Outlying Islands        + "|(us(/.*)?)" // us  United States        + "|(uy(/.*)?)" // uy  Uruguay        + "|(uz(/.*)?)" // uz  Uzbekistan        + "|(va(/.*)?)" // va  Holy See (City Vatican State)        + "|(vc(/.*)?)" // vc  Saint Vincent and the Grenadines        + "|(ve(/.*)?)" // ve  Venezuela        + "|(vg(/.*)?)" // vg  Virgin Islands (British)        + "|(vi(/.*)?)" // vi  Virgin Islands (USA)        + "|(vn(/.*)?)" // vn  Vietnam        + "|(vu(/.*)?)" // vu  Vanuatu        + "|(wf(/.*)?)" // wf  Wallis and Futuna Islands        + "|(ws(/.*)?)" // ws  Western Samoa        + "|(ye(/.*)?)" // ye  Yemen        + "|(yt(/.*)?)" // yt  Mayotte        + "|(yu(/.*)?)" // yu  Yugoslavia        + "|(za(/.*)?)" // za  South Africa        + "|(zm(/.*)?)" // zm  Zambia        + "|(zw(/.*)?)" // zw  Zimbabwe        ;    protected long numberOfCURIsHandled = 0;    protected long numberOfLinksExtracted= 0;    /**     * Constructor     * @param name The name of the module.     */    public ExtractorUniversal(String name) {        super(name, "Link extraction on unknown file types. A best effort" +                " extractor that looks at the raw byte code of any file " +                "that has not been handled by another extractor and tries" +                " to find URIs. 
Will only match absolute URIs.");        Type e;        e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES,            "How deep to look into files for URI strings, in bytes",            new Long(DEFAULT_MAX_DEPTH_BYTES)));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,            "Max length of URIs in bytes", new Long(DEFAULT_MAX_URL_LENGTH)));        e.setExpertSetting(true);    }    protected void extract(CrawlURI curi) {        if (!isHttpTransactionContentToProcess(curi)) {            return;        }        numberOfCURIsHandled++;        try {            InputStream instream = curi.getHttpRecorder().getRecordedInput().                getContentReplayInputStream();            int ch = instream.read();            StringBuffer lookat = new StringBuffer();            long counter = 0;            long maxdepth = ((Long)getAttribute(ATTR_MAX_DEPTH_BYTES,curi)).                longValue();            if(maxdepth<=0){                maxdepth = Long.MAX_VALUE;            }            long maxURLLength = ((Long)getAttribute(ATTR_MAX_URL_LENGTH,curi)).                longValue();            boolean foundDot = false;            while(ch != -1 && ++counter <= maxdepth) {                if(lookat.length()>maxURLLength){                    //Exceeded maximum length of a URL. Start fresh.                    lookat = new StringBuffer();                    foundDot = false;                }                else if(isURLableChar(ch)){                    //Add to buffer.                    if(ch == 46){                        // Current character is a dot '.'                        foundDot = true;                    }                    lookat.append((char)ch);                } else if(lookat.length() > 3 && foundDot) {                    // It takes a bare mininum of 4 characters to form a URL                    // Since we have at least that many let's try link                    // extraction.                
    String newURL = lookat.toString();                    if(looksLikeAnURL(newURL))                    {                        // Looks like we found something.                        // Let's start with a little cleanup as we may have                        // junk in front or at the end.                        if(newURL.toLowerCase().indexOf("http") > 0){                            // Got garbage in front of the protocol. Remove.                            newURL = newURL.substring(newURL.toLowerCase().                                indexOf("http"));                        }                        while(newURL.substring(newURL.length()-1).equals("."))                        {                            // URLs can't end with a dot. Strip it off.                            newURL = newURL.substring(0,newURL.length()-1);                        }                        // And add the URL to speculative embeds.                        numberOfLinksExtracted++;                        curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP);                    }                    // Reset lookat for next string.                    lookat = new StringBuffer();                    foundDot = false;                } else if(lookat.length()>0) {                    // Didn't get enough chars. Reset lookat for next string.                    lookat = new StringBuffer();                    foundDot = false;                }                ch = instream.read();            }        } catch(IOException e){            //TODO: Handle this exception.            e.printStackTrace();        } catch (AttributeNotFoundException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }        // Set flag to indicate that link extraction is completed.        curi.linkExtractorFinished();    }    /**     * This method takes a look at a string and determines if it could be a URL.     
* To qualify the string must either begin with "http://" (https would also     * work) followed by something that looks like an IP address or contain     * within the string (possible at the end but not at the beginning) a TLD     * (Top Level Domain) preceded by a dot.     *     * @param lookat The string to examine in an effort to determine if it     * could be a URL     * @return True if the string matches the above criteria for a URL.     */    private boolean looksLikeAnURL(String lookat) {        if(lookat.indexOf("http://")==0 || lookat.indexOf("https://")==0){            //Check if the rest of the string looks like an IP address.            //if so return true. Otherwise continue on.            Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);            boolean testVal = ip.matches();            TextUtils.recycleMatcher(ip);            if(testVal){                return true;            }        }        int dot = lookat.indexOf(".");        if(dot!=0){//An URL can't start with a .tld.            while(dot != -1 && dot < lookat.length()){                lookat = lookat.substring(dot+1);                if (isTLD(lookat.substring(0, lookat.length() <= 6?                    lookat.length(): 6)))                {                    return true;                }                dot = lookat.indexOf(".");            }        }        return false;    }    /**     * Checks if a string is equal to known Top Level Domain. The string may     * contain additional characters <i>after</i> the TLD but not before.     * @param potentialTLD The string (usually 2-6 chars) to check if it starts     * with a TLD.     
* @return True if the given string starts with the name of a known TLD     *     * @see #TLDs     */    private boolean isTLD(String potentialTLD) {        if(potentialTLD.length()<2){            return false;        }        potentialTLD.toLowerCase();        Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD);        boolean ret = uri.matches();        TextUtils.recycleMatcher(uri);        return ret;    }    /**     * Determines if a char (as represented by an int in the range of 0-255) is     * a character (in the Ansi character set) that can be present in a URL.     * This method takes a <b>strict</b> approach to what characters can be in     * a URL.     * <p>     * The following are considered to be 'URLable'<br>     * <ul>     *  <li> <code># $ % & + , - . /</code> values 35-38,43-47     *  <li> <code>[0-9]</code> values 48-57     *  <li> <code>: ; = ? @</code> value 58-59,61,63-64     *  <li> <code>[A-Z]</code> values 65-90     *  <li> <code>_</code> value 95     *  <li> <code>[a-z]</code> values 97-122     *  <li> <code>~</code> value 126     * </ul>     * <p>     * To summerize, the following ranges are considered URLable:<br>     * 35-38,43-59,61,63-90,95,97-122,126     *     * @param ch The character (represented by an int) to test.     * @return True if it is a URLable character, false otherwise.     */    private boolean isURLableChar(int ch) {        return (ch>=35 && ch<=38)            || (ch>=43 && ch<=59)            || (ch==61)            || (ch>=63 && ch<=90)            || (ch==95)            || (ch>=97 && ch<=122)            || (ch==126);    }    /* (non-Javadoc)     * @see org.archive.crawler.framework.Processor#report()     */    public String report() {        StringBuffer ret = new StringBuffer();        ret.append("Processor: org.archive.crawler.extractor." 
+            "ExtractorUniversal\n");        ret.append("  Function:          Link extraction on unknown file" +            " types.\n");        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");        return ret.toString();    }}
上一页 12
💿 文件大小 9430 K
👤 上传用户 zergwyk
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#lucece
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -