⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extractoruniversal.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * * Created on Jan 15, 2004 * */package org.archive.crawler.extractor;import java.io.IOException;import java.io.InputStream;import java.util.regex.Matcher;import javax.management.AttributeNotFoundException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.net.UURI;import org.archive.util.TextUtils;/** * A last ditch extractor that will look at the raw byte code and try to extract * anything that <i>looks</i> like a link. * * If used, it should always be specified as the last link extractor in the * order file. * <p> * To accomplish this it will scan through the bytecode and try and build up * strings of consecutive bytes that all represent characters that are valid * in a URL (see #isURLableChar(int) for details). * Once it hits the end of such a string (i.e. finds a character that * should not be in a URL) it will try to determine if it has found a URL. * This is done by seeing if the string is an IP address prefixed with * http(s):// or contains a dot followed by a Top Level Domain and end of * string or a slash. 
* * @author Kristinn Sigurdsson */public class ExtractorUniversal extends Extractorimplements CoreAttributeConstants {    private static final long serialVersionUID = -7593380118857156939L;//    private static final Logger logger =//        Logger.getLogger(ExtractorUniversal.class.getName());        private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";    /** Default value for how far into an unknown document we should scan     * - 10k. A value of 0 or lower will disable this.     */    private static long DEFAULT_MAX_DEPTH_BYTES = 10240;    private static String ATTR_MAX_URL_LENGTH = "max-url-length";    /** Maximum length for a URI that we try to match.*/    private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;    /**     * Matches any string that begins with http:// or https:// followed by     * something that looks like an ip address (four numbers, none longer then     * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are     * each in the range 0-255.     */    static final String IP_ADDRESS =        "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)";    /**     * Matches any string that begins with a TLD (no .) followed by a '/' slash     * or end of string. If followed by slash then nothing after the slash is     * of consequence.     
*/    public static final String TLDs =          "(ac(/.*)?)"  // ac  Ascension Island        + "|(ad(/.*)?)" // ad  Andorra        + "|(ae(/.*)?)" // ae  United Arab Emirates        + "|(af(/.*)?)" // af  Afghanistan        + "|(ag(/.*)?)" // ag  Antigua and Barbuda        + "|(ai(/.*)?)" // ai  Anguilla        + "|(al(/.*)?)" // al  Albania        + "|(am(/.*)?)" // am  Armenia        + "|(an(/.*)?)" // an  Netherlands Antilles        + "|(ao(/.*)?)" // ao  Angola        + "|(aero(/.*)?)" // aero Air-transport industry        + "|(aq(/.*)?)" // aq  Antarctica        + "|(ar(/.*)?)" // ar  Argentina        + "|(as(/.*)?)" // as  American Samoa        + "|(at(/.*)?)" // at  Austria        + "|(au(/.*)?)" // au  Australia        + "|(aw(/.*)?)" // aw  Aruba        + "|(az(/.*)?)" // az  Azerbaijan        + "|(ba(/.*)?)" // ba  Bosnia Hercegovina        + "|(bb(/.*)?)" // bb  Barbados        + "|(bd(/.*)?)" // bd  Bangladesh        + "|(be(/.*)?)" // be  Belgium        + "|(bf(/.*)?)" // bf  Burkina Faso        + "|(bg(/.*)?)" // bg  Bulgaria        + "|(bh(/.*)?)" // bh  Bahrain        + "|(bi(/.*)?)" // bi  Burundi        + "|(biz(/.*)?)" // biz Businesses        + "|(bj(/.*)?)" // bj  Benin        + "|(bm(/.*)?)" // bm  Bermuda        + "|(bn(/.*)?)" // bn  Brunei Darussalam        + "|(bo(/.*)?)" // bo  Bolivia        + "|(br(/.*)?)" // br  Brazil        + "|(bs(/.*)?)" // bs  Bahamas        + "|(bt(/.*)?)" // bt  Bhutan        + "|(bv(/.*)?)" // bv  Bouvet Island        + "|(bw(/.*)?)" // bw  Botswana        + "|(by(/.*)?)" // by  Belarus (Byelorussia)        + "|(bz(/.*)?)" // bz  Belize        + "|(ca(/.*)?)" // ca  Canada        + "|(cc(/.*)?)" // cc  Cocos Islands (Keeling)        + "|(cd(/.*)?)" // cd  Congo, Democratic Republic of the        + "|(cf(/.*)?)" // cf  Central African Republic        + "|(cg(/.*)?)" // cg  Congo, Republic of        + "|(ch(/.*)?)" // ch  Switzerland        + "|(ci(/.*)?)" // ci  Cote d'Ivoire (Ivory Coast)        + 
"|(ck(/.*)?)" // ck  Cook Islands        + "|(cl(/.*)?)" // cl  Chile        + "|(cm(/.*)?)" // cm  Cameroon        + "|(cn(/.*)?)" // cn  China        + "|(co(/.*)?)" // co  Colombia        + "|(com(/.*)?)" // com Commercial        + "|(coop(/.*)?)" // coop Cooperatives        + "|(cr(/.*)?)" // cr  Costa Rica        + "|(cs(/.*)?)" // cs  Czechoslovakia        + "|(cu(/.*)?)" // cu  Cuba        + "|(cv(/.*)?)" // cv  Cap Verde        + "|(cx(/.*)?)" // cx  Christmas Island        + "|(cy(/.*)?)" // cy  Cyprus        + "|(cz(/.*)?)" // cz  Czech Republic        + "|(de(/.*)?)" // de  Germany        + "|(dj(/.*)?)" // dj  Djibouti        + "|(dk(/.*)?)" // dk  Denmark        + "|(dm(/.*)?)" // dm  Dominica        + "|(do(/.*)?)" // do  Dominican Republic        + "|(dz(/.*)?)" // dz  Algeria        + "|(ec(/.*)?)" // ec  Ecuador        + "|(edu(/.*)?)" // edu Educational Institution        + "|(ee(/.*)?)" // ee  Estonia        + "|(eg(/.*)?)" // eg  Egypt        + "|(eh(/.*)?)" // eh  Western Sahara        + "|(er(/.*)?)" // er  Eritrea        + "|(es(/.*)?)" // es  Spain        + "|(et(/.*)?)" // et  Ethiopia        + "|(fi(/.*)?)" // fi  Finland        + "|(fj(/.*)?)" // fj  Fiji        + "|(fk(/.*)?)" // fk  Falkland Islands        + "|(fm(/.*)?)" // fm  Micronesia, Federal State of        + "|(fo(/.*)?)" // fo  Faroe Islands        + "|(fr(/.*)?)" // fr  France        + "|(ga(/.*)?)" // ga  Gabon        + "|(gd(/.*)?)" // gd  Grenada        + "|(ge(/.*)?)" // ge  Georgia        + "|(gf(/.*)?)" // gf  French Guiana        + "|(gg(/.*)?)" // gg  Guernsey        + "|(gh(/.*)?)" // gh  Ghana        + "|(gi(/.*)?)" // gi  Gibraltar        + "|(gl(/.*)?)" // gl  Greenland        + "|(gm(/.*)?)" // gm  Gambia        + "|(gn(/.*)?)" // gn  Guinea        + "|(gov(/.*)?)" // gov Government (US)        + "|(gp(/.*)?)" // gp  Guadeloupe        + "|(gq(/.*)?)" // gq  Equatorial Guinea        + "|(gr(/.*)?)" // gr  Greece        + "|(gs(/.*)?)" // gs  South Georgia and the 
South Sandwich Islands        + "|(gt(/.*)?)" // gt  Guatemala        + "|(gu(/.*)?)" // gu  Guam        + "|(gw(/.*)?)" // gw  Guinea-Bissau        + "|(gy(/.*)?)" // gy  Guyana        + "|(hk(/.*)?)" // hk  Hong Kong        + "|(hm(/.*)?)" // hm  Heard and McDonald Islands        + "|(hn(/.*)?)" // hn  Honduras        + "|(hr(/.*)?)" // hr  Croatia/Hrvatska        + "|(ht(/.*)?)" // ht  Haiti        + "|(hu(/.*)?)" // hu  Hungary        + "|(id(/.*)?)" // id  Indonesia        + "|(ie(/.*)?)" // ie  Ireland        + "|(il(/.*)?)" // il  Israel        + "|(im(/.*)?)" // im  Isle of Man        + "|(in(/.*)?)" // in  India        + "|(info(/.*)?)" // info        + "|(int(/.*)?)" // int Int. Organizations        + "|(io(/.*)?)" // io  British Indian Ocean Territory        + "|(iq(/.*)?)" // iq  Iraq        + "|(ir(/.*)?)" // ir  Iran, Islamic Republic of        + "|(is(/.*)?)" // is  Iceland        + "|(it(/.*)?)" // it  Italy        + "|(je(/.*)?)" // je  Jersey        + "|(jm(/.*)?)" // jm  Jamaica        + "|(jo(/.*)?)" // jo  Jordan        + "|(jp(/.*)?)" // jp  Japan        + "|(ke(/.*)?)" // ke  Kenya        + "|(kg(/.*)?)" // kg  Kyrgyzstan        + "|(kh(/.*)?)" // kh  Cambodia        + "|(ki(/.*)?)" // ki  Kiribati        + "|(km(/.*)?)" // km  Comoros        + "|(kn(/.*)?)" // kn  Saint Kitts and Nevis        + "|(kp(/.*)?)" // kp  Korea, Democratic People's Republic        + "|(kr(/.*)?)" // kr  Korea, Republic of        + "|(kw(/.*)?)" // kw  Kuwait        + "|(ky(/.*)?)" // ky  Cayman Islands        + "|(kz(/.*)?)" // kz  Kazakhstan        + "|(la(/.*)?)" // la  Lao People's Democratic Republic        + "|(lb(/.*)?)" // lb  Lebanon        + "|(lc(/.*)?)" // lc  Saint Lucia        + "|(li(/.*)?)" // li  Liechtenstein        + "|(lk(/.*)?)" // lk  Sri Lanka        + "|(lr(/.*)?)" // lr  Liberia        + "|(ls(/.*)?)" // ls  Lesotho        + "|(lt(/.*)?)" // lt  Lithuania        + "|(lu(/.*)?)" // lu  Luxembourg        + "|(lv(/.*)?)" // lv  Latvia        + 
"|(ly(/.*)?)" // ly  Libyan Arab Jamahiriya        + "|(ma(/.*)?)" // ma  Morocco        + "|(mc(/.*)?)" // mc  Monaco        + "|(md(/.*)?)" // md  Moldova, Republic of        + "|(mg(/.*)?)" // mg  Madagascar        + "|(mh(/.*)?)" // mh  Marshall Islands        + "|(mil(/.*)?)" // mil Military (US Dept of Defense)        + "|(mk(/.*)?)" // mk  Macedonia, Former Yugoslav Republic        + "|(ml(/.*)?)" // ml  Mali        + "|(mm(/.*)?)" // mm  Myanmar        + "|(mn(/.*)?)" // mn  Mongolia        + "|(mo(/.*)?)" // mo  Macau        + "|(mp(/.*)?)" // mp  Northern Mariana Islands        + "|(mq(/.*)?)" // mq  Martinique        + "|(mr(/.*)?)" // mr  Mauritani        + "|(ms(/.*)?)" // ms  Montserrat        + "|(mt(/.*)?)" // mt  Malta        + "|(mu(/.*)?)" // mu  Mauritius        + "|(museum(/.*)?)" // museum Museums        + "|(mv(/.*)?)" // mv  Maldives        + "|(mw(/.*)?)" // mw  Malawi        + "|(mx(/.*)?)" // mx  Mexico        + "|(my(/.*)?)" // my  Malaysia        + "|(mz(/.*)?)" // mz  Mozambique        + "|(na(/.*)?)" // na  Namibia        + "|(name(/.*)?)" // name Individuals        + "|(nc(/.*)?)" // nc  New Caledonia        + "|(ne(/.*)?)" // ne  Niger        + "|(net(/.*)?)" // net networks        + "|(nf(/.*)?)" // nf  Norfolk Island        + "|(ng(/.*)?)" // ng  Nigeria        + "|(ni(/.*)?)" // ni  Nicaragua        + "|(nl(/.*)?)" // nl  Netherlands        + "|(no(/.*)?)" // no  Norway        + "|(np(/.*)?)" // np  Nepal        + "|(nr(/.*)?)" // nr  Nauru        + "|(nt(/.*)?)" // nt  Neutral Zone        + "|(nu(/.*)?)" // nu  Niue        + "|(nz(/.*)?)" // nz  New Zealand        + "|(om(/.*)?)" // om  Oman        + "|(org(/.*)?)" // org Organization (non-profit)        + "|(pa(/.*)?)" // pa  Panama        + "|(pe(/.*)?)" // pe  Peru        + "|(pf(/.*)?)" // pf  French Polynesia        + "|(pg(/.*)?)" // pg  Papua New Guinea        + "|(ph(/.*)?)" // ph  Philippines        + "|(pk(/.*)?)" // pk  Pakistan        + "|(pl(/.*)?)" // pl  Poland    
    + "|(pm(/.*)?)" // pm  St. Pierre and Miquelon        + "|(pn(/.*)?)" // pn  Pitcairn Island        + "|(pr(/.*)?)" // pr  Puerto Rico

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -