📄 extractoruniversal.java
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Created on Jan 15, 2004 * */package org.archive.crawler.extractor;import java.io.IOException;import java.io.InputStream;import java.util.regex.Matcher;import javax.management.AttributeNotFoundException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.net.UURI;import org.archive.util.TextUtils;/** * A last ditch extractor that will look at the raw byte code and try to extract * anything that <i>looks</i> like a link. * * If used, it should always be specified as the last link extractor in the * order file. * <p> * To accomplish this it will scan through the bytecode and try and build up * strings of consecutive bytes that all represent characters that are valid * in a URL (see #isURLableChar(int) for details). * Once it hits the end of such a string (i.e. finds a character that * should not be in a URL) it will try to determine if it has found a URL. * This is done by seeing if the string is an IP address prefixed with * http(s):// or contains a dot followed by a Top Level Domain and end of * string or a slash. 
* * @author Kristinn Sigurdsson */public class ExtractorUniversal extends Extractorimplements CoreAttributeConstants { private static final long serialVersionUID = -7593380118857156939L;// private static final Logger logger =// Logger.getLogger(ExtractorUniversal.class.getName()); private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes"; /** Default value for how far into an unknown document we should scan * - 10k. A value of 0 or lower will disable this. */ private static long DEFAULT_MAX_DEPTH_BYTES = 10240; private static String ATTR_MAX_URL_LENGTH = "max-url-length"; /** Maximum length for a URI that we try to match.*/ private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH; /** * Matches any string that begins with http:// or https:// followed by * something that looks like an IP address (four numbers, none longer than * 3 chars separated by 3 dots). Does <b>not</b> ensure that the numbers are * each in the range 0-255. */ static final String IP_ADDRESS = "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)"; /** * Matches any string that begins with a TLD (no .) followed by a '/' slash * or end of string. If followed by slash then nothing after the slash is * of consequence. 
*/ public static final String TLDs = "(ac(/.*)?)" // ac Ascension Island + "|(ad(/.*)?)" // ad Andorra + "|(ae(/.*)?)" // ae United Arab Emirates + "|(af(/.*)?)" // af Afghanistan + "|(ag(/.*)?)" // ag Antigua and Barbuda + "|(ai(/.*)?)" // ai Anguilla + "|(al(/.*)?)" // al Albania + "|(am(/.*)?)" // am Armenia + "|(an(/.*)?)" // an Netherlands Antilles + "|(ao(/.*)?)" // ao Angola + "|(aero(/.*)?)" // aero Air-transport industry + "|(aq(/.*)?)" // aq Antarctica + "|(ar(/.*)?)" // ar Argentina + "|(as(/.*)?)" // as American Samoa + "|(at(/.*)?)" // at Austria + "|(au(/.*)?)" // au Australia + "|(aw(/.*)?)" // aw Aruba + "|(az(/.*)?)" // az Azerbaijan + "|(ba(/.*)?)" // ba Bosnia Hercegovina + "|(bb(/.*)?)" // bb Barbados + "|(bd(/.*)?)" // bd Bangladesh + "|(be(/.*)?)" // be Belgium + "|(bf(/.*)?)" // bf Burkina Faso + "|(bg(/.*)?)" // bg Bulgaria + "|(bh(/.*)?)" // bh Bahrain + "|(bi(/.*)?)" // bi Burundi + "|(biz(/.*)?)" // biz Businesses + "|(bj(/.*)?)" // bj Benin + "|(bm(/.*)?)" // bm Bermuda + "|(bn(/.*)?)" // bn Brunei Darussalam + "|(bo(/.*)?)" // bo Bolivia + "|(br(/.*)?)" // br Brazil + "|(bs(/.*)?)" // bs Bahamas + "|(bt(/.*)?)" // bt Bhutan + "|(bv(/.*)?)" // bv Bouvet Island + "|(bw(/.*)?)" // bw Botswana + "|(by(/.*)?)" // by Belarus (Byelorussia) + "|(bz(/.*)?)" // bz Belize + "|(ca(/.*)?)" // ca Canada + "|(cc(/.*)?)" // cc Cocos Islands (Keeling) + "|(cd(/.*)?)" // cd Congo, Democratic Republic of the + "|(cf(/.*)?)" // cf Central African Republic + "|(cg(/.*)?)" // cg Congo, Republic of + "|(ch(/.*)?)" // ch Switzerland + "|(ci(/.*)?)" // ci Cote d'Ivoire (Ivory Coast) + "|(ck(/.*)?)" // ck Cook Islands + "|(cl(/.*)?)" // cl Chile + "|(cm(/.*)?)" // cm Cameroon + "|(cn(/.*)?)" // cn China + "|(co(/.*)?)" // co Colombia + "|(com(/.*)?)" // com Commercial + "|(coop(/.*)?)" // coop Cooperatives + "|(cr(/.*)?)" // cr Costa Rica + "|(cs(/.*)?)" // cs Czechoslovakia + "|(cu(/.*)?)" // cu Cuba + "|(cv(/.*)?)" // cv Cap Verde + "|(cx(/.*)?)" // cx 
Christmas Island + "|(cy(/.*)?)" // cy Cyprus + "|(cz(/.*)?)" // cz Czech Republic + "|(de(/.*)?)" // de Germany + "|(dj(/.*)?)" // dj Djibouti + "|(dk(/.*)?)" // dk Denmark + "|(dm(/.*)?)" // dm Dominica + "|(do(/.*)?)" // do Dominican Republic + "|(dz(/.*)?)" // dz Algeria + "|(ec(/.*)?)" // ec Ecuador + "|(edu(/.*)?)" // edu Educational Institution + "|(ee(/.*)?)" // ee Estonia + "|(eg(/.*)?)" // eg Egypt + "|(eh(/.*)?)" // eh Western Sahara + "|(er(/.*)?)" // er Eritrea + "|(es(/.*)?)" // es Spain + "|(et(/.*)?)" // et Ethiopia + "|(fi(/.*)?)" // fi Finland + "|(fj(/.*)?)" // fj Fiji + "|(fk(/.*)?)" // fk Falkland Islands + "|(fm(/.*)?)" // fm Micronesia, Federal State of + "|(fo(/.*)?)" // fo Faroe Islands + "|(fr(/.*)?)" // fr France + "|(ga(/.*)?)" // ga Gabon + "|(gd(/.*)?)" // gd Grenada + "|(ge(/.*)?)" // ge Georgia + "|(gf(/.*)?)" // gf French Guiana + "|(gg(/.*)?)" // gg Guernsey + "|(gh(/.*)?)" // gh Ghana + "|(gi(/.*)?)" // gi Gibraltar + "|(gl(/.*)?)" // gl Greenland + "|(gm(/.*)?)" // gm Gambia + "|(gn(/.*)?)" // gn Guinea + "|(gov(/.*)?)" // gov Government (US) + "|(gp(/.*)?)" // gp Guadeloupe + "|(gq(/.*)?)" // gq Equatorial Guinea + "|(gr(/.*)?)" // gr Greece + "|(gs(/.*)?)" // gs South Georgia and the South Sandwich Islands + "|(gt(/.*)?)" // gt Guatemala + "|(gu(/.*)?)" // gu Guam + "|(gw(/.*)?)" // gw Guinea-Bissau + "|(gy(/.*)?)" // gy Guyana + "|(hk(/.*)?)" // hk Hong Kong + "|(hm(/.*)?)" // hm Heard and McDonald Islands + "|(hn(/.*)?)" // hn Honduras + "|(hr(/.*)?)" // hr Croatia/Hrvatska + "|(ht(/.*)?)" // ht Haiti + "|(hu(/.*)?)" // hu Hungary + "|(id(/.*)?)" // id Indonesia + "|(ie(/.*)?)" // ie Ireland + "|(il(/.*)?)" // il Israel + "|(im(/.*)?)" // im Isle of Man + "|(in(/.*)?)" // in India + "|(info(/.*)?)" // info + "|(int(/.*)?)" // int Int. 
Organizations + "|(io(/.*)?)" // io British Indian Ocean Territory + "|(iq(/.*)?)" // iq Iraq + "|(ir(/.*)?)" // ir Iran, Islamic Republic of + "|(is(/.*)?)" // is Iceland + "|(it(/.*)?)" // it Italy + "|(je(/.*)?)" // je Jersey + "|(jm(/.*)?)" // jm Jamaica + "|(jo(/.*)?)" // jo Jordan + "|(jp(/.*)?)" // jp Japan + "|(ke(/.*)?)" // ke Kenya + "|(kg(/.*)?)" // kg Kyrgyzstan + "|(kh(/.*)?)" // kh Cambodia + "|(ki(/.*)?)" // ki Kiribati + "|(km(/.*)?)" // km Comoros + "|(kn(/.*)?)" // kn Saint Kitts and Nevis + "|(kp(/.*)?)" // kp Korea, Democratic People's Republic + "|(kr(/.*)?)" // kr Korea, Republic of + "|(kw(/.*)?)" // kw Kuwait + "|(ky(/.*)?)" // ky Cayman Islands + "|(kz(/.*)?)" // kz Kazakhstan + "|(la(/.*)?)" // la Lao People's Democratic Republic + "|(lb(/.*)?)" // lb Lebanon + "|(lc(/.*)?)" // lc Saint Lucia + "|(li(/.*)?)" // li Liechtenstein + "|(lk(/.*)?)" // lk Sri Lanka + "|(lr(/.*)?)" // lr Liberia + "|(ls(/.*)?)" // ls Lesotho + "|(lt(/.*)?)" // lt Lithuania + "|(lu(/.*)?)" // lu Luxembourg + "|(lv(/.*)?)" // lv Latvia + "|(ly(/.*)?)" // ly Libyan Arab Jamahiriya + "|(ma(/.*)?)" // ma Morocco + "|(mc(/.*)?)" // mc Monaco + "|(md(/.*)?)" // md Moldova, Republic of + "|(mg(/.*)?)" // mg Madagascar + "|(mh(/.*)?)" // mh Marshall Islands + "|(mil(/.*)?)" // mil Military (US Dept of Defense) + "|(mk(/.*)?)" // mk Macedonia, Former Yugoslav Republic + "|(ml(/.*)?)" // ml Mali + "|(mm(/.*)?)" // mm Myanmar + "|(mn(/.*)?)" // mn Mongolia + "|(mo(/.*)?)" // mo Macau + "|(mp(/.*)?)" // mp Northern Mariana Islands + "|(mq(/.*)?)" // mq Martinique + "|(mr(/.*)?)" // mr Mauritani + "|(ms(/.*)?)" // ms Montserrat + "|(mt(/.*)?)" // mt Malta + "|(mu(/.*)?)" // mu Mauritius + "|(museum(/.*)?)" // museum Museums + "|(mv(/.*)?)" // mv Maldives + "|(mw(/.*)?)" // mw Malawi + "|(mx(/.*)?)" // mx Mexico + "|(my(/.*)?)" // my Malaysia + "|(mz(/.*)?)" // mz Mozambique + "|(na(/.*)?)" // na Namibia + "|(name(/.*)?)" // name Individuals + "|(nc(/.*)?)" // nc New Caledonia 
+ "|(ne(/.*)?)" // ne Niger + "|(net(/.*)?)" // net networks + "|(nf(/.*)?)" // nf Norfolk Island + "|(ng(/.*)?)" // ng Nigeria + "|(ni(/.*)?)" // ni Nicaragua + "|(nl(/.*)?)" // nl Netherlands + "|(no(/.*)?)" // no Norway + "|(np(/.*)?)" // np Nepal + "|(nr(/.*)?)" // nr Nauru + "|(nt(/.*)?)" // nt Neutral Zone + "|(nu(/.*)?)" // nu Niue + "|(nz(/.*)?)" // nz New Zealand + "|(om(/.*)?)" // om Oman + "|(org(/.*)?)" // org Organization (non-profit) + "|(pa(/.*)?)" // pa Panama + "|(pe(/.*)?)" // pe Peru + "|(pf(/.*)?)" // pf French Polynesia + "|(pg(/.*)?)" // pg Papua New Guinea + "|(ph(/.*)?)" // ph Philippines + "|(pk(/.*)?)" // pk Pakistan + "|(pl(/.*)?)" // pl Poland + "|(pm(/.*)?)" // pm St. Pierre and Miquelon + "|(pn(/.*)?)" // pn Pitcairn Island + "|(pr(/.*)?)" // pr Puerto Rico
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -