
netinfo.py

Harvestman (latest version)
"""netinfo - Module summarizing information regarding protocols,ports, file extensions, regular expressions for analyzing URLs etcfor HarvestMan.Created Anand B Pillai <abpillai at gmail dot com> Feb 22 2008, moving                                                   content from urlparser.pyCopyright (C) 2008, Anand B Pillai."""import reURLSEP = '/'          # URL separator characterPROTOSEP = '//'       # String which separates a protocol string from the rest of URLDOTDOT = '..'         # A convenient name for the .. stringDOT = '.'             # A convenient name for the . stringPORTSEP = ':'         # Protocol separator character, character which separates the protocol                      # string from rest of URLBACKSLASH = '\\'      # A convenient name for the backslash character# Mapping popular protocols to most widely used port numbersprotocol_map = { "http://" : 80,                        "ftp://" : 21,                 "https://" : 443,                 "file://": 0,                 "file:": 0                 }# Popular image types file extensionsimage_extns = ('.bmp', '.dib', '.dcx', '.emf', '.fpx', '.gif', '.ico', '.img',               '.jp2', '.jpc', '.j2k', '.jpf', '.jpg', '.jpeg', '.jpe',               '.mng', '.pbm', '.pcd', '.pcx', '.pgm', '.png', '.ppm',                '.psd', '.ras', '.rgb', '.tga', '.tif', '.tiff', '.wbmp',               '.xbm', '.xpm')# Popular video types file extensionsmovie_extns = ('.3gp', '.avi', '.asf','.asx', '.avs', '.bay',               '.bik', '.bsf', '.dat', '.dv' ,'.dvr-ms', 'flc',               '.flv', '.ivf', '.m1v', '.m2ts', '.m2v', '.m4v',               '.mgv', '.mkv', '.mov', '.mp2v', '.mp4', '.mpe',               '.mpeg', '.mpg', '.ogm', '.qt', '.ratDVD', '.rm',               '.smi', '.vob', '.wm', '.wmv', '.xvid' )# Popular audio types file extensionssound_extns = ('.aac', '.aif', '.aiff', '.aifc', '.aifr', '.amr',               '.ape' ,'.asf', '.au', '.aud', '.aup', '.bwf',                '.cda', '.dct', '.dss', '.dts', '.dvf', '.esu',               '.eta', '.flac', '.gsm', '.jam', '.m4a', '.m4p',               '.mdi', '.mid', '.midi', '.mka', '.mod', '.mp1', '.mp2',               '.mp3', '.mpa', '.mpc', '.mpega', '.msv', '.mus',               '.nrj', '.nwc', '.nwp', '.ogg', '.psb', '.psm', '.ra',               '.ram', '.rel', '.sab', '.shn', '.smf', '.snd', '.speex',               '.tta', '.vox', '.vy3', '.wav', '.wave', '.wma',               '.wpk', '.wv', '.wvc')# Most common web page url file extensions# including dynamic server pages & cgi scripts.webpage_extns = ('', '.htm', '.html', '.shtm', '.shtml', '.php',                 '.php3','.php4','.asp', '.aspx', '.jsp','.psp','.pl',                 '.cgi', '.stx', '.cfm', '.cfml', '.cms', '.ars')# Document extensionsdocument_extns = ('.doc','.rtf','.odt','.odp','.ott','.sxw','.stw',                  '.sdw','.vor','.pdf','.ps')# Extensions for flash/flash source code/flash action scriptflash_extns = ('.swf', '.fla', '.mxml', '.as', '.abc')# Web-page extensions which automatically default to directories# These are special web-page types which are web-pages as well# as directories. 
Most common example is the .ars file extension# of arstechnica.com.default_directory_extns = ('.ars',)# Most common stylesheet url file extensionsstylesheet_extns = ( '.css', )# Regular expression for matching# urls which contain white spaceswspacere = re.compile(r'\s+\S+', re.LOCALE|re.UNICODE)# Regular expression for anchor tagsanchore = re.compile(r'\#+')# jkleven: Regex if we still don't recognize a URL address as HTML.  Only# to be used if we've looked at everything else and URL still isn't# a known type.  This regex is similar to one in pageparser.py but # we changed a few '*' to '+' to get one or more matches.  # form_re = re.compile(r'[-.:_a-zA-Z0-9]+\?[-.:_a-zA-Z0-9]+=[-.a:_-zA-Z0-9]*', re.UNICODE)# Made this more generic and lenient.form_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE)# Junk chars which cannot be part of valid filenamesjunk_chars = ('?','*','"','<','>','!',':','/','\\')# Replacement charsjunk_chars_repl = ('',)*len(junk_chars)# Dirty chars which need to be hex encoded in URLs (apart from white-space)# We are assuming that there won't be many idiots who would put a backslash in a URL...dirty_chars = ('<','>','(',')','{','}','[',']','^','`','|')# These are replaced with their hex counterpartsdirty_chars_repl = ('%3C','%3E','%28','%29','%7B','%7D','%5B','%5D','%5E','%60','%7C')# %xx char replacement regexppercent_repl = re.compile(r'\%[a-f0-9][a-f0-9]', re.IGNORECASE)# params_re = re.compile(r'([-.:_a-zA-Z0-9]*=[-.a:_-zA-Z0-9]*)+', re.UNICODE)# params_re = re.compile(r'([-.:_a-zA-Z0-9]*=[^\&]*)+', re.UNICODE)# Regexp which extracts params from query URLs, most genericparams_re = re.compile(r'([^&=\?]*=[^&=\?]*)', re.UNICODE)# Regular expression for validating a query param group (such as "lang=en")param_re = re.compile(r'([^&=\?]+=[^&=\?\s]+)', re.UNICODE)# ampersand regular expression at URL endampersand_re = re.compile(r'\&+$')# question mark regular expression at URL endquestion_re = re.compile(r'\?+$')# Regular expression for www prefixes at front of a stringwww_re = re.compile(r'^www(\d*)\.')# Regular expression for www prefixes anywherewww2_re = re.compile(r'www(\d*)\.')# List of TLD (top-level domain) name endings from http://data.iana.org/TLD/tlds-alpha-by-domain.txttlds = ['ac', 'ad', 'ae', 'aero', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq',        'ar', 'arpa', 'as', 'asia', 'at', 'au', 'aw','ax', 'az', 'ba', 'bb', 'bd',        'be', 'bf', 'bg', 'bh', 'bi', 'biz', 'bj', 'bm', 'bn', 'bo', 'br', 'bs',        'bt', 'bv', 'bw', 'by', 'bz', 'ca', 'cat', 'cc', 'cd', 'cf', 'cg', 'ch',        'ci', 'ck', 'cl', 'cm', 'cn', 'co', 'com', 'coop', 'cr', 'cu', 'cv', 'cx',        'cy', 'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'edu', 'ee', 'eg',        'er', 'es', 'et', 'eu', 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gb',        'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 'gov', 'gp', 'gq',        'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu',        'id', 'ie', 'il', 'im', 'in', 'info', 'int', 'io', 'iq', 'ir', 'is',        'it', 'je', 'jm', 'jo', 'jobs', 'jp', 'ke', 'kg', 'kh', 'ki', 'km', 'kn',        'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls',        'lt', 'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mil', 'mk',        'ml', 'mm', 'mn', 'mo', 'mobi', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu',        'museum', 'mv', 'mw', 'mx', 'my', 'mz', 'na', 'name', 'nc', 'ne', 'net',        'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'org', 'pa',        'pe', 'pf', 'pg', 
'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'pro', 'ps', 'pt',        'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd',        'se', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sr', 'st',        'su', 'sv', 'sy', 'sz', 'tc', 'td', 'tel', 'tf', 'tg', 'th', 'tj', 'tk',        'tl', 'tm', 'tn', 'to', 'tp', 'tr', 'travel', 'tt', 'tv', 'tw', 'tz',        'ua', 'ug', 'uk', 'um', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi',        'vn', 'vu', 'wf', 'ws', 'xn--0zwm56d', 'xn--11b5bs3a9aj6g', 'xn--80akhbyknj4f',        'xn--9t4b11yi5a', 'xn--deba0ad', 'xn--g6w251d', 'xn--hgbk6aj7f53bba',        'xn--hlcj6aya9esc7a', 'xn--jxalpdlp', 'xn--kgbechtv', 'xn--zckzah',        'ye', 'yt', 'yu', 'za', 'zm', 'zw']def get_base_server(server):    """ Return the base server name of  the passed    server (domain) name """    # If the server name is of the form say bar.foo.com    # or vodka.bar.foo.com, i.e there are more than one    # '.' in the name, then we need to return the    # last string containing a dot in the middle.    if server.count('.') > 1:        dotstrings = server.split('.')        # now the list is of the form => [vodka, bar, foo, com]        # Skip the list for skipping over tld domain name endings        # such as .org.uk, .mobi.uk etc. For example, if the        # server is games.mobileworld.mobi.uk, then we        # need to return mobileworld.mobi.uk, not mobi.uk        dotstrings.reverse()        idx = 0        for item in dotstrings:            if item.lower() in tlds:                idx += 1        return '.'.join(dotstrings[idx::-1])    else:        # The server is of the form foo.com or just "foo"        # so return it straight away        return server
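For illustration, below is a minimal usage sketch that is not part of netinfo.py. It assumes the module above is saved as netinfo.py on the import path and runs under Python 2 (the module's era), since the re.LOCALE flag used by wspacere is rejected for str patterns in newer Python 3 releases. The example URL and host names are hypothetical.

# Hypothetical usage sketch (not part of netinfo.py); assumes the module
# is importable as "netinfo" and a Python 2 interpreter.
import netinfo

url = 'http://www.example.org/search.php?lang=en&q=harvestman'

# Extract query parameters with the generic params_re regexp
print(netinfo.params_re.findall(url))        # ['lang=en', 'q=harvestman']

# Classify the path extension against the known web-page extensions
path = url.split('?')[0]
ext = '.' + path.rsplit('.', 1)[-1]
print(ext in netinfo.webpage_extns)          # True, '.php' is listed

# Reduce full host names to their base server, skipping stacked TLD endings
print(netinfo.get_base_server('vodka.bar.foo.com'))          # 'foo.com'
print(netinfo.get_base_server('games.mobileworld.mobi.uk'))  # 'mobileworld.mobi.uk'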
