📄 pagesum2.py
字号:
#! /usr/bin/env python################################################################################ ## Copyright 2005 University of Cambridge Computer Laboratory. ## ## This file is part of Nprobe. ## ## Nprobe is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## Nprobe is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Nprobe; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ## ################################################################################from sys import argvimport getoptimport osimport sysimport reimport commandsfrom math import sqrtimport Numericfrom urlparse import urlparsefrom socket import inet_atonimport np_warningsfrom np_treestats import NOB_THRESHfrom np_TCPConn import D_BIGDELfrom nprobe import CT_TEXT_HTML, CT_TEXT_XML, http_server_objtype_stringfrom histo import Histogram, HistogramErrorN_SERV_BKTS = 1000replist = []def rep(s): replist.append(s)def rep_now(s): print s rep(s)def rep_per(args): boss_v, boss_t = args[0] if not boss_v: return rep('%d %s:' % (boss_v, boss_t)) for v, t in args[1:]: if v: pcf = 100.0/boss_v rep('\t%d %s (%.2f%%)' % (v, t, v*pcf))def rep_rep(): try: sf = open(sumfilenm, 'w') except IOError, s: print 'Couldn\'t open summary file', s sys.exit(1) for s in replist: print s sf.write(s + '\n') acc_fields = ['totpages', 'nobs', 'nconns', 'npdel', 'npbdel', 'npbdel85', 'no_del_del', 'sp', 'lp', 'unlinked', 'refr', 'inv', 'rvst']class accum: def __init__(self, what): self.what = what for f in acc_fields: setattr(self, f, 0) self.ud = {} self.sd = {} self.cd = {} self.nservd = Numeric.zeros(N_SERV_BKTS,) def report(self): rep(self.what) rep('%d servers %d clients %d obs %d conns' \ % (self.nserv, self.ncli, self.nobs, self.nconns)) rep_per([(self.nurl, 'URLs')]) page_per = [ (self.totpages, 'Page downloads'), (self.sp, 'lt %d obs' % (nob_thresh)), (self.lp, 'gt %d obs' % (nob_thresh)) ] single = self.unlinked+self.refr+self.inv+self.rvst if single: page_per += [ (single, 'single objects not in trees'), (self.unlinked, 'unlinked'), (self.refr, 'refreshes'), (self.rvst, 'revisits'), (self.inv, 'invalid') ] if self.npdel: page_per += [ (self.npdel, 'delayed'), (self.npbdel, 'long delayed'), (self.npbdel85, 'long delayed at 85%') ] if self.no_del_del: page_per += [(self.no_del_del, 'delays not adding to total')] rep_per(page_per) rep_per([(self.nserv, 'Servers'), (self.nsdel, 'delayed'), (self.nsbdel, 'long delayed') ]) rep_per([(self.nurl, 'URLs'), (self.nudel, 'delayed'), (self.nubdel, 'long delayed') ]) rep('Distribution of servers/page') for i in range(N_SERV_BKTS): ns = self.nservd[i] if ns: rep(' %d %12d' % (i, ns)) rep('\n')def accum_tot(a, b, c): c.ud = {} c.ud.update(a.ud) for v in b.ud.items(): s = v[1] e = c.ud.setdefault(v[0], [0,0,0,0]) e[0] += s[0] e[1] += s[1] e[2] += s[2] e[3] += s[3] c.sd = {} c.sd.update(a.sd) for v in b.sd.items(): s = v[1] e = c.sd.setdefault(v[0], [0,0,0,0]) e[0] += s[0] e[1] += s[1] e[2] += s[2] e[3] += s[3] c.cd = {} c.cd.update(a.cd) for v in b.cd.items(): s = v[1] e = c.cd.setdefault(v[0], [0,0,0,0]) e[0] += s[0] e[1] += s[1] e[2] += s[2] e[3] += s[3] for i in range(N_SERV_BKTS): c.nservd[i] = a.nservd[i] + b.nservd[i] for f in acc_fields: setattr(c, f, getattr(a, f) + getattr(b, f))def do_pagestuff(sd): def dl_accum(d, s, e): a = [0]*(e-s+1) for v in d.values(): n = 0 for i in range(s, e+1): a[n] += v[i] n += 1 return a def by_0(a, b): return int(b[0]-a[0]) def by_1(a, b): return int(b[1]-a[1]) def by_2(a, b): return int(b[2]-a[2]) def by_3_0(a, b): return int(b[3][0]-a[3][0]) def by_3_1(a, b): return int(b[3][1]-a[3][1]) def by_3_2(a, b): return int(b[3][2]-a[3][2]) def by_4_1(a, b): return int(b[4][1]-a[4][1]) def by_4_2(a, b): return int(b[4][2]-a[4][2]) def by_5_1(a, b): return int(b[5][1]-a[5][1]) def by_5_2(a, b): return int(b[5][2]-a[5][2]) def by_7(a, b): return long(b[7]-a[7]) def add_3(a, b): a[0] += b[0] a[1] += b[1] a[2] += b[2] def object_type_string(otype): return http_server_objtype_string(otype).replace('/', '-') slist = [] tot_bytes = 0 tot_disc_p = tot_disc_o = tot_disc_oo = 0 allptots = [0, 0, 0] allotots = [0, 0, 0] allootots = [0, 0, 0] plinksd = {} intlinksd = {} otypesd = {} uhist = Histogram(lower=0, bucketsz=1) page_n = 0 for s, (hdd, pd, od, odd) in sd.items(): ptots = dl_accum(pd, 1, 4) add_3(allptots, ptots) otots = dl_accum(od, 0, 3) add_3(allotots, otots) ootots = dl_accum(odd, 0, 3) add_3(allootots, ootots) totb = ptots[-1] + otots[-1] + ootots[-1] tot_bytes += totb disc_p = len(pd) tot_disc_p += disc_p disc_o = len(od) tot_disc_o += disc_o disc_oo = len(odd) tot_disc_oo += disc_oo slist.append((disc_p, disc_o, disc_oo, ptots, otots, ootots, s, totb)) for p in pd.values(): page_n += 1 uhist.add(p[8]) ld = p[0] # links to page for ltype, n in ld.items(): plinksd[ltype] = plinksd.setdefault(ltype, 0) + n ld = p[5] # links within page for ltype, (max_ndisc, nd_disc, followed, dups) in ld.items(): ent = intlinksd.setdefault(ltype, (Histogram(lower=0, bucketsz=1), Histogram(lower=0, bucketsz=1), Histogram(lower=0, bucketsz=1), Histogram(lower=0, bucketsz=1))) for h, v, div in [ (ent[0], max_ndisc, 0), (ent[1], nd_disc, 1), (ent[2], followed, 1), (ent[3], dups, 1) ]: if v: if div: v = v/p[3] h.add(v) otypes = p[7] for ot, n in otypes.items(): h = otypesd.setdefault(ot, Histogram(lower=0, bucketsz=1)) h.add(n) slist.sort() slist.reverse() for lab, srt, f1, f2, tot in [ ('discrete page references', by_0, 0, None, tot_disc_p), ('discrete object references', by_1, 1, None, tot_disc_o), ('discrete others object references', by_2, 2, None, tot_disc_oo), ('pages encountered', by_3_0, 3, 0, allptots[0]), ('pages requested', by_3_1, 3, 1, allptots[1]), ('pages downloaded', by_3_2, 3, 2, allptots[2]), ('objects requested', by_4_1, 4, 1, allotots[1]), ('objects downloaded', by_4_2, 4, 2, allotots[2]), ('others objects requested', by_5_1, 5, 1, allootots[1]), ('others objects downloaded', by_5_2, 5, 2, allootots[2]),# ('bytes downloaded', by_7, 7, None, tot_bytes) ]: rep('top servers by %s:\n' % (lab)) slist.sort(srt) rest_val = 0 rest_pc = 0.0 for stuff in slist: if f2 == None: val = stuff[f1] else: val = stuff[f1][f2] pc = (val*100.0)/tot if pc >= 2.5: rep('\t%s %d (%.2f%%)' % (stuff[6], val, pc)) else: rest_val += val rest_pc += pc rep('\tOther %d (%.2f%%)' % (rest_val, rest_pc)) rep('\n') # links to pages rep('Link types to pages encountered:\n') pagelinks = [(n, ltype) for ltype, n in plinksd.items()] pagelinks.sort() pagelinks.reverse() rest_val = 0 rest_pc = 0.0 for n, ltype in pagelinks: pc = (n*100.0)/allptots[0] if pc >= 2.5: rep('0x%x %d (%.2f%%)' % (ltype, n, pc)) else: rest_val += n rest_pc += pc rep('Other %d (%.2f%%)' % (rest_val, rest_pc)) rep('\n') linksdir = os.path.join(basedir, 'links_data') try: os.makedirs(linksdir) except OSError,s: if str(s).find('File exists') < 0: raise intlinks = [(lt, lhists) for lt, lhists in intlinksd.items()] intlinks.sort() comm = 'First column is number of links, second is No. pages occurring' for lt, hists in intlinks: lts = '0x%x' % (lt) for h, fn, tit in [ (hists[0], 'oa_max', 'max over all sightings of a page'), (hists[1], 'ave_seen', 'average No. over page downloads'), (hists[2], 'ave_followed', 'average No. followed'), (hists[3], 'ave_duplicated', 'average No. duplicated') ]: fnm = os.path.join(linksdir, '%s-%s' % (lts, fn)) #f = open(fnm, 'w') try: h.results(zeros=0, file=fnm, title=tit, comment=comm) except HistogramError, s: if str(s).find('No samples presented') >= 0: continue else: raise fnm = os.path.join(linksdir, 'disc_urls') try: uhist.results(zeros=0, file=fnm, title='disc_urls', comment='Max number of discrete URL links of all types over all sightings of a page\n - first column is number of URLs, second is No. pages occurring') except HistogramError, s: if str(s).find('No samples presented') < 0: raise typesdir = os.path.join(basedir, 'types_per_page') try: os.makedirs(typesdir) except OSError,s: if str(s).find('File exists') < 0: raise for type, h in otypesd.items(): typestr = object_type_string(type) fnm = os.path.join(typesdir, typestr) try: h.results(zeros=0, file=fnm, comment='Distribution of downloaded object types per page\n - first column is number of obs. of the type, second column is No. pages occuring', title='Downloaded types distribution') except HistogramError, s: if str(s).find('No samples presented') < 0: raise scriptname = os.path.basename(argv[0])ofnm = Nonenob_thresh = NOB_THRESHtry: optlist, args = getopt.getopt(sys.argv[1:], 'o:n:')except getopt.error, s: print '%s: %s' % (scriptname, s) usage(scriptname) sys.exit(1)for opt in optlist: if opt[0] == '-o': ofnm = opt[1] if opt[0] == '-n': nob_thresh = opt[1]start_re = re.compile('# Run start = ([0-9]*).*')start = 0accums = [accum('NOT IN TREES:'), accum('IN_TREES:'), accum('TOTAL:')]basedir = os.path.dirname(args[0])basedir = os.path.join(basedir, 'Page_results')try: os.makedirs(basedir)except OSError,s: if str(s).find('File exists') < 0: raisefrange = []suffs = []pref_re = re.compile('(.\.rep\.\d*)-(.\.rep\.\d*)(\..*)')print 'files from', os.getcwd(), ':'rep('files from %s:' % (os.getcwd()))for fnm in args: fnm = os.path.basename(fnm) print ' ', os.path.basename(fnm) m = pref_re.match(fnm) if m: #print m.group(1), m.group(2), m.group(3) frange.append(m.group(1)) frange.append(m.group(2)) suffs.append(m.group(3))if frange: for suff in suffs[1:]: if suff != suffs[0]: print 'Ouch mixed suffix:', suff frange.sort() ofnm = frange[0] + '-' + frange[-1] + suffs[0] #print frangeelse: pref_re = re.compile('(.\.rep\.\d*)\.(.*)') m = pref_re.match(os.path.basename(fnm)) if m: #print m.group(1), m.group(2) ofnm = m.group(0)if not ofnm: pref_re = re.compile('.*\.Pages') m = pref_re.match(os.path.basename(fnm))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -