📄 pagesum2.py
字号:
if m: ofnm = os.path.basename(fnm)if not ofnm: ofnm = raw_input('output files base? - base is %s\n?' % (basedir)) if not ofnm: ofnm = os.path.basename(fnm)if ofnm[0] == '-': ofnm = ofnm[1:] outfbase = os.path.join(basedir, ofnm)#print 'writing to'rep('Output written to:')writes = [['', []], ['.lt%d' % (nob_thresh), []], ['.gt%d' % (nob_thresh), []]]ofnms = []for suff2, write in writes: for suff1 in ['.dur', '.dur85', '.ndur', '.ndur85', '.del', '.del85', '.acc_del', '.ddf']: outfnm = outfbase + suff1 + suff2 ofnms.append(outfnm) try: f = open(outfnm, 'w') except IOError, s: print 'Couldn\'t open data file', s sys.exit(1) write.append(f.write) #print 'writing to', os.path.basename(outfnm) #print outfnm rep(outfnm)sumfilenm = outfbase + '.Summary'#print 'Summary file is', sumfilenmrep('Summary file is %s' % (sumfilenm))#sys.exit(0)totp = 0toto = 0badl = 0min_tm = 99999999999999999.9999max_tm = 0single_unlinked = 0## The following for page and object downloads info gathering#pagefirst = inobs = 1 # flagsnamecache = {}## ## entries for 'P' keyed entry are:# {(URL) host: (0/hostdata, 1/page_dict, 2/object_dict, 3/others_object_dict}# hostdata = {IP_addrs}# page_dict = {(URL path) page: [0/{linktypes (to page)}, 1/times-seen,# 2/-requested, 3/-downloaded,# 4/accum bytes, 5/{linktypes (in page)# 6/{page type}, 7/{downloaded object types},# 8/max discrete URL links in page]# object_dict = {(URL path) object: [0/times-seen,# 1/-requested, 2/-downloaded, 3/accum bytes,# 4/ {page_type}]# others_object_dict = same as object_dict but objects served for pages on different server#servs = {'T':{}, 'P':{}, 'U':{}, 'R':{}, 'I':{}, 'V':{}}line_err = 0###for fnm in args: if line_err == 1: s = replist.pop() rep_now('%s - **File truncated**' % (s)) line_err = 0 print os.path.basename(fnm) f = open(fnm, 'r') lno = -1 for l in f.readlines(): lno += 1 l = l.replace('\n', '') if l[0] == '#': m = None m = start_re.match(l) if m: strt = long(m.group(1))/1000000.0 #print 'start = ', start if start == 0: start = strt strt -= start print 'start is', start, 'offset is', strt if inobs: inobs = 0 # # denotes end of objects pass continue sf = l.rstrip().split(' ') intree = 1 #print sf #url, cli, serv, by, no, nc, tm, dur, ndur, dur85, ndur85, dflags = sf rt = sf[0] if rt == 'T': if len(sf) < 14: rep_now('Bad line file %s line %d \"%s"' % (fnm, lno, l)) line_err += 1 continue toto += 1 o_obno = int(sf[1]) sf = sf[2:] o_url = sf[0] o_serv = sf[1] o_connorder = int(sf[2]) o_nbytes = int(sf[3]) o_ltype = int(sf[4], 16) o_type = int(sf[5]) o_retcode = int(sf[6]) o_tm = float(sf[7]) + strt o_finger = (int(sf[8]), int(sf[9]), int(sf[10])) o_iscont = int(sf[11]) if o_iscont: o_alinks = int(sf[12]) nltypes = int(sf[13]) #assert len(sf) == 14 + nltypes*5 if len(sf) != 14 + nltypes*5: rep_now('Bad line file %s line %d \"%s"' % (fnm, lno, l)) line_err += 1 continue ldata = [] for i in range(nltypes): data = [] off = 14 + (i*5) for j in range(5): data.append(int(sf[off+j])) ldata.append(data) else: #assert len(sf) == 12 if len(sf) != 12: rep_now('Bad line file %s line %d \"%s"' % (fnm, lno, l)) line_err += 1 continue elif rt == 'P': #assert len(sf) == 20 if len(sf) != 20: rep_now('Bad line file %s line %d \"%s"' % (fnm, lno, l)) line_err += 1 continue totp += 1 pagenum = int(sf[1]) sf = sf[2:] url = sf[0] cli = sf[1] serv = sf[2] by = int(sf[3]) no = int(sf[4]) nc = int(sf[5]) ltype = int(sf[6], 16) ctype = int(sf[7]) nservs = int(sf[8]) tm = float(sf[9]) + strt dur = int(sf[10]) ndur = int(sf[11]) dur85 = int(sf[12]) ndur85 = int(sf[13]) dflags = int(sf[14]) acc_del = int(sf[15]) delv = long(sf[16]) cnt_del = int(sf[17]) pagefirst = inobs = 1 cont_seen = 0 elif rt in ['U', 'R', 'I', 'V']: #assert len(sf) == 10 if len(sf) != 10: rep_now('Bad line file %s line %d \"%s"' % (fnm, lno, l)) line_err += 1 continue totp += 1 intree = 0 url = sf[1] cli = sf[2] serv = sf[3] by = int(sf[4]) ctype = int(sf[5]) no = 1 nc = 1 tm = float(sf[6])/1000 + strt dur = int(sf[7]) dur85 = dur ndur = int(sf[8]) ndur85 = ndur dflags = int(sf[9]) acc_del = cnt_del = dur - ndur delv = acc_del*acc_del nservs = 1 ## if ctype == CT_TEXT_HTML or ctype == CT_TEXT_XML:## #single object page## single_unlinked += 1## intree = 1## ltype = 0 else: print 'pagesum - bad line %s %d: \"%s\"' % (fnm, lno, l) badl += 1 continue if rt == 'T': up = urlparse(o_url) o_host = up[1].split(':')[0] if not o_host: try: o_host = namecache[o_serv] except KeyError: o_host = o_serv else: namecache[o_serv] = o_host obnm = up[2] if not obnm: obnm = 'NK' got = o_retcode == 200 or o_retcode == 206 if pagefirst and o_host == host and obnm == page: #it's the page root container #print 'xx', pd[2] += 1 pd[4] += o_nbytes if got: pd[3] += 1 ptd = pd[6] # page object type dict ptd[o_type] = ptd.setdefault(o_type, 0) + 1 cont_seen = 1 else: # its a constituent object if o_host == host: # served by page host od = hd[2] else: od = servs['P'].setdefault(host, ({}, {}, {}, {}))[3] odd = od.setdefault(obnm, [0, 0, 0, 0, {}]) #print 'XXX', odd[0] += 1 odd[1] += 1 odd[3] += o_nbytes if got: odd[2] += 1 ptd = odd[4] ptd[o_type] = ptd.setdefault(o_type, 0) + 1 if got: ptd = pd[7] ptd[o_type] = ptd.setdefault(o_type, 0) + 1 if o_iscont and cont_seen: # add in page links data (transitive in case of frames) pd[8] = max(o_alinks, pd[8]) dd = pd[5] for lt in ldata: ld = dd.setdefault(lt[0], [0, 0, 0, 0]) ld[0] = max(ld[0], lt[1]) for i in range(2,5): ld[i-1] += lt[i] pagefirst = 0 else: if ndur == 0: continue #print rt, tm min_tm = min(tm, min_tm) max_tm = max(tm, max_tm) acc = accums[intree] acc.totpages += 1 ue = acc.ud.setdefault(url, [0,0,0,0]) se = acc.sd.setdefault(serv, [0,0,0,0]) ce = acc.cd.setdefault(cli, [0,0,0,0]) ue[0] += 1 se[0] +=1 ce[0] += 1 dl = dur - ndur dl85 = dur85 - ndur85 if dl: acc.npbdel += 1 ue[1] += 1 se[1] += 1 if dl85: acc.npbdel85 += 1 ue[2] += 1 se[2] += 1 if dflags: acc.npdel += 1 ue[3] += 1 se[3] += 1 acc.nobs += no acc.nconns += nc #print totdur, durlessdel, tot85dur, dur85lessdel if not intree: if no != 1: print 'Single object goof %d objects %s %d: %s' \ % (no, fnm, lno, l) if rt == 'U': acc.unlinked += 1 elif rt == 'R': acc.refr += 1 elif rt == 'I': acc.inv += 1 elif rt == 'V': acc.rvst += 1 else: print 'Invalid non-tree reason %s %d: %s' % (fnm, lno, l) sys.exit(1) write = writes[0][1] write[0]('%.3f\t%d\n' % (tm, dur)) write[1]('%.3f\t%d\n' % (tm, dur85)) write[2]('%.3f\t%d\n' % (tm, ndur)) write[3]('%.3f\t%d\n' % (tm, ndur85)) if dl: write[4]('%.3f\t%d\n' % (tm, dl)) if acc_del and dur: write[6]('%.3f\t%.2f\n' % (tm, ((acc_del/no)*100.0)/ndur)) write[7]('%.3f\t%.2f\n' % ( tm, (sqrt(delv/no))/ndur) ) if dl85: write[5]('%.3f\t%d\n' % (tm, dur85-ndur85)) if no > nob_thresh: write = writes[2][1] acc.lp += 1 else: write = writes[1][1] acc.sp += 1 write[0]('%.3f\t%d\n' % (tm, dur)) write[1]('%.3f\t%d\n' % (tm, dur85)) write[2]('%.3f\t%d\n' % (tm, ndur)) write[3]('%.3f\t%d\n' % (tm, ndur85)) if dl: write[4]('%.3f\t%d\n' % (tm, dur-ndur)) if acc_del and dur: write[6]('%.3f\t%.2f\n' % (tm, ((acc_del/no)*100.0)/ndur)) write[7]('%.3f\t%.2f\n' % (tm, (sqrt(delv/no))/ndur)) if dl85: write[5]('%.3f\t%d\n' % (tm, dur85-ndur85)) if acc_del and not (dur-ndur): acc.no_del_del += 1 acc.nservd[nservs] += 1 # page/object info gathering if rt == 'P': up = urlparse(url) host = up[1].split(':')[0] if not host: try: host = namecache[serv] except KeyError: host = serv else: namecache[serv] = host page = up[2] if not page: page = 'NK' hd = servs['P'].setdefault(host, ({}, {}, {}, {})) hhd = hd[0] # accumulate IP addrs for this (URL) host hhd[serv] = hhd.setdefault(serv, 0) + 1 #accumulate pages from this (URL) host pd = hd[1].setdefault(page, [{}, 0, 0, 0, 0, {}, {}, {}, 0]) #print '%x' % ltype ld = pd[0] # accumulate link types to page ld[ltype] = ld.setdefault(ltype, 0) + 1 # accumulate times seen pd[1] += 1if not totp: print 'No pages in page file(s)' sys.exit(1)print totp, 'pages'print toto, 'objects'print '%d/%d bad lines' % (badl, totp)print 'times:', min_tm, max_tm accum_tot(accums[0], accums[1], accums[2])for i in [1, 0, 2]: acc = accums[i] acc.nurl = len(acc.ud) acc.nserv = len(acc.sd) acc.ncli = len(acc.cd) acc.nsdel = 0 acc.nsbdel = 0 for s in acc.sd.values(): if s[1]: acc.nsbdel += 1 if s[3]: acc.nsdel += 1 acc.nudel = 0 acc.nubdel = 0 for u in acc.ud.values(): if u[1]: acc.nubdel += 1 if u[3]: acc.nudel += 1 acc.report()userv = 0itsd = accums[1].sdfor s in accums[0].sd.keys(): if not itsd.has_key(s): userv += 1 uurl = 0itud = accums[1].udfor s in accums[0].ud.keys(): if not itud.has_key(s): uurl += 1 rep('%d servers %d urls not seen in trees' % (userv, uurl))rep('%d single unlinked' % (single_unlinked))do_pagestuff(servs['P']) rep_rep()## for fn in ofnms:## #tmpfile = os.tempnam('/tmp')## tmpfile = fn + '.sorted'## sortcmd = 'sort -n -o %s %s' % (tmpfile, fn) ## mvcmd = 'mv %s %s ' % (tmpfile, fn)## for cmd in [sortcmd, mvcmd]:## status, output = commands.getstatusoutput(cmd)## if status:## print cmd, 'failed with', output
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -