📄 do_csum.s
Font size:
/*
 * Optimized version of the standard do_csum() function
 *
 * Return: a 64bit quantity containing the 16bit Internet checksum
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 1999 Hewlett-Packard Co
 * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
 *
 */

#include <asm/asmmacro.h>

//
// Theory of operations:
//	The goal is to go as quickly as possible to the point where
//	we can checksum 8 bytes/loop. Before reaching that point we must
//	take care of incorrect alignment of the first byte.
//
//	The code hereafter also takes care of the "tail" part of the buffer
//	before entering the core loop, if any. The checksum is a sum so it
//	allows us to commute operations. So we do the "head" and "tail"
//	first to finish at full speed in the body. Once we get the head and
//	tail values, we feed them into the pipeline, very handy initialization.
//
//	Of course we deal with the special case where the whole buffer fits
//	into one 8 byte word. In this case we have only one entry in the pipeline.
//
//	We use a (3+1)-stage pipeline in the loop to account for possible
//	load latency and also to accommodate for head and tail.
//
//	The end of the function deals with folding the checksum from 64bits
//	down to 16bits taking care of the carry.
//
//	This version avoids synchronization in the core loop by also using a
//	pipeline for the accumulation of the checksum in result[].
//
//	Rotating-register roles (p[] is the data pipeline, result[] the
//	checksum-accumulation pipeline; both rotate on each br.ctop):
//
//	 p[]
//	|---|
//	0|   | r32 : new value loaded in pipeline
//	|---|
//	1|   | r33 : in transit data
//	|---|
//	2|   | r34 : current value to add to checksum
//	|---|
//	3|   | r35 : previous value added to checksum (previous iteration)
//	|---|
//
//	 result[]
//	|---|
//	0|   | r36 : new checksum
//	|---|
//	1|   | r37 : previous value of checksum
//	|---|
//	2|   | r38 : final checksum when out of the loop (after 2 epilogue rots)
//	|---|
//
//
// NOT YET DONE:
//	- Take advantage of the MMI bandwidth to load more than 8byte per loop
//	  iteration
//	- use the lfetch instruction to augment the chances of the data being in
//	  the cache when we need it.
//	- Maybe another algorithm which would take care of the folding at the
//	  end in a different manner
//	- Work with people more knowledgeable than me on the network stack
//	  to figure out if we could not split the function depending on the
//	  type of packet or alignment we get. Like the ip_fast_csum() routine
//	  where we know we have at least 20bytes worth of data to checksum.
//	- Look at RFCs about checksums to see whether or not we can do better
//
//	- Do a better job of handling small packets.
//

// Register aliases used throughout the function body.
#define saved_pfs	r11	// saved ar.pfs, restored on exit
#define hmask		r16	// head mask: clears bytes before the first byte
#define tmask		r17	// tail mask: clears bytes after the last byte
#define first		r18	// 8-byte-aligned address of first word
#define firstval	r19	// contents of the first word
#define firstoff	r20	// byte offset of buf within the first word
#define last		r21	// 8-byte-aligned address of word holding last byte
#define lastval		r22	// contents of the last word
#define lastoff		r23	// byte offset just past the last byte in its word
#define saved_lc	r24	// saved ar.lc (loop count), restored on exit
#define saved_pr	r25	// saved predicates (rotation clobbers them)
#define tmp1		r26
#define tmp2		r27
#define tmp3		r28
#define carry		r29	// running count of carries out of the 64bit sum

#define buf		in0	// first argument: buffer address
#define len		in1	// second argument: length in bytes (32bit)

	.text

	.psr abi64
	.psr lsb
	.lsb

// unsigned long do_csum(unsigned char *buf, int len)

GLOBAL_ENTRY(do_csum)
	UNW(.prologue)
	UNW(.save ar.pfs, saved_pfs)
	alloc saved_pfs=ar.pfs,2,8,0,8	// 2 in, 8 rotating locals, 8 rotating

	.rotr p[4], result[3]		// data pipeline / checksum pipeline
	mov ret0=r0		// in case we have zero length
	cmp4.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
	;;			// avoid WAW on CFM
	mov tmp3=0x7		// a temporary mask/value
	add tmp1=buf,len	// last byte's address
(p6)	br.ret.spnt.few rp	// return if true (hope we can avoid that)

	and firstoff=7,buf	// how many bytes off for first element
	tbit.nz p10,p0=buf,0	// is buf an odd address ? (byteswap needed at end)
	mov hmask=-1		// initialize head mask
	;;
	andcm first=buf,tmp3	// 8byte aligned down address of first element
	mov tmask=-1		// initialize tail mask
	adds tmp2=-1,tmp1	// last-1
	;;
	and lastoff=7,tmp1	// how many bytes off for last element
	andcm last=tmp2,tmp3	// address of word containing last byte
	UNW(.save pr, saved_pr)
	mov saved_pr=pr		// preserve predicates (rotation)
	;;
	sub tmp3=last,first	// tmp3=distance from first to last
	cmp.eq p8,p9=last,first	// everything fits in one word ?
	sub tmp1=8,lastoff	// complement to lastoff

	ld8 firstval=[first],8	// load,ahead of time, "first" word
	shl tmp2=firstoff,3	// number of bits
	;;
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
(p9)	ld8 lastval=[last]	// load,ahead of time, "last" word, if needed
(p8)	mov lastval=r0		// we don't need lastval if first==last
	mov result[1]=r0	// initialize result
	;;
	shl tmp1=tmp1,3		// number of bits
	shl hmask=hmask,tmp2	// build head mask, mask off [0,firstoff[
	;;
	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
	UNW(.save ar.lc, saved_lc)
	mov saved_lc=ar.lc	// save lc
	;;
	UNW(.body)
(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
(p9)	and p[1]=lastval,tmask	// mask last value as appropriate
	shr.u tmp3=tmp3,3	// we do 8 bytes per loop (word count - 1)
	;;
	cmp.lt p6,p7=2,tmp3	// tmp3 > 2 ?
	and p[2]=firstval,hmask	// and mask it as appropriate
	add tmp1=-2,tmp3	// -2 = -1 (br.ctop) -1 (last-first)
	;;
	// XXX Fixme: not very nice initialization here
	//
	// Setup loop control registers:
	//
	// tmp3=0 (1 word)    : lc=0, ec=2, p16=F
	// tmp3=1 (2 words)   : lc=0, ec=3, p16=F
	// tmp3=2 (3 words)   : lc=0, ec=4, p16=T
	// tmp3>2 (4 or more) : lc=tmp3-2, ec=4, p16=T
	//
	cmp.eq p8,p9=r0,tmp3	// tmp3 == 0 ?
(p6)	mov ar.lc=tmp1
(p7)	mov ar.lc=0
	;;
	cmp.lt p6,p7=1,tmp3	// tmp3 > 1 ?
(p8)	mov ar.ec=2		// we need the extra rotation on result[]
(p9)	mov ar.ec=3		// hard not to set it twice sometimes
	;;
	mov carry=r0		// initialize carry
(p6)	mov ar.ec=4
(p6)	mov pr.rot=0xffffffffffff0000	// p16=T, p18=T
	cmp.ne p8,p0=r0,r0	// p8 is false
	mov p[3]=r0		// make sure first compare fails
(p7)	mov pr.rot=0xfffffffffffe0000	// p16=F, p18=T
	;;
	// Core loop: one 8-byte word per iteration. The unsigned compare
	// detects a carry out of the previous 64bit add one stage late
	// (p8 is consumed on the NEXT iteration), which is what keeps this
	// loop free of a same-cycle RAW on the accumulator.
1:
(p16)	ld8 p[0]=[first],8	// load next
(p8)	adds carry=1,carry	// add carry on prev_prev_value
(p18)	add result[0]=result[1],p[2]	// new_res = prev_res + cur_val
	cmp.ltu p8,p0=result[1],p[3]	// p8= prev_result < prev_val
	br.ctop.dptk.few 1b	// loop until lc--==0
	;;			// RAW on carry when loop exits
(p8)	adds carry=1,carry;;	// correct for carry on prev_value
	add result[2]=carry,result[2];;	// add carry to final result
	cmp.ltu p6,p7=result[2], carry	// check for new carry
	;;
(p6)	adds result[2]=1,result[1]	// correct if required
	movl tmp3=0xffffffff
	;;
	// XXX Fixme
	//
	// now fold 64 into 16 bits taking care of carry
	// that's not very good because it has lots of sequentiality
	//
	and tmp1=result[2],tmp3		// low 32 bits
	shr.u tmp2=result[2],32		// high 32 bits
	;;
	add result[2]=tmp1,tmp2		// 64 -> 33 bits
	shr.u tmp3=tmp3,16		// mask is now 0xffff
	;;
	and tmp1=result[2],tmp3
	shr.u tmp2=result[2],16
	;;
	add result[2]=tmp1,tmp2		// fold again, may still carry
	;;
	and tmp1=result[2],tmp3
	shr.u tmp2=result[2],16
	;;
	add result[2]=tmp1,tmp2
	;;
	and tmp1=result[2],tmp3
	shr.u tmp2=result[2],16
	;;
	add ret0=tmp1,tmp2		// final 16bit checksum in ret0
	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
	;;
	// if buf was odd then swap bytes
	mov ar.pfs=saved_pfs		// restore ar.ec
(p10)	mux1 ret0=ret0,@rev		// reverse word
	;;
	mov ar.lc=saved_lc
(p10)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
	br.ret.sptk.few rp
END(do_csum)
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Fullscreen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -