//------------------------------------------------------------------
// file: vec_csum.S
//    AltiVec enabled version of linux' checksum routines
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2003
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.  Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern unsigned long csum_partial_copy_generic(src, dst, len, sum,
//                                                src_err, dst_err);
//
//	Computes the checksum of a memory block at src, length len,
//	and adds in "sum" (32-bit), while copying the block to dst.
//	Returns:
//		unsigned long sum
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern unsigned long csum_partial(buff, len, sum);
//
//	Computes the checksum of a memory block at buff, length len,
//	and adds in "sum" (32-bit unsigned long).
//	Returns:
//		unsigned long sum
//------------------------------------------------------------------

//------------------------------------------------------------------
// Assumptions from studying the original linux code:
//	Copying forward is always safe.
//	src and dst are always half-word aligned.
//	len may be odd or even, 0-n.
//	There is no test to see if src and dst are equal.
//	Returns unsigned int checksum.
//------------------------------------------------------------------

//------------------------------------------------------------------
// Revision History:
//	Rev 0.0	Original	Chuck Corley	04/19/03
//
// This is alpha quality code; users are encouraged to make it faster.
//
// ASSUMPTIONS:
//	Code is highly likely to be in the cache; data is not (streaming data).
//	Don't use vectors for BC <= MIN_VEC.  Works only if MIN_VEC >= 32 bytes.
//------------------------------------------------------------------
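
//------------------------------------------------------------------
// For reference, a minimal C sketch of what these routines compute.
// The name csum_partial_copy_ref and its exact prototype are
// illustrative assumptions, not the kernel API.  The partial sum is
// accumulated from big-endian 16-bit words; a trailing odd byte is
// padded into the high half of a halfword, exactly as the scalar
// path below does with rlwinm.  Folding the 32-bit sum to 16 bits
// and taking the one's complement is left to the caller.
//
//	static unsigned long
//	csum_partial_copy_ref(const unsigned char *src, unsigned char *dst,
//	                      int len, unsigned long sum)
//	{
//	    for (; len > 1; src += 2, dst += 2, len -= 2) {
//	        dst[0] = src[0];                 /* copy while summing */
//	        dst[1] = src[1];
//	        sum += ((unsigned long)src[0] << 8) | src[1];
//	    }
//	    if (len) {                           /* odd trailing byte  */
//	        dst[0] = src[0];
//	        sum += (unsigned long)src[0] << 8;
//	    }
//	    return sum;                          /* 32-bit partial sum */
//	}
//------------------------------------------------------------------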
#define MIN_VEC 48	// Experimentally chosen on 7455@1GHz/133 to beat scalar

// Register usage
#define Rt	r0	// r0 when used as a temporary register

#define SRC	r3	// entering: src ptr; exiting: unsigned long checksum
#define DST	r4	// entering: dst pointer; exiting:
#define BC	r5	// entering: Byte_Count
#define SUM	r6	// entering: Partial checksum
#define SER	r7	// entering: src_err address
#define DER	r8	// entering: dst_err address

#define DM2	r9	// dst-2 for hw-by-hw forwards initially
#define D	r9	// dst[28:31]
#define DR	r9	// dst[0:27]
#define DNX	r9	// (dst+n*16)[28:31]
#define BL	r9	// second byte_kount index pointer

#define DBC	r10	// dst + byte count initially
#define DBK	r10	// (dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define SM2	r11	// src-2 for hw-by-hw forwards initially
#define QW	r11	// number of quad words (vectors)
#define SP8	r11	// data stream touch block & stride info for Big_loop
#define SBC	r11	// src + byte count initially then src[28:31]

#define BK	r12	// Byte Kount index
#define BLK	r12	// temporary data stream touch block & stride info
#define S	r12	// src[28:31]
#define DMS	r12	// dst - src initially

#define V0	v0	// all zeros
#define VCARS	v0	// sum of carries
#define V1	v1	// all ones
#define VMM	v1	// mask for final dst right
#define VS0	v2	// src vector for permuting
#define VL	v2	// low data
#define VS1	v3	// src vector for permuting
#define VH	v3	// high data
#define VPS0	v4	// permuted source vector to store
#define VP2	v5	// dst permute register
#define VM	v5	// mask for first dst left
#define VS2	v5	// src vector for permuting
#define VP3	v6	// d - s permute register
#define VS3	v6	// 4th src vector in csum_partial
#define VP4	v7	// Byte_Count permute register
#define VPS1	v7	// 2nd permuted source vector to store
#define VSUM	v8	// Updated sum
#define VFIN	v8	// final sum
#define VCAR1	v9	// temp register for carries
#define VCAR3	v9	// temp register for carries
#define VCAR2	v10	// temp register for carries
#define VCARF	v11	// temp register for carries
#define VTEMP	v12	// Temp register

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc, codewarrior, and diab don't assemble dcba; emit the opcode by hand:
// 0x7c0465ec = (31<<26)|(4<<16)|(12<<11)|(758<<1) = dcba r4,r12 = dcba DST,BK
#define DCBK .long 0x7c0465ec
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c0465ec
.endm
#else
#define DCBK dcba DST,BK
#endif	// __ghs__
#endif	// __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif	// NO_DCBA

// Conditionalize the use of dst (data stream touch).  It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz).  Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_F	dst SRC,BLK,0
#define STRM_1	dst SP8,Rt,1
#else
#define STRM_F	nop
#define STRM_1	nop
#endif

	.text
#if __MWERKS__
	.align	16
#define SP	r1
#else
	.align	4
#endif

#ifdef LIBMOTOVEC
	.global	csum_partial_copy_generic_vec
csum_partial_copy_generic_vec:
#else
	.global	vec_csum_partial_copy_generic
vec_csum_partial_copy_generic:
#endif
	li	BK,32			// IU1 Index of next cache line
	rlwinm	Rt,BC,31,1,31		// IU1 BC/2
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	dcbt	SRC,BK			// LSU Prefetch next cache line
	cmpi	cr6,0,Rt,0		// IU1 BC/2 == 0?
	addic	SUM,SUM,0		// IU1 Zero carry bit
	addi	SM2,SRC,-2		// IU1 Pre-bias and duplicate src
	add	DBC,DST,BC		// IU1 Address of last dst byte + 1
	bgt	cr7,v_csumcpy		// b if BC>MIN_VEC (will copy vectors fwd)

// Scalar case: copy and sum halfword by halfword, then the odd byte if any.
	andi.	BK,BC,1			// IU1 BC[31]==0?
	addi	DM2,DST,-2		// IU1 Pre-bias and duplicate destination
	add	S,SRC,BC		// IU1 Last src byte + 1 (temp use of S)
	beq	cr6,No_HWs		// b if BC/2==0
	mtctr	Rt			// i=BC/2; do ...;i--; while (i>0)
HW_cpy:
	lhzu	Rt,2(SM2)		// LSU Load next src halfword
	sthu	Rt,2(DM2)		// LSU Store it to dst
	addc	SUM,SUM,Rt		// IU1 Accumulate halfword into sum
	bdnz	HW_cpy
No_HWs:
	beq	BC_even			// b if BC[31]==0 (or DBC[31]==0 when aligned)
	lbz	Rt,-1(S)		// LSU Get last src address byte
	stb	Rt,-1(DBC)		// LSU Store to last dst address byte
	rlwinm	Rt,Rt,8,16,23		// IU1 Shift odd byte left
	addc	SUM,SUM,Rt		// IU1 Accumulate padded odd byte
BC_even:
	addze	SRC,SUM			// IU1 Fold in carry; return sum in r3
	blr

v_csumcpy:
	lvsr	VP2,0,DST		// LSU Permute vector for initial byte mask
	rlwinm	D,DST,0,28,31		// IU1 D = dst[28:31]
	rlwinm	S,SRC,0,28,31		// IU1 Save src address bits s[28:31]
	lvsr	VP4,DST,BC		// LSU Permute vector for final byte mask
	subf.	S,S,D			// IU1 if D-S<0 essentially shifting left
	subf	DMS,SRC,DST		// IU1 Compute dst-src difference
	lvsr	VP3,0,DMS		// LSU Permute vector for dst - src shft right
	li	BK,64			// IU1 Index of next cache line
	vxor	V0,V0,V0		// VIU Clear v0
	dcbt	SRC,BK			// LSU Prefetch next cache line at src+64
	cmpi	cr1,0,D,0		// IU1 Is D0 left justified?
	vnor	V1,V0,V0		// VIU1 Create a vector of all ones
	addi	DR,DST,16		// IU1 Address of second dst vector
	addi	DBK,DBC,-1		// IU1 Address of last dst byte
	vperm	VM,V1,V0,VP2		// VPU D0 select vector for dst left; src right
	bge	Ld_bytes_rt		// b if shifting right (D-S>=0)
	lvx	VS0,0,SRC		// LSU Get S0 load started
	addi	SRC,SRC,16		// IU1 Increment src base (to keep BK useful)
Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is
	lvx	VS1,0,SRC		// LSU Get S1 (or S0 if D-S>=0) in upper vector
	rlwinm	DR,DR,0,0,27		// IU1 (DST+16)[0:27]
	vperm	VMM,V0,V1,VP4		// VPU DN select vector for src left; dst right
	subf	QW,DR,DBK		// IU1 Bytes of full vectors to move (-16)
	vxor	VPS1,VPS1,VPS1		// VIU Clear VPS1
	vxor	VCARF,VCARF,VCARF	// VIU1 Clear VCARF
	rlwinm	QW,QW,28,4,31		// IU1 Quad words remaining
	rlwinm	Rt,DBK,0,28,31		// IU1 (DBK = DST+BC-1)[28:31]
	li	BK,96			// IU1 Index of next cache line
	cmpi	cr5,0,Rt,0xF		// IU1 Is DN right justified?
	subf	Rt,DST,DR		// IU1 How many bytes in first destination?
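
//------------------------------------------------------------------
// A minimal C sketch of the realignment the vperm below performs,
// using a hypothetical helper align_window (illustrative, not any
// real API): lvsr on dst-src yields a permute control that slides a
// 16-byte window across two consecutive source vectors, so each
// store is destination-aligned even though src and dst differ in
// alignment.
//
//	static void align_window(const unsigned char prev[16],
//	                         const unsigned char curr[16],
//	                         unsigned shift,   /* 0..15, from lvsr */
//	                         unsigned char out[16])
//	{
//	    unsigned char window[32];       /* concat(prev, curr): the */
//	    memcpy(window, prev, 16);       /* two vperm source        */
//	    memcpy(window + 16, curr, 16);  /* operands                */
//	    memcpy(out, window + shift, 16);/* take bytes shift..shift+15 */
//	}
//------------------------------------------------------------------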
	mtctr	QW			// IU2 Loop count = quad words to store
	cmpi	cr6,0,QW,4		// IU1 Check QW>4
	mtcrf	0x01,Rt			// IU2 Put bytes in 1st dst in cr7
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower
	dcbt	SRC,BK			// LSU Prefetch next cache line at src+96
	beq	cr1,Left_just		// b if D0 is left justified

	li	BK,0			// IU1 Initialize byte kount index
	vsel	VPS0,VPS0,V0,VM		// VIU1 Select zeroes left | S0 bytes right
	bns	cr7,No_B_fwd		// b if only even number of bytes to store
	stvebx	VPS0,DST,BK		// LSU store first byte at DST+0
	addi	BK,BK,1			// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd		// b if only words to store
	stvehx	VPS0,DST,BK		// LSU store halfword at DST+0/1
	addi	BK,BK,2			// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd		// b if exactly zero or two words to store
	stvewx	VPS0,DST,BK		// LSU store word 1 of one or three
	addi	BK,BK,4			// IU1 increment index
No_W1_fwd:
	bnl	cr7,No_W2_fwd		// b if there was only one word to store
	stvewx	VPS0,DST,BK		// LSU store word 1 of two or 2 of three
	addi	BK,BK,4			// IU1 increment index
	stvewx	VPS0,DST,BK		// LSU store word 2 of two or 3 of three
	b	No_W2_fwd
Left_just:
	stvx	VPS0,0,DST		// LSU Store 16 bytes at D0
No_W2_fwd:
	vxor	VSUM,VSUM,VSUM		// VIU1 Clear VSUM
	li	BK,16			// IU1 Re-initialize byte kount index
QW_fwd_loop:
	lvx	VS1,SRC,BK		// LSU Get S2 (or S1)
	vaddcuw	VCAR1,VPS0,VSUM		// VIU1 data + previous sum ->store carries
	vadduwm	VSUM,VPS0,VSUM		// VIU1 data + previous sum (no carries)
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower
	stvx	VPS0,DST,BK		// LSU Store 16 bytes at D1(+n*16 where n<4)
	vadduwm	VCARF,VCAR1,VCARF	// VIU1 Update VCARF
	addi	BK,BK,16		// IU1 Increment byte kount index
	bdnzf	25,QW_fwd_loop		// b if 4 or less quad words to do

	add	DNX,DST,BK		// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1		// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd		// b if >4 quad words left

Last_ld_fwd:	// Next 16 bytes is the last; we're done.
	add	DBC,DST,BC		// IU1 Recompute address of last dst byte + 1
	add	SBC,SRC,BC		// IU1 Recompute address of last src byte + 1
	bge	No_ld_fwd		// b if shifting right (D-S>=0)
	addi	SBC,SBC,-16		// IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
	mtcrf	0x01,DBC		// IU2 Put final vector byte count in cr7
	addi	Rt,SBC,-1		// IU1 Recompute address of last src byte
	lvx	VS1,0,Rt		// LSU Get last source S14 (guaranteed SN)
	vaddcuw	VCAR1,VPS0,VSUM		// VIU1 data + previous sum ->store carries
	vadduwm	VSUM,VPS0,VSUM		// VIU1 data + previous sum (no carries)
	vadduwm	VCARF,VCAR1,VCARF	// VIU1 Update VCARF
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
	beq	cr5,Rt_just_fwd		// b if last destination is right justified

	vsel	VPS0,VPS0,V0,VMM	// VIU1 Select src bytes left | zeroes right
	rlwinm	DBK,DBK,0,0,27		// IU1 Round to QW addr of last byte
	li	D,0			// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd		// b if there was only one or zero words to store
	stvewx	VPS0,DBK,D		// LSU store word 1 of two or three
	addi	D,D,4			// IU1 increment index
	stvewx	VPS0,DBK,D		// LSU store word 2 of two or three
	addi	D,D,4			// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd		// b if there were only two or zero words to store
	stvewx	VPS0,DBK,D		// LSU store word 3 of three if necessary
	addi	D,D,4			// IU1 increment index
Only_2W_fwd:
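
//------------------------------------------------------------------
// A minimal C sketch of the carry-save summation used in QW_fwd_loop
// above (the function name csum_words is illustrative): vadduwm adds
// the 32-bit lanes modulo 2^32 while vaddcuw captures each lane's
// carry-out; the carries accumulate separately in VCARF and are only
// folded back into the sum at the end (end-around carry).
//
//	static unsigned long csum_words(const unsigned long *w, int n,
//	                                unsigned long sum)
//	{
//	    unsigned long carries = 0;
//	    for (int i = 0; i < n; i++) {
//	        unsigned long t = (sum + w[i]) & 0xFFFFFFFF;
//	        carries += (t < sum);   /* carry-out, as vaddcuw records */
//	        sum = t;                /* modular add, as vadduwm       */
//	    }
//	    while (carries) {           /* fold carries back in          */
//	        unsigned long t = (sum + carries) & 0xFFFFFFFF;
//	        carries = (t < sum);
//	        sum = t;
//	    }
//	    return sum;
//	}
//------------------------------------------------------------------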