//------------------------------------------------------------------
// file: vec_csum.S
//    AltiVec enabled version of linux' checksum routines
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2003
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.  Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern unsigned long csum_partial_copy_generic(src, dst, len, sum,
//                                                src_err, dst_err);
//
//	Computes the checksum of a memory block at src, length len,
//	and adds in "sum" (32-bit), while copying the block to dst.
//	Returns:
//		unsigned long sum
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern unsigned long csum_partial(buff, len, sum);
//
//	Computes the checksum of a memory block at buff, length len,
//	and adds in "sum" (32-bit unsigned long).
//	Returns:
//		unsigned long sum
//------------------------------------------------------------------

//------------------------------------------------------------------
// Assumptions from studying the original linux code:
//	Copying forward is always safe.
//	src and dst are always half-word aligned.
//	len may be odd or even, 0-n.
//	There is no test to see if src and dst are equal.
//	Returns unsigned int checksum.
//------------------------------------------------------------------

//------------------------------------------------------------------
// Revision History:
//	Rev 0.0	Original	Chuck Corley	04/19/03
//
// This is alpha quality code; users are encouraged to make it faster.
//
// ASSUMPTIONS:
//	Code is highly likely to be in the cache; data is not (streaming data).
//	Don't use vectors for BC <= MIN_VEC.  Works only if MIN_VEC >= 32 bytes.
//------------------------------------------------------------------
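
//------------------------------------------------------------------
// For reference, a minimal C sketch of what these routines compute.
// The name csum_partial_copy_ref and its exact prototype are
// illustrative assumptions, not the kernel API.  The partial sum is
// accumulated from big-endian 16-bit words; a trailing odd byte is
// padded into the high half of a halfword, exactly as the scalar
// path below does with rlwinm.  Folding the 32-bit sum to 16 bits
// and taking the one's complement is left to the caller.
//
//	static unsigned long
//	csum_partial_copy_ref(const unsigned char *src, unsigned char *dst,
//	                      int len, unsigned long sum)
//	{
//	    for (; len > 1; src += 2, dst += 2, len -= 2) {
//	        dst[0] = src[0];                 /* copy while summing */
//	        dst[1] = src[1];
//	        sum += ((unsigned long)src[0] << 8) | src[1];
//	    }
//	    if (len) {                           /* odd trailing byte  */
//	        dst[0] = src[0];
//	        sum += (unsigned long)src[0] << 8;
//	    }
//	    return sum;                          /* 32-bit partial sum */
//	}
//------------------------------------------------------------------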
#define MIN_VEC 48	// Experimentally chosen on 7455@1GHz/133 to beat scalar

// Register usage
#define Rt	r0	// r0 when used as a temporary register

#define SRC	r3	// entering: src ptr; exiting: unsigned long checksum
#define DST	r4	// entering: dst pointer; exiting:
#define BC	r5	// entering: Byte_Count
#define SUM	r6	// entering: Partial checksum
#define SER	r7	// entering: src_err address
#define DER	r8	// entering: dst_err address

#define DM2	r9	// dst-2 for hw-by-hw forwards initially
#define D	r9	// dst[28:31]
#define DR	r9	// dst[0:27]
#define DNX	r9	// (dst+n*16)[28:31]
#define BL	r9	// second byte_kount index pointer

#define DBC	r10	// dst + byte count initially
#define DBK	r10	// (dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define SM2	r11	// src-2 for hw-by-hw forwards initially
#define QW	r11	// number of quad words (vectors)
#define SP8	r11	// data stream touch block & stride info for Big_loop
#define SBC	r11	// src + byte count initially then src[28:31]

#define BK	r12	// Byte Kount index
#define BLK	r12	// temporary data stream touch block & stride info
#define S	r12	// src[28:31]
#define DMS	r12	// dst - src initially

#define V0	v0	// all zeros
#define VCARS	v0	// sum of carries
#define V1	v1	// all ones
#define VMM	v1	// mask for final dst right
#define VS0	v2	// src vector for permuting
#define VL	v2	// low data
#define VS1	v3	// src vector for permuting
#define VH	v3	// high data
#define VPS0	v4	// permuted source vector to store
#define VP2	v5	// dst permute register
#define VM	v5	// mask for first dst left
#define VS2	v5	// src vector for permuting
#define VP3	v6	// d - s permute register
#define VS3	v6	// 4th src vector in csum_partial
#define VP4	v7	// Byte_Count permute register
#define VPS1	v7	// 2nd permuted source vector to store
#define VSUM	v8	// Updated sum
#define VFIN	v8	// final sum
#define VCAR1	v9	// temp register for carries
#define VCAR3	v9	// temp register for carries
#define VCAR2	v10	// temp register for carries
#define VCARF	v11	// temp register for carries
#define VTEMP	v12	// Temp register

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc, codewarrior, and diab don't assemble dcba; emit the opcode by hand:
// 0x7c0465ec = (31<<26)|(4<<16)|(12<<11)|(758<<1) = dcba r4,r12 = dcba DST,BK
#define DCBK .long 0x7c0465ec
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c0465ec
.endm
#else
#define DCBK dcba DST,BK
#endif	// __ghs__
#endif	// __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif	// NO_DCBA

// Conditionalize the use of dst (data stream touch).  It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz).  Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_F	dst SRC,BLK,0
#define STRM_1	dst SP8,Rt,1
#else
#define STRM_F	nop
#define STRM_1	nop
#endif

	.text
#if __MWERKS__
	.align	16
#define SP	r1
#else
	.align	4
#endif

#ifdef LIBMOTOVEC
	.global	csum_partial_copy_generic_vec
csum_partial_copy_generic_vec:
#else
	.global	vec_csum_partial_copy_generic
vec_csum_partial_copy_generic:
#endif
	li	BK,32			// IU1 Index of next cache line
	rlwinm	Rt,BC,31,1,31		// IU1 BC/2
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	dcbt	SRC,BK			// LSU Prefetch next cache line
	cmpi	cr6,0,Rt,0		// IU1 BC/2 == 0?
	addic	SUM,SUM,0		// IU1 Zero carry bit
	addi	SM2,SRC,-2		// IU1 Pre-bias and duplicate src
	add	DBC,DST,BC		// IU1 Address of last dst byte + 1
	bgt	cr7,v_csumcpy		// b if BC>MIN_VEC (will copy vectors fwd)

// Scalar case: copy and sum halfword by halfword, then the odd byte if any.
	andi.	BK,BC,1			// IU1 BC[31]==0?
	addi	DM2,DST,-2		// IU1 Pre-bias and duplicate destination
	add	S,SRC,BC		// IU1 Last src byte + 1 (temp use of S)
	beq	cr6,No_HWs		// b if BC/2==0
	mtctr	Rt			// i=BC/2; do ...;i--; while (i>0)
HW_cpy:
	lhzu	Rt,2(SM2)		// LSU Load next src halfword
	sthu	Rt,2(DM2)		// LSU Store it to dst
	addc	SUM,SUM,Rt		// IU1 Accumulate halfword into sum
	bdnz	HW_cpy
No_HWs:
	beq	BC_even			// b if BC[31]==0 (or DBC[31]==0 when aligned)
	lbz	Rt,-1(S)		// LSU Get last src address byte
	stb	Rt,-1(DBC)		// LSU Store to last dst address byte
	rlwinm	Rt,Rt,8,16,23		// IU1 Shift odd byte left
	addc	SUM,SUM,Rt		// IU1 Accumulate padded odd byte
BC_even:
	addze	SRC,SUM			// IU1 Fold in carry; return sum in r3
	blr

v_csumcpy:
	lvsr	VP2,0,DST		// LSU Permute vector for initial byte mask
	rlwinm	D,DST,0,28,31		// IU1 D = dst[28:31]
	rlwinm	S,SRC,0,28,31		// IU1 Save src address bits s[28:31]
	lvsr	VP4,DST,BC		// LSU Permute vector for final byte mask
	subf.	S,S,D			// IU1 if D-S<0 essentially shifting left
	subf	DMS,SRC,DST		// IU1 Compute dst-src difference
	lvsr	VP3,0,DMS		// LSU Permute vector for dst - src shft right
	li	BK,64			// IU1 Index of next cache line
	vxor	V0,V0,V0		// VIU Clear v0
	dcbt	SRC,BK			// LSU Prefetch next cache line at src+64
	cmpi	cr1,0,D,0		// IU1 Is D0 left justified?
	vnor	V1,V0,V0		// VIU1 Create a vector of all ones
	addi	DR,DST,16		// IU1 Address of second dst vector
	addi	DBK,DBC,-1		// IU1 Address of last dst byte
	vperm	VM,V1,V0,VP2		// VPU D0 select vector for dst left; src right
	bge	Ld_bytes_rt		// b if shifting right (D-S>=0)
	lvx	VS0,0,SRC		// LSU Get S0 load started
	addi	SRC,SRC,16		// IU1 Increment src base (to keep BK useful)
Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is
	lvx	VS1,0,SRC		// LSU Get S1 (or S0 if D-S>=0) in upper vector
	rlwinm	DR,DR,0,0,27		// IU1 (DST+16)[0:27]
	vperm	VMM,V0,V1,VP4		// VPU DN select vector for src left; dst right
	subf	QW,DR,DBK		// IU1 Bytes of full vectors to move (-16)
	vxor	VPS1,VPS1,VPS1		// VIU Clear VPS1
	vxor	VCARF,VCARF,VCARF	// VIU1 Clear VCARF
	rlwinm	QW,QW,28,4,31		// IU1 Quad words remaining
	rlwinm	Rt,DBK,0,28,31		// IU1 (DBK = DST+BC-1)[28:31]
	li	BK,96			// IU1 Index of next cache line
	cmpi	cr5,0,Rt,0xF		// IU1 Is DN right justified?
	subf	Rt,DST,DR		// IU1 How many bytes in first destination?
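
//------------------------------------------------------------------
// A minimal C sketch of the realignment the vperm below performs,
// using a hypothetical helper align_window (illustrative, not any
// real API): lvsr on dst-src yields a permute control that slides a
// 16-byte window across two consecutive source vectors, so each
// store is destination-aligned even though src and dst differ in
// alignment.
//
//	static void align_window(const unsigned char prev[16],
//	                         const unsigned char curr[16],
//	                         unsigned shift,   /* 0..15, from lvsr */
//	                         unsigned char out[16])
//	{
//	    unsigned char window[32];       /* concat(prev, curr): the */
//	    memcpy(window, prev, 16);       /* two vperm source        */
//	    memcpy(window + 16, curr, 16);  /* operands                */
//	    memcpy(out, window + shift, 16);/* take bytes shift..shift+15 */
//	}
//------------------------------------------------------------------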
	mtctr	QW			// IU2 Loop count = quad words to store
	cmpi	cr6,0,QW,4		// IU1 Check QW>4
	mtcrf	0x01,Rt			// IU2 Put bytes in 1st dst in cr7
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower
	dcbt	SRC,BK			// LSU Prefetch next cache line at src+96
	beq	cr1,Left_just		// b if D0 is left justified

	li	BK,0			// IU1 Initialize byte kount index
	vsel	VPS0,VPS0,V0,VM		// VIU1 Select zeroes left | S0 bytes right
	bns	cr7,No_B_fwd		// b if only even number of bytes to store
	stvebx	VPS0,DST,BK		// LSU store first byte at DST+0
	addi	BK,BK,1			// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd		// b if only words to store
	stvehx	VPS0,DST,BK		// LSU store halfword at DST+0/1
	addi	BK,BK,2			// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd		// b if exactly zero or two words to store
	stvewx	VPS0,DST,BK		// LSU store word 1 of one or three
	addi	BK,BK,4			// IU1 increment index
No_W1_fwd:
	bnl	cr7,No_W2_fwd		// b if there was only one word to store
	stvewx	VPS0,DST,BK		// LSU store word 1 of two or 2 of three
	addi	BK,BK,4			// IU1 increment index
	stvewx	VPS0,DST,BK		// LSU store word 2 of two or 3 of three
	b	No_W2_fwd
Left_just:
	stvx	VPS0,0,DST		// LSU Store 16 bytes at D0
No_W2_fwd:
	vxor	VSUM,VSUM,VSUM		// VIU1 Clear VSUM
	li	BK,16			// IU1 Re-initialize byte kount index
QW_fwd_loop:
	lvx	VS1,SRC,BK		// LSU Get S2 (or S1)
	vaddcuw	VCAR1,VPS0,VSUM		// VIU1 data + previous sum ->store carries
	vadduwm	VSUM,VPS0,VSUM		// VIU1 data + previous sum (no carries)
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower
	stvx	VPS0,DST,BK		// LSU Store 16 bytes at D1(+n*16 where n<4)
	vadduwm	VCARF,VCAR1,VCARF	// VIU1 Update VCARF
	addi	BK,BK,16		// IU1 Increment byte kount index
	bdnzf	25,QW_fwd_loop		// b if 4 or less quad words to do

	add	DNX,DST,BK		// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1		// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd		// b if >4 quad words left

Last_ld_fwd:	// Next 16 bytes is the last; we're done.
	add	DBC,DST,BC		// IU1 Recompute address of last dst byte + 1
	add	SBC,SRC,BC		// IU1 Recompute address of last src byte + 1
	bge	No_ld_fwd		// b if shifting right (D-S>=0)
	addi	SBC,SBC,-16		// IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
	mtcrf	0x01,DBC		// IU2 Put final vector byte count in cr7
	addi	Rt,SBC,-1		// IU1 Recompute address of last src byte
	lvx	VS1,0,Rt		// LSU Get last source S14 (guaranteed SN)
	vaddcuw	VCAR1,VPS0,VSUM		// VIU1 data + previous sum ->store carries
	vadduwm	VSUM,VPS0,VSUM		// VIU1 data + previous sum (no carries)
	vadduwm	VCARF,VCAR1,VCARF	// VIU1 Update VCARF
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
	beq	cr5,Rt_just_fwd		// b if last destination is right justified

	vsel	VPS0,VPS0,V0,VMM	// VIU1 Select src bytes left | zeroes right
	rlwinm	DBK,DBK,0,0,27		// IU1 Round to QW addr of last byte
	li	D,0			// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd		// b if there was only one or zero words to store
	stvewx	VPS0,DBK,D		// LSU store word 1 of two or three
	addi	D,D,4			// IU1 increment index
	stvewx	VPS0,DBK,D		// LSU store word 2 of two or three
	addi	D,D,4			// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd		// b if there were only two or zero words to store
	stvewx	VPS0,DBK,D		// LSU store word 3 of three if necessary
	addi	D,D,4			// IU1 increment index
Only_2W_fwd:
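
//------------------------------------------------------------------
// A minimal C sketch of the carry-save summation used in QW_fwd_loop
// above (the function name csum_words is illustrative): vadduwm adds
// the 32-bit lanes modulo 2^32 while vaddcuw captures each lane's
// carry-out; the carries accumulate separately in VCARF and are only
// folded back into the sum at the end (end-around carry).
//
//	static unsigned long csum_words(const unsigned long *w, int n,
//	                                unsigned long sum)
//	{
//	    unsigned long carries = 0;
//	    for (int i = 0; i < n; i++) {
//	        unsigned long t = (sum + w[i]) & 0xFFFFFFFF;
//	        carries += (t < sum);   /* carry-out, as vaddcuw records */
//	        sum = t;                /* modular add, as vadduwm       */
//	    }
//	    while (carries) {           /* fold carries back in          */
//	        unsigned long t = (sum + carries) & 0xFFFFFFFF;
//	        carries = (t < sum);
//	        sum = t;
//	    }
//	    return sum;
//	}
//------------------------------------------------------------------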