📄 vec_csum.s
Font size:
//------------------------------------------------------------------------
// Tail of vec_csum_partial_copy_generic (its entry point is earlier in
// this file, outside this chunk).  All register #defines used below
// (SRC, DST, BC, SUM, BK, BL, D, QW, Rt, DBK, DBC, DNX, SP8, and the
// vector names VS0..VS3, VPS0/VPS1, VP2/VP3/VP4, VM/VMM, VSUM, VCARF,
// VCAR1..3, VTEMP, VL, VH, VCARS, VFIN, V0, V1) and the DCBK macro are
// defined earlier in the file — TODO confirm against the full source.
// NOTE(review): recovered from a line-mangled copy; line breaks and
// comments were reconstructed, instruction order left untouched.
//------------------------------------------------------------------------
        bne     cr7,Only_B_fwd          // b if there are no half words to store
        stvehx  VPS0,DBK,D              // LSU store one halfword if necessary
        addi    D,D,2                   // IU1 increment index
Only_B_fwd:
        bns     cr7,All_done_fwd        // b if there are no bytes to store
        stvebx  VPS0,DBK,D              // LSU store one byte if necessary
        b       All_done_fwd
Rt_just_fwd:
        stvx    VPS0,DST,BK             // LSU Store 16 bytes at D14
All_done_fwd:
// Fold the two pending partial-sum vectors and all accumulated carries
// into a single 32-bit checksum, parked on the (quad-aligned) stack so
// it can be moved vector->GPR, then add the caller's partial SUM.
        vaddcuw VCAR1,VPS0,VPS1         // VIU1 add data and store carries
        vadduwm VTEMP,VPS0,VPS1         // VIU1 add data (no carries)
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        vmrglh  VL,V0,VSUM              // VPU separate low shorts of sum
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        vmrghh  VH,V0,VSUM              // VPU separate high shorts of sum
        rlwinm  DBK,SP,0,0,27           // IU1 Align stack pointer to QW
        vsumsws VCARS,VCARF,V0          // VIU2 sum all carries
        vadduwm VSUM,VL,VH              // VIU1 add low and high data
        li      BK,-16                  // IU1 Index 0x10 less than SP
        vsumsws VFIN,VSUM,VCARS         // VIU2 sum all data including carries
        // NOTE(review): scratch store below SP (red-zone style) — confirm
        // the target ABI permits stores below the stack pointer.
        stvx    VFIN,DBK,BK             // LSU Store partial checksum from VR
        lwz     SRC,-4(DBK)             // LSU Load partial checksum to GPR
        addc    SRC,SRC,SUM             // IU1 add caller's partial checksum
        addze   SRC,SRC                 // IU1 wrap end-around carry
        blr                             // Return checksum in r3
#ifdef __MWERKS__
        .align  16
#else
        .align  4
#endif
GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
        lvx     VS1,SRC,BK              // LSU Get S3 (or S2)
        addi    QW,QW,-1                // IU1 Keeping track of QWs stored
        vaddcuw VCAR1,VPS0,VSUM         // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VPS0,VSUM          // VIU1 data + previous sum (no carries)
        mtcrf   0x02,DNX                // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
        addi    DNX,DNX,16              // IU1 Update cr6 for next loop
        addi    Rt,QW,-2                // IU1 Insure at least 2 QW left after big loop
        vperm   VPS0,VS0,VS1,VP3        // VPU Align S2 and S3 to D2
        vor     VS0,VS1,VS1             // VIU1 Move upper vector to lower
        stvx    VPS0,DST,BK             // LSU Store 16 bytes at D2
        addi    BK,BK,16                // IU1 Increment byte count by 16
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        bdnzf   27,GT_4QW_fwd           // b if next store is to lower (even) half of CL
// At this point next store will be to even address.
        mtcrf   0x02,DBK                // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
        addi    SP8,SRC,96              // IU1 Starting address for dcbt
        addi    BL,BK,16                // IU1 Create an alternate byte kount + 32
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
        bns     cr6,B32_fwd             // b if DST[27] == 0; i.e, final store is even
        bdnz    B32_fwd                 // decrement counter for last QW store odd
B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
        lvx     VS1,SRC,BK              // LSU Get S4
        addi    SP8,SP8,32              // IU1 Next starting address for dcbt
        vaddcuw VCAR1,VPS0,VPS1         // VIU1 add data and store carries
        lvx     VS2,SRC,BL              // LSU Get S5
        vadduwm VTEMP,VPS0,VPS1         // VIU1 add data (no carries)
        dcbt    0,SP8                   // LSU Prefetch cache line 64 bytes ahead
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        DCBK                            // LSU Kill instead of RWITM
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vperm   VPS1,VS0,VS1,VP3        // VPU Align S11 and S12 to D11
        stvx    VPS1,DST,BK             // LSU Store 16 bytes at D11
        vperm   VPS0,VS1,VS2,VP3        // VPU Align S12 and S13 to D12
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        bdz     Nxt_loc_fwd             // always decrement and branch to next instr
Nxt_loc_fwd:
        stvx    VPS0,DST,BL             // LSU Store 16 bytes at D12
        vor     VS0,VS2,VS2             // VIU1 Move S13 to S11
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        addi    BK,BL,16                // IU1 Increment byte count
        addi    BL,BK,16                // IU1 Increment alternate byte count
        bdnz    B32_fwd                 // b if there are at least two more QWs to do
        bso     cr6,One_even_QW         // b if there is one even and one odd QW to store
        b       Last_ld_fwd             // b if last store is to even address
// Come here with two more loads and two stores to do
One_even_QW:
        lvx     VS1,SRC,BK              // LSU Get S6 (or S5 if D-S>=0)
        vaddcuw VCAR1,VPS0,VSUM         // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VPS0,VSUM          // VIU1 data + previous sum (no carries)
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        vperm   VPS0,VS0,VS1,VP3        // VPU Align S13 and S14 to D13
        vor     VS0,VS1,VS1             // VIU1 Move upper vector to lower
        stvx    VPS0,DST,BK             // LSU Store 16 bytes at D13
        addi    BK,BK,16                // IU1 Increment byte count
        b       Last_ld_fwd
// End of vec_csum_partial_copy_generic in AltiVec

//------------------------------------------------------------------------
// vec_csum_partial — AltiVec internet-checksum (no copy).
//
// Modified from above Register useage.
// In:   SRC = source address (register assigned earlier in this file —
//              presumably r3; TODO confirm)
//       BC  = r4, byte count
//       SUM = r5, incoming partial checksum
// Out:  32-bit partial checksum returned in the SRC register (r3).
// Uses: scalar path for BC <= MIN_VEC_CS; vector path otherwise.
// Don't use vectors for BC <= MIN_VEC_CS. Works only if MIN_VEC >= 32 bytes.
//------------------------------------------------------------------------
#define MIN_VEC_CS 48 // Chosen experimentally on MPC7455@1GHz/133MHz bus
#undef DST  // will not be using here
#undef BC
#define BC r4   // entering: Byte_Count
#undef SUM
#define SUM r5  // entering: Partial checksum
#if __MWERKS__
        .align  16
#else
        .align  4
#endif
#ifdef LIBMOTOVEC
// BUGFIX(review): was ".global csum_partial_vec" while the label defined
// below is "csum_partial" — the exported symbol was never defined.  Made
// the .global match the label, as in the #else branch.
        .global csum_partial
csum_partial:
#else
        .global vec_csum_partial
vec_csum_partial:
#endif
        li      BK,32                   // IU1
        rlwinm  Rt,BC,31,1,31           // IU1 BC/2
        cmpi    cr7,0,BC,MIN_VEC_CS     // IU1 Check for minimum byte count
        dcbt    SRC,BK                  // LSU prefetch next cacheline
        cmpi    cr6,0,Rt,0              // IU1 BC/2 == 0?
        addic   SUM,SUM,0               // IU1 Zero carry bit
        addi    SM2,SRC,-2              // IU1 Pre-bias and duplicate src
        add     DBC,SRC,BC              // IU1 Compute address of last src byte + 1
        bgt     cr7,v_csum              // b if BC>MIN_VEC_CS
// Scalar path: sum halfwords, then the trailing odd byte if any.
        andi.   BK,BC,1                 // IU1 BC[31]==0?
        beq     cr6,No_HWs_cs           // b if BC/2==0
        mtctr   Rt                      // i=BC/2; do ...;i--; while (i>0)
HW_cs:
        lhzu    Rt,2(SM2)               // LSU load next halfword
        addc    SUM,SUM,Rt              // IU1 accumulate with carry
        bdnz    HW_cs
No_HWs_cs:
        beq     BC_even_cs              // b if BC[31]==0 (or DBC[31]==0 when aligned)
        lbz     Rt,-1(DBC)              // LSU Get last src address byte
        rlwinm  Rt,Rt,8,16,23           // IU1 Shift odd byte left
        addc    SUM,SUM,Rt              // IU1
BC_even_cs:
        addze   SRC,SUM                 // IU1 fold final carry; result to r3
        blr
v_csum:
// Vector path: mask the unaligned head/tail via vperm-built select
// vectors, then accumulate 32-bit lanes with explicit carry capture.
        lvsr    VP2,0,SRC               // LSU Permute vector for initial byte mask
        addi    DR,SRC,16               // IU1 Address of second src vector
        li      BK,64                   // IU1 Index of next cache line
        lvsr    VP4,SRC,BC              // LSU Permute vector for final byte mask
        rlwinm  DR,DR,0,0,27            // IU1 (SRC+16)[0:27]
        addi    DBK,DBC,-1              // IU1 Address of last src byte
        lvx     VS0,0,SRC               // LSU Get S0 load started
        subf    QW,DR,DBK               // IU1 Bytes of full vectors to test (-16)
        vxor    V0,V0,V0                // VIU Clear v0
        dcbt    SRC,BK                  // LSU Prefetch next cache line at src+64
        rlwinm  QW,QW,28,4,31           // IU1 Quad words remaining
        vnor    V1,V0,V0                // VIU1 Create a vector of all ones
        mtctr   QW                      // IU2
        vxor    VCARF,VCARF,VCARF       // VIU1 clear VCARF
        vperm   VM,V1,V0,VP2            // VPU head select: zeroes left | src right
        cmpi    cr6,0,QW,4              // IU1 Check QW>4
        vxor    VSUM,VSUM,VSUM          // VIU1 Clear VSUM
        vperm   VMM,V0,V1,VP4           // VPU tail select: src left | zeroes right
        li      BK,16                   // IU1 Initialize byte kount index
        vsel    VS0,VS0,V0,VM           // VIU1 Select zeroes left | S0 bytes right
vp_fwd_loop:
        lvx     VS1,SRC,BK              // LSU Get S1
        vaddcuw VCAR1,VS0,VSUM          // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VS0,VSUM           // VIU1 data + previous sum (no carries)
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        addi    BK,BK,16                // IU1 Increment byte kount index
        vor     VS0,VS1,VS1             // VIU1 Swap vectors for next loop
        bdnzf   25,vp_fwd_loop          // b if 4 or less quad words to do
        add     DNX,SRC,BK              // IU1 address of next load (SRC+32 if QW>4)
        addi    QW,QW,-1                // IU1 One more QW summed by now
        bgt     cr6,GT_4QW_cs           // b if >4 quad words left
        vxor    VS1,VS1,VS1             // VIU1 Zero before adding below
// Next 16 bytes is the last; we're done.
Last_ld_cs:
        lvx     VS2,0,DBK               // LSU Get last source (guaranteed SN)
        vaddcuw VCAR1,VS0,VS1           // VIU1 add data and store carries
        rlwinm  DBK,DBK,0,28,31         // IU1 (src + BC -1)[28:31]
        vadduwm VTEMP,VS0,VS1           // VIU1 add data (no carries)
        cmpi    cr7,0,DBK,0xF           // IU1 Is last byte right justified?
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        beq     cr7,Rt_just             // b if right justified.
        vsel    VS2,VS2,V0,VMM          // VIU1 Select src bytes left | zeroes right
Rt_just:
        vaddcuw VCAR1,VS2,VSUM          // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VS2,VSUM           // VIU1 data + previous sum (no carries)
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
// Final fold: lanes -> shorts -> single sum, carries folded in, result
// staged on the quad-aligned stack for the vector->GPR move.
        vmrglh  VL,V0,VSUM              // VPU separate low shorts of sum
        vmrghh  VH,V0,VSUM              // VPU separate high shorts of sum
        rlwinm  DBK,SP,0,0,27           // IU1 Align stack pointer to QW
        vsumsws VCARS,VCARF,V0          // VIU2 sum all carries
        vadduwm VSUM,VL,VH              // VIU1 add low and high data
        li      BK,-16                  // IU1 Index 0x10 less than SP
        vsumsws VFIN,VSUM,VCARS         // VIU2 sum all data including carries
        stvx    VFIN,DBK,BK             // LSU Store partial checksum from VR
        lwz     SRC,-4(DBK)             // LSU Load partial checksum to GPR
        addc    SRC,SRC,SUM             // IU1 add caller's partial checksum
        addze   SRC,SRC                 // IU1 wrap end-around carry
        blr                             // Return checksum in r3
#ifdef __MWERKS__
        .align  16
#else
        .align  4
#endif
GT_4QW_cs: // Do once if nxt ld is from odd half of cache line, else twice
        lvx     VS1,SRC,BK              // LSU Get S3 (or S2)
        addi    QW,QW,-1                // IU1 Keeping track of QWs summed
        vaddcuw VCAR1,VS0,VSUM          // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VS0,VSUM           // VIU1 data + previous sum (no carries)
        mtcrf   0x02,DNX                // IU2 cr6[3]=((SRC+32)[27]==1)?1:0;
        addi    DNX,DNX,16              // IU1 Update cr6 for next loop
        addi    Rt,QW,-2                // IU1 Insure at least 2 QW left after big loop
        vor     VS0,VS1,VS1             // VIU1 Move upper vector to lower
        addi    BK,BK,16                // IU1 Increment byte count by 16
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        bdnzf   27,GT_4QW_cs            // b if next load is from lower (even) half of CL
// At this point next load will be from an even address.
        mtcrf   0x02,DBK                // IU2 cr6[3]=((last load)[27]==1)?1:0; (odd?)
        addi    SP8,SRC,96              // IU1 Starting address for dcbt
        vxor    VS1,VS1,VS1             // VIU1 Zero before adding below
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
        bns     cr6,B32_cs              // b if last load address[27]==0; i.e, final load is even
        bdnz    B32_cs                  // decrement counter for last QW load odd
B32_cs: // Should be at least 2 loads remaining and next 2 are cache aligned
        lvx     VS2,SRC,BK              // LSU Get S4
        addi    BK,BK,16                // IU1 Increment byte count by 16
        vaddcuw VCAR1,VS0,VS1           // VIU1 add data and store carries
        lvx     VS3,SRC,BK              // LSU Get S5
        addi    SP8,SP8,32              // IU1 Next starting address for dcbt
        vadduwm VTEMP,VS0,VS1           // VIU1 add data (no carries)
        dcbt    0,SP8                   // LSU Prefetch cache line 64 bytes ahead
        addi    BK,BK,16                // IU1 Increment byte count
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        bdz     Nxt_loc_cs              // always decrement and branch to next instr
Nxt_loc_cs:
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        vor     VS0,VS2,VS2             // VIU1 Move S4 down for next pass
        vor     VS1,VS3,VS3             // VIU1 Move S5 down for next pass
        bdnz    B32_cs                  // b if there are at least two more QWs to do
        bso     cr6,One_even_QW_cs      // b if there is one even and one odd QW left
        b       Last_ld_cs              // b if last load is from even address
// Come here with two more loads to do
One_even_QW_cs:
        lvx     VS2,SRC,BK              // LSU Get S6 (or S5 if D-S>=0)
        addi    BK,BK,16                // IU1 Increment byte count
        vaddcuw VCAR1,VS0,VS1           // VIU1 add data and store carries
        vadduwm VTEMP,VS0,VS1           // VIU1 add data (no carries)
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        vxor    VS1,VS1,VS1             // VIU1 Zero before next add
        vor     VS0,VS2,VS2             // VIU1 Move S6 down for final pass
        b       Last_ld_cs
// End of vec_csum_partial in AltiVec
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -