// vec_memset.s
stvebx v0,DBK,BL // LSU store one byte if necessary
b All_done_fwd
Rt_just_fwd:
stvx v0,DST,BK // LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
blr // Return destination address from entry
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
GT_4QW_fwd: // Do once if next store is to odd half of cache line, else twice
addi QW,QW,-1 // IU1 Keeping track of QWs stored
mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
addi DNX,DNX,16 // IU1 Update cr6 for next loop
stvx v0,DST,BK // LSU Store 16 bytes at D2
addi BK,BK,16 // IU1 Increment byte count by 16
bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
bdnz B32_fwd // decrement counter for last QW store odd
B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
DCBK // LSU Kill the cache line instead of RWITM
stvx v0,DST,BK // LSU Store 16 bytes at D11
addi BK,BK,16 // IU1 Increment byte count
bdz Nxt_loc_fwd // always decrement and branch to next instr
Nxt_loc_fwd:
stvx v0,DST,BK // LSU Store 16 bytes at D12
addi BK,BK,16 // IU1 Increment byte count
bdnz B32_fwd // b if there are at least two more QWs to do
bso cr6,One_even_QW // b if there is one even and one odd QW to store
b Last_QW // b if last store is to even address
// Come here with two more stores to do
One_even_QW:
stvx v0,DST,BK // LSU Store 16 bytes at D13
addi BK,BK,16 // IU1 Increment byte count
b Last_QW
// End of memset in AltiVec
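//
// For orientation, a hedged C sketch of the overall strategy (names like
// memset_sketch and vfill are illustrative, not part of this file): store
// leading bytes until DST is 16-byte aligned, stream 16-byte vector stores
// through the body, then store the trailing partial vector.
//
//   void *memset_sketch(void *dst, int fill, size_t bc)
//   {
//       unsigned char *p = dst;
//       while (bc && ((unsigned long)p & 15)) { *p++ = fill; bc--; }
//       while (bc >= 16) {               // stvx of a splatted fill vector
//           vec_st(vfill, 0, p); p += 16; bc -= 16;
//       }
//       while (bc--) *p++ = fill;
//       return dst;
//   }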
#define BCz r4 // in bzero r4 enters with byte count
#ifdef __MWERKS__
.align 32
#else
.align 5
#endif
#ifdef LIBMOTOVEC
.global bzero
bzero:
#else
.global vec_bzero
vec_bzero:
#endif
mr BC,BCz // IU1 arg[2] is BC here, not FILL
li FILL,0 // IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
b memset
#else
b vec_memset
#endif
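// In C terms, the stub above amounts to (hedged sketch, not from the source):
//
//   void bzero(void *s, size_t n) { memset(s, 0, n); }
//
// It moves the byte count into memset's BC register, zeroes FILL, and
// branches into memset proper.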
// cacheable_memzero employs dcbz to clear cacheable memory 32 bytes
// (one cache line) at a time. As with bzero, the second argument on entry
// is BC. Using this on non-cacheable memory will generate an alignment
// exception.
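// Conceptually (a hedged sketch; dcbz_line() is a stand-in for the dcbz
// instruction, not a real C function):
//
//   for (p = first_aligned_line; p + 32 <= end; p += 32)
//       dcbz_line(p);   // establish the line in cache, already zeroed
//
// dcbz avoids the read-with-intent-to-modify (RWITM) bus transaction that
// ordinary stores to a line not yet in the cache would trigger.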
.text
#ifdef __MWERKS__
.align 32
#else
.align 5
#endif
#ifdef LIBMOTOVEC
.global cacheable_memzero
cacheable_memzero:
#else
.global vec_cacheable_memzero
vec_cacheable_memzero:
#endif
mr BC,BCz // IU1 arg[2] is BC here, not FILL
li FILL,0 // IU1 for bzero FILL=0
cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
addi DR,DST,16 // IU1 Address of second dst vector
add DBC,DST,BC // IU1 Address of last dst byte + 1
bgt cr7,c_v_memset // b if BC>MIN_VEC
mtctr BC // for (i=1;i<=BC;i++)
beqlr cr1 // return if BC = 0
c_Byte_set:
stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
bdnz c_Byte_set
blr
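// The scalar path above is effectively (C sketch of the stbu loop):
//
//   unsigned char *p = (unsigned char *)dst - 1;   // pre-biased, like DM1
//   while (bc--) *++p = 0;                         // stbu FILL,1(DM1)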
c_v_memset:
// Byte counts < MIN_VEC have already been handled by the scalar code above,
// so this path never deals with small block sets.
// For systems using VRSAVE, define VRSAVE when compiling. For systems
// that don't, make sure VRSAVE is undefined.
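// Note: v0 is the only vector register this routine touches, so the
// oris Rt,RSV,0x8000 below sets just VRSAVE bit 0 (v0) in the saved mask.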
#ifdef VRSAVE
mfspr RSV,VRSV // IU2 Get current VRSAVE contents
#endif
rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
addi DBK,DBC,-1 // IU1 Address of last dst byte
#ifdef VRSAVE
oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
#endif
subf D,DST,DR // IU1 How many bytes in first destination?
li BK,0 // IU1 Initialize byte kount index
#ifdef VRSAVE
mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
#endif
vxor v0,v0,v0 // VIU Clear v0
subf QW,DR,DBK // IU1 Bytes of full vectors to store (-16)
cmpi cr1,0,D,16 // IU1 Is D0 left justified?
mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
beq cr1,c_Left_just // b if D0 is left justified
bns cr7,c_No_B_fwd // b if only even number of bytes to store
stvebx v0,DST,BK // LSU store first byte at DST+0
addi BK,BK,1 // IU1 increment index
c_No_B_fwd:
bne cr7,c_No_H_fwd // b if only words to store
stvehx v0,DST,BK // LSU store halfword at DST+0/1
addi BK,BK,2 // IU1 increment index
c_No_H_fwd:
bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
stvewx v0,DST,BK // LSU store word 1 of one or three
addi BK,BK,4 // IU1 increment index
c_No_W1_fwd:
bnl cr7,c_No_W2_fwd // b if there was only one word to store
stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
addi BK,BK,4 // IU1 increment index
stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
b c_No_W2_fwd
c_Left_just:
stvx v0,0,DST // LSU Store 16 bytes at D0
c_No_W2_fwd:
rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
cmpi cr6,0,QW,0 // IU1 Any full vectors to store?
li BK,16 // IU1 Re-initialize byte kount index
cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
ble cr6,c_Last_QW // b if no Quad words to do
mtctr QW // IU2 for (i=0;i<=QW;i++)
cmpi cr6,0,QW,4 // IU1 Check QW>4
c_QW_loop:
stvx v0,DST,BK // LSU Store 16 fill bytes
addi BK,BK,16 // IU1 Increment byte kount index
bdnzf 25,c_QW_loop // b if 4 or less quad words to do
add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
addi QW,QW,-1 // IU1 One more QW stored by now
bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
c_Last_QW: // Next vector is the last; we're done.
mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
beq cr1,c_Rt_just_fwd // b if last destination is right justified
rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
li BL,0 // IU1 Initialize index pointer
bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
stvewx v0,DBK,BL // LSU store word 1 of two or three
addi BL,BL,4 // IU1 increment index
stvewx v0,DBK,BL // LSU store word 2 of two or three
addi BL,BL,4 // IU1 increment index
c_Only_1W_fwd:
bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store
stvewx v0,DBK,BL // LSU store word 3 of three if necessary
addi BL,BL,4 // IU1 increment index
c_Only_2W_fwd:
bne cr7,c_Only_B_fwd // b if there are no half words to store
stvehx v0,DBK,BL // LSU store one halfword if necessary
addi BL,BL,2 // IU1 increment index
c_Only_B_fwd:
bns cr7,c_All_done_fwd // b if there are no bytes to store
stvebx v0,DBK,BL // LSU store one byte if necessary
b c_All_done_fwd
c_Rt_just_fwd:
stvx v0,DST,BK // LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
blr // Return destination address from entry
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
c_GT_4QW_fwd: // Do once if next store is to odd half of cache line, else twice
addi QW,QW,-1 // IU1 Keeping track of QWs stored
mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
addi DNX,DNX,16 // IU1 Update cr6 for next loop
stvx v0,DST,BK // LSU Store 16 bytes at D2
addi BK,BK,16 // IU1 Increment byte count by 16
bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
bns cr6,c_B32_fwd // b if DST[27] == 0; i.e., final store is even
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
bdnz c_B32_fwd // decrement counter for last QW store odd
c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
dcbz DST,BK // LSU zero whole cache line
bdz c_Nxt_loc_fwd // always decrement and branch to next instr
c_Nxt_loc_fwd:
addi BK,BK,32 // IU1 Increment byte count
bdnz c_B32_fwd // b if there are at least two more QWs to do
bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
b c_Last_QW // b if last store is to even address
// Come here with two more stores to do
c_One_even_QW:
stvx v0,DST,BK // LSU Store 16 bytes at D13
addi BK,BK,16 // IU1 Increment byte count
b c_Last_QW
// End of cacheable_memzero in AltiVec
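//
// Usage sketch (hedged; the buffer must be in cacheable memory, since dcbz
// takes an alignment exception on cache-inhibited or write-through pages):
//
//   extern void cacheable_memzero(void *dst, size_t bc);
//   cacheable_memzero(buf, sizeof buf);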