📄 vec_memcpy.s

📁 Please let me download so long so bad
💻 S
📖 第 1 页 / 共 3 页
字号:
上一页 1 23

	stvehx	VPS0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	VPS0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store
	stvewx	VPS0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	VPS0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:	
	stvx	VPS0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	cmpi	cr7,0,QW,14	// IU1 Check QW>14
	ble	cr6,Last_ld_fwd	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
QW_fwd_loop:
	lvx	VS1,SRC,BK	// LSU Get S2 (or S1)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D1(+n*16 where n<4)
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_fwd_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_ld_fwd:	// Next 16 bytes is the last; we're done.
	add	DBC,DST,BC	// IU1 Recompute address of last dst byte + 1
	add	SBC,SRC,BC	// IU1 Recompute address of last src byte + 1
	bge	No_ld_fwd	// b if shifting right (D-S>=0)

	addi	SBC,SBC,-16	// IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7
	addi	DBK,DBC,-1	// IU1 Recompute address of last dst byte
	addi	Rt,SBC,-1	// IU1 Recompute address of last src byte

// If D-S<0 we have already loaded all the source vectors.
// If D-S>=0 then the first loaded vector went to the upper half of the permute
// pair and we need one more vector.  (This may be a duplicate.)

	lvx	VS1,0,Rt	// LSU Get last source S14 (guaranteed SN)

#ifndef NO_DST				
	dss	0		// Data stream 0 stop

	dss	1		// Data stream 1 stop
#endif
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	D,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
	addi	D,D,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
	addi	D,D,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
	addi	D,D,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
#endif
	blr			// Return destination address from entry
#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	lvx	VS1,SRC,BK	// LSU Get S3 (or S2)
	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
	addi	Rt,QW,-2	// IU1 Insure at least 2 QW left after big loop

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S2 and S3 to D2
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL
// At this point next store will be to even address.

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
	lis	STR,0x104	// IU1 Stream 4 blocks of 16 bytes
	addi	BL,BK,16	// IU1 Create an alternate byte kount + 32

	ori	STR,STR,32	// IU1 Stream stride 32B
#ifndef NO_BIG_LOOP
	rlwinm	BIG,Rt,29,3,31	// IU1 QW/8 big loops to do

	rlwinm	Rt,Rt,0,0,28	// IU1 How many QWs will be done in big loop
	bgt	cr7,Big_loop	// b if QW > 14
#endif
No_big_loop:
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

	addi	SP8,SRC,256	// IU1 Starting address for data stream touch
	xoris	STR,STR,0x6	// IU1 Reset stream to 2 blocks of 16 bytes
	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e, final store is even

	bdnz	B32_fwd		// decrement counter for last QW store odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS1,SRC,BK	// LSU Get S12
	addi	SP8,SP8,32	// IU1 Next starting address for data stream touch

	lvx	VS2,SRC,BL	// LSU Get S13
	vperm	VPS1,VS0,VS1,VP3	// VPU Align S11 and S12 to D11

	STRM_1			// LSU Stream 64 byte blocks ahead of loads

	DCBK			// LSU then Kill instead of RWITM

	vperm	VPS0,VS1,VS2,VP3	// VPU Align S12 and S13 to D12
	vor	VS0,VS2,VS2	// VIU1 Move S13 to S11

	stvx	VPS1,DST,BK	// LSU Store 16 bytes at D11
	addi	BK,BL,16	// IU1 Increment byte count
	bdz	Nxt_loc_fwd	// always decrement and branch to next instr		

Nxt_loc_fwd:
	stvx	VPS0,DST,BL	// LSU Store 16 bytes at D12
	addi	BL,BK,16	// IU1 Increment alternate byte count
	bdnz	B32_fwd		// b if there are at least two more QWs to do

	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
	b	Last_ld_fwd	// b if last store is to even address

// Come here with two more loads and two stores to do
One_even_QW:
	lvx	VS1,SRC,BK	// LSU Get S14 (or S13 if if D-S>=0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D13
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	Last_ld_fwd

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
Big_loop:
	subf	QW,Rt,QW	// IU1 Should be 2-7 QWs left after big loop
	blt	cr5,No_big_loop	// b back if |DST-SRC|<128; Big_loop won't work.
	mtctr	BIG		// IU2 loop for as many 128B loops as possible
	addi	SP8,SRC,256	// IU1 Starting address for data stream touch

Loop_of_128B:	// Come here with QW>=10 and next store even; VS0 last load
	lvx	VS1,SRC,BK	// LSU Get S4 (or S3 if D-S>=0)
	addi	BL,BK,32	// IU1 Increment Byte_Kount+16 by 32	
	addi	SP8,SP8,128	// IU1 increment address for data stream touch

	lvx	VS3,SRC,BL	// LSU Get S6 (or S5)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+48 by 32	

	lvx	VS5,SRC,BL	// LSU Get S8 (or S7)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+80 by 32	

	lvx	VS7,SRC,BL	// LSU Get S10 (or S9)
	addi	BL,BK,16	// IU1 Increment Byte_Kount+16 by 16	

	lvx	VS2,SRC,BL	// LSU Get S5 (or S4)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+32 by 32	

	lvx	VS4,SRC,BL	// LSU Get S7 (or S6)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+64 by 32	
	
	lvx	VS6,SRC,BL	// LSU Get S9 (or S8)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+96 by 32	
	vperm	VPS0,VS0,VS1,VP3	// VPU

	lvx	VS0,SRC,BL	// LSU Get S11 (or S10)
	vperm	VPS1,VS1,VS2,VP3	// VPU

	STRM_1			// LSU Stream 4 32B blocks, stride 32B

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS0,DST,BK	// LSU Store D3
	addi	BK,BK,16	// IU1 Increment Byte_Kount+16 by 16	
	vperm	VPS2,VS2,VS3,VP3	// VPU

	stvx	VPS1,DST,BK	// LSU Store D4
	addi	BK,BK,16	// IU1 Increment Byte_Kount+32 by 16	
	vperm	VPS3,VS3,VS4,VP3	// VPU

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS2,DST,BK	// LSU Store D5
	addi	BK,BK,16	// IU1 Increment Byte_Kount+48 by 16	
	vperm	VPS4,VS4,VS5,VP3	// VPU

	stvx	VPS3,DST,BK	// LSU Store D6
	addi	BK,BK,16	// IU1 Increment Byte_Kount+64 by 16	
	vperm	VPS5,VS5,VS6,VP3	// VPU

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS4,DST,BK	// LSU Store D7
	addi	BK,BK,16	// IU1 Increment Byte_Kount+80 by 16	
	vperm	VPS6,VS6,VS7,VP3	// VPU

	stvx	VPS5,DST,BK	// LSU Store D8
	addi	BK,BK,16	// IU1 Increment Byte_Kount+96 by 16	
	vperm	VPS7,VS7,VS0,VP3	// VPU

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS6,DST,BK	// LSU Store D9
	addi	BK,BK,16	// IU1 Increment Byte_Kount+112 by 16	

	stvx	VPS7,DST,BK	// LSU Store D10
	addi	BK,BK,16	// IU1 Increment Byte_Kount+128 by 16	
	bdnz	Loop_of_128B	// b if ctr > 0 (QW/8 still > 0)

	mtctr	QW		// IU1 Restore QW remaining to counter
	addi	BL,BK,16	// IU1 Create an alternate byte kount + 16
	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e, final store is even

	bdnz	B32_fwd		// b and decrement counter for last QW store odd
				// One of the above branches should have been taken

// End of memcpy in AltiVec

// bcopy works like memcpy, but its source and destination operands are reversed.
// The following code just swaps the operands and branches to memcpy.

// bcopy entry point.
// Performs the same copy as memcpy, but the caller supplies the two address
// operands in the opposite order.  The stub exchanges DST and SRC (using Rt
// as scratch, which is therefore clobbered) and then branches straight into
// the memcpy entry.
//
// Building with LIBMOTOVEC exports the standard name (bcopy, branching to
// memcpy); otherwise the vec_-prefixed names (vec_bcopy, vec_memcpy) are used.
#ifdef LIBMOTOVEC
#define BCOPY_ENTRY	bcopy
#define BCOPY_TARGET	memcpy
#else
#define BCOPY_ENTRY	vec_bcopy
#define BCOPY_TARGET	vec_memcpy
#endif
	.global	BCOPY_ENTRY
BCOPY_ENTRY:
	mr	Rt,DST		// Rt  <- first argument (arrives in DST; really the source)
	mr	DST,SRC		// DST <- second argument (the real destination)
	mr	SRC,Rt		// SRC <- saved first argument; swap complete
	b	BCOPY_TARGET	// plain branch (no link): continue in memcpy with swapped args
#undef BCOPY_ENTRY
#undef BCOPY_TARGET
// End of bcopy in AltiVec
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -