📄 vec_memcpy.s

📁 Please let me download so long so bad
💻 S
📖 第 1 页 / 共 3 页
字号:
	
#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif

v_memmove:
// ---------------------------------------------------------------------------
// v_memmove: AltiVec backward (descending-address) block copy.
//
// Falls through to the backward copy below unless cr6 says a forward copy is
// safe, in which case it jumps into v_memcpy at MC_entry.  The copy walks
// from the last byte toward the first: it stores the (possibly partial)
// final destination vector, then full 16-byte vectors downward, and finally
// the (possibly partial) first destination vector.
//
// All register names (SRC, DST, BC, S, D, SMD, DMS, BK, ...) and the macros
// STRM_B, STRM_1 and DCBL are defined outside this view.
//
// Inputs read before being written here (set up by code outside this view;
// meanings taken from the existing comments -- TODO confirm against caller):
//   SRC, DST, BC  source address, destination address, byte count
//   SBC           presumably SRC+BC
//   DMS           presumably DST-SRC (feeds lvsr to build the permute vector)
//   cr6           result of the "DST-SRC >= BC" test used by the first branch
//
// Condition-register fields and where they are consumed:
//   cr0  subf. at "SMD = S-D"      -> bgt Rt_shft
//        subf. at "SMD = SMD-Rt"   -> blt Get_bytes_rt AND blt No_ld_bkwd
//        (must survive the whole copy loop; no visible instruction in
//        between writes cr0 -- assumes STRM_1/DCBL do not either)
//   cr1  D (= DST[28:31]) == 0     -> beq cr1,Lt_just
//   cr6  QW > 0, then QW > 4, then address bit 27 via mtcrf 0x02
//   cr7  last-vector alignment test, then byte-count bits via mtcrf 0x01
//
// SRC is adjusted by -16/+16 inside the routine; DST is never written, so it
// still holds the entry destination address at blr.
// ---------------------------------------------------------------------------
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]
	bge	cr6,MC_entry	// b to v_memcpy if DST-SRC>=BC (fwd copy OK)

#ifdef VRSAVE
// Sets the 12 most-significant bits of the saved VRSAVE image; presumably
// these cover the vector registers behind VS0-VS3/VPS0-VPS1/VP3 -- confirm
// against the register macro definitions.
	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
#endif	
// BLL is built as 0x010cffe0 (lis + ori below): the stream control word
// described by the two comments as 12 blocks of 16 bytes with stride -32.
	lis	BLL,0x010c	// IU1 Stream 12 blocks of 16 bytes
	subf.	SMD,D,S		// IU1 if S-D<0 essentially shifting right

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
	ori	BLL,BLL,0xffe0	// IU1 Stream stride -32B

	STRM_B			// LSU Start data stream at SRC+BC
	addi	SBK,SBC,-1	// IU1 Address of last src byte
// cr0 here is still the result of "subf. SMD,D,S" above.
	bgt	Rt_shft		// Bytes from upper vector = (s-d>0)?s-d:16+s-d;
	addi	SMD,SMD,16	// IU1 Save 16-(d-s)
Rt_shft:

// SBR = 16-byte-aligned address of the vector holding the last source byte.
	rlwinm	SBR,SBK,0,0,27	// IU1 (SRC+BC-1)[0:27]
	addi	BK,BC,-1	// IU1 Initialize byte index

// Rt = SBC - SBR: count of source bytes that live in that last source vector.
	subf	Rt,SBR,SBC	// IU1 How many bytes in first source?
	add	DBK,DST,BK	// IU1 Address of last dst byte
	addi	DR,DST,16	// IU1 Address of second dst vector

// This subf. sets the cr0 value used by "blt Get_bytes_rt" below and, much
// later, by "blt No_ld_bkwd" at Last_load.  Rt is reused immediately after.
	subf.	SMD,Rt,SMD	// IU1 if bytes in 1st src>Bytes in 1st permute
	rlwinm	Rt,DBK,0,28,31	// IU1 (DST+BC-1)[28:31]
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]

// If there are more useful bytes in the upper vector of a permute pair than we
// will get in the first permute, the first loaded vector needs to be in the
// lower half of the permute pair.  The upper half is a don't care then.
// NOTE(review): this branch and "blt No_ld_bkwd" test the same cr0 result,
// yet their comments describe it as "shifting left (D-S>=0)" and "shifting
// right (S-D>=0)" respectively -- the wording is inconsistent; confirm which
// description is intended.
	blt	Get_bytes_rt	// b if shifting left (D-S>=0)

	lvx	VS1,SRC,BK	// LSU Get SN load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memmove, an example would
// be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.
	addi	SRC,SRC,-16	// IU1 Decrement src base (to keep BK useful)

Get_bytes_rt:	// Come here to get VS0 & Don't care what VS1 is	
	lvx	VS0,SRC,BK	// LSU Get SN-1 (SN if D-S<0) in lower vector
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
// Rt still holds (DST+BC-1)[28:31]; 0xF means the last destination byte is
// the last byte of its 16-byte vector.
	cmpi	cr7,0,Rt,0xF	// IU1 Is Dn right justified?

// cr1 is not consumed until "beq cr1,Lt_just"; D itself is reused as a store
// index before then.
	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?
// Rotate-left-28 with mask 4:31 is a logical shift right by 4 (divide by 16).
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	add	Rt,DST,BC	// IU1 Refresh the value of DST+BC

	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-1 and SN to DN
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper
	beq	cr7,Rt_just	// b if DN is right justified

// Partial final vector.  mtcrf 0x01 copies Rt[28:31] = (DST+BC)&15, the number
// of bytes to store in the final vector, into cr7:
//   cr7.lt = 8s bit (two words), cr7.gt = 4s bit (one word),
//   cr7.eq = 2s bit (halfword),  cr7.so = 1s bit (byte).
// Pieces are stored largest first, ascending from the aligned base in DBK,
// with D as the running byte index.
	mtcrf	0x01,Rt		// IU2 Put final vector byte count in cr7
	rlwinm	DBK,DBK,0,0,27	// IU1 Address of first byte of final vector
	li	D,0		// IU1 Initialize an index pointer
	bnl	cr7,Only_1W_bkwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
	addi	D,D,4		// IU1 increment index
Only_1W_bkwd:
	bng	cr7,Only_2W_bkwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
	addi	D,D,4		// IU1 increment index
Only_2W_bkwd:
	bne	cr7,Only_B_bkwd	// b if there are no half words to store

	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
	addi	D,D,2		// IU1 increment index
Only_B_bkwd:
	bns	cr7,All_done_bkwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
	b	All_done_bkwd

Rt_just:	
	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN
All_done_bkwd:
	addi	BK,BK,-16	// IU1 Decrement destination byte count

// cr6 still holds the QW-vs-0 compare here; it is then re-armed as QW-vs-4
// to steer the loop exit below.
	ble	cr6,Last_load	// b if no Quad words to do
	mtctr	QW		// IU2 for (i=0;i<=QW;i++)-execution serializing
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
// Simple one-vector-per-iteration loop.  CR bit 25 is cr6.gt (QW>4), so
// bdnzf 25 keeps looping only while CTR!=0 and QW<=4.  When QW>4 the loop
// body runs once and control moves on to GT_4QW.
QW_loop:
	lvx	VS0,SRC,BK	// LSU Get SN-2 (or SN-1 if ADJ==0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-2 and SN-1 to DN-1
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-1
	addi	BK,BK,-16	// IU1 Decrement byte kount
	bdnzf	25,QW_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+BC-1-16)
	bgt	cr6,GT_4QW	// b if >4 quad words left

// Final step: load the first source vector and store the (possibly partial)
// first destination vector.  The blt below consumes cr0 from the
// "subf. SMD,Rt,SMD" near the top of the routine.
Last_load:	// if D-S>=0, next load will be from same address as last
	blt	No_ld_bkwd	// b if shifting right (S-D>=0)
	addi	SRC,SRC,16	// IU1 recorrect source if it was decremented
No_ld_bkwd:				
	lvx	VS0,0,SRC	// LSU Get last source SN-6 (guaranteed S0)
// Current 16 bytes is the last; we're done.
	dss	0		// Data stream stop
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-6 and SN-5 to DN-6
	subfic	D,DST,16	// IU1 How many bytes in first destination?
// cr1 was set by "cmpi cr1,0,D,0" while D still held DST[28:31].
	beq	cr1,Lt_just	// b if last destination is left justified

// Partial first vector.  cr7 receives D[28:31] = (16-DST)&15, the bytes from
// DST up to the next 16-byte boundary (same bit meaning as before: lt=8,
// gt=4, eq=2, so=1).  Pieces are stored smallest first, ascending from DST.
	mtcrf	0x01,D		// IU2 Put byte count remaining in cr7
	li	D,0		// IU1 Initialize index pointer
	bns	cr7,No_B_bkwd	// b if only even number of bytes to store

	stvebx	VPS0,DST,D	// LSU store first byte at DST+0
	addi	D,D,1		// IU1 increment index
No_B_bkwd:
	bne	cr7,No_H_bkwd	// b if only words to store
	stvehx	VPS0,DST,D	// LSU store halfword at DST+0/1
	addi	D,D,2		// IU1 increment index

No_H_bkwd:
	bng	cr7,No_W1_bkwd	// b if exactly zero or two words to store
	stvewx	VPS0,DST,D	// LSU store word 1 of one or three
	addi	D,D,4		// IU1 increment index

No_W1_bkwd:
	bnl	cr7,No_W2_bkwd	// b if there was only one word to store
	stvewx	VPS0,DST,D	// LSU store word 1 of two or 2 of three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DST,D	// LSU store word 2 of two or 3 of three
	b	No_W2_bkwd

Lt_just:
	stvx	VPS0,0,DST	// LSU Store 16 bytes at final dst addr D0
No_W2_bkwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
#endif
	blr			// Return destination address from entry

// More than 4 quad words: store single vectors until the next store address
// has bit 27 clear, then switch to the two-vectors-per-iteration loop.
// mtcrf 0x02 copies DNX[24:27] into cr6, so CR bit 27 (cr6.so) mirrors
// address bit 27; bdnzt 27 repeats while CTR!=0 and that bit is set.
GT_4QW:	// Do once if next store is to even half of cache line, else twice

	lvx	VS0,SRC,BK	// LSU Get SN-3 (or SN-2)
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;
	
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-3 and SN-2 to Dn-2
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper
	addi	DNX,DNX,-16	// IU1 Prepare to update cr6 next loop

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-2
	vor	VS3,VS0,VS0	// VIU Make a copy of lower vector
	addi	BK,BK,-16	// IU1 Decrement byte count by 16
	bdnzt	27,GT_4QW	// b if next store is to upper (odd) half of CL
// At this point next store will be to even address.

// STR is built as 0x0102ffe0 (lis + ori below), described by the comments as
// 2 blocks of 16 bytes with stride -32; presumably consumed by STRM_1.
	lis	STR,0x102	// IU1 Stream 2 blocks of 16 bytes
// cr6 is reloaded from DST[24:27] and stays live until "bns cr6,One_odd_QW".
	mtcrf	0x02,DST	// IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)
	addi	BL,BK,-16	// IU1 Create an alternate byte count - 16

	ori	STR,STR,0xffe0	// IU1 Stream stride -32B
	addi	SP8,SRC,-64	// IU1 Starting address for data stream touch
	bso	cr6,B32_bkwd	// b if DST[27] == 1; i.e, final store is odd

// Taken or not, this bdnz lands on B32_bkwd; its only effect is CTR -= 1.
	bdnz	B32_bkwd	// decrement counter for last odd QW store
// Two loads / two stores per iteration, indexed by BK and BL (= BK-16).
// CTR is decremented twice per pass: once by the bdz below and once by the
// closing bdnz.
B32_bkwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS2,SRC,BK	// LSU Get SN-4 (or SN-3)
	addi	SP8,SP8,-32	// IU1 Next starting address for data stream touch

	lvx	VS1,SRC,BL	// LSU Get SN-5 (or SN-4)
	vperm	VPS0,VS2,VS3,VP3	// VPU Align SN-4 and SN-3 to DN-3

	STRM_1			// LSU Stream 64 byte blocks ahead of loads

	DCBL			// LSU allocate next cache line

	vperm	VPS1,VS1,VS2,VP3	// VPU Align SN-5 and SN-4 to DN-4
	vor	VS3,VS1,VS1	// VIU1 Move SN-5 to SN-3

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-3
	addi	BK,BL,-16	// IU1 Decrement byte count
// The branch target is the very next instruction, so bdz changes nothing but
// CTR.
	bdz	Nxt_loc_bkwd	// always decrement and branch to next instr		

Nxt_loc_bkwd:
	stvx	VPS1,DST,BL	// LSU Store 16 bytes at DN-4
	addi	BL,BK,-16	// IU1 Decrement alternate byte count
	bdnz	B32_bkwd	// b if there are at least two more QWs to do

// cr6.so still reflects DST[27] from the mtcrf before the loop.
	bns	cr6,One_odd_QW	// b if there was one more odd QW to store
	b	Last_load

// Come here with two more loads and two stores to do
// (one load/store pair here, the other at Last_load).
One_odd_QW:
	lvx	VS1,SRC,BK	// LSU Get SN-6 (or SN-5)

	vperm	VPS1,VS1,VS3,VP3	// VPU Align SN-6 and SN-5 to DN-5

	stvx	VPS1,DST,BK	// LSU Store 16 bytes at DN-5

	b	Last_load

// End of memmove in AltiVec

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
v_memcpy:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]

MC_entry:	// enter here from memmove if DST-SRC>=BC; this should be faster
#ifdef VRSAVE
	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
#endif	
	lis	BLK,0x010c	// IU1 Stream 12 blocks of 16 bytes

	subf.	S,S,D		// IU1 if D-S<0 essentially shifting left

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
	ori	BLK,BLK,32	// IU1 Stream stride 32B

	STRM_F			// LSU Start data stream 0 at SRC
	addi	DR,DST,16	// IU1 Address of second dst vector
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

// If D-S<0 we are "kinda" shifting left with the right shift permute vector
// loaded to VP3 and we need both S0 and S1 to permute.  If D-S>=0 then the
// first loaded vector needs to be in the upper half of the permute pair and
// the lower half is a don't care then.
	bge	Ld_bytes_rt	// b if shifting right (D-S>=0)

	lvx	VS0,0,SRC	// LSU Get S0 load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memcpy, an example would
// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
	addi	SRC,SRC,16	// IU1 Increment src base (to keep BK useful)

Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is	
	lvx	VS1,0,SRC	// LSU Get S1 (or S0 if D-S>=0) in upper vector
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?

	subf	Rt,DST,DR	// IU1 How many bytes in first destination?
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	li	BK,0		// IU1 Initialize byte kount index

	mtcrf	0x01,Rt		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0

	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	VPS0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -