⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vec_memset.s

📁 Please let me download so long so bad
💻 S
📖 第 1 页 / 共 2 页
字号:
//------------------------------------------------------------------
// file:  vec_memset.S
//    AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2002
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and 
//	distribute the SOFTWARE so long as this entire notice is retained 
//	without alteration in any modified and/or redistributed versions, 
//	and that such modified versions are clearly identified as such.  
//	No licenses are granted by implication, estoppel or otherwise under 
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.  
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS 
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED 
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR 
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH 
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS 
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. 
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL 
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER 
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF 
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS 
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR 
//	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility 
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
//   Copies val into each of len characters beginning at ptr.
//                                       - Harbison&Steele 4th ed
//    (despite val being an int, this memset assumes it is never
//     more than a byte.  That seems to be correct from all the
//     memset functions I've seen but I don't know if ANSI allows
//     anthing longer.     Chuck Corley  12/21/02) 
// Returns:
//  void * ptr
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * bzero( char *ptr, int len);   
//   Copies 0 into each of len characters at ptr.
//                                       - Harbison&Steele 4th ed
// Returns:
//  void * ptr
//------------------------------------------------------------------

// Revision History:
//    Rev 0.0	Original                        Chuck Corley	02/09/03
//              Could benefit from changes added to memcpy
//    Rev 0.1	Revised per memcpy Rev 0.30     Chuck Corley	05/01/03
//
//  This is beta quality code; users are encouraged to make it faster.
//  ASSUMPTIONS:
//     Code is highly likely to be in the cache; data is not (streaming data)
//     Zero fill could be quite likely.
//     Moving fill byte from GPR to VR as below faster than stw->lvebx via stack

#define VRSV 256	//	VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16

// Register useage
#define Rt r0	// 	r0 when used as a temporary register	

#define DST r3	// 	entering: dest pointer; exiting: same dest pointer

#define FILL r4	// 	entering: fill char then fill word

#define BC r5	//	entering: Byte_Count then remaining Byte_Count

#define DBC r6//	dst + byte count

#define BK r7	//  	BC - 1 +/- (n*16)

#define Fsh r8	//	fill byte shifted right one nibble

#define DM1 r9//	dst -1 for byte-by-byte backwards initially
#define D r9	//	(dst+16)[0:27] - dst[28:31]
#define DNX r9	//	(dst+n*16)[28:31]
#define BL r9	//	second byte_kount index pointer

#define DR r10	//	(dst+16)[0:27]
#define QW r10	//  	number of cache lines

#define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define RSV r12	//  	storage for VRSAVE register if used

//  Condition register use (not including temporary cr0)
//      cr0[2]   = (FILL==0)?
//      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
// then cr1[2]   = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
//      cr6[2]   = (QW == 0)? 1 : 0;
// then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3]   = (third store[27] == 1)? 1: 0; (cache line alignment)
// then cr6[3]   = (last store[27] == 1)? 1: 0; (last store odd?)
//      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
// then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
// then cr7[0:3] = (DST+BC)[0:27]  (How many bytes (iff <16) in last vector?)

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// We use dcba which will noop to non-cacheable memory rather than
// dcbz which will cause an aligment exception.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
 // gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7    or    dcba DST,BK
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
#else
#define DCBK dcba DST,BK
#endif  // __ghs__
#endif  // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif  // NO_DCBA

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.global	memset     
memset:
#else
	.global	vec_memset     
vec_memset:
#endif

	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count
	rlwinm.	Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift

	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	Byte_set

	blr

v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAV=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0xe000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?
	li	BK,0		// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
	beq+	enter_bzero	// b if FILL==0

	lvsl	v0,0,Fsh	// LSU Move upper nibble to byte 0 of VR
	vspltisb	v1,4	// VPU Splat 0x4 to every byte

	lvsl	v2,0,FILL	// LSU Move lower nibble to byte 0 of VR

	vslb	v0,v0,v1	// VIU Move upper nibble to VR[0:3]

	vor	v0,v0,v2	// VIU Form FILL byte in VR[0:7]

	vspltb	v0,v0,0		// VPU Splat the fill byte to all bytes
enter_bzero:
	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store
	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:	
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	ble	cr6,Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4

QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -