// vec_memcpy.s
//------------------------------------------------------------------
// file: vec_memcpy.S
// AltiVec enabled version of memcpy and bcopy
//------------------------------------------------------------------
//------------------------------------------------------------------
// Copyright Motorola, Inc. 2003
// ALL RIGHTS RESERVED
//
// You are hereby granted a copyright license to use, modify, and
// distribute the SOFTWARE so long as this entire notice is retained
// without alteration in any modified and/or redistributed versions,
// and that such modified versions are clearly identified as such.
// No licenses are granted by implication, estoppel or otherwise under
// any patents or trademarks of Motorola, Inc.
//
// The SOFTWARE is provided on an "AS IS" basis and without warranty.
// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
// for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void * memcpy(void *dst, const void *src, size_t len);
// Returns:
// void *dst
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void * memmove( void *dst, const void *src, size_t len );
// Copies len characters from src to dst and returns the value of
// dst. Works correctly for overlapping memory regions.
// - Harbison&Steele 4th ed (corrected as to return)
// Returns:
// void *dst
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void * bcopy(const void *src, void *dst, size_t len);
// Returns:
// void *dst
//------------------------------------------------------------------
// memcpy and memmove are combined into one entry point here because of
// the similarity of operation and need to create fool-proof code.
// The following conditions determine what is "fool proof":
//
// if: then single entry:
// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memcpy
// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC must b to v_memcpy
// (DST-SRC)<0 && BC<MIN_VEC copy fwd byte-by-byte
// (DST-SRC)==0 || BC==0 will just return
// (DST-SRC)>0 && BC<MIN_VEC copy bkwd byte-by-byte
// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC must b to v_memmove
// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memmove
// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
// this code will branch to v_memcpy anyway for maximum performance.
// Revision History:
// Rev 0.0 Original Chuck Corley 02/03/03
// Can still add dst, 128B loop, and aligned option
// Rev 0.01 Fixed JY's seg-fault violation CJC 02/17/03
// Rev 0.1 Added 128B loop and dst; cndtnlzd dcbz CJC 02/18/03
// (Creating separate path for QW aligned didn't help much)
// Rev 0.11 Small code schdling; chngd dst for memmove CJC 02/23/03
// Rev 0.20 Eliminated alternate entry and cleanup CJC 02/27/03
// Rev 0.21 Improved loop branch targets for v_memcpy CJC 03/01/03
// Rev 0.22 Experimented with dst (sent to H.) CJC 03/02/03
// Rev 0.23 Substituted dcba for dcbz (sent to JY) CJC 03/08/03
// Rev 0.24 Use two dst streams CJC 03/12/03
// Rev 0.25 Fix for all compilers, cleanup, and release with
// libmotovec.a rev 0.10 CJC 03/14/03
// Rev 0.30 Fix for pre-empted destination (SNDF-DS) CJC 04/02/03
//
// Between Rev 0.25 and 0.30 the code was revised to store elements of
// source at destination when first and/or last vector are less than 16
// bytes. A reviewer at SNDF observed that loading the destination vector
// for merging exposed the "uninvolved" destination bytes to incoherency
// if an interrupt pre-empted this routine and modified the "uninvolved"
// destination vector(s) while held in register for merging. It seems
// like a low possibility but this revision is no longer subject to that
// possibility. (It is also slightly faster than Rev 0.25.)
// This is beta quality code; users are encouraged to make it faster.
// ASSUMPTIONS:
// Code is highly likely to be in the cache; data is not (streaming data)
#define VRSV 256 // VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16
// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
#define MIN_OVL 128
// Register usage (GPRs are multiply aliased; the name in use at any point
// documents the value currently held in that register)
#define Rt r0 // r0 when used as a temporary register
#define DST r3 // entering: dst pointer; exiting: same dst pointer
#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove
#define BC r5 // entering: Byte_Count
#define PCS r6 // save for partial checksum entering
#define DMS r7 // dst - src initially
#define BK r7 // BC - 1 +/- (n*16)
// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )"
// if you don't put the comment right after the r7. CJC 030314
#define SM1 r8// src -1 for byte-by-byte forwards initially
#define S r8 // src[28:31]
#define SMD r8 // src[0:27]-dst[0:27]
#define STR r8 // data stream touch block & stride info for Big_loop
#define DM1 r9// dst -1 for byte-by-byte forwards initially
#define D r9 // dst[28:31]
#define DNX r9 // (dst+n*16)[28:31]
#define BL r9 // second byte_kount index pointer
#define SBC r10// src + byte count initially then src[28:31]
#define BLK r10 // temporary data stream touch block & stride info
#define DR r10 // (dst+16)[0:27]
#define QW r10 // number of quad words (vectors)
#define DBC r11// dst + byte count initially
#define BLL r11 // temporary data stream touch block & stride info
#define SBK r11 // (src+byte_count-1)
#define SBR r11 // (src+byte_count-1)[0:27]
#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define BIG r11 // QW/8 or 128 byte loop count
#define SP8 r11 // SRC + n*128 (8 QWs) for data streaming after first call
#define RSV r12 // storage for VRSAVE register if used
#define VS0 v0 // src vector for permuting
#define VS1 v1 // src vector for permuting
#define VP3 v2 // d - s permute register
#define VPS0 v3 // permuted source vector to store
#define VPS1 v4 // 2nd permuted source vector to store
#define VPS2 v5 // additional permuted src in Big loop
#define VS2 v6 // src vector for permuting
#define VPS3 v6 // additional permuted src in Big loop
#define VS3 v7 // additional src load in Big loop
#define VPS4 v7 // additional permuted src in Big loop
#define VS4 v8 // additional src load in Big loop
#define VPS5 v8 // additional permuted src in Big loop
#define VS5 v9 // additional src load in Big loop
#define VPS6 v9 // additional permuted src in Big loop
#define VS6 v10 // additional src load in Big loop
#define VPS7 v10 // additional permuted src in Big loop
#define VS7 v11 // additional src load in Big loop
// Conditionalize the use of dcba. It will help if the data is
// not in cache and hurt if it is. Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// (Rev 0.23 substituted dcba for dcbz; see revision history above.)
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc and codewarrior and diab don't assemble dcba; emit the raw opcode
#define DCBK .long 0x7c033dec
// dcba r3,r7 or dcba DST,BK
#define DCBL .long 0x7c034dec
// dcba r3,r9 or dcba DST,BL
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
.macro DCBL
.long 0x7c034dec
.endm
#else
#define DCBK dcba DST,BK
#define DCBL dcba DST,BL
#endif // __ghs__
#endif // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#define DCBL nop
#endif // NO_DCBA
// Conditionalize the use of dst (data stream touch). It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz). Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_B dst SBC,BLL,0
#define STRM_F dst SRC,BLK,0
#define STRM_1 dst SP8,STR,1
#else
#define STRM_B nop
#define STRM_F nop
#define STRM_1 nop
#endif
// Condition register use
// cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;);
// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
// cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0; (Overlap too small for Big loop?)
// cr6[1,2] = (DST-SRC>=BC)?1:0; (Okay for v_memmove to copy forward?)
// then cr6[2] = (QW == 0)? 1 : 0; (Any full vectors to move?)
// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
// then cr7[1] = (QW > 14)? 1 : 0; (>14 vectors to move?)
// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
//------------------------------------------------------------------
// Common entry point for memcpy/memmove (named vec_memcpy/vec_memmove
// unless built with LIBMOTOVEC).  PowerPC ABI: r3 = dst, r4 = src,
// r5 = byte count; returns r3 (dst), which this path never modifies.
// Moves of BC <= MIN_VEC bytes complete here byte-by-byte in the
// overlap-safe direction; larger moves branch to the AltiVec paths
// v_memcpy (forward) or v_memmove (backward), defined later in this
// file.  The dispatch logic implements the truth table in the file
// header (dst-src sign and magnitude vs. BC).
//------------------------------------------------------------------
.text
#ifdef __MWERKS__
.align 32
#else
.align 5
#endif
#ifdef LIBMOTOVEC
.global memmove
memmove:
nop // IU1 Compilers forget first label
.global memcpy
memcpy:
#else
.global vec_memmove
vec_memmove:
nop // IU1 Only way I know to preserve both labels
.global vec_memcpy
vec_memcpy:
#endif
// subf. records dst-src in cr0: eq => nothing to do, gt => dst above
// src (must copy backward), lt => copy forward is safe.
subf. DMS,SRC,DST // IU1 Compute dst-src difference
cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves
cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
addi SM1,SRC,-1 // IU1 Pre-bias and duplicate src for fwd
addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
add SBC,SRC,BC // IU1 Pre-bias and duplicate src for bkwd
beqlr // return if DST = SRC (cr0 from subf. above)
add DBC,DST,BC // IU1 Pre-bias and duplicate destination
subf Rt,DST,SRC // IU1 Form |DST-SRC| if DST-SRC<0
beqlr cr1 // return if BC = 0
bgt Cpy_bkwd // b if DST-SRC>0 (have to copy backward)
cmpi cr5,0,Rt,MIN_OVL // IU1 (|DST-SRC|>128)?1:0; for v_memcpy
bgt cr7,v_memcpy // b if BC>MIN_VEC (okay to copy vectors fwd)
// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
mtctr BC // i=BC; do ...;i--; while (i>0)
Byte_cpy_fwd:
lbzu Rt,1(SM1) // LSU * ++(DST-1) = * ++(SRC-1)
stbu Rt,1(DM1) // LSU Store it
bdnz Byte_cpy_fwd
blr
nop // IU1 Improve next label as branch target
Cpy_bkwd:
cmpi cr5,0,DMS,MIN_OVL // IU1 ((DST-SRC)>128)?1:0; for v_memcpy
cmp cr6,0,DMS,BC // IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
bgt cr7,v_memmove // b if BC>MIN_VEC (copy vectors bkwd)
// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
mtctr BC // i=BC; do ...;i--; while (i>0)
Byte_cpy_bwd:
lbzu Rt,-1(SBC) // LSU * --(DST+BC) = * --(SRC+BC)
stbu Rt,-1(DBC) // LSU Store it
bdnz Byte_cpy_bwd
blr