// vec_memcpy.s
//------------------------------------------------------------------
// file: vec_memcpy.S
// AltiVec enabled version of memcpy and bcopy
//------------------------------------------------------------------
//------------------------------------------------------------------
// Copyright Motorola, Inc. 2003
// ALL RIGHTS RESERVED
//
// You are hereby granted a copyright license to use, modify, and
// distribute the SOFTWARE so long as this entire notice is retained
// without alteration in any modified and/or redistributed versions,
// and that such modified versions are clearly identified as such.
// No licenses are granted by implication, estoppel or otherwise under
// any patents or trademarks of Motorola, Inc.
//
// The SOFTWARE is provided on an "AS IS" basis and without warranty.
// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
// for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void * memcpy(void *dst, const void *src, size_t len);
// Returns:
// void *dst
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void * memmove( void *dst, const void *src, size_t len );
// Copies len characters from src to dst and returns the value of
// dst. Works correctly for overlapping memory regions.
// - Harbison&Steele 4th ed (corrected as to return)
// Returns:
// void *dst
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void * bcopy(const void *src, void *dst, size_t len);
// Returns:
// void *dst
//------------------------------------------------------------------
// memcpy and memmove are combined into one entry point here because of
// the similarity of operation and need to create fool-proof code.
// The following conditions determine what is "fool proof":
//
// if: then single entry:
// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memcpy
// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC must b to v_memcpy
// (DST-SRC)<0 && BC<MIN_VEC copy fwd byte-by-byte
// (DST-SRC)==0 || BC==0 will just return
// (DST-SRC)>0 && BC<MIN_VEC copy bkwd byte-by-byte
// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC must b to v_memmove
// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memmove
// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
// this code will branch to v_memcpy anyway for maximum performance.
// Revision History:
// Rev 0.0 Original Chuck Corley 02/03/03
// Can still add dst, 128B loop, and aligned option
// Rev 0.01 Fixed JY's seg-fault violation CJC 02/17/03
// Rev 0.1 Added 128B loop and dst; cndtnlzd dcbz CJC 02/18/03
// (Creating separate path for QW aligned didn't help much)
// Rev 0.11 Small code schdling; chngd dst for memmove CJC 02/23/03
// Rev 0.20 Eliminated alternate entry and cleanup CJC 02/27/03
// Rev 0.21 Improved loop branch targets for v_memcpy CJC 03/01/03
// Rev 0.22 Experimented with dst (sent to H.) CJC 03/02/03
// Rev 0.23 Substituted dcba for dcbz (sent to JY) CJC 03/08/03
// Rev 0.24 Use two dst streams CJC 03/12/03
// Rev 0.25 Fix for all compilers, cleanup, and release with
// libmotovec.a rev 0.10 CJC 03/14/03
// Rev 0.30 Fix for pre-empted destination (SNDF-DS) CJC 04/02/03
//
// Between Rev 0.25 and 0.30 the code was revised to store elements of
// source at destination when first and/or last vector are less than 16
// bytes. A reviewer at SNDF observed that loading the destination vector
// for merging exposed the "uninvolved" destination bytes to incoherency
// if an interrupt pre-empted this routine and modified the "uninvolved"
// destination vector(s) while held in register for merging. It seems
// like a low possibility but this revision is no longer subject to that
// possibility. (It is also slightly faster than Rev 0.25.)
// This is beta quality code; users are encouraged to make it faster.
// ASSUMPTIONS:
// Code is highly likely to be in the cache; data is not (streaming data)
#define VRSV 256 // VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16
// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
#define MIN_OVL 128
// Register usage (GPRs are multiply aliased; the name in use at any point
// documents the value currently held in that register)
#define Rt r0 // r0 when used as a temporary register
#define DST r3 // entering: dst pointer; exiting: same dst pointer
#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove
#define BC r5 // entering: Byte_Count
#define PCS r6 // save for partial checksum entering
#define DMS r7 // dst - src initially
#define BK r7 // BC - 1 +/- (n*16)
// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )"
// if you don't put the comment right after the r7. CJC 030314
#define SM1 r8// src -1 for byte-by-byte forwards initially
#define S r8 // src[28:31]
#define SMD r8 // src[0:27]-dst[0:27]
#define STR r8 // data stream touch block & stride info for Big_loop
#define DM1 r9// dst -1 for byte-by-byte forwards initially
#define D r9 // dst[28:31]
#define DNX r9 // (dst+n*16)[28:31]
#define BL r9 // second byte_kount index pointer
#define SBC r10// src + byte count initially then src[28:31]
#define BLK r10 // temporary data stream touch block & stride info
#define DR r10 // (dst+16)[0:27]
#define QW r10 // number of quad words (vectors)
#define DBC r11// dst + byte count initially
#define BLL r11 // temporary data stream touch block & stride info
#define SBK r11 // (src+byte_count-1)
#define SBR r11 // (src+byte_count-1)[0:27]
#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define BIG r11 // QW/8 or 128 byte loop count
#define SP8 r11 // SRC + n*128 (8 QWs) for data streaming after first call
#define RSV r12 // storage for VRSAVE register if used
#define VS0 v0 // src vector for permuting
#define VS1 v1 // src vector for permuting
#define VP3 v2 // d - s permute register
#define VPS0 v3 // permuted source vector to store
#define VPS1 v4 // 2nd permuted source vector to store
#define VPS2 v5 // additional permuted src in Big loop
#define VS2 v6 // src vector for permuting
#define VPS3 v6 // additional permuted src in Big loop
#define VS3 v7 // additional src load in Big loop
#define VPS4 v7 // additional permuted src in Big loop
#define VS4 v8 // additional src load in Big loop
#define VPS5 v8 // additional permuted src in Big loop
#define VS5 v9 // additional src load in Big loop
#define VPS6 v9 // additional permuted src in Big loop
#define VS6 v10 // additional src load in Big loop
#define VPS7 v10 // additional permuted src in Big loop
#define VS7 v11 // additional src load in Big loop
// Conditionalize the use of dcba. It will help if the data is
// not in cache and hurt if it is. Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// (Rev 0.23 substituted dcba for dcbz; see revision history above.)
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc and codewarrior and diab don't assemble dcba; emit the raw opcode
#define DCBK .long 0x7c033dec
// dcba r3,r7 or dcba DST,BK
#define DCBL .long 0x7c034dec
// dcba r3,r9 or dcba DST,BL
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
.macro DCBL
.long 0x7c034dec
.endm
#else
#define DCBK dcba DST,BK
#define DCBL dcba DST,BL
#endif // __ghs__
#endif // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#define DCBL nop
#endif // NO_DCBA
// Conditionalize the use of dst (data stream touch). It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz). Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_B dst SBC,BLL,0
#define STRM_F dst SRC,BLK,0
#define STRM_1 dst SP8,STR,1
#else
#define STRM_B nop
#define STRM_F nop
#define STRM_1 nop
#endif
// Condition register use
// cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;);
// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
// cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0; (Overlap too small for Big loop?)
// cr6[1,2] = (DST-SRC>=BC)?1:0; (Okay for v_memmove to copy forward?)
// then cr6[2] = (QW == 0)? 1 : 0; (Any full vectors to move?)
// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
// then cr7[1] = (QW > 14)? 1 : 0; (>14 vectors to move?)
// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
//------------------------------------------------------------------
// Common entry point for memcpy/memmove (named vec_memcpy/vec_memmove
// unless built with LIBMOTOVEC).  PowerPC ABI: r3 = dst, r4 = src,
// r5 = byte count; returns r3 (dst), which this path never modifies.
// Moves of BC <= MIN_VEC bytes complete here byte-by-byte in the
// overlap-safe direction; larger moves branch to the AltiVec paths
// v_memcpy (forward) or v_memmove (backward), defined later in this
// file.  The dispatch logic implements the truth table in the file
// header (dst-src sign and magnitude vs. BC).
//------------------------------------------------------------------
.text
#ifdef __MWERKS__
.align 32
#else
.align 5
#endif
#ifdef LIBMOTOVEC
.global memmove
memmove:
nop // IU1 Compilers forget first label
.global memcpy
memcpy:
#else
.global vec_memmove
vec_memmove:
nop // IU1 Only way I know to preserve both labels
.global vec_memcpy
vec_memcpy:
#endif
// subf. records dst-src in cr0: eq => nothing to do, gt => dst above
// src (must copy backward), lt => copy forward is safe.
subf. DMS,SRC,DST // IU1 Compute dst-src difference
cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves
cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
addi SM1,SRC,-1 // IU1 Pre-bias and duplicate src for fwd
addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
add SBC,SRC,BC // IU1 Pre-bias and duplicate src for bkwd
beqlr // return if DST = SRC (cr0 from subf. above)
add DBC,DST,BC // IU1 Pre-bias and duplicate destination
subf Rt,DST,SRC // IU1 Form |DST-SRC| if DST-SRC<0
beqlr cr1 // return if BC = 0
bgt Cpy_bkwd // b if DST-SRC>0 (have to copy backward)
cmpi cr5,0,Rt,MIN_OVL // IU1 (|DST-SRC|>128)?1:0; for v_memcpy
bgt cr7,v_memcpy // b if BC>MIN_VEC (okay to copy vectors fwd)
// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
mtctr BC // i=BC; do ...;i--; while (i>0)
Byte_cpy_fwd:
lbzu Rt,1(SM1) // LSU * ++(DST-1) = * ++(SRC-1)
stbu Rt,1(DM1) // LSU Store it
bdnz Byte_cpy_fwd
blr
nop // IU1 Improve next label as branch target
Cpy_bkwd:
cmpi cr5,0,DMS,MIN_OVL // IU1 ((DST-SRC)>128)?1:0; for v_memcpy
cmp cr6,0,DMS,BC // IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
bgt cr7,v_memmove // b if BC>MIN_VEC (copy vectors bkwd)
// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
mtctr BC // i=BC; do ...;i--; while (i>0)
Byte_cpy_bwd:
lbzu Rt,-1(SBC) // LSU * --(DST+BC) = * --(SRC+BC)
stbu Rt,-1(DBC) // LSU Store it
bdnz Byte_cpy_bwd
blr