// vec_memset.s
stvebx v0,DBK,BL // LSU store one byte if necessary
b All_done_fwd
Rt_just_fwd:
stvx v0,DST,BK // LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
blr // Return destination address from entry
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
GT_4QW_fwd: // Do once if next store is to odd half of cache line, else twice
addi QW,QW,-1 // IU1 Keeping track of QWs stored
mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
addi DNX,DNX,16 // IU1 Update cr6 for next loop
stvx v0,DST,BK // LSU Store 16 bytes at D2
addi BK,BK,16 // IU1 Increment byte count by 16
bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
bdnz B32_fwd // decrement counter for last QW store odd
B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
DCBK // LSU Kill the cache line instead of RWITM
stvx v0,DST,BK // LSU Store 16 bytes at D11
addi BK,BK,16 // IU1 Increment byte count
bdz Nxt_loc_fwd // always decrement and branch to next instr
Nxt_loc_fwd:
stvx v0,DST,BK // LSU Store 16 bytes at D12
addi BK,BK,16 // IU1 Increment byte count
bdnz B32_fwd // b if there are at least two more QWs to do
bso cr6,One_even_QW // b if there is one even and one odd QW to store
b Last_QW // b if last store is to even address
// Come here with two more stores to do
One_even_QW:
stvx v0,DST,BK // LSU Store 16 bytes at D13
addi BK,BK,16 // IU1 Increment byte count
b Last_QW
// End of memset in AltiVec
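//
// For orientation, a hedged C sketch of the overall strategy (names like
// memset_sketch and vfill are illustrative, not part of this file): store
// leading bytes until DST is 16-byte aligned, stream 16-byte vector stores
// through the body, then store the trailing partial vector.
//
//   void *memset_sketch(void *dst, int fill, size_t bc)
//   {
//       unsigned char *p = dst;
//       while (bc && ((unsigned long)p & 15)) { *p++ = fill; bc--; }
//       while (bc >= 16) {               // stvx of a splatted fill vector
//           vec_st(vfill, 0, p); p += 16; bc -= 16;
//       }
//       while (bc--) *p++ = fill;
//       return dst;
//   }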
#define BCz r4 // in bzero r4 enters with byte count
#ifdef __MWERKS__
.align 32
#else
.align 5
#endif
#ifdef LIBMOTOVEC
.global bzero
bzero:
#else
.global vec_bzero
vec_bzero:
#endif
mr BC,BCz // IU1 arg[2] is BC here, not FILL
li FILL,0 // IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
b memset
#else
b vec_memset
#endif
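// In C terms, the stub above amounts to (hedged sketch, not from the source):
//
//   void bzero(void *s, size_t n) { memset(s, 0, n); }
//
// It moves the byte count into memset's BC register, zeroes FILL, and
// branches into memset proper.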
// cacheable_memzero employs dcbz to clear cacheable memory 32 bytes
// (one cache line) at a time. As with bzero, the second argument on entry
// is BC. Using this on non-cacheable memory will generate an alignment
// exception.
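// Conceptually (a hedged sketch; dcbz_line() is a stand-in for the dcbz
// instruction, not a real C function):
//
//   for (p = first_aligned_line; p + 32 <= end; p += 32)
//       dcbz_line(p);   // establish the line in cache, already zeroed
//
// dcbz avoids the read-with-intent-to-modify (RWITM) bus transaction that
// ordinary stores to a line not yet in the cache would trigger.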
.text
#ifdef __MWERKS__
.align 32
#else
.align 5
#endif
#ifdef LIBMOTOVEC
.global cacheable_memzero
cacheable_memzero:
#else
.global vec_cacheable_memzero
vec_cacheable_memzero:
#endif
mr BC,BCz // IU1 arg[2] is BC here, not FILL
li FILL,0 // IU1 for bzero FILL=0
cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
addi DR,DST,16 // IU1 Address of second dst vector
add DBC,DST,BC // IU1 Address of last dst byte + 1
bgt cr7,c_v_memset // b if BC>MIN_VEC
mtctr BC // for (i=1;i<=BC;i++)
beqlr cr1 // return if BC = 0
c_Byte_set:
stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
bdnz c_Byte_set
blr
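// The scalar path above is effectively (C sketch of the stbu loop):
//
//   unsigned char *p = (unsigned char *)dst - 1;   // pre-biased, like DM1
//   while (bc--) *++p = 0;                         // stbu FILL,1(DM1)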
c_v_memset:
// Byte counts < MIN_VEC have already been handled by the scalar code above,
// so this path never deals with small block sets.
// For systems using VRSAVE, define VRSAVE when compiling. For systems
// that don't, make sure VRSAVE is undefined.
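// Note: v0 is the only vector register this routine touches, so the
// oris Rt,RSV,0x8000 below sets just VRSAVE bit 0 (v0) in the saved mask.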
#ifdef VRSAVE
mfspr RSV,VRSV // IU2 Get current VRSAVE contents
#endif
rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
addi DBK,DBC,-1 // IU1 Address of last dst byte
#ifdef VRSAVE
oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
#endif
subf D,DST,DR // IU1 How many bytes in first destination?
li BK,0 // IU1 Initialize byte kount index
#ifdef VRSAVE
mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
#endif
vxor v0,v0,v0 // VIU Clear v0
subf QW,DR,DBK // IU1 Bytes of full vectors to store (-16)
cmpi cr1,0,D,16 // IU1 Is D0 left justified?
mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
beq cr1,c_Left_just // b if D0 is left justified
bns cr7,c_No_B_fwd // b if only even number of bytes to store
stvebx v0,DST,BK // LSU store first byte at DST+0
addi BK,BK,1 // IU1 increment index
c_No_B_fwd:
bne cr7,c_No_H_fwd // b if only words to store
stvehx v0,DST,BK // LSU store halfword at DST+0/1
addi BK,BK,2 // IU1 increment index
c_No_H_fwd:
bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
stvewx v0,DST,BK // LSU store word 1 of one or three
addi BK,BK,4 // IU1 increment index
c_No_W1_fwd:
bnl cr7,c_No_W2_fwd // b if there was only one word to store
stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
addi BK,BK,4 // IU1 increment index
stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
b c_No_W2_fwd
c_Left_just:
stvx v0,0,DST // LSU Store 16 bytes at D0
c_No_W2_fwd:
rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
cmpi cr6,0,QW,0 // IU1 Any full vectors to store?
li BK,16 // IU1 Re-initialize byte kount index
cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
ble cr6,c_Last_QW // b if no Quad words to do
mtctr QW // IU2 for (i=0;i<=QW;i++)
cmpi cr6,0,QW,4 // IU1 Check QW>4
c_QW_loop:
stvx v0,DST,BK // LSU Store 16 fill bytes
addi BK,BK,16 // IU1 Increment byte kount index
bdnzf 25,c_QW_loop // b if 4 or less quad words to do
add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
addi QW,QW,-1 // IU1 One more QW stored by now
bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
c_Last_QW: // Next vector is the last; we're done.
mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
beq cr1,c_Rt_just_fwd // b if last destination is right justified
rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
li BL,0 // IU1 Initialize index pointer
bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
stvewx v0,DBK,BL // LSU store word 1 of two or three
addi BL,BL,4 // IU1 increment index
stvewx v0,DBK,BL // LSU store word 2 of two or three
addi BL,BL,4 // IU1 increment index
c_Only_1W_fwd:
bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store
stvewx v0,DBK,BL // LSU store word 3 of three if necessary
addi BL,BL,4 // IU1 increment index
c_Only_2W_fwd:
bne cr7,c_Only_B_fwd // b if there are no half words to store
stvehx v0,DBK,BL // LSU store one halfword if necessary
addi BL,BL,2 // IU1 increment index
c_Only_B_fwd:
bns cr7,c_All_done_fwd // b if there are no bytes to store
stvebx v0,DBK,BL // LSU store one byte if necessary
b c_All_done_fwd
c_Rt_just_fwd:
stvx v0,DST,BK // LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
blr // Return destination address from entry
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
c_GT_4QW_fwd: // Do once if next store is to odd half of cache line, else twice
addi QW,QW,-1 // IU1 Keeping track of QWs stored
mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
addi DNX,DNX,16 // IU1 Update cr6 for next loop
stvx v0,DST,BK // LSU Store 16 bytes at D2
addi BK,BK,16 // IU1 Increment byte count by 16
bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
bns cr6,c_B32_fwd // b if DST[27] == 0; i.e., final store is even
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
bdnz c_B32_fwd // decrement counter for last QW store odd
c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
dcbz DST,BK // LSU zero whole cache line
bdz c_Nxt_loc_fwd // always decrement and branch to next instr
c_Nxt_loc_fwd:
addi BK,BK,32 // IU1 Increment byte count
bdnz c_B32_fwd // b if there are at least two more QWs to do
bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
b c_Last_QW // b if last store is to even address
// Come here with two more stores to do
c_One_even_QW:
stvx v0,DST,BK // LSU Store 16 bytes at D13
addi BK,BK,16 // IU1 Increment byte count
b c_Last_QW
// End of cacheable_memzero in AltiVec
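//
// Usage sketch (hedged; the buffer must be in cacheable memory, since dcbz
// takes an alignment exception on cache-inhibited or write-through pages):
//
//   extern void cacheable_memzero(void *dst, size_t bc);
//   cacheable_memzero(buf, sizeof buf);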