// File: vec_memcpy.s
// ---------------------------------------------------------------------------
// v_memmove -- AltiVec path of memmove for the case where a forward copy is
// not safe.
//
// Inputs as used below: SRC = source address, DST = destination address,
// BC = byte count.  cr6 must already hold a comparison made before this
// label; when its "ge" condition is true the routine leaves for MC_entry
// (inside v_memcpy) and the forward copy is used instead.
//
// Otherwise the block is copied from its highest address downward, one
// aligned 16-byte vector at a time:
//   1. set up VRSAVE (optional), the permute vector VP3 and a data stream;
//   2. store the last destination vector DN (partial unless right justified);
//   3. QW_loop / GT_4QW: store the full 16-byte vectors in descending order;
//   4. Last_load: store the first destination vector D0 (partial unless left
//      justified), restore VRSAVE and return.
//
// NOTE(review): S, D, SMD, DMS, SBC, SBK, Rt, ... are symbolic register names
// defined outside this section, and STRM_B is a macro defined elsewhere.
// Several names may share one physical register -- confirm against the
// definitions before reordering or reusing any of them.
// NOTE(review): the two .align operands presumably request the same 16-byte
// alignment (byte count for Metrowerks, power of two otherwise) -- confirm.
// ---------------------------------------------------------------------------
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
v_memmove:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.
// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
mfspr RSV,VRSV // IU2 Get current VRSAVE contents
#endif
rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
bge cr6,MC_entry // b to v_memcpy if DST-SRC>=BC (fwd copy OK)
// ---- Backward copy setup ----
#ifdef VRSAVE
oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine
#endif
lis BLL,0x010c // IU1 Stream 12 blocks of 16 bytes
// subf. sets cr0 from SMD = S - D; the bgt below consumes that result.
subf. SMD,D,S // IU1 if S-D<0 essentially shifting right
#ifdef VRSAVE
mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
#endif
// NOTE(review): DMS is not written anywhere in this section; it is presumably
// dst-src left in a register by the code before this label -- confirm.
lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
ori BLL,BLL,0xffe0 // IU1 Stream stride -32B
STRM_B // LSU Start data stream at SRC+BC
addi SBK,SBC,-1 // IU1 Address of last src byte
bgt Rt_shft // Bytes from upper vector = (s-d>0)?s-d:16+s-d;
addi SMD,SMD,16 // IU1 Save 16-(d-s)
Rt_shft:
rlwinm SBR,SBK,0,0,27 // IU1 (SRC+BC-1)[0:27]
addi BK,BC,-1 // IU1 Initialize byte index
subf Rt,SBR,SBC // IU1 How many bytes in first source?
add DBK,DST,BK // IU1 Address of last dst byte
addi DR,DST,16 // IU1 Address of second dst vector
// This subf. rewrites cr0.  The blt just below and the blt at Last_load both
// test this result; no instruction visible in between has a record form
// (the macros used on the way are not shown here).
subf. SMD,Rt,SMD // IU1 if bytes in 1st src>Bytes in 1st permute
rlwinm Rt,DBK,0,28,31 // IU1 (DST+BC-1)[28:31]
rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
// If there are more useful bytes in the upper vector of a permute pair than we
// will get in the first permute, the first loaded vector needs to be in the
// lower half of the permute pair. The upper half is a don't care then.
blt Get_bytes_rt // b if shifting left (D-S>=0)
lvx VS1,SRC,BK // LSU Get SN load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once. For vec_memmove, an example would
// be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.
addi SRC,SRC,-16 // IU1 Decrement src base (to keep BK useful)
Get_bytes_rt: // Come here to get VS0 & Don't care what VS1 is
lvx VS0,SRC,BK // LSU Get SN-1 (SN if D-S<0) in lower vector
subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
// Three independent condition fields are prepared here:
//   cr7 - is the last destination byte at offset 0xF of its vector?
//   cr1 - is DST itself 16-byte aligned?  (consumed much later, at Last_load)
//   cr6 - are there any full 16-byte vectors between the two ends?
cmpi cr7,0,Rt,0xF // IU1 Is Dn right justified?
cmpi cr1,0,D,0 // IU1 Is D0 left justified?
rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
add Rt,DST,BC // IU1 Refresh the value of DST+BC
cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
vperm VPS0,VS0,VS1,VP3 // VPU Align SN-1 and SN to DN
vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
beq cr7,Rt_just // b if DN is right justified
// ---- Partial store of the final destination vector DN ----
// mtcrf 0x01 copies the low four bits of Rt (= DST+BC) into cr7, so the
// branches below test the byte count in the final vector bit by bit:
//   lt = 8 (two words), gt = 4 (one word), eq = 2 (halfword), so = 1 (byte).
// Pieces are stored in ascending address order from the aligned base in DBK.
mtcrf 0x01,Rt // IU2 Put final vector byte count in cr7
rlwinm DBK,DBK,0,0,27 // IU1 Address of first byte of final vector
li D,0 // IU1 Initialize an index pointer
bnl cr7,Only_1W_bkwd // b if there was only one or zero words to store
stvewx VPS0,DBK,D // LSU store word 1 of two or three
addi D,D,4 // IU1 increment index
stvewx VPS0,DBK,D // LSU store word 2 of two or three
addi D,D,4 // IU1 increment index
Only_1W_bkwd:
bng cr7,Only_2W_bkwd // b if there were only two or zero words to store
stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
addi D,D,4 // IU1 increment index
Only_2W_bkwd:
bne cr7,Only_B_bkwd // b if there are no half words to store
stvehx VPS0,DBK,D // LSU store one halfword if necessary
addi D,D,2 // IU1 increment index
Only_B_bkwd:
bns cr7,All_done_bkwd // b if there are no bytes to store
stvebx VPS0,DBK,D // LSU store one byte if necessary
b All_done_bkwd
Rt_just:
stvx VPS0,DST,BK // LSU Store 16 bytes at DN
All_done_bkwd:
addi BK,BK,-16 // IU1 Decrement destination byte count
ble cr6,Last_load // b if no Quad words to do
// ---- Full 16-byte vectors, highest address first ----
// CTR = QW.  cr6 is re-used for "QW > 4"; CR bit 25 is the gt bit of cr6.
// bdnzf 25 decrements CTR and loops only while CTR != 0 and that bit is
// clear, so with QW > 4 exactly one vector is moved here before control
// passes to GT_4QW, and with QW <= 4 the loop runs until CTR reaches zero.
mtctr QW // IU2 for (i=0;i<=QW;i++)-execution serializing
cmpi cr6,0,QW,4 // IU1 Check QW>4
QW_loop:
lvx VS0,SRC,BK // LSU Get SN-2 (or SN-1 if ADJ==0)
vperm VPS0,VS0,VS1,VP3 // VPU Align SN-2 and SN-1 to DN-1
vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
stvx VPS0,DST,BK // LSU Store 16 bytes at DN-1
addi BK,BK,-16 // IU1 Decrement byte count
bdnzf 25,QW_loop // b if 4 or less quad words to do
add DNX,DST,BK // IU1 address of next store (DST+BC-1-16)
bgt cr6,GT_4QW // b if >4 quad words left
// ---- First destination vector D0 ----
// The blt tests cr0 as left by the "subf. SMD,Rt,SMD" above; it mirrors the
// blt before Get_bytes_rt, so SRC is re-incremented only on the path that
// decremented it.
Last_load: // if D-S>=0, next load will be from same address as last
blt No_ld_bkwd // b if shifting right (S-D>=0)
addi SRC,SRC,16 // IU1 recorrect source if it was decremented
No_ld_bkwd:
lvx VS0,0,SRC // LSU Get last source SN-6 (guaranteed S0)
// Current 16 bytes is the last; we're done.
dss 0 // Data stream stop
vperm VPS0,VS0,VS1,VP3 // VPU Align SN-6 and SN-5 to DN-6
subfic D,DST,16 // IU1 How many bytes in first destination?
// cr1 was set near Get_bytes_rt from DST[28:31]; eq means DST is 16-byte
// aligned and the whole vector can be stored at once.
beq cr1,Lt_just // b if last destination is left justified
// Otherwise the low four bits of (16 - DST) go to cr7 and the pieces are
// stored smallest first (byte, halfword, word, two words), ascending from DST.
mtcrf 0x01,D // IU2 Put byte count remaining in cr7
li D,0 // IU1 Initialize index pointer
bns cr7,No_B_bkwd // b if only even number of bytes to store
stvebx VPS0,DST,D // LSU store first byte at DST+0
addi D,D,1 // IU1 increment index
No_B_bkwd:
bne cr7,No_H_bkwd // b if only words to store
stvehx VPS0,DST,D // LSU store halfword at DST+0/1
addi D,D,2 // IU1 increment index
No_H_bkwd:
bng cr7,No_W1_bkwd // b if exactly zero or two words to store
stvewx VPS0,DST,D // LSU store word 1 of one or three
addi D,D,4 // IU1 increment index
No_W1_bkwd:
bnl cr7,No_W2_bkwd // b if there was only one word to store
stvewx VPS0,DST,D // LSU store word 1 of two or 2 of three
addi D,D,4 // IU1 increment index
stvewx VPS0,DST,D // LSU store word 2 of two or 3 of three
b No_W2_bkwd
Lt_just:
stvx VPS0,0,DST // LSU Store 16 bytes at final dst addr D0
No_W2_bkwd:
#ifdef VRSAVE
mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
blr // Return destination address from entry
// ---------------------------------------------------------------------------
// GT_4QW -- backward-copy path of v_memmove for more than four full vectors.
// Entered from the "bgt cr6,GT_4QW" that follows QW_loop, with CTR holding the
// remaining vector count, BK the current byte index, DNX = DST+BK, and VS1
// the most recently loaded source vector.
//
// Phase 1 (GT_4QW): move single vectors until the next store would fall in
//   the even 16-byte half of a 32-byte line (DNX bit 27 clear) or CTR hits 0.
// Phase 2 (B32_bkwd): move two vectors per iteration, descending.  CTR is
//   decremented twice per iteration: once by the bdz and once by the bdnz.
// Phase 3 (One_odd_QW): move one extra vector when DST bit 27 is clear.
// Every exit goes to Last_load, which stores the first destination vector.
//
// NOTE(review): STRM_1 and DCBL are macros defined outside this section; the
// trailing comments describe their intent, but their expansions (including
// which registers they read) are not visible here.
// ---------------------------------------------------------------------------
GT_4QW: // Do once if next store is to even half of cache line, else twice
lvx VS0,SRC,BK // LSU Get SN-3 (or SN-2)
// mtcrf 0x02 copies bits 24-27 of DNX into cr6; CR bit 27 (the so bit of
// cr6) therefore mirrors address bit 27, i.e. the value-16 bit of DNX.
mtcrf 0x02,DNX // IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;
vperm VPS0,VS0,VS1,VP3 // VPU Align SN-3 and SN-2 to Dn-2
vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
addi DNX,DNX,-16 // IU1 Prepare to update cr6 next loop
stvx VPS0,DST,BK // LSU Store 16 bytes at DN-2
vor VS3,VS0,VS0 // VIU Make a copy of lower vector
addi BK,BK,-16 // IU1 Decrement byte count by 16
// bdnzt 27: decrement CTR, then loop only if CTR != 0 and CR bit 27 is set.
bdnzt 27,GT_4QW // b if next store is to upper (odd) half of CL
// At this point next store will be to even address.
lis STR,0x102 // IU1 Stream 2 blocks of 16 bytes
mtcrf 0x02,DST // IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)
addi BL,BK,-16 // IU1 Create an alternate byte count - 16
ori STR,STR,0xffe0 // IU1 Stream stride -32B
addi SP8,SRC,-64 // IU1 Starting address for data stream touch
bso cr6,B32_bkwd // b if DST[27] == 1; i.e, final store is odd
// The bdnz target is the very next instruction, so this branch serves only
// to decrement CTR by one when the bso above was not taken.
bdnz B32_bkwd // decrement counter for last odd QW store
B32_bkwd: // Should be at least 2 stores remaining and next 2 are cache aligned
lvx VS2,SRC,BK // LSU Get SN-4 (or SN-3)
addi SP8,SP8,-32 // IU1 Next starting address for data stream touch
lvx VS1,SRC,BL // LSU Get SN-5 (or SN-4)
vperm VPS0,VS2,VS3,VP3 // VPU Align SN-4 and SN-3 to DN-3
STRM_1 // LSU Stream 64 byte blocks ahead of loads
DCBL // LSU allocate next cache line
vperm VPS1,VS1,VS2,VP3 // VPU Align SN-5 and SN-4 to DN-4
vor VS3,VS1,VS1 // VIU1 Move SN-5 to SN-3
stvx VPS0,DST,BK // LSU Store 16 bytes at DN-3
addi BK,BL,-16 // IU1 Decrement byte count
// bdz to the next instruction: a pure CTR decrement for the first of the two
// stores of this iteration.
bdz Nxt_loc_bkwd // always decrement and branch to next instr
Nxt_loc_bkwd:
stvx VPS1,DST,BL // LSU Store 16 bytes at DN-4
addi BL,BK,-16 // IU1 Decrement alternate byte count
bdnz B32_bkwd // b if there are at least two more QWs to do
// cr6 still holds the DST bits loaded by "mtcrf 0x02,DST" above; no visible
// instruction after it writes cr6 (macro expansions not shown).
bns cr6,One_odd_QW // b if there was one more odd QW to store
b Last_load
// Come here with two more loads and two stores to do
// (one load/store pair is done here, the other at Last_load).
One_odd_QW:
lvx VS1,SRC,BK // LSU Get SN-6 (or SN-5)
vperm VPS1,VS1,VS3,VP3 // VPU Align SN-6 and SN-5 to DN-5
stvx VPS1,DST,BK // LSU Store 16 bytes at DN-5
b Last_load
// End of memmove in AltiVec
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
v_memcpy:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.
#ifdef VRSAVE
mfspr RSV,VRSV // IU2 Get current VRSAVE contents
#endif
rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
MC_entry: // enter here from memmove if DST-SRC>=BC; this should be faster
#ifdef VRSAVE
oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine
#endif
lis BLK,0x010c // IU1 Stream 12 blocks of 16 bytes
subf. S,S,D // IU1 if D-S<0 essentially shifting left
#ifdef VRSAVE
mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
#endif
lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
ori BLK,BLK,32 // IU1 Stream stride 32B
STRM_F // LSU Start data stream 0 at SRC
addi DR,DST,16 // IU1 Address of second dst vector
addi DBK,DBC,-1 // IU1 Address of last dst byte
// If D-S<0 we are "kinda" shifting left with the right shift permute vector
// loaded to VP3 and we need both S0 and S1 to permute. If D-S>=0 then the
// first loaded vector needs to be in the upper half of the permute pair and
// the lower half is a don't care then.
bge Ld_bytes_rt // b if shifting right (D-S>=0)
lvx VS0,0,SRC // LSU Get S0 load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once. For vec_memcpy, an example would
// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful)
Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is
lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector
rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
cmpi cr1,0,D,0 // IU1 Is D0 left justified?
subf Rt,DST,DR // IU1 How many bytes in first destination?
subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
li BK,0 // IU1 Initialize byte kount index
mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7
rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0
vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
beq cr1,Left_just // b if D0 is left justified
bns cr7,No_B_fwd // b if only even number of bytes to store
stvebx VPS0,DST,BK // LSU store first byte at DST+0
addi BK,BK,1 // IU1 increment index
No_B_fwd:
bne cr7,No_H_fwd // b if only words to store
// NOTE(review): this copy of the file ends here, part-way through v_memcpy
// (right after the No_B_fwd halfword test); the rest of the routine is missing.