📄 vec_memcpy.s
字号:
stvehx VPS0,DST,BK // LSU store halfword at DST+0/1
addi BK,BK,2 // IU1 increment index
No_H_fwd:
bng cr7,No_W1_fwd // b if exactly zero or two words to store
stvewx VPS0,DST,BK // LSU store word 1 of one or three
addi BK,BK,4 // IU1 increment index
No_W1_fwd:
bnl cr7,No_W2_fwd // b if there was only one word to store
stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three
addi BK,BK,4 // IU1 increment index
stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three
b No_W2_fwd
Left_just:
stvx VPS0,0,DST // LSU Store 16 bytes at D0
No_W2_fwd:
rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
li BK,16 // IU1 Re-initialize byte kount index
cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
cmpi cr7,0,QW,14 // IU1 Check QW>14
ble cr6,Last_ld_fwd // b if no Quad words to do
mtctr QW // IU2 for (i=0;i<=QW;i++)
cmpi cr6,0,QW,4 // IU1 Check QW>4
QW_fwd_loop:
lvx VS1,SRC,BK // LSU Get S2 (or S1)
vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1
vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4)
addi BK,BK,16 // IU1 Increment byte kount index
bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do
add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
addi QW,QW,-1 // IU1 One more QW stored by now
bgt cr6,GT_4QW_fwd // b if >4 quad words left
Last_ld_fwd: // Next 16 bytes is the last; we're done.
add DBC,DST,BC // IU1 Recompute address of last dst byte + 1
add SBC,SRC,BC // IU1 Recompute address of last src byte + 1
bge No_ld_fwd // b if shifting right (D-S>=0)
addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
addi DBK,DBC,-1 // IU1 Recompute address of last dst byte
addi Rt,SBC,-1 // IU1 Recompute address of last src byte
// If D-S<0 we have already loaded all the source vectors.
// If D-S>=0 then the first loaded vector went to the upper half of the permute
// pair and we need one more vector. (This may be a duplicate.)
lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN)
#ifndef NO_DST
dss 0 // Data stream 0 stop
dss 1 // Data stream 1 stop
#endif
vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14
beq cr1,Rt_just_fwd // b if last destination is right justified
rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
li D,0 // IU1 Initialize index pointer
bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
stvewx VPS0,DBK,D // LSU store word 1 of two or three
addi D,D,4 // IU1 increment index
stvewx VPS0,DBK,D // LSU store word 2 of two or three
addi D,D,4 // IU1 increment index
Only_1W_fwd:
bng cr7,Only_2W_fwd // b if there were only two or zero words to store
stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
addi D,D,4 // IU1 increment index
Only_2W_fwd:
bne cr7,Only_B_fwd // b if there are no half words to store
stvehx VPS0,DBK,D // LSU store one halfword if necessary
addi D,D,2 // IU1 increment index
Only_B_fwd:
bns cr7,All_done_fwd // b if there are no bytes to store
stvebx VPS0,DBK,D // LSU store one byte if necessary
b All_done_fwd
Rt_just_fwd:
stvx VPS0,DST,BK // LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
blr // Return destination address from entry
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
lvx VS1,SRC,BK // LSU Get S3 (or S2)
addi QW,QW,-1 // IU1 Keeping track of QWs stored
mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
addi DNX,DNX,16 // IU1 Update cr6 for next loop
addi Rt,QW,-2 // IU1 Ensure at least 2 QW left after big loop
vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2
vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
stvx VPS0,DST,BK // LSU Store 16 bytes at D2
addi BK,BK,16 // IU1 Increment byte count by 16
bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
// At this point next store will be to even address.
mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
lis STR,0x104 // IU1 Stream 4 blocks of 16 bytes
addi BL,BK,16 // IU1 Create an alternate byte kount + 32
ori STR,STR,32 // IU1 Stream stride 32B
#ifndef NO_BIG_LOOP
rlwinm BIG,Rt,29,3,31 // IU1 QW/8 big loops to do
rlwinm Rt,Rt,0,0,28 // IU1 How many QWs will be done in big loop
bgt cr7,Big_loop // b if QW > 14
#endif
No_big_loop:
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
addi SP8,SRC,256 // IU1 Starting address for data stream touch
xoris STR,STR,0x6 // IU1 Reset stream to 2 blocks of 16 bytes
bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even
bdnz B32_fwd // decrement counter for last QW store odd
B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
lvx VS1,SRC,BK // LSU Get S12
addi SP8,SP8,32 // IU1 Next starting address for data stream touch
lvx VS2,SRC,BL // LSU Get S13
vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11
STRM_1 // LSU Stream 64 byte blocks ahead of loads
DCBK // LSU then Kill instead of RWITM
vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12
vor VS0,VS2,VS2 // VIU1 Move S13 to S11
stvx VPS1,DST,BK // LSU Store 16 bytes at D11
addi BK,BL,16 // IU1 Increment byte count
bdz Nxt_loc_fwd // always decrement and branch to next instr
Nxt_loc_fwd:
stvx VPS0,DST,BL // LSU Store 16 bytes at D12
addi BL,BK,16 // IU1 Increment alternate byte count
bdnz B32_fwd // b if there are at least two more QWs to do
bso cr6,One_even_QW // b if there is one even and one odd QW to store
b Last_ld_fwd // b if last store is to even address
// Come here with two more loads and two stores to do
One_even_QW:
lvx VS1,SRC,BK // LSU Get S14 (or S13 if D-S>=0)
vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13
vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
stvx VPS0,DST,BK // LSU Store 16 bytes at D13
addi BK,BK,16 // IU1 Increment byte count
b Last_ld_fwd
#ifdef __MWERKS__
.align 16
#else
.align 4
#endif
Big_loop:
subf QW,Rt,QW // IU1 Should be 2-7 QWs left after big loop
blt cr5,No_big_loop // b back if |DST-SRC|<128; Big_loop won't work.
mtctr BIG // IU2 loop for as many 128B loops as possible
addi SP8,SRC,256 // IU1 Starting address for data stream touch
Loop_of_128B: // Come here with QW>=10 and next store even; VS0 last load
lvx VS1,SRC,BK // LSU Get S4 (or S3 if D-S>=0)
addi BL,BK,32 // IU1 Increment Byte_Kount+16 by 32
addi SP8,SP8,128 // IU1 increment address for data stream touch
lvx VS3,SRC,BL // LSU Get S6 (or S5)
addi BL,BL,32 // IU1 Increment Byte_Kount+48 by 32
lvx VS5,SRC,BL // LSU Get S8 (or S7)
addi BL,BL,32 // IU1 Increment Byte_Kount+80 by 32
lvx VS7,SRC,BL // LSU Get S10 (or S9)
addi BL,BK,16 // IU1 Increment Byte_Kount+16 by 16
lvx VS2,SRC,BL // LSU Get S5 (or S4)
addi BL,BL,32 // IU1 Increment Byte_Kount+32 by 32
lvx VS4,SRC,BL // LSU Get S7 (or S6)
addi BL,BL,32 // IU1 Increment Byte_Kount+64 by 32
lvx VS6,SRC,BL // LSU Get S9 (or S8)
addi BL,BL,32 // IU1 Increment Byte_Kount+96 by 32
vperm VPS0,VS0,VS1,VP3 // VPU
lvx VS0,SRC,BL // LSU Get S11 (or S10)
vperm VPS1,VS1,VS2,VP3 // VPU
STRM_1 // LSU Stream 4 32B blocks, stride 32B
DCBK // LSU then Kill instead of RWITM
stvx VPS0,DST,BK // LSU Store D3
addi BK,BK,16 // IU1 Increment Byte_Kount+16 by 16
vperm VPS2,VS2,VS3,VP3 // VPU
stvx VPS1,DST,BK // LSU Store D4
addi BK,BK,16 // IU1 Increment Byte_Kount+32 by 16
vperm VPS3,VS3,VS4,VP3 // VPU
DCBK // LSU then Kill instead of RWITM
stvx VPS2,DST,BK // LSU Store D5
addi BK,BK,16 // IU1 Increment Byte_Kount+48 by 16
vperm VPS4,VS4,VS5,VP3 // VPU
stvx VPS3,DST,BK // LSU Store D6
addi BK,BK,16 // IU1 Increment Byte_Kount+64 by 16
vperm VPS5,VS5,VS6,VP3 // VPU
DCBK // LSU then Kill instead of RWITM
stvx VPS4,DST,BK // LSU Store D7
addi BK,BK,16 // IU1 Increment Byte_Kount+80 by 16
vperm VPS6,VS6,VS7,VP3 // VPU
stvx VPS5,DST,BK // LSU Store D8
addi BK,BK,16 // IU1 Increment Byte_Kount+96 by 16
vperm VPS7,VS7,VS0,VP3 // VPU
DCBK // LSU then Kill instead of RWITM
stvx VPS6,DST,BK // LSU Store D9
addi BK,BK,16 // IU1 Increment Byte_Kount+112 by 16
stvx VPS7,DST,BK // LSU Store D10
addi BK,BK,16 // IU1 Increment Byte_Kount+128 by 16
bdnz Loop_of_128B // b if ctr > 0 (QW/8 still > 0)
mtctr QW // IU1 Restore QW remaining to counter
addi BL,BK,16 // IU1 Create an alternate byte kount + 16
bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even
bdnz B32_fwd // b and decrement counter for last QW store odd
// One of the above branches should have been taken
// End of memcpy in AltiVec
// bcopy works like memcpy, but the source and destination operands are reversed.
// The code below just swaps the two address operands and branches to memcpy.
// bcopy entry point: exchanges the two address argument registers and then
// branches to the memcpy routine, which performs the actual copy.
//   In:   DST = source address as passed by the caller (first bcopy argument)
//         SRC = destination address as passed by the caller (second argument)
//   Out:  control transfers to memcpy/vec_memcpy with DST and SRC exchanged.
//         The transfer is a plain branch (b, not bl), so this code does not
//         regain control afterwards.
//   Clobbers: Rt (used as the temporary for the swap).
// The exported symbol is bcopy when LIBMOTOVEC is defined and vec_bcopy
// otherwise; the branch target is selected by the same macro.
// NOTE(review): no other register is written here, so any further argument
// (presumably the byte count) reaches memcpy unchanged - confirm which register.
// NOTE(review): BSD bcopy is specified to handle overlapping regions; confirm
// that the memcpy entry point branched to here copes with overlap.
#ifdef LIBMOTOVEC
.global bcopy
bcopy:
#else
.global vec_bcopy
vec_bcopy:
#endif
mr Rt,DST // Rt = incoming first argument, which is really the source address (r3)
mr DST,SRC // DST = incoming second argument, the real destination, as memcpy expects
mr SRC,Rt // SRC = saved source address; the swap is now complete
#ifdef LIBMOTOVEC
b memcpy // tail-branch to memcpy with dst/src now in r3 and r4
#else
b vec_memcpy // tail-branch to vec_memcpy with dst/src now in r3 and r4
#endif
// End of bcopy in AltiVec
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -