📄 vec_csum.s
Font size:
//------------------------------------------------------------------------
// Tail of vec_csum_partial_copy_generic (its entry point is earlier in
// this file, outside this chunk).  All register #defines used below
// (SRC, DST, BC, SUM, BK, BL, D, QW, Rt, DBK, DBC, DNX, SP8, and the
// vector names VS0..VS3, VPS0/VPS1, VP2/VP3/VP4, VM/VMM, VSUM, VCARF,
// VCAR1..3, VTEMP, VL, VH, VCARS, VFIN, V0, V1) and the DCBK macro are
// defined earlier in the file — TODO confirm against the full source.
// NOTE(review): recovered from a line-mangled copy; line breaks and
// comments were reconstructed, instruction order left untouched.
//------------------------------------------------------------------------
        bne     cr7,Only_B_fwd          // b if there are no half words to store
        stvehx  VPS0,DBK,D              // LSU store one halfword if necessary
        addi    D,D,2                   // IU1 increment index
Only_B_fwd:
        bns     cr7,All_done_fwd        // b if there are no bytes to store
        stvebx  VPS0,DBK,D              // LSU store one byte if necessary
        b       All_done_fwd
Rt_just_fwd:
        stvx    VPS0,DST,BK             // LSU Store 16 bytes at D14
All_done_fwd:
// Fold the two pending partial-sum vectors and all accumulated carries
// into a single 32-bit checksum, parked on the (quad-aligned) stack so
// it can be moved vector->GPR, then add the caller's partial SUM.
        vaddcuw VCAR1,VPS0,VPS1         // VIU1 add data and store carries
        vadduwm VTEMP,VPS0,VPS1         // VIU1 add data (no carries)
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        vmrglh  VL,V0,VSUM              // VPU separate low shorts of sum
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        vmrghh  VH,V0,VSUM              // VPU separate high shorts of sum
        rlwinm  DBK,SP,0,0,27           // IU1 Align stack pointer to QW
        vsumsws VCARS,VCARF,V0          // VIU2 sum all carries
        vadduwm VSUM,VL,VH              // VIU1 add low and high data
        li      BK,-16                  // IU1 Index 0x10 less than SP
        vsumsws VFIN,VSUM,VCARS         // VIU2 sum all data including carries
        // NOTE(review): scratch store below SP (red-zone style) — confirm
        // the target ABI permits stores below the stack pointer.
        stvx    VFIN,DBK,BK             // LSU Store partial checksum from VR
        lwz     SRC,-4(DBK)             // LSU Load partial checksum to GPR
        addc    SRC,SRC,SUM             // IU1 add caller's partial checksum
        addze   SRC,SRC                 // IU1 wrap end-around carry
        blr                             // Return checksum in r3
#ifdef __MWERKS__
        .align  16
#else
        .align  4
#endif
GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
        lvx     VS1,SRC,BK              // LSU Get S3 (or S2)
        addi    QW,QW,-1                // IU1 Keeping track of QWs stored
        vaddcuw VCAR1,VPS0,VSUM         // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VPS0,VSUM          // VIU1 data + previous sum (no carries)
        mtcrf   0x02,DNX                // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
        addi    DNX,DNX,16              // IU1 Update cr6 for next loop
        addi    Rt,QW,-2                // IU1 Insure at least 2 QW left after big loop
        vperm   VPS0,VS0,VS1,VP3        // VPU Align S2 and S3 to D2
        vor     VS0,VS1,VS1             // VIU1 Move upper vector to lower
        stvx    VPS0,DST,BK             // LSU Store 16 bytes at D2
        addi    BK,BK,16                // IU1 Increment byte count by 16
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        bdnzf   27,GT_4QW_fwd           // b if next store is to lower (even) half of CL
// At this point next store will be to even address.
        mtcrf   0x02,DBK                // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
        addi    SP8,SRC,96              // IU1 Starting address for dcbt
        addi    BL,BK,16                // IU1 Create an alternate byte kount + 32
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
        bns     cr6,B32_fwd             // b if DST[27] == 0; i.e, final store is even
        bdnz    B32_fwd                 // decrement counter for last QW store odd
B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
        lvx     VS1,SRC,BK              // LSU Get S4
        addi    SP8,SP8,32              // IU1 Next starting address for dcbt
        vaddcuw VCAR1,VPS0,VPS1         // VIU1 add data and store carries
        lvx     VS2,SRC,BL              // LSU Get S5
        vadduwm VTEMP,VPS0,VPS1         // VIU1 add data (no carries)
        dcbt    0,SP8                   // LSU Prefetch cache line 64 bytes ahead
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        DCBK                            // LSU Kill instead of RWITM
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vperm   VPS1,VS0,VS1,VP3        // VPU Align S11 and S12 to D11
        stvx    VPS1,DST,BK             // LSU Store 16 bytes at D11
        vperm   VPS0,VS1,VS2,VP3        // VPU Align S12 and S13 to D12
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        bdz     Nxt_loc_fwd             // always decrement and branch to next instr
Nxt_loc_fwd:
        stvx    VPS0,DST,BL             // LSU Store 16 bytes at D12
        vor     VS0,VS2,VS2             // VIU1 Move S13 to S11
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        addi    BK,BL,16                // IU1 Increment byte count
        addi    BL,BK,16                // IU1 Increment alternate byte count
        bdnz    B32_fwd                 // b if there are at least two more QWs to do
        bso     cr6,One_even_QW         // b if there is one even and one odd QW to store
        b       Last_ld_fwd             // b if last store is to even address
// Come here with two more loads and two stores to do
One_even_QW:
        lvx     VS1,SRC,BK              // LSU Get S6 (or S5 if D-S>=0)
        vaddcuw VCAR1,VPS0,VSUM         // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VPS0,VSUM          // VIU1 data + previous sum (no carries)
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        vperm   VPS0,VS0,VS1,VP3        // VPU Align S13 and S14 to D13
        vor     VS0,VS1,VS1             // VIU1 Move upper vector to lower
        stvx    VPS0,DST,BK             // LSU Store 16 bytes at D13
        addi    BK,BK,16                // IU1 Increment byte count
        b       Last_ld_fwd
// End of vec_csum_partial_copy_generic in AltiVec

//------------------------------------------------------------------------
// vec_csum_partial — AltiVec internet-checksum (no copy).
//
// Modified from above Register useage.
// In:   SRC = source address (register assigned earlier in this file —
//              presumably r3; TODO confirm)
//       BC  = r4, byte count
//       SUM = r5, incoming partial checksum
// Out:  32-bit partial checksum returned in the SRC register (r3).
// Uses: scalar path for BC <= MIN_VEC_CS; vector path otherwise.
// Don't use vectors for BC <= MIN_VEC_CS. Works only if MIN_VEC >= 32 bytes.
//------------------------------------------------------------------------
#define MIN_VEC_CS 48 // Chosen experimentally on MPC7455@1GHz/133MHz bus
#undef DST  // will not be using here
#undef BC
#define BC r4   // entering: Byte_Count
#undef SUM
#define SUM r5  // entering: Partial checksum
#if __MWERKS__
        .align  16
#else
        .align  4
#endif
#ifdef LIBMOTOVEC
// BUGFIX(review): was ".global csum_partial_vec" while the label defined
// below is "csum_partial" — the exported symbol was never defined.  Made
// the .global match the label, as in the #else branch.
        .global csum_partial
csum_partial:
#else
        .global vec_csum_partial
vec_csum_partial:
#endif
        li      BK,32                   // IU1
        rlwinm  Rt,BC,31,1,31           // IU1 BC/2
        cmpi    cr7,0,BC,MIN_VEC_CS     // IU1 Check for minimum byte count
        dcbt    SRC,BK                  // LSU prefetch next cacheline
        cmpi    cr6,0,Rt,0              // IU1 BC/2 == 0?
        addic   SUM,SUM,0               // IU1 Zero carry bit
        addi    SM2,SRC,-2              // IU1 Pre-bias and duplicate src
        add     DBC,SRC,BC              // IU1 Compute address of last src byte + 1
        bgt     cr7,v_csum              // b if BC>MIN_VEC_CS
// Scalar path: sum halfwords, then the trailing odd byte if any.
        andi.   BK,BC,1                 // IU1 BC[31]==0?
        beq     cr6,No_HWs_cs           // b if BC/2==0
        mtctr   Rt                      // i=BC/2; do ...;i--; while (i>0)
HW_cs:
        lhzu    Rt,2(SM2)               // LSU load next halfword
        addc    SUM,SUM,Rt              // IU1 accumulate with carry
        bdnz    HW_cs
No_HWs_cs:
        beq     BC_even_cs              // b if BC[31]==0 (or DBC[31]==0 when aligned)
        lbz     Rt,-1(DBC)              // LSU Get last src address byte
        rlwinm  Rt,Rt,8,16,23           // IU1 Shift odd byte left
        addc    SUM,SUM,Rt              // IU1
BC_even_cs:
        addze   SRC,SUM                 // IU1 fold final carry; result to r3
        blr
v_csum:
// Vector path: mask the unaligned head/tail via vperm-built select
// vectors, then accumulate 32-bit lanes with explicit carry capture.
        lvsr    VP2,0,SRC               // LSU Permute vector for initial byte mask
        addi    DR,SRC,16               // IU1 Address of second src vector
        li      BK,64                   // IU1 Index of next cache line
        lvsr    VP4,SRC,BC              // LSU Permute vector for final byte mask
        rlwinm  DR,DR,0,0,27            // IU1 (SRC+16)[0:27]
        addi    DBK,DBC,-1              // IU1 Address of last src byte
        lvx     VS0,0,SRC               // LSU Get S0 load started
        subf    QW,DR,DBK               // IU1 Bytes of full vectors to test (-16)
        vxor    V0,V0,V0                // VIU Clear v0
        dcbt    SRC,BK                  // LSU Prefetch next cache line at src+64
        rlwinm  QW,QW,28,4,31           // IU1 Quad words remaining
        vnor    V1,V0,V0                // VIU1 Create a vector of all ones
        mtctr   QW                      // IU2
        vxor    VCARF,VCARF,VCARF       // VIU1 clear VCARF
        vperm   VM,V1,V0,VP2            // VPU head select: zeroes left | src right
        cmpi    cr6,0,QW,4              // IU1 Check QW>4
        vxor    VSUM,VSUM,VSUM          // VIU1 Clear VSUM
        vperm   VMM,V0,V1,VP4           // VPU tail select: src left | zeroes right
        li      BK,16                   // IU1 Initialize byte kount index
        vsel    VS0,VS0,V0,VM           // VIU1 Select zeroes left | S0 bytes right
vp_fwd_loop:
        lvx     VS1,SRC,BK              // LSU Get S1
        vaddcuw VCAR1,VS0,VSUM          // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VS0,VSUM           // VIU1 data + previous sum (no carries)
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        addi    BK,BK,16                // IU1 Increment byte kount index
        vor     VS0,VS1,VS1             // VIU1 Swap vectors for next loop
        bdnzf   25,vp_fwd_loop          // b if 4 or less quad words to do
        add     DNX,SRC,BK              // IU1 address of next load (SRC+32 if QW>4)
        addi    QW,QW,-1                // IU1 One more QW summed by now
        bgt     cr6,GT_4QW_cs           // b if >4 quad words left
        vxor    VS1,VS1,VS1             // VIU1 Zero before adding below
// Next 16 bytes is the last; we're done.
Last_ld_cs:
        lvx     VS2,0,DBK               // LSU Get last source (guaranteed SN)
        vaddcuw VCAR1,VS0,VS1           // VIU1 add data and store carries
        rlwinm  DBK,DBK,0,28,31         // IU1 (src + BC -1)[28:31]
        vadduwm VTEMP,VS0,VS1           // VIU1 add data (no carries)
        cmpi    cr7,0,DBK,0xF           // IU1 Is last byte right justified?
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        beq     cr7,Rt_just             // b if right justified.
        vsel    VS2,VS2,V0,VMM          // VIU1 Select src bytes left | zeroes right
Rt_just:
        vaddcuw VCAR1,VS2,VSUM          // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VS2,VSUM           // VIU1 data + previous sum (no carries)
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
// Final fold: lanes -> shorts -> single sum, carries folded in, result
// staged on the quad-aligned stack for the vector->GPR move.
        vmrglh  VL,V0,VSUM              // VPU separate low shorts of sum
        vmrghh  VH,V0,VSUM              // VPU separate high shorts of sum
        rlwinm  DBK,SP,0,0,27           // IU1 Align stack pointer to QW
        vsumsws VCARS,VCARF,V0          // VIU2 sum all carries
        vadduwm VSUM,VL,VH              // VIU1 add low and high data
        li      BK,-16                  // IU1 Index 0x10 less than SP
        vsumsws VFIN,VSUM,VCARS         // VIU2 sum all data including carries
        stvx    VFIN,DBK,BK             // LSU Store partial checksum from VR
        lwz     SRC,-4(DBK)             // LSU Load partial checksum to GPR
        addc    SRC,SRC,SUM             // IU1 add caller's partial checksum
        addze   SRC,SRC                 // IU1 wrap end-around carry
        blr                             // Return checksum in r3
#ifdef __MWERKS__
        .align  16
#else
        .align  4
#endif
GT_4QW_cs: // Do once if nxt ld is from odd half of cache line, else twice
        lvx     VS1,SRC,BK              // LSU Get S3 (or S2)
        addi    QW,QW,-1                // IU1 Keeping track of QWs summed
        vaddcuw VCAR1,VS0,VSUM          // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VS0,VSUM           // VIU1 data + previous sum (no carries)
        mtcrf   0x02,DNX                // IU2 cr6[3]=((SRC+32)[27]==1)?1:0;
        addi    DNX,DNX,16              // IU1 Update cr6 for next loop
        addi    Rt,QW,-2                // IU1 Insure at least 2 QW left after big loop
        vor     VS0,VS1,VS1             // VIU1 Move upper vector to lower
        addi    BK,BK,16                // IU1 Increment byte count by 16
        vadduwm VCARF,VCAR1,VCARF       // VIU1 Update VCARF
        bdnzf   27,GT_4QW_cs            // b if next load is from lower (even) half of CL
// At this point next load will be from an even address.
        mtcrf   0x02,DBK                // IU2 cr6[3]=((last load)[27]==1)?1:0; (odd?)
        addi    SP8,SRC,96              // IU1 Starting address for dcbt
        vxor    VS1,VS1,VS1             // VIU1 Zero before adding below
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
        bns     cr6,B32_cs              // b if last load address[27]==0; i.e, final load is even
        bdnz    B32_cs                  // decrement counter for last QW load odd
B32_cs: // Should be at least 2 loads remaining and next 2 are cache aligned
        lvx     VS2,SRC,BK              // LSU Get S4
        addi    BK,BK,16                // IU1 Increment byte count by 16
        vaddcuw VCAR1,VS0,VS1           // VIU1 add data and store carries
        lvx     VS3,SRC,BK              // LSU Get S5
        addi    SP8,SP8,32              // IU1 Next starting address for dcbt
        vadduwm VTEMP,VS0,VS1           // VIU1 add data (no carries)
        dcbt    0,SP8                   // LSU Prefetch cache line 64 bytes ahead
        addi    BK,BK,16                // IU1 Increment byte count
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        bdz     Nxt_loc_cs              // always decrement and branch to next instr
Nxt_loc_cs:
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        vor     VS0,VS2,VS2             // VIU1 Move S4 down for next pass
        vor     VS1,VS3,VS3             // VIU1 Move S5 down for next pass
        bdnz    B32_cs                  // b if there are at least two more QWs to do
        bso     cr6,One_even_QW_cs      // b if there is one even and one odd QW left
        b       Last_ld_cs              // b if last load is from even address
// Come here with two more loads to do
One_even_QW_cs:
        lvx     VS2,SRC,BK              // LSU Get S6 (or S5 if D-S>=0)
        addi    BK,BK,16                // IU1 Increment byte count
        vaddcuw VCAR1,VS0,VS1           // VIU1 add data and store carries
        vadduwm VTEMP,VS0,VS1           // VIU1 add data (no carries)
        vaddcuw VCAR2,VTEMP,VSUM        // VIU1 data + previous sum ->store carries
        vadduwm VSUM,VTEMP,VSUM         // VIU1 data + previous sum
        vadduwm VCAR3,VCAR1,VCAR2       // VIU1 add carries from previous adds
        vadduwm VCARF,VCAR3,VCARF       // VIU1 update VCARF
        vxor    VS1,VS1,VS1             // VIU1 Zero before next add
        vor     VS0,VS2,VS2             // VIU1 Move S6 down for final pass
        b       Last_ld_cs
// End of vec_csum_partial in AltiVec
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -