📄 memcpy.s
字号:
r5 == unadjusted len r9 == adjusted Word length r10 == src alignment (1-3) r12 == adjusted src, not aligned r31 == adjusted len First we need to copy words up to but not crossing the next 32-byte boundary. Then perform aligned loads just before and just after the boundary and use shifts and or to generate the next aligned word for dst. If more than 32 bytes remain we copy (unaligned src) the next 7 words and repeat the loop until less than 32 bytes remain. Then if more than 4 bytes remain we again use aligned loads, shifts and or to generate the next dst word. We then process the remaining words using unaligned loads as needed. Finally we check if there are more than 0 bytes (1-3 bytes) remaining and use halfword and/or byte load/stores to complete the copy.*/ mr 4,12 /* restore unaligned adjusted src ptr */ clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */ slwi 10,10,3 /* calculate number of bits to shift 1st word left */ cmplwi cr5,0,16 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */ mtcrf 0x01,8 cmplwi cr1,10,16 subfic 9,10,32 /* number of bits to shift 2nd word right *//* This test is reversed because the timing to compare the bytes to 32-byte boundary could not be met. So we compare the bytes from previous 32-byte boundary and invert the test. */ bge cr5,L(wdu_h32_8) .align 4 lwz 6,0(4) lwz 7,4(4) addi 12,4,16 /* generate alternate pointers to avoid agen */ addi 11,3,16 /* timing issues downstream. 
*/ stw 6,0(3) stw 7,4(3) subi 31,31,16 lwz 6,8(4) lwz 7,12(4) addi 4,4,16 stw 6,8(3) stw 7,12(3) addi 3,3,16 bf 28,L(wdu_h32_4) lwz 6,0(12) lwz 7,4(12) subi 31,31,8 addi 4,4,8 stw 6,0(11) stw 7,4(11) addi 3,3,8 bf 29,L(wdu_h32_0) lwz 6,8(12) addi 4,4,4 subi 31,31,4 stw 6,8(11) addi 3,3,4 b L(wdu_h32_0) .align 4L(wdu_h32_8): bf 28,L(wdu_h32_4) lwz 6,0(4) lwz 7,4(4) subi 31,31,8 bf 29,L(wdu_h32_8x) stw 6,0(3) stw 7,4(3) lwz 6,8(4) addi 4,4,12 subi 31,31,4 stw 6,8(3) addi 3,3,12 b L(wdu_h32_0) .align 4L(wdu_h32_8x): addi 4,4,8 stw 6,0(3) stw 7,4(3) addi 3,3,8 b L(wdu_h32_0) .align 4L(wdu_h32_4): bf 29,L(wdu_h32_0) lwz 6,0(4) subi 31,31,4 addi 4,4,4 stw 6,0(3) addi 3,3,4 .align 4L(wdu_h32_0):/* set up for 32-byte boundry crossing word move and possibly 32-byte move loop. */ clrrwi 12,4,2 cmplwi cr5,31,32 bge cr1,L(wdu2_32)#if 0 b L(wdu1_32)/* cmplwi cr1,10,8 beq cr1,L(wdu1_32) cmplwi cr1,10,16 beq cr1,L(wdu2_32) cmplwi cr1,10,24 beq cr1,L(wdu3_32)*/L(wdu_32): lwz 6,0(12) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ slw 0,6,10 clrlwi 31,31,27 /* The remaining bytes, < 32. */ blt cr5,L(wdu_32tail) mtctr 8 cmplwi cr6,31,4 .align 4L(wdu_loop32): /* copy 32 bytes at a time */ lwz 8,4(12) addi 12,12,32 lwz 7,4(4) srw 8,8,9 or 0,0,8 stw 0,0(3) stw 7,4(3) lwz 6,8(4) lwz 7,12(4) stw 6,8(3) stw 7,12(3) lwz 6,16(4) lwz 7,20(4) stw 6,16(3) stw 7,20(3) lwz 6,24(4) lwz 7,28(4) lwz 8,0(12) addi 4,4,32 stw 6,24(3) stw 7,28(3) addi 3,3,32 slw 0,8,10 bdnz+ L(wdu_loop32)L(wdu_32tail): mtcrf 0x01,31 cmplwi cr5,31,16 blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,4(12) srw 8,8,9 or 6,0,8 b L(wdu_32tailx)#endif .align 4L(wdu1_32): lwz 6,-1(4) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ slwi 6,6,8 clrlwi 31,31,27 /* The remaining bytes, < 32. 
*/ blt cr5,L(wdu1_32tail) mtctr 8 cmplwi cr6,31,4 lwz 8,3(4) lwz 7,4(4)/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,8,(32-8),31 b L(wdu1_loop32x) .align 4L(wdu1_loop32): /* copy 32 bytes at a time */ lwz 8,3(4) lwz 7,4(4) stw 10,-8(3) stw 11,-4(3)/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,8,(32-8),31L(wdu1_loop32x): lwz 10,8(4) lwz 11,12(4) stw 6,0(3) stw 7,4(3) lwz 6,16(4) lwz 7,20(4) stw 10,8(3) stw 11,12(3) lwz 10,24(4) lwz 11,28(4) lwz 8,32-1(4) addi 4,4,32 stw 6,16(3) stw 7,20(3) addi 3,3,32 slwi 6,8,8 bdnz+ L(wdu1_loop32) stw 10,-8(3) stw 11,-4(3)L(wdu1_32tail): mtcrf 0x01,31 cmplwi cr5,31,16 blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,3(4)/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ rlwimi 6,8,8,(32-8),31 b L(wdu_32tailx)L(wdu2_32): bgt cr1,L(wdu3_32) lwz 6,-2(4) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ slwi 6,6,16 clrlwi 31,31,27 /* The remaining bytes, < 32. */ blt cr5,L(wdu2_32tail) mtctr 8 cmplwi cr6,31,4 lwz 8,2(4) lwz 7,4(4)/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,16,(32-16),31 b L(wdu2_loop32x) .align 4L(wdu2_loop32): /* copy 32 bytes at a time */ lwz 8,2(4) lwz 7,4(4) stw 10,-8(3) stw 11,-4(3)/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,16,(32-16),31L(wdu2_loop32x): lwz 10,8(4) lwz 11,12(4) stw 6,0(3) stw 7,4(3) lwz 6,16(4) lwz 7,20(4) stw 10,8(3) stw 11,12(3) lwz 10,24(4) lwz 11,28(4)/* lwz 8,0(12) */ lwz 8,32-2(4) addi 4,4,32 stw 6,16(3) stw 7,20(3) addi 3,3,32 slwi 6,8,16 bdnz+ L(wdu2_loop32) stw 10,-8(3) stw 11,-4(3)L(wdu2_32tail): mtcrf 0x01,31 cmplwi cr5,31,16 blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,2(4)/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ rlwimi 6,8,16,(32-16),31 b L(wdu_32tailx)L(wdu3_32):/* lwz 6,0(12) */ lwz 6,-3(4) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ slwi 6,6,24 clrlwi 31,31,27 /* The remaining bytes, < 32. 
*/ blt cr5,L(wdu3_32tail) mtctr 8 cmplwi cr6,31,4 lwz 8,1(4) lwz 7,4(4)/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,24,(32-24),31 b L(wdu3_loop32x) .align 4L(wdu3_loop32): /* copy 32 bytes at a time */ lwz 8,1(4) lwz 7,4(4) stw 10,-8(3) stw 11,-4(3)/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,24,(32-24),31L(wdu3_loop32x): lwz 10,8(4) lwz 11,12(4) stw 6,0(3) stw 7,4(3) lwz 6,16(4) lwz 7,20(4) stw 10,8(3) stw 11,12(3) lwz 10,24(4) lwz 11,28(4) lwz 8,32-3(4) addi 4,4,32 stw 6,16(3) stw 7,20(3) addi 3,3,32 slwi 6,8,24 bdnz+ L(wdu3_loop32) stw 10,-8(3) stw 11,-4(3)L(wdu3_32tail): mtcrf 0x01,31 cmplwi cr5,31,16 blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,1(4)/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ rlwimi 6,8,24,(32-24),31 b L(wdu_32tailx) .align 4L(wdu_32tailx): blt cr5,L(wdu_t32_8) lwz 7,4(4) addi 12,4,16 /* generate alternate pointers to avoid agen */ addi 11,3,16 /* timing issues downstream. */ stw 6,0(3) stw 7,4(3) subi 31,31,16 lwz 6,8(4) lwz 7,12(4) addi 4,4,16 stw 6,8(3) stw 7,12(3) addi 3,3,16 bf 28,L(wdu_t32_4x) lwz 6,0(12) lwz 7,4(12) addi 4,4,8 subi 31,31,8 stw 6,0(11) stw 7,4(11) addi 3,3,8 bf 29,L(wdu_t32_0) lwz 6,8(12) addi 4,4,4 subi 31,31,4 stw 6,8(11) addi 3,3,4 b L(wdu_t32_0) .align 4L(wdu_t32_4x): bf 29,L(wdu_t32_0) lwz 6,0(4) addi 4,4,4 subi 31,31,4 stw 6,0(3) addi 3,3,4 b L(wdu_t32_0) .align 4L(wdu_t32_8): bf 28,L(wdu_t32_4) lwz 7,4(4) subi 31,31,8 bf 29,L(wdu_t32_8x) stw 6,0(3) stw 7,4(3) lwz 6,8(4) subi 31,31,4 addi 4,4,12 stw 6,8(3) addi 3,3,12 b L(wdu_t32_0) .align 4L(wdu_t32_8x): addi 4,4,8 stw 6,0(3) stw 7,4(3) addi 3,3,8 b L(wdu_t32_0) .align 4L(wdu_t32_4): subi 31,31,4 stw 6,0(3) addi 4,4,4 addi 3,3,4 .align 4L(wdu_t32_0):L(wdu_4tail): cmplwi cr6,31,0 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! 
*/ bf 30,L(wdus_3) lhz 7,0(4) sth 7,0(3) bf 31,L(wdus_0) lbz 8,2(4) stb 8,2(3) mr 3,30 lwz 30,20(1) lwz 31,24(1) addi 1,1,32 blr .align 4L(wdus_3): bf 31,L(wus_0) lbz 6,0(4) stb 6,0(3) .align 4L(wdus_0): /* Return original dst pointer. */ mr 3,30 lwz 30,20(1) lwz 31,24(1) addi 1,1,32 blrEND (BP_SYM (memcpy))libc_hidden_builtin_def (memcpy)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -