⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 memcpy2.s

📁 memcpy函数优化代码,使用汇编实现,可提高memcpy的实现性能.
💻 S
字号:
/*
 * memcpy2 -- memory copy for 32-bit ARM.
 *
 * Syntax: GNU as, ARM state, pre-UAL ("divided") mnemonics such as ldrneb
 * and ldmhsia.  The file must be run through the C preprocessor because of
 * the #ifdef block that selects the section directive.
 *
 *   void *memcpy2(void *dst, const void *src, size_t len)
 *
 * In:    r0 = dst, r1 = src, r2 = len (bytes)
 * Out:   r0 = the original dst
 * Clobb: r1, r2, r3, condition flags (r4-r8 are saved and restored)
 * Stack: 8 bytes on the byte-copy path, 28 bytes on the word-copy path
 *
 * Bytes are copied at ascending addresses.
 *
 * Strategy:
 *   len == 0                   -> return immediately
 *   len <= 8, or dst and src
 *   differ in their low 2 bits -> plain byte loop (different_aligns)
 *   otherwise                  -> 1-3 head bytes up to a word boundary,
 *                                 then 64-byte loop iterations entered
 *                                 part-way for the (len & 48) remainder,
 *                                 then 0-3 words, then 0-3 tail bytes
 *
 * Register roles on the word-copy path:
 *   r3     scratch: alignment / remainder masks
 *   r4     head-byte transfer register, then 64-byte iteration counter
 *   r5-r8  transfer registers for ldm/stm
 */

#ifndef CPU_ARM
#define CPU_ARM
#endif

#ifdef CPU_ARM
    .section .icode,"ax",%progbits
#else
    .section .icode,"ax",@progbits
#endif
    .align 2

/*
 * Byte-by-byte copy.  Entered only from memcpy2, always with r2 >= 1,
 * so the decrement-then-test loop below terminates.
 */
different_aligns:
    stmfd   r13!, {r0, lr}          @ keep the original dst for the return value
byte_loop:
    ldrb    r3, [r1], #1
    strb    r3, [r0], #1
    subs    r2, r2, #1
    bne     byte_loop
    ldmfd   r13!, {r0, pc}          @ r0 = original dst; loading pc returns

    .global memcpy2
    .type   memcpy2, %function
memcpy2:
    cmp     r2, #0
    moveq   pc, lr                  @ nothing to copy; r0 is still dst

    cmp     r2, #8
    bls     different_aligns        @ short copies use the byte loop

    /* The word path needs dst and src to share their low two address bits. */
    eor     r3, r0, r1              @ r3 = dst ^ src
    tst     r3, #3
    bne     different_aligns        @ alignments differ: byte loop

    stmfd   r13!, {r0, r4-r8, lr}   @ save return value and work registers
    ands    r3, r0, #3              @ r3 = common misalignment (0..3)
    beq     multi                   @ already word aligned

    /*
     * Copy 4 - r3 head bytes to reach a word boundary:
     *   r3 == 1 -> 3 bytes (NE + LS + LS)
     *   r3 == 2 -> 2 bytes (     LS + LS)
     *   r3 == 3 -> 1 byte  (NE          )
     * The flags set by the cmp stay live across the conditional loads and
     * stores, none of which update them.
     */
    cmp     r3, #2
    ldrneb  r4, [r1], #1            @ r3 is 1 or 3
    strneb  r4, [r0], #1
    ldrlsb  r4, [r1], #1            @ r3 is 1 or 2
    strlsb  r4, [r0], #1
    ldrlsb  r4, [r1], #1            @ r3 is 1 or 2
    strlsb  r4, [r0], #1

    sub     r3, r3, #4              @ r3 = -(head bytes copied)
    add     r2, r2, r3              @ len -= head bytes copied

multi:
    /*
     * r0 and r1 are word aligned here.  Bit fields of the remaining length:
     *   bits 6 and up : full 64-byte loop iterations
     *   bits 5..4     : extra 16-byte groups (0..3)
     *   bits 3..2     : extra words (0..3)
     *   bits 1..0     : extra bytes (0..3)
     */
    ands    r3, r2, #48             @ r3 = extra 16-byte groups << 4; Z if none
    mov     r4, r2, LSR #6          @ r4 = full iterations (mov leaves flags alone)

    /* Like Duff's device: enter the unrolled loop part-way through. */
    beq     loop_test               @ no partial iteration
    cmp     r3, #32
    bhi     loop3                   @ 48 -> three groups
    beq     loop2                   @ 32 -> two groups
    blo     loop1                   @ 16 -> one group

loop:
    ldmia   r1!, {r5-r8}            @ 16 bytes
    stmia   r0!, {r5-r8}
loop3:
    ldmia   r1!, {r5-r8}
    stmia   r0!, {r5-r8}
loop2:
    ldmia   r1!, {r5-r8}
    stmia   r0!, {r5-r8}
loop1:
    ldmia   r1!, {r5-r8}
    stmia   r0!, {r5-r8}

loop_test:
    cmp     r4, #0
    subne   r4, r4, #1
    bne     loop

    /* Remaining whole words: 0..3. */
    ands    r3, r2, #12             @ r3 = extra words << 2
    beq     extra_bytes
    cmp     r3, #8
    ldrne   r5, [r1], #4            @ r3 is 4 or 12: one word
    strne   r5, [r0], #4
    ldmhsia r1!, {r5-r6}            @ r3 is 8 or 12: two words
    stmhsia r0!, {r5-r6}

    /*
     * Remaining bytes: 0..3.  r3 must be recomputed as len & 3 here; it
     * still holds len & 12 from the word step, and the byte selection below
     * depends on r3 being exactly 1, 2 or 3.
     */
extra_bytes:
    ands    r3, r2, #3              @ r3 = tail byte count
    beq     clean_up
    cmp     r3, #2
    ldrneb  r5, [r1], #1            @ r3 is 1 or 3: one byte
    strneb  r5, [r0], #1
    ldrhsb  r5, [r1], #1            @ r3 is 2 or 3: two bytes
    strhsb  r5, [r0], #1
    ldrhsb  r5, [r1], #1
    strhsb  r5, [r0], #1

clean_up:
    ldmfd   r13!, {r0, r4-r8, pc}   @ r0 = original dst; loading pc returns
    .size   memcpy2, .-memcpy2

    .align 2

@ Local Variables:
@ asm-comment-char: ?@
@ comment-start: "@ "
@ block-comment-start: "/*"
@ block-comment-end: "*/"
@ indent-tabs-mode: t
@ End:

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -