; copymblock.asm
; ---------------------------------------------------------------------
; Module header: listing title, linker directives, symbol visibility and
; the function-table (.pdata) entry for CopyMBlock.
; ---------------------------------------------------------------------
; Listing title. The path names a C++ file (basic_prediction.cpp);
; presumably this file began as compiler output for it - TODO confirm.
TTL C:\Dokumente und Einstellungen\drops\Eigene Dateien\myprojects\test5\basic_prediction.cpp
; Linker-directive section: each string asks the linker to add a default library.
AREA |.drectve|, DRECTVE
DCB "-defaultlib:coredll.lib "
DCB "-defaultlib:corelibc.lib "
EXPORT |CopyMBlock| ; CopyMBlock
; NOTE(review): MessageBoxW is imported but never referenced in the visible
; part of this file - looks like a leftover; confirm before removing.
IMPORT |MessageBoxW|
; Function table: one entry = start address of CopyMBlock + one packed descriptor word.
AREA |.pdata|, PDATA
|$T224| DCD |CopyMBlock|
; NOTE(review): if this follows the Windows CE ARM .pdata layout (bits 0-7 prolog
; length, bits 8-29 function length in instructions, bit 30 = 32-bit code flag),
; 0x40000300 describes a 3-instruction function with no prolog, which does not
; match the hand-written body below - confirm and regenerate.
DCD 0x40000300
AREA |.text|, CODE
|CopyMBlock| PROC ; CopyMBlock
; File C:\Dokumente und Einstellungen\drops\Eigene Dateien\myprojects\test5\basic_prediction.cpp
; Line 2
|$M222|
;-----------------------------------------------------------------------
; CopyMBlock - strided block copy; the byte and halfword paths that
;              follow copy 16 rows of 16 bytes each.
; In:    r0 = source pointer
;        r1 = destination pointer
;        r2 = stride in bytes (added to r0 and r1 after every row)
; Uses:  r3-r6 data scratch, r7 row counter, r8 source preload pointer
; Saved: r4-r8 and lr are pushed here; every copy path ends by popping
;        r4-r8 and loading the saved lr into pc.
;
; This entry section saves registers, preloads the first two source
; rows, and then picks the widest access size that is safe for ALL
; rows of the block.
;-----------------------------------------------------------------------
        stmdb   sp!, {r4 - r8, lr}      ; stmfd

;; optimization assumptions:
;;   memory bandwidth is limited
;;   advance loads as much as possible
;;   minimize the number of stores (and take advantage of store
;;   coalescing on XScale and SA1110)
;; first, preload the first two source rows
;; 0xf5dRf000 is "pld [rR]" on ARMv5 and a no-op on ARMv4, where the
;; 0xF condition field means "never execute"
        & 0xf5d0f000                    ; pld [r0]      row 1, first byte
        & 0xf5d0f00f                    ; pld [r0,#15]  row 1, last byte
        add     r8, r2, r0              ; r8 = address of source row 2
        & 0xf5d8f000                    ; pld [r8]      row 2, first byte
        & 0xf5d8f00f                    ; pld [r8,#15]  row 2, last byte
        add     r8, r2, r8              ; r8 = address of source row 3 (next preload)

; Alignment dispatch.
; Every row address is pointer + k*stride, so the stride has to share the
; alignment of the pointers; testing only r0 and r1 would let rows 2..16
; be read or written with misaligned ldr/ldrh when the stride is not a
; multiple of the access size.  OR-ing the three values together lets one
; test cover all of them.  r3 is dead afterwards: each path reloads it.
        orr     r3, r0, r1              ; r3 = src | dst
        orr     r3, r3, r2              ; r3 = src | dst | stride
        tst     r3, #3                  ; all three divisible by four?
        beq     copy_full_words         ;   yes -> word copies
        tst     r3, #1                  ; all three divisible by two?
        beq     copy_half_words         ;   yes -> halfword copies
        ; otherwise fall through into copy_single_bytes
copy_single_bytes
; ---------------------------------------------------------------------
; Byte path: copies 16 rows of 16 bytes using ldrb/strb only, so it is
; the fallback when the alignment checks before this label do not allow
; halfword or word accesses.
;   r0 = current source row        r1 = current destination row
;   r2 = stride in bytes           r3-r6 = data scratch
;   r7 = rows left in the loop     r8 = source address to preload next
; On entry r8 already points two rows ahead of r0 (set up before the
; alignment checks).  Rows 1-14 run in cbytes_loop; rows 15 and 16 are
; unrolled after it without a preload.
; ---------------------------------------------------------------------
; ======== loop 1 ========
; r7 = number of looped rows (14)
mov r7,#14
cbytes_loop
& 0xf5d8f000 ; pld [r8]      first byte of the source row two rows ahead
& 0xf5d8f00f ; pld [r8,#15]  last byte of that same row
add r8,r2,r8 ; advance the preload pointer by one stride
; copy bytes 0-3: the four loads are issued before the four stores
ldrb r3, [r0]
ldrb r4, [r0,#1]
ldrb r5, [r0,#2]
ldrb r6, [r0,#3]
strb r3, [r1]
strb r4, [r1,#1]
strb r5, [r1,#2]
strb r6, [r1,#3]
; copy bytes 4-7
ldrb r3, [r0,#4]
ldrb r4, [r0,#5]
ldrb r5, [r0,#6]
ldrb r6, [r0,#7]
strb r3, [r1,#4]
strb r4, [r1,#5]
strb r5, [r1,#6]
strb r6, [r1,#7]
; copy bytes 8-11
ldrb r3, [r0,#8]
ldrb r4, [r0,#9]
ldrb r5, [r0,#10]
ldrb r6, [r0,#11]
strb r3, [r1,#8]
strb r4, [r1,#9]
strb r5, [r1,#10]
strb r6, [r1,#11]
; copy bytes 12-15
ldrb r3, [r0,#12]
ldrb r4, [r0,#13]
ldrb r5, [r0,#14]
ldrb r6, [r0,#15]
strb r3, [r1,#12]
strb r4, [r1,#13]
strb r5, [r1,#14]
strb r6, [r1,#15]
; step both pointers to the next row, then count the row down
add r0, r0, r2
add r1, r1, r2
subs r7,r7,#1
bne cbytes_loop
;;; the loop above handled the first 14 rows (with preload);
;;; the last two rows are unrolled below (no preload)
; ======== loop 15 ========
;; no preload: the loop's last iteration already preloaded row 16
; copy bytes 0-3
ldrb r3, [r0]
ldrb r4, [r0,#1]
ldrb r5, [r0,#2]
ldrb r6, [r0,#3]
strb r3, [r1]
strb r4, [r1,#1]
strb r5, [r1,#2]
strb r6, [r1,#3]
; copy bytes 4-7
ldrb r3, [r0,#4]
ldrb r4, [r0,#5]
ldrb r5, [r0,#6]
ldrb r6, [r0,#7]
strb r3, [r1,#4]
strb r4, [r1,#5]
strb r5, [r1,#6]
strb r6, [r1,#7]
; copy bytes 8-11
ldrb r3, [r0,#8]
ldrb r4, [r0,#9]
ldrb r5, [r0,#10]
ldrb r6, [r0,#11]
strb r3, [r1,#8]
strb r4, [r1,#9]
strb r5, [r1,#10]
strb r6, [r1,#11]
; copy bytes 12-15
ldrb r3, [r0,#12]
ldrb r4, [r0,#13]
ldrb r5, [r0,#14]
ldrb r6, [r0,#15]
strb r3, [r1,#12]
strb r4, [r1,#13]
strb r5, [r1,#14]
strb r6, [r1,#15]
; step both pointers to the last row
add r0, r0, r2
add r1, r1, r2
; ======== loop 16 ========
;; no preload: there is no further row to fetch
; copy bytes 0-3
ldrb r3, [r0]
ldrb r4, [r0,#1]
ldrb r5, [r0,#2]
ldrb r6, [r0,#3]
strb r3, [r1]
strb r4, [r1,#1]
strb r5, [r1,#2]
strb r6, [r1,#3]
; copy bytes 4-7
ldrb r3, [r0,#4]
ldrb r4, [r0,#5]
ldrb r5, [r0,#6]
ldrb r6, [r0,#7]
strb r3, [r1,#4]
strb r4, [r1,#5]
strb r5, [r1,#6]
strb r6, [r1,#7]
; copy bytes 8-11
ldrb r3, [r0,#8]
ldrb r4, [r0,#9]
ldrb r5, [r0,#10]
ldrb r6, [r0,#11]
strb r3, [r1,#8]
strb r4, [r1,#9]
strb r5, [r1,#10]
strb r6, [r1,#11]
; copy bytes 12-15
ldrb r3, [r0,#12]
ldrb r4, [r0,#13]
ldrb r5, [r0,#14]
ldrb r6, [r0,#15]
strb r3, [r1,#12]
strb r4, [r1,#13]
strb r5, [r1,#14]
strb r6, [r1,#15]
; return: restore r4-r8 and load the saved lr straight into pc
ldmia sp!, {r4 - r8, pc} ; ldmfd
copy_half_words
; ---------------------------------------------------------------------
; Halfword path: copies 16 rows of 16 bytes as eight ldrh/strh pairs per
; row.  Reached when the alignment checks before this label allow
; 2-byte accesses but not 4-byte ones.
;   r0 = current source row        r1 = current destination row
;   r2 = stride in bytes           r3-r6 = data scratch
;   r7 = rows left in the loop     r8 = source address to preload next
; On entry r8 already points two rows ahead of r0 (set up before the
; alignment checks).  Rows 1-14 run in chalf_loop; rows 15 and 16 are
; unrolled after it without a preload.
; ---------------------------------------------------------------------
; ======== loop 1 ========
; r7 = number of looped rows (14)
mov r7,#14
chalf_loop
& 0xf5d8f000 ; pld [r8]      first byte of the source row two rows ahead
& 0xf5d8f00f ; pld [r8,#15]  last byte of that same row
add r8,r2,r8 ; advance the preload pointer by one stride
; copy bytes 0-7 (words 1-2): four loads are issued before the four stores
ldrh r3, [r0]
ldrh r4, [r0,#2]
ldrh r5, [r0,#4]
ldrh r6, [r0,#6]
strh r3, [r1]
strh r4, [r1,#2]
strh r5, [r1,#4]
strh r6, [r1,#6]
; copy bytes 8-15 (words 3-4)
ldrh r3, [r0,#8]
ldrh r4, [r0,#10]
ldrh r5, [r0,#12]
ldrh r6, [r0,#14]
strh r3, [r1,#8]
strh r4, [r1,#10]
strh r5, [r1,#12]
strh r6, [r1,#14]
; step both pointers to the next row, then count the row down
add r0, r0, r2
add r1, r1, r2
subs r7,r7,#1
bne chalf_loop
;;; the loop above handled the first 14 rows (with preload);
;;; the last two rows are unrolled below (no preload)
; ======== loop 15 ========
;; no preload: the loop's last iteration already preloaded row 16
; copy bytes 0-7 (words 1-2)
ldrh r3, [r0]
ldrh r4, [r0,#2]
ldrh r5, [r0,#4]
ldrh r6, [r0,#6]
strh r3, [r1]
strh r4, [r1,#2]
strh r5, [r1,#4]
strh r6, [r1,#6]
; copy bytes 8-15 (words 3-4)
ldrh r3, [r0,#8]
ldrh r4, [r0,#10]
ldrh r5, [r0,#12]
ldrh r6, [r0,#14]
strh r3, [r1,#8]
strh r4, [r1,#10]
strh r5, [r1,#12]
strh r6, [r1,#14]
; step both pointers to the last row
add r0, r0, r2
add r1, r1, r2
; ======== loop 16 ========
;; no preload: there is no further row to fetch
; copy bytes 0-7 (words 1-2)
ldrh r3, [r0]
ldrh r4, [r0,#2]
ldrh r5, [r0,#4]
ldrh r6, [r0,#6]
strh r3, [r1]
strh r4, [r1,#2]
strh r5, [r1,#4]
strh r6, [r1,#6]
; copy bytes 8-15 (words 3-4)
ldrh r3, [r0,#8]
ldrh r4, [r0,#10]
ldrh r5, [r0,#12]
ldrh r6, [r0,#14]
strh r3, [r1,#8]
strh r4, [r1,#10]
strh r5, [r1,#12]
strh r6, [r1,#14]
; return: restore r4-r8 and load the saved lr straight into pc
ldmia sp!, {r4 - r8, pc} ; ldmfd
;----- copy full words -----
copy_full_words
;; registers:
;; r0 src ptr
;; r1 dst ptr
;; r2 stride
;;
;; r3-r8 available
;; set up r8 to contain a src preload address
; ======== loop 1 ========
; copy 4 words
;; load the words
ldr r3, [r0]
ldr r4, [r0,#4]
ldr r5, [r0,#8]
ldr r6, [r0,#12]
;; preload the n+2 stride
& 0xf5d8f000 ; pld[r8] loop n+2 src beginning
& 0xf5d8f00f ; pld[r8,#f] loop n+2 src end
add r8,r2,r8 ; r8 has pointer to next src line
; add stride to src (for loop n+1)
add r0, r0, r2
str r3, [r1]
str r4, [r1,#4]
str r5, [r1,#8]
str r6, [r1,#12]
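; NOTE(review): the visible listing stops here, part-way through copy_full_words;
; the remainder of that path and the closing ENDP / END are not shown.
; (The web code-viewer footer that followed this point was not part of the source.)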