; jrevdct_arm.asm
;-----------------------------------------------------------------------
; j_rev_dct_ARM -- in-place 8x8 fixed-point inverse DCT
;
; Syntax: ARM armasm (AREA / PROC / DCD), ARM (A32) instruction set.
;
; In:     r0 = pointer to 64 signed 16-bit coefficients laid out as
;              8 rows of 16 bytes.  Rows are also read with word loads,
;              so the buffer must be 4-byte aligned.
; Out:    the 64 halfwords are overwritten with the result; no value
;         is returned in a register.
; Clobb:  r0-r3, flags.  r4-r12 and lr are saved on entry and restored
;         on exit (lr is reused as the data pointer inside the body).
; Stack:  44 bytes for the whole call (10 saved registers + 1 word
;         holding the original r0) plus 16 bytes while the odd part of
;         a row/column is being computed; 60 bytes at peak.
;
; Structure:
;   1. Row pass (8 iterations, stride 16 bytes).  Each row is read as
;      d0,d2,d4,d6 from byte offsets 0,2,4,6 and d1,d3,d5,d7 from byte
;      offsets 8,10,12,14, and the results are descaled by
;      CONST_BITS-PASS1_BITS = 11 bits.
;      NOTE(review): this implies the caller supplies each row in that
;      even/odd-split order -- confirm against the caller.
;   2. Column pass (8 iterations, stride 2 bytes).  Column entries are
;      read in natural row order and descaled by
;      CONST_BITS+PASS1_BITS+3 = 18 bits.
;
; Every MUL/MLA below keeps Rd different from Rm, which pre-ARMv6
; cores require; keep the operand order when editing.
;-----------------------------------------------------------------------

        EXPORT  j_rev_dct_ARM

        AREA    |.text|, CODE

; Byte offsets of the entries of const_array (see the end of the
; procedure).  Each FIX_x entry is round(x * 2^13); FIX_M_x is the
; negated value.
FIX_0_298631336_ID      EQU     0
FIX_0_541196100_ID      EQU     4
FIX_0_765366865_ID      EQU     8
FIX_1_175875602_ID      EQU     12
FIX_1_501321110_ID      EQU     16
FIX_2_053119869_ID      EQU     20
FIX_3_072711026_ID      EQU     24
FIX_M_0_390180644_ID    EQU     28
FIX_M_0_899976223_ID    EQU     32
FIX_M_1_847759065_ID    EQU     36
FIX_M_1_961570560_ID    EQU     40
FIX_M_2_562915447_ID    EQU     44
MASK_LOW16_ID           EQU     48

        ALIGN

j_rev_dct_ARM PROC
        stmdb   sp!, { r4 - r12, lr }   ; all callee saved regs
        sub     sp, sp, #4              ; reserve one word on the stack
        str     r0, [ sp ]              ; keep the data pointer for the column pass

        mov     lr, r0                  ; lr  = pointer to the current row
        mov     r12, #8                 ; r12 = row counter
        adr     r11, const_array        ; r11 = base pointer to the constants array

;-----------------------------------------------------------------------
; Pass 1: rows
;-----------------------------------------------------------------------
row_loop
        ldrsh   r0, [lr, # 0]           ; r0 = 'd0'
        ldrsh   r2, [lr, # 2]           ; r2 = 'd2'

        ; Shortcuts for sparse rows.  The row is inspected with four
        ; word loads (hence the 4-byte alignment requirement).
        ldr     r5, [lr, # 0]           ; halfwords 0 and 1
        ldr     r6, [lr, # 4]           ; halfwords 2 and 3
        ldr     r3, [lr, # 8]           ; halfwords 4 and 5
        ldr     r4, [lr, #12]           ; halfwords 6 and 7
        orr     r3, r3, r4
        orr     r3, r3, r6              ; r3 = OR of halfwords 2..7
        orrs    r5, r3, r5              ; Z set when the whole row is zero
        beq     end_of_row_loop         ; nothing to be done as ALL of them are '0'

        ; Only d0 non-zero?  The halfword at offset 2 (d2) has to be part
        ; of this test: r3 alone does not cover it, and a row with d2 != 0
        ; must go through the full computation.  The result goes to r5,
        ; which is dead here, so that r2 keeps d2.
        orrs    r5, r3, r2              ; Z set when halfwords 1..7 are all zero
        beq     empty_row

        ldrsh   r1, [lr, # 8]           ; r1 = 'd1'
        ldrsh   r4, [lr, # 4]           ; r4 = 'd4'
        ldrsh   r6, [lr, # 6]           ; r6 = 'd6'

        ; Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r7, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r7, r3, r7              ; r7 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r7          ; r6 = tmp2
        add     r5, r0, r4              ; r5 = tmp0
        mla     r2, r3, r2, r7          ; r2 = tmp3
        sub     r3, r0, r4              ; r3 = tmp1

        add     r0, r2, r5, lsl #13     ; r0 = tmp10
        rsb     r2, r2, r5, lsl #13     ; r2 = tmp13
        add     r4, r6, r3, lsl #13     ; r4 = tmp11
        rsb     r3, r6, r3, lsl #13     ; r3 = tmp12

        stmdb   sp!, { r0, r2, r3, r4 } ; save on the stack tmp10, tmp13, tmp12, tmp11

        ; Odd part
        ldrsh   r3, [lr, #10]           ; r3 = 'd3'
        ldrsh   r5, [lr, #12]           ; r5 = 'd5'
        ldrsh   r7, [lr, #14]           ; r7 = 'd7'

        add     r0, r3, r5              ; r0 = 'z2'
        add     r2, r1, r7              ; r2 = 'z1'
        add     r4, r3, r7              ; r4 = 'z3'
        add     r6, r1, r5              ; r6 = 'z4'
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6              ; r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8              ; r8 = 'z5'
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2             ; r2 = 'z1'
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0              ; r0 = 'z2'
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8         ; r4 = 'z3'
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8          ; r6 = 'z4'
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2         ; r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0          ; r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0         ; r3 = tmp2 + z2
        add     r7, r7, r4              ; r7 = tmp0
        mla     r1, r9, r1, r2          ; r1 = tmp3 + z1
        add     r5, r5, r6              ; r5 = tmp1
        add     r3, r3, r4              ; r3 = tmp2
        add     r1, r1, r6              ; r1 = tmp3

        ldmia   sp!, { r0, r2, r4, r6 } ; r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
                                        ; r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        ; Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
        add     r8, r0, r1
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 0]

        ; Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
        sub     r8, r0, r1
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #14]

        ; Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
        add     r8, r6, r3
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 2]

        ; Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
        sub     r8, r6, r3
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #12]

        ; Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
        add     r8, r4, r5
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 4]

        ; Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
        sub     r8, r4, r5
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #10]

        ; Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
        add     r8, r2, r7
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 6]

        ; Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
        sub     r8, r2, r7
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 8]

        ; End of row loop (full computation path)
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop
        b       start_column_loop

empty_row
        ; Only d0 is non-zero: every output of the row is d0 << PASS1_BITS.
        ; The value is masked to 16 bits and duplicated into both halves
        ; of a word so the row can be written with four word stores.
        ldr     r1, [r11, #MASK_LOW16_ID]
        mov     r0, r0, lsl #2
        and     r0, r0, r1
        add     r0, r0, r0, lsl #16
        str     r0, [lr, # 0]
        str     r0, [lr, # 4]
        str     r0, [lr, # 8]
        str     r0, [lr, #12]

end_of_row_loop
        ; End of row loop (shortcut paths)
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop

;-----------------------------------------------------------------------
; Pass 2: columns
;-----------------------------------------------------------------------
start_column_loop
        ldr     lr, [ sp ]              ; lr  = pointer to the first column
        mov     r12, #8                 ; r12 = column counter
column_loop
        ldrsh   r0, [lr, #( 0*8)]       ; r0 = 'd0'
        ldrsh   r2, [lr, #( 4*8)]       ; r2 = 'd2'
        ldrsh   r4, [lr, #( 8*8)]       ; r4 = 'd4'
        ldrsh   r6, [lr, #(12*8)]       ; r6 = 'd6'

        ; Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r1, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r1, r3, r1              ; r1 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r1          ; r6 = tmp2
        add     r5, r0, r4              ; r5 = tmp0
        mla     r2, r3, r2, r1          ; r2 = tmp3
        sub     r3, r0, r4              ; r3 = tmp1

        add     r0, r2, r5, lsl #13     ; r0 = tmp10
        rsb     r2, r2, r5, lsl #13     ; r2 = tmp13
        add     r4, r6, r3, lsl #13     ; r4 = tmp11
        rsb     r6, r6, r3, lsl #13     ; r6 = tmp12

        ldrsh   r1, [lr, #( 2*8)]       ; r1 = 'd1'
        ldrsh   r3, [lr, #( 6*8)]       ; r3 = 'd3'
        ldrsh   r5, [lr, #(10*8)]       ; r5 = 'd5'
        ldrsh   r7, [lr, #(14*8)]       ; r7 = 'd7'

        ; Check for empty odd column (happens about 20 to 25 % of the time
        ; according to the original author's stats)
        orr     r9, r1, r3
        orr     r10, r5, r7
        orrs    r10, r9, r10
        beq     empty_odd_column

        stmdb   sp!, { r0, r2, r4, r6 } ; save on the stack tmp10, tmp13, tmp11, tmp12

        ; Odd part
        add     r0, r3, r5              ; r0 = 'z2'
        add     r2, r1, r7              ; r2 = 'z1'
        add     r4, r3, r7              ; r4 = 'z3'
        add     r6, r1, r5              ; r6 = 'z4'
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6              ; r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8              ; r8 = 'z5'
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2             ; r2 = 'z1'
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0              ; r0 = 'z2'
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8         ; r4 = 'z3'
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8          ; r6 = 'z4'
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2         ; r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0          ; r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0         ; r3 = tmp2 + z2
        add     r7, r7, r4              ; r7 = tmp0
        mla     r1, r9, r1, r2          ; r1 = tmp3 + z1
        add     r5, r5, r6              ; r5 = tmp1
        add     r3, r3, r4              ; r3 = tmp2
        add     r1, r1, r6              ; r1 = tmp3

        ldmia   sp!, { r0, r2, r4, r6 } ; r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
                                        ; r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        ; Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        add     r8, r0, r1
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 0*8)]

        ; Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
        sub     r8, r0, r1
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(14*8)]

        ; Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        add     r8, r4, r3
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 2*8)]

        ; Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
        sub     r8, r4, r3
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(12*8)]

        ; Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        add     r8, r6, r5
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 4*8)]

        ; Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
        sub     r8, r6, r5
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(10*8)]

        ; Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        add     r8, r2, r7
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 6*8)]

        ; Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
        sub     r8, r2, r7
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 8*8)]

        ; End of column loop (full computation path)
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop
        b       the_end

empty_odd_column
        ; d1 = d3 = d5 = d7 = 0, so tmp0..tmp3 are all zero and each
        ; "+" / "-" output pair collapses to the same value.

        ; Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        ; Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
        add     r0, r0, #(1<<17)
        mov     r0, r0, asr #18
        strh    r0, [lr, #( 0*8)]
        strh    r0, [lr, #(14*8)]

        ; Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        ; Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
        add     r4, r4, #(1<<17)
        mov     r4, r4, asr #18
        strh    r4, [lr, #( 2*8)]
        strh    r4, [lr, #(12*8)]

        ; Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        ; Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
        add     r6, r6, #(1<<17)
        mov     r6, r6, asr #18
        strh    r6, [lr, #( 4*8)]
        strh    r6, [lr, #(10*8)]

        ; Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        ; Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
        add     r2, r2, #(1<<17)
        mov     r2, r2, asr #18
        strh    r2, [lr, #( 6*8)]
        strh    r2, [lr, #( 8*8)]

        ; End of column loop (empty odd part path)
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop

the_end
        add     sp, sp, #4              ; drop the saved data pointer
        ldmia   sp!, { r4 - r12, pc }   ; restore callee saved regs and return

        ALIGN
        ; Constants table, addressed through r11 with the *_ID offsets.
const_array
        DCD     2446                    ; FIX_0_298631336
        DCD     4433                    ; FIX_0_541196100
        DCD     6270                    ; FIX_0_765366865
        DCD     9633                    ; FIX_1_175875602
        DCD     12299                   ; FIX_1_501321110
        DCD     16819                   ; FIX_2_053119869
        DCD     25172                   ; FIX_3_072711026
        DCD     -3196                   ; FIX_M_0_390180644
        DCD     -7373                   ; FIX_M_0_899976223
        DCD     -15137                  ; FIX_M_1_847759065
        DCD     -16069                  ; FIX_M_1_961570560
        DCD     -20995                  ; FIX_M_2_562915447
        DCD     0xFFFF                  ; 16-bit mask used by empty_row
        ENDP

        END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -