📄 dcts.s
字号:
MOVS t1, $reg, ASR #16
ADDCS t1, t1, #1 ; get high value
AND t2, t0, #&00FFFFFF ; extract quant coef
MLA t3, t1, t2, q_round ; quantise high (now shifted 16 up)
SUB $reg, $reg, t1, LSL #16 ; extract low
MLA t2, $reg, t2, q_round ; quantise low
AND t3, mask, t3 ; knock bottom bits off high
ORR $reg, t3, t2, LSR #16 ; ORR-combine & shift down low 16
STR $reg, [$ptr, t0, ASR #22] ; save at t0-position
MEND
; fdct_1 - Forward DCT, part 1: pure add/subtract butterflies.
;
;   In/out : r0-r7 (all eight are rewritten in place)
;   Flags  : untouched (no instruction here uses the S suffix)
;
;   Every two-instruction pair below turns (a, b) into (a + b, a - b):
;       ADD a, a, b             ; a' = a + b
;       SUB b, a, b, LSL #1     ; b' = a' - 2*b = a - b
;   Pairs inside one stage work on disjoint registers, so their relative
;   order does not affect the result.
        MACRO
        fdct_1
        ; --- stage 1: r3/r4, r2/r5, r1/r6, r0/r7 ---
        ADD     r3, r3, r4
        SUB     r4, r3, r4, LSL #1
        ADD     r2, r2, r5
        SUB     r5, r2, r5, LSL #1
        ADD     r1, r1, r6
        SUB     r6, r1, r6, LSL #1
        ADD     r0, r0, r7
        SUB     r7, r0, r7, LSL #1
        ; --- stage 2: r1/r2, r0/r3 (needs the stage 1 sums) ---
        ADD     r1, r1, r2
        SUB     r2, r1, r2, LSL #1
        ADD     r0, r0, r3
        SUB     r3, r0, r3, LSL #1
        ; --- stages 3 and 4: r0/r1 ---
        ADD     r0, r0, r1
        SUB     r1, r0, r1, LSL #1
        ; r0 and r1 are complete at this point
        MEND
; Forward DCT part 2
; Works on r2-r7 in place; r0 and r1 are not referenced by the
; instructions written here. No instruction written here sets flags.
; MUL2C and FDCT_SHIFT are defined outside this section.
; NOTE(review): MUL2C may use scratch registers / 'round' - confirm
; against its definition before moving anything across it.
MACRO
fdct_2
; stage 2 for r4-r7
; Each ADD reads its right-hand neighbour BEFORE that neighbour is
; updated by the next ADD, so these three must stay in this order:
;   r4 = r4 + r5,  r5 = r5 + r6,  r6 = r6 + r7
ADD r4, r4, r5
ADD r5, r5, r6
ADD r6, r6, r7
; stage 3 for r2-r3,r5
; r2 = r2 + r3 (r3 itself is left unchanged here)
ADD r2, r2, r3
; NOTE(review): presumably scales r2 and r5 by 724 / 2^FDCT_SHIFT
; (724/1024 ~= 0.7071 if FDCT_SHIFT is 10) - confirm.
MUL2C r2, r5, 724, FDCT_SHIFT
; stage 4 for r2-r3
; Butterfly: r2' = r2 + r3 ; r3' = 2*r3 - r2' = r3 - r2
ADD r2, r2, r3
RSB r3, r2, r3, LSL #1
; r2-r3 now finished
MEND
; Forward DCT part 3
; Works on r4-r7 in place; r0-r3 are not referenced by the
; instructions written here. No instruction written here sets flags.
; ROTATE and FDCT_SHIFT are defined outside this section.
MACRO
fdct_3
; stage 3 for r4,r6
; NOTE(review): presumably a fixed-point rotation of the (r4, r6) pair
; with coefficients 392 and 946 scaled by 2^FDCT_SHIFT - confirm
; against the ROTATE definition (it may also use scratch registers).
ROTATE r4, r6, 392, 946, FDCT_SHIFT
; stage 4 for r4-r7
; r5' = r5 + r7 ; r7' = 2*r7 - r5' = r7 - r5
ADD r5, r5, r7
RSB r7, r5, r7, LSL #1
; r4' = r4 + r7' ; r7'' = 2*r7' - r4' = r7' - r4
ADD r4, r4, r7
RSB r7, r4, r7, LSL #1
; r5'' = r5' + r6 ; r6' = r5'' - 2*r6 = r5' - r6
ADD r5, r5, r6
SUB r6, r5, r6, LSL #1
MEND
; Reverse DCT part 1
; Works on r4-r7 in place; r0-r3 are not referenced by the
; instructions written here. No instruction written here sets flags.
; ROTATE, MULC and RDCT_SHIFT are defined outside this section.
; The steps form one dependency chain - keep the order as written.
MACRO
rdct_1
; invert stage 3,4 for r4-r7
; r4' = r4 - r7 ; r7' = r4' + 2*r7 = r4 + r7
SUB r4, r4, r7
ADD r7, r4, r7, LSL #1
; r5' = r5 + r6 ; r6' = 2*r6 - r5' = r6 - r5
ADD r5, r5, r6
RSB r6, r5, r6, LSL #1
; NOTE(review): presumably the fixed-point rotation of (r4, r6) with
; coefficients 392/946 scaled by 2^RDCT_SHIFT - confirm.
ROTATE r4, r6, 392, 946, RDCT_SHIFT
; r7'' = r7' + r5' ; r5'' = 2*r5' - r7'' = r5' - r7'
ADD r7, r7, r5
RSB r5, r7, r5, LSL #1
; NOTE(review): presumably r5 *= 724 / 2^RDCT_SHIFT - confirm.
MULC r5, 724, RDCT_SHIFT
; Running sums/differences; each line uses the value produced by the
; line before it.
ADD r6, r6, r7
ADD r5, r5, r6
SUB r4, r4, r5
MEND
; Reverse DCT part 2
; Works on r2 and r3 in place. Changes the condition flags (SUBS).
; MULC and RDCT_SHIFT are defined outside this section.
MACRO
rdct_2
; invert stages 4,3,2 for r2-r3
; r2' = r2 - r3, setting Z when the difference is zero.
SUBS r2, r2, r3
; r3' = r2' + 2*r3 = r2 + r3. ADD has no S suffix, so the flags from
; SUBS are still intact for the BEQ below.
ADD r3, r2, r3, LSL #1
; Skip the multiply when r2' is zero.
BEQ %FT00 ; >10% zero
MULC r2, 724, RDCT_SHIFT
; '00' is a numeric local label; %FT00 above searches forward for it.
00 ; mul finished
SUB r2, r2, r3
MEND
; rdct_3 - Reverse DCT, part 3: the remaining add/subtract butterflies.
;
;   In/out : r0-r7 (all eight are rewritten in place)
;   Flags  : untouched (no instruction here uses the S suffix)
;
;   Sum/difference pair forms used below:
;       ADD a, a, b / SUB b, a, b, LSL #1   -> a' = a + b, b' = a - b
;       SUB a, a, b / ADD b, a, b, LSL #1   -> a' = a - b, b' = a + b
;   Pairs inside one level work on disjoint registers, so their relative
;   order does not affect the result.
        MACRO
        rdct_3
        ; --- level 1: r0/r1 ---
        ADD     r0, r0, r1
        SUB     r1, r0, r1, LSL #1
        ; --- level 2: r1/r2 and r0/r3 (need level 1) ---
        ADD     r1, r1, r2
        SUB     r2, r1, r2, LSL #1
        ADD     r0, r0, r3
        SUB     r3, r0, r3, LSL #1
        ; --- level 3: r3/r4, r2/r5, r1/r6, r0/r7 (need level 2) ---
        ADD     r3, r3, r4
        SUB     r4, r3, r4, LSL #1
        ADD     r2, r2, r5
        SUB     r5, r2, r5, LSL #1
        SUB     r1, r1, r6              ; note: difference lands in r1 ...
        ADD     r6, r1, r6, LSL #1      ; ... and the sum in r6
        ADD     r0, r0, r7
        SUB     r7, r0, r7, LSL #1
        MEND
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Forward DCT - ARM ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 = DCT buffer, 256 byte-aligned (input at r0, output at r0)
; (shifts up to do horizontal then down again doing vertical+quant)
; r1 = MCU descriptor, with the number of 8x8 FDCTs to do.
; As the FDCT processes 2 blocks at a time, this number is rounded
; upwards to a multiple of 2.
; r2 = quantization array pointer, contains #blocks / 2 pointers to
; quantization tables. The same table is used for each block pair
;
; Structure: pass 1 (fdct_nextrow) walks the input backwards and writes
; its results 256 bytes higher in the buffer with a 32-byte stride
; between the eight outputs of a row; pass 2 (fdct_nextcol) walks
; forwards over those results and hands every word to 'quantize'.
;
; r4-r11 and r14 are saved on entry. The register aliases srce, dst,
; dst_end, src_end, mask, round, bias, q_round, q_ptr and quant, and the
; macros quantize and RETURN, are defined outside this section.
; NOTE(review): a block count of 0 is not special-cased - confirm that
; callers never pass 0.
dctbuf RN 0
mcudesc RN 1
quanta RN 2
EXPORT fdct_fast
fdct_fast
STMFD sp!, {R4-R11, R14}
; pairs = (count + 1) >> 1
ADD mcudesc, mcudesc, #1
MOV mcudesc, mcudesc, LSR #1 ; get number of double 8x8 blocks
; srce = dctbuf + pairs*256 (one past the last input word)
ADD srce, dctbuf, mcudesc, LSL #8 ; END of source picture
ADD dctbuf, dctbuf, #32 ; doing last row first!
; dst and the end mark both become (dctbuf + 32) + pairs*256
ADD dst, dctbuf, mcudesc, LSL #8 ; destination buffer end
ADD mcudesc, dctbuf, mcudesc, LSL #8 ; dst_end
; Stack slots from here on:
;   [sp, #0] = dctbuf + 32,  [sp, #4] = end mark,  [sp, #8] = quanta
; r0-r2 are about to be overwritten by the LDMs below.
STMFD sp!, {dctbuf, mcudesc, quanta}
MOV mask, #255 << 16
ORR mask, mask, #255 << 24 ; mask = 0xffff0000
MOV round, #1 << (FDCT_SHIFT - 1) ; round = 0.5
; bias = 2048 in both 16-bit halves (0x08000800)
MOV bias, #1024 * 2
ORR bias, bias, bias, LSL #16
; Horizontal DCT loop (backwards from the last block)
fdct_nextrow
; Loads the 8 words just below srce; srce moves down by 32 bytes.
LDMDB srce!, {r0 - r7} ; load next 16 pixels - going BACKWARDS
fdct_1
SUB r0, r0, bias ; subtract DC bias
; dst moves down by 4 per row; the eight results of this row are
; stored 32 bytes apart starting at the new dst.
STR r0, [dst, #-4]! ; save data in first block - going BACK
STR r1, [dst, #32*1]
fdct_2
STR r2, [dst, #32*2]
STR r3, [dst, #32*3]
fdct_3
STR r4, [dst, #32*4]
STR r5, [dst, #32*5]
STR r6, [dst, #32*6]
STR r7, [dst, #32*7]
; The low five bits of dst return to zero after 8 rows (8 * 4 bytes).
TST dst, #0x1F
BNE fdct_nextrow ; still on same block
; Loads the stacked dctbuf + 32.
LDR dst_end, [sp, #0] ; get start of MCU buffer
SUB dst, dst, #256 - 32 ; move back a block
CMP dst, dst_end ; finished?
BHI fdct_nextrow
; srce = dst + 224: start of the first block written by pass 1.
ADD srce, dst, #256 - 32
MOV q_round, #1 << 15
; Vertical Block loop
fdct_nextcol1
; Fetch the next table pointer from the array and write the advanced
; array pointer back to its stack slot.
LDR q_ptr, [sp, #8] ; get ptr to array of quant ptrs
LDR quant, [q_ptr], #4 ; get adr of quantization table
STR q_ptr, [sp, #8] ; store ptr to next quantization table
; Vertical 8xDCT loop
fdct_nextcol
; srce advances by 32 bytes here ...
LDMIA srce!, {r0 - r7} ; now go forward
fdct_1
; ... then r0-r3 are spilled back into the 16 bytes just below it
; (memory order low-to-high: r2, r3, r0, r1), leaving srce at +16.
STMDB srce!, {r0, r1}
fdct_2
STMDB srce!, {r2, r3}
fdct_3
; NOTE(review): quantize is only partly visible in this section; its
; final STR addresses memory relative to the pointer argument (srce).
quantize r4, srce ; funny order corrected by zig/zag
quantize r5, srce
quantize r6, srce
quantize r7, srce
; Reload the spilled words: r4 = old r2, r5 = old r3, r6 = old r0,
; r7 = old r1. srce is now 32 bytes past where this row started.
LDMIA srce!, {r4 - r7}
quantize r4, srce ; f2
quantize r5, srce ; f3
quantize r6, srce ; f0
quantize r7, srce ; f1
; 8 rows of 32 bytes make one 256-byte block pair.
TST srce, #255
BNE fdct_nextcol ; in same block
; Loads the end mark stacked at entry.
LDR src_end, [sp, #4] ; end of data
CMP srce, src_end
BLS fdct_nextcol1 ; next block
ADD sp, sp, #12 ; 'pull' end markers
RETURN "r4-r11","","","" ; return (rlist, sp, lr, no condition)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Reverse DCT - ARM ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 = DCT input buffer (256 byte-aligned) input at r0 + 256, output at r0
; r1 = MCU descriptor, with the number of 8x8 RDCTs to do.
; As the RDCT processes 2 blocks at a time, this number is rounded
; upwards to a multiple of 2.
;
; Structure: pass 1 (rdct_nextrow) walks the input backwards one
; 32-byte row at a time and transforms each non-zero row in place;
; pass 2 (rdct_nextcol) reads words 32 bytes apart from that data and
; writes 8 consecutive words per step to the output, 256 bytes lower.
;
; r4-r11 and r14 are saved on entry. The register aliases srce, dst,
; dst_end, mask, round and const, and the RETURN macro, are defined
; outside this section.
dctbuf RN 0
mcudesc RN 1
EXPORT rdct_fast
rdct_fast
STMFD sp!, {R4-R11, R14}
; pairs = (count + 1) >> 1
ADD mcudesc, mcudesc, #1
MOV mcudesc, mcudesc, LSR #1 ; get number of double 8x8 blocks
; end mark = dctbuf + pairs*256; kept at [sp, #0] until exit
ADD mcudesc, dctbuf, mcudesc, LSL#8 ; get end mark
STR mcudesc, [sp, #-4]! ; stack it
ADD srce, mcudesc, #256 ; points just behind the last input block
ADD dst, dctbuf, #256 ; points to 2nd block
MOV mask, #255 << 16
ORR mask, mask, #255 << 24 ; mask = 0xffff0000
MOV round, #1 << (RDCT_SHIFT - 1) ; round = 0.5
; Horizontal loop
; (this is actually inverse vertical as the zig/zag did a transpose)
; stored horizontally (so transposed) as well
rdct_nextrow
; Upper half of the row first; srce moves down by 16 bytes.
LDMDB srce!, {r4 - r7}
; Z stays set only if all four words are zero.
ORRS const, r4, r5
ORREQS const, r6, r7
BNE rdct_nonzero1 ; if r4-r7 not all 0 do rotate
; Lower half; srce is now at the start of the row.
LDMDB srce!, {r0 - r3}
ORRS const, r2, r3
ORREQS const, r0, r1
BNE rdct_nonzero2 ; if r0-r3 not all 0 do them
; all zero - skip row - don't even need to store
TST srce, #255 ; reached the end?
BNE rdct_nextrow ; no
B rdct_nextblock ; yes
rdct_nonzero1
rdct_1
; Fetch r2, r3 (srce down by 8), then step down another 8 so srce is
; at the start of the row, matching the rdct_nonzero2 entry state.
LDMDB srce!, {r2, r3}
SUB srce, srce, #8
rdct_nonzero2
; When entered by the branch above, r4-r7 are all zero and rdct_1 has
; been skipped.
rdct_2
; No writeback: srce stays at the start of the row. On the branch path
; r0, r1 were already loaded, so this reload is redundant there.
LDMIA srce, {r0, r1}
rdct_3
; Write the transformed row back over its input.
STMIA srce, {r0 - r7}
; 8 rows of 32 bytes make one 256-byte block pair.
TST srce, #255
BNE rdct_nextrow
; Vertical block loop
rdct_nextblock
; dst still holds dctbuf + 256 here; keep going while srce is above it.
CMP srce, dst
BHI rdct_nextrow
; Output starts 256 bytes below the transformed data.
SUB dst, srce, #256
; Vertical column loop
; (actually the inverse horizontal transform)
; (matrix is transposed when read)
rdct_nextcol
; Gather one column: words 32 bytes apart, starting at srce.
LDR r4, [srce, #32*4]
LDR r5, [srce, #32*5]
LDR r6, [srce, #32*6]
LDR r7, [srce, #32*7]
; Skip rdct_1 when r4-r7 are all zero.
ORRS const, r4, r5
ORREQS const, r6, r7
BEQ rdct_zero1
rdct_1
rdct_zero1
LDR r2, [srce, #32*2]
LDR r3, [srce, #32*3]
rdct_2
LDR r1, [srce, #32*1]
; Post-increment: srce moves on to the next column.
LDR r0, [srce], #4
rdct_3
; Eight consecutive output words; dst advances by 32 bytes.
STMIA dst!, {r0 - r7}
TST dst, #255
BNE rdct_nextcol
; Loads the end mark stacked at entry.
LDR dst_end, [sp, #0]
; srce advanced 32 bytes over the 8 columns; add 224 to reach the
; next 256-byte block pair.
ADD srce, srce, #256 - 32
CMP dst, dst_end
BLO rdct_nextcol
ADD sp, sp, #4 ; unstack end pointer
RETURN "r4-r11","","","" ; return (rlist, sp, lr, no condition)
END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -