📄 dcts.s
字号:
;/*
; * Discrete Cosine Transform assembler
; * Copyright (C) ARM Limited 1998-1999. All rights reserved.
; */
; Highly optimised forward and inverse 2DIM DCT
; The 1 DIM 8 element FDCT
; ========================
;
; The 1 DIM 8 element DCT takes 8 inputs f(0)-f(7) and produces 8 outputs,
; T(0)-T(7).
;
; It is described by the formula:
;
; T(u) = C(u) (f(0)c(u,0) + ... + f(7)c(u,7))
;
; where C(u) = 1/(2*sqr(2)) if u=0 and 1/2 if u>0
; and c(u,x) = cos( (2x+1)*u*pi/16 )
;
; The most efficient way of calculating it so far known is by Arai, Agui,
; and Nakajima and can be split up into stages as follows:
;
; Stage 1:
; g(0) = f(0)+f(7)
; g(1) = f(1)+f(6)
; g(2) = f(2)+f(5)
; g(3) = f(3)+f(4)
; g(4) = f(3)-f(4)
; g(5) = f(2)-f(5)
; g(6) = f(1)-f(6)
; g(7) = f(0)-f(7)
;
; Stage 2:
; f(0) = g(0)+g(3)
; f(3) = g(0)-g(3)
;
; f(1) = g(1)+g(2)
; f(2) = g(1)-g(2)
;
; f(4) = g(4)+g(5)
; f(5) = g(5)+g(6)
; f(6) = g(6)+g(7)
; f(7) = g(7)
;
; Stage 3:
; g(0) = f(0)
; g(1) = f(1)
;
; g(2) = (f(2)+f(3)) * a1 (a1 = 1/sqr(2))
; g(3) = f(3)
;
; temp = (f(4)-f(6)) * a5 (a5 = cos(3*pi/8))
; g(4) = temp + f(4) * a2 (a2 = cos(pi/8)-cos(3*pi/8))
; g(6) = temp + f(6) * a4 (a4 = cos(pi/8)+cos(3*pi/8))
;
; temp = f(5) * a3 (a3 = 1/sqr(2))
; g(5) = f(7) + temp
; g(7) = f(7) - temp
;
; Stage 4: (produce the Fourier transform results *8 and *16)
; F(0) = g(0)+g(1)
; F(4) = g(0)-g(1)
;
; F(2) = g(3)+g(2)
; F(6) = g(3)-g(2)
;
; F(5) = g(7)+g(4)
; F(3) = g(7)-g(4)
;
; F(1) = g(5)+g(6)
; F(7) = g(5)-g(6)
;
; Stage 5:
; T(0) = F(0)/2
; T(u) = F(u)/sqr(2)/cos(pi*u/16)/2 == F(u) / ( sqr(2)*cos(pi*u/16)*2 )
;
; The key point being that the calculation to convert F to T can be
; done at the same time as the quantisation! This just leaves 5 multiplies
; for the main part of the DCT.
;
; Constants are a1 = a3 = cos(pi/4) = 1/sqr(2) = 0.707106781
; a2 = cos(pi/8)-cos(3*pi/8) = 0.5411961
; a4 = cos(pi/8)+cos(3*pi/8) = 1.30656297
; a5 = cos(3*pi/8) = 0.382683433
;
; And T(u) = F(u)/t(u)/2 where:
;
; t(u) = 1 if u=0
; t(u) = sqr(2)*cos(pi*u/16) if u>0
;
; The 1 DIM 8 element RDCT
; ========================
;
; The reverse DCT algorithm can be found by reversing each step of the above algorithm.
; For example
; F(1) = g(5)+g(6)
; F(7) = g(5)-g(6)
; is reversed as:
; g(5) = (F(1)+F(7)) / 2
; g(6) = (F(1)-F(7)) / 2
; As an optimization the divisions by 2 are left out. Similarly, multiplications by N
; are replaced by 1/N and the rotation in stage 3 is reversed.
; The result is a reverse DCT which is almost lossless while only using 16 bit arithmetic.
; The 2 DIM 8x8 DCT
; =================
;
; Do 1 DIM DCT horizontally on every row then vertically on every column
; (or the other way around). Note that if the t(u) scalings are not
; done then you are left with a matrix of t(i)t(j) elements to divide
; by. This is the AANscales matrix.
;
; The horizontal DCT magnifies by sqrt(8) and the vertical by sqrt(8)
; so to normalise a right shift of 3 is needed after a horizontal or
; vertical DCT. For extra precision, an initial shift of 1 is included
; in the FDCT (result is 12 bit signed, upshifted by 1+3 bits)
; and 4 for the RDCT (result is 8 bit signed, upshifted by 4+3 bits).
; This implementation of the DCT is highly optimised and so may be hard to follow
; The code performs two DCT's at once on 16 bit data by storing them in the form
;
; reg[n] = f0(n) + (f1(n)<<16) for ARM code
;
; reg[n].h = f1(n) for PICCOLO code
; reg[n].l = f0(n)
;
; where f0 and f1 are the two 8x8 arrays being transformed.
;
; Lines 0-7 of the flowgraph are kept in registers r0-r7. In order to save
; registers, registers r0-r3 are also used as temporaries when calculating
; on coefficients f4 - f7.
; The results are stored out in weird orders (whatever is optimal from an ARM
; register allocation viewpoint) since the order can be corrected by modifying
; the zig zag table. Hence the zig-zag table has become quite complicated.
; Project header. NOTE(review): presumably this is where the $interwork
; symbol used on the AREA line below is defined - confirm against intworkm.h.
INCLUDE intworkm.h
; FASTMUL selects the multiply implementation used by the macros in this file:
;   FASTMUL = 1  -> MLA based code (MUL_F and the MLA path of ROT3_C)
;   otherwise    -> shift/add sequences (MUL_S and the shift/add path of ROT3_C)
FASTMUL EQU 0
; Read-only code area holding everything that follows.
AREA |C$$code|, CODE, READONLY $interwork
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ARM MACROS ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ---- Register allocation ------------------------------------------------
; Scratch registers
t0              RN      0
t1              RN      1
t2              RN      2
t3              RN      3
; Further names for the register already called t0
q_ptr           RN      t0
dst_end         RN      t0
src_end         RN      t0
; Data pointers
srce            RN      8               ; colour data being read
dst             RN      9               ; where the output is written
quant           RN      dst             ; quantization table pointer (same register as dst)
; Values kept in registers
round           RN      10              ; rounding value (0.5) for the DCT
q_round         RN      11              ; rounding value (0.5) for quantization
bias            RN      11              ; DC bias (same register as q_round)
const           RN      12              ; multiplication constant currently in use
mask            RN      14              ; mask used to merge 2 signed 16 bit values
; ---- Fixed point precision ----------------------------------------------
FDCT_SHIFT      EQU     10              ; FDCT coefficient precision after a multiply
RDCT_SHIFT      EQU     9               ; RDCT coefficient precision after a multiply
; FAST MULTIPLY
; Take the two combined 16 bit values in $reg and multiply both by $val.
; Uses the MLA instruction and two temps.
;   $reg          combined value (low + (high << 16)); replaced by the
;                 combined, rounded and shifted result
;   $val          register holding the multiplier
;   $tmp1, $tmp2  scratch registers, clobbered
;   $shift        number of bits to shift right after the multiply
; Reads the 'round' and 'mask' registers and clobbers the flags (MOVS).
; NOTE(review): 'mask' is presumably 0xFFFF0000 (keeps only the high half);
; it is set up outside this macro - confirm at the call sites.
MACRO
MUL_F $reg, $val, $tmp1, $tmp2, $shift
; Split: ASR #16 leaves bit 15 of $reg in C. When the low half is negative its
; sign extension has borrowed 1 from the high half, so add it back.
MOVS $tmp1, $reg, ASR #16
ADDCS $tmp1, $tmp1, #1 ; get high part of $reg
MLA $tmp2, $tmp1, $val, round ; multiply by $val
SUB $reg, $reg, $tmp1, LSL #16 ; get low part of $reg
MLA $tmp1, $reg, $val, round ; multiply by $val
; Shifting left by (16 - shift) and masking is the same as shifting the high
; product right by 'shift' and placing it in the top 16 bits.
AND $tmp2, mask, $tmp2, LSL #16 - $shift
ADD $reg, $tmp2, $tmp1, ASR #$shift ; combine
MEND
; SLOW MULTIPLY by 724 (= 1/sqrt(2) at q10, or equivalently sqrt(2) at q9)
; For processors without a fast multiplier: multiplies both 16 bit halves of
; the combined value in $reg by 724 using only shifts and adds.
;   $reg          combined value (low + (high << 16)); replaced by the
;                 combined, rounded and shifted result
;   $val          must be 724 (checked by the ASSERT, not otherwise used)
;   $tmp1, $tmp2  scratch registers, clobbered
;   $shift        number of bits to shift right after the multiply
; Reads the 'round' and 'mask' registers and clobbers the flags (MOVS).
; NOTE(review): 'mask' is presumably 0xFFFF0000 (keeps only the high half);
; it is set up outside this macro - confirm at the call sites.
MACRO
MUL_S $reg, $val, $tmp1, $tmp2, $shift
; Split: C = bit 15 of $reg, i.e. the sign of the bottom value; add it back to
; the top value to undo the borrow caused by a negative bottom value.
MOVS $tmp1, $reg, ASR #16
ADDCS $tmp1, $tmp1, #1 ; tmp1 = top value
SUB $reg, $reg, $tmp1, LSL #16 ; reg = bottom value
ASSERT $val = 724
; Bottom value: x3, x45, x181, then x4 plus the rounding term
ADD $tmp2, $reg, $reg, LSL #1 ; tmp2 = reg * 3
RSB $tmp2, $tmp2, $tmp2, LSL #4 ; tmp2 = reg * 45 (3 * 15)
ADD $tmp2, $reg, $tmp2, LSL #2 ; tmp2 = reg * 181
ADD $tmp2, round, $tmp2, LSL #2 ; *4 = 724
; Top value: same sequence, result built in $reg
ADD $reg, $tmp1, $tmp1, LSL #1 ; reg = tmp1 * 3
RSB $reg, $reg, $reg, LSL #4 ; reg = tmp1 * 45 (3 * 15)
ADD $reg, $tmp1, $reg, LSL #2 ; reg = tmp1 * 181
ADD $reg, round, $reg, LSL #2 ; *4 = 724
; Place the shifted top product in the high half, then add the shifted
; (sign extended) bottom product.
AND $reg, mask, $reg, LSL #16 - $shift
ADD $reg, $reg, $tmp2, ASR #$shift ; combine
MEND
; MUL2C - scale two combined registers by the same constant.
;   $reg1, $reg2  combined values; each is replaced by its scaled result
;   $val          multiplier constant
;   $shift        right shift applied after the multiply
; Clobbers t0, t1 and the flags; the FASTMUL = 1 path also loads 'const'.
        MACRO
        MUL2C   $reg1, $reg2, $val, $shift
        IF FASTMUL <> 1
        ; no fast multiplier: shift/add version, constant checked by MUL_S
        MUL_S   $reg1, $val, t0, t1, $shift
        MUL_S   $reg2, $val, t0, t1, $shift
        ELSE
        ; fast multiplier: load the constant once and use it for both
        MOV     const, #$val
        MUL_F   $reg1, const, t0, t1, $shift
        MUL_F   $reg2, const, t0, t1, $shift
        ENDIF
        MEND
; MULC - scale one combined register by a constant.
;   $reg    combined value; replaced by its scaled result
;   $val    multiplier constant
;   $shift  right shift applied after the multiply
; Clobbers t0, t1 and the flags; the FASTMUL = 1 path also loads 'const'.
        MACRO
        MULC    $reg, $val, $shift
        IF FASTMUL <> 1
        ; no fast multiplier: shift/add version, constant checked by MUL_S
        MUL_S   $reg, $val, t0, t1, $shift
        ELSE
        ; fast multiplier: put the constant in a register for MLA
        MOV     const, #$val
        MUL_F   $reg, const, t0, t1, $shift
        ENDIF
        MEND
; Perform rotation from step 3:
; temp = (f(4)-f(6)) * a5 (a5 = cos(3*pi/8))
; g(4) = temp + f(4) * a2 (a2 = cos(pi/8)-cos(3*pi/8))
; g(6) = temp + f(6) * a4 (a4 = cos(pi/8)+cos(3*pi/8))
; here $r1 and $r2 are the unpacked values f(4) and f(6)
; $val1 = cos(3*pi/8)
; $val2 = cos(pi/8)
; Results: $t1 = g(4) and $t2 = g(6), each including the 'round' term and
; still scaled by the constants (no right shift is applied here).
; The fast path leaves $r1/$r2 intact and clobbers 'const'; the slow path
; clobbers $r1 and $r2 and requires $val1 = 392, $val2 = 946 (the q10 values
; of the two cosines).
MACRO
ROT3_C $r1, $r2, $t1, $t2, $val1, $val2
SUB $t1, $r1, $r2 ; get 'temp'
IF FASTMUL = 1 ; fast multiplier available
; const is stepped through a5, a2, a4 by adding the differences
MOV const, #$val1
MLA $t2, $t1, const, round ; t2 = temp * a5 + round
ADD const, const, #$val2 - 2 * $val1 ; $val2 - $val1
MLA $t1, $r1, const, $t2 ; t1 = r1 * a2 + t2
ADD const, const, #$val1 * 2 ; $val1 + $val2
MLA $t2, $r2, const, $t2 ; t2 = r2 * a4 + t2
ELSE ; no fast multiplier
ASSERT ($val1 = 392) :LAND: ($val2 = 946)
ADD $t2, $t1, $t1, LSL #1 ; t2 = t1 * 3
ADD $t2, $t1, $t2, LSL #4 ; t2 = t1 * 49 (392 >> 3)
ADD $t2, round, $t2, LSL #3 ; t2 = t1 * 392 + round
RSB $t1, $r1, $r1, LSL #5 ; t1 = r1 * 31
ADD $t1, $t1, $t1, LSL #3 ; t1 = r1 * 279
SUB $r1, $t1, $r1, LSL #1 ; r1 *= 277 (554 >> 1)
RSB $t1, $r2, $r2, LSL #3 ; t1 = r2 * 7
RSB $t1, $r2, $t1, LSL #5 ; t1 = r2 * 223
ADD $r2, $t1, $t1, LSL #1 ; r2 *= 669 (1338 >> 1)
ADD $t1, $t2, $r1, LSL #1 ; t1 = t2 + (original r1) * 554
ADD $t2, $t2, $r2, LSL #1 ; t2 = t2 + (original r2) * 1338
ENDIF
MEND
; Perform rotation from step 3 on COMBINED values in $reg1, $reg2
;   $reg1, $reg2  combined values (low + (high << 16)); replaced by the
;                 combined rotation results (ROT3_C results $t1 and $t2
;                 respectively), shifted right by $shift
;   $val1, $val2  constants passed through to ROT3_C
;   $shift        number of bits to shift right after the multiplies
; Clobbers t0-t3 and the flags; reads 'round' and 'mask' (and 'const' is
; clobbered by ROT3_C when FASTMUL = 1).
MACRO
ROTATE $reg1, $reg2, $val1, $val2, $shift
; Split each combined value: the high half goes to t0/t1 (corrected by the
; carry when the low half is negative), the low half stays in place.
MOVS t0, $reg1, ASR #16
ADDCS t0, t0, #1
SUB $reg1, $reg1, t0, LSL #16 ; split reg1
MOVS t1, $reg2, ASR #16
ADDCS t1, t1, #1
SUB $reg2, $reg2, t1, LSL #16 ; split reg2
; The high values are rotated first so that t0/t1 are free to receive the
; results of the low rotation.
ROT3_C t0, t1, t2, t3, $val1, $val2 ; rotate 'high' values
ROT3_C $reg1, $reg2, t0, t1, $val1, $val2 ; rotate 'low' values
; Re-pack: shifted high result in the top half plus the shifted low result
AND t2, mask, t2, LSL #16 - $shift
ADD $reg1, t2, t0, ASR #$shift ; combine reg1
AND t3, mask, t3, LSL #16 - $shift
ADD $reg2, t3, t1, ASR #$shift ; combine reg2
MEND
; Compress quantize a pair of combined values
; $reg = combined values
; $ptr = current output ptr
MACRO
quantize $reg, $ptr
LDR t0, [quant], #4 ; find next quant coef + zigzag
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -