📄 dcts.s

📁 arm ads1.2 with crack.rar
💻 S
📖 第 1 页 / 共 2 页
字号:
12 下一页
;/*
; * Discrete Cosine Transform assembler
; * Copyright (C) ARM Limited 1998-1999. All rights reserved.
; */

; Highly optimised forward and inverse 2DIM DCT

; The 1 DIM 8 element FDCT
; ========================
;
; The 1 DIM 8 element DCT takes 8 inputs f(0)-f(7) and produces 8 outputs,
; T(0)-T(7).
;
; It is described by the formula:
;
; T(u) = C(u) (f(0)c(u,0) + ... + f(7)c(u,7))
;
; where C(u) = 1/(2*sqr(2)) if u=0  and  1/2 if u>0
; and   c(u,x) = cos( (2x+1)*u*pi/16 )
;
; The most efficient way of calculating it known so far is by Arai, Agui,
; and Nakajima and can be split up into stages as follows:
;
; Stage 1:
;		g(0) = f(0)+f(7)
;		g(1) = f(1)+f(6)
;		g(2) = f(2)+f(5)
;		g(3) = f(3)+f(4)
;		g(4) = f(3)-f(4)
;		g(5) = f(2)-f(5)
;		g(6) = f(1)-f(6)
;		g(7) = f(0)-f(7)
;
; Stage 2:
;		f(0) = g(0)+g(3)
;		f(3) = g(0)-g(3)
;
;		f(1) = g(1)+g(2)
;		f(2) = g(1)-g(2)
;
;		f(4) = g(4)+g(5)
;		f(5) = g(5)+g(6)
;		f(6) = g(6)+g(7)
;		f(7) = g(7)
;
; Stage 3:
;		g(0) = f(0)
;		g(1) = f(1)
;
;		g(2) = (f(2)+f(3)) * a1		(a1 = 1/sqr(2))
;		g(3) = f(3)
;
;		temp = (f(4)-f(6)) * a5	    (a5 = cos(3*pi/8))
;		g(4) = temp + f(4) * a2		(a2 = cos(pi/8)-cos(3*pi/8))
;		g(6) = temp + f(6) * a4		(a4 = cos(pi/8)+cos(3*pi/8))
;
;		temp = f(5) * a3			(a3 = 1/sqr(2))
;		g(5) = f(7) + temp
;		g(7) = f(7) - temp
;
; Stage 4:	(produce the Fourier transform results *8 and *16)
;		F(0) = g(0)+g(1)
;		F(4) = g(0)-g(1)
;		
;		F(2) = g(3)+g(2)
;		F(6) = g(3)-g(2)
;
;		F(5) = g(7)+g(4)
;		F(3) = g(7)-g(4)
;
;		F(1) = g(5)+g(6)
;		F(7) = g(5)-g(6)
;
; Stage 5:
;		T(0) = F(0)/2
;		T(u) = F(u)/sqr(2)/cos(pi*u/16)/2 == F(u) / ( sqr(2)*cos(pi*u/16)*2 )
;
; The key point being that the calculation to convert F to T can be
; done at the same time as the quantisation! This just leaves 5 multiplies
; for the main part of the DCT.
;
; Constants are	a1 = a3 = cos(pi/4) = 1/sqr(2)	= 0.707106781
;				a2 		= cos(pi/8)-cos(3*pi/8) = 0.5411961
;				a4 		= cos(pi/8)+cos(3*pi/8) = 1.30656297
;				a5 		= cos(3*pi/8) 			= 0.382683433
;
; And T(u) = F(u)/t(u)/2 where:
;
;		t(u) = 1 					if u=0
;		t(u) = sqr(2)*cos(pi*u/16) 	if u>0
;
; The 1 DIM 8 element RDCT
; ========================
;
; The reverse DCT algorithm can be found by reversing each step of the above algorithm.
; For example
;		F(1) = g(5)+g(6)
;		F(7) = g(5)-g(6)
; is reversed as:
;               g(5) = (F(1)+F(7)) / 2
;               g(6) = (F(1)-F(7)) / 2
; As an optimization the divisions by 2 are left out. Similarly, multiplications by N
; are replaced by 1/N and the rotation in stage 3 is reversed.
; The result is a reverse DCT which is almost lossless while only using 16 bit arithmetic.


; The 2 DIM 8x8 DCT
; =================
;
; Do 1 DIM DCT horizontally on every row then vertically on every column
; (or the other way around). Note that if the t(u) scalings are not
; done then you are left with a matrix of t(i)t(j) elements to divide
; by. This is the AANscales matrix.
;
; The horizontal DCT magnifies by sqrt(8) and the vertical by sqrt(8)
; (8 in total), so to normalise, a right shift of 3 is needed once both the
; horizontal and vertical DCT are done. For extra precision, an initial shift of 1 is included
; in the FDCT (result is 12 bit signed, upshifted by 1+3 bits)
; and 4 for the RDCT (result is 8 bit signed, upshifted by 4+3 bits).

; This implementation of the DCT is highly optimised and so may be hard to follow.
; The code performs two DCTs at once on 16 bit data by storing them in the form
;
;	reg[n] = f0(n) + (f1(n)<<16)	for ARM code
;
;	reg[n].h = f1(n)	for PICCOLO code
;	reg[n].l = f0(n)
;
; where f0 and f1 are the two 8x8 arrays being transformed.
;
; Lines 0-7 of the flowgraph are kept in registers r0-r7. In order to save
; registers, registers r0-r3 are also used as temporaries when calculating
; on coefficients f4 - f7.
; The results are stored out in weird orders (whatever is optimal from an ARM
; register allocation viewpoint) since the order can be corrected by modifying
; the zig zag table. Hence the zig-zag table has become quite complicated.

	INCLUDE intworkm.h

; Assembly-time switch tested by the MUL2C, MULC and ROT3_C macros:
;   FASTMUL = 1     -> MLA based code (MUL_F, MLA path of ROT3_C)
;   any other value -> shift-and-add code (MUL_S, shift path of ROT3_C)
FASTMUL EQU 0
        
        AREA    |C$$code|, CODE, READONLY $interwork
        
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;	ARM MACROS		;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Register names.
; Several names below are aliases of the same physical register, so they
; cannot hold live values at the same time:
;   r0  : t0, q_ptr, dst_end, src_end
;   r9  : dst, quant
;   r11 : q_round, bias
; t0-t3 (r0-r3) are used as scratch registers by the macros in this file.
t0      RN  0
t1      RN  1
t2      RN  2
t3      RN  3
srce	RN  8	; source colour data
dst     RN  9	; destination output
quant   RN dst  ; quantization table pointer (same register as dst)
q_ptr   RN t0   ; alias of t0
round   RN 10   ; DCT rounding value (0.5)
q_round RN 11   ; quantization rounding value (0.5)
bias	RN 11	; DC bias (same register as q_round)
const   RN 12   ; current multiplication constant
mask    RN 14   ; mask for merging 2 signed 16 bit values
dst_end RN t0   ; alias of t0
src_end RN t0   ; alias of t0

; '*' is the assembler's synonym for EQU.
FDCT_SHIFT * 10 ; precision of FDCT coefficients after multiply
RDCT_SHIFT *  9 ; precision of RDCT coefficients after multiply

	; FAST MULTIPLY
	; Multiply both halves of a combined value by the same factor using MLA.
	;   $reg          in:  combined value, low + (high << 16)
	;                 out: combined value holding (half * $val + round) >> $shift
	;                      for each half
	;   $val          register holding the multiplier
	;   $tmp1, $tmp2  scratch registers, both overwritten
	;   $shift        number of bits to shift right after the multiply
	; Also reads the 'round' and 'mask' registers and updates the flags (MOVS).
	; NOTE(review): 'mask' is loaded outside this macro; presumably 0xFFFF0000
	; so that the AND keeps only the high result - confirm at the call sites.
        MACRO
        MUL_F   $reg, $val, $tmp1, $tmp2, $shift
          MOVS    $tmp1, $reg, ASR #16			; tmp1 = top 16 bits, C = bit 15 of $reg
          ADDCS   $tmp1, $tmp1, #1                  ; get high part of $reg (undo the borrow taken by a negative low half)
          MLA     $tmp2, $tmp1, $val, round			; tmp2 = high * $val + round
          SUB     $reg, $reg, $tmp1, LSL #16		; get low part of $reg (sign extended)
          MLA     $tmp1, $reg, $val, round			; tmp1 = low * $val + round
          AND     $tmp2, mask, $tmp2, LSL #16 - $shift	; shifted high result moved into the top half, then masked
          ADD     $reg, $tmp2, $tmp1, ASR #$shift	; combine
        MEND

	; SLOW MULTIPLY by 724 using shifts and adds only
	; (724 = 1/sqrt(2) in Q10, which is the same as sqrt(2) in Q9)
	; For processor without fast multiplier
	;   $reg          in:  combined value, low + (high << 16)
	;                 out: combined value holding (half * 724 + round) >> $shift
	;                      for each half
	;   $val          must be 724 (enforced by the ASSERT below)
	;   $tmp1, $tmp2  scratch registers, both overwritten
	;   $shift        number of bits to shift right after the multiply
	; Also reads the 'round' and 'mask' registers and updates the flags (MOVS).
        MACRO
        MUL_S   $reg, $val, $tmp1, $tmp2, $shift
          MOVS    $tmp1, $reg, ASR #16				; C = bit 15 of $reg
          ADDCS   $tmp1, $tmp1, #1					; tmp1 = top value
          SUB     $reg, $reg, $tmp1, LSL #16		; reg = bottom value
  
          ASSERT  $val = 724
          ADD     $tmp2, $reg, $reg, LSL #1			; tmp2 = reg * 3
          RSB     $tmp2, $tmp2, $tmp2, LSL #4		; tmp2 = reg * 45
          ADD     $tmp2, $reg, $tmp2, LSL #2		; tmp2 = reg * 181
          ADD     $tmp2, round, $tmp2, LSL #2		; *4 = 724, plus rounding
  
          ADD     $reg, $tmp1, $tmp1, LSL #1		; reg = tmp1 * 3
          RSB     $reg, $reg, $reg, LSL #4			; reg = tmp1 * 45
          ADD     $reg, $tmp1, $reg, LSL #2			; reg = tmp1 * 181
          ADD     $reg, round, $reg, LSL #2			; *4 = 724, plus rounding
  
          AND     $reg, mask, $reg, LSL #16 - $shift	; shifted top result moved into the top half, then masked
          ADD     $reg, $reg, $tmp2, ASR #$shift	; combine
        MEND


	; MUL2C - scale two combined values by the same assembly-time constant.
	;   $reg1, $reg2  combined values; each is replaced by its scaled result
	;   $val          numeric constant (the shift-and-add path accepts only
	;                 724, as asserted inside MUL_S)
	;   $shift        number of bits to shift right after the multiply
	; Scratch: t0 and t1 in both variants, plus 'const' when FASTMUL = 1,
	; so $reg1/$reg2 must not be any of those registers.
        MACRO
        MUL2C  $reg1, $reg2, $val, $shift
          IF      FASTMUL <> 1
            ; no fast multiplier: shift-and-add variant
            MUL_S   $reg1, $val, t0, t1, $shift
            MUL_S   $reg2, $val, t0, t1, $shift
          ELSE
            ; fast multiplier: load the constant once, use it for both values
            MOV     const, #$val
            MUL_F   $reg1, const, t0, t1, $shift
            MUL_F   $reg2, const, t0, t1, $shift
          ENDIF
        MEND

	; MULC - scale one combined value by an assembly-time constant.
	;   $reg    combined value; replaced by its scaled result
	;   $val    numeric constant (the shift-and-add path accepts only 724,
	;           as asserted inside MUL_S)
	;   $shift  number of bits to shift right after the multiply
	; Scratch: t0 and t1 in both variants, plus 'const' when FASTMUL = 1,
	; so $reg must not be any of those registers.
        MACRO
        MULC  $reg, $val, $shift
          IF      FASTMUL <> 1
            ; no fast multiplier: shift-and-add variant
            MUL_S   $reg, $val, t0, t1, $shift
          ELSE
            ; fast multiplier: constant goes through the 'const' register
            MOV     const, #$val
            MUL_F   $reg, const, t0, t1, $shift
          ENDIF
        MEND
        
        ; Perform rotation from Stage 3:
        ;   temp = (f(4)-f(6)) * a5           (a5 = cos(3*pi/8))
        ;   g(4) = temp + f(4) * a2           (a2 = cos(pi/8)-cos(3*pi/8))
        ;   g(6) = temp + f(6) * a4           (a4 = cos(pi/8)+cos(3*pi/8))
        ; here $r1 and $r2 are the unpacked values f(4) and f(6)
        ; $val1 = cos(3*pi/8)  (392 = 0.3827 * 1024)
        ; $val2 = cos(pi/8)    (946 = 0.9239 * 1024)
        ; Results: $t1 = g(4), $t2 = g(6). Both still carry the scale of the
        ; constants and already include 'round'; the caller does the final
        ; right shift.
        ; Side effects differ between the two variants:
        ;   FASTMUL = 1 : 'const' is overwritten, $r1/$r2 are preserved
        ;   otherwise   : $r1/$r2 are overwritten, 'const' is not touched
        ; Note that $t1/$t2 are macro parameters, not the t1/t2 registers.
        MACRO
        ROT3_C  $r1, $r2, $t1, $t2, $val1, $val2
        SUB     $t1, $r1, $r2						; get 'temp'
        IF      FASTMUL = 1							; fast multiplier available
          MOV     const, #$val1						; const = $val1
          MLA     $t2, $t1, const, round			; t2 = temp * $val1 + round
          ADD     const, const, #$val2 - 2 * $val1	; $val2 - $val1
          MLA     $t1, $r1, const, $t2				; t1 = r1 * ($val2 - $val1) + t2 = g(4)
          ADD     const, const, #$val1 * 2			; $val1 + $val2
          MLA     $t2, $r2, const, $t2				; t2 = r2 * ($val1 + $val2) + t2 = g(6)
        ELSE										; no fast multiplier
          ASSERT ($val1 = 392) :LAND: ($val2 = 946)
          ADD   $t2, $t1, $t1, LSL #1				; t2 = t1 * 3
          ADD   $t2, $t1, $t2, LSL #4           	; t2 = t1 * 49 (392 >> 3)
          ADD   $t2, round, $t2, LSL #3				; t2 = t1 * 392 + round

          RSB   $t1, $r1, $r1, LSL #5				; t1 = r1 * 31
          ADD   $t1, $t1, $t1, LSL #3				; t1 = r1 * 279
          SUB   $r1, $t1, $r1, LSL #1           	; r1 *= 277 (554 >> 1)

          RSB   $t1, $r2, $r2, LSL #3				; t1 = r2 * 7
          RSB   $t1, $r2, $t1, LSL #5				; t1 = r2 * 223
          ADD   $r2, $t1, $t1, LSL #1           	; r2 *= 669 (1338 >> 1)
          
          ADD   $t1, $t2, $r1, LSL #1				; t1 = temp*392 + round + f(4)*554  = g(4)
          ADD   $t2, $t2, $r2, LSL #1				; t2 = temp*392 + round + f(6)*1338 = g(6)
        ENDIF
        MEND

	; Perform rotation from Stage 3 on COMBINED values in $reg1, $reg2
	;   $reg1         in: combined f(4) values, out: combined g(4) values
	;   $reg2         in: combined f(6) values, out: combined g(6) values
	;   $val1, $val2  constants passed straight through to ROT3_C
	;   $shift        number of bits to shift right after the rotation
	; Uses t0-t3 as scratch (so $reg1/$reg2 must not be r0-r3), reads 'round'
	; and 'mask', updates the flags, and overwrites 'const' when FASTMUL = 1.
        MACRO
        ROTATE  $reg1, $reg2, $val1, $val2, $shift
        MOVS    t0, $reg1, ASR #16					; C = bit 15 of $reg1
        ADDCS   t0, t0, #1							; t0 = high value of reg1
        SUB     $reg1, $reg1, t0, LSL #16			; split reg1
        MOVS    t1, $reg2, ASR #16					; C = bit 15 of $reg2
        ADDCS   t1, t1, #1							; t1 = high value of reg2
        SUB     $reg2, $reg2, t1, LSL #16			; split reg2
        ROT3_C  t0, t1, t2, t3, $val1, $val2		; rotate 'high' values (results in t2, t3)
        ROT3_C  $reg1, $reg2, t0, t1, $val1, $val2	; rotate 'low' values (results in t0, t1)
        AND     t2, mask, t2, LSL #16 - $shift		; shifted high g(4) moved into the top half, then masked
        ADD     $reg1, t2, t0, ASR #$shift			; combine reg1
        AND     t3, mask, t3, LSL #16 - $shift		; shifted high g(6) moved into the top half, then masked
        ADD     $reg2, t3, t1, ASR #$shift			; combine reg2
        MEND

	; Compress quantize a pair of combined values
	; $reg = combined values
	; $ptr = current output ptr
        MACRO
        quantize $reg, $ptr
        LDR     t0, [quant], #4						; find next quant coef + zigzag
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -