📄 idct.asm
字号:
movaps xmm3, xmm4 ;// tm65
addps xmm4, xmm1 ;// tp65 + tm65
subps xmm1, xmm3 ;// tp65 - tm65
mulps xmm4, TBL_cos4_16 ;// t6
mulps xmm1, TBL_cos4_16 ;// t5
movaps X1, xmm4 ;// t6
movaps X3, xmm0 ;// t4
movaps xmm5, X4 ;// x4
movaps xmm3, X0 ;// x0
movaps xmm6, xmm5 ;// x4
addps xmm5, xmm3 ;// tp03
subps xmm3, xmm6 ;// tp12
movaps xmm0, X2 ;// x2
movaps xmm4, X6 ;// x6
movaps xmm6, xmm0 ;// x2
mulps xmm0, TBL_tg2_16 ;// x2*tg2
movaps xmm7, xmm4 ;// x6
mulps xmm4, TBL_tg2_16 ;// x6*tg2
subps xmm0, xmm7 ;// tm12
addps xmm4, xmm6 ;// tm03
movaps xmm6, xmm0 ;// tm12
addps xmm0, xmm3 ;// t1
subps xmm3, xmm6 ;// t2
movaps xmm6, xmm4 ;// tm03
addps xmm4, xmm5 ;// t0
subps xmm5, xmm6 ;// t3
movaps xmm6, xmm2 ;// t7
addps xmm2, xmm4 ;// t0 + t7
subps xmm4, xmm6 ;// t0 - t7
movaps xmm7, xmm1 ;// t5
addps xmm1, xmm3 ;// t2 + t5
subps xmm3, xmm7 ;// t2 - t5
movaps xmm6, X3 ;// t4
movaps xmm7, xmm5 ;// t3
subps xmm5, xmm6 ;// t3 - t4
addps xmm7, xmm6 ;// t3 + t4
movaps X0, xmm2 ;// save y0
movaps X7, xmm4 ;// save y7
movaps X2, xmm1 ;// save y2
movaps X5, xmm3 ;// save y5
movaps X4, xmm5 ;// save y4
movaps X3, xmm7 ;// save y3
movaps xmm2, X1 ;// t6
movaps xmm3, xmm0 ;// t1
addps xmm0, xmm2 ;// t1 + t6
subps xmm3, xmm2 ;// t1 - t6
movaps X1, xmm0 ;// save y1
movaps X6, xmm3 ;// save y6
add I, 16
jnz ColLoop
PROFILE_OUT "IDCT_8x8_SSE2"
DBG_IDCT_OUT_8x8
add DST, 8*8*sizeof(real4)
jmp [Ctx].RetIdct
IDCT_8x8_SSE2 ENDP
;//=========================================================================
;// Dequantization and IDCT on a block of 8x8 samples, output 16x16 samples
;// params : esp : source pointer
;// params : edi : destination pointer
;// params : eax : pointer to quantization table
;// return : destination pointer += 16*16*sizeof(real4)
;//=========================================================================
IDCT_16x16_SSE2 PROC
QTB EQU <eax>
SRC EQU <QPTR [esp]>
DST EQU <edi>
I EQU <ecx>
W EQU <edx>
J EQU <Ctx>
X0 EQU <[DST][I+4*16+0*16*4]>
X1 EQU <[DST][I+4*16+1*16*4]>
X2 EQU <[DST][I+4*16+2*16*4]>
X3 EQU <[DST][I+4*16+3*16*4]>
X4 EQU <[DST][I+4*16+4*16*4]>
X5 EQU <[DST][I+4*16+5*16*4]>
X6 EQU <[DST][I+4*16+6*16*4]>
X7 EQU <[DST][I+4*16+7*16*4]>
X8 EQU <[DST][I+4*16+8*16*4]>
X9 EQU <[DST][I+4*16+9*16*4]>
X10 EQU <[DST][I+4*16+10*16*4]>
X11 EQU <[DST][I+4*16+11*16*4]>
X12 EQU <[DST][I+4*16+12*16*4]>
X13 EQU <[DST][I+4*16+13*16*4]>
X14 EQU <[DST][I+4*16+14*16*4]>
X15 EQU <[DST][I+4*16+15*16*4]>
DBG_IDCT_IN
PROFILE_IN
;// save context
movd mm1, Ctx
;// apply IDCT to the rows
mov I, -8*4
mov J, -8*16*4
RowLoop:
; /*
; x0 *= q0; x1 *= q1; x2 *= q2; x3 *= q3;
; x4 *= q4; x5 *= q5; x6 *= q6; x7 *= q7;
;
; a0 = x0*w0 + x2*w1 + x4*w2 + x6*w3;
; a1 = x0*w4 + x2*w5 + x4*w6 + x6*w7;
; a2 = x0*w8 + x2*w9 + x4*w10 + x6*w11;
; a3 = x0*w12 + x2*w13 + x4*w14 + x6*w15;
; b0 = x1*w16 + x3*w17 + x5*w18 + x7*w19;
; b1 = x1*w20 + x3*w21 + x5*w22 + x7*w23;
; b2 = x1*w24 + x3*w25 + x5*w26 + x7*w27;
; b3 = x1*w28 + x3*w29 + x5*w30 + x7*w31;
;
; y0 = a0 + b0;
; y1 = a1 + b1;
; y2 = a2 + b2;
; y3 = a3 + b3;
; y12 = a3 - b3;
; y13 = a2 - b2;
; y14 = a1 - b1;
; y15 = a0 - b0;
;
; a4 = x0*w32 + x2*w33 + x4*w34 + x6*w35;
; a5 = x0*w36 + x2*w37 + x4*w38 + x6*w39;
; a6 = x0*w40 + x2*w41 + x4*w42 + x6*w43;
; a7 = x0*w44 + x2*w45 + x4*w46 + x6*w47;
; b4 = x1*w48 + x3*w49 + x5*w50 + x7*w51;
; b5 = x1*w52 + x3*w53 + x5*w54 + x7*w55;
; b6 = x1*w56 + x3*w57 + x5*w58 + x7*w59;
; b7 = x1*w60 + x3*w61 + x5*w62 + x7*w63;
;
; y4 = a4 + b4;
; y5 = a5 + b5;
; y6 = a6 + b6;
; y7 = a7 + b7;
; y8 = a7 - b7;
; y9 = a6 - b6;
; y10 = a5 - b5;
; y11 = a4 - b4;
; */
mov W, TBL_MultRow16x16[I+8*4] ;// row multiplier
cvtdq2ps xmm0, [SRC][8*I+8*8*4] ;// x3 x2 x1 x0
cvtdq2ps xmm1, [SRC][8*I+8*8*4+4*4] ;// x7 x6 x5 x4
mulps xmm0, [QTB][8*I+8*8*4] ;// x3*q3 x2*q2 x1*q1 x0*q0
mulps xmm1, [QTB][8*I+8*8*4+4*4] ;// x7*q7 x6*q6 x5*q5 x4*q4
pshufd xmm2, xmm0, 000000000b ;// x0 x0 x0 x0
pshufd xmm3, xmm0, 010101010b ;// x2 x2 x2 x2
pshufd xmm4, xmm1, 000000000b ;// x4 x4 x4 x4
pshufd xmm5, xmm1, 010101010b ;// x6 x6 x6 x6
movaps xmm6, xmm0 ;// x3 x2 x1 x0
movaps xmm7, xmm1 ;// x7 x6 x5 x4
mulps xmm2, [W] ;// x0*w12 x0*w08 x0*w04 x0*w00
mulps xmm3, [W][4*4] ;// x2*w13 x2*w09 x2*w05 x2*w01
mulps xmm4, [W][8*4] ;// x4*w14 x4*w10 x4*w06 x4*w02
mulps xmm5, [W][12*4] ;// x6*w15 x6*w11 x6*w07 x6*w03
addps xmm2, xmm3 ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01
addps xmm4, xmm5 ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
pshufd xmm3, xmm0, 001010101b ;// x1 x1 x1 x1
addps xmm2, xmm4 ;// a3 a2 a1 a0
pshufd xmm0, xmm0, 011111111b ;// x3 x3 x3 x3
pshufd xmm4, xmm1, 001010101b ;// x5 x5 x5 x5
pshufd xmm1, xmm1, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][16*4] ;// x1*w28 x1*w24 x1*w20 x1*w16
mulps xmm0, [W][20*4] ;// x3*w29 x3*w25 x3*w21 x3*w17
mulps xmm4, [W][24*4] ;// x5*w30 x5*w26 x5*w22 x5*w18
mulps xmm1, [W][28*4] ;// x7*w31 x7*w27 x7*w23 x7*w19
addps xmm3, xmm0 ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
addps xmm4, xmm1 ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
addps xmm3, xmm4 ;// b3 b2 b1 b0
movaps xmm4, xmm2 ;// a3 a2 a1 a0
addps xmm2, xmm3 ;// y3 y2 y1 y0
subps xmm4, xmm3 ;// y12 y13 y14 y15
shufps xmm4, xmm4, 000011011b ;// y15 y14 y13 y12
movaps [DST][J+8*16*4], xmm2 ;// save y3 y2 y1 y0
movaps [DST][J+8*16*4+12*4], xmm4 ;// save y15 y14 y13 y12
pshufd xmm2, xmm6, 000000000b ;// x0 x0 x0 x0
pshufd xmm3, xmm6, 010101010b ;// x2 x2 x2 x2
pshufd xmm4, xmm7, 000000000b ;// x4 x4 x4 x4
pshufd xmm5, xmm7, 010101010b ;// x6 x6 x6 x6
mulps xmm2, [W][32*4] ;// x0*w44 x0*w40 x0*w36 x0*w32
mulps xmm3, [W][36*4] ;// x2*w45 x2*w41 x2*w37 x2*w33
mulps xmm4, [W][40*4] ;// x4*w46 x4*w42 x4*w38 x4*w34
mulps xmm5, [W][44*4] ;// x6*w47 x6*w43 x6*w39 x6*w35
addps xmm2, xmm3 ;// x0*w44+x2*w45 x0*w40+x2*w41 x0*w36+x2*w37 x0*w32+x2*w33
addps xmm4, xmm5 ;// x4*w46+x6*w47 x4*w42+x6*w43 x4*w38+x6*w39 x4*w34+x6*w35
pshufd xmm3, xmm6, 001010101b ;// x1 x1 x1 x1
addps xmm2, xmm4 ;// a7 a6 a5 a4
pshufd xmm6, xmm6, 011111111b ;// x3 x3 x3 x3
pshufd xmm4, xmm7, 001010101b ;// x5 x5 x5 x5
pshufd xmm7, xmm7, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][48*4] ;// x1*w60 x1*w56 x1*w52 x1*w48
mulps xmm6, [W][52*4] ;// x3*w61 x3*w57 x3*w53 x3*w49
mulps xmm4, [W][56*4] ;// x5*w62 x5*w58 x5*w54 x5*w50
mulps xmm7, [W][60*4] ;// x7*w63 x7*w59 x7*w55 x7*w51
addps xmm3, xmm6 ;// x1*w60+x3*w61 x1*w56+x3*w57 x1*w52+x3*w53 x1*w48+x3*w49
addps xmm4, xmm7 ;// x5*w62+x7*w63 x5*w58+x7*w59 x5*w54+x7*w55 x5*w50+x7*w51
addps xmm3, xmm4 ;// b7 b6 b5 b4
movaps xmm4, xmm2 ;// a7 a6 a5 a4
addps xmm2, xmm3 ;// y7 y6 y5 y4
subps xmm4, xmm3 ;// y8 y9 y10 y11
shufps xmm4, xmm4, 000011011b ;// y11 y10 y9 y8
movaps [DST][J+8*16*4+4*4], xmm2 ;// save y7 y6 y5 y4
movaps [DST][J+8*16*4+8*4], xmm4 ;// save y11 y10 y9 y8
add J, 16*4
add I, 4
jnz RowLoop
;// apply IDCT to the columns
mov I, -4*16
ColLoop:
; /*
; a3 = tan(2Pi/16)*x4
; a5 = tan(Pi/16)*x2
; a7 = tan(3Pi/16)*x6
; a9 = tan(Pi/32)*x1
; a11 = tan(3Pi/32)*x3
; a13 = tan(5Pi/32)*x5
; a15 = tan(7Pi/32)*x7
; b0 = x0 + x4
; b1 = x0 + a3
; b2 = x0 - a3
; b3 = x0 - x4
; b4 = x2 + x6
; b5 = x2 - x6
; b6 = a5 + a7
; b7 = a5 - a7
; b8 = x1 + x7
; b9 = x1 - x7
; b10 = a9 + a15
; b11 = a9 - a15
; b12 = x3 + x5
; b13 = x3 - x5
; b14 = a11 + a13
; b15 = a11 - a13
; c8 = b8 + b12
; c9 = b9 + b14
; c10 = b13 - b10
; c11 = b8 - b12
; c12 = b11 + b15
; c13 = b9 - b14
; c14 = b13 + b10
; c15 = b11 - b15
; d5 = cos(4Pi/16)*(b5 + b6)
; d6 = cos(4Pi/16)*(b5 - b6)
; d9 = c9 + tan(2Pi/16)*c14
; d10 = tan(2Pi/16)*c10 + c13
; d11 = cos(4Pi/16)*(c11 + c12)
; d12 = cos(4Pi/16)*(c11 - c12)
; d13 = tan(2Pi/16)*c13 - c10
; d14 = tan(2Pi/16)*c9 - c14
; e0 = b0 + b4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -