;// File: idct.asm
;// (web code-viewer residue: "Font size:" control label -- not part of the assembly source)
addps xmm4, xmm5 ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
pshufd xmm3, xmm0, 001010101b ;// x1 x1 x1 x1
addps xmm2, xmm4 ;// a3 a2 a1 a0
pshufd xmm0, xmm0, 011111111b ;// x3 x3 x3 x3
pshufd xmm4, xmm1, 001010101b ;// x5 x5 x5 x5
pshufd xmm1, xmm1, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][16*4] ;// x1*w28 x1*w24 x1*w20 x1*w16
mulps xmm0, [W][20*4] ;// x3*w29 x3*w25 x3*w21 x3*w17
mulps xmm4, [W][24*4] ;// x5*w30 x5*w26 x5*w22 x5*w18
mulps xmm1, [W][28*4] ;// x7*w31 x7*w27 x7*w23 x7*w19
addps xmm3, xmm0 ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
addps xmm4, xmm1 ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
addps xmm3, xmm4 ;// b3 b2 b1 b0
movaps xmm4, xmm2 ;// a3 a2 a1 a0
addps xmm2, xmm3 ;// y3 y2 y1 y0
subps xmm4, xmm3 ;// y4 y5 y6 y7
shufps xmm4, xmm4, 000011011b ;// y7 y6 y5 y4
movaps [DST][8*I+8*8*4], xmm2 ;// save y3 y2 y1 y0
movaps [DST][8*I+8*8*4+16], xmm4 ;// save y7 y6 y5 y4
add I, 4
jnz RowLoop
;// apply 16-point IDCT to the columns
mov I, -2*16
ColLoop:
movaps xmm0, X0 ;// x0
movaps xmm1, X4 ;// x4
movaps xmm2, xmm0 ;// x0
movaps xmm3, xmm1 ;// x4
mulps xmm1, TBL_tg2_16 ;// tan(2Pi/16)*x4 = a3
movaps xmm4, xmm0 ;// x0
movaps xmm5, xmm0 ;// x0
addps xmm0, xmm3 ;// x0 + x4 = b0
subps xmm2, xmm3 ;// x0 - x4 = b3
addps xmm4, xmm1 ;// x0 + a3 = b1
subps xmm5, xmm1 ;// x0 - a3 = b2
movaps X0, xmm0 ;// save b0
movaps xmm0, X2 ;// x2
movaps xmm1, X6 ;// x6
movaps xmm3, xmm0 ;// x2
movaps xmm6, xmm1 ;// x6
mulps xmm0, TBL_tg1_16 ;// tan(Pi/16)*x2 = a5
mulps xmm1, TBL_tg3_16 ;// tan(3Pi/16)*x6 = a7
movaps xmm7, xmm3 ;// x2
addps xmm3, xmm6 ;// x2 + x6 = b4
subps xmm7, xmm6 ;// x2 - x6 = b5
movaps xmm6, xmm0 ;// a5
addps xmm0, xmm1 ;// a5 + a7 = b6
subps xmm6, xmm1 ;// a5 - a7 = b7
movaps xmm1, xmm7 ;// b5
addps xmm7, xmm0 ;// b5 + b6
subps xmm1, xmm0 ;// b5 - b6
movaps xmm0, TBL_cos4_16 ;// cos(4Pi/16)
mulps xmm7, xmm0 ;// cos(4Pi/16)*(b5 + b6) = d5
mulps xmm1, xmm0 ;// cos(4Pi/16)*(b5 - b6) = d6
movaps xmm0, xmm4 ;// b1
addps xmm4, xmm7 ;// b1 + d5 = e1
subps xmm0, xmm7 ;// b1 - d5 = e6
movaps xmm7, xmm5 ;// b2
addps xmm5, xmm1 ;// b2 + d6 = e2
subps xmm7, xmm1 ;// b2 - d6 = e5
movaps xmm1, xmm2 ;// b3
addps xmm2, xmm6 ;// b3 + b7 = e3
subps xmm1, xmm6 ;// b3 - b7 = e4
movaps X9, xmm0 ;// save e6
movaps X10, xmm7 ;// save e5
movaps X11, xmm1 ;// save e4
movaps X12, xmm2 ;// save e3
movaps X13, xmm5 ;// save e2
movaps X14, xmm4 ;// save e1
movaps xmm0, X0 ;// b0
movaps xmm1, xmm0 ;// b0
addps xmm0, xmm3 ;// b0 + b4 = e0
subps xmm1, xmm3 ;// b0 - b4 = e7
movaps X15, xmm0 ;// save e0
movaps X8, xmm1 ;// save e7
movaps xmm0, X1 ;// x1
movaps xmm1, X7 ;// x7
movaps xmm2, xmm0 ;// x1
movaps xmm3, xmm1 ;// x7
mulps xmm0, TBL_tg1_32 ;// tan(Pi/32)*x1 = a9
mulps xmm1, TBL_tg7_32 ;// tan(7Pi/32)*x7 = a15
movaps xmm4, xmm2 ;// x1
addps xmm2, xmm3 ;// x1 + x7 = b8
subps xmm4, xmm3 ;// x1 - x7 = b9
movaps xmm5, xmm0 ;// a9
addps xmm0, xmm1 ;// a9 + a15 = b10
subps xmm5, xmm1 ;// a9 - a15 = b11
movaps X0, xmm2 ;// save b8
movaps xmm1, X3 ;// x3
movaps xmm2, X5 ;// x5
movaps xmm3, xmm1 ;// x3
movaps xmm6, xmm2 ;// x5
mulps xmm1, TBL_tg3_32 ;// tan(3Pi/32)*x3 = a11
mulps xmm2, TBL_tg5_32 ;// tan(5Pi/32)*x5 = a13
movaps xmm7, xmm3 ;// x3
addps xmm3, xmm6 ;// x3 + x5 = b12
subps xmm7, xmm6 ;// x3 - x5 = b13
movaps xmm6, xmm1 ;// a11
addps xmm1, xmm2 ;// a11 + a13 = b14
subps xmm6, xmm2 ;// a11 - a13 = b15
movaps xmm2, xmm4 ;// b9
addps xmm4, xmm1 ;// b9 + b14 = c9
subps xmm2, xmm1 ;// b9 - b14 = c13
movaps xmm1, xmm7 ;// b13
addps xmm7, xmm0 ;// b13 + b10 = c14
subps xmm1, xmm0 ;// b13 - b10 = c10
movaps xmm0, xmm5 ;// b11
addps xmm5, xmm6 ;// b11 + b15 = c12
subps xmm0, xmm6 ;// b11 - b15 = c15
movaps xmm6, xmm3 ;// b12
addps xmm3, X0 ;// b8 + b12 = c8
subps xmm6, X0 ;// b12 - b8 = - c11
movaps X0, xmm3 ;// save c8
movaps xmm3, xmm5 ;// c12
addps xmm5, xmm6 ;// c12 - c11
subps xmm3, xmm6 ;// c12 + c11
mulps xmm5, TBL_cos4_16 ;// cos(4Pi/16)*(c12 - c11) = - d12
mulps xmm3, TBL_cos4_16 ;// cos(4Pi/16)*(c12 + c11) = d11
movaps X4, xmm5 ;// save (- d12)
movaps X3, xmm3 ;// save d11
movaps xmm3, TBL_tg2_16 ;// tan(2Pi/16)
movaps xmm5, xmm4 ;// c9
mulps xmm4, xmm3 ;// tan(2Pi/16)*c9
movaps xmm6, xmm7 ;// c14
mulps xmm7, xmm3 ;// tan(2Pi/16)*c14
subps xmm4, xmm6 ;// tan(2Pi/16)*c9 - c14 = d14
addps xmm5, xmm7 ;// c9 + tan(2Pi/16)*c14 = d9
movaps xmm6, xmm1 ;// c10
mulps xmm1, xmm3 ;// tan(2Pi/16)*c10
movaps xmm7, xmm2 ;// c13
mulps xmm2, xmm3 ;// tan(2Pi/16)*c13
addps xmm7, xmm1 ;// c13 + tan(2Pi/16)*c10 = d10
movaps xmm3, TBL_cos2_16 ;// cos(2Pi/16)
subps xmm2, xmm6 ;// tan(2Pi/16)*c13 - c10 = d13
mulps xmm5, xmm3 ;// cos(2Pi/16)*d9 = e9
mulps xmm7, xmm3 ;// cos(2Pi/16)*d10 = e10
mulps xmm2, xmm3 ;// cos(2Pi/16)*d13 = e13
mulps xmm4, xmm3 ;// cos(2Pi/16)*d14 = e14
movaps xmm1, X0 ;// c8
movaps xmm3, X15 ;// e0
movaps xmm6, xmm3 ;// e0
addps xmm3, xmm1 ;// e0 + c8 = y0
subps xmm6, xmm1 ;// e0 - c8 = y15
movaps X0, xmm3 ;// save y0
movaps X15, xmm6 ;// save y15
movaps xmm1, X13 ;// e2
movaps xmm3, X14 ;// e1
movaps xmm6, xmm1 ;// e2
addps xmm1, xmm7 ;// e2 + e10 = y2
subps xmm6, xmm7 ;// e2 - e10 = y13
movaps xmm7, xmm3 ;// e1
addps xmm3, xmm5 ;// e1 + e9 = y1
subps xmm7, xmm5 ;// e1 - e9 = y14
movaps X1, xmm3 ;// save y1
movaps X2, xmm1 ;// save y2
movaps X13, xmm6 ;// save y13
movaps X14, xmm7 ;// save y14
movaps xmm1, X3 ;// d11
movaps xmm3, X4 ;// -d12
movaps xmm5, X11 ;// e4
movaps xmm6, X12 ;// e3
movaps xmm7, xmm5 ;// e4
subps xmm5, xmm3 ;// e4 - (-d12) = y4
addps xmm7, xmm3 ;// e4 - d12 = y11
movaps xmm3, xmm6 ;// e3
addps xmm6, xmm1 ;// e3 + d11 = y3
subps xmm3, xmm1 ;// e3 - d11 = y12
movaps X3, xmm6 ;// save y3
movaps X4, xmm5 ;// save y4
movaps X11, xmm7 ;// save y11
movaps X12, xmm3 ;// save y12
movaps xmm1, X8 ;// e7
movaps xmm3, X9 ;// e6
movaps xmm5, X10 ;// e5
movaps xmm6, xmm3 ;// e6
addps xmm3, xmm4 ;// e6 + e14 = y6
subps xmm6, xmm4 ;// e6 - e14 = y9
movaps xmm4, xmm5 ;// e5
addps xmm5, xmm2 ;// e5 + e13 = y5
subps xmm4, xmm2 ;// e5 - e13 = y10
movaps xmm7, xmm1 ;// e7
addps xmm1, xmm0 ;// e7 + c15 = y7
subps xmm7, xmm0 ;// e7 - c15 = y8
movaps X5, xmm5 ;// save y5
movaps X6, xmm3 ;// save y6
movaps X7, xmm1 ;// save y7
movaps X8, xmm7 ;// save y8
movaps X9, xmm6 ;// save y9
movaps X10, xmm4 ;// save y10
add I, 16
jnz ColLoop
PROFILE_OUT "IDCT_8x16_SSE2"
DBG_IDCT_OUT_8x16
add DST, 8*16*sizeof(real4)
jmp [Ctx].RetIdct
IDCT_8x16_SSE2 ENDP
;//=========================================================================
;// Dequantization and IDCT on a block of 8x8 samples, output 16x8 samples
;// params : esp : source pointer
;// params : edi : destination pointer
;// params : eax : pointer to quantization table
;// return : destination pointer += 16*8*sizeof(real4)
;//=========================================================================
IDCT_16x8_SSE2 PROC
QTB EQU <eax>
SRC EQU <QPTR [esp]>
DST EQU <edi>
I EQU <ecx>
W EQU <edx>
J EQU <Ctx>
X0 EQU <[DST][I+4*16+0*16*4]>
X1 EQU <[DST][I+4*16+1*16*4]>
X2 EQU <[DST][I+4*16+2*16*4]>
X3 EQU <[DST][I+4*16+3*16*4]>
X4 EQU <[DST][I+4*16+4*16*4]>
X5 EQU <[DST][I+4*16+5*16*4]>
X6 EQU <[DST][I+4*16+6*16*4]>
X7 EQU <[DST][I+4*16+7*16*4]>
DBG_IDCT_IN
PROFILE_IN
;// save context
movd mm1, Ctx
;// apply 16-point IDCT to the rows
mov I, -8*4
mov J, -8*16*4
RowLoop:
mov W, TBL_MultRow16x16[I+8*4] ;// row multiplier
cvtdq2ps xmm0, [SRC][8*I+8*8*4] ;// x3 x2 x1 x0
cvtdq2ps xmm1, [SRC][8*I+8*8*4+4*4] ;// x7 x6 x5 x4
mulps xmm0, [QTB][8*I+8*8*4] ;// x3*q3 x2*q2 x1*q1 x0*q0
mulps xmm1, [QTB][8*I+8*8*4+4*4] ;// x7*q7 x6*q6 x5*q5 x4*q4
pshufd xmm2, xmm0, 000000000b ;// x0 x0 x0 x0
pshufd xmm3, xmm0, 010101010b ;// x2 x2 x2 x2
pshufd xmm4, xmm1, 000000000b ;// x4 x4 x4 x4
pshufd xmm5, xmm1, 010101010b ;// x6 x6 x6 x6
movaps xmm6, xmm0 ;// x3 x2 x1 x0
movaps xmm7, xmm1 ;// x7 x6 x5 x4
mulps xmm2, [W] ;// x0*w12 x0*w08 x0*w04 x0*w00
mulps xmm3, [W][4*4] ;// x2*w13 x2*w09 x2*w05 x2*w01
mulps xmm4, [W][8*4] ;// x4*w14 x4*w10 x4*w06 x4*w02
mulps xmm5, [W][12*4] ;// x6*w15 x6*w11 x6*w07 x6*w03
addps xmm2, xmm3 ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01
addps xmm4, xmm5 ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
pshufd xmm3, xmm0, 001010101b ;// x1 x1 x1 x1
addps xmm2, xmm4 ;// a3 a2 a1 a0
pshufd xmm0, xmm0, 011111111b ;// x3 x3 x3 x3
pshufd xmm4, xmm1, 001010101b ;// x5 x5 x5 x5
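;// ---------------------------------------------------------------------
;// NOTE: the lines below are web code-viewer residue (keyboard-shortcut
;// help panel), not part of the assembly source.
;//   Keyboard shortcuts
;//     Copy code            : Ctrl + C
;//     Search code          : Ctrl + F
;//     Full-screen mode     : F11
;//     Toggle theme         : Ctrl + Shift + D
;//     Show shortcuts       : ?
;//     Increase font size   : Ctrl + =
;//     Decrease font size   : Ctrl + -
;// ---------------------------------------------------------------------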