📄 idct.asm
字号:
pshufd xmm1, xmm1, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][16*4] ;// x1*w28 x1*w24 x1*w20 x1*w16
mulps xmm0, [W][20*4] ;// x3*w29 x3*w25 x3*w21 x3*w17
mulps xmm4, [W][24*4] ;// x5*w30 x5*w26 x5*w22 x5*w18
mulps xmm1, [W][28*4] ;// x7*w31 x7*w27 x7*w23 x7*w19
addps xmm3, xmm0 ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
addps xmm4, xmm1 ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
addps xmm3, xmm4 ;// b3 b2 b1 b0
movaps xmm4, xmm2 ;// a3 a2 a1 a0
addps xmm2, xmm3 ;// y3 y2 y1 y0
subps xmm4, xmm3 ;// y12 y13 y14 y15
shufps xmm4, xmm4, 000011011b ;// y15 y14 y13 y12
movaps [DST][J+8*16*4], xmm2 ;// save y3 y2 y1 y0
movaps [DST][J+8*16*4+12*4], xmm4 ;// save y15 y14 y13 y12
pshufd xmm2, xmm6, 000000000b ;// x0 x0 x0 x0
pshufd xmm3, xmm6, 010101010b ;// x2 x2 x2 x2
pshufd xmm4, xmm7, 000000000b ;// x4 x4 x4 x4
pshufd xmm5, xmm7, 010101010b ;// x6 x6 x6 x6
mulps xmm2, [W][32*4] ;// x0*w44 x0*w40 x0*w36 x0*w32
mulps xmm3, [W][36*4] ;// x2*w45 x2*w41 x2*w37 x2*w33
mulps xmm4, [W][40*4] ;// x4*w46 x4*w42 x4*w38 x4*w34
mulps xmm5, [W][44*4] ;// x6*w47 x6*w43 x6*w39 x6*w35
addps xmm2, xmm3 ;// x0*w44+x2*w45 x0*w40+x2*w41 x0*w36+x2*w37 x0*w32+x2*w33
addps xmm4, xmm5 ;// x4*w46+x6*w47 x4*w42+x6*w43 x4*w38+x6*w39 x4*w34+x6*w35
pshufd xmm3, xmm6, 001010101b ;// x1 x1 x1 x1
addps xmm2, xmm4 ;// a7 a6 a5 a4
pshufd xmm6, xmm6, 011111111b ;// x3 x3 x3 x3
pshufd xmm4, xmm7, 001010101b ;// x5 x5 x5 x5
pshufd xmm7, xmm7, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][48*4] ;// x1*w60 x1*w56 x1*w52 x1*w48
mulps xmm6, [W][52*4] ;// x3*w61 x3*w57 x3*w53 x3*w49
mulps xmm4, [W][56*4] ;// x5*w62 x5*w58 x5*w54 x5*w50
mulps xmm7, [W][60*4] ;// x7*w63 x7*w59 x7*w55 x7*w51
addps xmm3, xmm6 ;// x1*w60+x3*w61 x1*w56+x3*w57 x1*w52+x3*w53 x1*w48+x3*w49
addps xmm4, xmm7 ;// x5*w62+x7*w63 x5*w58+x7*w59 x5*w54+x7*w55 x5*w50+x7*w51
addps xmm3, xmm4 ;// b7 b6 b5 b4
movaps xmm4, xmm2 ;// a7 a6 a5 a4
addps xmm2, xmm3 ;// y7 y6 y5 y4
subps xmm4, xmm3 ;// y8 y9 y10 y11
shufps xmm4, xmm4, 000011011b ;// y11 y10 y9 y8
movaps [DST][J+8*16*4+4*4], xmm2 ;// save y7 y6 y5 y4
movaps [DST][J+8*16*4+8*4], xmm4 ;// save y11 y10 y9 y8
add J, 16*4
add I, 4
jnz RowLoop
;// apply 8-point IDCT to the columns
mov I, -4*16
ColLoop:
movaps xmm0, X1 ;// x1
movaps xmm1, X7 ;// x7
movaps xmm2, xmm0 ;// x1
mulps xmm0, TBL_tg1_16 ;// x1*tg1
movaps xmm3, xmm1 ;// x7
mulps xmm1, TBL_tg1_16 ;// x7*tg1
subps xmm0, xmm3 ;// tp465
addps xmm2, xmm1 ;// tp765
movaps xmm1, X3 ;// x3
movaps xmm3, X5 ;// x5
movaps xmm4, xmm1 ;// x3
mulps xmm1, TBL_tg3_16 ;// x3*tg3
movaps xmm5, xmm3 ;// x5
mulps xmm3, TBL_tg3_16 ;// x5*tg3
subps xmm5, xmm1 ;// tm465
addps xmm3, xmm4 ;// tm765
movaps xmm1, xmm2 ;// tp765
movaps xmm4, xmm0 ;// tp465
addps xmm2, xmm3 ;// t7
subps xmm1, xmm3 ;// tp65
addps xmm0, xmm5 ;// t4
subps xmm4, xmm5 ;// tm65
movaps xmm3, xmm4 ;// tm65
addps xmm4, xmm1 ;// tp65 + tm65
subps xmm1, xmm3 ;// tp65 - tm65
mulps xmm4, TBL_cos4_16 ;// t6
mulps xmm1, TBL_cos4_16 ;// t5
movaps X1, xmm4 ;// t6
movaps X3, xmm0 ;// t4
movaps xmm5, X4 ;// x4
movaps xmm3, X0 ;// x0
movaps xmm6, xmm5 ;// x4
addps xmm5, xmm3 ;// tp03
subps xmm3, xmm6 ;// tp12
movaps xmm0, X2 ;// x2
movaps xmm4, X6 ;// x6
movaps xmm6, xmm0 ;// x2
mulps xmm0, TBL_tg2_16 ;// x2*tg2
movaps xmm7, xmm4 ;// x6
mulps xmm4, TBL_tg2_16 ;// x6*tg2
subps xmm0, xmm7 ;// tm12
addps xmm4, xmm6 ;// tm03
movaps xmm6, xmm0 ;// tm12
addps xmm0, xmm3 ;// t1
subps xmm3, xmm6 ;// t2
movaps xmm6, xmm4 ;// tm03
addps xmm4, xmm5 ;// t0
subps xmm5, xmm6 ;// t3
movaps xmm6, xmm2 ;// t7
addps xmm2, xmm4 ;// t0 + t7
subps xmm4, xmm6 ;// t0 - t7
movaps xmm7, xmm1 ;// t5
addps xmm1, xmm3 ;// t2 + t5
subps xmm3, xmm7 ;// t2 - t5
movaps xmm6, X3 ;// t4
movaps xmm7, xmm5 ;// t3
subps xmm5, xmm6 ;// t3 - t4
addps xmm7, xmm6 ;// t3 + t4
movaps X0, xmm2 ;// save y0
movaps X7, xmm4 ;// save y7
movaps X2, xmm1 ;// save y2
movaps X5, xmm3 ;// save y5
movaps X4, xmm5 ;// save y4
movaps X3, xmm7 ;// save y3
movaps xmm2, X1 ;// t6
movaps xmm3, xmm0 ;// t1
addps xmm0, xmm2 ;// t1 + t6
subps xmm3, xmm2 ;// t1 - t6
movaps X1, xmm0 ;// save y1
movaps X6, xmm3 ;// save y6
add I, 16
jnz ColLoop
movd Ctx, mm1
PROFILE_OUT "IDCT_16x8_SSE2"
DBG_IDCT_OUT_16x8
add DST, 16*8*sizeof(real4)
jmp [Ctx].RetIdct
IDCT_16x8_SSE2 ENDP
;//=========================================================================
;// Dequantization and IDCT on a block of 8x8 samples, output 8x8 samples
;// params : esp : source pointer
;// params : edi : destination pointer
;// params : eax : pointer to quantization table
;// return : destination pointer += 8*8*sizeof(real4)
;//=========================================================================
IDCT_8x8_SSE PROC
QTB EQU <eax>
SRC EQU <QPTR [esp]>
DST EQU <edi>
I EQU <ecx>
W EQU <edx>
X0 EQU [DST][I+2*16+0*8*4]
X1 EQU [DST][I+2*16+1*8*4]
X2 EQU [DST][I+2*16+2*8*4]
X3 EQU [DST][I+2*16+3*8*4]
X4 EQU [DST][I+2*16+4*8*4]
X5 EQU [DST][I+2*16+5*8*4]
X6 EQU [DST][I+2*16+6*8*4]
X7 EQU [DST][I+2*16+7*8*4]
DBG_IDCT_IN
PROFILE_IN
;// apply IDCT to the rows
mov I, -8*4
RowLoop:
mov W, TBL_MultRow8x8[I+8*4] ;// row multiplier
cvtpi2ps xmm0, [SRC][8*I+8*8*4] ;// _ _ x1 x0
cvtpi2ps xmm2, [SRC][8*I+8*8*4+4*2] ;// _ _ x3 x2
cvtpi2ps xmm1, [SRC][8*I+8*8*4+4*4] ;// _ _ x5 x4
cvtpi2ps xmm3, [SRC][8*I+8*8*4+4*6] ;// _ _ x7 x6
shufps xmm0, xmm2, 001000100b ;// x3 x2 x1 x0
shufps xmm1, xmm3, 001000100b ;// x7 x6 x5 x4
mulps xmm0, [QTB][8*I+8*8*4] ;// x3*q3 x2*q2 x1*q1 x0*q0
mulps xmm1, [QTB][8*I+8*8*4+4*4] ;// x7*q7 x6*q6 x5*q5 x4*q4
movaps xmm2, xmm0 ;// x3 x2 x1 x0
movaps xmm3, xmm0 ;// x3 x2 x1 x0
movaps xmm4, xmm1 ;// x7 x6 x5 x4
movaps xmm5, xmm1 ;// x7 x6 x5 x4
shufps xmm2, xmm2, 000000000b ;// x0 x0 x0 x0
shufps xmm3, xmm3, 010101010b ;// x2 x2 x2 x2
shufps xmm4, xmm4, 000000000b ;// x4 x4 x4 x4
shufps xmm5, xmm5, 010101010b ;// x6 x6 x6 x6
mulps xmm2, [W] ;// x0*w12 x0*w08 x0*w04 x0*w00
mulps xmm3, [W][4*4] ;// x2*w13 x2*w09 x2*w05 x2*w01
mulps xmm4, [W][8*4] ;// x4*w14 x4*w10 x4*w06 x4*w02
mulps xmm5, [W][12*4] ;// x6*w15 x6*w11 x6*w07 x6*w03
addps xmm2, xmm3 ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01
addps xmm4, xmm5 ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
addps xmm2, xmm4 ;// a3 a2 a1 a0
movaps xmm3, xmm0 ;// x3 x2 x1 x0
movaps xmm4, xmm1 ;// x7 x6 x5 x4
shufps xmm3, xmm3, 001010101b ;// x1 x1 x1 x1
shufps xmm0, xmm0, 011111111b ;// x3 x3 x3 x3
shufps xmm4, xmm4, 001010101b ;// x5 x5 x5 x5
shufps xmm1, xmm1, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][16*4] ;// x1*w28 x1*w24 x1*w20 x1*w16
mulps xmm0, [W][20*4] ;// x3*w29 x3*w25 x3*w21 x3*w17
mulps xmm4, [W][24*4] ;// x5*w30 x5*w26 x5*w22 x5*w18
mulps xmm1, [W][28*4] ;// x7*w31 x7*w27 x7*w23 x7*w19
addps xmm3, xmm0 ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
addps xmm4, xmm1 ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
addps xmm3, xmm4 ;// b3 b2 b1 b0
movaps xmm4, xmm2 ;// a3 a2 a1 a0
addps xmm2, xmm3 ;// y3 y2 y1 y0
subps xmm4, xmm3 ;// y4 y5 y6 y7
shufps xmm4, xmm4, 000011011b ;// y7 y6 y5 y4
movaps [DST][8*I+8*8*4], xmm2 ;// save y3 y2 y1 y0
movaps [DST][8*I+8*8*4+16], xmm4 ;// save y7 y6 y5 y4
add I, 4
jnz RowLoop
;// apply IDCT to the columns
mov I, -2*16
ColLoop:
movaps xmm0, X1 ;// x1
movaps xmm1, X7 ;// x7
movaps xmm2, xmm0 ;// x1
mulps xmm0, TBL_tg1_16 ;// x1*tg1
movaps xmm3, xmm1 ;// x7
mulps xmm1, TBL_tg1_16 ;// x7*tg1
subps xmm0, xmm3 ;// tp465
addps xmm2, xmm1 ;// tp765
movaps xmm1, X3 ;// x3
movaps xmm3, X5 ;// x5
movaps xmm4, xmm1 ;// x3
mulps xmm1, TBL_tg3_16 ;// x3*tg3
movaps xmm5, xmm3 ;// x5
mulps xmm3, TBL_tg3_16 ;// x5*tg3
subps xmm5, xmm1 ;// tm465
addps xmm3, xmm4 ;// tm765
movaps xmm1, xmm2 ;// tp765
movaps xmm4, xmm0 ;// tp465
addps xmm2, xmm3 ;// t7
subps xmm1, xmm3 ;// tp65
addps xmm0, xmm5 ;// t4
subps xmm4, xmm5 ;// tm65
movaps xmm3, xmm4 ;// tm65
addps xmm4, xmm1 ;// tp65 + tm65
subps xmm1, xmm3 ;// tp65 - tm65
mulps xmm4, TBL_cos4_16 ;// t6
mulps xmm1, TBL_cos4_16 ;// t5
movaps X1, xmm4 ;// t6
movaps X3, xmm0 ;// t4
movaps xmm5, X4 ;// x4
movaps xmm3, X0 ;// x0
movaps xmm6, xmm5 ;// x4
addps xmm5, xmm3 ;// tp03
subps xmm3, xmm6 ;// tp12
movaps xmm0, X2 ;// x2
movaps xmm4, X6 ;// x6
movaps xmm6, xmm0 ;// x2
mulps xmm0, TBL_tg2_16 ;// x2*tg2
movaps xmm7, xmm4 ;// x6
mulps xmm4, TBL_tg2_16 ;// x6*tg2
subps xmm0, xmm7 ;// tm12
addps xmm4, xmm6 ;// tm03
movaps xmm6, xmm0 ;// tm12
addps xmm0, xmm3 ;// t1
subps xmm3, xmm6 ;// t2
movaps xmm6,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -