📄 idct.asm
字号:
;// IDCT
COMMENT ^/*
The 8-point IDCT is based on Intel Application Notes 922 and 945
http://perso.wanadoo.fr/dr_manhattan/doc/ap922.pdf
http://perso.wanadoo.fr/dr_manhattan/doc/w_idct.pdf
The optimization of the IDCT is based on matrix decomposition. A decomposition reduces a complex matrix
multiplication to a series of simpler matrix multiplications.
The IDCT is performed in two steps :
1) Each row is multiplied by the IDCT matrix. The intermediate result is an 8x8 matrix noted R.
2) Each column of R is multiplied by the IDCT matrix.
AppNote 922 gives two different matrix decompositions of the 8-point IDCT, one for the rows and one
for the columns :
IDCT_ROW = (1/2) * tA * tM * tP
IDCT_COL = (1/2) * tA * tF * tE * tB * tD * tP
where t means transposition and P,M,A,F,E,B,D are 8x8 matrices defined in AppNote 922.
IDCT_ROW is relatively simple and is implemented "as is" with a series of packed multiplications and
additions.
IDCT_COL seems more complex but a trick is used to reduce the number of operations. At the end of
step 1), each row is multiplied by a constant g(k) = cos(k*Pi/16) :
- rows 0 and 4 are multiplied by g(4)
- rows 1 and 7 are multiplied by g(1)
- rows 2 and 6 are multiplied by g(2)
- rows 3 and 5 are multiplied by g(3)
From a matrix point of view, the multiplications are equivalent to a multiplication by
a diagonal matrix (noted MULT_ROW). The diagonal of the matrix is :
g(4) g(1) g(2) g(3) g(4) g(3) g(2) g(1)
The result of step 1) can be written :
RM = MULT_ROW * R
Step 2) can be rewritten :
Y = IDCT_COL * R
= IDCT_COL * inverse(MULT_ROW) * RM
If we substitute IDCT_COL with the decomposition :
Y = (1/2) * tA * tF * tE * tB * tD * tP * inverse(MULT_ROW) * RM
The simplification comes from the fact that tD * tP * inverse(MULT_ROW) is a permutation matrix.
It doesn't involve any arithmetic operations, only a reordering of the indices.
For example, if the input is the row :
X1, X2, X3, X4, X5, X6, X7, X8
the output will be
X1, X5, X3, X7, X2, X8, X4, X6
With this simplification, step 2) is reduced to :
IDCT_COL = (1/2) * tA * tF * tE * tB
Another simplification is to incorporate the division by 2 in MULT_ROW :
Y = (1/2) * tA * tF * tE * tB * tD * tP * inverse(MULT_ROW) * RM
= tA * tF * tE * tB * tD * tP * inverse(MULT_ROW) * (RM/2)
Step 1) is changed to :
RM = (MULT_ROW/2) * (1/2) * tA * tM * tP
= (MULT_ROW/4) * tA * tM * tP
Comment about the 16-point IDCT :
--------------------------------
A 16-point IDCT is used to perform IDCT and resampling in one step. A block of 8x8 samples is
transformed to a block of 16x16 pixels. Thanks to Guido Vollbeding for this suggestion.
The 16-point IDCT can also be calculated using a matrix decomposition. The decomposition was derived from the one
used in the 8-point IDCT. If you have Maple, you can download a worksheet showing the decomposition at
http://perso.wanadoo.fr/dr_manhattan/doc/idct16x16.rar
;^*/
.686P
.MODEL FLAT, STDCALL
OPTION CASEMAP:NONE
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
.XMM
INCLUDE jpeg.inc
INCLUDE jpeg_dec.inc
.CODE
;//=========================================================================
;// IDCT_Init : select the IDCT routine for the given CPU type
;//
;// Reads one dword entry from TBL_IDCT, indexed by (eax - CPU_MIN), and
;// stores it in the Idct field of Ctx. Per the original header the entries
;// are IDCT function pointers; TBL_IDCT, CPU_MIN and Ctx are not defined in
;// this part of the file (presumably they come from jpeg.inc / jpeg_dec.inc
;// or a later data section - TODO confirm).
;//
;// params : eax : cpu type
;// return : eax : cpu type (eax is never written, so it is passed through)
;// clobbers : ecx (holds the table entry)
;//
;// NOTE(review): there is no range check on eax. The routine appears to
;// assume the caller passes a value >= CPU_MIN that has a matching
;// TBL_IDCT entry - confirm against the caller.
;//=========================================================================
IDCT_Init PROC
mov ecx, TBL_IDCT[4*eax-4*CPU_MIN] ;// ecx = dword at TBL_IDCT + 4*(eax - CPU_MIN)
mov [Ctx].Idct, ecx ;// store the selected entry in the Idct field of Ctx
ret ;// bare return : OPTION PROLOGUE/EPILOGUE:NONE, so no frame to tear down
IDCT_Init ENDP
;//=========================================================================
;// Dequantization and IDCT on a block of 8x8 samples, output 8x8 samples
;// params : esp : source pointer
;// params : edi : destination pointer
;// params : eax : pointer to quantization table
;// return : destination pointer += 8*8*sizeof(real4)
;//=========================================================================
IDCT_8x8_SSE2 PROC
QTB EQU <eax>
SRC EQU <QPTR [esp]>
DST EQU <edi>
I EQU <ecx>
W EQU <edx>
X0 EQU [DST][I+2*16+0*8*4]
X1 EQU [DST][I+2*16+1*8*4]
X2 EQU [DST][I+2*16+2*8*4]
X3 EQU [DST][I+2*16+3*8*4]
X4 EQU [DST][I+2*16+4*8*4]
X5 EQU [DST][I+2*16+5*8*4]
X6 EQU [DST][I+2*16+6*8*4]
X7 EQU [DST][I+2*16+7*8*4]
DBG_IDCT_IN
PROFILE_IN
;// apply IDCT to the rows
mov I, -8*4
RowLoop:
; /*
; x0 *= q0; x1 *= q1; x2 *= q2; x3 *= q3;
; x4 *= q4; x5 *= q5; x6 *= q6; x7 *= q7;
;
; a0 = x0*w0 + x2*w1 + x4*w2 + x6*w3;
; a1 = x0*w4 + x2*w5 + x4*w6 + x6*w7;
; a2 = x0*w8 + x2*w9 + x4*w10 + x6*w11;
; a3 = x0*w12 + x2*w13 + x4*w14 + x6*w15;
; b0 = x1*w16 + x3*w17 + x5*w18 + x7*w19;
; b1 = x1*w20 + x3*w21 + x5*w22 + x7*w23;
; b2 = x1*w24 + x3*w25 + x5*w26 + x7*w27;
; b3 = x1*w28 + x3*w29 + x5*w30 + x7*w31;
;
; y0 = a0 + b0
; y1 = a1 + b1
; y2 = a2 + b2
; y3 = a3 + b3
; y4 = a3 - b3
; y5 = a2 - b2
; y6 = a1 - b1
; y7 = a0 - b0
; */
mov W, TBL_MultRow8x8[I+8*4] ;// row multiplier
cvtdq2ps xmm0, [SRC][8*I+8*8*4] ;// x3 x2 x1 x0
cvtdq2ps xmm1, [SRC][8*I+8*8*4+4*4] ;// x7 x6 x5 x4
mulps xmm0, [QTB][8*I+8*8*4] ;// x3*q3 x2*q2 x1*q1 x0*q0
mulps xmm1, [QTB][8*I+8*8*4+4*4] ;// x7*q7 x6*q6 x5*q5 x4*q4
pshufd xmm2, xmm0, 000000000b ;// x0 x0 x0 x0
pshufd xmm3, xmm0, 010101010b ;// x2 x2 x2 x2
pshufd xmm4, xmm1, 000000000b ;// x4 x4 x4 x4
pshufd xmm5, xmm1, 010101010b ;// x6 x6 x6 x6
mulps xmm2, [W] ;// x0*w12 x0*w08 x0*w04 x0*w00
mulps xmm3, [W][4*4] ;// x2*w13 x2*w09 x2*w05 x2*w01
mulps xmm4, [W][8*4] ;// x4*w14 x4*w10 x4*w06 x4*w02
mulps xmm5, [W][12*4] ;// x6*w15 x6*w11 x6*w07 x6*w03
addps xmm2, xmm3 ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01
addps xmm4, xmm5 ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
pshufd xmm3, xmm0, 001010101b ;// x1 x1 x1 x1
addps xmm2, xmm4 ;// a3 a2 a1 a0
pshufd xmm0, xmm0, 011111111b ;// x3 x3 x3 x3
pshufd xmm4, xmm1, 001010101b ;// x5 x5 x5 x5
pshufd xmm1, xmm1, 011111111b ;// x7 x7 x7 x7
mulps xmm3, [W][16*4] ;// x1*w28 x1*w24 x1*w20 x1*w16
mulps xmm0, [W][20*4] ;// x3*w29 x3*w25 x3*w21 x3*w17
mulps xmm4, [W][24*4] ;// x5*w30 x5*w26 x5*w22 x5*w18
mulps xmm1, [W][28*4] ;// x7*w31 x7*w27 x7*w23 x7*w19
addps xmm3, xmm0 ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
addps xmm4, xmm1 ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
addps xmm3, xmm4 ;// b3 b2 b1 b0
movaps xmm4, xmm2 ;// a3 a2 a1 a0
addps xmm2, xmm3 ;// y3 y2 y1 y0
subps xmm4, xmm3 ;// y4 y5 y6 y7
shufps xmm4, xmm4, 000011011b ;// y7 y6 y5 y4
movaps [DST][8*I+8*8*4], xmm2 ;// save y3 y2 y1 y0
movaps [DST][8*I+8*8*4+16], xmm4 ;// save y7 y6 y5 y4
add I, 4
jnz RowLoop
;// apply IDCT to the columns
mov I, -2*16
ColLoop:
; /*
; tp765 = x1 + x7 * tg1;
; tp465 = x1 * tg1 - x7;
; tm765 = x5 * tg3 + x3;
; tm465 = x5 - x3 * tg3;
; t7 = tp765 + tm765;
; tp65 = tp765 - tm765;
; t4 = tp465 + tm465;
; tm65 = tp465 - tm465;
; t6 = (tp65 + tm65) * cos4;
; t5 = (tp65 - tm65) * cos4;
; tp03 = x0 + x4;
; tp12 = x0 - x4;
; tm03 = x2 + x6 * tg2;
; tm12 = x2 * tg2 - x6;
; t0 = tp03 + tm03;
; t3 = tp03 - tm03;
; t1 = tp12 + tm12;
; t2 = tp12 - tm12;
; y0 = t0 + t7
; y2 = t2 + t5
; y4 = t3 - t4
; y6 = t1 - t6
; y1 = t1 + t6
; y3 = t3 + t4
; y5 = t2 - t5
; y7 = t0 - t7
; */
movaps xmm0, X1 ;// x1
movaps xmm1, X7 ;// x7
movaps xmm2, xmm0 ;// x1
mulps xmm0, TBL_tg1_16 ;// x1*tg1
movaps xmm3, xmm1 ;// x7
mulps xmm1, TBL_tg1_16 ;// x7*tg1
subps xmm0, xmm3 ;// tp465
addps xmm2, xmm1 ;// tp765
movaps xmm1, X3 ;// x3
movaps xmm3, X5 ;// x5
movaps xmm4, xmm1 ;// x3
mulps xmm1, TBL_tg3_16 ;// x3*tg3
movaps xmm5, xmm3 ;// x5
mulps xmm3, TBL_tg3_16 ;// x5*tg3
subps xmm5, xmm1 ;// tm465
addps xmm3, xmm4 ;// tm765
movaps xmm1, xmm2 ;// tp765
movaps xmm4, xmm0 ;// tp465
addps xmm2, xmm3 ;// t7
subps xmm1, xmm3 ;// tp65
addps xmm0, xmm5 ;// t4
subps xmm4, xmm5 ;// tm65
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -