⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct.asm

📁 这是一个JPEG解码器,里面使用了MMX,SSE等汇编指令集
💻 ASM
📖 第 1 页 / 共 5 页
字号:
        movaps      xmm3, xmm4                  ;// tm65
        addps       xmm4, xmm1                  ;// tp65 + tm65
        subps       xmm1, xmm3                  ;// tp65 - tm65
        mulps       xmm4, TBL_cos4_16           ;// t6
        mulps       xmm1, TBL_cos4_16           ;// t5
        movaps      X1, xmm4                    ;// t6
        movaps      X3, xmm0                    ;// t4
        movaps      xmm5, X4                    ;// x4
        movaps      xmm3, X0                    ;// x0
        movaps      xmm6, xmm5                  ;// x4
        addps       xmm5, xmm3                  ;// tp03
        subps       xmm3, xmm6                  ;// tp12
        movaps      xmm0, X2                    ;// x2
        movaps      xmm4, X6                    ;// x6
        movaps      xmm6, xmm0                  ;// x2
        mulps       xmm0, TBL_tg2_16            ;// x2*tg2
        movaps      xmm7, xmm4                  ;// x6
        mulps       xmm4, TBL_tg2_16            ;// x6*tg2
        subps       xmm0, xmm7                  ;// tm12
        addps       xmm4, xmm6                  ;// tm03
        movaps      xmm6, xmm0                  ;// tm12
        addps       xmm0, xmm3                  ;// t1
        subps       xmm3, xmm6                  ;// t2
        movaps      xmm6, xmm4                  ;// tm03
        addps       xmm4, xmm5                  ;// t0
        subps       xmm5, xmm6                  ;// t3
        movaps      xmm6, xmm2                  ;// t7
        addps       xmm2, xmm4                  ;// t0 + t7
        subps       xmm4, xmm6                  ;// t0 - t7
        movaps      xmm7, xmm1                  ;// t5
        addps       xmm1, xmm3                  ;// t2 + t5
        subps       xmm3, xmm7                  ;// t2 - t5
        movaps      xmm6, X3                    ;// t4
        movaps      xmm7, xmm5                  ;// t3
        subps       xmm5, xmm6                  ;// t3 - t4
        addps       xmm7, xmm6                  ;// t3 + t4
        movaps      X0, xmm2                    ;// save y0
        movaps      X7, xmm4                    ;// save y7
        movaps      X2, xmm1                    ;// save y2
        movaps      X5, xmm3                    ;// save y5
        movaps      X4, xmm5                    ;// save y4
        movaps      X3, xmm7                    ;// save y3
        movaps      xmm2, X1                    ;// t6
        movaps      xmm3, xmm0                  ;// t1
        addps       xmm0, xmm2                  ;// t1 + t6
        subps       xmm3, xmm2                  ;// t1 - t6
        movaps      X1, xmm0                    ;// save y1
        movaps      X6, xmm3                    ;// save y6
        add         I, 16
        jnz         ColLoop          

        PROFILE_OUT "IDCT_8x8_SSE2"
        DBG_IDCT_OUT_8x8

        add         DST, 8*8*sizeof(real4)

        jmp         [Ctx].RetIdct

IDCT_8x8_SSE2 ENDP

;//=========================================================================
;// Dequantization and IDCT on a block of 8x8 samples, output 16x16 samples
;// params : esp : source pointer
;// params : edi : destination pointer
;// params : eax : pointer to quantization table
;// return : destination pointer += 16*16*sizeof(real4)
;//=========================================================================

IDCT_16x16_SSE2 PROC

        QTB   EQU <eax>
        SRC   EQU <QPTR [esp]>
        DST   EQU <edi>
        I     EQU <ecx>
        W     EQU <edx>
        J     EQU <Ctx>
        X0    EQU <[DST][I+4*16+0*16*4]>
        X1    EQU <[DST][I+4*16+1*16*4]>
        X2    EQU <[DST][I+4*16+2*16*4]>
        X3    EQU <[DST][I+4*16+3*16*4]>
        X4    EQU <[DST][I+4*16+4*16*4]>
        X5    EQU <[DST][I+4*16+5*16*4]>
        X6    EQU <[DST][I+4*16+6*16*4]>
        X7    EQU <[DST][I+4*16+7*16*4]>
        X8    EQU <[DST][I+4*16+8*16*4]>
        X9    EQU <[DST][I+4*16+9*16*4]>
        X10   EQU <[DST][I+4*16+10*16*4]>
        X11   EQU <[DST][I+4*16+11*16*4]>
        X12   EQU <[DST][I+4*16+12*16*4]>
        X13   EQU <[DST][I+4*16+13*16*4]>
        X14   EQU <[DST][I+4*16+14*16*4]>
        X15   EQU <[DST][I+4*16+15*16*4]>

        DBG_IDCT_IN
        PROFILE_IN

        ;// save context
        movd        mm1, Ctx

        ;// apply IDCT to the rows
        mov         I, -8*4
        mov         J, -8*16*4
RowLoop:
        ; /*
        ; x0 *= q0; x1 *= q1; x2 *= q2; x3 *= q3;
        ; x4 *= q4; x5 *= q5; x6 *= q6; x7 *= q7;
        ;
        ; a0 = x0*w0 + x2*w1 + x4*w2 + x6*w3;
        ; a1 = x0*w4 + x2*w5 + x4*w6 + x6*w7;
        ; a2 = x0*w8 + x2*w9 + x4*w10 + x6*w11;
        ; a3 = x0*w12 + x2*w13 + x4*w14 + x6*w15;
        ; b0 = x1*w16 + x3*w17 + x5*w18 + x7*w19;
        ; b1 = x1*w20 + x3*w21 + x5*w22 + x7*w23;
        ; b2 = x1*w24 + x3*w25 + x5*w26 + x7*w27;
        ; b3 = x1*w28 + x3*w29 + x5*w30 + x7*w31;
        ;
        ; y0  = a0 + b0;
        ; y1  = a1 + b1;
        ; y2  = a2 + b2;
        ; y3  = a3 + b3;
        ; y12 = a3 - b3;
        ; y13 = a2 - b2;
        ; y14 = a1 - b1;
        ; y15 = a0 - b0;
        ;
        ; a4 = x0*w32 + x2*w33 + x4*w34 + x6*w35;
        ; a5 = x0*w36 + x2*w37 + x4*w38 + x6*w39;
        ; a6 = x0*w40 + x2*w41 + x4*w42 + x6*w43;
        ; a7 = x0*w44 + x2*w45 + x4*w46 + x6*w47;
        ; b4 = x1*w48 + x3*w49 + x5*w50 + x7*w51;
        ; b5 = x1*w52 + x3*w53 + x5*w54 + x7*w55;
        ; b6 = x1*w56 + x3*w57 + x5*w58 + x7*w59;
        ; b7 = x1*w60 + x3*w61 + x5*w62 + x7*w63;
        ;
        ; y4  = a4 + b4;
        ; y5  = a5 + b5;
        ; y6  = a6 + b6;
        ; y7  = a7 + b7;
        ; y8  = a7 - b7;
        ; y9  = a6 - b6;
        ; y10 = a5 - b5;
        ; y11 = a4 - b4;
        ; */
        mov         W, TBL_MultRow16x16[I+8*4]  ;// row multiplier
        cvtdq2ps    xmm0, [SRC][8*I+8*8*4]      ;// x3 x2 x1 x0
        cvtdq2ps    xmm1, [SRC][8*I+8*8*4+4*4]  ;// x7 x6 x5 x4
        mulps       xmm0, [QTB][8*I+8*8*4]      ;// x3*q3 x2*q2 x1*q1 x0*q0
        mulps       xmm1, [QTB][8*I+8*8*4+4*4]  ;// x7*q7 x6*q6 x5*q5 x4*q4
        pshufd      xmm2, xmm0, 000000000b      ;// x0 x0 x0 x0
        pshufd      xmm3, xmm0, 010101010b      ;// x2 x2 x2 x2
        pshufd      xmm4, xmm1, 000000000b      ;// x4 x4 x4 x4
        pshufd      xmm5, xmm1, 010101010b      ;// x6 x6 x6 x6
        movaps      xmm6, xmm0                  ;// x3 x2 x1 x0
        movaps      xmm7, xmm1                  ;// x7 x6 x5 x4
        mulps       xmm2, [W]                   ;// x0*w12 x0*w08 x0*w04 x0*w00
        mulps       xmm3, [W][4*4]              ;// x2*w13 x2*w09 x2*w05 x2*w01
        mulps       xmm4, [W][8*4]              ;// x4*w14 x4*w10 x4*w06 x4*w02
        mulps       xmm5, [W][12*4]             ;// x6*w15 x6*w11 x6*w07 x6*w03
        addps       xmm2, xmm3                  ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01
        addps       xmm4, xmm5                  ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
        pshufd      xmm3, xmm0, 001010101b      ;// x1 x1 x1 x1
        addps       xmm2, xmm4                  ;// a3 a2 a1 a0
        pshufd      xmm0, xmm0, 011111111b      ;// x3 x3 x3 x3
        pshufd      xmm4, xmm1, 001010101b      ;// x5 x5 x5 x5
        pshufd      xmm1, xmm1, 011111111b      ;// x7 x7 x7 x7
        mulps       xmm3, [W][16*4]             ;// x1*w28 x1*w24 x1*w20 x1*w16
        mulps       xmm0, [W][20*4]             ;// x3*w29 x3*w25 x3*w21 x3*w17
        mulps       xmm4, [W][24*4]             ;// x5*w30 x5*w26 x5*w22 x5*w18
        mulps       xmm1, [W][28*4]             ;// x7*w31 x7*w27 x7*w23 x7*w19
        addps       xmm3, xmm0                  ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
        addps       xmm4, xmm1                  ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
        addps       xmm3, xmm4                  ;// b3 b2 b1 b0
        movaps      xmm4, xmm2                  ;// a3 a2 a1 a0
        addps       xmm2, xmm3                  ;// y3 y2 y1 y0
        subps       xmm4, xmm3                  ;// y12 y13 y14 y15
        shufps      xmm4, xmm4, 000011011b      ;// y15 y14 y13 y12
        movaps      [DST][J+8*16*4], xmm2       ;// save y3 y2 y1 y0
        movaps      [DST][J+8*16*4+12*4], xmm4  ;// save y15 y14 y13 y12
        pshufd      xmm2, xmm6, 000000000b      ;// x0 x0 x0 x0
        pshufd      xmm3, xmm6, 010101010b      ;// x2 x2 x2 x2
        pshufd      xmm4, xmm7, 000000000b      ;// x4 x4 x4 x4
        pshufd      xmm5, xmm7, 010101010b      ;// x6 x6 x6 x6
        mulps       xmm2, [W][32*4]             ;// x0*w44 x0*w40 x0*w36 x0*w32
        mulps       xmm3, [W][36*4]             ;// x2*w45 x2*w41 x2*w37 x2*w33
        mulps       xmm4, [W][40*4]             ;// x4*w46 x4*w42 x4*w38 x4*w34
        mulps       xmm5, [W][44*4]             ;// x6*w47 x6*w43 x6*w39 x6*w35
        addps       xmm2, xmm3                  ;// x0*w44+x2*w45 x0*w40+x2*w41 x0*w36+x2*w37 x0*w32+x2*w33
        addps       xmm4, xmm5                  ;// x4*w46+x6*w47 x4*w42+x6*w43 x4*w38+x6*w39 x4*w34+x6*w35
        pshufd      xmm3, xmm6, 001010101b      ;// x1 x1 x1 x1
        addps       xmm2, xmm4                  ;// a7 a6 a5 a4
        pshufd      xmm6, xmm6, 011111111b      ;// x3 x3 x3 x3
        pshufd      xmm4, xmm7, 001010101b      ;// x5 x5 x5 x5
        pshufd      xmm7, xmm7, 011111111b      ;// x7 x7 x7 x7
        mulps       xmm3, [W][48*4]             ;// x1*w60 x1*w56 x1*w52 x1*w48
        mulps       xmm6, [W][52*4]             ;// x3*w61 x3*w57 x3*w53 x3*w49
        mulps       xmm4, [W][56*4]             ;// x5*w62 x5*w58 x5*w54 x5*w50
        mulps       xmm7, [W][60*4]             ;// x7*w63 x7*w59 x7*w55 x7*w51
        addps       xmm3, xmm6                  ;// x1*w60+x3*w61 x1*w56+x3*w57 x1*w52+x3*w53 x1*w48+x3*w49
        addps       xmm4, xmm7                  ;// x5*w62+x7*w63 x5*w58+x7*w59 x5*w54+x7*w55 x5*w50+x7*w51
        addps       xmm3, xmm4                  ;// b7 b6 b5 b4
        movaps      xmm4, xmm2                  ;// a7 a6 a5 a4
        addps       xmm2, xmm3                  ;// y7 y6 y5 y4
        subps       xmm4, xmm3                  ;// y8 y9 y10 y11
        shufps      xmm4, xmm4, 000011011b      ;// y11 y10 y9 y8
        movaps      [DST][J+8*16*4+4*4], xmm2   ;// save y7 y6 y5 y4
        movaps      [DST][J+8*16*4+8*4], xmm4   ;// save y11 y10 y9 y8
        add         J, 16*4
        add         I, 4
        jnz         RowLoop

        ;// apply IDCT to the columns
        mov         I, -4*16
ColLoop:
        ; /*
        ; a3 = tan(2Pi/16)*x4
        ; a5 = tan(Pi/16)*x2
        ; a7 = tan(3Pi/16)*x6
        ; a9 = tan(Pi/32)*x1
        ; a11 = tan(3Pi/32)*x3
        ; a13 = tan(5Pi/32)*x5
        ; a15 = tan(7Pi/32)*x7

        ; b0  = x0 + x4
        ; b1  = x0 + a3
        ; b2  = x0 - a3
        ; b3  = x0 - x4
        ; b4  = x2 + x6
        ; b5  = x2 - x6
        ; b6  = a5 + a7
        ; b7  = a5 - a7
        ; b8  = x1 + x7
        ; b9  = x1 - x7
        ; b10 = a9 + a15
        ; b11 = a9 - a15
        ; b12 = x3 + x5
        ; b13 = x3 - x5
        ; b14 = a11 + a13
        ; b15 = a11 - a13

        ; c8 = b8 + b12
        ; c9 = b9 + b14
        ; c10 = b13 - b10
        ; c11 = b8 - b12
        ; c12 = b11 + b15
        ; c13 = b9 - b14
        ; c14 = b13 + b10
        ; c15 = b11 - b15

        ; d5 = cos(4Pi/16)*(b5 + b6)
        ; d6 = cos(4Pi/16)*(b5 - b6)
        ; d9 = c9 + tan(2Pi/16)*c14
        ; d10 = tan(2Pi/16)*c10 + c13
        ; d11 = cos(4Pi/16)*(c11 + c12)
        ; d12 = cos(4Pi/16)*(c11 - c12)
        ; d13 = tan(2Pi/16)*c13 - c10
        ; d14 = tan(2Pi/16)*c9 - c14

        ; e0 = b0 + b4

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -