⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct.asm

📁 这是一个JPEG解码器,里面使用了MMX,SSE等汇编指令集
💻 ASM
📖 第 1 页 / 共 5 页
字号:
        ; e1 = b1 + d5
        ; e2 = b2 + d6
        ; e3 = b3 + b7
        ; e4 = b3 - b7
        ; e5 = b2 - d6
        ; e6 = b1 - d5
        ; e7 = b0 - b4
        ; e9 = cos(Pi/8)*d9
        ; e10 = cos(Pi/8)*d10
        ; e13 = cos(Pi/8)*d13
        ; e14 = cos(Pi/8)*d14

        ; y0 = e0 + c8
        ; y1 = e1 + e9
        ; y2 = e2 + e10
        ; y3 = e3 + d11
        ; y4 = e4 + d12
        ; y5 = e5 + e13
        ; y6 = e6 + e14
        ; y7 = e7 + c15
        ; y8 = e7 - c15
        ; y9 = e6 - e14
        ; y10 = e5 - e13
        ; y11 = e4 - d12
        ; y12 = e3 - d11
        ; y13 = e2 - e10
        ; y14 = e1 - e9
        ; y15 = e0 - c8
        ;*/
        movaps      xmm0, X0                    ;// x0
        movaps      xmm1, X4                    ;// x4
        movaps      xmm2, xmm0                  ;// x0
        movaps      xmm3, xmm1                  ;// x4
        mulps       xmm1, TBL_tg2_16            ;// tan(2Pi/16)*x4 = a3
        movaps      xmm4, xmm0                  ;// x0
        movaps      xmm5, xmm0                  ;// x0
        addps       xmm0, xmm3                  ;// x0 + x4 = b0
        subps       xmm2, xmm3                  ;// x0 - x4 = b3
        addps       xmm4, xmm1                  ;// x0 + a3 = b1
        subps       xmm5, xmm1                  ;// x0 - a3 = b2
        movaps      X0, xmm0                    ;// save b0
        movaps      xmm0, X2                    ;// x2
        movaps      xmm1, X6                    ;// x6
        movaps      xmm3, xmm0                  ;// x2
        movaps      xmm6, xmm1                  ;// x6
        mulps       xmm0, TBL_tg1_16            ;// tan(Pi/16)*x2 = a5
        mulps       xmm1, TBL_tg3_16            ;// tan(3Pi/16)*x6 = a7
        movaps      xmm7, xmm3                  ;// x2
        addps       xmm3, xmm6                  ;// x2 + x6 = b4
        subps       xmm7, xmm6                  ;// x2 - x6 = b5
        movaps      xmm6, xmm0                  ;// a5
        addps       xmm0, xmm1                  ;// a5 + a7 = b6
        subps       xmm6, xmm1                  ;// a5 - a7 = b7
        movaps      xmm1, xmm7                  ;// b5
        addps       xmm7, xmm0                  ;// b5 + b6
        subps       xmm1, xmm0                  ;// b5 - b6
        movaps      xmm0, TBL_cos4_16           ;// cos(4Pi/16)
        mulps       xmm7, xmm0                  ;// cos(4Pi/16)*(b5 + b6) = d5
        mulps       xmm1, xmm0                  ;// cos(4Pi/16)*(b5 - b6) = d6
        movaps      xmm0, xmm4                  ;// b1
        addps       xmm4, xmm7                  ;// b1 + d5 = e1
        subps       xmm0, xmm7                  ;// b1 - d5 = e6
        movaps      xmm7, xmm5                  ;// b2
        addps       xmm5, xmm1                  ;// b2 + d6 = e2
        subps       xmm7, xmm1                  ;// b2 - d6 = e5
        movaps      xmm1, xmm2                  ;// b3
        addps       xmm2, xmm6                  ;// b3 + b7 = e3
        subps       xmm1, xmm6                  ;// b3 - b7 = e4
        movaps      X9, xmm0                    ;// save e6
        movaps      X10, xmm7                   ;// save e5
        movaps      X11, xmm1                   ;// save e4
        movaps      X12, xmm2                   ;// save e3
        movaps      X13, xmm5                   ;// save e2
        movaps      X14, xmm4                   ;// save e1
        movaps      xmm0, X0                    ;// b0
        movaps      xmm1, xmm0                  ;// b0
        addps       xmm0, xmm3                  ;// b0 + b4 = e0
        subps       xmm1, xmm3                  ;// b0 - b4 = e7
        movaps      X15, xmm0                   ;// save e0
        movaps      X8, xmm1                    ;// save e7
        movaps      xmm0, X1                    ;// x1
        movaps      xmm1, X7                    ;// x7
        movaps      xmm2, xmm0                  ;// x1
        movaps      xmm3, xmm1                  ;// x7
        mulps       xmm0, TBL_tg1_32            ;// tan(Pi/32)*x1 = a9
        mulps       xmm1, TBL_tg7_32            ;// tan(7Pi/32)*x7 = a15
        movaps      xmm4, xmm2                  ;// x1
        addps       xmm2, xmm3                  ;// x1 + x7 = b8
        subps       xmm4, xmm3                  ;// x1 - x7 = b9
        movaps      xmm5, xmm0                  ;// a9
        addps       xmm0, xmm1                  ;// a9 + a15 = b10
        subps       xmm5, xmm1                  ;// a9 - a15 = b11
        movaps      X0, xmm2                    ;// save b8
        movaps      xmm1, X3                    ;// x3
        movaps      xmm2, X5                    ;// x5
        movaps      xmm3, xmm1                  ;// x3
        movaps      xmm6, xmm2                  ;// x5
        mulps       xmm1, TBL_tg3_32            ;// tan(3Pi/32)*x3 = a11
        mulps       xmm2, TBL_tg5_32            ;// tan(5Pi/32)*x5 = a13
        movaps      xmm7, xmm3                  ;// x3
        addps       xmm3, xmm6                  ;// x3 + x5 = b12
        subps       xmm7, xmm6                  ;// x3 - x5 = b13
        movaps      xmm6, xmm1                  ;// a11
        addps       xmm1, xmm2                  ;// a11 + a13 = b14
        subps       xmm6, xmm2                  ;// a11 - a13 = b15          
        movaps      xmm2, xmm4                  ;// b9
        addps       xmm4, xmm1                  ;// b9 + b14 = c9
        subps       xmm2, xmm1                  ;// b9 - b14 = c13
        movaps      xmm1, xmm7                  ;// b13
        addps       xmm7, xmm0                  ;// b13 + b10 = c14
        subps       xmm1, xmm0                  ;// b13 - b10 = c10
        movaps      xmm0, xmm5                  ;// b11
        addps       xmm5, xmm6                  ;// b11 + b15 = c12
        subps       xmm0, xmm6                  ;// b11 - b15 = c15
        movaps      xmm6, xmm3                  ;// b12
        addps       xmm3, X0                    ;// b8 + b12 = c8
        subps       xmm6, X0                    ;// b12 - b8 = - c11
        movaps      X0, xmm3                    ;// save c8
        movaps      xmm3, xmm5                  ;// c12
        addps       xmm5, xmm6                  ;// c12 - c11
        subps       xmm3, xmm6                  ;// c12 + c11
        mulps       xmm5, TBL_cos4_16           ;// cos(4Pi/16)*(c12 - c11) = - d12
        mulps       xmm3, TBL_cos4_16           ;// cos(4Pi/16)*(c12 + c11) = d11
        movaps      X4, xmm5                    ;// save (- d12)
        movaps      X3, xmm3                    ;// save d11
        movaps      xmm3, TBL_tg2_16            ;// tan(2Pi/16)
        movaps      xmm5, xmm4                  ;// c9
        mulps       xmm4, xmm3                  ;// tan(2Pi/16)*c9
        movaps      xmm6, xmm7                  ;// c14
        mulps       xmm7, xmm3                  ;// tan(2Pi/16)*c14
        subps       xmm4, xmm6                  ;// tan(2Pi/16)*c9 - c14 = d14
        addps       xmm5, xmm7                  ;// c9 + tan(2Pi/16)*c14 = d9
        movaps      xmm6, xmm1                  ;// c10
        mulps       xmm1, xmm3                  ;// tan(2Pi/16)*c10
        movaps      xmm7, xmm2                  ;// c13
        mulps       xmm2, xmm3                  ;// tan(2Pi/16)*c13
        addps       xmm7, xmm1                  ;// c13 + tan(2Pi/16)*c10 = d10
        movaps      xmm3, TBL_cos2_16           ;// cos(2Pi/16)
        subps       xmm2, xmm6                  ;// tan(2Pi/16)*c13 - c10 = d13
        mulps       xmm5, xmm3                  ;// cos(2Pi/16)*d9 = e9
        mulps       xmm7, xmm3                  ;// cos(2Pi/16)*d10 = e10
        mulps       xmm2, xmm3                  ;// cos(2Pi/16)*d13 = e13
        mulps       xmm4, xmm3                  ;// cos(2Pi/16)*d14 = e14
        movaps      xmm1, X0                    ;// c8
        movaps      xmm3, X15                   ;// e0
        movaps      xmm6, xmm3                  ;// e0
        addps       xmm3, xmm1                  ;// e0 + c8 = y0
        subps       xmm6, xmm1                  ;// e0 - c8 = y15
        movaps      X0, xmm3                    ;// save y0
        movaps      X15, xmm6                   ;// save y15
        movaps      xmm1, X13                   ;// e2
        movaps      xmm3, X14                   ;// e1
        movaps      xmm6, xmm1                  ;// e2
        addps       xmm1, xmm7                  ;// e2 + e10 = y2
        subps       xmm6, xmm7                  ;// e2 - e10 = y13
        movaps      xmm7, xmm3                  ;// e1
        addps       xmm3, xmm5                  ;// e1 + e9 = y1
        subps       xmm7, xmm5                  ;// e1 - e9 = y14
        movaps      X1, xmm3                    ;// save y1
        movaps      X2, xmm1                    ;// save y2
        movaps      X13, xmm6                   ;// save y13
        movaps      X14, xmm7                   ;// save y14
        movaps      xmm1, X3                    ;// d11
        movaps      xmm3, X4                    ;// -d12
        movaps      xmm5, X11                   ;// e4
        movaps      xmm6, X12                   ;// e3
        movaps      xmm7, xmm5                  ;// e4
        subps       xmm5, xmm3                  ;// e4 - (-d12) = y4
        addps       xmm7, xmm3                  ;// e4 - d12 = y11
        movaps      xmm3, xmm6                  ;// e3
        addps       xmm6, xmm1                  ;// e3 + d11 = y3
        subps       xmm3, xmm1                  ;// e3 - d11 = y12
        movaps      X3, xmm6                    ;// save y3
        movaps      X4, xmm5                    ;// save y4
        movaps      X11, xmm7                   ;// save y11
        movaps      X12, xmm3                   ;// save y12
        movaps      xmm1, X8                    ;// e7
        movaps      xmm3, X9                    ;// e6
        movaps      xmm5, X10                   ;// e5
        movaps      xmm6, xmm3                  ;// e6
        addps       xmm3, xmm4                  ;// e6 + e14 = y6
        subps       xmm6, xmm4                  ;// e6 - e14 = y9
        movaps      xmm4, xmm5                  ;// e5
        addps       xmm5, xmm2                  ;// e5 + e13 = y5
        subps       xmm4, xmm2                  ;// e5 - e13 = y10
        movaps      xmm7, xmm1                  ;// e7
        addps       xmm1, xmm0                  ;// e7 + c15 = y7
        subps       xmm7, xmm0                  ;// e7 - c15 = y8
        movaps      X5, xmm5                    ;// save y5
        movaps      X6, xmm3                    ;// save y6
        movaps      X7, xmm1                    ;// save y7
        movaps      X8, xmm7                    ;// save y8
        movaps      X9, xmm6                    ;// save y9
        movaps      X10, xmm4                   ;// save y10
        add         I, 16
        jnz         ColLoop

        movd        Ctx, mm1

        PROFILE_OUT "IDCT_16x16_SSE2"
        DBG_IDCT_OUT_16x16

        add         DST, 16*16*sizeof(real4)
        jmp         [Ctx].RetIdct

IDCT_16x16_SSE2 ENDP

;//=========================================================================
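;// Dequantization and IDCT on a block of 8x8 samples, producing 8x16 samples
;// params : esp : source pointer (esp is the base address of the coefficient
;//                loads here - see SRC below)
;// params : edi : destination pointer
;// params : eax : pointer to quantization table
;// uses   : ecx (I, loop index) and edx (W, row multiplier table pointer)
;// return : destination pointer += 8*16*sizeof(real4)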
;//=========================================================================

IDCT_8x16_SSE2 PROC

        QTB   EQU <eax>
        SRC   EQU <QPTR [esp]>
        DST   EQU <edi>
        I     EQU <ecx>
        W     EQU <edx>
        X0    EQU <[DST][I+2*16+0*8*4]>
        X1    EQU <[DST][I+2*16+1*8*4]>
        X2    EQU <[DST][I+2*16+2*8*4]>
        X3    EQU <[DST][I+2*16+3*8*4]>
        X4    EQU <[DST][I+2*16+4*8*4]>
        X5    EQU <[DST][I+2*16+5*8*4]>
        X6    EQU <[DST][I+2*16+6*8*4]>
        X7    EQU <[DST][I+2*16+7*8*4]>
        X8    EQU <[DST][I+2*16+8*8*4]>
        X9    EQU <[DST][I+2*16+9*8*4]>
        X10   EQU <[DST][I+2*16+10*8*4]>
        X11   EQU <[DST][I+2*16+11*8*4]>
        X12   EQU <[DST][I+2*16+12*8*4]>
        X13   EQU <[DST][I+2*16+13*8*4]>
        X14   EQU <[DST][I+2*16+14*8*4]>
        X15   EQU <[DST][I+2*16+15*8*4]>

        PROFILE_IN

        ;// apply 8-point IDCT to the rows
        mov         I, -8*4
RowLoop:
        mov         W, TBL_MultRow8x8[I+8*4]   ;// row multiplier
        cvtdq2ps    xmm0, [SRC][8*I+8*8*4]      ;// x3 x2 x1 x0
        cvtdq2ps    xmm1, [SRC][8*I+8*8*4+4*4]  ;// x7 x6 x5 x4
        mulps       xmm0, [QTB][8*I+8*8*4]      ;// x3*q3 x2*q2 x1*q1 x0*q0
        mulps       xmm1, [QTB][8*I+8*8*4+4*4]  ;// x7*q7 x6*q6 x5*q5 x4*q4
        pshufd      xmm2, xmm0, 000000000b      ;// x0 x0 x0 x0
        pshufd      xmm3, xmm0, 010101010b      ;// x2 x2 x2 x2
        pshufd      xmm4, xmm1, 000000000b      ;// x4 x4 x4 x4
        pshufd      xmm5, xmm1, 010101010b      ;// x6 x6 x6 x6
        mulps       xmm2, [W]                   ;// x0*w12 x0*w08 x0*w04 x0*w00
        mulps       xmm3, [W][4*4]              ;// x2*w13 x2*w09 x2*w05 x2*w01
        mulps       xmm4, [W][8*4]              ;// x4*w14 x4*w10 x4*w06 x4*w02
        mulps       xmm5, [W][12*4]             ;// x6*w15 x6*w11 x6*w07 x6*w03
        addps       xmm2, xmm3                  ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -