⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct.asm

📁 这是一个JPEG解码器,里面使用了MMX,SSE等汇编指令集
💻 ASM
📖 第 1 页 / 共 5 页
字号:
        pshufd      xmm1, xmm1, 011111111b      ;// x7 x7 x7 x7
        mulps       xmm3, [W][16*4]             ;// x1*w28 x1*w24 x1*w20 x1*w16
        mulps       xmm0, [W][20*4]             ;// x3*w29 x3*w25 x3*w21 x3*w17
        mulps       xmm4, [W][24*4]             ;// x5*w30 x5*w26 x5*w22 x5*w18
        mulps       xmm1, [W][28*4]             ;// x7*w31 x7*w27 x7*w23 x7*w19
        addps       xmm3, xmm0                  ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
        addps       xmm4, xmm1                  ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
        addps       xmm3, xmm4                  ;// b3 b2 b1 b0
        movaps      xmm4, xmm2                  ;// a3 a2 a1 a0
        addps       xmm2, xmm3                  ;// y3 y2 y1 y0
        subps       xmm4, xmm3                  ;// y12 y13 y14 y15
        shufps      xmm4, xmm4, 000011011b      ;// y15 y14 y13 y12
        movaps      [DST][J+8*16*4], xmm2      ;// save y3 y2 y1 y0
        movaps      [DST][J+8*16*4+12*4], xmm4 ;// save y15 y14 y13 y12
        pshufd      xmm2, xmm6, 000000000b      ;// x0 x0 x0 x0
        pshufd      xmm3, xmm6, 010101010b      ;// x2 x2 x2 x2
        pshufd      xmm4, xmm7, 000000000b      ;// x4 x4 x4 x4
        pshufd      xmm5, xmm7, 010101010b      ;// x6 x6 x6 x6
        mulps       xmm2, [W][32*4]             ;// x0*w44 x0*w40 x0*w36 x0*w32
        mulps       xmm3, [W][36*4]             ;// x2*w45 x2*w41 x2*w37 x2*w33
        mulps       xmm4, [W][40*4]             ;// x4*w46 x4*w42 x4*w38 x4*w34
        mulps       xmm5, [W][44*4]             ;// x6*w47 x6*w43 x6*w39 x6*w35
        addps       xmm2, xmm3                  ;// x0*w44+x2*w45 x0*w40+x2*w41 x0*w36+x2*w37 x0*w32+x2*w33
        addps       xmm4, xmm5                  ;// x4*w46+x6*w47 x4*w42+x6*w43 x4*w38+x6*w39 x4*w34+x6*w35
        pshufd      xmm3, xmm6, 001010101b      ;// x1 x1 x1 x1
        addps       xmm2, xmm4                  ;// a7 a6 a5 a4
        pshufd      xmm6, xmm6, 011111111b      ;// x3 x3 x3 x3
        pshufd      xmm4, xmm7, 001010101b      ;// x5 x5 x5 x5
        pshufd      xmm7, xmm7, 011111111b      ;// x7 x7 x7 x7
        mulps       xmm3, [W][48*4]             ;// x1*w60 x1*w56 x1*w52 x1*w48
        mulps       xmm6, [W][52*4]             ;// x3*w61 x3*w57 x3*w53 x3*w49
        mulps       xmm4, [W][56*4]             ;// x5*w62 x5*w58 x5*w54 x5*w50
        mulps       xmm7, [W][60*4]             ;// x7*w63 x7*w59 x7*w55 x7*w51
        addps       xmm3, xmm6                  ;// x1*w60+x3*w61 x1*w56+x3*w57 x1*w52+x3*w53 x1*w48+x3*w49
        addps       xmm4, xmm7                  ;// x5*w62+x7*w63 x5*w58+x7*w59 x5*w54+x7*w55 x5*w50+x7*w51
        addps       xmm3, xmm4                  ;// b7 b6 b5 b4
        movaps      xmm4, xmm2                  ;// a7 a6 a5 a4
        addps       xmm2, xmm3                  ;// y7 y6 y5 y4
        subps       xmm4, xmm3                  ;// y8 y9 y10 y11
        shufps      xmm4, xmm4, 000011011b      ;// y11 y10 y9 y8
        movaps      [DST][J+8*16*4+4*4], xmm2   ;// save y7 y6 y5 y4
        movaps      [DST][J+8*16*4+8*4], xmm4   ;// save y11 y10 y9 y8
        add         J, 16*4
        add         I, 4
        jnz         RowLoop

        ;// apply 8-point IDCT to the columns
        mov         I, -4*16
ColLoop:
        movaps      xmm0, X1                    ;// x1
        movaps      xmm1, X7                    ;// x7
        movaps      xmm2, xmm0                  ;// x1
        mulps       xmm0, TBL_tg1_16            ;// x1*tg1
        movaps      xmm3, xmm1                  ;// x7
        mulps       xmm1, TBL_tg1_16            ;// x7*tg1
        subps       xmm0, xmm3                  ;// tp465
        addps       xmm2, xmm1                  ;// tp765
        movaps      xmm1, X3                    ;// x3
        movaps      xmm3, X5                    ;// x5
        movaps      xmm4, xmm1                  ;// x3
        mulps       xmm1, TBL_tg3_16            ;// x3*tg3
        movaps      xmm5, xmm3                  ;// x5
        mulps       xmm3, TBL_tg3_16            ;// x5*tg3
        subps       xmm5, xmm1                  ;// tm465
        addps       xmm3, xmm4                  ;// tm765
        movaps      xmm1, xmm2                  ;// tp765
        movaps      xmm4, xmm0                  ;// tp465
        addps       xmm2, xmm3                  ;// t7
        subps       xmm1, xmm3                  ;// tp65
        addps       xmm0, xmm5                  ;// t4
        subps       xmm4, xmm5                  ;// tm65
        movaps      xmm3, xmm4                  ;// tm65
        addps       xmm4, xmm1                  ;// tp65 + tm65
        subps       xmm1, xmm3                  ;// tp65 - tm65
        mulps       xmm4, TBL_cos4_16           ;// t6
        mulps       xmm1, TBL_cos4_16           ;// t5
        movaps      X1, xmm4                    ;// t6
        movaps      X3, xmm0                    ;// t4
        movaps      xmm5, X4                    ;// x4
        movaps      xmm3, X0                    ;// x0
        movaps      xmm6, xmm5                  ;// x4
        addps       xmm5, xmm3                  ;// tp03
        subps       xmm3, xmm6                  ;// tp12
        movaps      xmm0, X2                    ;// x2
        movaps      xmm4, X6                    ;// x6
        movaps      xmm6, xmm0                  ;// x2
        mulps       xmm0, TBL_tg2_16            ;// x2*tg2
        movaps      xmm7, xmm4                  ;// x6
        mulps       xmm4, TBL_tg2_16            ;// x6*tg2
        subps       xmm0, xmm7                  ;// tm12
        addps       xmm4, xmm6                  ;// tm03
        movaps      xmm6, xmm0                  ;// tm12
        addps       xmm0, xmm3                  ;// t1
        subps       xmm3, xmm6                  ;// t2
        movaps      xmm6, xmm4                  ;// tm03
        addps       xmm4, xmm5                  ;// t0
        subps       xmm5, xmm6                  ;// t3
        movaps      xmm6, xmm2                  ;// t7
        addps       xmm2, xmm4                  ;// t0 + t7
        subps       xmm4, xmm6                  ;// t0 - t7
        movaps      xmm7, xmm1                  ;// t5
        addps       xmm1, xmm3                  ;// t2 + t5
        subps       xmm3, xmm7                  ;// t2 - t5
        movaps      xmm6, X3                    ;// t4
        movaps      xmm7, xmm5                  ;// t3
        subps       xmm5, xmm6                  ;// t3 - t4
        addps       xmm7, xmm6                  ;// t3 + t4
        movaps      X0, xmm2                    ;// save y0
        movaps      X7, xmm4                    ;// save y7
        movaps      X2, xmm1                    ;// save y2
        movaps      X5, xmm3                    ;// save y5
        movaps      X4, xmm5                    ;// save y4
        movaps      X3, xmm7                    ;// save y3
        movaps      xmm2, X1                    ;// t6
        movaps      xmm3, xmm0                  ;// t1
        addps       xmm0, xmm2                  ;// t1 + t6
        subps       xmm3, xmm2                  ;// t1 - t6
        movaps      X1, xmm0                    ;// save y1
        movaps      X6, xmm3                    ;// save y6
        add         I, 16
        jnz         ColLoop          

        movd        Ctx, mm1

        PROFILE_OUT "IDCT_16x8_SSE2"
        DBG_IDCT_OUT_16x8

        add         DST, 16*8*sizeof(real4)
        jmp         [Ctx].RetIdct

IDCT_16x8_SSE2 ENDP

;//=========================================================================
;// Dequantization and IDCT on a block of 8x8 samples, output 8x8 samples
;// params : esp : source pointer (esp itself is used as the base address
;//                of the 32-bit integer input coefficients)
;// params : edi : destination pointer
;// params : eax : pointer to quantization table (packed single-precision
;//                multipliers, applied with mulps)
;// return : destination pointer += 8*8*sizeof(real4)
;//=========================================================================

IDCT_8x8_SSE PROC

        QTB   EQU <eax>
        SRC   EQU <QPTR [esp]>
        DST   EQU <edi>
        I     EQU <ecx>
        W     EQU <edx>
        X0    EQU [DST][I+2*16+0*8*4]
        X1    EQU [DST][I+2*16+1*8*4]
        X2    EQU [DST][I+2*16+2*8*4]
        X3    EQU [DST][I+2*16+3*8*4]
        X4    EQU [DST][I+2*16+4*8*4]
        X5    EQU [DST][I+2*16+5*8*4]
        X6    EQU [DST][I+2*16+6*8*4]
        X7    EQU [DST][I+2*16+7*8*4]

        DBG_IDCT_IN
        PROFILE_IN

        ;// apply IDCT to the rows
        mov         I, -8*4
RowLoop:
        mov         W, TBL_MultRow8x8[I+8*4]   ;// row multiplier
        cvtpi2ps    xmm0, [SRC][8*I+8*8*4]      ;//  _  _ x1 x0
        cvtpi2ps    xmm2, [SRC][8*I+8*8*4+4*2]  ;//  _  _ x3 x2
        cvtpi2ps    xmm1, [SRC][8*I+8*8*4+4*4]  ;//  _  _ x5 x4
        cvtpi2ps    xmm3, [SRC][8*I+8*8*4+4*6]  ;//  _  _ x7 x6
        shufps      xmm0, xmm2, 001000100b      ;// x3 x2 x1 x0
        shufps      xmm1, xmm3, 001000100b      ;// x7 x6 x5 x4
        mulps       xmm0, [QTB][8*I+8*8*4]      ;// x3*q3 x2*q2 x1*q1 x0*q0
        mulps       xmm1, [QTB][8*I+8*8*4+4*4]  ;// x7*q7 x6*q6 x5*q5 x4*q4
        movaps      xmm2, xmm0                  ;// x3 x2 x1 x0
        movaps      xmm3, xmm0                  ;// x3 x2 x1 x0
        movaps      xmm4, xmm1                  ;// x7 x6 x5 x4
        movaps      xmm5, xmm1                  ;// x7 x6 x5 x4
        shufps      xmm2, xmm2, 000000000b      ;// x0 x0 x0 x0
        shufps      xmm3, xmm3, 010101010b      ;// x2 x2 x2 x2
        shufps      xmm4, xmm4, 000000000b      ;// x4 x4 x4 x4
        shufps      xmm5, xmm5, 010101010b      ;// x6 x6 x6 x6
        mulps       xmm2, [W]                   ;// x0*w12 x0*w08 x0*w04 x0*w00
        mulps       xmm3, [W][4*4]              ;// x2*w13 x2*w09 x2*w05 x2*w01
        mulps       xmm4, [W][8*4]              ;// x4*w14 x4*w10 x4*w06 x4*w02
        mulps       xmm5, [W][12*4]             ;// x6*w15 x6*w11 x6*w07 x6*w03
        addps       xmm2, xmm3                  ;// x0*w12+x2*w13 x0*w08+x2*w09 x0*w04+x2*w05 x0*w00+x2*w01
        addps       xmm4, xmm5                  ;// x4*w14+x6*w15 x4*w10+x6*w11 x4*w06+x6*w07 x4*w02+x6*w03
        addps       xmm2, xmm4                  ;// a3 a2 a1 a0
        movaps      xmm3, xmm0                  ;// x3 x2 x1 x0
        movaps      xmm4, xmm1                  ;// x7 x6 x5 x4
        shufps      xmm3, xmm3, 001010101b      ;// x1 x1 x1 x1
        shufps      xmm0, xmm0, 011111111b      ;// x3 x3 x3 x3
        shufps      xmm4, xmm4, 001010101b      ;// x5 x5 x5 x5
        shufps      xmm1, xmm1, 011111111b      ;// x7 x7 x7 x7
        mulps       xmm3, [W][16*4]             ;// x1*w28 x1*w24 x1*w20 x1*w16
        mulps       xmm0, [W][20*4]             ;// x3*w29 x3*w25 x3*w21 x3*w17
        mulps       xmm4, [W][24*4]             ;// x5*w30 x5*w26 x5*w22 x5*w18
        mulps       xmm1, [W][28*4]             ;// x7*w31 x7*w27 x7*w23 x7*w19
        addps       xmm3, xmm0                  ;// x1*w28+x3*w29 x1*w24+x3*w25 x1*w20+x3*w21 x1*w16+x3*w17
        addps       xmm4, xmm1                  ;// x5*w30+x7*w31 x5*w26+x7*w27 x5*w22+x7*w23 x5*w18+x7*w19
        addps       xmm3, xmm4                  ;// b3 b2 b1 b0
        movaps      xmm4, xmm2                  ;// a3 a2 a1 a0
        addps       xmm2, xmm3                  ;// y3 y2 y1 y0
        subps       xmm4, xmm3                  ;// y4 y5 y6 y7
        shufps      xmm4, xmm4, 000011011b      ;// y7 y6 y5 y4
        movaps      [DST][8*I+8*8*4], xmm2      ;// save y3 y2 y1 y0
        movaps      [DST][8*I+8*8*4+16], xmm4   ;// save y7 y6 y5 y4
        add         I, 4
        jnz         RowLoop

        ;// apply IDCT to the columns
        mov         I, -2*16
ColLoop:
        movaps      xmm0, X1                    ;// x1
        movaps      xmm1, X7                    ;// x7
        movaps      xmm2, xmm0                  ;// x1
        mulps       xmm0, TBL_tg1_16            ;// x1*tg1
        movaps      xmm3, xmm1                  ;// x7
        mulps       xmm1, TBL_tg1_16            ;// x7*tg1
        subps       xmm0, xmm3                  ;// tp465
        addps       xmm2, xmm1                  ;// tp765
        movaps      xmm1, X3                    ;// x3
        movaps      xmm3, X5                    ;// x5
        movaps      xmm4, xmm1                  ;// x3
        mulps       xmm1, TBL_tg3_16            ;// x3*tg3
        movaps      xmm5, xmm3                  ;// x5
        mulps       xmm3, TBL_tg3_16            ;// x5*tg3
        subps       xmm5, xmm1                  ;// tm465
        addps       xmm3, xmm4                  ;// tm765
        movaps      xmm1, xmm2                  ;// tp765
        movaps      xmm4, xmm0                  ;// tp465
        addps       xmm2, xmm3                  ;// t7
        subps       xmm1, xmm3                  ;// tp65
        addps       xmm0, xmm5                  ;// t4
        subps       xmm4, xmm5                  ;// tm65
        movaps      xmm3, xmm4                  ;// tm65
        addps       xmm4, xmm1                  ;// tp65 + tm65
        subps       xmm1, xmm3                  ;// tp65 - tm65
        mulps       xmm4, TBL_cos4_16           ;// t6
        mulps       xmm1, TBL_cos4_16           ;// t5
        movaps      X1, xmm4                    ;// t6
        movaps      X3, xmm0                    ;// t4
        movaps      xmm5, X4                    ;// x4
        movaps      xmm3, X0                    ;// x0
        movaps      xmm6, xmm5                  ;// x4
        addps       xmm5, xmm3                  ;// tp03
        subps       xmm3, xmm6                  ;// tp12
        movaps      xmm0, X2                    ;// x2
        movaps      xmm4, X6                    ;// x6
        movaps      xmm6, xmm0                  ;// x2
        mulps       xmm0, TBL_tg2_16            ;// x2*tg2
        movaps      xmm7, xmm4                  ;// x6
        mulps       xmm4, TBL_tg2_16            ;// x6*tg2
        subps       xmm0, xmm7                  ;// tm12
        addps       xmm4, xmm6                  ;// tm03
        movaps      xmm6, xmm0                  ;// tm12
        addps       xmm0, xmm3                  ;// t1
        subps       xmm3, xmm6                  ;// t2
        movaps      xmm6,

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -