idct_ap922sse_opt.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,367 行 · 第 1/3 页
CPP
1,367 行
    ;

   cvtps2pi mm6, xmm3;          ;// (6b) mm6 <= y[2] low
    movhlps xmm3, xmm3;

   cvtps2pi mm7, xmm3;          ;// (6b) mm7 <= y[2] high
// xmm3 free
// xmm3 used
    movaps xmm3, [INPC + 5*32] ; // col+4 (1) xmm3 <= dp[7]

   subps xmm7, xmm2;            ;// (5) xmm7 <= y[7] = f[0] - f[4]
// xmm2 free
// xmm2 used
     movaps xmm2, xmm6;         ;// col+4 (2a) xmm2 <= dp[6]

    // 2) Apply B_T (for columns 4-7)
    //
    //    b[4] =   dp[4] + ( dp[5]*tg_1_16f );
    //    b[5] = - dp[5] + ( dp[4]*tg_1_16f );
    //    b[6] =   dp[6] + ( dp[7]*tg_3_16f );
    //    b[7] = - dp[7] + ( dp[6]*tg_3_16f );

   psraw mm4, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    mulps xmm6, [_PTAN3];       ;// col+4 (2a) xmm6 <= (dp[6]*tg_3_16f)

   addps xmm5, xmm0;            ;// (6a) ROUND compensation
    movq [OUTC + 4*16], mm4;    ;// y[4] stored

   packssdw mm6, mm7;           ;// (6c) mm6 <= y[2]
    ;

   addps xmm4, xmm0             ;// (6a) ROUND compensation
    ;

   psraw mm6, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   cvtps2pi mm0, xmm5;          ;// (6b) mm0 <= y[5] low
    movhlps xmm5, xmm5;

   cvtps2pi mm1, xmm5;          ;// (6b) mm1 <= y[5] high
// xmm5 free
// xmm5 used
   movaps xmm5, [INPC + 7*32];  // col+4 (1) xmm5 <= dp[5]

   cvtps2pi mm2, xmm4;          ;// (6b) mm2 <= y[6] low
    movhlps xmm4, xmm4;

   cvtps2pi mm3, xmm4;          ;// (6b) mm3 <= y[6] high
// xmm4 free
// xmm4 used
    movaps xmm4, [INPC + 1*32] ; // col+4 (1) xmm4 <= dp[4]

   movq [OUTC + 2*16], mm6;    ;// y[2] stored
    addps xmm1, xmm0             ;// (6a) ROUND compensation

   packssdw mm0, mm1;           ;// (6c) mm0 <= y[5]
    ;

   addps xmm7, xmm0             ;// (6a) ROUND compensation
// xmm0 free
// xmm0 used
    movaps xmm0, xmm4;          // col+4 (1) xmm0 <= dp[4]

   psraw mm0, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    mulps xmm4, [_PTAN1];    ;// col+4 (2a) xmm4 <= (dp[4]*tg_1_16f)

   cvtps2pi mm4, xmm1;          ;// (6b) mm4 <= y[0] low
    movhlps xmm1, xmm1;

   packssdw mm2, mm3;           ;// (6c) mm2 <= y[6]
    ;

   cvtps2pi mm5, xmm1;          ;// (6b) mm5 <= y[0] high
// xmm1 free
    movq [OUTC + 5*16], mm0;

// xmm1 used
   movaps xmm1, xmm5           ;// col+4 (2a) xmm1 <= dp[5] 
    psraw mm2, DESCALE_SHIFT2;   // (6d) clip y[] to {-256,+255}

   cvtps2pi mm6, xmm7;         ;// (6b) mm6 <= y[7] low
    movhlps xmm7, xmm7;

   cvtps2pi mm7, xmm7;         ;// (6b) mm7 <= y[7] high
// xmm7 free
    mulps xmm5, [_PTAN1];    ;// col+4 (2a) xmm5 <= (dp[5]*tg_1_16f)

   subps xmm4, xmm1;   ;// col+4 (2b) xmm4 <= b[5] = (dp[4]*tg_1_16f) - dp[5]
// xmm1 free
    movaps xmm7, xmm3; ;// col+4 (2a) xmm7 <= dp[7]

   packssdw mm4, mm5;          ;// (6c) mm4 <= y[0]
    mulps xmm7, [_PTAN3];    ;// col+4 (2a) xmm7 <= (dp[7]*tg_3_16f)

   subps xmm6, xmm3;    ;// col+4 (2b) xmm6 <= b[7] = (dp[6]*tg_1_16f) - dp[7]
// xmm3 free
    movq [OUTC + 6*16], mm2;
 
// xmm3 used
   movaps xmm3, [INPC + 6*32]   ;// col+4 (2a) xmm3 <= dp[3]
    psraw mm4, DESCALE_SHIFT2;   // (6d) clip y[] to {-256,+255}

// xmm1 used
   movaps xmm1, xmm4;   ;// col+4 (2b) xmm1 <= b[5]
    addps xmm5, xmm0;  ;// col+4 (2b) xmm5 <= b[4] = (dp[5]*tg_1_16f) + dp[4]

   movq [OUTC + 0*16], mm4;
    packssdw mm6, mm7;          ;// (6c) mm6 <= y[7]

   psraw mm6, DESCALE_SHIFT2;   // (6d) clip y[] to {-256,+255}
    addps xmm7, xmm2;   // col+4 (2b) xmm7 <= b[6] = (dp[7]*tg_1_16f) + dp[6]
// xmm2 free

// xmm0 used
   movaps xmm0, xmm5;      ;// col+4 (2b) xmm0 <= b[4]

 
    // 3) Apply E_T
    //
    //     e[4] = b[4] + b[6];
    //     e[5] = b[4] - b[6];
    //     e[6] = b[5] + b[7];
    //     e[7] = b[5] - b[7];

// xmm2 used
     movaps xmm2, xmm3;          ;// col+4 (2a) xmm2 <= dp[3]

    addps xmm1, xmm6;          ;// col+4 (3) xmm1 <= e[6] = b[5] + b[7]
     movq [OUTC + 7*16], mm6;

   ////////////////////////////////////
   // end of iDCT column 0-3 processing
   ////////////////////////////////////

    mulps xmm3, [_PTAN2];       ;// (2a) xmm3 <= (dp[3]*tg_2_16f)
     add OUTC, 8;               ;// advance output to columns 4-7

    subps xmm5, xmm7;           ;// (3) xmm5 <= e[5] = b[4] - b[6]

    ;

    subps xmm4, xmm6;           ;// (3) xmm4 <= e[7] = b[5] - b[7]
// xmm6 free
// xmm6 used
     movaps xmm6, [INPC + 2*32] ;// (2a) xmm6 <= dp[2]

    ;

    addps xmm0, xmm7;           ;// (3) xmm0 <= e[4] = b[4] + b[6]
// xmm7 free
// xmm7 used
     movaps xmm7, xmm5;         ;// (3) xmm7 <= e[5]

    // 4) Apply F_T
    //
    //    f[4] = e[4]
    //    f[5] = (e[5] + e[6]) * cos_4_16f;
    //    f[6] = (e[5] - e[6]) * cos_4_16f;
    //    f[7] = e[7]

// xmm0 <= f[4] = e[4]
// xmm4 <= f[7] = e[7]

    ;

    addps xmm5, xmm1;           ;// (4a) xmm5 <= (e[5]+e[6])

    ;

    subps xmm7, xmm1;           ;// (4a) xmm7 <= (e[5]-e[6])
// xmm1 free
// xmm1 used
     movaps xmm1, [INPC + 0*32] ;// (1) xmm1 <= dp[0]

    movaps [SCRATCH], xmm0;         ;// store f[4] in scratch space
// xmm0 free (in scratch)

    mulps xmm5, [_PCOS4]     ;// (4b) xmm5 <= f[5] = (e[5] + e[6]) * cos_4_16f;
     addps xmm3, xmm6;       ;// (2b) xmm3 <= b[2] = (dp[3]*tg_2_16f) + dp[2]

//movups [SCRATCH+3*16], xmm4;  ;// store f[7] in scratch space
// xmm4 f[7] free (in scratch)

    ;

    mulps xmm6, [_PTAN2];       ;// (2a) xmm6 <= (dp[2]*tg_2_16f)

    ;

    mulps xmm7, [_PCOS4]        ;// (4b) xmm7 <= f[6] = (e[5] - e[6]) * cos_4_16f;
    
    ;

    subps xmm6, xmm2;           ;// (2b) xmm6 <= b[3] = (dp[2]*tg_2_16f) - dp[3]
// xmm2 free
// xmm2 used
//movups [SCRATCH+1*16], xmm5;  ;// xmm5 <= store f[5] in scratch space
// xmm5 f[5] free (in scratch)
     movaps xmm2, [INPC+4*32];  ;// (1) xmm2 <= dp[1]

// xmm0 used
    movaps xmm0, xmm1;          ;// (1) xmm0 <= dp[0]
     ;

    addps xmm1, xmm2;           ;// (2b) xmm1 <= b[0] = dp[0] + dp[1]

// movups [SCRATCH+2*16], xmm7; ;// xmm7 <= store f[6] in scratch space
// xmm7 f[6] free (in scratch)

    ;

//    movups [INP+1*16], xmm1;  ;// temp store

    subps xmm0, xmm2;           ;// (2b) xmm0 <= b[1] = dp[0] - dp[1]
// xmm2 free

//    e[0] = b[0] + b[2];
//    e[1] = b[1] + b[3];
//    e[2] = b[1] - b[3];
//    e[3] = b[0] - b[2];

    ;

// xmm2 used 
    movaps xmm2, xmm1;          ;// (2b) xmm1 <= b[0]
     addps xmm1, xmm3;          ;// (3) xmm1 <= e[0] = b[0] + b[2];

    ;

    subps xmm2, xmm3;           ;// (3) xmm3 <= e[3] = b[0] - b[2];
// xmm3 free
// xmm3 used
     movaps xmm3, xmm0;         ;// (2b) xmm3 <= b[1]

    ;

    addps xmm0, xmm6;           ;// (3) xmm0 <= e[1] = b[1] + b[3];

    ;

    subps xmm3, xmm6;           ;// (3) xmm3 <= e[2] = b[1] - b[3];
// xmm6 free
// xmm6 used
     movaps xmm6, xmm2;         ;// (4) xmm6 <= f[3]

// xmm1 <= f[0]
// xmm0 <= f[1]
// xmm3 <= f[2]
// xmm2 <= f[3]
/*

    y[0*8] = SHIFT_ROUND_COLF( _F0 + _F4 );
    y[1*8] = SHIFT_ROUND_COLF( _F1 + _F5 );
    y[2*8] = SHIFT_ROUND_COLF( _F2 + _F6 );
    y[3*8] = SHIFT_ROUND_COLF( _F3 + _F7 );

    y[4*8] = SHIFT_ROUND_COLF( _F3 - _F7 );
    y[5*8] = SHIFT_ROUND_COLF( _F2 - _F6 );
    y[6*8] = SHIFT_ROUND_COLF( _F1 - _F5 );
    y[7*8] = SHIFT_ROUND_COLF( _F0 - _F4 );

    6a) round compensation, adjust float value to correct round alignment
        addps (ROUND_COMP - 0.5)

    6b) float -> int32 conversion (cvtps2pi)
       cvtps2pi mm0, xmm0;  // mm0 <= int32 versions of y[1 0]
        movhlps xmm0, xmm0; 
       cvtps2pi mm1, xmm0;  // mm1 <= int32 versions of y[3 2]

    6c) int32 -> int16 conversion (packssdw)
        packssdw mm0, mm1; // mm0 <= int16 versions of y[3 2 1 0]

    6d) int16 decsale (psraw)
       psraw mm0, DESCALE_SHIFT2; // mm0 <= final descaled output, {-256,+255}

    ** The descale shift is chosen to implement a {-256,+255} output range.

*/
    ;

    addps xmm2, xmm4;           ;// (5) xmm2 <= y[3] = f[3] + f[7]

    ;

    subps xmm6, xmm4;           ;// (5) xmm6 <= y[4] = f[3] - f[7]
// xmm4 free
// xmm4 used
     movaps xmm4, xmm0;         ;// xmm0 <= f[1]

    ;

    addps xmm2, [_PRND_INV_ROW]; // (6a) ROUND compensation

    ;

    addps xmm0, xmm5;           ;// (5) xmm0 <= y[1] = f[1] + f[5]

    ;

   cvtps2pi mm0, xmm2;          ;// (6b) mm0 <= y[3] low
    movhlps xmm2, xmm2;

   cvtps2pi mm1, xmm2;          ;// (6b) mm1 <= y[3] high
    ;

// xmm2 free
   addps xmm0, [_PRND_INV_ROW]; ;// (6a) ROUND compensation

   ;

   subps xmm4, xmm5;            ;// (5) xmm4 <= y[6] = f[1] - f[5]
// xmm5 free
// xmm2 used
    movaps xmm2, [SCRATCH];     ;// xmm2 <= f[4]

   packssdw mm0, mm1;           ;// (6c) mm0 <= y[3] 
    ;

   cvtps2pi mm2, xmm0;          ;// (6b) mm2 <= y[1] low
    movhlps xmm0, xmm0;

   cvtps2pi mm3, xmm0;          ;// (6b) mm3 <= y[1] high
// xmm5 used
    movaps xmm5, xmm3;          ;// xmm5 <= f[2]

   addps xmm3, xmm7;            ;// (5) xmm3 <= y[2] = f[2] + f[6]
    ;

   psraw mm0, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   addps xmm6, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    ;

   packssdw mm2, mm3;           ;// (6c) mm2 <= y[1]
    ;

   subps xmm5, xmm7;            ;// (5) xmm5 <= y[5] = f[2] - f[6]
// xmm7 free
    movq [OUTC + 3*16], mm0;    ;// mm0 <= y[3] stored

   psraw mm2, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   cvtps2pi mm4, xmm6;          ;// (6b) mm4 <= y[4] low
    movhlps xmm6, xmm6;

   cvtps2pi mm5, xmm6;          ;// (6b) mm5 <= y[4] high
// xmm6 free
    ;
  
   addps xmm3, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    movq [OUTC + 1*16], mm2;    ;// y[1] stored

// xmm7 used
   movaps xmm7, xmm1;           ;// xmm7 <= f[0]
    ;

   addps xmm1, xmm2;            ;// (5) xmm2 <= y[0] = f[0] + f[4]
    ;

   packssdw mm4, mm5;           ;// (6c) mm4 <= y[4]
    ;

   cvtps2pi mm6, xmm3;          ;// (6b) mm6 <= y[2] low
    movhlps xmm3, xmm3;

   cvtps2pi mm7, xmm3;          ;// (6b) mm7 <= y[2] high
// xmm3 free
    ;

   subps xmm7, xmm2;            ;// (5) xmm7 <= y[7] = f[0] - f[4]
// xmm2 free

   psraw mm4, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   addps xmm5, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    movq [OUTC + 4*16], mm4;    ;// y[4] stored

   packssdw mm6, mm7;           ;// (6c) mm6 <= y[2]
    ;

   addps xmm4, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    add INPC, 16;

   psraw mm6, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   cvtps2pi mm0, xmm5;          ;// (6b) mm0 <= y[5] low
    movhlps xmm5, xmm5;

   cvtps2pi mm1, xmm5;          ;// (6b) mm1 <= y[5] high
// xmm5 free
    movq [OUTC + 2*16], mm6;    ;// y[2] stored

   cvtps2pi mm2, xmm4;          ;// (6b) mm2 <= y[6] low
    movhlps xmm4, xmm4;

   cvtps2pi mm3, xmm4;          ;// (6b) mm3 <= y[6] high
// xmm4 free
    ;

   addps xmm1, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    ;

   packssdw mm0, mm1;           ;// (6c) mm0 <= y[5]
    ;

   addps xmm7, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    ;

   psraw mm0, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   cvtps2pi mm4, xmm1;          ;// (6b) mm4 <= y[0] low
    movhlps xmm1, xmm1;

   packssdw mm2, mm3;           ;// (6c) mm2 <= y[6]
    ;

   cvtps2pi mm5, xmm1;          ;// (6b) mm5 <= y[0] high
// xmm1 free
    movq [OUTC + 5*16], mm0;

   psraw mm2, DESCALE_SHIFT2;   // (6d) clip y[] to {-256,+255}
    ;

   cvtps2pi mm6, xmm7;         ;// (6b) mm6 <= y[7] low
    movhlps xmm7, xmm7;

   cvtps2pi mm7, xmm7;         ;// (6b) mm7 <= y[7] high
// xmm7 free
    ;

   packssdw mm4, mm5;          ;// (6c) mm4 <= y[0]
    ;

   psraw mm4, DESCALE_SHIFT2;   // (6d) clip y[] to {-256,+255}
    movq [OUTC + 6*16], mm2;

   packssdw mm6, mm7;          ;// (6c) mm6 <= y[7]
    ;

   psraw mm6, DESCALE_SHIFT2;   // (6d) clip y[] to {-256,+255}
    movq [OUTC + 0*16], mm4;

   movq [OUTC + 7*16], mm6;

 //   ldmxcsr [_mxcsr_backup]; // restore state

   emms;
   }
}
idct_ap922sse_opt.cpp - 源码说明

本页面展示了「这是一组DCT和iDCT的代码」中的 idct_ap922sse_opt.cpp 源码文件，采用 C++ 编程语言编写，共 1,367 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与iDCT相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?