⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct_ap922sse_opt.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
//     movups [OUTR + 1*16], xmm5;  ;// (2b) store [y7 y6 y5 y4
// xmm5 free
    cmp edi, 0x07;
       // end for ( x = 0; x < 7; ++x )  // transform one row per iteration
    jl  acc_idct_rowloop1;

// Since the above code integrates processing for rows(x) and (x+1),
// the last row(#7) must be "unrolled" without the (x+1) code.

    //
    //
    /////////////////////////////////////////////

    /////////////////////////////////////////////
    //
    //  iDCT row #7

    addps xmm1, xmm0;           ;// (1b) xmm1 <= a_v0 + a_v1
// xmm0 free
     pand mm1, mm7;              // (0) mm1 <=[x3 x3]*SCALEUP (dword)

// this prefetch retrieves the TAN1, TAN2, TAN3, COS4 coefficients
// from the upcoming col_idct() operation
    prefetcht0 [TABLEP1 + 0*32] ;// x+1 prefetch coeff COL COEFFICIENTS
     pshufw mm0, mm4, 01010101b; // (0) mm0 <= [x5 x5 x5 x5]

    cvtpi2ps xmm5, mm1;          // (0) xmm5 <=[__ __ x3 x3] float
     pand mm0, mm7;              // (0) mm0 <= [x5 x5]*SCALEUP (dword)
// mm1 free

    cvtpi2ps xmm6, mm0;          // (0) xmm6 <=[__ __ x5 x5] float
 // mm0 free
     movlhps xmm4, xmm4;         // (0) xmm4 <= [x1 x1 x1 x1]

    mulps xmm4, [TABLE + 4*16]  ;// (1a) xmm4 <= b_v0
     pshufw mm1, mm4, 11111111b; // (0) mm1 <= [x7 x7 x7 x7]
// mm4 free

    addps xmm2, xmm1;           ;// (1b) xmm2 + a_v2 <= (a_v0+a_v1) + a_v2
// xmm1 free
     movlhps xmm5, xmm5;        ;// (0) xmm5 <=[x3 x3 x3 x3] float

    mulps xmm5, [TABLE + 5*16]  ;// (1a) xmm5 <= b_v1
     pand mm1, mm7;             ;// (0) mm1 <= [x7 x7]*SCALEUP (dword)

    cvtpi2ps xmm7, mm1;         ;// (0) xmm7 <= [__ __ x7 x7]
// mm1 free
     movlhps xmm6, xmm6;         // (0) xmm6 <= [x5 x5 x5 x5] float

    mulps xmm6, [TABLE + 6*16]  ;// (1a) xmm6 <= b_v2
     add OUTR, 32;              ;// OUT <= OUT + 32 (increment row) 

    addps xmm3, xmm2;           ;// (1b) xmm3 + a_v3 <=(a_v0+a_v1+a_v2) + a_v3
// xmm3 = complete a-vector ( a_v[3..0] )
// xmm2 free

// this prefetch retrieves the TAN1, TAN2, TAN3, COS4 coefficients
// from the upcoming col_idct() operation
    prefetcht0 [TABLEP1 + 1*32] ;// prefetch COL coefficients
     movlhps xmm7, xmm7;
// TABLEP1 (esi) free

    mulps xmm7, [TABLE + 7*16]  ;// (1a) xmm7 <= b_v3
     addps xmm5, xmm4;          ;// (1b) xmm5 <= b_v0 + b_v1
// xmm4 free

;/////////////////////////////////////////////////////////////////////
;//
;//
;// Begin iDCT column transform (columns 0-3)
;//
;//
;/////////////////////////////////////////////////////////////////////
    add TABLE, 128; // COL SETUP
     mov edi, 0x00; // i = 0; // COL SETUP

  //  TABLE now points to the proper location, no need to reload with lea
  //  lea TABLE, [tab_i_01234567ssef];; // COL SETUP
  //  add TABLE, 1024; ; // COL SETUP// increment TABLE to the COS4 entry

#define SCRATCH esi  // column iDCT
#define INPC eax     // column iDCT
#define OUTC edx     // column iDCT
#define _PCOS4        TABLE   
#define _PTAN1        TABLE+16
#define _PTAN2        TABLE+32
#define _PTAN3        TABLE+48
#define _PRND_INV_ROW TABLE+64

    mov INPC, dword ptr [fTempArray]; ; // COL SETUP
     addps xmm6, xmm5;           // (1b) xmm6 + b_v2 <= (b_v0+b_v1) + b_v2
// xmm5 free

//    lea SCRATCH, [scratch]    ; // COL SETUP
    mov SCRATCH, INPC;           ;// SCRATCH PAD writes over temp data
// xmm4 used
     movaps xmm4, xmm3;           // (1b) xmm4 <= a_v[3..0] 

    movaps xmm0, [INPC + 1*32] ; // col (1) xmm0 <= dp[4]
// xmm5 used
     movaps xmm5, xmm3;         ;// xmm5 <= a_v[3..0]
// xmm3 free

    addps xmm7, xmm6;            // (1b) xmm7 + b_v3 <=(b_v0+b_v1+b_v2) + b_v3
// xmm6 free
// xmm7 = complete b-vector ( b_v[3..0] )
     ;//

// output y[4..7] = a[7..4] + b[7..4], output order must be reversed!
    movaps xmm2, [INPC + 3*32];  // col (1) xmm2 <= dp[6]
     ;

    ;

    movaps xmm3, [INPC + 5*32] ; // col (1) xmm3 <= dp[7]
     ;

    addps xmm4, xmm7;      ;// (2a) xmm4 <= a_v[3..0]+b_v[3..0] (y3 y2 y1 y0)
     ;

    ;// COLUMN IDCT
    ;// 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
    ;//    in the idct_row operation.  But we still need to apply the
    ;//    shuffling operation of D_T.

    ;
     ;

// output y[3..0] = a[3..0] + b[3..0]
    subps xmm5, xmm7;       ;// (2a) xmm5 <= a_v[3..0]-b_v[3..0] (y4 y5 y6 y7)
// xmm7 free

// fix output order, y[4..7] -> y[7..4]
    shufps xmm5, xmm5, 00011011b; // (2a) xmm5 [y7 y6 y5 y4] <- [y4 y5 y6 y7]
     ;

    movaps xmm6, xmm2;         ;// col (2a) xmm6 <= dp[6]
  // movaps xmm1, [INPC + 7*32];  // col (1) xmm1 <= dp[5]
     movaps xmm1, xmm4;            // col (1) xmm1 <= dp[5]

// !*@&#@ Visual C++ refuses to align data to 16-byte offset, SIGH

    movaps [OUTR + 0*16], xmm4;  // (2b) store [y3 y2 y1 y0]
     movaps xmm4, xmm0;          // col (1) xmm4 <= dp[4]
//     movups [OUTR + 0*16], xmm4;   // (2b) store [y3 y2 y1 y0]
// xmm4 free
// xmm4 used 
    

    // 2) Apply B_T
    //
    //    b[4] =   dp[4] + ( dp[5]*tg_1_16f );
    //    b[5] = - dp[5] + ( dp[4]*tg_1_16f );
    //    b[6] =   dp[6] + ( dp[7]*tg_3_16f );
    //    b[7] = - dp[7] + ( dp[6]*tg_3_16f );

    mulps xmm4, [_PTAN1];    ;// (2a) xmm4 <= (dp[4]*tg_1_16f)
     ;

    movaps [OUTR + 1*16], xmm5; ;// (2b) store [y7 y6 y5 y4]
//     movups [OUTR + 1*16], xmm5;  ;// (2b) store [y7 y6 y5 y4]
// xmm5 free
// xmm5 used
     movaps xmm5, xmm1       ;// (2a) xmm5 <= dp[5] 

   ////////////////////////////////////
   // end of iDCT row #7 processing
   ////////////////////////////////////

    mulps xmm5, [_PTAN1];    ;// (2a) xmm5 <= (dp[5]*tg_1_16f)

    ;

    mulps xmm6, [_PTAN3];    ;// (2a) xmm6 <= (dp[6]*tg_3_16f)
     movaps xmm7, xmm3;      ;// (2a) xmm7 <= dp[7]

    subps xmm4, xmm1;        ;// (2b) xmm4 <= b[5] = (dp[4]*tg_1_16f) - dp[5]
// xmm1 free
     ;

     
    mulps xmm7, [_PTAN3];    ;// (2a) xmm7 <= (dp[7]*tg_3_16f)
     ;

    addps xmm5, xmm0;        ;// (2b) xmm5 <= b[4] = (dp[5]*tg_1_16f) + dp[4]
// xmm0 free

    ;


    mov OUTC, dword ptr [data];   ;//; // COL SETUP row 0
    subps xmm6, xmm3;        ;// (2b) xmm6 <= b[7] = (dp[6]*tg_1_16f) - dp[7]
// xmm3 free


// xmm1 used
     movaps xmm1, xmm4;      ;// (2b) xmm1 <= b[5]

// xmm3 used
    movaps xmm3, [INPC + 6*32]   ;// (2a) xmm3 <= dp[3]
     ;

    addps xmm7, xmm2;         // (2b) xmm7 <= b[6] = (dp[7]*tg_1_16f) + dp[6]
// xmm2 free
// xmm0 used
     movaps xmm0, xmm5;      ;// (2b) xmm0 <= b[4]

    ;

    // 3) Apply E_T
    //
    //     e[4] = b[4] + b[6];
    //     e[5] = b[4] - b[6];
    //     e[6] = b[5] + b[7];
    //     e[7] = b[5] - b[7];

// xmm2 used
    movaps xmm2, xmm3;          ;// (2a) xmm2 <= dp[3]
     addps xmm1, xmm6;          ;// (3) xmm1 <= e[6] = b[5] + b[7]

    mulps xmm3, [_PTAN2];       ;// (2a) xmm3 <= (dp[3]*tg_2_16f)
     ;

    subps xmm5, xmm7;           ;// (3) xmm5 <= e[5] = b[4] - b[6]

    ;

    subps xmm4, xmm6;           ;// (3) xmm4 <= e[7] = b[5] - b[7]
// xmm6 free
// xmm6 used
     movaps xmm6, [INPC + 2*32] ;// (2a) xmm6 <= dp[2]

    ;

    addps xmm0, xmm7;           ;// (3) xmm0 <= e[4] = b[4] + b[6]
// xmm7 free
// xmm7 used
     movaps xmm7, xmm5;         ;// (3) xmm7 <= e[5]

    // 4) Apply F_T
    //
    //    f[4] = e[4]
    //    f[5] = (e[5] + e[6]) * cos_4_16f;
    //    f[6] = (e[5] - e[6]) * cos_4_16f;
    //    f[7] = e[7]

// xmm0 <= f[4] = e[4]
// xmm4 <= f[7] = e[7]

    ;

    addps xmm5, xmm1;           ;// (4a) xmm5 <= (e[5]+e[6])

    ;

    subps xmm7, xmm1;           ;// (4a) xmm7 <= (e[5]-e[6])
// xmm1 free
// xmm1 used
     movaps xmm1, [INPC + 0*32] ;// (1) xmm1 <= dp[0]

    movaps [SCRATCH], xmm0;         ;// store f[4] in scratch space
// xmm0 free (in scratch)

    mulps xmm5, [_PCOS4]     ;// (4b) xmm5 <= f[5] = (e[5] + e[6]) * cos_4_16f;
     addps xmm3, xmm6;       ;// (2b) xmm3 <= b[2] = (dp[3]*tg_2_16f) + dp[2]

//movups [SCRATCH+3*16], xmm4;  ;// store f[7] in scratch space
// xmm4 f[7] free (in scratch)

    ;

    mulps xmm6, [_PTAN2];       ;// (2a) xmm6 <= (dp[2]*tg_2_16f)

    ;

    mulps xmm7, [_PCOS4]        ;// (4b) xmm7 <= f[6] = (e[5] - e[6]) * cos_4_16f;
    
    ;

    subps xmm6, xmm2;           ;// (2b) xmm6 <= b[3] = (dp[2]*tg_2_16f) - dp[3]
// xmm2 free
// xmm2 used
//movups [SCRATCH+1*16], xmm5;  ;// xmm5 <= store f[5] in scratch space
// xmm5 f[5] free (in scratch)
     movaps xmm2, [INPC+4*32];  ;// (1) xmm2 <= dp[1]

// xmm0 used
    movaps xmm0, xmm1;          ;// (1) xmm0 <= dp[0]
     ;

    addps xmm1, xmm2;           ;// (2b) xmm1 <= b[0] = dp[0] + dp[1]

// movups [SCRATCH+2*16], xmm7; ;// xmm7 <= store f[6] in scratch space
// xmm7 f[6] free (in scratch)

    ;

//    movups [INP+1*16], xmm1;  ;// temp store

    subps xmm0, xmm2;           ;// (2b) xmm0 <= b[1] = dp[0] - dp[1]
// xmm2 free

//    e[0] = b[0] + b[2];
//    e[1] = b[1] + b[3];
//    e[2] = b[1] - b[3];
//    e[3] = b[0] - b[2];

    ;

// xmm2 used 
    movaps xmm2, xmm1;          ;// (2b) xmm1 <= b[0]
     addps xmm1, xmm3;          ;// (3) xmm1 <= e[0] = b[0] + b[2];

    ;

    subps xmm2, xmm3;           ;// (3) xmm3 <= e[3] = b[0] - b[2];
// xmm3 free
// xmm3 used
     movaps xmm3, xmm0;         ;// (2b) xmm3 <= b[1]

    ;

    addps xmm0, xmm6;           ;// (3) xmm0 <= e[1] = b[1] + b[3];

    ;

    subps xmm3, xmm6;           ;// (3) xmm3 <= e[2] = b[1] - b[3];
// xmm6 free
// xmm6 used
     movaps xmm6, xmm2;         ;// (4) xmm6 <= f[3]

// xmm1 <= f[0]
// xmm0 <= f[1]
// xmm3 <= f[2]
// xmm2 <= f[3]
/*

    y[0*8] = SHIFT_ROUND_COLF( _F0 + _F4 );
    y[1*8] = SHIFT_ROUND_COLF( _F1 + _F5 );
    y[2*8] = SHIFT_ROUND_COLF( _F2 + _F6 );
    y[3*8] = SHIFT_ROUND_COLF( _F3 + _F7 );

    y[4*8] = SHIFT_ROUND_COLF( _F3 - _F7 );
    y[5*8] = SHIFT_ROUND_COLF( _F2 - _F6 );
    y[6*8] = SHIFT_ROUND_COLF( _F1 - _F5 );
    y[7*8] = SHIFT_ROUND_COLF( _F0 - _F4 );

    6a) round compensation, adjust float value to correct round alignment
        addps (ROUND_COMP - 0.5)

    6b) float -> int32 conversion (cvtps2pi)
       cvtps2pi mm0, xmm0;  // mm0 <= int32 versions of y[1 0]
        movhlps xmm0, xmm0; 
       cvtps2pi mm1, xmm0;  // mm1 <= int32 versions of y[3 2]

    6c) int32 -> int16 conversion (packssdw)
        packssdw mm0, mm1; // mm0 <= int16 versions of y[3 2 1 0]

    6d) int16 decsale (psraw)
       psraw mm0, DESCALE_SHIFT2; // mm0 <= final descaled output, {-256,+255}

    ** The descale shift is chosen to implement a {-256,+255} output range.

*/
    ;

    addps xmm2, xmm4;           ;// (5) xmm2 <= y[3] = f[3] + f[7]

    ;

    subps xmm6, xmm4;           ;// (5) xmm6 <= y[4] = f[3] - f[7]
// xmm4 free
// xmm4 used
     movaps xmm4, xmm0;         ;// xmm0 <= f[1]

    ;

    addps xmm2, [_PRND_INV_ROW]; // (6a) ROUND compensation

    ;

    addps xmm0, xmm5;           ;// (5) xmm0 <= y[1] = f[1] + f[5]

    ;

   cvtps2pi mm0, xmm2;          ;// (6b) mm0 <= y[3] low
    movhlps xmm2, xmm2;

   cvtps2pi mm1, xmm2;          ;// (6b) mm1 <= y[3] high
    ;

// xmm2 free
   addps xmm0, [_PRND_INV_ROW]; ;// (6a) ROUND compensation

   ;

   subps xmm4, xmm5;            ;// (5) xmm4 <= y[6] = f[1] - f[5]
// xmm5 free
// xmm2 used
    movaps xmm2, [SCRATCH];     ;// xmm2 <= f[4]

   packssdw mm0, mm1;           ;// (6c) mm0 <= y[3] 
    ;

   cvtps2pi mm2, xmm0;          ;// (6b) mm2 <= y[1] low
    movhlps xmm0, xmm0;

   cvtps2pi mm3, xmm0;          ;// (6b) mm3 <= y[1] high
// xmm0 free
// xmm0 used
// xmm5 used
    movaps xmm0, [_PRND_INV_ROW];

   movaps xmm5, xmm3;          ;// xmm5 <= f[2]
    addps xmm3, xmm7;            ;// (5) xmm3 <= y[2] = f[2] + f[6]

   psraw mm0, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    add INPC, 16;

   addps xmm6, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
    ;

   packssdw mm2, mm3;           ;// (6c) mm2 <= y[1]
    ;

   subps xmm5, xmm7;            ;// (5) xmm5 <= y[5] = f[2] - f[6]
// xmm7 free
    movq [OUTC + 3*16], mm0;    ;// mm0 <= y[3] stored

   psraw mm2, DESCALE_SHIFT2;    // (6d) clip y[] to {-256,+255}
    ;

   cvtps2pi mm4, xmm6;          ;// (6b) mm4 <= y[4] low
    movhlps xmm6, xmm6;

   cvtps2pi mm5, xmm6;          ;// (6b) mm5 <= y[4] high
// xmm6 free

   ////////////////////////////////////
   // Start of iDCT column 4-7 processing (denote "col+4")
   ////////////////////////////////////

// xmm6 used
    movaps xmm6, [INPC + 3*32];  // col+4 (1) xmm6 <= dp[6]
  
   addps xmm3, xmm0             ;// (6a) ROUND compensation
    movq [OUTC + 1*16], mm2;    ;// y[1] stored

// xmm7 used
   movaps xmm7, xmm1;           ;// xmm7 <= f[0]
    ;

   addps xmm1, xmm2;            ;// (5) xmm2 <= y[0] = f[0] + f[4]
    ;

   packssdw mm4, mm5;           ;// (6c) mm4 <= y[4]

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -