idctmm32.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,707 行 · 第 1/4 页

CPP
1,707
字号
  //      store DWORD -> [ cols (1,0) "row 3"];
  //      store DWORD -> [ cols (1,0) "row 5"];
  //      store DWORD -> [ cols (1,0) "row 7"];
  //      store DWORD -> [ cols (1,0) "row 4"];
  //      store DWORD -> [ cols (1,0) "row 6"];
  //  }
  //
  //  Note that we have DOUBLED the # store instructions (because we can
  //  only store one dword per instruction, instead of a qword.)  But
  //  we eliminate the output-matrix post-transpose requirement, so overall
  //  the algorithm is faster.
  //
  //  You'll notice that this assembly block contains two copies of the same 
  //  row-transformation algorithm.  The loops are nearly identical.  They
  //  use different shift&round constants.

  __asm {
  //;------------------------------------------------------
  //DCT_8_INV_ROW_8 MACRO INP:REQ, OUT:REQ, TABLE:REQ

  mov INP, dword ptr [blk];   ;// input data is row 0 of blk[]
   mov edi, 0x00; //yx = 0

  lea TABLE, dword ptr [tab_i_01234567]; // row 0 of the tab_i...[]

  lea OUT, dword ptr [qwTemp]; // write output to qwTemp[]
  lea round_inv_row, dword ptr [r_inv_row]

  // for ( y = 0; y<8; y=y+2 )    // transform two rows per iteration
lp_mmx32_rows:
  movq mm0, qword ptr [INP] ;   // 0 ; x3 x2 x1 x0

  movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
   movq mm2, mm0 ;        // 2 ; x3 x2 x1 x0

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
   punpcklwd mm0, mm1 ;     // x5 x1 x4 x0

// ---------- processing row #y+0
  movq mm5, mm0 ;         // 5 ; x5 x1 x4 x0
   punpckldq mm0, mm0 ;     // x4 x0 x4 x0

  movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
   punpckhwd mm2, mm1 ;     // 1 ; x7 x3 x6 x2

  pmaddwd mm3, mm0 ;        // x4*w06+x0*w04 x4*w02+x0*w00
   movq mm6, mm2 ;        // 6 ; x7 x3 x6 x2

  movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
   punpckldq mm2, mm2 ;     // x6 x2 x6 x2

  pmaddwd mm4, mm2 ;        // x6*w07+x2*w05 x6*w03+x2*w01
   punpckhdq mm5, mm5 ;     // x5 x1 x5 x1

  pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
   punpckhdq mm6, mm6 ;     // x7 x3 x7 x3

  movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
   pmaddwd mm1, mm5 ;       // x5*w22+x1*w20 x5*w18+x1*w16

  paddd mm3, qword ptr [round_inv_row];// +rounder
   pmaddwd mm7, mm6 ;       // x7*w23+x3*w21 x7*w19+x3*w17

  pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
   paddd mm3, mm4 ;       // 4 ; a1=sum(even1) a0=sum(even0)

  pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
   movq mm4, mm3 ;        // 4 ; a1 a0

  pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
   paddd mm1, mm7 ;       // 7 ; b1=sum(odd1) b0=sum(odd0)

  paddd mm0, qword ptr [round_inv_row];// +rounder
   psubd mm3, mm1 ;       // a1-b1 a0-b0

  psrad mm3, SHIFT_INV_ROW ;    // y6=a1-b1 y7=a0-b0
   paddd mm1, mm4 ;       // 4 ; a1+b1 a0+b0

  paddd mm0, mm2 ;        // 2 ; a3=sum(even3) a2=sum(even2)
   psrad mm1, SHIFT_INV_ROW ;   // y1=a1+b1 y0=a0+b0

  paddd mm5, mm6 ;        // 6 ; b3=sum(odd3) b2=sum(odd2)
   movq mm4, mm0 ;        // 4 ; a3 a2

  paddd mm0, mm5 ;        // a3+b3 a2+b2
   psubd mm4, mm5 ;       // 5 ; a3-b3 a2-b2

  movq mm2, qword ptr [INP+16] ;    // row+1; 0;  x3 x2 x1 x0
   psrad mm4, SHIFT_INV_ROW ;   // y4=a3-b3 y5=a2-b2

  movq mm7, qword ptr [INP+24] ;  // row+1;  7 ; x7 x6 x5 x4
   psrad mm0, SHIFT_INV_ROW ;   // y3=a3+b3 y2=a2+b2

  add INP, 32;          // increment INPUT pointer +2 rows
   packssdw mm4, mm3 ;        // 3 ; y6 y7 y4 y5

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
   packssdw mm1, mm0 ;        // 0 ; y3 y2 y1 y0

  movq qword ptr [scratch2], mm4 ;  // 7 ; save y6 y7 y4 y5
   movq mm0, mm2 ;          // row+1;  2 ; x3 x2 x1 x0

  movq qword ptr [scratch1], mm1 ;  // 1 ; save y3 y2 y1 y0
   punpcklwd mm0, mm7 ;     // row+1; 0   x5 x1 x4 x0

 // begin processing row #y+1
  movq mm5, mm0 ;         // 5 ; x5 x1 x4 x0
   punpckldq mm0, mm0 ;     // x4 x0 x4 x0

  movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
   punpckhwd mm2, mm7 ;     // 1 ; x7 x3 x6 x2

  pmaddwd mm3, mm0 ;        // x4*w06+x0*w04 x4*w02+x0*w00
   movq mm6, mm2 ;        // 6 ; x7 x3 x6 x2

  movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
   punpckldq mm2, mm2 ;     // x6 x2 x6 x2

  pmaddwd mm4, mm2 ;        // x6*w07+x2*w05 x6*w03+x2*w01
   punpckhdq mm5, mm5 ;     // x5 x1 x5 x1

  pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
   punpckhdq mm6, mm6 ;     // x7 x3 x7 x3

  movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
   pmaddwd mm1, mm5 ;       // x5*w22+x1*w20 x5*w18+x1*w16

  paddd mm3, qword ptr [round_inv_row];// +rounder
   pmaddwd mm7, mm6 ;       // x7*w23+x3*w21 x7*w19+x3*w17

  pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
   paddd mm3, mm4 ;       // 4 ; a1=sum(even1) a0=sum(even0)

  pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
   movq mm4, mm3 ;        // 4 ; a1 a0

  pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
   paddd mm1, mm7 ;       // 7 ; b1=sum(odd1) b0=sum(odd0)

  paddd mm0, qword ptr [round_inv_row];// +rounder
   psubd mm3, mm1 ;       // a1-b1 a0-b0

  psrad mm3, SHIFT_INV_ROW ;    // y6=a1-b1 y7=a0-b0
   paddd mm1, mm4 ;       // 4 ; a1+b1 a0+b0

  paddd mm0, mm2 ;        // 2 ; a3=sum(even3) a2=sum(even2)
   psrad mm1, SHIFT_INV_ROW ;   // y1=a1+b1 y0=a0+b0

  paddd mm5, mm6 ;        // 6 ; b3=sum(odd3) b2=sum(odd2)
   movq mm4, mm0 ;        // 4 ; a3 a2

  paddd mm0, mm5 ;        // a3+b3 a2+b2
   psubd mm4, mm5 ;       // 5 ; a3-b3 a2-b2

   // retrieve row#(y+0) data
  movq mm2, qword ptr [scratch1]; // mm2 = row(0) y3 y2 y1 y0
   psrad mm0, SHIFT_INV_ROW ;   // y3=a3+b3 y2=a2+b2

  packssdw mm1, mm0 ;       // 0 ; y3 y2 y1 y0
   movq mm0, mm2;         // mm0 = copy of row(0) y3 y2 y1 y0

  movq mm7, qword ptr [scratch2]; // mm7 = row(0) y6 y7 y4 y5
   punpcklwd mm0, mm1;      // mm0 = [ (r1)y1 (r0)y1 (r1)y0 (r0)y0 ]

  punpckhwd mm2, mm1;     // mm2 = [ (r1)y3 (r0)y3 (r1)y2 (r0)y2 ]
   movq mm6, mm7;         // mm6 = copy of row(0) y6 y7 y4 y5

  movd dword ptr [OUT+ROW_STRIDE*0], mm0; // store row(0) [ y1 y0 ]
   psrlq mm0, 32;

  movd dword ptr [OUT+ROW_STRIDE*2], mm2; // store row(2) [ y1 y0 ]
   psrlq mm2, 32;

  movd dword ptr [OUT+ROW_STRIDE*1], mm0; // store row(1) [ y1 y0 ]
   psrad mm4, SHIFT_INV_ROW ;   // y4=a3-b3 y5=a2-b2

  movd dword ptr [OUT+ROW_STRIDE*3], mm2; // store row(3) [ y1 y0 ]
   packssdw mm4, mm3 ;        // 3 ; y6 y7 y4 y5

  add OUT, 4;         // increment OUTPUT pointer +2 cols
   punpcklwd mm6, mm4;        // mm6 = [ (r1)y4 (r0)y4 (r1)y5 (r0)y5 ]

  add edi, 0x02;          // y = y + 2
   punpckhwd mm7, mm4;        // mm7 = [ (r1)y6 (r0)y7 (r1)y6 (r0)y7 ]

  movd dword ptr [OUT+ROW_STRIDE*5 - 4], mm6 ; // store row(5) [ y1 y0]
   psrlq mm6, 32;

  movd dword ptr [OUT+ROW_STRIDE*7 - 4], mm7 ; // store row(7) [ y1 y0]
   psrlq mm7, 32;

  movd dword ptr [OUT+ROW_STRIDE*4 - 4], mm6 ; // store row(4) [ y1 y0]

  movd dword ptr [OUT+ROW_STRIDE*6 - 4], mm7 ; // store row(6) [ y1 y0]
   cmp edi, 0x08;   // compare ( y <> 8 )

  jl lp_mmx32_rows;   // end for ( y = 0; y < 8; y = y + 2 )  

  // done with row-transformation
  //
  // qwTemp[] contains the output matrix (already transposed)
  //
  //////////////////////////////////////////////////////////////////////

  //////////////////////////////////////////////////////////////////////
  // the following subroutine repeats the row-transform operation, 
  // except with different shift&round constants.
  //
  //  The output is stored into blk[], which destroys the original
  //  input data.

  lea INP, dword ptr [qwTemp];    ;// input is qwTemp[]
   mov edi, 0x00; //yx = 0

  lea TABLE, dword ptr [tab_i_01234567]; // row 0 of table

  mov OUT, dword ptr [blk]; // final output is to blk[]

  lea round_inv_col, dword ptr [r_inv_col];

  // for ( y = 0; y<8; y=y+2 )    // transform two rows per iteration
lp_mmx32_cols:
  movq mm0, qword ptr [INP] ;   // 0 ; x3 x2 x1 x0

  movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
   movq mm2, mm0 ;        // 2 ; x3 x2 x1 x0

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
   punpcklwd mm0, mm1 ;     // x5 x1 x4 x0

// ---------- processing row #y+0
  movq mm5, mm0 ;         // 5 ; x5 x1 x4 x0
   punpckldq mm0, mm0 ;     // x4 x0 x4 x0

  movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
   punpckhwd mm2, mm1 ;     // 1 ; x7 x3 x6 x2

  pmaddwd mm3, mm0 ;        // x4*w06+x0*w04 x4*w02+x0*w00
   movq mm6, mm2 ;        // 6 ; x7 x3 x6 x2

  movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
   punpckldq mm2, mm2 ;     // x6 x2 x6 x2

  pmaddwd mm4, mm2 ;        // x6*w07+x2*w05 x6*w03+x2*w01
   punpckhdq mm5, mm5 ;     // x5 x1 x5 x1

  pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
   punpckhdq mm6, mm6 ;     // x7 x3 x7 x3

  movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
   pmaddwd mm1, mm5 ;       // x5*w22+x1*w20 x5*w18+x1*w16

  paddd mm3, qword ptr [round_inv_col];// +rounder
   pmaddwd mm7, mm6 ;       // x7*w23+x3*w21 x7*w19+x3*w17

  pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
   paddd mm3, mm4 ;       // 4 ; a1=sum(even1) a0=sum(even0)

  pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
   movq mm4, mm3 ;        // 4 ; a1 a0

  pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
   paddd mm1, mm7 ;       // 7 ; b1=sum(odd1) b0=sum(odd0)

  paddd mm0, qword ptr [round_inv_col];// +rounder
   psubd mm3, mm1 ;       // a1-b1 a0-b0

  psrad mm3, SHIFT_INV_COL ;    // y6=a1-b1 y7=a0-b0
   paddd mm1, mm4 ;       // 4 ; a1+b1 a0+b0

  paddd mm0, mm2 ;        // 2 ; a3=sum(even3) a2=sum(even2)
   psrad mm1, SHIFT_INV_COL;    // y1=a1+b1 y0=a0+b0

  paddd mm5, mm6 ;        // 6 ; b3=sum(odd3) b2=sum(odd2)
   movq mm4, mm0 ;        // 4 ; a3 a2

  paddd mm0, mm5 ;        // a3+b3 a2+b2
   psubd mm4, mm5 ;       // 5 ; a3-b3 a2-b2

  movq mm2, qword ptr [INP+16] ;    // row+1; 0;  x3 x2 x1 x0
   psrad mm4, SHIFT_INV_COL ;   // y4=a3-b3 y5=a2-b2

  movq mm7, qword ptr [INP+24] ;  // row+1;  7 ; x7 x6 x5 x4
   psrad mm0, SHIFT_INV_COL ;   // y3=a3+b3 y2=a2+b2

  add INP, 32;          // increment INPUT pointer +2 rows
   packssdw mm4, mm3 ;        // 3 ; y6 y7 y4 y5

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
   packssdw mm1, mm0 ;        // 0 ; y3 y2 y1 y0

  movq qword ptr [scratch2], mm4 ;  // 7 ; save y6 y7 y4 y5
   movq mm0, mm2 ;          // row+1;  2 ; x3 x2 x1 x0

  movq qword ptr [scratch1], mm1 ;  // 1 ; save y3 y2 y1 y0
   punpcklwd mm0, mm7 ;     // row+1; 0   x5 x1 x4 x0

 // begin processing row #y+1
  movq mm5, mm0 ;         // 5 ; x5 x1 x4 x0
   punpckldq mm0, mm0 ;     // x4 x0 x4 x0

  movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
   punpckhwd mm2, mm7 ;     // 1 ; x7 x3 x6 x2

  pmaddwd mm3, mm0 ;        // x4*w06+x0*w04 x4*w02+x0*w00
   movq mm6, mm2 ;        // 6 ; x7 x3 x6 x2

  movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
   punpckldq mm2, mm2 ;     // x6 x2 x6 x2

  pmaddwd mm4, mm2 ;        // x6*w07+x2*w05 x6*w03+x2*w01
   punpckhdq mm5, mm5 ;     // x5 x1 x5 x1

  pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
   punpckhdq mm6, mm6 ;     // x7 x3 x7 x3

  movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
   pmaddwd mm1, mm5 ;       // x5*w22+x1*w20 x5*w18+x1*w16

  paddd mm3, qword ptr [round_inv_col];// +rounder
   pmaddwd mm7, mm6 ;       // x7*w23+x3*w21 x7*w19+x3*w17

  pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
   paddd mm3, mm4 ;       // 4 ; a1=sum(even1) a0=sum(even0)

  pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
   movq mm4, mm3 ;        // 4 ; a1 a0

  pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
   paddd mm1, mm7 ;       // 7 ; b1=sum(odd1) b0=sum(odd0)

  paddd mm0, qword ptr [round_inv_col];// +rounder
   psubd mm3, mm1 ;       // a1-b1 a0-b0

  psrad mm3, SHIFT_INV_COL ;    // y6=a1-b1 y7=a0-b0
   paddd mm1, mm4 ;       // 4 ; a1+b1 a0+b0

  paddd mm0, mm2 ;        // 2 ; a3=sum(even3) a2=sum(even2)
   psrad mm1, SHIFT_INV_COL ;   // y1=a1+b1 y0=a0+b0

  paddd mm5, mm6 ;        // 6 ; b3=sum(odd3) b2=sum(odd2)
   movq mm4, mm0 ;        // 4 ; a3 a2

  paddd mm0, mm5 ;        // a3+b3 a2+b2
   psubd mm4, mm5 ;       // 5 ; a3-b3 a2-b2

   // retrieve row#(y+0) data
  movq mm2, qword ptr [scratch1]; // mm2 = row(0) y3 y2 y1 y0
   psrad mm0, SHIFT_INV_COL ;   // y3=a3+b3 y2=a2+b2

  packssdw mm1, mm0 ;       // 0 ; y3 y2 y1 y0
   movq mm0, mm2;         // mm0 = copy of row(0) y3 y2 y1 y0

  movq mm7, qword ptr [scratch2]; // mm7 = row(0) y6 y7 y4 y5
   punpcklwd mm0, mm1;      // mm0 = [ (r1)y1 (r0)y1 (r1)y0 (r0)y0 ]

  punpckhwd mm2, mm1;     // mm2 = [ (r1)y3 (r0)y3 (r1)y2 (r0)y2 ]
   movq mm6, mm7;         // mm6 = copy of row(0) y6 y7 y4 y5

  movd dword ptr [OUT+ROW_STRIDE*0], mm0; // store row(0) [ y1 y0 ]
   psrlq mm0, 32;

  movd dword ptr [OUT+ROW_STRIDE*2], mm2; // store row(2) [ y1 y0 ]
   psrlq mm2, 32;

  movd dword ptr [OUT+ROW_STRIDE*1], mm0; // store row(1) [ y1 y0 ]
   psrad mm4, SHIFT_INV_COL ;   // y4=a3-b3 y5=a2-b2

  movd dword ptr [OUT+ROW_STRIDE*3], mm2; // store row(3) [ y1 y0 ]
   packssdw mm4, mm3 ;        // 3 ; y6 y7 y4 y5

  add OUT, 4;         // increment OUTPUT pointer +2 cols
   punpcklwd mm6, mm4;        // mm6 = [ (r1)y4 (r0)y4 (r1)y5 (r0)y5 ]

  add edi, 0x02;          // y = y + 2
   punpckhwd mm7, mm4;        // mm7 = [ (r1)y6 (r0)y7 (r1)y6 (r0)y7 ]

  movd dword ptr [OUT+ROW_STRIDE*5 - 4], mm6 ; // store row(5) [ y1 y0]
   psrlq mm6, 32;

  movd dword ptr [OUT+ROW_STRIDE*7 - 4], mm7 ; // store row(7) [ y1 y0]
   psrlq mm7, 32;

  movd dword ptr [OUT+ROW_STRIDE*4 - 4], mm6 ; // store row(4) [ y1 y0]

  movd dword ptr [OUT+ROW_STRIDE*6 - 4], mm7 ; // store row(6) [ y1 y0]
   cmp edi, 0x08;   // compare ( y <> 8 )

  jl lp_mmx32_cols;   // end for ( y = 0; y < 8; y = y + 2 )  

  } // end __asm 
}


// v0.16b34 no longer in use
//  the old implementation is here for educational purposes.

/*
static void 
idct_mmx32_cols( short *blk ) // transform all 8 cols of 8x8 iDCT block
{

  // Despite the function's name, the matrix is transformed
  // row by row.  This function is identical to idct_mmx32_rows(),
  // except for the SHIFT amount and ROUND_INV amount.

  // this subroutine performs two operations
  // 1) iDCT row transform
  //    for( i = 0; i < 8; ++ i)
  //      DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
  //
  // 2) transpose the matrix (which was stored in qwTemp[])
  //        qwTemp[] -> [8x8 matrix transpose] -> blk[]


  __asm {
  //;------------------------------------------------------
  //DCT_8_INV_ROW_1 MACRO INP:REQ, OUT:REQ, TABLE:REQ

  mov INP, dword ptr [blk];   ;// row 0
   mov edi, 0x00; // x = 0
 
  lea TABLE, dword ptr [tab_i_01234567]; // row 0
  lea OUT, dword ptr [qwTemp];
//   mov OUT, INP;  // algorithm writes data in-place  -> row 0

  lea round_inv_col, dword ptr [r_inv_col]

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?