idctmm32.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,707 行 · 第 1/4 页

CPP
1,707
字号

  // for ( x = 0; x < 8; ++x )  // transform one row per iteration
  acc_idct_colloop1:

  movq mm0, qword ptr [INP] ;   // 0 ; x3 x2 x1 x0

  movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
   movq mm2, mm0 ;        // 2 ; x3 x2 x1 x0

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
   punpcklwd mm0, mm1 ;     // x5 x1 x4 x0

// ----------
  movq mm5, mm0 ;         // 5 ; x5 x1 x4 x0
   punpckldq mm0, mm0 ;     // x4 x0 x4 x0

  movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
   punpckhwd mm2, mm1 ;     // 1 ; x7 x3 x6 x2

  pmaddwd mm3, mm0 ;        // x4*w06+x0*w04 x4*w02+x0*w00
   movq mm6, mm2 ;        // 6 ; x7 x3 x6 x2

  movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
   punpckldq mm2, mm2 ;     // x6 x2 x6 x2

  pmaddwd mm4, mm2 ;        // x6*w07+x2*w05 x6*w03+x2*w01
   punpckhdq mm5, mm5 ;     // x5 x1 x5 x1

  pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
   punpckhdq mm6, mm6 ;     // x7 x3 x7 x3

  movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
   pmaddwd mm1, mm5 ;       // x5*w22+x1*w20 x5*w18+x1*w16

  paddd mm3, qword ptr [round_inv_col] ;// +rounder
   pmaddwd mm7, mm6 ;       // x7*w23+x3*w21 x7*w19+x3*w17

  pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
   paddd mm3, mm4 ;       // 4 ; a1=sum(even1) a0=sum(even0)

  pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
   movq mm4, mm3 ;        // 4 ; a1 a0

  pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
   paddd mm1, mm7 ;       // 7 ; b1=sum(odd1) b0=sum(odd0)

  paddd mm0, qword ptr [round_inv_col] ;// +rounder
   psubd mm3, mm1 ;       // a1-b1 a0-b0

  psrad mm3, SHIFT_INV_COL;   // y6=a1-b1 y7=a0-b0
   paddd mm1, mm4 ;       // 4 ; a1+b1 a0+b0

  paddd mm0, mm2 ;        // 2 ; a3=sum(even3) a2=sum(even2)
   psrad mm1, SHIFT_INV_COL;    // y1=a1+b1 y0=a0+b0

  paddd mm5, mm6 ;        // 6 ; b3=sum(odd3) b2=sum(odd2)
   movq mm4, mm0 ;        // 4 ; a3 a2

  paddd mm0, mm5 ;        // a3+b3 a2+b2
   psubd mm4, mm5 ;       // 5 ; a3-b3 a2-b2

  add INP, 16;          // increment INPUT pointer -> row 1
   psrad mm4, SHIFT_INV_COL;    // y4=a3-b3 y5=a2-b2

  add TABLE, 0;         // ptr TABLE += 64 -> row 1
   psrad mm0, SHIFT_INV_COL;    // y3=a3+b3 y2=a2+b2

//  movq mm2, qword ptr [INP] ;   // row+1; 0;  x3 x2 x1 x0
   packssdw mm4, mm3 ;        // 3 ; y6 y7 y4 y5

  packssdw mm1, mm0 ;       // 0 ; y3 y2 y1 y0
   movq mm7, mm4 ;        // 7 ; y6 y7 y4 y5

//  movq mm0, mm2 ;         // row+1;  2 ; x3 x2 x1 x0
//  por mm1, qword ptr one_corr ; // correction y2 +0.5
   psrld mm4, 16 ;          // 0 y6 0 y4

  movq qword ptr [OUT], mm1 ; // 1 ; save y3 y2 y1 y0
   pslld mm7, 16 ;          // y7 0 y5 0

//  movq mm1, qword ptr [INP+8] ; // row+1;  1 ; x7 x6 x5 x4
//  por mm7, qword ptr one_corr ; // correction y2 +0.5
   por mm7, mm4 ;         // 4 ; y7 y6 y5 y4

//  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
//   punpcklwd mm0, mm1 ;     // row+1;  x5 x1 x4 x0

   // begin processing row 1
  movq qword ptr [OUT+8], mm7 ; // 7 ; save y7 y6 y5 y4
   add edi, 0x01;

  add OUT, 16;
   cmp edi, 0x08; // compare x <> 8

  jl  acc_idct_colloop1;  // end for ( x = 0; x < 8; ++x )  

  // done with the iDCT column-transformation

    // now we have to transpose the output 8x8 matrix
    // 8x8 (OUT) -> 8x8't' (IN)

    // the transposition is implemented as 4 sub-operations.
  // 1) transpose upper-left quad
  // 2) transpose lower-right quad
  // 3) transpose lower-left quad
  // 4) transpose upper-right quad


 
  // mm0 = 1st row [ A B C D ] row1
  // mm1 = 2nd row [ E F G H ] 2
  // mm2 = 3rd row [ I J K L ] 3
  // mm3 = 4th row [ M N O P ] 4

  // 1) transpose upper-left quad
  lea OUT, dword ptr [qwTemp];

  movq mm0, qword ptr [OUT + ROW_STRIDE * 0 ]

  movq mm1, qword ptr [OUT + ROW_STRIDE * 1 ]
   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE * 2 ]
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE * 3]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]

  movq mm6, mm2;
   punpcklwd mm2, mm3;  // mm2 = [ 8 12 9 13]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  mov INP, dword ptr [blk]; // load input address
   punpckldq mm0, mm2;  // final result mm0 = row1 [0 4 8 12]

  movq mm3, mm4;  // mm3 = [ 2 6 3 7]
   punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]

  movq qword ptr [ INP + ROW_STRIDE * 0 ], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

// begin reading next quadrant (lower-right)
  movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]; 
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

  movq qword ptr [ INP +ROW_STRIDE * 2], mm4; // store row 3
   movq mm4, mm0; // mm4 = copy of row1[A B C D]

  movq qword ptr [ INP +ROW_STRIDE * 1], mm1; // store row 2

  movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]

  movq qword ptr [ INP +ROW_STRIDE * 3], mm3; // store row 4
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]

  // 2) transpose lower-right quadrant

//  movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]

//  movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
//   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE*6 + 8]
//   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE*7 + 8]
   movq mm6, mm2;

  punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm3, mm4; // mm3 = [ 2 6 3 7]

  punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]

  punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
  ; // slot

  movq qword ptr [ INP + ROW_STRIDE*4 + 8], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

  movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
  movq qword ptr [ INP +ROW_STRIDE*6 + 8], mm4; // store row 3
   movq mm4, mm0; // mm4 = copy of row1[A B C D]

  movq qword ptr [ INP +ROW_STRIDE*5 + 8], mm1; // store row 2
   ; // slot
  movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
   ; // slot

  movq qword ptr [ INP +ROW_STRIDE*7 + 8], mm3; // store row 4
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]

  // 3) transpose lower-left
//  movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]

//  movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
//   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE * 6 ]
//   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE * 7 ]
   movq mm6, mm2;

  punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm3, mm4; // mm3 = [ 2 6 3 7]

  punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]

  punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
   ;//slot

  movq qword ptr [ INP + ROW_STRIDE * 0 + 8 ], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

// begin reading next quadrant (upper-right)
  movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8]; 
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

  movq qword ptr [ INP +ROW_STRIDE * 2 + 8], mm4; // store row 3
   movq mm4, mm0; // mm4 = copy of row1[A B C D]

  movq qword ptr [ INP +ROW_STRIDE * 1 + 8 ], mm1; // store row 2
  movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]

  movq qword ptr [ INP +ROW_STRIDE * 3 + 8], mm3; // store row 4
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]


  // 4) transpose upper-right quadrant

//  movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8]

//  movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]
//   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE*2 + 8]
//   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE*3 + 8]
   movq mm6, mm2;

  punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm3, mm4; // mm3 = [ 2 6 3 7]

  punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]

  punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
  ; // slot

  movq qword ptr [ INP + ROW_STRIDE*4 ], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

  movq qword ptr [ INP +ROW_STRIDE*5 ], mm1; // store row 2
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

  movq qword ptr [ INP +ROW_STRIDE*6 ], mm4; // store row 3
   ; // slot

  movq qword ptr [ INP +ROW_STRIDE*7 ], mm3; // store row 4
   ; // slot
  } // end __asm

}


static void 
idct_mmx32_rows( short *blk ) // transform all 8 rows of 8x8 iDCT block
{
  // this subroutine performs two operations
  // 1) iDCT row transform
  //    for( i = 0; i < 8; ++ i)
  //      DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
  //
  // 2) transpose the matrix (which was stored in qwTemp[])
  //        qwTemp[] -> [8x8 matrix transpose] -> blk[]


  __asm {
  //;------------------------------------------------------
  //DCT_8_INV_ROW_1 MACRO INP:REQ, OUT:REQ, TABLE:REQ

  mov INP, dword ptr [blk];   ;// row 0
   mov edi, 0x00; //x = 0

  lea TABLE, dword ptr [tab_i_01234567]; // row 0
//   mov OUT, INP;  // algorithm writes data in-place  -> row 0

  lea OUT, dword ptr [qwTemp];
  lea round_inv_row, dword ptr [r_inv_row]

  // for ( x = 0; x < 8; ++x )  // transform one row per iteration
lpa:
  movq mm0, qword ptr [INP] ;   // 0 ; x3 x2 x1 x0

  movq mm1, qword ptr [INP+8] ; // 1 ; x7 x6 x5 x4
   movq mm2, mm0 ;        // 2 ; x3 x2 x1 x0

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
   punpcklwd mm0, mm1 ;     // x5 x1 x4 x0

// ----------
  movq mm5, mm0 ;         // 5 ; x5 x1 x4 x0
   punpckldq mm0, mm0 ;     // x4 x0 x4 x0

  movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
   punpckhwd mm2, mm1 ;     // 1 ; x7 x3 x6 x2

  pmaddwd mm3, mm0 ;        // x4*w06+x0*w04 x4*w02+x0*w00
   movq mm6, mm2 ;        // 6 ; x7 x3 x6 x2

  movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
   punpckldq mm2, mm2 ;     // x6 x2 x6 x2

  pmaddwd mm4, mm2 ;        // x6*w07+x2*w05 x6*w03+x2*w01
   punpckhdq mm5, mm5 ;     // x5 x1 x5 x1

  pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
   punpckhdq mm6, mm6 ;     // x7 x3 x7 x3

  movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
   pmaddwd mm1, mm5 ;       // x5*w22+x1*w20 x5*w18+x1*w16

  paddd mm3, qword ptr [round_inv_row];// +rounder
   pmaddwd mm7, mm6 ;       // x7*w23+x3*w21 x7*w19+x3*w17

  pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
   paddd mm3, mm4 ;       // 4 ; a1=sum(even1) a0=sum(even0)

  pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
   movq mm4, mm3 ;        // 4 ; a1 a0

  pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
   paddd mm1, mm7 ;       // 7 ; b1=sum(odd1) b0=sum(odd0)

  paddd mm0, qword ptr [round_inv_row];// +rounder
   psubd mm3, mm1 ;       // a1-b1 a0-b0

  psrad mm3, SHIFT_INV_ROW ;    // y6=a1-b1 y7=a0-b0
   paddd mm1, mm4 ;       // 4 ; a1+b1 a0+b0

  paddd mm0, mm2 ;        // 2 ; a3=sum(even3) a2=sum(even2)
   psrad mm1, SHIFT_INV_ROW ;   // y1=a1+b1 y0=a0+b0

  paddd mm5, mm6 ;        // 6 ; b3=sum(odd3) b2=sum(odd2)
   movq mm4, mm0 ;        // 4 ; a3 a2

  paddd mm0, mm5 ;        // a3+b3 a2+b2
   psubd mm4, mm5 ;       // 5 ; a3-b3 a2-b2

  add INP, 16;          // increment INPUT pointer -> row 1
   psrad mm4, SHIFT_INV_ROW ;   // y4=a3-b3 y5=a2-b2

//  add TABLE, 0;         // ptr TABLE += 64 -> row 1
   psrad mm0, SHIFT_INV_ROW ;   // y3=a3+b3 y2=a2+b2

//  movq mm2, qword ptr [INP] ;   // row+1; 0;  x3 x2 x1 x0
   packssdw mm4, mm3 ;        // 3 ; y6 y7 y4 y5

  packssdw mm1, mm0 ;       // 0 ; y3 y2 y1 y0
   movq mm7, mm4 ;        // 7 ; y6 y7 y4 y5

//  movq mm0, mm2 ;         // row+1;  2 ; x3 x2 x1 x0
   psrld mm4, 16 ;          // 0 y6 0 y4

  movq qword ptr [OUT], mm1 ; // 1 ; save y3 y2 y1 y0
   pslld mm7, 16 ;          // y7 0 y5 0

//  movq mm1, qword ptr [INP+8] ; // row+1;  1 ; x7 x6 x5 x4
   por mm7, mm4 ;         // 4 ; y7 y6 y5 y4

  movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
//   punpcklwd mm0, mm1 ;     // row+1;  x5 x1 x4 x0

   // begin processing row 1
  movq qword ptr [OUT+8], mm7 ; // 7 ; save y7 y6 y5 y4
   add edi, 0x01;

  add OUT, 16;          // increment OUTPUT pointer -> row 1
   cmp edi, 0x08;
  jl lpa;   // end for ( x = 0; x < 8; ++x )  

  // done with the iDCT row-transformation

  // now we have to transpose the output 8x8 matrix
  // 8x8 (OUT) -> 8x8't' (IN)
  // the transposition is implemented as 4 sub-operations.
  // 1) transpose upper-left quad
  // 2) transpose lower-right quad
  // 3) transpose lower-left quad
  // 4) transpose upper-right quad

 
  // mm0 = 1st row [ A B C D ] row1
  // mm1 = 2nd row [ E F G H ] 2
  // mm2 = 3rd row [ I J K L ] 3
  // mm3 = 4th row [ M N O P ] 4

  // 1) transpose upper-left quad
  lea OUT, dword ptr [qwTemp];

  movq mm0, qword ptr [OUT + ROW_STRIDE * 0 ]

  movq mm1, qword ptr [OUT + ROW_STRIDE * 1 ]
   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE * 2 ]
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE * 3]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]

  movq mm6, mm2;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?