idctmm32.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,707 行 · 第 1/4 页

CPP
1,707
字号
   punpcklwd mm2, mm3;  // mm2 = [ 8 12 9 13]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  mov INP, dword ptr [blk]; // load input address
   punpckldq mm0, mm2;  // final result mm0 = row1 [0 4 8 12]

  movq mm3, mm4;  // mm3 = [ 2 6 3 7]
   punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]

  movq qword ptr [ INP + ROW_STRIDE * 0 ], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

// begin reading next quadrant (lower-right)
  movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]; 
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

  movq qword ptr [ INP +ROW_STRIDE * 2], mm4; // store row 3
   movq mm4, mm0; // mm4 = copy of row1[A B C D]

  movq qword ptr [ INP +ROW_STRIDE * 1], mm1; // store row 2

  movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]

  movq qword ptr [ INP +ROW_STRIDE * 3], mm3; // store row 4
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]

  // 2) transpose lower-right quadrant

//  movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]

//  movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
//   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE*6 + 8]
//   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE*7 + 8]
   movq mm6, mm2;

  punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm3, mm4; // mm3 = [ 2 6 3 7]

  punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]

  punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
  ; // slot

  movq qword ptr [ INP + ROW_STRIDE*4 + 8], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

  movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
  movq qword ptr [ INP +ROW_STRIDE*6 + 8], mm4; // store row 3
   movq mm4, mm0; // mm4 = copy of row1[A B C D]

  movq qword ptr [ INP +ROW_STRIDE*5 + 8], mm1; // store row 2
   ; // slot
  movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
   ; // slot

  movq qword ptr [ INP +ROW_STRIDE*7 + 8], mm3; // store row 4
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]

  // 3) transpose lower-left
//  movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]

//  movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
//   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE * 6 ]
//   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE * 7 ]
   movq mm6, mm2;

  punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm3, mm4; // mm3 = [ 2 6 3 7]

  punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]

  punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
   ;//slot

  movq qword ptr [ INP + ROW_STRIDE * 0 + 8 ], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

// begin reading next quadrant (upper-right)
  movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8]; 
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

  movq qword ptr [ INP +ROW_STRIDE * 2 + 8], mm4; // store row 3
   movq mm4, mm0; // mm4 = copy of row1[A B C D]

  movq qword ptr [ INP +ROW_STRIDE * 1 + 8 ], mm1; // store row 2
  movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]

  movq qword ptr [ INP +ROW_STRIDE * 3 + 8], mm3; // store row 4
   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]


  // 2) transpose lower-right quadrant

//  movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]

//  movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
//   movq mm4, mm0; // mm4 = copy of row1[A B C D]
  
  movq mm2, qword ptr [OUT + ROW_STRIDE*2 + 8]
//   punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
   punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
  
  movq mm3, qword ptr [OUT + ROW_STRIDE*3 + 8]
   movq mm6, mm2;

  punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
   movq mm1, mm0; // mm1 = [ 0 4 1 5]

  punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
   movq mm3, mm4; // mm3 = [ 2 6 3 7]

  punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]

  punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
  ; // slot

  movq qword ptr [ INP + ROW_STRIDE*4 ], mm0; // store row 1
   punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]

  movq qword ptr [ INP +ROW_STRIDE*5 ], mm1; // store row 2
   punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]

  movq qword ptr [ INP +ROW_STRIDE*6 ], mm4; // store row 3
   ; // slot

  movq qword ptr [ INP +ROW_STRIDE*7 ], mm3; // store row 4
   ; // slot
  } // end __asm 
}


//  
// public interface, IDCT 8x8 operation
// v0.16B33, the old implementation

void
j_rev_dct( short *blk )
{
  // 1) iDCT row transformation
  idct_mmx32_rows( blk ); // 1) transform iDCT row, and transpose

  // 2) iDCT column transformation
  idct_mmx32_cols( blk ); // 2) transform iDCT row, and transpose

  // all done

}
*/


const short tab_iz_01234567[] = {
  //row0, this row is required
16384, 16384, 16384, 16384, 
21407, 8867, -8867, -21407, 
16384, -16384, -16384, 16384, 
8867, -21407, 21407, -8867, 
22725, 19266, 12873, 4520, 
19266, -4520, -22725, -12873, 
12873, -22725, 4520, 19266, 
4520, -12873, 19266, -22725

};

#define SHIFT_ROUND_COL( x ) ( ( x + RND_INV_COL ) >> (SHIFT_INV_COL) )
#define SHIFT_ROUND_ROW( x ) ( ( x + RND_INV_ROW ) >> (SHIFT_INV_ROW) )

const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5

// Regular X86 integer implementation of AP922 MMX iDCT
// NOT WORKING YET
void
idct_ap922int86( short *data )
{
  static short temp[64];
  static short tmp[8];
  static int a0, a1, a2, a3, b0, b1, b2, b3;

  static short const _one_corr=0x0001; 
 
  short *x; // pointer to input
  short *y; // pointer to output
  short *ti; // pointer to table

  int i,j;
  short ev0, ev1, ev2, ev3, od0, od1, od2, od3;
  int x0, x1, x2, x3, y0, y1, y2, y3;
  int e0, e1, e2, e3, o0, o1, o2, o3;
  int a, b;

  ti = (short*)&tab_iz_01234567[ 0 ];


  for ( i = 0; i < 8; ++i )
  {
    ti = (short*)&tab_i_01234567[ i * 32 ];
    x = &data[ 8*i ];
    y = &temp[ 8*i ];
    
    a0 = x[0]*ti[0]  + x[4]*ti[1]  + x[2]*ti[4]  + x[6]*ti[5];
    a1 = x[0]*ti[2]  + x[4]*ti[3]  + x[2]*ti[6]  + x[6]*ti[7];
    a2 = x[0]*ti[8]  + x[4]*ti[9]  + x[2]*ti[12] + x[6]*ti[13];
    a3 = x[0]*ti[10] + x[4]*ti[11] + x[2]*ti[14] + x[6]*ti[15];

    b0 = x[1]*ti[16] + x[5]*ti[17] + x[3]*ti[20] + x[7]*ti[21];
    b1 = x[1]*ti[18] + x[5]*ti[19] + x[3]*ti[22] + x[7]*ti[23];
    b2 = x[1]*ti[24] + x[5]*ti[25] + x[3]*ti[28] + x[7]*ti[29];
    b3 = x[1]*ti[26] + x[5]*ti[27] + x[3]*ti[30] + x[7]*ti[31];    

    y[0] = SHIFT_ROUND_ROW ( a0 + b0 );
    y[1] = SHIFT_ROUND_ROW ( a1 + b1 );
    y[2] = SHIFT_ROUND_ROW ( a2 + b2 );
    y[3] = SHIFT_ROUND_ROW ( a3 + b3 );

    y[4] = SHIFT_ROUND_ROW ( a3 - b3 );
    y[5] = SHIFT_ROUND_ROW ( a2 - b2 );
    y[6] = SHIFT_ROUND_ROW ( a1 - b1 );
    y[7] = SHIFT_ROUND_ROW ( a0 - b0 );

  } // end for( i = 0; i < 8; ++i )


// pmulhw/pmulhrw emulation macros 
#define X86_PMULHW( X ) ((short) ( ((int)X)>>16 ))   //Intel MMX
//#define X86_PMULHRW( X ) ((short) ( ( (((int)X)>>15)+1) >>1) ) //3DNow-MMX

  // AP922 iDCT column transform
  // Base code courtesy of VirtualDub (Avery Lee),
  //   http://www.concentric.net/~psilon

  // Note, this code is *NOT* working yet.
  // The output fails IEEE-1180/1990 tests!!!

  for ( i = 0; i < 8; ++i )
  {
    x = &temp[ i ];
    y = &data[ i ];

  ev0 = x[0];
  ev1 = x[16];
  ev2 = x[32];
  ev3 = x[48];
  od0 = x[8];
  od1 = x[24];
  od2 = x[40];
  od3 = x[56];
 
  // 1) Apply D8T - the cos() coefficients are implicit in row calculation,
  //    but we still need to shuffle the input data.
 
  tmp[0] = ev0;
  tmp[1] = ev2;
  tmp[2] = ev1;
  tmp[3] = ev3;
  tmp[4] = od0;
  tmp[5] = od3;
  tmp[6] = od1;
  tmp[7] = od2;
 
  // 2) Apply B8T.
 
 
  x0 =   (tmp[0] + tmp[1]);
  x1 = (- tmp[1] + tmp[0]);

//  x2 = x2 | _one_corr; // "one correction", no effect
//  x3 = x3 | _one_corr; // "one correction", no effect
//  x2 = x2 + _one_corr; // "one correction", no effect
//  x3 = x3 + _one_corr; // "one correction", no effect

  x2 =   tmp[2] + X86_PMULHW( tg_2_16[0]*tmp[3] );
  x3 = - tmp[3] + X86_PMULHW( tmp[2]*tg_2_16[0] ) ;

//  x2 = x2 | _one_corr; // "one correction", nope
//  x2 = x2 + _one_corr; // "one correction", nope
//  x3 = x3 | _one_corr; // "one correction", nope
//  x3 = x3 + _one_corr; // "one correction", nope

  y0 =   tmp[4] + X86_PMULHW( tg_1_16[0]*tmp[5] );
  y1 = - tmp[5] + X86_PMULHW( tmp[4]*tg_1_16[0] );

  //y0 = y0 + _one_corr; // "one correction", nope
//  y0 = y0 | _one_corr; // "one correction", nope
//  y1 = y1 + _one_corr; // "one correction", nope
//  y1 = y1 | _one_corr; // "one correction", nope

  //y2 =   tmp[6] + X86_PMULHW( tg_3_16[0]*tmp[7] );
  //y3 = - tmp[7] + X86_PMULHW( tmp[6]*tg_3_16[0] );
  
  // tg_3_16[] trick
  y2 = X86_PMULHW( tg_3_16[0]*tmp[7] ) + tmp[7];

  //y2 = y2 | _one_corr; // "one correction", nope
  //y2 = y2 + _one_corr; // "one correction", nope

  y2 =   tmp[6] + y2;
  y3 = X86_PMULHW( tmp[6]*tg_3_16[0] ) + tmp[6];

  //y3 = y3 | _one_corr; // "one correction", nope
  //y3 = y3 + _one_corr; // "one correction", nope
  y3 = - tmp[7] + y3;
 
  // 3) Apply E8T.
  //
  //  1  0  1  0
  //  0  1  0  1
  //  0  1  0 -1
  //  1  0 -1  0
  //              1  0  1  0
  //              1  0 -1  0
  //              0  1  0  1
  //              0  1  0 -1
 
 
  e0 = x0 + x2;
  e1 = x1 + x3;
  e2 = x1 - x3;
  e3 = x0 - x2;
  o0 = (y0 + y2) + _one_corr;
  //o0 = (y0 + y2);
  o1 = y0 - y2;
  o2 = y1 + y3;
  o3 = y1 - y3;

  // 4) Apply F8T.
 
 
//  a = (o1 + o2) * cos_4_16[0];
//  b = (o1 - o2) * cos_4_16[0];

  a = (( o1 + o2 ) << 1);
  b = (( o1 - o2 ) << 1);
  o1 = X86_PMULHW( a * ocos_4_16[0] );
  o2 = X86_PMULHW( b * ocos_4_16[0] );
 
  o1 = o1 + _one_corr; // decreases error
  //o2 = o2 + _one_corr; // increases error
  //o2 = o2 | _one_corr; // increases error

  // 5) Apply output butterfly (A8T).
  //
  // 1 0 0 0  1  0  0  0
  // 0 1 0 0  0  1  0  0
  // 0 0 1 0  0  0  1  0
  // 0 0 0 1  0  0  0  1
  // 0 0 0 1  0  0  0 -1
  // 0 0 1 0  0  0 -1  0
  // 0 1 0 0  0 -1  0  0
  // 1 0 0 0 -1  0  0  0
 
  y[0]  = SHIFT_ROUND_COL(e0 + o0);
  y[8]  = SHIFT_ROUND_COL(e1 + o1);
  y[16] = SHIFT_ROUND_COL(e2 + o2);
  y[24] = SHIFT_ROUND_COL(e3 + o3);

  y[32] = SHIFT_ROUND_COL(e3 - o3);
  y[40] = SHIFT_ROUND_COL(e2 - o2);
  y[48] = SHIFT_ROUND_COL(e1 - o1);
  y[56] = SHIFT_ROUND_COL(e0 - o0);
 
  } // end for ( i = 0; i < 8; ++i )

/*
  for ( i = 0; i < 8; ++i )
  {
    //w = &tab_i_01234567[ i * 32 ];
    x = &temp[ 8*i ];
    y = &data[ i ];
    

//    a0 = x[0] * w[0]  + x[2] * w[4]  + x[4] * w[1]  + x[6] * w[5];
//    a1 = x[0] * w[2]  + x[2] * w[6]  + x[4] * w[3]  + x[6] * w[7];
//    a2 = x[0] * w[ 8] + x[2] * w[12] + x[4] * w[9 ] + x[6] * w[13];
//    a3 = x[0] * w[10] + x[2] * w[14] + x[4] * w[11] + x[6] * w[15];

//    b0 = x[1] * w[16] + x[3] * w[20] + x[5] * w[17] + x[7] * w[21];
//    b1 = x[1] * w[18] + x[3] * w[22] + x[5] * w[19] + x[7] * w[23];
//    b2 = x[1] * w[24] + x[3] * w[28] + x[5] * w[25] + x[7] * w[29];
//    b3 = x[1] * w[26] + x[3] * w[30] + x[5] * w[27] + x[7] * w[31];
 

    a0 = x[0] * w[0] + x[2] * w[4]  + x[4] * w[8]  + x[6] * w[12];
    a1 = x[0] * w[1] + x[2] * w[5]  + x[4] * w[9]  + x[6] * w[13];
    a2 = x[0] * w[2] + x[2] * w[6]  + x[4] * w[10] + x[6] * w[14];
    a3 = x[0] * w[3 ]+ x[2] * w[7]  + x[4] * w[11] + x[6] * w[15];

    b0 = x[1] * w[16]+ x[3] * w[20] + x[5] * w[24] + x[7] * w[28];
    b1 = x[1] * w[17]+ x[3] * w[21] + x[5] * w[25] + x[7] * w[29];
    b2 = x[1] * w[18]+ x[3] * w[22] + x[5] * w[26] + x[7] * w[30];
    b3 = x[1] * w[19]+ x[3] * w[23] + x[5] * w[27] + x[7] * w[31];

    y[0] = SHIFT_ROUND_COL ( a0 + b0 );
    y[8] = SHIFT_ROUND_COL ( a1 + b1 );
    y[16] = SHIFT_ROUND_COL ( a2 + b2 );
    y[24] = SHIFT_ROUND_COL ( a3 + b3 );

    y[32] = SHIFT_ROUND_COL ( a3 - b3 );
    y[40] = SHIFT_ROUND_COL ( a2 - b2 );
    y[48] = SHIFT_ROUND_COL ( a1 - b1 );
    y[56] = SHIFT_ROUND_COL ( a0 - b0 );

  } // end for( i = 0; i < 8; ++i )
*/
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?