idctmm32.cpp
来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,707 行 · 第 1/4 页
CPP
1,707 行
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
mov INP, dword ptr [blk]; // load input address
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
movq qword ptr [ INP + ROW_STRIDE * 0 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
// begin reading next quadrant (lower-right)
movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8];
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE * 2], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ INP +ROW_STRIDE * 1], mm1; // store row 2
movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
movq qword ptr [ INP +ROW_STRIDE * 3], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 2) transpose lower-right quadrant
// movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
// movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE*6 + 8]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [OUT + ROW_STRIDE*7 + 8]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
; // slot
movq qword ptr [ INP + ROW_STRIDE*4 + 8], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE*6 + 8], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ INP +ROW_STRIDE*5 + 8], mm1; // store row 2
; // slot
movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
; // slot
movq qword ptr [ INP +ROW_STRIDE*7 + 8], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 3) transpose lower-left
// movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
// movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE * 6 ]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [OUT + ROW_STRIDE * 7 ]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
;//slot
movq qword ptr [ INP + ROW_STRIDE * 0 + 8 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
// begin reading next quadrant (upper-right)
movq mm0, qword ptr [OUT + ROW_STRIDE*0 + 8];
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE * 2 + 8], mm4; // store row 3
movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq qword ptr [ INP +ROW_STRIDE * 1 + 8 ], mm1; // store row 2
movq mm1, qword ptr [OUT + ROW_STRIDE*1 + 8]
movq qword ptr [ INP +ROW_STRIDE * 3 + 8], mm3; // store row 4
punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
// 2) transpose lower-right quadrant
// movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
// movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
// movq mm4, mm0; // mm4 = copy of row1[A B C D]
movq mm2, qword ptr [OUT + ROW_STRIDE*2 + 8]
// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
punpckhwd mm4, mm1; // mm4 = [ 2 6 3 7]
movq mm3, qword ptr [OUT + ROW_STRIDE*3 + 8]
movq mm6, mm2;
punpcklwd mm2, mm3; // mm2 = [ 8 12 9 13]
movq mm1, mm0; // mm1 = [ 0 4 1 5]
punpckhwd mm6, mm3; // mm6 = 10 14 11 15]
movq mm3, mm4; // mm3 = [ 2 6 3 7]
punpckldq mm0, mm2; // final result mm0 = row1 [0 4 8 12]
punpckhdq mm1, mm2; // mm1 = final result mm1 = row2 [1 5 9 13]
; // slot
movq qword ptr [ INP + ROW_STRIDE*4 ], mm0; // store row 1
punpckldq mm4, mm6; // final result mm4 = row3 [2 6 10 14]
movq qword ptr [ INP +ROW_STRIDE*5 ], mm1; // store row 2
punpckhdq mm3, mm6; // final result mm3 = row4 [3 7 11 15]
movq qword ptr [ INP +ROW_STRIDE*6 ], mm4; // store row 3
; // slot
movq qword ptr [ INP +ROW_STRIDE*7 ], mm3; // store row 4
; // slot
} // end __asm
}
//
// public interface, IDCT 8x8 operation
// v0.16B33, the old implementation
void
j_rev_dct( short *blk )
{
// 1) iDCT row transformation
idct_mmx32_rows( blk ); // 1) transform iDCT row, and transpose
// 2) iDCT column transformation
idct_mmx32_cols( blk ); // 2) transform iDCT row, and transpose
// all done
}
*/
const short tab_iz_01234567[] = {
//row0, this row is required
16384, 16384, 16384, 16384,
21407, 8867, -8867, -21407,
16384, -16384, -16384, 16384,
8867, -21407, 21407, -8867,
22725, 19266, 12873, 4520,
19266, -4520, -22725, -12873,
12873, -22725, 4520, 19266,
4520, -12873, 19266, -22725
};
#define SHIFT_ROUND_COL( x ) ( ( x + RND_INV_COL ) >> (SHIFT_INV_COL) )
#define SHIFT_ROUND_ROW( x ) ( ( x + RND_INV_ROW ) >> (SHIFT_INV_ROW) )
const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5
// Regular X86 integer implementation of AP922 MMX iDCT
// NOT WORKING YET
void
idct_ap922int86( short *data )
{
static short temp[64];
static short tmp[8];
static int a0, a1, a2, a3, b0, b1, b2, b3;
static short const _one_corr=0x0001;
short *x; // pointer to input
short *y; // pointer to output
short *ti; // pointer to table
int i,j;
short ev0, ev1, ev2, ev3, od0, od1, od2, od3;
int x0, x1, x2, x3, y0, y1, y2, y3;
int e0, e1, e2, e3, o0, o1, o2, o3;
int a, b;
ti = (short*)&tab_iz_01234567[ 0 ];
for ( i = 0; i < 8; ++i )
{
ti = (short*)&tab_i_01234567[ i * 32 ];
x = &data[ 8*i ];
y = &temp[ 8*i ];
a0 = x[0]*ti[0] + x[4]*ti[1] + x[2]*ti[4] + x[6]*ti[5];
a1 = x[0]*ti[2] + x[4]*ti[3] + x[2]*ti[6] + x[6]*ti[7];
a2 = x[0]*ti[8] + x[4]*ti[9] + x[2]*ti[12] + x[6]*ti[13];
a3 = x[0]*ti[10] + x[4]*ti[11] + x[2]*ti[14] + x[6]*ti[15];
b0 = x[1]*ti[16] + x[5]*ti[17] + x[3]*ti[20] + x[7]*ti[21];
b1 = x[1]*ti[18] + x[5]*ti[19] + x[3]*ti[22] + x[7]*ti[23];
b2 = x[1]*ti[24] + x[5]*ti[25] + x[3]*ti[28] + x[7]*ti[29];
b3 = x[1]*ti[26] + x[5]*ti[27] + x[3]*ti[30] + x[7]*ti[31];
y[0] = SHIFT_ROUND_ROW ( a0 + b0 );
y[1] = SHIFT_ROUND_ROW ( a1 + b1 );
y[2] = SHIFT_ROUND_ROW ( a2 + b2 );
y[3] = SHIFT_ROUND_ROW ( a3 + b3 );
y[4] = SHIFT_ROUND_ROW ( a3 - b3 );
y[5] = SHIFT_ROUND_ROW ( a2 - b2 );
y[6] = SHIFT_ROUND_ROW ( a1 - b1 );
y[7] = SHIFT_ROUND_ROW ( a0 - b0 );
} // end for( i = 0; i < 8; ++i )
// pmulhw/pmulhrw emulation macros
#define X86_PMULHW( X ) ((short) ( ((int)X)>>16 )) //Intel MMX
//#define X86_PMULHRW( X ) ((short) ( ( (((int)X)>>15)+1) >>1) ) //3DNow-MMX
// AP922 iDCT column transform
// Base code courtesy of VirtualDub (Avery Lee),
// http://www.concentric.net/~psilon
// Note, this code is *NOT* working yet.
// The output fails IEEE-1180/1990 tests!!!
for ( i = 0; i < 8; ++i )
{
x = &temp[ i ];
y = &data[ i ];
ev0 = x[0];
ev1 = x[16];
ev2 = x[32];
ev3 = x[48];
od0 = x[8];
od1 = x[24];
od2 = x[40];
od3 = x[56];
// 1) Apply D8T - the cos() coefficients are implicit in row calculation,
// but we still need to shuffle the input data.
tmp[0] = ev0;
tmp[1] = ev2;
tmp[2] = ev1;
tmp[3] = ev3;
tmp[4] = od0;
tmp[5] = od3;
tmp[6] = od1;
tmp[7] = od2;
// 2) Apply B8T.
x0 = (tmp[0] + tmp[1]);
x1 = (- tmp[1] + tmp[0]);
// x2 = x2 | _one_corr; // "one correction", no effect
// x3 = x3 | _one_corr; // "one correction", no effect
// x2 = x2 + _one_corr; // "one correction", no effect
// x3 = x3 + _one_corr; // "one correction", no effect
x2 = tmp[2] + X86_PMULHW( tg_2_16[0]*tmp[3] );
x3 = - tmp[3] + X86_PMULHW( tmp[2]*tg_2_16[0] ) ;
// x2 = x2 | _one_corr; // "one correction", nope
// x2 = x2 + _one_corr; // "one correction", nope
// x3 = x3 | _one_corr; // "one correction", nope
// x3 = x3 + _one_corr; // "one correction", nope
y0 = tmp[4] + X86_PMULHW( tg_1_16[0]*tmp[5] );
y1 = - tmp[5] + X86_PMULHW( tmp[4]*tg_1_16[0] );
//y0 = y0 + _one_corr; // "one correction", nope
// y0 = y0 | _one_corr; // "one correction", nope
// y1 = y1 + _one_corr; // "one correction", nope
// y1 = y1 | _one_corr; // "one correction", nope
//y2 = tmp[6] + X86_PMULHW( tg_3_16[0]*tmp[7] );
//y3 = - tmp[7] + X86_PMULHW( tmp[6]*tg_3_16[0] );
// tg_3_16[] trick
y2 = X86_PMULHW( tg_3_16[0]*tmp[7] ) + tmp[7];
//y2 = y2 | _one_corr; // "one correction", nope
//y2 = y2 + _one_corr; // "one correction", nope
y2 = tmp[6] + y2;
y3 = X86_PMULHW( tmp[6]*tg_3_16[0] ) + tmp[6];
//y3 = y3 | _one_corr; // "one correction", nope
//y3 = y3 + _one_corr; // "one correction", nope
y3 = - tmp[7] + y3;
// 3) Apply E8T.
//
// 1 0 1 0
// 0 1 0 1
// 0 1 0 -1
// 1 0 -1 0
// 1 0 1 0
// 1 0 -1 0
// 0 1 0 1
// 0 1 0 -1
e0 = x0 + x2;
e1 = x1 + x3;
e2 = x1 - x3;
e3 = x0 - x2;
o0 = (y0 + y2) + _one_corr;
//o0 = (y0 + y2);
o1 = y0 - y2;
o2 = y1 + y3;
o3 = y1 - y3;
// 4) Apply F8T.
// a = (o1 + o2) * cos_4_16[0];
// b = (o1 - o2) * cos_4_16[0];
a = (( o1 + o2 ) << 1);
b = (( o1 - o2 ) << 1);
o1 = X86_PMULHW( a * ocos_4_16[0] );
o2 = X86_PMULHW( b * ocos_4_16[0] );
o1 = o1 + _one_corr; // decreases error
//o2 = o2 + _one_corr; // increases error
//o2 = o2 | _one_corr; // increases error
// 5) Apply output butterfly (A8T).
//
// 1 0 0 0 1 0 0 0
// 0 1 0 0 0 1 0 0
// 0 0 1 0 0 0 1 0
// 0 0 0 1 0 0 0 1
// 0 0 0 1 0 0 0 -1
// 0 0 1 0 0 0 -1 0
// 0 1 0 0 0 -1 0 0
// 1 0 0 0 -1 0 0 0
y[0] = SHIFT_ROUND_COL(e0 + o0);
y[8] = SHIFT_ROUND_COL(e1 + o1);
y[16] = SHIFT_ROUND_COL(e2 + o2);
y[24] = SHIFT_ROUND_COL(e3 + o3);
y[32] = SHIFT_ROUND_COL(e3 - o3);
y[40] = SHIFT_ROUND_COL(e2 - o2);
y[48] = SHIFT_ROUND_COL(e1 - o1);
y[56] = SHIFT_ROUND_COL(e0 - o0);
} // end for ( i = 0; i < 8; ++i )
/*
for ( i = 0; i < 8; ++i )
{
//w = &tab_i_01234567[ i * 32 ];
x = &temp[ 8*i ];
y = &data[ i ];
// a0 = x[0] * w[0] + x[2] * w[4] + x[4] * w[1] + x[6] * w[5];
// a1 = x[0] * w[2] + x[2] * w[6] + x[4] * w[3] + x[6] * w[7];
// a2 = x[0] * w[ 8] + x[2] * w[12] + x[4] * w[9 ] + x[6] * w[13];
// a3 = x[0] * w[10] + x[2] * w[14] + x[4] * w[11] + x[6] * w[15];
// b0 = x[1] * w[16] + x[3] * w[20] + x[5] * w[17] + x[7] * w[21];
// b1 = x[1] * w[18] + x[3] * w[22] + x[5] * w[19] + x[7] * w[23];
// b2 = x[1] * w[24] + x[3] * w[28] + x[5] * w[25] + x[7] * w[29];
// b3 = x[1] * w[26] + x[3] * w[30] + x[5] * w[27] + x[7] * w[31];
a0 = x[0] * w[0] + x[2] * w[4] + x[4] * w[8] + x[6] * w[12];
a1 = x[0] * w[1] + x[2] * w[5] + x[4] * w[9] + x[6] * w[13];
a2 = x[0] * w[2] + x[2] * w[6] + x[4] * w[10] + x[6] * w[14];
a3 = x[0] * w[3 ]+ x[2] * w[7] + x[4] * w[11] + x[6] * w[15];
b0 = x[1] * w[16]+ x[3] * w[20] + x[5] * w[24] + x[7] * w[28];
b1 = x[1] * w[17]+ x[3] * w[21] + x[5] * w[25] + x[7] * w[29];
b2 = x[1] * w[18]+ x[3] * w[22] + x[5] * w[26] + x[7] * w[30];
b3 = x[1] * w[19]+ x[3] * w[23] + x[5] * w[27] + x[7] * w[31];
y[0] = SHIFT_ROUND_COL ( a0 + b0 );
y[8] = SHIFT_ROUND_COL ( a1 + b1 );
y[16] = SHIFT_ROUND_COL ( a2 + b2 );
y[24] = SHIFT_ROUND_COL ( a3 + b3 );
y[32] = SHIFT_ROUND_COL ( a3 - b3 );
y[40] = SHIFT_ROUND_COL ( a2 - b2 );
y[48] = SHIFT_ROUND_COL ( a1 - b1 );
y[56] = SHIFT_ROUND_COL ( a0 - b0 );
} // end for( i = 0; i < 8; ++i )
*/
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?