📄 idct_ap922hybr.cpp
字号:
cmp edi, 8;
psraw mm7, DESCALE_SHIFT2; // (6e) clip y[6,7] to {-256,+255}
movd [ OUTC + 5*16], mm5; // (6f) store y[5]
punpckhdq mm5, mm5; ;// (6f) mm3 = [y0c1 y0c0 y0c1 y0c0] <int16>
movd [ OUTC + 7*16], mm7; // (6f) store y[7]
punpckhdq mm7, mm7; ;// (6f) mm3 = [y6c1 y6c0 y6c1 y6c0] <int16>
movd [ OUTC + 0*16], mm5; // (6f) store y[0]
movd [ OUTC + 6*16], mm7; // (6f) store y[6]
jl acc_idct_colloop1;
// end for ( x=0; x < 8; x=x+2 )
FEMMS;
}
}
*/
////////////////////////////////////////////////////////////////////////
//
//
// AP922hybr_x87(), conventional x87 implementation of hybrid AP922
//
//
////////////////////////////////////////////////////////////////////////
void
idct_ap922hybr_x87( short *data )
{
// static tdnfloat fTempArray[64]; // intermediate (row-iDCT output) matrix
static int ar[8], br[8];
static tdnfloat b[8], e[8], dp[8]; // intermediate results
static tdnfloat tmp[2]; // temp calculation, scratch pad
static short xi[8]; // temp 16-bit ints for initial P_T operator
// static short const _one_corr=0x0001;
short *ti; // pointer to IDCT row coefficient table
short *x; // pointer to input
short *y; // pointer to final output
tdnfloat *yr, *xc; // pointer to output row
int i;
////////////////////////////////////////////////////////////////////////
//
// AP922 iDCT row transform (int32 version)
// Core transform code is very similar to the original AP922mmx iDCT_row
// except full 32-bit output is preserved (instead of shift/rnd to 16-bit)
//
//
// Row IDCT operator :A_T*M_T*P_T
// Let Y=[output column data, 8 elements] 32-bit IEEE-754 float
// X=[input column data, 8 elements] 16-bit short integer
//
// Y= [ A_T*M_T*P_T ] * X
//
// (Y and X are both column vectors)
for ( i = 0; i < 8; ++i ) // row iDCT
{
ti = (short*)&tab_i_01234567hybr[ i * 32 ];
x = &data[ 8*i ];
yr = ((tdnfloat *) fTempArray) + 8*i; // intermediate output, select row#i
//
// 1) Apply M_T * P_T operator
//
// The operation "output(32-bit) = x(16-bit) * y(16-bit)" produces a
// 32-bit result. Later, the 32-bit result is converted to float,
// which means the prior integer-mult effectively scales up the
// result by 65536 (left shift 16.)
// ar[3..0], br[3..0] = 32-bit integers
//
// The AP922hybrid iDCT_row computes ar[3..0] & br[3..0] without
// any post-shift/round. As a result, the pmaddwd/paddd sequence
// has the side-effect of upscaling by a factor of 2 (left shift by 1.)
//
// This effect is reflected in the constant PSCALE_SHIFT, which has
// been adjusted upward from 16 to 17.
ar[0] = x[0] * ti[0] + x[4] * ti[1] + x[2] * ti[4] + x[6] * ti[5];
ar[1] = x[0] * ti[2] + x[4] * ti[3] + x[2] * ti[6] + x[6] * ti[7];
ar[2] = x[0] * ti[8] + x[4] * ti[9] + x[2] * ti[12] + x[6] * ti[13];
ar[3] = x[0] * ti[10] + x[4] * ti[11] + x[2] * ti[14] + x[6] * ti[15];
br[0] = x[1] * ti[16] + x[5] * ti[17] + x[3] * ti[20] + x[7] * ti[21];
br[1] = x[1] * ti[18] + x[5] * ti[19] + x[3] * ti[22] + x[7] * ti[23];
br[2] = x[1] * ti[24] + x[5] * ti[25] + x[3] * ti[28] + x[7] * ti[29];
br[3] = x[1] * ti[26] + x[5] * ti[27] + x[3] * ti[30] + x[7] * ti[31];
// 2) Apply A_T operator, store outputs
//
// 1 0 0 0 1 0 0 0
// 0 1 0 0 0 1 0 0
// 0 0 1 0 0 0 1 0
// 0 0 0 1 0 0 0 1
// 0 0 0 1 0 0 0 -1
// 0 0 1 0 0 0 -1 0
// 0 1 0 0 0 -1 0 0
// 1 0 0 0 -1 0 0 0
// The output is 32-bit integer. The 32-bit int is converted to
// float and stored in a temporary array,
yr[0] = (float)( ar[0] + br[0] ); // store as 32-bit int
yr[1] = (float)( ar[1] + br[1] ); // store as 32-bit int
yr[2] = (float)( ar[2] + br[2] ); // store as 32-bit int
yr[3] = (float)( ar[3] + br[3] ); // store as 32-bit int
yr[4] = (float)( ar[3] - br[3] ); // store as 32-bit int
yr[5] = (float)( ar[2] - br[2] ); // store as 32-bit int
yr[6] = (float)( ar[1] - br[1] ); // store as 32-bit int
yr[7] = (float)( ar[0] - br[0] ); // store as 32-bit int
} // end for( i = 0; i < 8; ++i ) // end of row iDCT
// AP922hybr iDCT row transform done
//
////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//
// Column IDCT operator :A_T*(F_T*E_T*B_T*D_T)*P_T
// AP922hybrid and AP922float share *identical* column_idct operators
//
// (minor difference in the final descale_shift :
// 17bits instead of 16bits.)
//
// Let Y=[output column data, 8 elements], 16-bit short integer
// X=[input column data, 8 elements], 32-bit IEEE-754 float
//
// Y= [ A_T*(F_T*E_T*B_T*D_T)*P_T ] * X
//
// (Y and X are both column vectors)
for ( i = 0; i < 8; ++i ) // column iDCT
{
xc = ((tdnfloat *) fTempArray) + i; // select column #i
y = &data[ i ];
// 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
// in the idct_row operation. But we still need to apply the
// shuffling operation of D_T.
//
// 1 0 0 0 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 0 0 0 1 0
// 0 1 0 0 0 0 0 0
// 0 0 0 0 0 0 0 1
// 0 0 0 1 0 0 0 0
// 0 0 0 0 0 1 0 0
dp[0] = xc[ 0 *8];
dp[1] = xc[ 4 *8];
dp[2] = xc[ 2 *8];
dp[3] = xc[ 6 *8];
dp[4] = xc[ 1 *8];
dp[5] = xc[ 7 *8];
dp[6] = xc[ 3 *8];
dp[7] = xc[ 5 *8];
// 2) Apply B_T
//
// 1 1 0 0
// 1 -1 0 0
// 0 0 1 t2
// 0 0 t2 -1
// 1 t1 0 0
// t1 -1 0 0
// 0 0 1 t3
// 0 0 t3 -1
b[0] = dp[1] + dp[0];
b[1] = dp[0] - dp[1];
b[2] = ( dp[3]*tg_2_16f ) + dp[2];
b[3] = ( dp[2]*tg_2_16f ) - dp[3];
b[4] = ( dp[5]*tg_1_16f ) + dp[4];
b[5] = ( dp[4]*tg_1_16f ) - dp[5];
b[6] = ( dp[7]*tg_3_16f ) + dp[6];
b[7] = ( dp[6]*tg_3_16f ) - dp[7];
// 3) Apply E_T
//
// 1 0 1 0
// 0 1 0 1
// 0 1 0 -1
// 1 0 -1 0
// 1 0 1 0
// 1 0 -1 0
// 0 1 0 1
// 0 1 0 -1
e[0] = b[0] + b[2];
e[1] = b[1] + b[3];
e[2] = b[1] - b[3];
e[3] = b[0] - b[2];
e[4] = b[4] + b[6];
e[5] = b[4] - b[6];
e[6] = b[5] + b[7];
e[7] = b[5] - b[7];
// 4) Apply F_T
//
// 1 0 0 0
// 0 1 0 0
// 0 0 1 0
// 0 0 0 1
// 1 0 0 0
// 0 1 0 0
// 0 0 1 0
// 0 0 0 1
#define _F0 e[0]
#define _F1 e[1]
#define _F2 e[2]
#define _F3 e[3]
#define _F4 e[4]
#define _F5 e[5]
#define _F6 e[6]
#define _F7 e[7]
tmp[0] = (e[5] + e[6]) * cos_4_16f;
tmp[1] = (e[5] - e[6]) * cos_4_16f;
_F5 = tmp[0];
_F6 = tmp[1];
// 5) Apply A_T
//
// 1 0 0 0 1 0 0 0
// 0 1 0 0 0 1 0 0
// 0 0 1 0 0 0 1 0
// 0 0 0 1 0 0 0 1
// 0 0 0 1 0 0 0 -1
// 0 0 1 0 0 0 -1 0
// 0 1 0 0 0 -1 0 0
// 1 0 0 0 -1 0 0 0
//
// yfloat[0]= F0 + F4
// yfloat[1]= F1 + F5
// yfloat[2]= F2 + F6
// yfloat[3]= F3 + F7
//
// yfloat[4]= F3 - F7
// yfloat[5]= F2 - F6
// yfloat[6]= F1 - F5
// yfloat[7]= F0 - F4
//
//
// 6) float -> int, shift/round to final output y[]
// The final shift&round operation reverses the row-input prescaling.
// It also applies the chosen rounding-mode (accurate or fast.)
//
// Note, the C-code below differs *substantially* from the AMD_3DNOW
// implementation. The 3D_Now code applies this basic sequence:
//
// ;// mm0 = [y1 y0] <float32>
// ;// mm1 = [y3 y2] <float32>
//
// PF2ID mm0, mm0; // mm0 <= [y1 y0] <int32>
// PF2ID mm1, mm1; // mm1 <= [y3 y2] <int32>
//
// ;// "0.5" is a 32-bit integer constant scaled up by some bitshift
// paddd mm0, [rnd_compensation]; // mm0 <=[y1+"0.5" y0+"0.5"]
// paddd mm1, [rnd_compensation]; // mm0 <=[y3+"0.5" y2+"0.5"]
//
// psrad mm0, DESCALE_SHIFT1; // stage1 shift
// psrad mm1, DESCALE_SHIFT1; // stage1 shift
//
// packssdw mm0, mm1; // mm0 <= [y3 y2 y1 y0] <int16>
//
// psraw mm0, DESCALE_SHIFT2; // clip y[] to the range {-256,+255}
//
// ;// DESCALE_SHIFT1 + DESCALE_SHIFT2 = PRESCALE_SHIFT
// ;// 10 + 7 = 16+1
//
// movq [OUTC+...*8], mm0;
y[0*8] = (short)SHIFT_ROUND_COLF( _F0 + _F4 );
y[1*8] = (short)SHIFT_ROUND_COLF( _F1 + _F5 );
y[2*8] = (short)SHIFT_ROUND_COLF( _F2 + _F6 );
y[3*8] = (short)SHIFT_ROUND_COLF( _F3 + _F7 );
y[4*8] = (short)SHIFT_ROUND_COLF( _F3 - _F7 );
y[5*8] = (short)SHIFT_ROUND_COLF( _F2 - _F6 );
y[6*8] = (short)SHIFT_ROUND_COLF( _F1 - _F5 );
y[7*8] = (short)SHIFT_ROUND_COLF( _F0 - _F4 );
} // end for ( i = 0; i < 8; ++i ) // end of SSEfloat column iDCT
// AP922hybr iDCT column transform done
//
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Post transform clipping
//
// In standard-C, the output clip code adds significantly to
// execution time.
#define IDCT_CLIP_LOW -256 // IDCT output range is 9-bits
#define IDCT_CLIP_HIGH 255 // IDCT output range is 9-bits
for ( i = 0; i < 64; ++i )
{ // clip output to {-256,+255}
if ( data[ i ] < IDCT_CLIP_LOW )
data[ i ] = IDCT_CLIP_LOW;
if ( data[ i ] > IDCT_CLIP_HIGH )
data[ i ] = IDCT_CLIP_HIGH;
}
}
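// ---------------------------------------------------------------------
// NOTE: the lines below are residue from the web code viewer this listing
// was copied from; they are not part of idct_ap922hybr.cpp.
//
// Keyboard shortcuts
//   Copy code              Ctrl + C
//   Search code            Ctrl + F
//   Full-screen mode       F11
//   Toggle theme           Ctrl + Shift + D
//   Show shortcuts         ?
//   Increase font size     Ctrl + =
//   Decrease font size     Ctrl + -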