📄 idct_ap922sse_rawcode.cpp
字号:
NCOS4_16*RS7, NCOS4_16*RS7, NCOS4_16*RS7, NCOS4_16*RS7,
NCOS2_16*RS7, NCOS6_16*RS7, -NCOS6_16*RS7, -NCOS2_16*RS7,
NCOS4_16*RS7, -NCOS4_16*RS7, -NCOS4_16*RS7, NCOS4_16*RS7,
NCOS6_16*RS7, -NCOS2_16*RS7, NCOS2_16*RS7, -NCOS6_16*RS7,
NCOS1_16*RS7, NCOS3_16*RS7, NCOS5_16*RS7, NCOS7_16*RS7,
NCOS3_16*RS7, -NCOS7_16*RS7, -NCOS1_16*RS7, -NCOS5_16*RS7,
NCOS5_16*RS7, -NCOS1_16*RS7, NCOS7_16*RS7, NCOS3_16*RS7,
NCOS7_16*RS7, -NCOS5_16*RS7, NCOS3_16*RS7, -NCOS1_16*RS7,
// iDCT column coefficients
NCOS4_16, NCOS4_16, NCOS4_16, NCOS4_16,
TANG1_16, TANG1_16, TANG1_16, TANG1_16,
TANG2_16, TANG2_16, TANG2_16, TANG2_16,
TANG3_16, TANG3_16, TANG3_16, TANG3_16,
RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET, RND_FLOAT_OFFSET,
0 , 0 , 0 , 0, // not used, padding
0 , 0 , 0 , 0, // not used, padding
0 , 0 , 0 , 0 // not used, padding
};
/*
const static float cos_4_16f= NCOS4_16;
const static float tg_1_16f = TANG1_16;
const static float tg_2_16f = TANG2_16;
const static float tg_3_16f = TANG3_16;
; The original, unscaled idct coefficient table
; static const short w[32] = {
; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
;
; #define DCT_8_INV_ROW(x, y)
#define BITS_INV_ACC 4 //; 4 or 5 for IEEE
// 5 yields higher accuracy, but lessens dynamic range on the input matrix
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC )
#define RND_INV_ROW (1 << (SHIFT_INV_ROW-1))
#define RND_INV_COL (1 << (SHIFT_INV_COL-1))
#define RND_INV_CORR (RND_INV_COL - 1) //; correction -1.0 and round
//#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1)
//#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1)
#define SHIFT_ROUND_COL( x ) ( ( x + RND_INV_COL ) >> (SHIFT_INV_COL) )
#define SHIFT_ROUND_ROW( x ) ( ( x + RND_INV_ROW ) >> (SHIFT_INV_ROW) )
//.data
//Align 16
//const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW};
const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
//const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5
//#define SHIFT_ROUND_COLF( x ) ( floor( (x)*0.125*0.5 + 0.5 ) )
#define SHIFT_ROUND_COLF( x ) ( floor( (x)*0.5 + 0.5 ) )
#define SHIFT_ROUND_ROWF( x ) ( x )
*/
// externally allocated array of 64 floats, for temporary storage
extern void *fTempArray;
void
idct_ap922float_sse( short *data )
{
static const __int64 mmAndmask = 0xFFFF0000FFFF0000; // [ XX __ XX __ ]
/*
static float a[8], b[8], e[8];
static float dp[8], tmp[2];
short *x; // pointer to input
short *y; // pointer to final output
float *yr, *xc; // pointer to output row
float *ti; // pointer to IDCT row coefficient table
int i;
*/
// AP-922 inverse_DCT row transform (floating point version)
//
// Row IDCT operator :A_T*M_T*P_T
// Let Y=[output column data, 8 elements] 32-bit IEEE-754 float
// X=[input column data, 8 elements] 16-bit short integer
//
// Y= [ A_T*M_T*P_T ] * X
//
// (Y and X are both column vectors)
#define INPR eax
#define TABLE ecx
#define TABLEP1 esi // starting address of (TABLE +32 elements)
#define OUTR edx
__asm {
mov OUTR, dword ptr [fTempArray];
mov edi, 0x00; // i = 0
mov INPR, dword ptr [data]; ;// row 0
sub OUTR, 32; // precompensate OUT
lea TABLEP1, dword ptr [tab_i_01234567ssef]; // row 0
movq mm7, [mmAndmask]; // mm7 <= [ FFFF 0000 FFFF 0000]
mov TABLE, TABLEP1;
add TABLEP1, 128; // initialize
;// mov OUT, INP; // algorithm writes data in-place -> row 0
;// for ( x = 0; x < 8; ++x ) // transform one row per iteration
ALIGN 16 ;// align jump address to 16 byte offset
acc_idct_rowloop1:
;//
;// begin (0)- Convert short -> float data
;// --------------------------------------
;// Step 0a) : pshufw, generate 64-bit vectors [x0 x0 x0 x0],[x1 x1 x1 x1], ...
;// Step 0b) : pand, mask out lower-word of each DWORD,
;// example [x0 x0 x0 x0] -> [x0 __ x0 __]
;// This effectively prescales all x[] inputs by 65536
;// (denoted ROW_PRESCALE_SHIFT)
;// Step 0c) Convert DWORD vectors [X X] into ssefloat vectors [ X X X X]
;// (use cvtpi2ps and movlhps)
;//
movq mm2, [INPR] ; // (0) mm2 <= x3 x2 x1 x0
;//
movq mm4, [INPR+8] ; // (0) mm4 <= x7 x6 x5 x4
pshufw mm0, mm2, 00000000b ;// (0a) mm0 <= x0 x0 x0 x0
pshufw mm1, mm2, 10101010b ;// (0a) mm1 <= x2 x2 x2 x2
pand mm0, mm7; ;// (0b) mm0 = [ x0 x0 ]*SCALEUP (dword)
cvtpi2ps xmm0, mm0; ;// (0c) xmm0 <= [__ __ x0 x0] float
;// mm0 free
pand mm1, mm7; ;// (0b) mm1 = [ x2 x2 ]*SCALEUP (dword)
prefetcht0 [TABLEP1 + 0*32] ;// prefetch coeff +1row, [w7..0]
pshufw mm0, mm4, 00000000b ;// (0a) mm0 <= x4 x4 x4 x4
cvtpi2ps xmm1, mm1; ;// (0c) xmm1 <= [__ __ x2 x2] float
;// mm1 free
pand mm0, mm7; ;// (0b) mm4 <= [x4 x4]*SCALEUP (dword)
movlhps xmm0, xmm0; ;// (0c) xmm0 <= [x0 x0 x0 x0] float
cvtpi2ps xmm2, mm0; ;// (0c) xmm2 <= [__ __ x4 x4] float
;// mm0 free
;// 1) Apply M_T*P_T operator (P_T is implicit with table/element order)
;// (table order has been changed for optimum SSE execution)
;// 1a) mulps to generate a_v0, a_v1, a_v2, a_v3, b_v0, b_v1, b_v2, b_v3
;// 1b) addps to generate a_v[] = a_v0 + a_v1 + a_v2 + a_v3
;// b_v[] = b_v0 + b_v1 + b_v2 + b_v3
mulps xmm0, [TABLE + 0*16] ;// (1a) xmm0 <= a_v0 = [w3..w0] * x0
pshufw mm1, mm4, 10101010b; // (0a) mm1 <= [x6 x6 x6 x6]
pand mm1, mm7; ;// (0b) mm1 <= [x6 x6]*SCALEUP (dword)
movlhps xmm1, xmm1; ;// (0c) xmm1 <=[ x2 x2 x2 x2] float
mulps xmm1, [TABLE + 1*16] ;// (1a) xmm1 <= a_v1 = [w7..w4] * x2
cvtpi2ps xmm3, mm1; ;// (0c) xmm3 <=[ __ __ x6 x6] float
;// mm1 free
movlhps xmm2, xmm2; ;// (0c) xmm2 <= [x4 x4 x4 x4] float
pshufw mm0, mm2, 01010101b; // (0a) mm1 <=[x1 x1 x1 x1]
mulps xmm2, [TABLE + 2*16]; // (1a) xmm2 <= a_v2 = [w11..w8] * x4
pshufw mm1, mm2, 11111111b; // (0a) mm1 <=[x3 x3 x3 x3]
;// mm2 free
pand mm0, mm7; // (0b) mm0 <=[x1 x1]*SCALEUP (dword)
movlhps xmm3, xmm3; ;// (0) xmm3 <= [x6 x6 x6 x6]
mulps xmm3, [TABLE + 3*16] ; // (1a) xmm3 <= a_v3 = [w15..w12] * x6
cvtpi2ps xmm4, mm0; // (0c) xmm4 <=[__ __ x1 x1] float
;// mm0 free
addps xmm1, xmm0; ;// (1b) xmm1 <= a_v0 + a_v1
;// xmm0 free
pand mm1, mm7; // (0b) mm1 <=[x3 x3]*SCALEUP (dword)
prefetcht0 [TABLEP1 + 1*32] ;// prefetch coeff +1row [w15..8]
pshufw mm0, mm4, 01010101b; // (0a) mm0 <= [x5 x5 x5 x5]
cvtpi2ps xmm5, mm1; // (0c) xmm5 <=[__ __ x3 x3] float
pand mm0, mm7; // (0b) mm0 <= [x5 x5]*SCALEUP (dword)
;// mm1 free
cvtpi2ps xmm6, mm0; // (0c) xmm6 <=[__ __ x5 x5] float
;// mm0 free
movlhps xmm4, xmm4; // (0c) xmm4 <= [x1 x1 x1 x1]
mulps xmm4, [TABLE + 4*16] ;// (1a) xmm4 <= b_v0
pshufw mm1, mm4, 11111111b; // (0a) mm1 <= [x7 x7 x7 x7]
;// mm4 free
addps xmm2, xmm1; ;// (1b) xmm2 + a_v2 <= (a_v0+a_v1) + a_v2
;// xmm1 free
movlhps xmm5, xmm5; ;// (0c) xmm5 <=[x3 x3 x3 x3] float
mulps xmm5, [TABLE + 5*16] ;// (1a) xmm5 <= b_v1
pand mm1, mm7; ;// (0b) mm1 <= [x7 x7]*SCALEUP (dword)
cvtpi2ps xmm7, mm1; ;// (0c) xmm7 <= [__ __ x7 x7]
;// mm1 free
movlhps xmm6, xmm6; // (0c) xmm6 <= [x5 x5 x5 x5] float
mulps xmm6, [TABLE + 6*16] ;// (1a) xmm6 <= b_v2
add OUTR, 32; ;// OUT <= OUT + 32 (increment row)
addps xmm3, xmm2; ;// (1b) xmm3 + a_v3 <=(a_v0+a_v1+a_v2) + a_v3
;// xmm3 = complete a-vector ( a_v[] )
;// xmm2 free
add INPR, 16; // increment INP(psrc) + 1row
prefetcht0 [TABLEP1 + 2*32] ;// prefetch coeff +1row [w23..15]
movlhps xmm7, xmm7;
mulps xmm7, [TABLE + 7*16] ;// (1a) xmm7 <= b_v3
addps xmm5, xmm4; ;// (1b) xmm5 <= b_v0 + b_v1
;// xmm4 free
add TABLE, 128; // move TABLE to w[32]
add edi, 1; // i = i + 1 (loop index)
prefetcht0 [TABLEP1 + 3*32] ;// prefetch coeff +1row [w31..24]
add TABLEP1, 128; // move TABLEP1 to w[32]
movaps xmm2, xmm3; // (1b) xmm2 <= a_v[3..0]
addps xmm6, xmm5; // (1b) xmm6 + b_v2 <=(b_v0+b_v1) + b_v2
;// xmm5 free
addps xmm7, xmm6; // (1b) xmm7 + b_v3 <=(b_v0+b_v1+b_v2) + b_v3
;// xmm6 free
;// xmm7 = complete b-vector ( b_v[] )
;// 2) Apply A_T operator to generate outputs y[7..0]
;// 2a) y[3..0] = a_v[] + b_v[]
;// 2b) *y[4..7] = a_v[] - b_v[] (* requires shufps to invert order)
;// 2c) store y[] to temporary memory
;// output *y[4..7] = a[7..4] + b[7..4], output order must be reversed!
subps xmm3, xmm7; ;// 2b) xmm3 <= a_v[3..0]-b_v[3..0] (y4 y5 y6 y7)
;// output y[3..0] = a[3..0] + b[3..0]
addps xmm2, xmm7; ;// 2a) xmm2 <= a_v[3..0]+b_v[3..0] (y3 y2 y1 y0)
;// xmm7 free
;// fix output order, y[4..7] -> y[7..4]
shufps xmm3, xmm3, 00011011b;// 2b) xmm3 [y7 y6 y5 y4] <- [y4 y5 y6 y7]
;// !*@&#@ Visual C++ refuses to align data to 16-byte offset, SIGH
;// movups [OUT + 0*16], xmm2; ;// 2c) store [y3 y2 y1 y0]
movaps [OUTR + 0*16], xmm2; ;// 2c) store [y3 y2 y1 y0]
;// xmm2 free
;// movups [OUT + 1*16], xmm3; ;// 2c) store [y7 y6 y5 y4]
movaps [OUTR + 1*16], xmm3; ;// 2c) store [y7 y6 y5 y4]
;// xmm3 free
cmp edi, 0x08;
;// end for ( x = 0; x < 8; ++x ) // transform one row per iteration
jl acc_idct_rowloop1;
;/////////////////////////////////////////////////////////////////////
;//
;//
;// iDCT column transform
;//
;//
;/////////////////////////////////////////////////////////////////////
;// 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
;// in the idct_row operation. But we still need to apply the
;// shuffling operation of D_T.
;//
;// 1 0 0 0 0 0 0 0
;// 0 0 0 0 1 0 0 0
;// 0 0 1 0 0 0 0 0
;// 0 0 0 0 0 0 1 0
;// 0 1 0 0 0 0 0 0
;// 0 0 0 0 0 0 0 1
;// 0 0 0 1 0 0 0 0
;// 0 0 0 0 0 1 0 0
;// dp[0] = xc[ 0 *8];
;// dp[1] = xc[ 4 *8];
;// dp[2] = xc[ 2 *8];
;// dp[3] = xc[ 6 *8];
;// dp[4] = xc[ 1 *8];
;// dp[5] = xc[ 7 *8];
;// dp[6] = xc[ 3 *8];
;// dp[7] = xc[ 5 *8];
#define SCRATCH esi
#define INPC edx
#define OUTC eax
mov OUTC, dword ptr [data]; ;// row 0
mov edi, 0x00; // i = 0
mov INPC, dword ptr [fTempArray];
// sub OUT, 16; // precompensate OUT
#define _PCOS4 TABLE
#define _PTAN1 TABLE+16
#define _PTAN2 TABLE+32
#define _PTAN3 TABLE+48
#define _PRND_INV_ROW TABLE+64
lea TABLE, [tab_i_01234567ssef];
mov SCRATCH, INPC; // after we read the first input column, use as scratch
add TABLE, 1024; // increment TABLE to the COS4 entry
//
ALIGN 16
acc_idct_colloop1:
movaps xmm0, [INPC + 1*32] ; // (1) xmm0 <= dp[4]
add edi, 1;
;
movaps xmm2, [INPC + 3*32]; // (1) xmm2 <= dp[6]
;
movaps xmm1, [INPC + 7*32]; // (1) xmm1 <= dp[5]
movaps xmm4, xmm0; // (1) xmm4 <= dp[4]
;
movaps xmm3, [INPC + 5*32] ; // (1) xmm3 <= dp[7]
movaps xmm6, xmm2; ;// (2a) xmm6 <= dp[6]
// 2) Apply B_T
//
// b[4] = dp[4] + ( dp[5]*tg_1_16f );
// b[5] = - dp[5] + ( dp[4]*tg_1_16f );
// b[6] = dp[6] + ( dp[7]*tg_3_16f );
// b[7] = - dp[7] + ( dp[6]*tg_3_16f );
;
mulps xmm4, [_PTAN1]; ;// (2a) xmm4 <= (dp[4]*tg_1_16f)
movaps xmm5, xmm1 ;// (2a) xmm5 <= dp[5]
;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -