📄 idct_ap922sse_opt.cpp
字号:
// movups [OUTR + 1*16], xmm5; ;// (2b) store [y7 y6 y5 y4
// xmm5 free
cmp edi, 0x07;
// end for ( x = 0; x < 7; ++x ) // transform one row per iteration
jl acc_idct_rowloop1;
// Since the above code integrates processing for rows(x) and (x+1),
// the last row(#7) must be "unrolled" without the (x+1) code.
//
//
/////////////////////////////////////////////
/////////////////////////////////////////////
//
// iDCT row #7
addps xmm1, xmm0; ;// (1b) xmm1 <= a_v0 + a_v1
// xmm0 free
pand mm1, mm7; // (0) mm1 <=[x3 x3]*SCALEUP (dword)
// this prefetch retrieves the TAN1, TAN2, TAN3, COS4 coefficients
// from the upcoming col_idct() operation
prefetcht0 [TABLEP1 + 0*32] ;// x+1 prefetch coeff COL COEFFICIENTS
pshufw mm0, mm4, 01010101b; // (0) mm0 <= [x5 x5 x5 x5]
cvtpi2ps xmm5, mm1; // (0) xmm5 <=[__ __ x3 x3] float
pand mm0, mm7; // (0) mm0 <= [x5 x5]*SCALEUP (dword)
// mm1 free
cvtpi2ps xmm6, mm0; // (0) xmm6 <=[__ __ x5 x5] float
// mm0 free
movlhps xmm4, xmm4; // (0) xmm4 <= [x1 x1 x1 x1]
mulps xmm4, [TABLE + 4*16] ;// (1a) xmm4 <= b_v0
pshufw mm1, mm4, 11111111b; // (0) mm1 <= [x7 x7 x7 x7]
// mm4 free
addps xmm2, xmm1; ;// (1b) xmm2 + a_v2 <= (a_v0+a_v1) + a_v2
// xmm1 free
movlhps xmm5, xmm5; ;// (0) xmm5 <=[x3 x3 x3 x3] float
mulps xmm5, [TABLE + 5*16] ;// (1a) xmm5 <= b_v1
pand mm1, mm7; ;// (0) mm1 <= [x7 x7]*SCALEUP (dword)
cvtpi2ps xmm7, mm1; ;// (0) xmm7 <= [__ __ x7 x7]
// mm1 free
movlhps xmm6, xmm6; // (0) xmm6 <= [x5 x5 x5 x5] float
mulps xmm6, [TABLE + 6*16] ;// (1a) xmm6 <= b_v2
add OUTR, 32; ;// OUT <= OUT + 32 (increment row)
addps xmm3, xmm2; ;// (1b) xmm3 + a_v3 <=(a_v0+a_v1+a_v2) + a_v3
// xmm3 = complete a-vector ( a_v[3..0] )
// xmm2 free
// this prefetch retrieves the TAN1, TAN2, TAN3, COS4 coefficients
// from the upcoming col_idct() operation
prefetcht0 [TABLEP1 + 1*32] ;// prefetch COL coefficients
movlhps xmm7, xmm7;
// TABLEP1 (esi) free
mulps xmm7, [TABLE + 7*16] ;// (1a) xmm7 <= b_v3
addps xmm5, xmm4; ;// (1b) xmm5 <= b_v0 + b_v1
// xmm4 free
;/////////////////////////////////////////////////////////////////////
;//
;//
;// Begin iDCT column transform (columns 0-3)
;//
;//
;/////////////////////////////////////////////////////////////////////
add TABLE, 128; // COL SETUP
mov edi, 0x00; // i = 0; // COL SETUP
// TABLE now points to the proper location, no need to reload with lea
// lea TABLE, [tab_i_01234567ssef];; // COL SETUP
// add TABLE, 1024; ; // COL SETUP// increment TABLE to the COS4 entry
#define SCRATCH esi // column iDCT
#define INPC eax // column iDCT
#define OUTC edx // column iDCT
#define _PCOS4 TABLE
#define _PTAN1 TABLE+16
#define _PTAN2 TABLE+32
#define _PTAN3 TABLE+48
#define _PRND_INV_ROW TABLE+64
mov INPC, dword ptr [fTempArray]; ; // COL SETUP
addps xmm6, xmm5; // (1b) xmm6 + b_v2 <= (b_v0+b_v1) + b_v2
// xmm5 free
// lea SCRATCH, [scratch] ; // COL SETUP
mov SCRATCH, INPC; ;// SCRATCH PAD writes over temp data
// xmm4 used
movaps xmm4, xmm3; // (1b) xmm4 <= a_v[3..0]
movaps xmm0, [INPC + 1*32] ; // col (1) xmm0 <= dp[4]
// xmm5 used
movaps xmm5, xmm3; ;// xmm5 <= a_v[3..0]
// xmm3 free
addps xmm7, xmm6; // (1b) xmm7 + b_v3 <=(b_v0+b_v1+b_v2) + b_v3
// xmm6 free
// xmm7 = complete b-vector ( b_v[3..0] )
;//
// output y[4..7] = a[7..4] + b[7..4], output order must be reversed!
movaps xmm2, [INPC + 3*32]; // col (1) xmm2 <= dp[6]
;
;
movaps xmm3, [INPC + 5*32] ; // col (1) xmm3 <= dp[7]
;
addps xmm4, xmm7; ;// (2a) xmm4 <= a_v[3..0]+b_v[3..0] (y3 y2 y1 y0)
;
;// COLUMN IDCT
;// 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
;// in the idct_row operation. But we still need to apply the
;// shuffling operation of D_T.
;
;
// output y[3..0] = a[3..0] + b[3..0]
subps xmm5, xmm7; ;// (2a) xmm5 <= a_v[3..0]-b_v[3..0] (y4 y5 y6 y7)
// xmm7 free
// fix output order, y[4..7] -> y[7..4]
shufps xmm5, xmm5, 00011011b; // (2a) xmm5 [y7 y6 y5 y4] <- [y4 y5 y6 y7]
;
movaps xmm6, xmm2; ;// col (2a) xmm6 <= dp[6]
// movaps xmm1, [INPC + 7*32]; // col (1) xmm1 <= dp[5]
movaps xmm1, xmm4; // col (1) xmm1 <= dp[5]
// !*@&#@ Visual C++ refuses to align data to 16-byte offset, SIGH
movaps [OUTR + 0*16], xmm4; // (2b) store [y3 y2 y1 y0]
movaps xmm4, xmm0; // col (1) xmm4 <= dp[4]
// movups [OUTR + 0*16], xmm4; // (2b) store [y3 y2 y1 y0]
// xmm4 free
// xmm4 used
// 2) Apply B_T
//
// b[4] = dp[4] + ( dp[5]*tg_1_16f );
// b[5] = - dp[5] + ( dp[4]*tg_1_16f );
// b[6] = dp[6] + ( dp[7]*tg_3_16f );
// b[7] = - dp[7] + ( dp[6]*tg_3_16f );
mulps xmm4, [_PTAN1]; ;// (2a) xmm4 <= (dp[4]*tg_1_16f)
;
movaps [OUTR + 1*16], xmm5; ;// (2b) store [y7 y6 y5 y4]
// movups [OUTR + 1*16], xmm5; ;// (2b) store [y7 y6 y5 y4]
// xmm5 free
// xmm5 used
movaps xmm5, xmm1 ;// (2a) xmm5 <= dp[5]
////////////////////////////////////
// end of iDCT row #7 processing
////////////////////////////////////
mulps xmm5, [_PTAN1]; ;// (2a) xmm5 <= (dp[5]*tg_1_16f)
;
mulps xmm6, [_PTAN3]; ;// (2a) xmm6 <= (dp[6]*tg_3_16f)
movaps xmm7, xmm3; ;// (2a) xmm7 <= dp[7]
subps xmm4, xmm1; ;// (2b) xmm4 <= b[5] = (dp[4]*tg_1_16f) - dp[5]
// xmm1 free
;
mulps xmm7, [_PTAN3]; ;// (2a) xmm7 <= (dp[7]*tg_3_16f)
;
addps xmm5, xmm0; ;// (2b) xmm5 <= b[4] = (dp[5]*tg_1_16f) + dp[4]
// xmm0 free
;
mov OUTC, dword ptr [data]; ;//; // COL SETUP row 0
subps xmm6, xmm3; ;// (2b) xmm6 <= b[7] = (dp[6]*tg_1_16f) - dp[7]
// xmm3 free
// xmm1 used
movaps xmm1, xmm4; ;// (2b) xmm1 <= b[5]
// xmm3 used
movaps xmm3, [INPC + 6*32] ;// (2a) xmm3 <= dp[3]
;
addps xmm7, xmm2; // (2b) xmm7 <= b[6] = (dp[7]*tg_1_16f) + dp[6]
// xmm2 free
// xmm0 used
movaps xmm0, xmm5; ;// (2b) xmm0 <= b[4]
;
// 3) Apply E_T
//
// e[4] = b[4] + b[6];
// e[5] = b[4] - b[6];
// e[6] = b[5] + b[7];
// e[7] = b[5] - b[7];
// xmm2 used
movaps xmm2, xmm3; ;// (2a) xmm2 <= dp[3]
addps xmm1, xmm6; ;// (3) xmm1 <= e[6] = b[5] + b[7]
mulps xmm3, [_PTAN2]; ;// (2a) xmm3 <= (dp[3]*tg_2_16f)
;
subps xmm5, xmm7; ;// (3) xmm5 <= e[5] = b[4] - b[6]
;
subps xmm4, xmm6; ;// (3) xmm4 <= e[7] = b[5] - b[7]
// xmm6 free
// xmm6 used
movaps xmm6, [INPC + 2*32] ;// (2a) xmm6 <= dp[2]
;
addps xmm0, xmm7; ;// (3) xmm0 <= e[4] = b[4] + b[6]
// xmm7 free
// xmm7 used
movaps xmm7, xmm5; ;// (3) xmm7 <= e[5]
// 4) Apply F_T
//
// f[4] = e[4]
// f[5] = (e[5] + e[6]) * cos_4_16f;
// f[6] = (e[5] - e[6]) * cos_4_16f;
// f[7] = e[7]
// xmm0 <= f[4] = e[4]
// xmm4 <= f[7] = e[7]
;
addps xmm5, xmm1; ;// (4a) xmm5 <= (e[5]+e[6])
;
subps xmm7, xmm1; ;// (4a) xmm7 <= (e[5]-e[6])
// xmm1 free
// xmm1 used
movaps xmm1, [INPC + 0*32] ;// (1) xmm1 <= dp[0]
movaps [SCRATCH], xmm0; ;// store f[4] in scratch space
// xmm0 free (in scratch)
mulps xmm5, [_PCOS4] ;// (4b) xmm5 <= f[5] = (e[5] + e[6]) * cos_4_16f;
addps xmm3, xmm6; ;// (2b) xmm3 <= b[2] = (dp[3]*tg_2_16f) + dp[2]
//movups [SCRATCH+3*16], xmm4; ;// store f[7] in scratch space
// xmm4 f[7] free (in scratch)
;
mulps xmm6, [_PTAN2]; ;// (2a) xmm6 <= (dp[2]*tg_2_16f)
;
mulps xmm7, [_PCOS4] ;// (4b) xmm7 <= f[6] = (e[5] - e[6]) * cos_4_16f;
;
subps xmm6, xmm2; ;// (2b) xmm6 <= b[3] = (dp[2]*tg_2_16f) - dp[3]
// xmm2 free
// xmm2 used
//movups [SCRATCH+1*16], xmm5; ;// xmm5 <= store f[5] in scratch space
// xmm5 f[5] free (in scratch)
movaps xmm2, [INPC+4*32]; ;// (1) xmm2 <= dp[1]
// xmm0 used
movaps xmm0, xmm1; ;// (1) xmm0 <= dp[0]
;
addps xmm1, xmm2; ;// (2b) xmm1 <= b[0] = dp[0] + dp[1]
// movups [SCRATCH+2*16], xmm7; ;// xmm7 <= store f[6] in scratch space
// xmm7 f[6] free (in scratch)
;
// movups [INP+1*16], xmm1; ;// temp store
subps xmm0, xmm2; ;// (2b) xmm0 <= b[1] = dp[0] - dp[1]
// xmm2 free
// e[0] = b[0] + b[2];
// e[1] = b[1] + b[3];
// e[2] = b[1] - b[3];
// e[3] = b[0] - b[2];
;
// xmm2 used
movaps xmm2, xmm1; ;// (2b) xmm1 <= b[0]
addps xmm1, xmm3; ;// (3) xmm1 <= e[0] = b[0] + b[2];
;
subps xmm2, xmm3; ;// (3) xmm3 <= e[3] = b[0] - b[2];
// xmm3 free
// xmm3 used
movaps xmm3, xmm0; ;// (2b) xmm3 <= b[1]
;
addps xmm0, xmm6; ;// (3) xmm0 <= e[1] = b[1] + b[3];
;
subps xmm3, xmm6; ;// (3) xmm3 <= e[2] = b[1] - b[3];
// xmm6 free
// xmm6 used
movaps xmm6, xmm2; ;// (4) xmm6 <= f[3]
// xmm1 <= f[0]
// xmm0 <= f[1]
// xmm3 <= f[2]
// xmm2 <= f[3]
/*
y[0*8] = SHIFT_ROUND_COLF( _F0 + _F4 );
y[1*8] = SHIFT_ROUND_COLF( _F1 + _F5 );
y[2*8] = SHIFT_ROUND_COLF( _F2 + _F6 );
y[3*8] = SHIFT_ROUND_COLF( _F3 + _F7 );
y[4*8] = SHIFT_ROUND_COLF( _F3 - _F7 );
y[5*8] = SHIFT_ROUND_COLF( _F2 - _F6 );
y[6*8] = SHIFT_ROUND_COLF( _F1 - _F5 );
y[7*8] = SHIFT_ROUND_COLF( _F0 - _F4 );
6a) round compensation, adjust float value to correct round alignment
addps (ROUND_COMP - 0.5)
6b) float -> int32 conversion (cvtps2pi)
cvtps2pi mm0, xmm0; // mm0 <= int32 versions of y[1 0]
movhlps xmm0, xmm0;
cvtps2pi mm1, xmm0; // mm1 <= int32 versions of y[3 2]
6c) int32 -> int16 conversion (packssdw)
packssdw mm0, mm1; // mm0 <= int16 versions of y[3 2 1 0]
6d) int16 decsale (psraw)
psraw mm0, DESCALE_SHIFT2; // mm0 <= final descaled output, {-256,+255}
** The descale shift is chosen to implement a {-256,+255} output range.
*/
;
addps xmm2, xmm4; ;// (5) xmm2 <= y[3] = f[3] + f[7]
;
subps xmm6, xmm4; ;// (5) xmm6 <= y[4] = f[3] - f[7]
// xmm4 free
// xmm4 used
movaps xmm4, xmm0; ;// xmm0 <= f[1]
;
addps xmm2, [_PRND_INV_ROW]; // (6a) ROUND compensation
;
addps xmm0, xmm5; ;// (5) xmm0 <= y[1] = f[1] + f[5]
;
cvtps2pi mm0, xmm2; ;// (6b) mm0 <= y[3] low
movhlps xmm2, xmm2;
cvtps2pi mm1, xmm2; ;// (6b) mm1 <= y[3] high
;
// xmm2 free
addps xmm0, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
;
subps xmm4, xmm5; ;// (5) xmm4 <= y[6] = f[1] - f[5]
// xmm5 free
// xmm2 used
movaps xmm2, [SCRATCH]; ;// xmm2 <= f[4]
packssdw mm0, mm1; ;// (6c) mm0 <= y[3]
;
cvtps2pi mm2, xmm0; ;// (6b) mm2 <= y[1] low
movhlps xmm0, xmm0;
cvtps2pi mm3, xmm0; ;// (6b) mm3 <= y[1] high
// xmm0 free
// xmm0 used
// xmm5 used
movaps xmm0, [_PRND_INV_ROW];
movaps xmm5, xmm3; ;// xmm5 <= f[2]
addps xmm3, xmm7; ;// (5) xmm3 <= y[2] = f[2] + f[6]
psraw mm0, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
add INPC, 16;
addps xmm6, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
;
packssdw mm2, mm3; ;// (6c) mm2 <= y[1]
;
subps xmm5, xmm7; ;// (5) xmm5 <= y[5] = f[2] - f[6]
// xmm7 free
movq [OUTC + 3*16], mm0; ;// mm0 <= y[3] stored
psraw mm2, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
cvtps2pi mm4, xmm6; ;// (6b) mm4 <= y[4] low
movhlps xmm6, xmm6;
cvtps2pi mm5, xmm6; ;// (6b) mm5 <= y[4] high
// xmm6 free
////////////////////////////////////
// Start of iDCT column 4-7 processing (denote "col+4")
////////////////////////////////////
// xmm6 used
movaps xmm6, [INPC + 3*32]; // col+4 (1) xmm6 <= dp[6]
addps xmm3, xmm0 ;// (6a) ROUND compensation
movq [OUTC + 1*16], mm2; ;// y[1] stored
// xmm7 used
movaps xmm7, xmm1; ;// xmm7 <= f[0]
;
addps xmm1, xmm2; ;// (5) xmm2 <= y[0] = f[0] + f[4]
;
packssdw mm4, mm5; ;// (6c) mm4 <= y[4]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -