📄 idct_ap922tdn.cpp
字号:
// mm5 free
PFMULM( mm0, TABLE, 96); // 2d) mm0 <=[x3*w25 x1*w24] (b2a)
;
// mm7 used
movq mm7, mm3; // 2d) mm7 <= [x7 x5] (b3b)
PFMULM( mm3, TABLE, 104); // 2d) mm3 <=[x7*w27 x5*w26] (b2b)
PFACC( mm4, mm1); // 2e) mm4 <= [b1b b1a] (b1)
// mm1 free
PFMULM( mm2, TABLE, 112); // 2d) mm2 <=[x3*w29 x1*w28] (b3a)
PFMULM( mm7, TABLE, 120); // 2d) mm7 <=[x7*w31 x5*w30] (b3b)
PFACC( mm0, mm3); // 2e) mm0 <= [b2b b2a] (b2)
// mm3 free
// mm3 used
movq mm3, [ OUTR]; // 3a) mm3 <= [a1 a0] (fetch from memory)
PFACC( mm6, mm4 ); // 2f) mm6 <=[b1 b0] done
// mm4 free, mm6 = [b1 b0]
PFACC( mm2, mm7); // 2f) mm2 <=[b3b b3a] (b3)
//mm7 free
//mm1 used
movq mm1, mm5; // 3a) mm1 <= [a3 a2]
//mm4 used
movq mm4, mm3; // 3a) mm4 <= [a1 a0]
PFADD( mm3, mm6 ); // 3a) mm3 <= [y1 y0] <= [a1+b1 a0+b0]
add TABLE, 128; // increment table pointer (+32 floats)
PFACC( mm0, mm2); // 2f) mm0 <= [b3 b2] done
// mm2 free, mm0 = [b3 b2]
PFSUB( mm4, mm6 ); // 3a) mm4 <= [y6 y7] <= [a1-b1 a0-b0]
// mm6 free
add INPR, 16; // increment input pointer (+8 shorts)
PFADD( mm5, mm0 ); // 3a) mm5 <= [y3 y2] <= [a3+b3 a2+b2]
movq [ OUTR + 0], mm3; // 3b) store [y1 y0]
// mm3 free
PFSUB( mm1, mm0 ); // 3a) mm1 <= [y4 y5] <= [a3-b3 a2-b2]
// mm0 free
// mm6 used
punpckldq mm6, mm4; // 3b) mm6 = [y7 XX]
punpckhdq mm4, mm6; // 3b) mm4 = [y7 y6] done
// mm6 free
cmp edi, 0x08; // evaluate { x <> 8 }
movq [ OUTR + 8], mm5; // 3b) store [y3 y2]
// mm5 free
punpckldq mm6, mm1; // 3b) mm6 = [y5 XX]
movq [ OUTR + 24], mm4; // 3b) store [y7 y6] done
// mm4 free
punpckhdq mm1, mm6; // 3b) mm1 = [y5 y4]
// mm6 free
movq [ OUTR + 16], mm1; // 3b) store [y5 y4] done
// mm1 free
jl acc_idct_rowloop1; // end for ( x=0; x < 8; x=x+1)
// FEMMS;
// }
//
// done with the iDCT row-transformation
////////////////////////////////////////////////////////////////////////
;/////////////////////////////////////////////////////////////////////
;//
;//
;// AP922float-3DNow iDCT column transform
;//
;//
;/////////////////////////////////////////////////////////////////////
#define SCRATCH esi
#define INPC edx
#define OUTC eax
mov OUTC, dword ptr [data]; ;// row 0
mov edi, 0x00; // x = 0
mov INPC, dword ptr [fTempArray];
sub OUTC, 4; // precompensate out
#define _PRND_INV_ROW TABLE
#define _OCOS4 0
#define _OTAN1 8
#define _OTAN2 16
#define _OTAN3 24
#define _ORND_INV_ROW 32
// TABLE is already set to proper location by the previous idct_row operation
// lea TABLE, [tab_i_col];
mov SCRATCH, INPC; // after we read the first input column, use as scratch
// for ( x = 0; x < 8; x=x+2 ) // process two columns per pass
ALIGN 16
acc_idct_colloop1:
movq mm0, [INPC + 1*32] ; // (1) xmm0 <= dp[4]
add edi, 2; // x = x +2 (two columns per pass)
movq mm2, [INPC + 3*32]; // (1) xmm2 <= dp[6]
movq mm1, [INPC + 7*32]; // (1) xmm1 <= dp[5]
movq mm4, mm0; // (1) xmm4 <= dp[4]
movq mm3, [INPC + 5*32] ; // (1) xmm3 <= dp[7]
movq mm6, mm2; ;// (2a) xmm6 <= dp[6]
;// 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
;// in the idct_row operation. But we still need to apply the
;// shuffling operation of D_T.
;//
;// dp[0] = xc[ 0 *8];
;// dp[1] = xc[ 4 *8];
;// dp[2] = xc[ 2 *8];
;// dp[3] = xc[ 6 *8];
;// dp[4] = xc[ 1 *8];
;// dp[5] = xc[ 7 *8];
;// dp[6] = xc[ 3 *8];
;// dp[7] = xc[ 5 *8];
// 2) Apply B_T
//
// b[4] = dp[4] + ( dp[5]*tg_1_16f );
// b[5] = - dp[5] + ( dp[4]*tg_1_16f );
// b[6] = dp[6] + ( dp[7]*tg_3_16f );
// b[7] = - dp[7] + ( dp[6]*tg_3_16f );
PFMULM( mm4, TABLE, _OTAN1); ;// (2a) xmm4 <= (dp[4]*tg_1_16f)
movq mm5, mm1 ;// (2a) xmm5 <= dp[5]
PFMULM( mm5, TABLE, _OTAN1); ;// (2a) xmm5 <= (dp[5]*tg_1_16f)
movq mm7, mm3; ;// (2a) xmm7 <= dp[7]
PFMULM( mm6, TABLE, _OTAN3); ;// (2a) xmm6 <= (dp[6]*tg_3_16f)
PFSUB( mm4, mm1); ;// (2b) xmm4 <= b[5] = (dp[4]*tg_1_16f) - dp[5]
// xmm1 free
PFMULM( mm7, TABLE, _OTAN3); ;// (2a) xmm7 <= (dp[7]*tg_3_16f)
PFADD( mm5, mm0); ;// (2b) xmm5 <= b[4] = (dp[5]*tg_1_16f) + dp[4]
// xmm0 free
PFSUB( mm6, mm3); ;// (2b) xmm6 <= b[7] = (dp[6]*tg_1_16f) - dp[7]
// xmm3 free
// xmm1 used
movq mm1, mm4; ;// (2b) xmm1 <= b[5]
// xmm3 used
movq mm3, [INPC + 6*32] ;// (2a) xmm3 <= dp[3]
PFADD( mm7, mm2); // (2b) xmm7 <= b[6] = (dp[7]*tg_1_16f) + dp[6]
// xmm2 free
// xmm0 used
movq mm0, mm5; ;// (2b) xmm0 <= b[4]
// 3) Apply E_T
//
// e[4] = b[4] + b[6];
// e[5] = b[4] - b[6];
// e[6] = b[5] + b[7];
// e[7] = b[5] - b[7];
PFADD( mm1, mm6 ); ;// (3) xmm1 <= e[6] = b[5] + b[7]
// xmm2 used
movq mm2, mm3; ;// (2a) xmm2 <= dp[3]
PFSUB( mm5, mm7); ;// (3) xmm5 <= e[5] = b[4] - b[6]
PFMULM( mm3, TABLE, _OTAN2); ;// (2a) xmm3 <= (dp[3]*tg_2_16f)
PFSUB( mm4, mm6); ;// (3) xmm4 <= e[7] = b[5] - b[7]
// xmm6 free
// xmm6 used
movq mm6, [INPC + 2*32]; // (2a) xmm6 <= dp[2]
PFADD( mm0, mm7); ;// (3) xmm0 <= e[4] = b[4] + b[6]
// xmm7 free
// xmm7 used
movq mm7, mm5; ;// (3) xmm7 <= e[5]
// 4) Apply F_T
//
// f[4] = e[4]
// f[5] = (e[5] + e[6]) * cos_4_16f;
// f[6] = (e[5] - e[6]) * cos_4_16f;
// f[7] = e[7]
// xmm0 <= f[4] = e[4]
// xmm4 <= f[7] = e[7]
PFADD( mm5, mm1 ); ;// (4a) xmm5 <= (e[5]+e[6])
PFSUB( mm7, mm1 ); ;// (4a) xmm7 <= (e[5]-e[6])
// xmm1 free
// xmm1 used
movq mm1, [INPC + 0*32]; // (1) xmm1 <= dp[0]
movq [SCRATCH], mm0; ;// store f[4] in scratch space
// xmm0 free (in scratch)
PFMULM( mm5, TABLE, _OCOS4); ;// (4b) xmm5 <= f[5] = (e[5] + e[6]) * cos_4_16f;
PFADD( mm3, mm6); ;// (2b) xmm3 <= b[2] = (dp[3]*tg_2_16f) + dp[2]
// xmm4 f[7] free (in scratch)
PFMULM( mm6, TABLE, _OTAN2); ;// (2a) xmm6 <= (dp[2]*tg_2_16f)
;
PFMULM( mm7, TABLE, _OCOS4); ;// (4b) xmm7 <= f[6] = (e[5] - e[6]) * cos_4_16f;
// xmm0 used
movq mm0, mm1; ;// (1) xmm0 <= dp[0]
PFSUB( mm6, mm2 ); ;// (2b) xmm6 <= b[3] = (dp[2]*tg_2_16f) - dp[3]
// xmm2 free
// xmm2 used
movq mm2, [INPC+4*32]; ;// (1) xmm2 <= dp[1]
PFADD( mm1, mm2 ); ;// (2b) xmm1 <= b[0] = dp[0] + dp[1]
add INPC, 8;
PFSUB( mm0, mm2 ); ;// (2b) xmm0 <= b[1] = dp[0] - dp[1]
// xmm2 free
add OUTC, 4;
// e[0] = b[0] + b[2];
// e[1] = b[1] + b[3];
// e[2] = b[1] - b[3];
// e[3] = b[0] - b[2];
// xmm2 used
movq mm2, mm1; ;// (2b) xmm1 <= b[0]
PFADD( mm1, mm3); ;// (3) xmm1 <= e[0] = b[0] + b[2];
PFSUB( mm2, mm3); ;// (3) xmm3 <= e[3] = b[0] - b[2];
// xmm3 free
// xmm3 used
movq mm3, mm0; ;// (2b) xmm3 <= b[1]
PFADD( mm0, mm6); ;// (3) xmm0 <= e[1] = b[1] + b[3];
;
PFSUB( mm3, mm6); ;// (3) xmm3 <= e[2] = b[1] - b[3];
// xmm6 free
// xmm6 used
movq mm6, mm2; ;// (4) xmm6 <= f[3]
// xmm1 <= f[0]
// xmm0 <= f[1]
// xmm3 <= f[2]
// xmm2 <= f[3]
/*
y[0*8] = SHIFT_ROUND_COLF( _F0 + _F4 );
y[1*8] = SHIFT_ROUND_COLF( _F1 + _F5 );
y[2*8] = SHIFT_ROUND_COLF( _F2 + _F6 );
y[3*8] = SHIFT_ROUND_COLF( _F3 + _F7 );
y[4*8] = SHIFT_ROUND_COLF( _F3 - _F7 );
y[5*8] = SHIFT_ROUND_COLF( _F2 - _F6 );
y[6*8] = SHIFT_ROUND_COLF( _F1 - _F5 );
y[7*8] = SHIFT_ROUND_COLF( _F0 - _F4 );
6a) float -> int32 conversion (cvtps2pi)
pf2id mm0, mm0; // mm0 <= int32 versions of y[1 0]
6b) round compensation, adjust int32 value to correct round alignment
paddd mm0, [rnd_compensation]
6c) stage1 descaling shift
psrad mm0, DESCALE_SHIFT1
6d) int32 -> int16 conversion (packssdw)
packssdw mm0, mm0; // mm0 <= int16 versions of y[1 0 1 0] (duplicates)
6e) stage2 descaling shift (psraw)
psraw mm0, DESCALE_SHIFT2; // mm0 <= final descaled output, {-256,+255}
6f) store results
** The descale1/descale2 shifts are chosen to clip final result to
{-256,+255} (9-bit range)
*/
PFADD( mm2, mm4); ;// (5) xmm2 <= y[3] = f[3] + f[7]
;
PFSUB( mm6, mm4); ;// (5) xmm6 <= y[4] = f[3] - f[7]
// xmm4 free
// xmm4 used
movq mm4, mm0; ;// xmm0 <= f[1]
PF2ID( mm2, mm2); ;// (6a) [y3] <float> -> <int32>
;
PFADD( mm0, mm5); ;// (5) xmm0 <= y[1] = f[1] + f[5]
;
paddd mm2, [ rnd_compensation ] ;// (6b) ROUND compensation
PF2ID( mm0, mm0 ) ;// (6a) [y1] <float> -> <int32>
psrad mm2, DESCALE_SHIFT1;
PFSUB( mm4, mm5); ;// (5) xmm4 <= y[6] = f[1] - f[5]
// xmm5 free
paddd mm0, [ rnd_compensation ] ;// (6b) ROUND compensation
// xmm5 used
movq mm5, mm3; ;// xmm5 <= f[2]
psrad mm0, DESCALE_SHIFT1;
PF2ID( mm6, mm6 ); ;// (6a) [y4] <float> -> <int32>
packssdw mm0, mm2; ;// (6d) mm0 <= [ y[3] | y[1]]
PFADD( mm3, mm7 ); ;// (5) xmm3 <= y[2] = f[2] + f[6]
// xmm2 free
psraw mm0, DESCALE_SHIFT2; // (6e) clip y[3,1] to {-256,+255}
PFSUB( mm5, mm7); ;// (5) xmm5 <= y[5] = f[2] - f[6]
// xmm7 free
paddd mm6, [ rnd_compensation ] ;// (6b) ROUND compensation
PF2ID( mm3, mm3 ); ;// (6a) [y2] <float> -> <int32>
movd [OUTC + 1*16], mm0; l// (6f) store y[1] (store y[3] next)
PF2ID( mm5, mm5 ); ;// (6a) [y5] <float> -> <int32>
movq mm2, [SCRATCH]; ;// xmm2 <= f[4]
psrad mm6, DESCALE_SHIFT1;
punpckhdq mm0, mm0; // (6f) mm0 = [y3c1 y3c0 y3c1 y3c0] <int16>
movq mm7, mm1; ;// xmm7 <= f[0]
paddd mm3, [rnd_compensation]; ;// (6b) ROUND compensation
PFADD( mm1, mm2); ;// (5) xmm2 <= y[0] = f[0] + f[4]
movd [OUTC + 3*16], mm0; // (6f) store y[1] (store y[3] next)
// mm0 free
PFSUB( mm7, mm2); ;// (5) xmm7 <= y[7] = f[0] - f[4]
psrad mm3, DESCALE_SHIFT1;
PF2ID( mm1, mm1) ;// (6a) [y0] <float> -> <int32>
packssdw mm3, mm6; ;// (6d) mm3 <= [ y[4] y[2] ]
PF2ID( mm7, mm7) ;// (6a) [y7] <float> -> <int32>
paddd mm5, [rnd_compensation]; ;// (6b) ROUND compensation
PF2ID( mm4, mm4) ;// (6a) [y6] <float> -> <int32>
paddd mm1, [rnd_compensation]; ;// (6b) ROUND compensation
psraw mm3, DESCALE_SHIFT2; // (6e) clip y[4,2] to {-256,+255}
paddd mm7, [rnd_compensation]; ;// (6b) ROUND compensation
psrad mm5, DESCALE_SHIFT1;
paddd mm4, [rnd_compensation]; ;// (6b) ROUND compensation
psrad mm1, DESCALE_SHIFT1;
movd [ OUTC + 2*16], mm3; // (6f) store y[2] (store y[4] next)
psrad mm4, DESCALE_SHIFT1;
// mm1 free
punpckhdq mm3, mm3; ;//(6f) mm3 = [y4c1 y4c0 y4c1 y4c0] <int16>
psrad mm7, DESCALE_SHIFT1;
packssdw mm5, mm1; ;// (6d) mm5 = [y0 y5] <int16>
packssdw mm7, mm4; ;// (6d) mm7 = [y6 y7] <int16>
movd [ OUTC + 4*16], mm3; // (6f) store y[4]
psraw mm5, DESCALE_SHIFT2; // (6e) clip y[5,0] to {-256,+255}
cmp edi, 8;
psraw mm7, DESCALE_SHIFT2; // (6e) clip y[6,7] to {-256,+255}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -