📄 idct_ap922sse_opt.cpp
字号:
;
cvtps2pi mm6, xmm3; ;// (6b) mm6 <= y[2] low
movhlps xmm3, xmm3;
cvtps2pi mm7, xmm3; ;// (6b) mm7 <= y[2] high
// xmm3 free
// xmm3 used
movaps xmm3, [INPC + 5*32] ; // col+4 (1) xmm3 <= dp[7]
subps xmm7, xmm2; ;// (5) xmm7 <= y[7] = f[0] - f[4]
// xmm2 free
// xmm2 used
movaps xmm2, xmm6; ;// col+4 (2a) xmm2 <= dp[6]
// 2) Apply B_T (for columns 4-7)
//
// b[4] = dp[4] + ( dp[5]*tg_1_16f );
// b[5] = - dp[5] + ( dp[4]*tg_1_16f );
// b[6] = dp[6] + ( dp[7]*tg_3_16f );
// b[7] = - dp[7] + ( dp[6]*tg_3_16f );
psraw mm4, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
mulps xmm6, [_PTAN3]; ;// col+4 (2a) xmm6 <= (dp[6]*tg_3_16f)
addps xmm5, xmm0; ;// (6a) ROUND compensation
movq [OUTC + 4*16], mm4; ;// y[4] stored
packssdw mm6, mm7; ;// (6c) mm6 <= y[2]
;
addps xmm4, xmm0 ;// (6a) ROUND compensation
;
psraw mm6, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
cvtps2pi mm0, xmm5; ;// (6b) mm0 <= y[5] low
movhlps xmm5, xmm5;
cvtps2pi mm1, xmm5; ;// (6b) mm1 <= y[5] high
// xmm5 free
// xmm5 used
movaps xmm5, [INPC + 7*32]; // col+4 (1) xmm5 <= dp[5]
cvtps2pi mm2, xmm4; ;// (6b) mm2 <= y[6] low
movhlps xmm4, xmm4;
cvtps2pi mm3, xmm4; ;// (6b) mm3 <= y[6] high
// xmm4 free
// xmm4 used
movaps xmm4, [INPC + 1*32] ; // col+4 (1) xmm4 <= dp[4]
movq [OUTC + 2*16], mm6; ;// y[2] stored
addps xmm1, xmm0 ;// (6a) ROUND compensation
packssdw mm0, mm1; ;// (6c) mm0 <= y[5]
;
addps xmm7, xmm0 ;// (6a) ROUND compensation
// xmm0 free
// xmm0 used
movaps xmm0, xmm4; // col+4 (1) xmm0 <= dp[4]
psraw mm0, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
mulps xmm4, [_PTAN1]; ;// col+4 (2a) xmm4 <= (dp[4]*tg_1_16f)
cvtps2pi mm4, xmm1; ;// (6b) mm4 <= y[0] low
movhlps xmm1, xmm1;
packssdw mm2, mm3; ;// (6c) mm2 <= y[6]
;
cvtps2pi mm5, xmm1; ;// (6b) mm5 <= y[0] high
// xmm1 free
movq [OUTC + 5*16], mm0;
// xmm1 used
movaps xmm1, xmm5 ;// col+4 (2a) xmm1 <= dp[5]
psraw mm2, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
cvtps2pi mm6, xmm7; ;// (6b) mm6 <= y[7] low
movhlps xmm7, xmm7;
cvtps2pi mm7, xmm7; ;// (6b) mm7 <= y[7] high
// xmm7 free
mulps xmm5, [_PTAN1]; ;// col+4 (2a) xmm5 <= (dp[5]*tg_1_16f)
subps xmm4, xmm1; ;// col+4 (2b) xmm4 <= b[5] = (dp[4]*tg_1_16f) - dp[5]
// xmm1 free
movaps xmm7, xmm3; ;// col+4 (2a) xmm7 <= dp[7]
packssdw mm4, mm5; ;// (6c) mm4 <= y[0]
mulps xmm7, [_PTAN3]; ;// col+4 (2a) xmm7 <= (dp[7]*tg_3_16f)
subps xmm6, xmm3; ;// col+4 (2b) xmm6 <= b[7] = (dp[6]*tg_1_16f) - dp[7]
// xmm3 free
movq [OUTC + 6*16], mm2;
// xmm3 used
movaps xmm3, [INPC + 6*32] ;// col+4 (2a) xmm3 <= dp[3]
psraw mm4, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
// xmm1 used
movaps xmm1, xmm4; ;// col+4 (2b) xmm1 <= b[5]
addps xmm5, xmm0; ;// col+4 (2b) xmm5 <= b[4] = (dp[5]*tg_1_16f) + dp[4]
movq [OUTC + 0*16], mm4;
packssdw mm6, mm7; ;// (6c) mm6 <= y[7]
psraw mm6, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
addps xmm7, xmm2; // col+4 (2b) xmm7 <= b[6] = (dp[7]*tg_1_16f) + dp[6]
// xmm2 free
// xmm0 used
movaps xmm0, xmm5; ;// col+4 (2b) xmm0 <= b[4]
// 3) Apply E_T
//
// e[4] = b[4] + b[6];
// e[5] = b[4] - b[6];
// e[6] = b[5] + b[7];
// e[7] = b[5] - b[7];
// xmm2 used
movaps xmm2, xmm3; ;// col+4 (2a) xmm2 <= dp[3]
addps xmm1, xmm6; ;// col+4 (3) xmm1 <= e[6] = b[5] + b[7]
movq [OUTC + 7*16], mm6;
////////////////////////////////////
// end of iDCT column 0-3 processing
////////////////////////////////////
mulps xmm3, [_PTAN2]; ;// (2a) xmm3 <= (dp[3]*tg_2_16f)
add OUTC, 8; ;// advance output to columns 4-7
subps xmm5, xmm7; ;// (3) xmm5 <= e[5] = b[4] - b[6]
;
subps xmm4, xmm6; ;// (3) xmm4 <= e[7] = b[5] - b[7]
// xmm6 free
// xmm6 used
movaps xmm6, [INPC + 2*32] ;// (2a) xmm6 <= dp[2]
;
addps xmm0, xmm7; ;// (3) xmm0 <= e[4] = b[4] + b[6]
// xmm7 free
// xmm7 used
movaps xmm7, xmm5; ;// (3) xmm7 <= e[5]
// 4) Apply F_T
//
// f[4] = e[4]
// f[5] = (e[5] + e[6]) * cos_4_16f;
// f[6] = (e[5] - e[6]) * cos_4_16f;
// f[7] = e[7]
// xmm0 <= f[4] = e[4]
// xmm4 <= f[7] = e[7]
;
addps xmm5, xmm1; ;// (4a) xmm5 <= (e[5]+e[6])
;
subps xmm7, xmm1; ;// (4a) xmm7 <= (e[5]-e[6])
// xmm1 free
// xmm1 used
movaps xmm1, [INPC + 0*32] ;// (1) xmm1 <= dp[0]
movaps [SCRATCH], xmm0; ;// store f[4] in scratch space
// xmm0 free (in scratch)
mulps xmm5, [_PCOS4] ;// (4b) xmm5 <= f[5] = (e[5] + e[6]) * cos_4_16f;
addps xmm3, xmm6; ;// (2b) xmm3 <= b[2] = (dp[3]*tg_2_16f) + dp[2]
//movups [SCRATCH+3*16], xmm4; ;// store f[7] in scratch space
// xmm4 f[7] free (in scratch)
;
mulps xmm6, [_PTAN2]; ;// (2a) xmm6 <= (dp[2]*tg_2_16f)
;
mulps xmm7, [_PCOS4] ;// (4b) xmm7 <= f[6] = (e[5] - e[6]) * cos_4_16f;
;
subps xmm6, xmm2; ;// (2b) xmm6 <= b[3] = (dp[2]*tg_2_16f) - dp[3]
// xmm2 free
// xmm2 used
//movups [SCRATCH+1*16], xmm5; ;// xmm5 <= store f[5] in scratch space
// xmm5 f[5] free (in scratch)
movaps xmm2, [INPC+4*32]; ;// (1) xmm2 <= dp[1]
// xmm0 used
movaps xmm0, xmm1; ;// (1) xmm0 <= dp[0]
;
addps xmm1, xmm2; ;// (2b) xmm1 <= b[0] = dp[0] + dp[1]
// movups [SCRATCH+2*16], xmm7; ;// xmm7 <= store f[6] in scratch space
// xmm7 f[6] free (in scratch)
;
// movups [INP+1*16], xmm1; ;// temp store
subps xmm0, xmm2; ;// (2b) xmm0 <= b[1] = dp[0] - dp[1]
// xmm2 free
// e[0] = b[0] + b[2];
// e[1] = b[1] + b[3];
// e[2] = b[1] - b[3];
// e[3] = b[0] - b[2];
;
// xmm2 used
movaps xmm2, xmm1; ;// (2b) xmm1 <= b[0]
addps xmm1, xmm3; ;// (3) xmm1 <= e[0] = b[0] + b[2];
;
subps xmm2, xmm3; ;// (3) xmm3 <= e[3] = b[0] - b[2];
// xmm3 free
// xmm3 used
movaps xmm3, xmm0; ;// (2b) xmm3 <= b[1]
;
addps xmm0, xmm6; ;// (3) xmm0 <= e[1] = b[1] + b[3];
;
subps xmm3, xmm6; ;// (3) xmm3 <= e[2] = b[1] - b[3];
// xmm6 free
// xmm6 used
movaps xmm6, xmm2; ;// (4) xmm6 <= f[3]
// xmm1 <= f[0]
// xmm0 <= f[1]
// xmm3 <= f[2]
// xmm2 <= f[3]
/*
y[0*8] = SHIFT_ROUND_COLF( _F0 + _F4 );
y[1*8] = SHIFT_ROUND_COLF( _F1 + _F5 );
y[2*8] = SHIFT_ROUND_COLF( _F2 + _F6 );
y[3*8] = SHIFT_ROUND_COLF( _F3 + _F7 );
y[4*8] = SHIFT_ROUND_COLF( _F3 - _F7 );
y[5*8] = SHIFT_ROUND_COLF( _F2 - _F6 );
y[6*8] = SHIFT_ROUND_COLF( _F1 - _F5 );
y[7*8] = SHIFT_ROUND_COLF( _F0 - _F4 );
6a) round compensation, adjust float value to correct round alignment
addps (ROUND_COMP - 0.5)
6b) float -> int32 conversion (cvtps2pi)
cvtps2pi mm0, xmm0; // mm0 <= int32 versions of y[1 0]
movhlps xmm0, xmm0;
cvtps2pi mm1, xmm0; // mm1 <= int32 versions of y[3 2]
6c) int32 -> int16 conversion (packssdw)
packssdw mm0, mm1; // mm0 <= int16 versions of y[3 2 1 0]
6d) int16 decsale (psraw)
psraw mm0, DESCALE_SHIFT2; // mm0 <= final descaled output, {-256,+255}
** The descale shift is chosen to implement a {-256,+255} output range.
*/
;
addps xmm2, xmm4; ;// (5) xmm2 <= y[3] = f[3] + f[7]
;
subps xmm6, xmm4; ;// (5) xmm6 <= y[4] = f[3] - f[7]
// xmm4 free
// xmm4 used
movaps xmm4, xmm0; ;// xmm0 <= f[1]
;
addps xmm2, [_PRND_INV_ROW]; // (6a) ROUND compensation
;
addps xmm0, xmm5; ;// (5) xmm0 <= y[1] = f[1] + f[5]
;
cvtps2pi mm0, xmm2; ;// (6b) mm0 <= y[3] low
movhlps xmm2, xmm2;
cvtps2pi mm1, xmm2; ;// (6b) mm1 <= y[3] high
;
// xmm2 free
addps xmm0, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
;
subps xmm4, xmm5; ;// (5) xmm4 <= y[6] = f[1] - f[5]
// xmm5 free
// xmm2 used
movaps xmm2, [SCRATCH]; ;// xmm2 <= f[4]
packssdw mm0, mm1; ;// (6c) mm0 <= y[3]
;
cvtps2pi mm2, xmm0; ;// (6b) mm2 <= y[1] low
movhlps xmm0, xmm0;
cvtps2pi mm3, xmm0; ;// (6b) mm3 <= y[1] high
// xmm5 used
movaps xmm5, xmm3; ;// xmm5 <= f[2]
addps xmm3, xmm7; ;// (5) xmm3 <= y[2] = f[2] + f[6]
;
psraw mm0, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
addps xmm6, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
;
packssdw mm2, mm3; ;// (6c) mm2 <= y[1]
;
subps xmm5, xmm7; ;// (5) xmm5 <= y[5] = f[2] - f[6]
// xmm7 free
movq [OUTC + 3*16], mm0; ;// mm0 <= y[3] stored
psraw mm2, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
cvtps2pi mm4, xmm6; ;// (6b) mm4 <= y[4] low
movhlps xmm6, xmm6;
cvtps2pi mm5, xmm6; ;// (6b) mm5 <= y[4] high
// xmm6 free
;
addps xmm3, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
movq [OUTC + 1*16], mm2; ;// y[1] stored
// xmm7 used
movaps xmm7, xmm1; ;// xmm7 <= f[0]
;
addps xmm1, xmm2; ;// (5) xmm2 <= y[0] = f[0] + f[4]
;
packssdw mm4, mm5; ;// (6c) mm4 <= y[4]
;
cvtps2pi mm6, xmm3; ;// (6b) mm6 <= y[2] low
movhlps xmm3, xmm3;
cvtps2pi mm7, xmm3; ;// (6b) mm7 <= y[2] high
// xmm3 free
;
subps xmm7, xmm2; ;// (5) xmm7 <= y[7] = f[0] - f[4]
// xmm2 free
psraw mm4, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
addps xmm5, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
movq [OUTC + 4*16], mm4; ;// y[4] stored
packssdw mm6, mm7; ;// (6c) mm6 <= y[2]
;
addps xmm4, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
add INPC, 16;
psraw mm6, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
cvtps2pi mm0, xmm5; ;// (6b) mm0 <= y[5] low
movhlps xmm5, xmm5;
cvtps2pi mm1, xmm5; ;// (6b) mm1 <= y[5] high
// xmm5 free
movq [OUTC + 2*16], mm6; ;// y[2] stored
cvtps2pi mm2, xmm4; ;// (6b) mm2 <= y[6] low
movhlps xmm4, xmm4;
cvtps2pi mm3, xmm4; ;// (6b) mm3 <= y[6] high
// xmm4 free
;
addps xmm1, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
;
packssdw mm0, mm1; ;// (6c) mm0 <= y[5]
;
addps xmm7, [_PRND_INV_ROW]; ;// (6a) ROUND compensation
;
psraw mm0, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
cvtps2pi mm4, xmm1; ;// (6b) mm4 <= y[0] low
movhlps xmm1, xmm1;
packssdw mm2, mm3; ;// (6c) mm2 <= y[6]
;
cvtps2pi mm5, xmm1; ;// (6b) mm5 <= y[0] high
// xmm1 free
movq [OUTC + 5*16], mm0;
psraw mm2, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
;
cvtps2pi mm6, xmm7; ;// (6b) mm6 <= y[7] low
movhlps xmm7, xmm7;
cvtps2pi mm7, xmm7; ;// (6b) mm7 <= y[7] high
// xmm7 free
;
packssdw mm4, mm5; ;// (6c) mm4 <= y[0]
;
psraw mm4, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
movq [OUTC + 6*16], mm2;
packssdw mm6, mm7; ;// (6c) mm6 <= y[7]
;
psraw mm6, DESCALE_SHIFT2; // (6d) clip y[] to {-256,+255}
movq [OUTC + 0*16], mm4;
movq [OUTC + 7*16], mm6;
// ldmxcsr [_mxcsr_backup]; // restore state
emms;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -