📄 mmxidct.cpp
字号:
*
* Error: None
*
***************************************************************************************
*/
// RowIDCT: row pass of the IDCT. It gets everything ready for Transpose:
// the results R0..R7 are left in registers, except R1, which is also saved
// to I(1) (Transpose expects that row in memory and treats r1 as scratch).
//
//   r0..r7 - MMX working registers, passed by reference; the visible code or
//            BeginIDCT overwrites them.
//   I      - callable mapping a row index to a memory operand; this function
//            reads I(2) and writes I(1).
//   J      - not used directly here; only forwarded to BeginIDCT.
//   C      - only forwarded to BeginIDCT.
//
// NOTE(review): BeginIDCT is defined elsewhere. The comments below assume it
// leaves the intermediate terms in the registers named (r1 = H., r2 = A.. - H.,
// r4 = E, r7 = G, ...) and D. in I(2) -- confirm against BeginIDCT.
//
// Unlike ColumnIDCT, no rounding constant is added and no shift is applied.
// Each "x - y" followed by "y + y" and "y + x" forms the pair (x - y, x + y)
// using only two registers.
template<class Ti,class Tj,class Tc> static __forceinline void RowIDCT(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J, Tc C)
{
BeginIDCT(r0,r1,r2,r3,r4,r5,r6,r7,I,J,C);
movq (r3, I(2) );/* r3 = D. */
psubsw (r4, r7 );/* r4 = E. = E - G */
paddsw (r1, r1 );/* r1 = H. + H. */
paddsw (r7, r7 );/* r7 = G + G */
paddsw (r1, r2 );/* r1 = R1 = A.. + H. */
paddsw (r7, r4 );/* r7 = G. = E + G */
psubsw (r4, r3 );/* r4 = R4 = E. - D. */
paddsw (r3, r3 );/* r3 = D. + D. */
psubsw (r6, r5 );/* r6 = R6 = F. - B.. */
paddsw (r5, r5 );/* r5 = B.. + B.. */
paddsw (r3, r4 );/* r3 = R3 = E. + D. */
paddsw (r5, r6 );/* r5 = R5 = F. + B.. */
psubsw (r7, r0 );/* r7 = R7 = G. - C. */
paddsw (r0, r0 );/* r0 = C. + C. */
movq (I(1), r1 );/* save R1 */
paddsw (r0, r7 );/* r0 = R0 = G. + C. */
}
// end RowIDCT macro (8 + 38 = 46 cycles)
/**************************************************************************************
*
* Routine: ColumnIDCT
*
* Description: Performs a 1-D IDCT on 4 columns
*
* Input: None
*
* Output: None
*
* Return: None
*
* Special Note: None
*
* Error: None
*
***************************************************************************************
*/
// ColumnIDCT: column pass of the IDCT. It normalizes the results (adds the
// rounding constant, then arithmetic-shifts right by 4) and stores them.
//
//   r0..r7 - MMX working registers, passed by reference; all are overwritten.
//   I, J   - callables mapping a row index to a memory operand. This function
//            reads I(2) and stores NR0..NR3 to I(0)..I(3) and NR4..NR7 to
//            J(4)..J(7).
//   C      - only forwarded to BeginIDCT.
//   Eight  - address of the constant added before each shift-by-4
//            (presumably 8 in every 16-bit lane, i.e. round-to-nearest;
//            confirm against the constants table).
//
// NOTE(review): BeginIDCT is defined elsewhere; the register contents named
// in the comments right after it are assumptions -- confirm against BeginIDCT.
//
// The constant is added to the difference term of each pair *before* the sum
// term is built from it, so one add biases both results (e.g. R2 and R1).
template<class Ti,class Tj,class Tc> static __forceinline void ColumnIDCT(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J, Tc C,unsigned char *Eight)
{
BeginIDCT (r0,r1,r2,r3,r4,r5,r6,r7,I,J,C);
paddsw (r2, Eight );/* adjust R2 (and R1) for shift */
paddsw (r1, r1 );/* r1 = H. + H. */
paddsw (r1, r2 );/* r1 = R1 = A.. + H. */
psraw (r2, 4 );/* r2 = NR2 */
psubsw (r4, r7 );/* r4 = E. = E - G */
psraw (r1, 4 );/* r1 = NR1 */
movq (r3, I(2) );/* r3 = D. */
paddsw (r7, r7 );/* r7 = G + G */
movq (I(2), r2 );/* store NR2 at I2 */
paddsw (r7, r4 );/* r7 = G. = E + G */
movq (I(1), r1 );/* store NR1 at I1 */
psubsw (r4, r3 );/* r4 = R4 = E. - D. */
paddsw (r4, Eight );/* adjust R4 (and R3) for shift */
paddsw (r3, r3 );/* r3 = D. + D. */
paddsw (r3, r4 );/* r3 = R3 = E. + D. */
psraw (r4, 4 );/* r4 = NR4 */
psubsw (r6, r5 );/* r6 = R6 = F. - B.. */
psraw (r3, 4 );/* r3 = NR3 */
paddsw (r6, Eight );/* adjust R6 (and R5) for shift */
paddsw (r5, r5 );/* r5 = B.. + B.. */
paddsw (r5, r6 );/* r5 = R5 = F. + B.. */
psraw (r6, 4 );/* r6 = NR6 */
movq (J(4), r4 );/* store NR4 at J4 */
psraw (r5, 4 );/* r5 = NR5 */
movq (I(3), r3 );/* store NR3 at I3 */
psubsw (r7, r0 );/* r7 = R7 = G. - C. */
paddsw (r7, Eight );/* adjust R7 (and R0) for shift */
paddsw (r0, r0 );/* r0 = C. + C. */
paddsw (r0, r7 );/* r0 = R0 = G. + C. */
psraw (r7, 4 );/* r7 = NR7 */
movq (J(6), r6 );/* store NR6 at J6 */
psraw (r0, 4 );/* r0 = NR0 */
movq (J(5), r5 );/* store NR5 at J5 */
movq (J(7), r7 );/* store NR7 at J7 */
movq (I(0), r0 );/* store NR0 at I0 */
}
// end ColumnIDCT macro (38 + 19 = 57 cycles)
/**************************************************************************************
*
* Routine: Transpose
*
* Description: The Macro does two 4x4 transposes in place.
*
* Input: None
*
* Output: None
*
* Return: None
*
* Special Note: None
*
* Error: None
*
***************************************************************************************
*/
/* Transpose: does two 4x4 transposes of 16-bit elements, taking its input
from registers (plus one row in memory) and writing the result to memory.
At entry (we assume):
r0 = a3 a2 a1 a0
I(1) = b3 b2 b1 b0
r2 = c3 c2 c1 c0
r3 = d3 d2 d1 d0
r4 = e3 e2 e1 e0
r5 = f3 f2 f1 f0
r6 = g3 g2 g1 g0
r7 = h3 h2 h1 h0
At exit, we have:
I(0) = d0 c0 b0 a0
I(1) = d1 c1 b1 a1
I(2) = d2 c2 b2 a2
I(3) = d3 c3 b3 a3
J(4) = h0 g0 f0 e0
J(5) = h1 g1 f1 e1
J(6) = h2 g2 f2 e2
J(7) = h3 g3 f3 e3
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
Since r1 is free at entry, we calculate the Js first.

Parameters:
r0..r7 - MMX working registers, passed by reference; all of them are
         overwritten while shuffling.
I, J   - callables mapping a row index to a memory operand (see the entry
         and exit layouts above).
I(0) is also used as a temporary spill slot for row a while the e..h block
is being processed, because r0 is needed as a scratch register there. */
template<class Ti,class Tj> static __forceinline void Transpose(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J)
{
// ---- second 4x4 block (rows e, f, g, h) -> J(4)..J(7) ----
movq (r1, r4 );// r1 = e3 e2 e1 e0
punpcklwd (r4, r5 );// r4 = f1 e1 f0 e0
movq (I(0), r0 );// save a3 a2 a1 a0
punpckhwd (r1, r5 );// r1 = f3 e3 f2 e2
movq (r0, r6 );// r0 = g3 g2 g1 g0
punpcklwd (r6, r7 );// r6 = h1 g1 h0 g0
movq (r5, r4 );// r5 = f1 e1 f0 e0
punpckldq (r4, r6 );// r4 = h0 g0 f0 e0 = R4
punpckhdq (r5, r6 );// r5 = h1 g1 f1 e1 = R5
movq (r6, r1 );// r6 = f3 e3 f2 e2
movq (J(4), r4 );// store R4 at J(4)
punpckhwd (r0, r7 );// r0 = h3 g3 h2 g2
movq (J(5), r5 );// store R5 at J(5)
punpckhdq (r6, r0 );// r6 = h3 g3 f3 e3 = R7
// ---- first 4x4 block (rows a, b, c, d) -> I(0)..I(3), interleaved with
// the last stores of the second block ----
movq (r4, I(0) );// r4 = a3 a2 a1 a0
punpckldq (r1, r0 );// r1 = h2 g2 f2 e2 = R6
movq (r5, I(1) );// r5 = b3 b2 b1 b0
movq (r0, r4 );// r0 = a3 a2 a1 a0
movq (J(7), r6 );// store R7 at J(7)
punpcklwd (r0, r5 );// r0 = b1 a1 b0 a0
movq (J(6), r1 );// store R6 at J(6)
punpckhwd (r4, r5 );// r4 = b3 a3 b2 a2
movq (r5, r2 );// r5 = c3 c2 c1 c0
punpcklwd (r2, r3 );// r2 = d1 c1 d0 c0
movq (r1, r0 );// r1 = b1 a1 b0 a0
punpckldq (r0, r2 );// r0 = d0 c0 b0 a0 = R0
punpckhdq (r1, r2 );// r1 = d1 c1 b1 a1 = R1
movq (r2, r4 );// r2 = b3 a3 b2 a2
movq (I(0), r0 );// store R0 at I(0)
punpckhwd (r5, r3 );// r5 = d3 c3 d2 c2
movq (I(1), r1 );// store R1 at I(1)
punpckhdq (r4, r5 );// r4 = d3 c3 b3 a3 = R3
punpckldq (r2, r5 );// r2 = d2 c2 b2 a2 = R2
movq (I(3), r4 );// store R3 at I(3)
movq (I(2), r2 );// store R2 at I(2)
}
// end Transpose macro (19 cycles).
/**************************************************************************************
*
* Routine: MMX_idct
*
* Description: Performs an IDCT on an 8x8 block
*
* Input: Pointer to input and output buffer
*
* Output: None
*
* Return: None
*
* Special Note: The input coefficients are in ZigZag order
*
* Error: None
*
***************************************************************************************
*/
// Address functor: I(K) yields the byte address base + K*16.
// (Presumably 16 bytes is one row of eight 16-bit coefficients -- confirm
// against the buffer layout used by the caller.)
struct I10_1
{
public:
    // Remember the base address all later lookups are relative to.
    I10_1(unsigned char *Iedx) : base_(Iedx) {}

    // Return the address K 16-byte steps past the base.
    unsigned char* operator()(int K) const
    {
        const int byteOffset = K * 16;
        return base_ + byteOffset;
    }

private:
    unsigned char *base_;
};
// Address functor: I(K) yields the byte address base + K*16 + 64,
// i.e. the same 16-byte stepping as a plain row lookup but starting
// 64 bytes (four 16-byte steps) further into the buffer.
struct I_2
{
public:
    // Remember the base address all later lookups are relative to.
    I_2(unsigned char *Iedx) : base_(Iedx) {}

    // Return the address K 16-byte steps past (base + 64).
    unsigned char* operator()(int K) const
    {
        const int stepBytes = K * 16;
        return base_ + stepBytes + 64;
    }

private:
    unsigned char *base_;
};
// Address functor: I(K) yields the byte address base + K*16.
// The mapping is the same as I10_1's; it exists as a distinct type.
struct I10_2
{
public:
    // Remember the base address all later lookups are relative to.
    I10_2(unsigned char *Iedx) : base_(Iedx) {}

    // Return the address K 16-byte steps past the base.
    unsigned char* operator()(int K) const
    {
        const int byteOffset = K * 16;
        return base_ + byteOffset;
    }

private:
    unsigned char *base_;
};
// Address functor: I(K) yields the byte address base + K*16 + 8,
// i.e. 8 bytes into each 16-byte step.
struct I10_3
{
public:
    // Remember the base address all later lookups are relative to.
    I10_3(unsigned char *Iedx) : base_(Iedx) {}

    // Return the address 8 bytes past the K-th 16-byte step.
    unsigned char* operator()(int K) const
    {
        const int stepBytes = K * 16;
        return base_ + stepBytes + 8;
    }

private:
    unsigned char *base_;
};
// Address functor for indices that start at 4: J(K) yields the byte
// address base + (K-4)*16 + 8, so J(4) is 8 bytes past the base.
struct J10_1
{
public:
    // Remember the base address all later lookups are relative to.
    J10_1(unsigned char *Iedx) : base_(Iedx) {}

    // Return the address 8 bytes past the (K-4)-th 16-byte step.
    unsigned char* operator()(int K) const
    {
        const int stepBytes = (K - 4) * 16;
        return base_ + stepBytes + 8;
    }

private:
    unsigned char *base_;
};
// Address functor for indices that start at 4: J(K) yields the byte
// address base + (K-4)*16 + 72, so J(4) is 72 bytes past the base.
struct J_2
{
public:
    // Remember the base address all later lookups are relative to.
    J_2(unsigned char *Iedx) : base_(Iedx) {}

    // Return the address 72 bytes past the (K-4)-th 16-byte step.
    unsigned char* operator()(int K) const
    {
        const int stepBytes = (K - 4) * 16;
        return base_ + stepBytes + 72;
    }

private:
    unsigned char *base_;
};
// Address functor into a constants table: C(I) yields the byte address
// base + CosineOffset + (I-1)*8, so indices are 1-based and entries are
// 8 bytes apart. CosineOffset is defined elsewhere in the file.
struct C10
{
public:
    // Remember the base address of the constants table.
    C10(unsigned char *Iecx) : table_(Iecx) {}

    // Return the address of the I-th (1-based) 8-byte entry.
    unsigned char* operator()(int I) const
    {
        return table_ + CosineOffset + (I - 1) * 8;
    }

private:
    unsigned char *table_;
};
extern "C" void MMX_idct ( ogg_int16_t * input, ogg_int16_t * qtbl, ogg_int16_t * output)
{
// uint16 *constants = idctconstants;
# define M(I) (ecx + MaskOffset + I*8)
//# define C(I) [ecx + CosineOffset + (I-1)*8]
//# define Eight [ecx + EightOffset]
__m64 r0,r1,r2,r3,r4,r5,r6,r7;
unsigned char *eax=(unsigned char*)input; // eax = quantized input
unsigned char *edx=(unsigned char*)output; // edx = destination (= idct buffer)
/*
mov ecx, [edx] // (+1 at least) preload the cache before writing
mov ebx, [edx+28] // in case proc doesn't cache on writes
mov ecx, [edx+56] // gets all the cache lines
mov ebx, [edx+84] // regardless of alignment (beyond 32-bit)
mov ecx, [edx+112] // also avoids address contention stalls
mov ebx, [edx+124]
*/
unsigned char *ebx=(unsigned char*)qtbl; // ebx = quantization table
unsigned char *ecx=(unsigned char*)idctconstants; ////[0]//
movq (r0, eax);
//
pmullw (r0, ebx );// r0 = 03 02 01 00
//
movq (r1, eax+16);
//
pmullw (r1, ebx+16 );// r1 = 13 12 11 10
//
movq (r2, M(0) );// r2 = __ __ __ FF
movq (r3, r0 );// r3 = 03 02 01 00
movq (r4, eax+8);
psrlq (r0, 16 );// r0 = __ 03 02 01
pmullw (r4, ebx+8 );// r4 = 07 06 05 04
pand (r3, r2 );// r3 = __ __ __ 00
movq (r5, r0 );// r5 = __ 03 02 01
movq (r6, r1 );// r6 = 13 12 11 10
pand (r5, r2 );// r5 = __ __ __ 01
psllq (r6, 32 );// r6 = 11 10 __ __
movq (r7, M(3) );// r7 = FF __ __ __
pxor (r0, r5 );// r0 = __ 03 02 __
pand (r7, r6 );// r7 = 11 __ __ __
por (r0, r3 );// r0 = __ 03 02 00
pxor (r6, r7 );// r6 = __ 10 __ __
por (r0, r7 );// r0 = 11 03 02 00 = R0
movq (r7, M(3) );// r7 = FF __ __ __
movq (r3, r4 );// r3 = 07 06 05 04
movq (edx, r0 );// write R0 = r0
pand (r3, r2 );// r3 = __ __ __ 04
movq (r0, eax+32);
psllq (r3, 16 );// r3 = __ __ 04 __
pmullw (r0, ebx+32 );// r0 = 23 22 21 20
pand (r7, r1 );// r7 = 13 __ __ __
por ( r5, r3 );// r5 = __ __ 04 01
por (r7, r6 );// r7 = 13 10 __ __
movq (r3, eax+24);
por (r7, r5 );// r7 = 13 10 04 01 = R1
pmullw (r3, ebx+24 );// r3 = 17 16 15 14
psrlq (r4, 16 );// r4 = __ 07 06 05
movq (edx+16, r7 );// write R1 = r7
movq (r5, r4 );// r5 = __ 07 06 05
movq (r7, r0 );// r7 = 23 22 21 20
psrlq (r4, 16 );// r4 = __ __ 07 06
psrlq (r7, 48 );// r7 = __ __ __ 23
movq (r6, r2 );// r6 = __ __ __ FF
pand (r5, r2 );// r5 = __ __ __ 05
pand (r6, r4 );// r6 = __ __ __ 06
movq (edx+80, r7 );// partial R9 = __ __ __ 23
pxor (r4, r6 );// r4 = __ __ 07 __
psrlq (r1, 32 );// r1 = __ __ 13 12
por (r4, r5 );// r4 = __ __ 07 05
movq (r7, M(3) );// r7 = FF __ __ __
pand (r1, r2 );// r1 = __ __ __ 12
movq (r5, eax+48);
psllq (r0, 16 );// r0 = 22 21 20 __
pmullw (r5, ebx+48 );// r5 = 33 32 31 30
pand (r7, r0 );// r7 = 22 __ __ __
movq (edx+64, r1 );// partial R8 = __ __ __ 12
por (r7, r4 );// r7 = 22 __ 07 05
movq (r4, r3 );// r4 = 17 16 15 14
pand (r3, r2 );// r3 = __ __ __ 14
movq (r1, M(2) );// r1 = __ FF __ __
psllq (r3, 32 );// r3 = __ 14 __ __
por ( r7, r3 );// r7 = 22 14 07 05 = R2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -