/* mmxidct.cpp */
psraw (r5, 4 );/* r5 = NR5 */
movq (I(3), r3 );/* store NR3 at I3 */
psubsw (r7, r0 );/* r7 = R7 = G. - C. */
paddsw (r7, Eight );/* adjust R7 (and R0) for shift */
paddsw (r0, r0 );/* r0 = C. + C. */
paddsw (r0, r7 );/* r0 = R0 = G. + C. */
psraw (r7, 4 );/* r7 = NR7 */
movq (J(6), r6 );/* store NR6 at J6 */
psraw (r0, 4 );/* r0 = NR0 */
movq (J(5), r5 );/* store NR5 at J5 */
movq (J(7), r7 );/* store NR7 at J7 */
movq (I(0), r0 );/* store NR0 at I0 */
}
// end ColumnIDCT macro (38 + 19 = 57 cycles)
/* --------------------------------------------------------------- */
/* --------------------------------------------------------------- */
/* IDCT 10 */
/*
 * MMX_idct10
 *
 * Dequantizes an 8x8 block of 16-bit coefficients, rearranges the words
 * ("descramble + partial transpose", see the note after the shuffle code)
 * into the output buffer, and then runs the row / transpose / column IDCT
 * macros on addresses derived from that buffer.
 *
 * input  - quantized coefficients; read as sixteen 8-byte groups at byte
 *          offsets 0..120.
 * qtbl   - quantization table; each group of input is multiplied word by
 *          word (pmullw) with the group at the same byte offset of qtbl.
 * output - destination / work buffer; all sixteen 8-byte slots at byte
 *          offsets 0..120 are written before the IDCT macros are invoked.
 *
 * Lane notation used in the trailing comments: a register is shown as four
 * 16-bit words, most significant word first. A two-digit label "rc" is the
 * input word with index 8*r + c, "__" is a zero word and "FF" is a mask word
 * taken from the M() table. "Rn" names the n-th 8-byte result, although the
 * store offsets are not simply n*8 (e.g. R1 goes to edx+16, R4 to edx+8).
 *
 * NOTE(review): the "10" presumably means the block holds at most 10 nonzero
 * coefficients (cf. MMX_idct1) - confirm against the caller.
 * NOTE(review): no EMMS / _mm_empty() is visible in this function; presumably
 * the caller clears the MMX state afterwards - confirm.
 */
extern "C" void MMX_idct10 ( ogg_int16_t * input, ogg_int16_t * qtbl, ogg_int16_t * output)
{
// M(I): address of the I-th 8-byte word mask, located MaskOffset bytes into
// the idctconstants table that ecx points at.
# define M(I) (ecx + MaskOffset + I*8)
__m64 r0,r1,r2,r3,r4,r5,r6,r7;
unsigned char * eax=(unsigned char*)input;// eax = quantized input
unsigned char *edx =(unsigned char*)output;// edx = destination (= idct buffer)
/*
Disabled cache-preload sequence kept from the original assembly version:
mov ecx, [edx] // (+1 at least) preload the cache before writing
mov ebx, [edx+28] // in case proc doesn't cache on writes
mov ecx, [edx+56] // gets all the cache lines
mov ebx, [edx+84] // regardless of alignment (beyond 32-bit)
mov ecx, [edx+112] // also avoids address contention stalls
mov ebx, [edx+124]
*/
unsigned char *ebx=(unsigned char*)qtbl; // ebx = quantization table
unsigned char *ecx=(unsigned char*)idctconstants; // ecx = base of the idctconstants table (used by M() and the IDCT macros)
// Dequantize + shuffle. Loads, multiplies, mask operations and stores for
// different result quadwords are interleaved, so statement order matters.
movq (r0, eax);
//
pmullw (r0, ebx); // r0 = 03 02 01 00
//
movq (r1, eax+16);
//
pmullw (r1, ebx+16); // r1 = 13 12 11 10
//
movq (r2, M(0)); // r2 = __ __ __ FF
movq (r3, r0 );// r3 = 03 02 01 00
movq (r4, eax+8);
psrlq (r0, 16 );// r0 = __ 03 02 01
pmullw (r4, ebx+8 );// r4 = 07 06 05 04
pand (r3, r2 );// r3 = __ __ __ 00
movq (r5, r0 );// r5 = __ 03 02 01
movq (r6, r1 );// r6 = 13 12 11 10
pand (r5, r2 );// r5 = __ __ __ 01
psllq (r6, 32 );// r6 = 11 10 __ __
movq (r7, M(3) );// r7 = FF __ __ __
pxor (r0, r5 );// r0 = __ 03 02 __
pand (r7, r6 );// r7 = 11 __ __ __
por (r0, r3 );// r0 = __ 03 02 00
pxor (r6, r7 );// r6 = __ 10 __ __
por (r0, r7 );// r0 = 11 03 02 00 = R0
movq (r7, M(3) );// r7 = FF __ __ __
movq (r3, r4 );// r3 = 07 06 05 04
movq (edx, r0 );// write R0 = r0
pand (r3, r2 );// r3 = __ __ __ 04
movq (r0, eax+32);
psllq (r3, 16 );// r3 = __ __ 04 __
pmullw (r0, ebx+32 );// r0 = 23 22 21 20
pand (r7, r1 );// r7 = 13 __ __ __
por ( r5, r3 );// r5 = __ __ 04 01
por (r7, r6 );// r7 = 13 10 __ __
movq (r3, eax+24);
por (r7, r5 );// r7 = 13 10 04 01 = R1
pmullw (r3, ebx+24 );// r3 = 17 16 15 14
psrlq (r4, 16 );// r4 = __ 07 06 05
movq (edx+16, r7 );// write R1 = r7
movq (r5, r4 );// r5 = __ 07 06 05
movq (r7, r0 );// r7 = 23 22 21 20
psrlq (r4, 16 );// r4 = __ __ 07 06
psrlq (r7, 48 );// r7 = __ __ __ 23
movq (r6, r2 );// r6 = __ __ __ FF
pand (r5, r2 );// r5 = __ __ __ 05
pand (r6, r4 );// r6 = __ __ __ 06
// edx+80 is used as scratch here; it is merged back in (por) and overwritten
// with the complete R9 near the end of the shuffle.
movq (edx+80, r7 );// partial R9 = __ __ __ 23
pxor (r4, r6 );// r4 = __ __ 07 __
psrlq (r1, 32 );// r1 = __ __ 13 12
por (r4, r5 );// r4 = __ __ 07 05
movq (r7, M(3) );// r7 = FF __ __ __
pand (r1, r2 );// r1 = __ __ __ 12
movq (r5, eax+48);
psllq (r0, 16 );// r0 = 22 21 20 __
pmullw (r5, ebx+48 );// r5 = 33 32 31 30
pand (r7, r0 );// r7 = 22 __ __ __
// edx+64 likewise holds a partial R8 until the final store.
movq (edx+64, r1 );// partial R8 = __ __ __ 12
por (r7, r4 );// r7 = 22 __ 07 05
movq (r4, r3 );// r4 = 17 16 15 14
pand (r3, r2 );// r3 = __ __ __ 14
movq (r1, M(2) );// r1 = __ FF __ __
psllq (r3, 32 );// r3 = __ 14 __ __
por ( r7, r3 );// r7 = 22 14 07 05 = R2
movq (r3, r5 );// r3 = 33 32 31 30
psllq (r3, 48 );// r3 = 30 __ __ __
pand (r1, r0 );// r1 = __ 21 __ __
movq (edx+32, r7 );// write R2 = r7
por (r6, r3 );// r6 = 30 __ __ 06
movq (r7, M(1) );// r7 = __ __ FF __
por (r6, r1 );// r6 = 30 21 __ 06
movq (r1, eax+56);
pand (r7, r4 );// r7 = __ __ 15 __
pmullw (r1, ebx+56 );// r1 = 37 36 35 34
por (r7, r6 );// r7 = 30 21 15 06 = R3
pand (r0, M(1) );// r0 = __ __ 20 __
psrlq (r4, 32 );// r4 = __ __ 17 16
movq (edx+48, r7 );// write R3 = r7
movq (r6, r4 );// r6 = __ __ 17 16
movq (r7, M(3) );// r7 = FF __ __ __
pand (r4, r2 );// r4 = __ __ __ 16
movq (r3, M(1) );// r3 = __ __ FF __
pand (r7, r1 );// r7 = 37 __ __ __
pand (r3, r5 );// r3 = __ __ 31 __
por (r0, r4 );// r0 = __ __ 20 16
psllq (r3, 16 );// r3 = __ 31 __ __
por (r7, r0 );// r7 = 37 __ 20 16
movq (r4, M(2) );// r4 = __ FF __ __
por (r7, r3 );// r7 = 37 31 20 16 = R4
movq (r0, eax+80);
movq (r3, r4 );// r3 = __ FF __ __ (copy of the M(2) mask; shifted down to __ __ FF __ later)
pmullw (r0, ebx+80 );// r0 = 53 52 51 50
pand (r4, r5 );// r4 = __ 32 __ __
movq (edx+8, r7 );// write R4 = r7
por (r6, r4 );// r6 = __ 32 17 16
movq (r4, r3 );// r4 = __ FF __ __
psrlq (r6, 16 );// r6 = __ __ 32 17
movq (r7, r0 );// r7 = 53 52 51 50
pand (r4, r1 );// r4 = __ 36 __ __
psllq (r7, 48 );// r7 = 50 __ __ __
por (r6, r4 );// r6 = __ 36 32 17
movq (r4, eax+88);
por (r7, r6 );// r7 = 50 36 32 17 = R5
pmullw (r4, ebx+88 );// r4 = 57 56 55 54
psrlq (r3, 16 );// r3 = __ __ FF __
movq (edx+24, r7 );// write R5 = r7
pand (r3, r1 );// r3 = __ __ 35 __
psrlq (r5, 48 );// r5 = __ __ __ 33
pand (r1, r2 );// r1 = __ __ __ 34
movq (r6, eax+104);
por (r5, r3 );// r5 = __ __ 35 33
pmullw (r6, ebx+104 );// r6 = 67 66 65 64
psrlq (r0, 16 );// r0 = __ 53 52 51
movq (r7, r4 );// r7 = 57 56 55 54
movq (r3, r2 );// r3 = __ __ __ FF
psllq (r7, 48 );// r7 = 54 __ __ __
pand (r3, r0 );// r3 = __ __ __ 51
pxor (r0, r3 );// r0 = __ 53 52 __
psllq (r3, 32 );// r3 = __ 51 __ __
por ( r7, r5 );// r7 = 54 __ 35 33
movq (r5, r6 );// r5 = 67 66 65 64
pand (r6, M(1) );// r6 = __ __ 65 __
por (r7, r3 );// r7 = 54 51 35 33 = R6
psllq (r6, 32 );// r6 = 65 __ __ __
por (r0, r1 );// r0 = __ 53 52 34
movq (edx+40, r7 );// write R6 = r7
por (r0, r6 );// r0 = 65 53 52 34 = R7
movq (r7, eax+120);
movq (r6, r5 );// r6 = 67 66 65 64
pmullw (r7, ebx+120 );// r7 = 77 76 75 74
psrlq (r5, 32 );// r5 = __ __ 67 66
pand (r6, r2 );// r6 = __ __ __ 64
movq (r1, r5 );// r1 = __ __ 67 66
movq (edx+56, r0 );// write R7 = r0
pand (r1, r2 );// r1 = __ __ __ 66
movq (r0, eax+112);
movq (r3, r7 );// r3 = 77 76 75 74
pmullw (r0, ebx+112 );// r0 = 73 72 71 70
psllq (r3, 16 );// r3 = 76 75 74 __
pand (r7, M(3) );// r7 = 77 __ __ __
pxor (r5, r1 );// r5 = __ __ 67 __
por ( r6, r5 );// r6 = __ __ 67 64
movq (r5, r3 );// r5 = 76 75 74 __
pand (r5, M(3) );// r5 = 76 __ __ __
por (r7, r1 );// r7 = 77 __ __ 66
movq (r1, eax+96);
pxor (r3, r5 );// r3 = __ 75 74 __
pmullw (r1, ebx+96 );// r1 = 63 62 61 60
por (r7, r3 );// r7 = 77 75 74 66 = R15
por ( r6, r5 );// r6 = 76 __ 67 64
movq (r5, r0 );// r5 = 73 72 71 70
movq (edx+120, r7 );// store R15 = r7
psrlq (r5, 16 );// r5 = __ 73 72 71
pand (r5, M(2) );// r5 = __ 73 __ __
movq (r7, r0 );// r7 = 73 72 71 70
por ( r6, r5 );// r6 = 76 73 67 64 = R14
pand (r0, r2 );// r0 = __ __ __ 70
pxor (r7, r0 );// r7 = 73 72 71 __
psllq (r0, 32 );// r0 = __ 70 __ __
movq (edx+104, r6 );// write R14 = r6
psrlq (r4, 16 );// r4 = __ 57 56 55
movq (r5, eax+72);
psllq (r7, 16 );// r7 = 72 71 __ __
pmullw (r5, ebx+72 );// r5 = 47 46 45 44
movq (r6, r7 );// r6 = 72 71 __ __
movq (r3, M(2) );// r3 = __ FF __ __
psllq (r6, 16 );// r6 = 71 __ __ __
pand (r7, M(3) );// r7 = 72 __ __ __
pand (r3, r1 );// r3 = __ 62 __ __
por ( r7, r0 );// r7 = 72 70 __ __
movq (r0, r1 );// r0 = 63 62 61 60
pand (r1, M(3) );// r1 = 63 __ __ __
por (r6, r3 );// r6 = 71 62 __ __
movq (r3, r4 );// r3 = __ 57 56 55
psrlq (r1, 32 );// r1 = __ __ 63 __
pand (r3, r2 );// r3 = __ __ __ 55
por (r7, r1 );// r7 = 72 70 63 __
por ( r7, r3 );// r7 = 72 70 63 55 = R13
movq (r3, r4 );// r3 = __ 57 56 55
pand (r3, M(1) );// r3 = __ __ 56 __
movq (r1, r5 );// r1 = 47 46 45 44
movq (edx+88, r7 );// write R13 = r7
psrlq (r5, 48 );// r5 = __ __ __ 47
movq (r7, eax+64);
por (r6, r3 );// r6 = 71 62 56 __
pmullw (r7, ebx+64 );// r7 = 43 42 41 40
por (r6, r5 );// r6 = 71 62 56 47 = R12
pand (r4, M(2) );// r4 = __ 57 __ __
psllq (r0, 32 );// r0 = 61 60 __ __
movq (edx+72, r6 );// write R12 = r6
movq (r6, r0 );// r6 = 61 60 __ __
pand (r0, M(3) );// r0 = 61 __ __ __
psllq (r6, 16 );// r6 = 60 __ __ __
movq (r5, eax+40);
movq (r3, r1 );// r3 = 47 46 45 44
pmullw (r5, ebx+40 );// r5 = 27 26 25 24
psrlq (r1, 16 );// r1 = __ 47 46 45
pand (r1, M(1) );// r1 = __ __ 46 __
por (r0, r4 );// r0 = 61 57 __ __
pand (r2, r7 );// r2 = __ __ __ 40
por (r0, r1 );// r0 = 61 57 46 __
por ( r0, r2 );// r0 = 61 57 46 40 = R11
psllq (r3, 16 );// r3 = 46 45 44 __
movq (r4, r3 );// r4 = 46 45 44 __
movq (r2, r5 );// r2 = 27 26 25 24
movq (edx+112, r0 );// write R11 = r0
psrlq (r2, 48 );// r2 = __ __ __ 27
pand (r4, M(2) );// r4 = __ 45 __ __
por (r6, r2 );// r6 = 60 __ __ 27
movq (r2, M(1) );// r2 = __ __ FF __
por (r6, r4 );// r6 = 60 45 __ 27
pand (r2, r7 );// r2 = __ __ 41 __
psllq (r3, 32 );// r3 = 44 __ __ __
// Merge in the partial R9 word parked at edx+80 earlier.
por ( r3, edx+80 );// r3 = 44 __ __ 23
por (r6, r2 );// r6 = 60 45 41 27 = R10
movq (r2, M(3) );// r2 = FF __ __ __
psllq (r5, 16 );// r5 = 26 25 24 __
movq (edx+96, r6 );// store R10 = r6
pand (r2, r5 );// r2 = 26 __ __ __
movq (r6, M(2) );// r6 = __ FF __ __
pxor (r5, r2 );// r5 = __ 25 24 __
pand (r6, r7 );// r6 = __ 42 __ __
psrlq (r2, 32 );// r2 = __ __ 26 __
pand (r7, M(3) );// r7 = 43 __ __ __
por (r3, r2 );// r3 = 44 __ 26 23
// Merge in the partial R8 word parked at edx+64 earlier.
por ( r7, edx+64 );// r7 = 43 __ __ 12
por (r6, r3 );// r6 = 44 42 26 23 = R9
por ( r7, r5 );// r7 = 43 25 24 12 = R8
movq (edx+80, r6 );// store R9 = r6
movq (edx+64, r7 );// store R8 = r7
//
// 123c ( / 64 coeffs < 2c / coeff)
# undef M
// Done w/dequant + descramble + partial transpose// now do the idct itself.
// RowIDCT_10, Transpose, ColumnIDCT_10 and the I10_x / J10_1 / C10 address
// helpers are defined outside this excerpt. The commented-out I()/J() defines
// below are kept from the original assembly and presumably show the operand
// addressing each helper stands for - TODO confirm against their definitions.
//# define I( K) [edx + ( K * 16)]
//# define J( K) [edx + ( (K - 4) * 16) + 8]
RowIDCT_10(r0,r1,r2,r3,r4,r5,r6,r7,I10_1(edx),C10(ecx)); // 33 c
Transpose(r0,r1,r2,r3,r4,r5,r6,r7,I10_1(edx),J10_1(edx)); // 19 c
//# define I( K) [edx + ( K * 16) + 64]
//# define J( K) [edx + ( (K - 4) * 16) + 72]
// The second row pass + transpose is left disabled in this variant.
// RowIDCT // 46 c
// Transpose // 19 c
//# define I( K) [edx + (K * 16)]
//# define J( K) I( K)
// First column pass; the same address helper is passed for both I and J.
ColumnIDCT_10(r0,r1,r2,r3,r4,r5,r6,r7,I10_2(edx),I10_2(edx),C10(ecx),ecx + EightOffset); // 44 c
//# define I( K) [edx + (K * 16) + 8]
//# define J( K) I( K)
// Second column pass, using the I10_3 address helper.
ColumnIDCT_10(r0,r1,r2,r3,r4,r5,r6,r7,I10_3(edx),I10_3(edx),C10(ecx),ecx + EightOffset); // 44 c
}
/**************************************************************************************
*
* Routine: MMX_idct1
*
* Description: Perform IDCT on an 8x8 block with at most one nonzero coefficient (the DC term)
*
* Input: Pointers to the coefficient input, the quantization table and the output buffer
*
* Output: None
*
* Return: None
*
* Special Note: None
*
* Error: None
*
***************************************************************************************
*/
/* --------------------------------------------------------------- */
/* IDCT 1 */
extern "C" void MMX_idct1 (ogg_int16_t * input, ogg_int16_t * qtbl, ogg_int16_t * output)
{
    /*
     * DC-only inverse DCT: only input[0] and qtbl[0] are read, and the 128-byte
     * output block is filled with a single replicated 16-bit value.
     */

    /* Special case: a zero DC coefficient yields an all-zero block. */
    if (!input[0])
    {
        memset( output, 0, 128);
        return;
    }

    /* Dequantize the DC term using 32-bit arithmetic. */
    ogg_int32_t dc = (ogg_int32_t)input[0];
    dc *= qtbl[0];

    /* Round and scale down; this rounding is needed to match tim's results. */
    dc = (dc + 15) >> 5;

    /* Keep the low 16 bits and mirror them into the upper half of the dword. */
    dc &= 0xffff;
    dc += dc << 16;

    /* Both 32-bit halves get dc, i.e. four copies of the 16-bit value. */
    __m64 splat = _mm_set1_pi32(dc);

    /* Sixteen quadword stores cover the same 128 bytes the zero path clears. */
    __m64 *dst = (__m64*)output;
    for (int slot = 0; slot < 16; ++slot)
    {
        dst[slot] = splat;
    }
}
/* (Web code-viewer keyboard-shortcut help text followed here; it is not part of mmxidct.cpp.) */