📄 idct_mmx.c
字号:
#define C4 23170
const __m64 _T1 = _mm_set1_pi16(T1);
const __m64 _T2 = _mm_set1_pi16(T2);
const __m64 _T3 = _mm_set1_pi16(T3);
const __m64 _C4 = _mm_set1_pi16(C4);
/* column code adapted from peter gubanov */
/* http://www.elecard.com/peter/idct.shtml */
movq_m2r (_T1, mm0); /* mm0 = T1 */
movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
movq_r2r (mm0, mm2); /* mm2 = T1 */
movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */
movq_m2r (_T3, mm5); /* mm5 = T3 */
pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */
movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
movq_r2r (mm5, mm7); /* mm7 = T3-1 */
movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
psubsw_r2r (mm4, mm0); /* mm0 = v17 */
movq_m2r (_T2, mm4); /* mm4 = T2 */
pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */
paddsw_r2r (mm2, mm1); /* mm1 = u17 */
pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */
/* slot */
movq_r2r (mm4, mm2); /* mm2 = T2 */
paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */
pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */
psubsw_r2r (mm6, mm5); /* mm5 = v35 */
paddsw_r2r (mm3, mm7); /* mm7 = u35 */
movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
movq_r2r (mm0, mm6); /* mm6 = v17 */
pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */
psubsw_r2r (mm5, mm0); /* mm0 = b3 */
psubsw_r2r (mm3, mm4); /* mm4 = v26 */
paddsw_r2r (mm6, mm5); /* mm5 = v12 */
movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */
movq_r2r (mm1, mm6); /* mm6 = u17 */
paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
paddsw_r2r (mm7, mm6); /* mm6 = b0 */
psubsw_r2r (mm7, mm1); /* mm1 = u12 */
movq_r2r (mm1, mm7); /* mm7 = u12 */
movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */
movq_m2r (_C4, mm0); /* mm0 = C4/2 */
psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */
movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */
pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */
movq_r2r (mm4, mm6); /* mm6 = v26 */
pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */
movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
movq_r2r (mm3, mm0); /* mm0 = x0 */
psubsw_r2r (mm5, mm3); /* mm3 = v04 */
paddsw_r2r (mm5, mm0); /* mm0 = u04 */
paddsw_r2r (mm3, mm4); /* mm4 = a1 */
movq_r2r (mm0, mm5); /* mm5 = u04 */
psubsw_r2r (mm6, mm3); /* mm3 = a2 */
paddsw_r2r (mm2, mm5); /* mm5 = a0 */
paddsw_r2r (mm1, mm1); /* mm1 = b1 */
psubsw_r2r (mm2, mm0); /* mm0 = a3 */
paddsw_r2r (mm7, mm7); /* mm7 = b2 */
movq_r2r (mm3, mm2); /* mm2 = a2 */
movq_r2r (mm4, mm6); /* mm6 = a1 */
paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */
movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */
psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */
psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
movq_r2r (mm5, mm7); /* mm7 = a0 */
movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */
psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */
paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */
movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */
psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */
psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
movq_r2r (mm0, mm3); /* mm3 = a3 */
movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */
psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */
psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */
movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */
movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */
movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */
}
/*
 * Rounding constants for the row pass, one table per row index: declare_idct
 * passes rounderN to the idct_row call that processes the row loaded from
 * block + N*8.  The rounder() initializer macro and __align16 are defined
 * outside this chunk.
 * NOTE(review): rounder0 additionally carries (1 << (COL_SHIFT - 1)) - 0.5,
 * presumably a bias on the DC term so that the ">> COL_SHIFT" at the end of
 * the column pass rounds instead of truncating -- confirm against rounder().
 */
static __align16(const int32_t, rounder0[]) = rounder ((1 << (COL_SHIFT - 1)) - 0.5);
static __align16(const int32_t, rounder4[]) = rounder (0);
static __align16(const int32_t, rounder1[]) = rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */
static __align16(const int32_t, rounder7[]) = rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */
static __align16(const int32_t, rounder2[]) = rounder (0.60355339059); /* C2 * (C6+C2)/2 */
static __align16(const int32_t, rounder6[]) = rounder (-0.25); /* C2 * (C6-C2)/2 */
static __align16(const int32_t, rounder3[]) = rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */
static __align16(const int32_t, rounder5[]) = rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */
/*
 * declare_idct - template that expands to a complete 8x8 IDCT function
 *
 *   static inline void idct (int16_t * const block)
 *
 * Macro parameters:
 *   idct           name of the function to define
 *   table          initializer macro used to build the four coefficient tables
 *   idct_row_head  starts the row pipeline for the first row
 *   idct_row       row computation, given a coefficient table and a rounder
 *   idct_row_mid   step between two rows; receives the previous row offset,
 *                  the next row offset and the table of the next row
 *   idct_row_tail  finishes the row pipeline for the last row
 *
 * The generated function declares eight __m64 locals (mm0..mm7) that are
 * handed to every helper, then:
 *   1. runs the row pass over the rows in the order 0, 4, 1, 7, 2, 6, 3, 5;
 *      rows 0/4, 1/7, 2/6 and 3/5 share table04, table17, table26 and
 *      table35 respectively, and row N uses rounderN;
 *   2. runs idct_col twice, with offsets 0 and 4.
 *
 * NOTE(review): idct_row_mid presumably stores the finished row at its first
 * offset while loading the row at its second offset, and each idct_col call
 * presumably covers four of the eight columns -- confirm against the helper
 * definitions, which are outside this chunk.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
static inline void idct (int16_t * const block) \
{ \
__m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7; \
static __align16(const int16_t,table04[]) = \
table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
static __align16(const int16_t,table17[]) = \
table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
static __align16(const int16_t,table26[]) = \
table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
static __align16(const int16_t,table35[]) = \
table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
\
idct_row_head (block, 0*8, table04,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table04, rounder0,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 0*8, 4*8, table04,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table04, rounder4,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 4*8, 1*8, table17,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table17, rounder1,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 1*8, 7*8, table17,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table17, rounder7,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 7*8, 2*8, table26,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table26, rounder2,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 2*8, 6*8, table26,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table26, rounder6,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 6*8, 3*8, table35,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table35, rounder3,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_mid (block, 3*8, 5*8, table35,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row (table35, rounder5,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_row_tail (block, 5*8,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
\
idct_col (block, 0,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
idct_col (block, 4,mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7); \
}
/*
 * COPY_MMX - one pipelined step of block_copy.
 *
 *   offset  index of the first int16_t of the row to load from `block`
 *   r0, r1  registers receiving block[offset..offset+3] and
 *           block[offset+4..offset+7]; r0 ends up holding the packed row
 *   r2      register holding the previously packed row, stored by this step
 *
 * Uses `block`, `dest` and `stride` from the enclosing scope and advances
 * `dest` by one stride before storing r2 to *dest.
 * NOTE(review): this assumes the movq and packuswb macros (defined outside
 * this chunk) wrap the like-named MMX instructions, so the pack converts
 * eight words to eight bytes with unsigned saturation.
 */
#define COPY_MMX(offset,r0,r1,r2) \
do { \
movq_m2r (*(block+offset), r0); \
dest += stride; \
movq_m2r (*(block+offset+4), r1); \
movq_r2m (r2, *dest); \
packuswb_r2r (r1, r0); \
} while (0)
/*
 * Write an 8x8 block of 16-bit values to an 8-bit destination.
 *
 *   block   64 int16_t values, read as eight rows of eight
 *   dest    first destination row; row k is written at dest + k*stride
 *   stride  distance in bytes between destination rows
 *
 * Each row is loaded as two 64-bit halves and merged with packuswb. Loading
 * the next row is interleaved with storing the previous one, and mm0/mm2
 * alternate as the register holding the packed row, so the statement order
 * is significant.
 * NOTE(review): assumes the *_m2r / *_r2m / *_r2r macros, defined outside
 * this chunk, wrap the like-named MMX instructions (packuswb = pack words
 * to bytes with unsigned saturation).
 */
static inline void block_copy (int16_t * const block, uint8_t * dest,
const int stride)
{
__m64 mm0,mm1,mm2,mm3;
movq_m2r (*(block+0*8), mm0); /* row 0, words 0..3 */
movq_m2r (*(block+0*8+4), mm1); /* row 0, words 4..7 */
movq_m2r (*(block+1*8), mm2); /* row 1, words 0..3 */
packuswb_r2r (mm1, mm0); /* mm0 = packed row 0 */
movq_m2r (*(block+1*8+4), mm3); /* row 1, words 4..7 */
movq_r2m (mm0, *dest); /* store row 0 */
packuswb_r2r (mm3, mm2); /* mm2 = packed row 1 */
COPY_MMX (2*8, mm0, mm1, mm2); /* store row 1, pack row 2 into mm0 */
COPY_MMX (3*8, mm2, mm3, mm0); /* store row 2, pack row 3 into mm2 */
COPY_MMX (4*8, mm0, mm1, mm2); /* store row 3, pack row 4 into mm0 */
COPY_MMX (5*8, mm2, mm3, mm0); /* store row 4, pack row 5 into mm2 */
COPY_MMX (6*8, mm0, mm1, mm2); /* store row 5, pack row 6 into mm0 */
COPY_MMX (7*8, mm2, mm3, mm0); /* store row 6, pack row 7 into mm2 */
movq_r2m (mm2, *(dest+stride)); /* dest is at row 6 here: store row 7 */
}
/*
 * ADD_MMX - one pipelined step of block_add.
 *
 *   offset  index of the first int16_t of the block row to add
 *   r1, r2  registers that receive the destination row found at
 *           dest + 2*stride (on entry), widened to words (low half in r1,
 *           high half in r2) and summed with block[offset..offset+7]
 *   r3, r4  word sums of the previous row; packed to bytes and stored at
 *           *dest after dest has been advanced by one stride
 *
 * Uses `block`, `dest`, `stride` and `mm0` from the enclosing scope; mm0 is
 * the second operand of both unpack operations and is cleared with pxor in
 * block_add before the first use.
 * NOTE(review): assumes the macros (defined outside this chunk) wrap the
 * like-named MMX instructions: punpck{l,h}bw against zero widens bytes to
 * words, paddsw is a signed saturating word add, packuswb narrows with
 * unsigned saturation.
 */
#define ADD_MMX(offset,r1,r2,r3,r4) \
do { \
movq_m2r (*(dest+2*stride), r1); \
packuswb_r2r (r4, r3); \
movq_r2r (r1, r2); \
dest += stride; \
movq_r2m (r3, *dest); \
punpcklbw_r2r (mm0, r1); \
paddsw_m2r (*(block+offset), r1); \
punpckhbw_r2r (mm0, r2); \
paddsw_m2r (*(block+offset+4), r2); \
} while (0)
/*
 * Add an 8x8 block of 16-bit values to an 8x8 area of 8-bit pixels in place.
 *
 *   block   64 int16_t values, read as eight rows of eight
 *   dest    first pixel row; row k is read and rewritten at dest + k*stride
 *   stride  distance in bytes between pixel rows
 *
 * For every row the eight destination bytes are widened to words against
 * the zero register mm0, the matching block row is added with paddsw, and
 * the sums are packed back to bytes with packuswb and stored. Rows are
 * processed in a software pipeline (register pairs mm1/mm2 and mm3/mm4
 * alternate), so the statement order is significant.
 * NOTE(review): assumes the macros, defined outside this chunk, wrap the
 * like-named MMX instructions (signed saturating add, unsigned saturating
 * pack).
 */
static __forceinline void block_add (int16_t * const block, uint8_t * dest,
const int stride)
{
__m64 mm0,mm1,mm2,mm3,mm4;
movq_m2r (*dest, mm1); /* mm1 = pixel row 0 */
pxor_r2r (mm0, mm0); /* mm0 = 0, used by every unpack below */
movq_m2r (*(dest+stride), mm3); /* mm3 = pixel row 1 */
movq_r2r (mm1, mm2); /* mm2 = copy of row 0 for the high half */
punpcklbw_r2r (mm0, mm1); /* row 0, pixels 0..3 as words */
movq_r2r (mm3, mm4); /* mm4 = copy of row 1 for the high half */
paddsw_m2r (*(block+0*8), mm1); /* += block row 0, words 0..3 */
punpckhbw_r2r (mm0, mm2); /* row 0, pixels 4..7 as words */
paddsw_m2r (*(block+0*8+4), mm2); /* += block row 0, words 4..7 */
punpcklbw_r2r (mm0, mm3); /* row 1, pixels 0..3 as words */
paddsw_m2r (*(block+1*8), mm3); /* += block row 1, words 0..3 */
packuswb_r2r (mm2, mm1); /* mm1 = finished row 0 as bytes */
punpckhbw_r2r (mm0, mm4); /* row 1, pixels 4..7 as words */
movq_r2m (mm1, *dest); /* store row 0 */
paddsw_m2r (*(block+1*8+4), mm4); /* += block row 1, words 4..7 */
ADD_MMX (2*8, mm1, mm2, mm3, mm4); /* store row 1, sum row 2 */
ADD_MMX (3*8, mm3, mm4, mm1, mm2); /* store row 2, sum row 3 */
ADD_MMX (4*8, mm1, mm2, mm3, mm4); /* store row 3, sum row 4 */
ADD_MMX (5*8, mm3, mm4, mm1, mm2); /* store row 4, sum row 5 */
ADD_MMX (6*8, mm1, mm2, mm3, mm4); /* store row 5, sum row 6 */
ADD_MMX (7*8, mm3, mm4, mm1, mm2); /* store row 6, sum row 7 */
packuswb_r2r (mm4, mm3); /* mm3 = finished row 7 as bytes */
movq_r2m (mm3, *(dest+stride)); /* dest is at row 6 here: store row 7 */
}
/*
 * Clear a 64-coefficient block.
 *
 *   block   int16_t[64] to be zeroed
 *
 * A zeroed register is stored sixteen times, each store starting four
 * int16_t further into the block, which covers block[0] .. block[63].
 */
static __forceinline void block_zero (int16_t * const block)
{
    __m64 mm0;
    int quad;

    pxor_r2r (mm0, mm0);    /* mm0 = 0 */
    for (quad = 0; quad < 16; quad++) {
        movq_r2m (mm0, *(block+quad*4));
    }
}
/* Values for the `cpu` argument of block_add_DC; they select the dup4 variant. */
#define CPU_MMXEXT 0
#define CPU_MMX 1
/*
 * dup4 - replicate a value across `reg`, using the instruction sequence that
 * matches `cpu` (a variable that must be in scope at the expansion site).
 * For CPU_MMXEXT a single pshufw with selector 0x00 is used; otherwise the
 * two-step punpcklwd + punpckldq sequence is used.
 * NOTE(review): assuming the macros wrap the like-named instructions, both
 * paths copy the lowest 16-bit word of reg into all four word lanes.
 */
#define dup4(reg) \
do { \
if (cpu != CPU_MMXEXT) { \
punpcklwd_r2r (reg, reg); \
punpckldq_r2r (reg, reg); \
} else \
pshufw_r2r (reg, reg, 0x00); \
} while (0)
/*
 * Add a single constant derived from block[0] to an 8x8 pixel area in place.
 *
 *   block   only block[0] is read; block[0] and block[63] are set to 0
 *   dest    first pixel row; row k is read and rewritten at dest + k*stride
 *   stride  distance in bytes between pixel rows
 *   cpu     CPU_MMX or CPU_MMXEXT; selects the broadcast sequence in dup4
 *
 * dc = (block[0] + 64) >> 7 is broadcast with dup4. Two byte vectors are
 * derived from it with packuswb: mm0 from dc and mm1 from 0 - dc. Each of
 * the eight rows then gets "paddusb mm0" followed by "psubusb mm1". Rows
 * alternate between mm2 and mm3, and loads run ahead of stores while dest
 * advances, so the statement order is significant.
 * NOTE(review): assuming the macros (defined outside this chunk) wrap the
 * like-named MMX instructions, packuswb clamps negative words to 0, so mm0
 * holds dc when dc > 0 and mm1 holds -dc when dc < 0, with the other vector
 * all zero; the pair therefore adds a signed dc with saturation to [0, 255].
 */
static __forceinline void block_add_DC (int16_t * const block, uint8_t * dest,
const int stride, const int cpu)
{
__m64 mm0,mm1,mm2,mm3;
movd_v2r ((block[0] + 64) >> 7, mm0); /* mm0 = dc */
pxor_r2r (mm1, mm1); /* mm1 = 0 */
movq_m2r (*dest, mm2); /* mm2 = row 0 */
dup4 (mm0); /* dc in every word of mm0 */
psubsw_r2r (mm0, mm1); /* mm1 = 0 - dc */
packuswb_r2r (mm0, mm0); /* mm0 = byte vector added to every row */
paddusb_r2r (mm0, mm2);
packuswb_r2r (mm1, mm1); /* mm1 = byte vector subtracted from every row */
movq_m2r (*(dest + stride), mm3); /* mm3 = row 1 */
psubusb_r2r (mm1, mm2);
block[0] = 0;
paddusb_r2r (mm0, mm3);
movq_r2m (mm2, *dest); /* store row 0 */
psubusb_r2r (mm1, mm3);
movq_m2r (*(dest + 2*stride), mm2); /* mm2 = row 2 */
dest += stride;
movq_r2m (mm3, *dest); /* store row 1 */
paddusb_r2r (mm0, mm2);
movq_m2r (*(dest + 2*stride), mm3); /* mm3 = row 3 */
psubusb_r2r (mm1, mm2);
dest += stride;
paddusb_r2r (mm0, mm3);
movq_r2m (mm2, *dest); /* store row 2 */
psubusb_r2r (mm1, mm3);
movq_m2r (*(dest + 2*stride), mm2); /* mm2 = row 4 */
dest += stride;
movq_r2m (mm3, *dest); /* store row 3 */
paddusb_r2r (mm0, mm2);
movq_m2r (*(dest + 2*stride), mm3); /* mm3 = row 5 */
psubusb_r2r (mm1, mm2);
dest += stride;
paddusb_r2r (mm0, mm3);
movq_r2m (mm2, *dest); /* store row 4 */
psubusb_r2r (mm1, mm3);
movq_m2r (*(dest + 2*stride), mm2); /* mm2 = row 6 */
dest += stride;
movq_r2m (mm3, *dest); /* store row 5 */
paddusb_r2r (mm0, mm2);
movq_m2r (*(dest + 2*stride), mm3); /* mm3 = row 7 */
psubusb_r2r (mm1, mm2);
block[63] = 0;
paddusb_r2r (mm0, mm3);
movq_r2m (mm2, *(dest + stride)); /* dest is at row 5 here: store row 6 */
psubusb_r2r (mm1, mm3);
movq_r2m (mm3, *(dest + 2*stride)); /* store row 7 */
}
/*
 * Expand declare_idct into the static function mmxext_idct(), built from the
 * mmxext_table initializer and the mmxext_row_* helpers (defined outside
 * this chunk).
 */
declare_idct (mmxext_idct, mmxext_table,
mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
/*
 * MMXEXT variant of "IDCT and copy".
 *
 *   block   int16_t[64] of coefficients; transformed in place by
 *           mmxext_idct and cleared by block_zero before returning
 *   dest    first row of the 8x8 byte area written by block_copy
 *   stride  distance in bytes between destination rows
 */
void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
const int stride)
{
mmxext_idct (block);
block_copy (block, dest, stride);
block_zero (block); /* leave the coefficient block all zero */
}
/*
 * MMXEXT variant of "IDCT and add".
 *
 *   last    when equal to 129 the constant-only shortcut may be taken
 *           NOTE(review): 129 is presumably the caller's marker for a block
 *           that holds only block[0] -- confirm against the caller
 *   block   int16_t[64] of coefficients
 *   dest    first row of the 8x8 byte area that is updated in place
 *   stride  distance in bytes between destination rows
 *
 * Shortcut: if last == 129 and bits 4..6 of block[0] are not 100b, only
 * block_add_DC runs. Otherwise the full transform is done, the result is
 * added to dest and the block is cleared.
 */
void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
                            uint8_t * const dest, const int stride)
{
    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4)) {
        block_add_DC (block, dest, stride, CPU_MMXEXT);
        return;
    }

    mmxext_idct (block);
    block_add (block, dest, stride);
    block_zero (block);
}
/*
 * Expand declare_idct into the static function mmx_idct(), built from the
 * mmx_table initializer and the mmx_row_* helpers (defined outside this
 * chunk).
 */
declare_idct (mmx_idct, mmx_table,
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
/*
 * Plain MMX variant of "IDCT and copy".
 *
 *   block   int16_t[64] of coefficients; transformed in place by mmx_idct
 *           and cleared by block_zero before returning
 *   dest    first row of the 8x8 byte area written by block_copy
 *   stride  distance in bytes between destination rows
 */
void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
const int stride)
{
mmx_idct (block);
block_copy (block, dest, stride);
block_zero (block); /* leave the coefficient block all zero */
}
/*
 * Plain MMX variant of "IDCT and add".
 *
 *   last    when equal to 129 the constant-only shortcut may be taken
 *           NOTE(review): 129 is presumably the caller's marker for a block
 *           that holds only block[0] -- confirm against the caller
 *   block   int16_t[64] of coefficients
 *   dest    first row of the 8x8 byte area that is updated in place
 *   stride  distance in bytes between destination rows
 *
 * Shortcut: if last == 129 and bits 4..6 of block[0] are not 100b, only
 * block_add_DC runs. Otherwise the full transform is done, the result is
 * added to dest and the block is cleared.
 */
void mpeg2_idct_add_mmx (const int last, int16_t * const block,
                         uint8_t * const dest, const int stride)
{
    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4)) {
        block_add_DC (block, dest, stride, CPU_MMX);
        return;
    }

    mmx_idct (block);
    block_add (block, dest, stride);
    block_zero (block);
}
/*
 * Map a scan-table entry to the coefficient order used by the mmx/mmxext
 * idct: bits 3..5 are kept, and the low three bits are rotated right by one
 * (bit 0 moves to bit 2, bits 2..1 move down to bits 1..0).
 */
static int mmx_permute_scan (const int entry)
{
    return (entry & 0x38) | ((entry >> 1) & 3) | ((entry & 1) << 2);
}

/*
 * Patch mpeg2_scan_norm and mpeg2_scan_alt in place for the mmx/mmxext idct.
 *
 * Every one of the 64 entries of both tables is replaced by its permuted
 * value. The tables are modified each time this function runs, so it is
 * meant to be called once.
 */
void mpeg2_idct_mmx_init (void)
{
    extern __align16(uint8_t,mpeg2_scan_norm[64]);
    extern __align16(uint8_t,mpeg2_scan_alt[64]);
    int pos;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
    for (pos = 0; pos < 64; pos++) {
        mpeg2_scan_norm[pos] = mmx_permute_scan (mpeg2_scan_norm[pos]);
        mpeg2_scan_alt[pos] = mmx_permute_scan (mpeg2_scan_alt[pos]);
    }
}
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -