⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct_sse2.c

📁 linux下实现视频播放的播放器
💻 C
字号:
#ifdef __MINGW32__#include <inttypes.h>#include "simd.h"#include "attributes.h"#ifdef __SSE2__static const int BITS_INV_ACC=4;static const int SHIFT_INV_ROW=16-BITS_INV_ACC;static const int SHIFT_INV_COL=1+BITS_INV_ACC;static const int RND_INV_ROW  =1024*(6-BITS_INV_ACC);static const int RND_INV_COL  =16*(BITS_INV_ACC-3);static const int RND_INV_CORR =RND_INV_COL-1;static __align16(const short,M128_round_inv_row[8]) = {RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0};static __align16(const short,M128_one_corr[8]) = {1,1,1,1,1,1,1,1};static __align16(const short,M128_round_inv_col[8]) = {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};static __align16(const short,M128_round_inv_corr[8])= {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};static __align16(const short,M128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tg * (2<<16) + 0.5static __align16(const short,M128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tg * (2<<16) + 0.5static __align16(const short,M128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5static __align16(const short,M128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// cos * (2<<16) + 0.5static __align16(const int16_t,M128_tab_i_04[])={16384, 21407, 16384,  8867, 16384,  -8867, 16384, -21407, 16384,  8867, -16384, -21407, -16384, 21407, 16384,  -8867, 22725, 19266, 19266, -4520, 12873, -22725, 4520, -12873, 12873, 4520, -22725, -12873, 4520, 19266, 19266, -22725};static __align16(const int16_t,M128_tab_i_17[])={22725, 29692, 22725, 12299, 22725, -12299, 22725, -29692, 22725, 12299, -22725, -29692, -22725, 29692, 22725, -12299, 31521, 26722, 26722, -6270, 17855, -31521, 6270, -17855, 17855, 6270, -31521, -17855, 6270, 26722, 26722, -31521};static __align16(const int16_t,M128_tab_i_26[])={21407, 27969, 21407, 11585, 21407, -11585, 21407, -27969, 21407, 11585, -21407, -27969, -21407, 27969, 21407, -11585, 29692, 25172, 25172, -5906, 16819, -29692, 5906, -16819, 16819, 5906, -29692, -16819, 5906, 25172, 25172, -29692};static __align16(const int16_t,M128_tab_i_35[])={19266, 25172, 19266, 10426, 19266, -10426, 19266, -25172, 19266, 10426, -19266, -25172, -19266, 25172, 19266, -10426, 26722, 22654, 22654, -5315, 15137, -26722, 5315, -15137, 15137, 5315, -26722, -15137, 5315, 22654, 22654, -26722}; static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx,const uint8_t * const esi,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7){	 xmm0=_mm_shufflelo_epi16(xmm0, 0xD8 );	 xmm1=_mm_shuffle_epi32( xmm0, 0 );	 pmaddwd (xmm1, esi);	 xmm3=_mm_shuffle_epi32( xmm0, 0x55);	 xmm0=_mm_shufflehi_epi16( xmm0, 0xD8 );	 pmaddwd( xmm3, esi+32 );	 xmm2=_mm_shuffle_epi32( xmm0, 0xAA );	 xmm0=_mm_shuffle_epi32( xmm0, 0xFF );	 pmaddwd( xmm2, esi+16 );	 xmm4=_mm_shufflehi_epi16( xmm4, 0xD8 );	 paddd (xmm1, M128_round_inv_row);	 xmm4=_mm_shufflelo_epi16 (xmm4, 0xD8 );	 pmaddwd (xmm0, esi+48 );	 xmm5=_mm_shuffle_epi32( xmm4, 0 );	 xmm6=_mm_shuffle_epi32( xmm4, 0xAA );	 pmaddwd (xmm5, ecx );	 paddd (xmm1, xmm2 );	 movdqa (xmm2, xmm1 );	 xmm7=_mm_shuffle_epi32( xmm4, 0x55 );	 pmaddwd (xmm6, ecx+16 );	 paddd (xmm0, xmm3 );	 xmm4=_mm_shuffle_epi32( xmm4, 0xFF );	 psubd (xmm2, xmm0 );	 pmaddwd (xmm7, ecx+32 );	 paddd (xmm0, xmm1 );	 psrad (xmm2, 12 );	 paddd (xmm5, M128_round_inv_row);	 pmaddwd (xmm4, ecx+48 );	 paddd (xmm5, xmm6 );	 movdqa (xmm6, xmm5 );	 psrad (xmm0, 12 );	 xmm2=_mm_shuffle_epi32( xmm2, 0x1B );	 packssdw (xmm0, xmm2 );	 paddd (xmm4, xmm7 );	 psubd (xmm6, xmm4 );	 paddd (xmm4, xmm5 );	 psrad (xmm6, 12 );	 psrad (xmm4, 12 );	 xmm6=_mm_shuffle_epi32( xmm6, 0x1B );	 packssdw (xmm4, xmm6 );}static __forceinline void DCT_8_INV_COL_8(__m128i &src0,__m128i &src1,__m128i &src2,__m128i &src3,__m128i &src4,__m128i &src5,__m128i &src6,__m128i &src7,                                          __m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7){	movdqa( xmm1,  M128_tg_3_16  );	movdqa( xmm2, xmm0           );	movdqa( xmm3,  src3      );	pmulhw( xmm0, xmm1           );	pmulhw( xmm1, xmm3           );	movdqa( xmm5,  M128_tg_1_16  );	movdqa( xmm6, xmm4           );	pmulhw( xmm4, xmm5           );	paddsw( xmm0, xmm2           );	pmulhw( xmm5, src1       );	paddsw( xmm1, xmm3           );	movdqa( xmm7,  src6      );	paddsw( xmm0, xmm3           );	movdqa( xmm3,  M128_tg_2_16  );	psubsw( xmm2, xmm1           );	pmulhw( xmm7, xmm3           );	movdqa( xmm1, xmm0           );	pmulhw( xmm3, src2       );	psubsw( xmm5, xmm6           );	paddsw( xmm4, src1       );	paddsw( xmm0, xmm4           );	paddsw( xmm0,  M128_one_corr );	psubsw( xmm4, xmm1           );	movdqa( xmm6, xmm5           );	psubsw( xmm5, xmm2           );	paddsw( xmm5,  M128_one_corr );	paddsw( xmm6, xmm2           );	movdqa( src7, xmm0       );	movdqa( xmm1, xmm4           );	movdqa( xmm0,  M128_cos_4_16 );	paddsw( xmm4, xmm5           );	movdqa( xmm2,  M128_cos_4_16 );	pmulhw( xmm2, xmm4           );	movdqa( src3, xmm6       );	psubsw( xmm1, xmm5           );	paddsw( xmm7, src2       );	psubsw( xmm3, src6       );	movdqa( xmm6, src0           );	pmulhw( xmm0, xmm1           );	movdqa( xmm5, src4       );	paddsw( xmm5, xmm6          );	psubsw( xmm6, src4       );	paddsw( xmm4, xmm2           );	por   (  xmm4,  M128_one_corr     );	paddsw(  xmm0, xmm1                 );	por   (  xmm0,  M128_one_corr     );	movdqa( xmm2, xmm5                  );	paddsw( xmm5, xmm7                  );	movdqa( xmm1, xmm6                  );	paddsw( xmm5,  M128_round_inv_col );	psubsw( xmm2, xmm7                  );	movdqa( xmm7, src7            );	paddsw( xmm6, xmm3                  );	paddsw( xmm6,  M128_round_inv_col );	paddsw( xmm7, xmm5                  );	psraw ( xmm7, SHIFT_INV_COL           );	psubsw( xmm1, xmm3                   );	paddsw( xmm1,  M128_round_inv_corr );	movdqa( xmm3, xmm6                   );	paddsw( xmm2,  M128_round_inv_corr );	paddsw( xmm6, xmm4                   );	movdqa( src0,xmm7                  );	psraw (xmm6, SHIFT_INV_COL           );	movdqa( xmm7, xmm1                   );	paddsw( xmm1, xmm0                   );	movdqa( src1, xmm6             );	psraw (xmm1, SHIFT_INV_COL           );	movdqa( xmm6, src3             );	psubsw( xmm7, xmm0                   );	psraw (xmm7, SHIFT_INV_COL           );	movdqa( src2, xmm1             );	psubsw( xmm5, src7             );	psraw (xmm5, SHIFT_INV_COL           );	movdqa( src7, xmm5             );	psubsw( xmm3, xmm4                   );	paddsw( xmm6, xmm2                   );	psubsw( xmm2, src3             );	psraw (xmm6, SHIFT_INV_COL           );	psraw (xmm2, SHIFT_INV_COL           );	movdqa( src3, xmm6             );	psraw (xmm3, SHIFT_INV_COL           );	movdqa( src4, xmm2             );	movdqa( src5, xmm7             );	movdqa( src6, xmm3             );}static __forceinline void idct_M128ASM(__m128i &src0,__m128i &src1,__m128i &src2,__m128i &src3,__m128i &src4,__m128i &src5,__m128i &src6,__m128i &src7){	 src0=_mm_srai_epi16(src0,4);	 src1=_mm_srai_epi16(src1,4);	 src2=_mm_srai_epi16(src2,4);	 src3=_mm_srai_epi16(src3,4);	 src4=_mm_srai_epi16(src4,4);	 src5=_mm_srai_epi16(src5,4);	 src6=_mm_srai_epi16(src6,4);	 src7=_mm_srai_epi16(src7,4);__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;	 movdqa (xmm0, src0);	 uint8_t *esi=(uint8_t*)M128_tab_i_04;	 movdqa (xmm4, src2);	 uint8_t *ecx=(uint8_t*)M128_tab_i_26;	DCT_8_INV_ROW(ecx,esi,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);	 movdqa (src0, xmm0);	 movdqa (src2, xmm4);	 movdqa (xmm0, src4);	 movdqa (xmm4, src6);	DCT_8_INV_ROW(ecx,esi,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);	 movdqa (src4, xmm0);	 movdqa (src6, xmm4);	 movdqa (xmm0, src3);	 esi=(uint8_t*)M128_tab_i_35;	 movdqa (xmm4, src1);	 ecx=(uint8_t*)M128_tab_i_17;	DCT_8_INV_ROW(ecx,esi,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);	 movdqa (src3, xmm0);	 movdqa (src1, xmm4);	 movdqa (xmm0, src5);	 movdqa (xmm4, src7);	DCT_8_INV_ROW(ecx,esi,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);	DCT_8_INV_COL_8(src0,src1,src2,src3,src4,src5,src6,src7,	                xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);}void mpeg2_idct_copy_sse2(int16_t* block, uint8_t* dest, const int stride){ __m128i &src0=*(__m128i*)(block+0*16/2); __m128i &src1=*(__m128i*)(block+1*16/2); __m128i &src2=*(__m128i*)(block+2*16/2); __m128i &src3=*(__m128i*)(block+3*16/2); __m128i &src4=*(__m128i*)(block+4*16/2); __m128i &src5=*(__m128i*)(block+5*16/2); __m128i &src6=*(__m128i*)(block+6*16/2); __m128i &src7=*(__m128i*)(block+7*16/2);	        idct_M128ASM (src0,src1,src2,src3,src4,src5,src6,src7);	__m128i zero = _mm_setzero_si128();	__m128i r0 = _mm_packus_epi16(_mm_load_si128(&src0), _mm_load_si128(&src1));	__m128i r1 = _mm_packus_epi16(_mm_load_si128(&src2), _mm_load_si128(&src3));	__m128i r2 = _mm_packus_epi16(_mm_load_si128(&src4), _mm_load_si128(&src5));	__m128i r3 = _mm_packus_epi16(_mm_load_si128(&src6), _mm_load_si128(&src7));	_mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0);	_mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0);	_mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1);	_mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1);	_mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2);	_mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2);	_mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3);	_mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3);	_mm_store_si128(&src0, zero);	_mm_store_si128(&src1, zero);	_mm_store_si128(&src2, zero);	_mm_store_si128(&src3, zero);	_mm_store_si128(&src4, zero);	_mm_store_si128(&src5, zero);	_mm_store_si128(&src6, zero);	_mm_store_si128(&src7, zero);}void mpeg2_idct_add_sse2(int,int16_t* block, uint8_t* dest, const int stride){ __m128i &src0=*(__m128i*)(block+0*16/2); __m128i &src1=*(__m128i*)(block+1*16/2); __m128i &src2=*(__m128i*)(block+2*16/2); __m128i &src3=*(__m128i*)(block+3*16/2); __m128i &src4=*(__m128i*)(block+4*16/2); __m128i &src5=*(__m128i*)(block+5*16/2); __m128i &src6=*(__m128i*)(block+6*16/2); __m128i &src7=*(__m128i*)(block+7*16/2);	        idct_M128ASM (src0,src1,src2,src3,src4,src5,src6,src7);	        	__m128i zero = _mm_setzero_si128();	__m128i r0 = _mm_load_si128(&src0);	__m128i r1 = _mm_load_si128(&src1);	__m128i r2 = _mm_load_si128(&src2);	__m128i r3 = _mm_load_si128(&src3);	__m128i r4 = _mm_load_si128(&src4);	__m128i r5 = _mm_load_si128(&src5);	__m128i r6 = _mm_load_si128(&src6);	__m128i r7 = _mm_load_si128(&src7);	__m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]);	__m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]);	__m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]);	__m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]);	__m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]);	__m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]);	__m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]);	__m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]);	r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero));	r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero));	r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero));	r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero));	r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero));	r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero));	r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero));	r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero));	r0 = _mm_packus_epi16(r0, r1);	r1 = _mm_packus_epi16(r2, r3);	r2 = _mm_packus_epi16(r4, r5);	r3 = _mm_packus_epi16(r6, r7);	_mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0);	_mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0);	_mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1);	_mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1);	_mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2);	_mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2);	_mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3);	_mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3);	_mm_store_si128(&src0, zero);	_mm_store_si128(&src1, zero);	_mm_store_si128(&src2, zero);	_mm_store_si128(&src3, zero);	_mm_store_si128(&src4, zero);	_mm_store_si128(&src5, zero);	_mm_store_si128(&src6, zero);	_mm_store_si128(&src7, zero);}#endif //__SSE2__#endif // __MINGW32__

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -