simd.h.svn-base

来自「ffdshow源码」· SVN-BASE 代码 · 共 582 行 · 第 1/2 页

SVN-BASE
582
字号
   // (tail of the preceding Tmmx pshufw-emulation member; its opening lines
   //  are above this excerpt)
   por     (mm2, mm3);              // mm2 = low word copied to all four words
   return mm2;
  }

  // Plain-MMX emulation of _mm_shuffle_pi16(src,1):
  // result.w0 = src.w1, result.w1 = result.w2 = result.w3 = src.w0.
  static __forceinline __m64 _mm_shuffle_pi16_1(const __m64 &src)
  {
   static const __int64 const1=0x00000000FFFF0000LL;  // selects source word 1
   static const __int64 const2=0x000000000000FFFFLL;  // selects source word 0
   __m64 w0=_mm_srli_si64(_mm_and_si64(src,*(__m64*)&const1),16);  // src.w1 moved down to word 0
   __m64 w1=_mm_and_si64(src,*(__m64*)&const2);                    // src.w0 in place
   // broadcast w1 into words 1..3, then merge w0 into word 0
   return _mm_or_si64(_mm_or_si64(_mm_or_si64(_mm_slli_si64(w1,48),_mm_slli_si64(w1,32)),_mm_slli_si64(w1,16)),w0);
  }

  // Plain-MMX emulation of _mm_shuffle_pi16(src,14) (selector 0b00001110):
  // result.w0 = src.w2, result.w1 = src.w3, result.w2 = result.w3 = src.w0.
  static __forceinline __m64 _mm_shuffle_pi16_14(const __m64 &src)
  {
   static const __int64 const1=0x000000000000FFFFLL;   // selects source word 0
   static const __int64 const2=0xffffffff00000000ULL;  // selects source words 2..3
   __m64 w34=_mm_and_si64(src,*(__m64*)&const1);
   __m64 w12=_mm_srli_si64(_mm_and_si64(src,*(__m64*)&const2),32);  // words 2,3 -> words 0,1
   return _mm_or_si64(w12,_mm_or_si64(_mm_slli_si64(w34,32),_mm_slli_si64(w34,48)));
  }

  // Plain-MMX emulation of _mm_shuffle_pi16(src,0x50) (selector 0b01010000):
  // result.w0 = result.w1 = src.w0, result.w2 = result.w3 = src.w1.
  static __forceinline __m64 _mm_shuffle_pi16_x50(const __m64 &src)
  {
   static const __int64 const1=0x00000000ffff0000LL;  // selects source word 1
   static const __int64 const2=0x000000000000ffffLL;  // selects source word 0
   __m64 w3=_mm_and_si64(src,*(__m64*)&const1);
   __m64 w4=_mm_and_si64(src,*(__m64*)&const2);
   return _mm_or_si64( _mm_or_si64(_mm_slli_si64(w3,32),_mm_slli_si64(w3,16)) , _mm_or_si64(_mm_slli_si64(w4,16),w4));
  }

  // Plain-MMX emulation of PSADBW: leaves the sum of absolute differences of
  // the eight unsigned bytes of mm0 and SourceMM in mm0's low dword.
  // movq/psubusb/etc. here are the wrapper methods of this struct family
  // (presumably supplied by MMX_INSTRUCTIONS), not raw instructions.
  static __forceinline void psadbw(__m64 &mm0,const __m64 &SourceMM)
  {
   __m64 mm1;
   movq (mm1, SourceMM);
   __m64 mm4;
   movq (mm4, mm0);
   psubusb (mm0, mm1);    // saturating a-b
   psubusb (mm1, mm4);    // saturating b-a
   por (mm0, mm1);        // |a-b| per byte (one operand of each pair saturated to 0)
   __m64 mm7=_mm_setzero_si64();
   movq (mm1,mm0);
   punpcklbw (mm0,mm7);   // zero-extend low four bytes to words
   punpckhbw (mm1,mm7);   // zero-extend high four bytes to words
   paddusw (mm0,mm1);
   static const __int64 mmx_one=0x0001000100010001LL;
   pmaddwd (mm0, mmx_one);  // pairwise word sums -> two dwords
   movq (mm7, mm0);
   psrlq (mm7, 32);
   paddd (mm0, mm7);        // fold the two dwords into the low dword
   // NOTE(review): named "ffff" but the value is 0xfffff (20 bits). Harmless
   // here - the SAD of 8 bytes never exceeds 8*255 = 0x7F8 - but confirm
   // against upstream ffdshow before changing it.
   static const __int64 mmx_ffff=0x00000000000fffffLL;
   pand (mm0, mmx_ffff);
  }

  // Value-returning min/max helpers layered on the in-place
  // pminub/pmaxub/pminsw/pmaxsw wrappers of this struct.
  static __forceinline __m64 min_pu8(const __m64 &mm1,const __m64 &mm2)
  {
   __m64 mm0=mm1;
   pminub(mm0,mm2);
   return mm0;
  }
  static __forceinline __m64 max_pu8(const __m64 &mm1,const __m64 &mm2)
  {
   __m64 mm0=mm1;
   pmaxub(mm0,mm2);
   return mm0;
  }
  static __forceinline __m64 min_pi16(const __m64 &mm1,const __m64 &mm2)
  {
   __m64 mm0=mm1;
   pminsw(mm0,mm2);
   return mm0;
  }
  static __forceinline __m64 max_pi16(const __m64 &mm1,const __m64 &mm2)
  {
   __m64 mm0=mm1;
   pmaxsw(mm0,mm2);
   return mm0;
  }
  MMX_INSTRUCTIONS  // shared instruction-wrapper members (macro defined earlier in this file - outside this excerpt)
 };

//===================================== MMXEXT =====================================
// Tmmxext: same interface as Tmmx, but backed by the MMX2/SSE integer
// extensions (pmaxub, pavgb, psadbw, pshufw, movntq, prefetch), so most
// members collapse to a single intrinsic instead of an emulation sequence.
struct Tmmxext
{
 typedef Tmmx::__m __m;
 typedef Tmmx::int2 int2;
 static const size_t size=Tmmx::size;
 static const int align=Tmmx::align;
 typedef Tmmxext T64;
 static __forceinline void pmaxub(__m64 &mmr1,const __m64 &mmr2) { mmr1=_mm_max_pu8(mmr1,mmr2); }
 static __forceinline void pmaxub(__m64 &mmr1,const void *mmr2) { pmaxub(mmr1,*(__m64*)mmr2); }
 static __forceinline void pminub(__m64 &mmr1,const __m64 &mmr2) { mmr1=_mm_min_pu8(mmr1,mmr2); }
 static __forceinline void pminub(__m64 &mmr1,const void *mmr2) { pminub(mmr1,*(__m64*)mmr2); }
 static __forceinline void pminsw(__m64 &mmr1,const __m64 &mmr2) { mmr1=_mm_min_pi16(mmr1,mmr2); }
 static __forceinline void pavgb(__m64 &mmr1,const __m64 &mmr2) { mmr1=_mm_avg_pu8(mmr1,mmr2); }
 static __forceinline void pavgb(__m64 &mmr1,const void *mmr2) { mmr1=_mm_avg_pu8(mmr1,*(__m64*)mmr2); }
 // v_pavgb: the unnamed __m64/__int64 parameters are scratch/mask arguments
 // presumably required by the plain-MMX variant's signature; unused here
 // where a real pavgb exists.
 static __forceinline void v_pavgb(__m64 &mmr1,const __m64 &mmr2,__m64,__int64) { mmr1=_mm_avg_pu8(mmr1,mmr2); }
 static __forceinline void v_pavgb(__m64 &mmr1,const void *mmr2,__m64,__int64) { mmr1=_mm_avg_pu8(mmr1,*(__m64*)mmr2); }
 static __forceinline void sfence(void) { _mm_sfence(); }
 // non-temporal (cache-bypassing) store; callers pair it with sfence()
 static __forceinline void movntq(void *dst,const __m64 &src) { _mm_stream_pi((__m64*)dst,src); }
 static __forceinline void v_pminub(__m64 &mmr1,const __m64 &mmr2,__m64) { mmr1=_mm_min_pu8(mmr1,mmr2); }
 static __forceinline void v_pminub(__m64 &mmr1,const __int64 &mmr2,__m64 &mmrw) { v_pminub(mmr1,*(const __m64*)&mmr2,mmrw); }
 static __forceinline void pmulhuw(__m64 &mmr1,const __m64 &mmr2) { mmr1=_mm_mulhi_pu16(mmr1,mmr2); }
 static __forceinline void prefetchnta(const void *ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
 static __forceinline void prefetcht0(const void *ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
 // real PSHUFW for the selectors emulated by hand in Tmmx
 static __forceinline __m64 _mm_shuffle_pi16_0(const __m64 &src) { return _mm_shuffle_pi16(src,0); }
 static __forceinline __m64 _mm_shuffle_pi16_1(const __m64 &src) { return _mm_shuffle_pi16(src,1); }
 static __forceinline __m64 _mm_shuffle_pi16_14(const __m64 &src) { return _mm_shuffle_pi16(src,(3 << 2) + 2); }
 static __forceinline __m64 _mm_shuffle_pi16_x50(const __m64 &src) { return _mm_shuffle_pi16(src,0x50); }
 static __forceinline void psadbw(__m64 &mm3,const __m64 &mm2) { mm3=_mm_sad_pu8(mm3,mm2); }
 static __forceinline __m64 min_pu8(const __m64 &mm1,const __m64 &mm2) { return _mm_min_pu8(mm1,mm2); }
 static __forceinline __m64 max_pu8(const __m64 &mm1,const __m64 &mm2) { return _mm_max_pu8(mm1,mm2); }
 static __forceinline __m64 min_pi16(const __m64 &mm1,const __m64 &mm2) { return _mm_min_pi16(mm1,mm2); }
 static __forceinline __m64 max_pi16(const __m64 &mm1,const __m64 &mm2) { return _mm_max_pi16(mm1,mm2); }
 static __forceinline void pmaxsw(__m64 &dst,const __m64 &src) { dst=_mm_max_pi16(dst,src); }
 MMX_INSTRUCTIONS  // shared instruction-wrapper members (macro defined earlier in this file)
};

// |a-b| for eight unsigned bytes via two saturating subtractions
// (the operand subtracted the "wrong" way saturates to zero).
static __forceinline __m64 _mm_absdif_u8(__m64 mm1,__m64 mm2)
{
 __m64 mm7=mm1;
 mm1=_mm_subs_pu8(mm1,mm2);
 mm2=_mm_subs_pu8(mm2,mm7);
 return _mm_or_si64(mm2,mm1);
}

// dst[i] = saturating-add(dst[i], src[i]) for len bytes, eight at a time.
// NOTE(review): a tail of len%8 bytes is ignored - presumably callers always
// pass multiples of 8; confirm before reusing elsewhere.
static __forceinline void memadd(unsigned char *dst,const unsigned char *src,unsigned int len)
{
 __m64 *dst8=(__m64*)dst;const __m64 *src8=(__m64*)src;
 for (unsigned int i=0;i<len/8;i++,src8++,dst8++)
  *dst8=_mm_adds_pu8(*src8,*dst8);
}

//====================================== SSE2 ======================================
#ifdef __SSE2__
// Tsse2: 16-byte (__m128i) counterpart of the 8-byte structs above. Members
// keep the *_pi* MMX spelling so templated code (abs_16/absdif_s16 below and
// client code) can be instantiated with any of Tmmx/Tmmxext/Tsse2 unchanged.
struct Tsse2
{
 typedef __m128i __m;
 typedef __m64 int2;
 typedef int64_t integer2_t;
 static const size_t size=sizeof(__m);
 static const int align=16;
 typedef Tmmxext T64;  // 64-bit companion type used for leftover/narrow work
 static __forceinline __m setzero_si64(void) {return _mm_setzero_si128();}
 // the 8-byte pattern is replicated into both halves of the 128-bit register
 static __forceinline __m set_pi8(char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0) {return _mm_set_epi8(b7,b6,b5,b4,b3,b2,b1,b0,b7,b6,b5,b4,b3,b2,b1,b0);}
 static __forceinline __m set_pi32(int i1,int i0) {return _mm_set_epi32(i1,i0,i1,i0);}
 static __forceinline __m set1_pi8(char b) {return _mm_set1_epi8(b);}
 static __forceinline __m set1_pi16(short s) {return _mm_set1_epi16(s);}
 // __align16 is a project macro (defined elsewhere in this file/project)
 static __forceinline __m set1_pi64(int64_t s) {__align16(int64_t,x[])={s,s};return *(__m*)x;}//__m128i _mm_set1_epi64(*(__m64*)&s); TODO: _mm_set1_epi64x
 static __forceinline __m packs_pu16(const __m &m1,const __m &m2) {return _mm_packus_epi16(m1,m2);}
 static __forceinline __m slli_pi16(const __m &m,int count) {return _mm_slli_epi16(m,count);}
 static __forceinline __m srli_pi16(const __m &m,int count) {return _mm_srli_epi16(m,count);}
 static __forceinline __m srli_si64(const __m &m,int count) {return _mm_srli_epi64(m,count);}
 static __forceinline __m srai_pi16(const __m &m,int count) {return _mm_srai_epi16(m,count);}
 static __forceinline __m madd_pi16(const __m &m1,const __m &m2) {return _mm_madd_epi16(m1,m2);}
 static __forceinline __m add_pi16(const __m &m1,const __m &m2) {return _mm_add_epi16(m1,m2);}
 static __forceinline __m adds_pi16(const __m &m1,const __m &m2) {return _mm_adds_epi16(m1,m2);}
 static __forceinline __m adds_pu16(const __m &m1,const __m &m2) {return _mm_adds_epu16(m1,m2);}
 static __forceinline __m adds_pu8(const __m &m1,const __m &m2) {return _mm_adds_epu8(m1,m2);}
 static __forceinline __m sub_pi16(const __m &m1,const __m &m2) {return _mm_sub_epi16(m1,m2);}
 static __forceinline __m subs_pi16(const __m &m1,const __m &m2) {return _mm_subs_epi16(m1,m2);}
 static __forceinline __m subs_pu16(const __m &m1,const __m &m2) {return _mm_subs_epu16(m1,m2);}
 static __forceinline __m subs_pu8(const __m &m1,const __m &m2) {return _mm_subs_epu8(m1,m2);}
 static __forceinline __m or_si64(const __m &m1,const __m &m2) {return _mm_or_si128(m1,m2);}
 static __forceinline __m xor_si64(const __m &m1,const __m &m2) {return _mm_xor_si128(m1,m2);}
 static __forceinline __m and_si64(const __m &m1,const __m &m2) {return _mm_and_si128(m1,m2);}
 static __forceinline __m andnot_si64(const __m &m1,const __m &m2) {return _mm_andnot_si128(m1,m2);}
 static __forceinline __m mullo_pi16(const __m &m1,const __m &m2) {return _mm_mullo_epi16(m1,m2);}
 static __forceinline __m mulhi_pi16(const __m &m1,const __m &m2) {return _mm_mulhi_epi16(m1,m2);}
 static __forceinline __m unpacklo_pi8(const __m &m1,const __m &m2) {return _mm_unpacklo_epi8(m1,m2);}
 static __forceinline __m unpackhi_pi8(const __m &m1,const __m &m2) {return _mm_unpackhi_epi8(m1,m2);}
 static __forceinline __m cmpgt_pi16(const __m &m1,const __m &m2) {return _mm_cmpgt_epi16(m1,m2);}
 static __forceinline __m cmpeq_pi16(const __m &m1,const __m &m2) {return _mm_cmpeq_epi16(m1,m2);}
 static __forceinline __m cmpeq_pi8(const __m &m1,const __m &m2) {return _mm_cmpeq_epi8(m1,m2);}
 static __forceinline __m min_pi16(const __m &mm1,const __m &mm2) {return _mm_min_epi16(mm1,mm2);}
 static __forceinline __m max_pi16(const __m &mm1,const __m &mm2) {return _mm_max_epi16(mm1,mm2);}
 // load2/store2 move only the low 8 bytes (the "int2" half-register)
 static __forceinline __m load2(const void *ptr) {return _mm_loadl_epi64((const __m128i*)ptr);}
 static __forceinline void store2(void *ptr,const __m &m) {_mm_storel_epi64((__m128i*)ptr,m);}
 static __forceinline void storeU(void *ptr,const __m &m) { _mm_storeu_si128((__m*)ptr,m);}
 static __forceinline __m loadU(const void *ptr) {return _mm_loadu_si128((const __m*)ptr);}
 // no-op: EMMS is only needed after MMX (x87-aliased) register use
 static __forceinline void empty(void) {/*_mm_empty();*/}
 static __forceinline void psadbw(__m &mm3,const __m &mm2) {mm3=_mm_sad_epu8(mm3,mm2);}
 static __forceinline void prefetchnta(const void *ptr) {_mm_prefetch((const char*)ptr,_MM_HINT_NTA);}
 // broadcast word 0 of each 64-bit half to all words of that half
 static __forceinline __m shuffle_pi16_0(const __m &mm0) {return _mm_shufflehi_epi16(_mm_shufflelo_epi16(mm0,0),0);}
 static __forceinline void pmaxub(__m &mmr1,const __m &mmr2) {mmr1=_mm_max_epu8(mmr1,mmr2);}
 static __forceinline void pmulhuw(__m &mmr1,const __m &mmr2) {mmr1=_mm_mulhi_epu16(mmr1,mmr2);}
 static __forceinline void movntq(void *dst,const __m &src) {_mm_stream_si128((__m128i*)dst,src);}
 static __forceinline void pavgb(__m &mmr1,const __m &mmr2) {mmr1=_mm_avg_epu8(mmr1,mmr2);}
 static __forceinline void pavgb(__m &mmr1,const void *mmr2) {mmr1=_mm_avg_epu8(mmr1,*(__m*)mmr2);}
 static __forceinline void sfence(void) {_mm_sfence();}
};
#endif //__SSE2__

// abs_16<_mm>: branch-free per-word absolute value, (x ^ (x>>15)) - (x>>15).
// Note -32768 maps to itself (two's-complement edge case).
template<class _mm> static __forceinline typename _mm::__m abs_16(const typename _mm::__m &mm0)
{
 typename _mm::__m mm6=_mm::srai_pi16(mm0,15);   // 0x0000 or 0xFFFF per word (sign mask)
 return _mm::sub_pi16(_mm::xor_si64(mm0,mm6),mm6);
}

// absdif_s16<_mm>: per-word |a-b| for signed words without overflow -
// a compare mask conditionally swaps each pair so the final subtraction
// is always max-min.
template<class _mm> static __forceinline typename _mm::__m absdif_s16(typename _mm::__m mm0,typename _mm::__m mm1)
{
 typename _mm::__m mm2=mm0;
 mm0=_mm::cmpgt_pi16(mm0,mm1);        // mask: 0xFFFF where a > b
 typename _mm::__m mm4=mm2;
 mm2=_mm::xor_si64(mm2,mm1);
 mm2=_mm::and_si64(mm2,mm0);          // (a ^ b) only where a > b
 typename _mm::__m mm3=mm2;
 mm4=_mm::xor_si64(mm4,mm2);          // min(a,b)
 mm1=_mm::xor_si64(mm1,mm3);          // max(a,b)
 return _mm::sub_pi16(mm1,mm4);
}
#pragma warning(pop)  // pairs with a warning(push) near the top of this header (outside this excerpt)
#endif

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?