📄 simd.h
字号:
#ifndef _SIMD_H_#define _SIMD_H_#include "simd_common.h"#pragma warning(push)#pragma warning(disable:4799)#pragma warning(disable:4309)#pragma warning(disable:4700)#define MMX_INSTRUCTION(instruction,function) \ static __forceinline void instruction(__m64 &dst,const __m64 &src) {dst=function(dst, src);} \ static __forceinline void instruction(__m64 &dst,const void *src) {dst=function(dst,*(__m64*) src);} \ static __forceinline void instruction(__m64 &dst,const __int64 &src) {dst=function(dst,*(__m64*)&src);} #define SSE2I_INSTRUCTION(instruction,function) \ static __forceinline void instruction(__m128i &dst,const __m128i &src) {dst=function(dst, src);} \ static __forceinline void instruction(__m128i &dst,const void *src) {dst=function(dst,*(__m128i*)src);}#include "simd_instructions.h"#undef MMX_INSTRUCTION#undef SSE2I_INSTRUCTIONstatic __forceinline void movq(__m64 &dst,const __m64 &src) {dst=src;}static __forceinline void movq(__m64 &dst,const void *src) {dst=*(__m64*)src;}static __forceinline void movq(__m64 &dst,const __int64 &src) {dst=*(__m64*)&src;}static __forceinline void movq(void *dst,const __m64 &src) {*(__m64*)dst=src;}static __forceinline void movntq(void *dst,const __m64 &src) {_mm_stream_pi((__m64*)dst,src);}static __forceinline void movdqu(__m64 &dst,const void *src) {dst=*(__m64*)src;}static __forceinline void movd(__m64 &dst,int src) {dst=_mm_cvtsi32_si64(src);}static __forceinline void movd(int &dst,const __m64 &src) {dst=_mm_cvtsi64_si32(src);}static __forceinline void movd(__m64 &dst,const void *src) {dst=_mm_cvtsi32_si64(*(const int*)src);}static __forceinline void movd(void *dst,const __m64 &src) {*(int*)dst=_mm_cvtsi64_si32(src);}static __forceinline void psllq(__m64 &dst,int i) {dst=_mm_slli_si64(dst,i);}static __forceinline void pslld(__m64 &dst,int i) {dst=_mm_slli_pi32(dst,i);}static __forceinline void psllw(__m64 &dst,int i) {dst=_mm_slli_pi16(dst,i);}static __forceinline void psrlq(__m64 &dst,int i) {dst=_mm_srli_si64(dst,i);}static __forceinline void psrld(__m64 &dst,int i) {dst=_mm_srli_pi32(dst,i);}static __forceinline void psrlw(__m64 &dst,int i) {dst=_mm_srli_pi16(dst,i);}static __forceinline void psraw(__m64 &dst,int i) {dst=_mm_srai_pi16(dst,i);}static __forceinline void psraw(__m64 &dst,const __m64 &src) {dst=_mm_sra_pi16(dst,src);}static __forceinline void psrad(__m64 &dst,int i) {dst=_mm_srai_pi32(dst,i);}static __forceinline void prefetcht0(const void *a) {_mm_prefetch((char*)a,_MM_HINT_T0);}static __forceinline void movaps(__m128 &dst,const __m128 &src) {dst=src;}static __forceinline void movaps(void *dst,const __m128 &src) {*(__m128*)dst=src;}static __forceinline void movups(__m128 &dst,const void *src) {dst=_mm_loadu_ps((float*)src);}static __forceinline void movups(void *dst,const __m128 &src) {_mm_storeu_ps((float*)dst,src);}static __forceinline void movss(__m128 &dst,const void *src) {dst=_mm_load_ss((float*)src);}static __forceinline void movss(void *dst,const __m128 &src) {_mm_store_ss((float*)dst,src);}static __forceinline void movhlps(__m128 &dst,const __m128 &src) {dst=_mm_movehl_ps(dst,src);}static __forceinline void movlhps(__m128 &dst,const __m128 &src) {dst=_mm_movelh_ps(dst,src);}static __forceinline void movlps(__m128 &dst,const void *src) {dst=_mm_loadl_pi(dst,(const __m64*)src);}static __forceinline void movlps(void *dst,const __m128 &src) {_mm_storel_pi((__m64*)dst,src);}static __forceinline void movhps(__m128 &dst,const void *src) {dst=_mm_loadh_pi(dst,(const __m64*)src);}static __forceinline void movhps(void *dst,const __m128 &src) {_mm_storeh_pi((__m64*)dst,src);}static __forceinline void xorps(__m128 &dst,const __m128 &src) {dst=_mm_xor_ps(dst,src);}static __forceinline void addps(__m128 &dst,const __m128 &src) {dst=_mm_add_ps(dst,src);}static __forceinline void addss(__m128 &dst,const __m128 &src) {dst=_mm_add_ss(dst,src);}static __forceinline void mulps(__m128 &dst,const __m128 &src) {dst=_mm_mul_ps(dst,src);}static __forceinline void mulss(__m128 &dst,const __m128 &src) {dst=_mm_mul_ss(dst,src);}static __forceinline void minps(__m128 &dst,const __m128 &src) {dst=_mm_min_ps(dst,src);}static __forceinline void cvtps2pi(__m64 &dst,const __m128 &src) {dst=_mm_cvtps_pi32(src);}static __forceinline void cmpnltps(__m128 &dst,const __m128 &src) {dst=_mm_cmpnlt_ps(dst,src);}static __forceinline void cvtpi2ps(__m128 &dst,const __m64 &src) {dst=_mm_cvtpi32_ps(dst,src);}#ifdef __SSE2__static __forceinline void movq(__m128i &dst,const __m128i &src) {dst=src;}static __forceinline void movq(__m128i &dst,const void *src) {dst=*(__m128i*)src;}static __forceinline void movq(const void *dst,__m128i &src) {*(__m128i*)dst=src;}static __forceinline void movd(__m128i &dst,const void *src) {dst=_mm_loadl_epi64((__m128i*)src);}static __forceinline void movd(void *dst,const __m128i &src) {_mm_storel_epi64((__m128i*)dst,src);}static __forceinline void movdqu(__m128i &dst,const void *src) {dst=_mm_loadu_si128((__m128i*)src);}static __forceinline void movdqu(__m128i &dst,const __m128i &src) {dst=_mm_loadu_si128(&src);}static __forceinline void movdqa(__m128i &dst,const __m128i &src) {dst=src;}static __forceinline void movdqa(__m128i &dst,const void * src) {dst=_mm_load_si128((__m128i*)src);}static __forceinline void movdqa(void *dst,const __m128i &src) {_mm_store_si128((__m128i*)dst,src);}static __forceinline void movntdq(void *dst,const __m128i &src) {_mm_stream_si128((__m128i*)dst,src);}static __forceinline void movdq2q(__m64 &dst,const __m128i &src) {dst=_mm_movepi64_pi64(src);} static __forceinline void psrlw(__m128i &dst,int i) {dst=_mm_srli_epi16(dst,i);}static __forceinline void psrlq(__m128i &dst,int i) {dst=_mm_srli_epi64(dst,i);}static __forceinline void psrad(__m128i &dst,int i) {dst=_mm_srai_epi32(dst,i);}static __forceinline void psraw(__m128i &dst,int i) {dst=_mm_srai_epi16(dst,i);}static __forceinline void psraw(__m128i &dst,const __m128i &src) {dst=_mm_sra_epi16(dst,src);}static __forceinline void psllw(__m128i &dst,int i) {dst=_mm_slli_epi16(dst,i);}static __forceinline void pslld(__m128i &dst,int i) {dst=_mm_slli_epi32(dst,i);}static __forceinline void psllq(__m128i &dst,int i) {dst=_mm_slli_epi64(dst,i);}//static __forceinline void pshufd(__m128i &dst,const __m128i &src,const int i) {dst=_mm_shuffle_epi32(src,i);}//static __forceinline void pshuflw(__m128i &dst,const __m128i &src,const int i) {dst=_mm_shufflelo_epi16(src,i);}//static __forceinline void pshufhw(__m128i &dst,const __m128i &src,const int i) {dst=_mm_shufflehi_epi16(src,i);}static __forceinline void cvtps2dq(__m128i &dst,const __m128 &src) {dst=_mm_cvtps_epi32(src);}static __forceinline void cvtdq2ps(__m128 &dst,const __m128i &src) {dst=_mm_cvtepi32_ps(src);}static __forceinline void movlpd(__m128d &dst,const void *src) {dst=_mm_loadl_pd(dst,(double*)src);}static __forceinline void movhpd(__m128d &dst,const void *src) {dst=_mm_loadh_pd(dst,(double*)src);}static __forceinline void movlpd(void *dst,const __m128d &src) {_mm_storel_pd((double*)dst,src);}static __forceinline void movhpd(void *dst,const __m128d &src) {_mm_storeh_pd((double*)dst,src);}#if defined(__INTEL_COMPILER) || (defined(__GNUC__) && __GNUC__>=4) static __forceinline void movlpd(__m128i &dst,const void *src) {dst=_mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(dst),(double*)src));} static __forceinline void movhpd(__m128i &dst,const void *src) {dst=_mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(dst),(double*)src));} static __forceinline void movlpd(void *dst,const __m128i &src) {_mm_storel_pd((double*)dst,_mm_castsi128_pd(src));} static __forceinline void movhpd(void *dst,const __m128i &src) {_mm_storeh_pd((double*)dst,_mm_castsi128_pd(src));} static __forceinline void movlps(__m128i &dst,const void *src) {dst=_mm_castps_si128(_mm_loadl_pi(_mm_castsi128_ps(dst),(const __m64*)src));} static __forceinline void movlps(void *dst,const __m128i &src) {_mm_storel_pi((__m64*)dst,_mm_castsi128_ps(src));} static __forceinline void movhps(__m128i &dst,const void *src) {dst=_mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(dst),(const __m64*)src));} static __forceinline void movhps(void *dst,const __m128i &src) {_mm_storeh_pi((__m64*)dst,_mm_castsi128_ps(src));}#else static __forceinline __m128i _mm_castps_si128(__m128 &src) {return (__m128i&)src;} static __forceinline void movlpd(__m128i &dst,const void *src) {(__m128d&)dst=_mm_loadl_pd((__m128d&)dst,(double*)src);} static __forceinline void movhpd(__m128i &dst,const void *src) {(__m128d&)dst=_mm_loadh_pd((__m128d&)dst,(double*)src);} static __forceinline void movlpd(void *dst,const __m128i &src) {_mm_storel_pd((double*)dst,(const __m128d&)src);} static __forceinline void movhpd(void *dst,const __m128i &src) {_mm_storeh_pd((double*)dst,(const __m128d&)src);} static __forceinline void movlps(__m128i &dst,const void *src) {(__m128&)dst=_mm_loadl_pi((__m128&)dst,(const __m64*)src);} static __forceinline void movlps(void *dst,const __m128i &src) {_mm_storel_pi((__m64*)dst,(const __m128&)src);} static __forceinline void movhps(__m128i &dst,const void *src) {(__m128&)dst=_mm_loadh_pi((__m128&)dst,(const __m64*)src);} static __forceinline void movhps(void *dst,const __m128i &src) {_mm_storeh_pi((__m64*)dst,(const __m128&)src);}#endif #endif //__SSE2__//======================================= MMX ======================================#define MMX_INSTRUCTIONS \ static __forceinline __m setzero_si64(void) {return _mm_setzero_si64();} \ static __forceinline __m set_pi8(char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0) {return _mm_set_pi8(b7,b6,b5,b4,b3,b2,b1,b0);} \ static __forceinline __m set_pi32(int i1,int i0) {return _mm_set_pi32(i1,i0);} \ static __forceinline __m set1_pi8(char b) {return _mm_set1_pi8(b);} \ static __forceinline __m set1_pi16(short s) {return _mm_set1_pi16(s);} \ static __forceinline __m set1_pi64(int64_t s) {return *(__m64*)&s;} \ static __forceinline __m packs_pu16(const __m &m1,const __m &m2) {return _mm_packs_pu16(m1,m2);} \ static __forceinline __m slli_pi16(const __m &m,int count) {return _mm_slli_pi16(m,count);} \ static __forceinline __m srli_pi16(const __m &m,int count) {return _mm_srli_pi16(m,count);} \ static __forceinline __m srli_si64(const __m &m,int count) {return _mm_srli_si64(m,count);} \ static __forceinline __m srai_pi16(const __m &m,int count) {return _mm_srai_pi16(m,count);} \ static __forceinline __m madd_pi16(const __m &m1,const __m &m2) {return _mm_madd_pi16(m1,m2);} \ static __forceinline __m add_pi16(const __m &m1,const __m &m2) {return _mm_add_pi16(m1,m2);} \ static __forceinline __m adds_pi16(const __m &m1,const __m &m2) {return _mm_adds_pi16(m1,m2);} \ static __forceinline __m adds_pu16(const __m &m1,const __m &m2) {return _mm_adds_pu16(m1,m2);} \ static __forceinline __m adds_pu8(const __m &m1,const __m &m2) {return _mm_adds_pu8(m1,m2);} \ static __forceinline __m sub_pi16(const __m &m1,const __m &m2) {return _mm_sub_pi16(m1,m2);} \ static __forceinline __m subs_pi16(const __m &m1,const __m &m2) {return _mm_subs_pi16(m1,m2);} \ static __forceinline __m subs_pu16(const __m &m1,const __m &m2) {return _mm_subs_pu16(m1,m2);} \ static __forceinline __m subs_pu8(const __m &m1,const __m &m2) {return _mm_subs_pu8(m1,m2);} \ static __forceinline __m or_si64(const __m &m1,const __m &m2) {return _mm_or_si64(m1,m2);} \ static __forceinline __m xor_si64(const __m &m1,const __m &m2) {return _mm_xor_si64(m1,m2);} \ static __forceinline __m and_si64(const __m &m1,const __m &m2) {return _mm_and_si64(m1,m2);} \ static __forceinline __m andnot_si64(const __m &m1,const __m &m2) {return _mm_andnot_si64(m1,m2);} \ static __forceinline __m mullo_pi16(const __m &m1,const __m &m2) {return _mm_mullo_pi16(m1,m2);} \ static __forceinline __m mulhi_pi16(const __m &m1,const __m &m2) {return _mm_mulhi_pi16(m1,m2);} \ static __forceinline __m unpacklo_pi8(const __m &m1,const __m &m2) {return _mm_unpacklo_pi8(m1,m2);} \ static __forceinline __m unpackhi_pi8(const __m &m1,const __m &m2) {return _mm_unpackhi_pi8(m1,m2);} \ static __forceinline __m cmpgt_pi16(const __m &m1,const __m &m2) {return _mm_cmpgt_pi16(m1,m2);} \ static __forceinline __m cmpeq_pi16(const __m &m1,const __m &m2) {return _mm_cmpeq_pi16(m1,m2);} \ static __forceinline __m cmpeq_pi8(const __m &m1,const __m &m2) {return _mm_cmpeq_pi8(m1,m2);} \ static __forceinline __m shuffle_pi16_0(__m64 mm3) {return _mm_shuffle_pi16_0(mm3);} \ static __forceinline void store2(void *ptr,const __m &m) {*(int2*)ptr=_mm_cvtsi64_si32(m);} \ static __forceinline __m load2(const void *ptr) {return _mm_cvtsi32_si64(*(int2*)ptr);} \ static __forceinline void storeU(void *ptr,const __m &m) {*(__m*)ptr=m;} \ static __forceinline __m loadU(const void *ptr) {return *(__m*)ptr;} \ static __forceinline void empty(void) {_mm_empty();}struct Tmmx{ typedef __m64 __m; typedef int32_t int2; typedef int32_t integer2_t; static const size_t size=sizeof(__m); static const int align=0; typedef Tmmx T64; static __forceinline void pmaxub(__m64 &mmr1,const __m64 &mmr2) { mmr1=_mm_subs_pu8(mmr1,mmr2); mmr1=_mm_adds_pu8(mmr1,mmr2); } static __forceinline void pmaxub(__m64 &mmr1,const void *mmr2) { pmaxub(mmr1,*(__m64*)mmr2); } static __forceinline void pminub(__m64 &mmr1,const __m64 &mmr2) { __m64 mmrw; pcmpeqb (mmrw,mmrw ); psubusb (mmrw,mmr2 ); paddusb (mmr1, mmrw); psubusb (mmr1, mmrw); } static __forceinline void pminub(__m64 &mmr1,const void *mmr2) { pminub(mmr1,*(__m64*)mmr2); } static __forceinline void pmaxsw(__m64 &a,const __m64 &b) { psubusw(a,b); paddw(a,b); } static __forceinline void pminsw(__m64 &mm4,const __m64 &mm0) { __m64 mm2; movq (mm2,mm4); psubusw(mm2,mm0); psubw (mm4,mm2); } static __forceinline void pavgb(__m64 ®a,__m64 regb) { __m64 regr; static const __int64 regfe=0xfefefefefefefefeULL;//_mm_set1_pi8(/*0xfe*/-2); movq (regr,rega); por (regr,regb); pxor (regb,rega); pand (regb,regfe); psrlq (regb,1); psubb (regr,regb); rega=regr; } static __forceinline void pavgb(__m64 ®a,const void *regb) { pavgb(rega,*(__m64*)regb); } static __forceinline void v_pavgb(__m64 &mmr1,const __m64 &mmr2,__m64 &mmrw,const __int64 &smask) { movq( mmrw,mmr2 ); pand( mmrw, smask ); psrlw( mmrw,1 ); pand( mmr1,smask ); psrlw( mmr1,1 ); paddusb( mmr1,mmrw ); } static __forceinline void v_pavgb(__m64 &mmr1,const void *mmr2,__m64 &mmrw,const __int64 &smask) { v_pavgb(mmr1,*(__m64*)mmr2,mmrw,smask); } static __forceinline void sfence(void) { } static __forceinline void movntq(void *dst,const __m64 &src) { movq(dst,src); } static __forceinline void v_pminub(__m64 &mmr1,const __m64 &mmr2,__m64 &mmrw) { pcmpeqb (mmrw,mmrw ); psubusb (mmrw,mmr2 ); paddusb (mmr1, mmrw); psubusb (mmr1, mmrw); } static __forceinline void v_pminub(__m64 &mmr1,const __int64 &mmr2,__m64 &mmrw) { v_pminub(mmr1,*(const __m64*)&mmr2,mmrw); } static __forceinline void pmulhuw(__m64 &mm3,const __m64 &mm2) { __m64 mm5; movq ( mm5, mm2); psraw ( mm5, 15 ); pand ( mm5, mm3); pmulhw ( mm3, mm2); paddw ( mm3, mm5); } static __forceinline void prefetchnta(const void*) { } static __forceinline void prefetcht0(const void*) { } static __forceinline __m64 _mm_shuffle_pi16_0(__m64 mm3) { __m64 mm2; static const __int64 qwLowWord=0x000000000000FFFF; pand (mm3, qwLowWord); // mm3 = same limited to low word movq (mm2, mm3); // mm2 = same psllq (mm3, 16 ); // mm3 = moved to second word por (mm2, mm3); // mm2 = copied to first and second words movq (mm3, mm2); // mm3 = same psllq (mm3, 32 ); // mm3 = moved to third and fourth words por (mm2, mm3); // mm2 = low word copied to all four words return mm2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -