bmsse2.h
    __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
    do
    {
        _mm_store_si128(dst, xmm0);
/*
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);
        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);
        dst += 8;
*/
    } while (++dst < dst_end);

    _mm_sfence();
}

/*!
    @brief SSE2 block copy
    *dst = *src
    @ingroup SSE2
*/
__forceinline
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src,
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8;
        dst += 8;

    } while (src < src_end);
}

/*!
    @brief Invert array elements
    *dst = ~*dst
    or
    *dst ^= *dst
    @ingroup SSE2
*/
__forceinline
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first;
    do
    {
        _mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1);
        _mm_store_si128(wrd_ptr, xmm0);
        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}

/*!
    SSE2 optimized bitcounting function implements parallel bitcounting
    algorithm for SSE2 instruction set.

<pre>
unsigned CalcBitCount32(unsigned b)
{
    b = (b & 0x55555555) + (b >> 1 & 0x55555555);
    b = (b & 0x33333333) + (b >> 2 & 0x33333333);
    b = (b + (b >> 4)) & 0x0F0F0F0F;
    b = b + (b >> 8);
    b = (b + (b >> 16)) & 0x0000003F;
    return b;
}
</pre>

    @ingroup SSE2
*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1);             // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);          // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);          // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);     // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);          // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);          // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);     // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);          // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);           // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);             //          & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);         // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);           // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);        // b >> 16
        b = _mm_add_epi32(b, tmp1);           // b + (b >> 16)
        b = _mm_and_si128(b, m4);             // (b >> 16) & 0x0000003F;

        mcnt = _mm_add_epi32(mcnt, b);        // mcnt += b

    } while (block < block_end);

    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}

__forceinline
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

__forceinline
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

__forceinline
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

__forceinline
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a);
}

template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1);             // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);          // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);          // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);     // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);          // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);          // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);     // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);          // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);           // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);             //          & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);         // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);           // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);        // b >> 16
        b = _mm_add_epi32(b, tmp1);           // b + (b >> 16)
        b = _mm_and_si128(b, m4);             // (b >> 16) & 0x0000003F;

        mcnt = _mm_add_epi32(mcnt, b);        // mcnt += b

    } while (block < block_end);

    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last);

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))

} // namespace

#endif
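Usage note: the counting routines above keep four 32-bit partial counts in one XMM register and reduce them only once at the end, so they expect 16-byte aligned blocks whose size is a multiple of four 32-bit words. The sketch below shows how sse2_bit_count and sse2_bit_count_op might be called directly. It is not part of bmsse2.h: the 2048-word block size, the test values, the example_bit_counts name, and the assumption that the header's enclosing namespace is bm (as the bm:: qualifiers in the code suggest) are all illustrative; in BitMagic these routines are normally reached through the VECT_* macros rather than called by hand.

// Hypothetical usage sketch (not part of bmsse2.h).
// Counts the bits set in one aligned block, and in the bitwise AND of two
// blocks, using sse2_bit_count and sse2_bit_count_op defined above.
#include "bmsse2.h"   // assumed to be includable on its own here

inline bm::id_t example_bit_counts()
{
    const unsigned block_size = 2048;   // assumed number of 32-bit words per block

    // static arrays are zero-initialized and 16-byte aligned (MSVC-style
    // __declspec(align(16)), matching the attribute already used in the header)
    static __declspec(align(16)) bm::word_t blk_a[block_size];
    static __declspec(align(16)) bm::word_t blk_b[block_size];

    blk_a[0] = 0xFF;   // 8 bits set
    blk_b[0] = 0x0F;   // 4 bits set, all of them also set in blk_a

    bm::id_t cnt_a = bm::sse2_bit_count((__m128i*) blk_a,
                                        (__m128i*) (blk_a + block_size));   // 8

    bm::id_t cnt_and = bm::sse2_bit_count_op((__m128i*) blk_a,
                                             (__m128i*) (blk_a + block_size),
                                             (__m128i*) blk_b,
                                             bm::sse2_and);                 // 4

    return cnt_a + cnt_and;                                                 // 12
}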