📄 bmsse2.h
字号:
/* * =========================================================================== * PRODUCTION $Log: bmsse2.h,v $ * PRODUCTION Revision 1000.0 2004/04/21 16:00:52 gouriano * PRODUCTION PRODUCTION: IMPORTED [CATCHUP_003] Dev-tree R1.1 * PRODUCTION * =========================================================================== *//*Copyright (c) 2002,2003 Anatoliy Kuznetsov.Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*/#ifndef BMSSE2__H__INCLUDED__#define BMSSE2__H__INCLUDED__// Header implements processor specific intrinsics declarations for SSE2// instruction set#include<emmintrin.h>namespace bm{/** @defgroup SSE2 Processor specific optimizations for SSE2 instructions * *//*! @brief SSE2 reinitialization guard class SSE2 requires to call _mm_empty() if we are intermixing MMX integer commands with floating point arithmetics. This class guards critical code fragments where SSE2 integer is used. @ingroup SSE2*/class sse2_empty_guard{public: __forceinline sse2_empty_guard() { _mm_empty(); } __forceinline ~sse2_empty_guard() { _mm_empty(); }};/*# ifndef BM_SET_MMX_GUARD# define BM_SET_MMX_GUARD sse2_empty_guard bm_mmx_guard_;# endif*//*! @brief XOR array elements to specified mask *dst = *src ^ mask @ingroup SSE2*/__forceinline void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst, const __m128i* BMRESTRICT src, const __m128i* BMRESTRICT src_end, bm::word_t mask){ __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask); do { __m128i xmm1 = _mm_load_si128(src); xmm1 = _mm_xor_si128(xmm1, xmm2); _mm_store_si128(dst, xmm1); ++dst; ++src; } while (src < src_end);}/*! @brief Inverts array elements and NOT them to specified mask *dst = ~*src & mask @ingroup SSE2*/__forceinline void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst, const __m128i* BMRESTRICT src, const __m128i* BMRESTRICT src_end, bm::word_t mask){ __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask); do { //_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA); //_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA); __m128i xmm1 = _mm_load_si128(src); xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2 _mm_store_si128(dst, xmm1); ++dst; ++src; } while (src < src_end);}/*! @brief AND array elements against another array *dst &= *src @ingroup SSE2*/__forceinline void sse2_and_arr(__m128i* BMRESTRICT dst, const __m128i* BMRESTRICT src, const __m128i* BMRESTRICT src_end){ __m128i xmm1, xmm2; do { _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); } while (src < src_end);}/*! @brief OR array elements against another array *dst |= *src @ingroup SSE2*/__forceinline void sse2_or_arr(__m128i* BMRESTRICT dst, const __m128i* BMRESTRICT src, const __m128i* BMRESTRICT src_end){ __m128i xmm1, xmm2; do { _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_or_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_or_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_or_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_or_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); } while (src < src_end);}/*! @brief OR array elements against another array *dst |= *src @ingroup SSE2*/__forceinline void sse2_xor_arr(__m128i* BMRESTRICT dst, const __m128i* BMRESTRICT src, const __m128i* BMRESTRICT src_end){ __m128i xmm1, xmm2; do { _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_xor_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_xor_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_xor_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_xor_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); } while (src < src_end);}/*! @brief AND-NOT (SUB) array elements against another array *dst &= ~*src @ingroup SSE2*/__forceinline void sse2_sub_arr(__m128i* BMRESTRICT dst, const __m128i* BMRESTRICT src, const __m128i* BMRESTRICT src_end){ __m128i xmm1, xmm2; do { _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_andnot_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_andnot_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_andnot_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); xmm1 = _mm_load_si128(src++); xmm2 = _mm_load_si128(dst); xmm1 = _mm_andnot_si128(xmm1, xmm2); _mm_store_si128(dst++, xmm1); } while (src < src_end); }/*! @brief SSE2 block memset *dst = value @ingroup SSE2*/__forceinline void sse2_set_block(__m128i* BMRESTRICT dst, __m128i* BMRESTRICT dst_end, bm::word_t value){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -