#ifndef vnl_sse_h_
#define vnl_sse_h_
//! \file
// \author Kieran O'Mahony
// \date Sep 2007
// \brief Support for Streaming SIMD Extensions to speed up vector arithmetic
#include <vcl_compiler.h> //for macro decisions based on compiler type
#include <vxl_config.h> // for checking supported integer data types
#include <vcl_cfloat.h> // for DBL_MAX and FLT_MAX
#include <vnl/vnl_config.h> //is SSE enabled
#include <vnl/vnl_alloc.h> // for the vnl_alloc::allocate/deallocate fallback
//some caveats...
// - Due to the way vnl_matrix is represented in memory, 16-byte alignment cannot be
//   guaranteed, so the slower unaligned load intrinsics have to be used for matrices.
// - The GCC 3.4 intrinsics seem to be horrendously slow...
// - On Mac OS X, in order to support Universal Binaries, we do not consider it a hard
// error if VNL_CONFIG_ENABLE_SSE2 is true for PowerPC builds. PowerPC obviously does
// not support SSE2, so we simply redefine it to false.
#if VNL_CONFIG_ENABLE_SSE2
# if defined(__APPLE__) && (defined(__ppc__) || defined(__ppc64__))
# undef VNL_CONFIG_ENABLE_SSE2
# define VNL_CONFIG_ENABLE_SSE2 0
# elif !VXL_HAS_EMMINTRIN_H
# error "Required file emmintrin.h for SSE2 not found"
# else
# include <emmintrin.h> //sse 2 intrinsics
# endif
#endif
//Try to use compiler-specific attributes to force inlining where possible.
//The directive for aligning stack memory is also compiler dependent.
#if defined(VCL_GCC) && ! defined(VCL_GCC_3)
# define VNL_SSE_FORCE_INLINE __attribute__((always_inline)) inline
# define VNL_SSE_STACK_ALIGNED(x) __attribute__((aligned(x)))
#elif defined VCL_VC || defined VCL_ICC
# define VNL_SSE_FORCE_INLINE __forceinline
# define VNL_SSE_STACK_ALIGNED(x) __declspec(align(x))
#else
# define VNL_SSE_FORCE_INLINE inline
# define VNL_SSE_STACK_ALIGNED(x)
# define VNL_SSE_STACK_STORE(pf) _mm_storeu_##pf //no stack alignment so use unaligned store (slower!)
#endif
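// Illustrative only (the variable name below is hypothetical): a 16-byte aligned
// stack buffer could be declared as
//   VNL_SSE_STACK_ALIGNED(16) double tmp[2];
// which expands to __attribute__((aligned(16))) on GCC, __declspec(align(16))
// on MSVC/ICC, and to nothing on other compilers.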
// SSE operates faster with 16 byte aligned memory addresses.
// Check what memory alignment function is supported
#if VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MM_MALLOC
# define VNL_SSE_ALLOC(n,s,a) _mm_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _mm_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) _aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MINGW_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) __mingw_aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) __mingw_aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_POSIX_MEMALIGN
# include <vcl_cstdlib.h>
# define VNL_SSE_ALLOC(n,s,a) memalign(a,n*s)
# define VNL_SSE_FREE(v,n,s) free(v)
#else // SSE2 disabled or no aligned allocation support found; use the slower unaligned intrinsics
# define VNL_SSE_HEAP_STORE(pf) _mm_storeu_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_loadu_##pf
# if VNL_CONFIG_THREAD_SAFE
# define VNL_SSE_ALLOC(n,s,a) new char[n*s]
# define VNL_SSE_FREE(v,n,s) delete [] static_cast<char*>(v)
# else
# define VNL_SSE_ALLOC(n,s,a) vnl_alloc::allocate((n == 0) ? 8 : (n * s));
# define VNL_SSE_FREE(v,n,s) if (v) vnl_alloc::deallocate(v, (n == 0) ? 8 : (n * s));
# endif
#endif
// Stack memory can be aligned -> use SSE aligned store
#ifndef VNL_SSE_STACK_STORE
# define VNL_SSE_STACK_STORE(pf) _mm_store_##pf
#endif
// Heap memory can be aligned -> use SSE aligned load & store
#ifndef VNL_SSE_HEAP_STORE
# define VNL_SSE_HEAP_STORE(pf) _mm_store_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_load_##pf
#endif
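// For example, VNL_SSE_HEAP_LOAD(pd)(x) token-pastes to _mm_load_pd(x) when one of
// the aligned allocators above was selected, and to _mm_loadu_pd(x) otherwise;
// the "pd" suffix selects the packed-double intrinsics ("ps" would select packed-single).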
//! Custom memory allocation function to force 16 byte alignment of data
VNL_SSE_FORCE_INLINE void* vnl_sse_alloc(unsigned n, unsigned size)
{
return VNL_SSE_ALLOC(n,size,16);
}
//! Custom memory deallocation function to free 16-byte-aligned data
VNL_SSE_FORCE_INLINE void vnl_sse_dealloc(void* mem, unsigned n, unsigned size)
{
VNL_SSE_FREE(mem,n,size);
}
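// A minimal usage sketch (illustrative; the buffer name is hypothetical):
//   double* buf = static_cast<double*>(vnl_sse_alloc(16, sizeof(double)));
//   // ... buf points to storage for 16 doubles, 16-byte aligned whenever an
//   //     aligned allocator was detected above ...
//   vnl_sse_dealloc(buf, 16, sizeof(double));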
//avoid inlining when debugging
#ifndef NDEBUG
#undef VNL_SSE_FORCE_INLINE
#define VNL_SSE_FORCE_INLINE
#endif
//! Bog standard (no SSE) implementation for non-SSE-enabled hardware and any type
// which doesn't have a template specialisation.
template <class T>
class vnl_sse {
public:
static VNL_SSE_FORCE_INLINE void element_product(const T* x, const T* y, T* r, unsigned n)
{
for(unsigned i = 0; i < n; ++i)
r[i] = x[i] * y[i];
}
static VNL_SSE_FORCE_INLINE T dot_product(const T* x, const T* y, unsigned n)
{
T sum(0);
for(unsigned i = 0; i < n; ++i)
sum += x[i] * y[i];
return sum;
}
static VNL_SSE_FORCE_INLINE T euclid_dist_sq(const T* x, const T* y, unsigned n)
{
//IMS: Unable to optimise this any further for MSVC compiler
T sum(0);
#ifdef VCL_VC_6
for (unsigned i=0; i<n; ++i)
{
const T diff = x[i] - y[i];
sum += diff*diff;
}
#else
--x;
--y;
while (n!=0)
{
const T diff = x[n] - y[n];
sum += diff*diff;
--n;
}
#endif
return sum;
}
static VNL_SSE_FORCE_INLINE void vector_x_matrix(const T* v, const T* m, T* r, unsigned rows, unsigned cols)
{
for (unsigned int j=0; j<cols; ++j) {
T som(0);
for (unsigned int i=0; i<rows; ++i)
som += (m+i*cols)[j] * v[i];
r[j] = som;
}
}
static VNL_SSE_FORCE_INLINE void matrix_x_vector(const T* m, const T* v, T* r, unsigned rows, unsigned cols)
{
for (unsigned int i=0; i<rows; ++i) {
T som(0);
for (unsigned int j=0; j<cols; ++j)
som += (m+i*cols)[j] * v[j];
r[i] = som;
}
}
static VNL_SSE_FORCE_INLINE T sum(const T* v, unsigned n)
{
T tot(0);
for (unsigned i = 0; i < n; ++i)
tot += *v++;
return tot;
}
static VNL_SSE_FORCE_INLINE T max(const T* v, unsigned n)
{
T tmp = v[0];
for (unsigned i=1; i<n; ++i)
if (v[i] > tmp)
tmp = v[i];
return tmp;
}
static VNL_SSE_FORCE_INLINE T min(const T* v, unsigned n)
{
T tmp = v[0];
for (unsigned i=1; i<n; ++i)
if (v[i] < tmp)
tmp = v[i];
return tmp;
}
};
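// Illustrative use of the generic fallback (the array names are hypothetical):
//   float a[3] = { 1.f, 2.f, 3.f };
//   float b[3] = { 4.f, 5.f, 6.f };
//   float d = vnl_sse<float>::dot_product(a, b, 3); // 1*4 + 2*5 + 3*6 = 32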
#if VNL_CONFIG_ENABLE_SSE2
//! SSE2 implementation for double precision floating point (64 bit)
VCL_DEFINE_SPECIALIZATION
class vnl_sse<double> {
public:
static VNL_SSE_FORCE_INLINE void element_product(const double* x, const double* y, double* r, unsigned n)
{
switch(n % 4)
{
// do scalar (single value) load, multiply and store for end elements
case 3: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
case 2: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
case 1: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
case 0: ;
}
//load, multiply and store two doubles at a time
//loop unroll to handle 4
for(int i = n-4; i >= 0; i-=4)
{
VNL_SSE_HEAP_STORE(pd)(r+i,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i)));
VNL_SSE_HEAP_STORE(pd)(r+i+2,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i+2),VNL_SSE_HEAP_LOAD(pd)(y+i+2)));
}
}
static VNL_SSE_FORCE_INLINE double dot_product(const double* x, const double* y, unsigned n)
{
double ret;
__m128d sum;
if(n%2)
{
// handle single element at end of odd sized vectors
n--; sum = _mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
}
else
sum = _mm_setzero_pd();
for(int i = n-2; i >= 0; i-=2)
sum = _mm_add_pd(_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), VNL_SSE_HEAP_LOAD(pd)(y+i)),sum);
// sum will contain 2 accumulated values, need to add them together
sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
_mm_store_sd(&ret,sum);
return ret;
}
static VNL_SSE_FORCE_INLINE double euclid_dist_sq(const double* x, const double* y, unsigned n)
{
double ret;
__m128d sum,a;
if(n%2)
{
// handle single element at end of odd sized vectors
n--; a = _mm_sub_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
sum = _mm_mul_sd(a,a);
}
else
sum = _mm_setzero_pd();
for ( int i = n-2; i >= 0; i-=2 )
{
a = _mm_sub_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i));
sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
}
// sum will contain 2 accumulated values, need to add them together
sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
_mm_store_sd(&ret,sum);
return ret;
}
static VNL_SSE_FORCE_INLINE void vector_x_matrix(const double* v, const double* m, double* r, unsigned rows, unsigned cols)
{
__m128d accum, x,y,z,w;
//calculate if there are any left-over rows/columns
unsigned r_left = rows%4;
unsigned r_nice = rows - r_left;
unsigned c_left = cols%2;
unsigned c_nice = cols - c_left;
//handle 2 matrix columns at a time
for (unsigned j = 0; j < c_nice; j+=2)
{
//handle 4 matrix rows at a time
accum = _mm_setzero_pd();
unsigned i = 0;
while( i < r_nice )
{
//load vector data so that
// y = (v0,v1) , w = (v2,v3)
y = VNL_SSE_HEAP_LOAD(pd)(v+i);
w = VNL_SSE_HEAP_LOAD(pd)(v+i+2);
_mm_prefetch((const char*)(v+i+4), _MM_HINT_NTA);
// after shuffling
// x = (v0, v0)
// y = (v1, v1)
// z = (v2, v2)
// w = (v3, v3)
x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
z = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(0,0));
w = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(1,1));
// multiply the two matrix columns
// i.e. x = ( v0 * m00, v0 * m01)
// y = ( v1 * m10, v1 * m11)
// z = ( v2 * m20, v2 * m21)
// w = ( v3 * m30, v3 * m31)
x = _mm_mul_pd(x,_mm_loadu_pd(i++*cols+m+j));
y = _mm_mul_pd(y,_mm_loadu_pd(i++*cols+m+j));
z = _mm_mul_pd(z,_mm_loadu_pd(i++*cols+m+j));
w = _mm_mul_pd(w,_mm_loadu_pd(i++*cols+m+j));
//now sum both columns
accum = _mm_add_pd(x,accum);
accum = _mm_add_pd(y,accum);
accum = _mm_add_pd(z,accum);
accum = _mm_add_pd(w,accum);
//accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30,
// v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31 )
}
// handle left-over rows
switch(r_left)
{
case 3: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
case 2: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
case 1: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum);
case 0: ;
}
//store the 2 values of the result vector
//use stream to avoid polluting the cache
_mm_stream_pd(r+j,accum);
}
// handle the left over columns
if( c_left )
{
accum = _mm_setzero_pd();
for (unsigned int i=0; i<rows; ++i)
accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(m+i*cols+cols-1),_mm_load_sd(v+i)),accum);
_mm_store_sd(r+cols-1, accum);
}
}
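// Note: both vector_x_matrix above and matrix_x_vector below assume a row-major
// layout, i.e. element (i,j) of a rows x cols matrix lives at m[i*cols + j];
// for example a 2x3 matrix is stored as { m00, m01, m02, m10, m11, m12 }.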
static VNL_SSE_FORCE_INLINE void matrix_x_vector(const double* m, const double* v, double* r, unsigned rows, unsigned cols)
{
__m128d accum, x,y,mxy1,mxy2;
//calculate if there are any left-over rows/columns
unsigned r_left = rows%2;
unsigned r_nice = rows - r_left;
unsigned c_left = cols%2;
unsigned c_nice = cols - c_left;
//handle 2 matrix rows at a time
for (unsigned i = 0; i < r_nice; i+=2)
{
//handle 4 matrix columns at a time
accum = _mm_setzero_pd();
const double *r1 = m+i*cols, *r2 = m+(i+1)*cols;
unsigned j = 0;
for (; j < c_nice; j+=2)
{
// load vector data so that
// y = (v0, v1)
y = VNL_SSE_HEAP_LOAD(pd)(v+j);
//shuffle so that
// x = (v0,v0) y = (v1,v1)
x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
//load the matrix data so that
// mxy1 = (m00,m01), mxy2 = (m10,m11)
mxy1 = _mm_loadu_pd(r1+j);
mxy2 = _mm_loadu_pd(r2+j);
//unpack matrix data to achieve
// (v0,v0) * (m00,m10)
// (v1,v1) * (m01,m11)
x = _mm_mul_pd(x,_mm_unpacklo_pd(mxy1,mxy2));
y = _mm_mul_pd(y,_mm_unpackhi_pd(mxy1,mxy2));
//now sum the results
accum = _mm_add_pd(x,accum);
accum = _mm_add_pd(y,accum);
//accum is now ( v0 * m00 + v1 * m01,
//               v0 * m10 + v1 * m11 )
}
// handle the left over columns