vnl_sse.h

From the DTMK software development kit. This is open-source software and a very good resource for medical image development.

(Page 1 of 2)
#ifndef vnl_sse_h_
#define vnl_sse_h_

//! \file 
//  \author Kieran O'Mahony
//  \date Sep 2007
//  \brief Support for Streaming SIMD Extensions to speed up vector arithmetic

#include <vcl_compiler.h> //for macro decisions based on compiler type
#include <vxl_config.h> // for checking supported integer data types
#include <vcl_cfloat.h> // for DBL_MAX and FLT_MAX

#include <vnl/vnl_config.h> //is SSE enabled
#include <vnl/vnl_alloc.h> //fallback allocator when aligned allocation is unavailable

//some caveats...
// - Due to the way vnl_matrix is represented in memory, 16-byte alignment cannot be
//   guaranteed, so the slower unaligned load intrinsics have to be used for matrices.
// - The GCC 3.4 intrinsics seem to be horrendously slow...
// - On Mac OS X, in order to support Universal Binaries, we do not consider it a hard
//   error if VNL_CONFIG_ENABLE_SSE2 is true for PowerPC builds. PowerPC obviously does
//   not support SSE2, so we simply redefine it to false.

#if VNL_CONFIG_ENABLE_SSE2
# if defined(__APPLE__) && (defined(__ppc__) || defined(__ppc64__))
#   undef VNL_CONFIG_ENABLE_SSE2
#   define VNL_CONFIG_ENABLE_SSE2 0
# elif !VXL_HAS_EMMINTRIN_H
#   error "Required file emmintrin.h for SSE2 not found"
# else
#   include <emmintrin.h> //sse 2 intrinsics
# endif
#endif


//Try to use compiler directives to force inlining where possible.
//The directive for aligning stack memory is also compiler dependent.
#if defined(VCL_GCC) && ! defined(VCL_GCC_3)
# define VNL_SSE_FORCE_INLINE __attribute__((always_inline)) inline
# define VNL_SSE_STACK_ALIGNED(x)  __attribute__((aligned(x)))
#elif defined VCL_VC || defined VCL_ICC
# define VNL_SSE_FORCE_INLINE __forceinline
# define VNL_SSE_STACK_ALIGNED(x)  __declspec(align(x))
#else
# define VNL_SSE_FORCE_INLINE inline
# define VNL_SSE_STACK_ALIGNED(x) 
# define VNL_SSE_STACK_STORE(pf) _mm_storeu_##pf //no stack alignment so use unaligned store (slower!) 
#endif


// SSE operates faster with 16 byte aligned memory addresses.
// Check what memory alignment function is supported 
#if VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MM_MALLOC
# define VNL_SSE_ALLOC(n,s,a) _mm_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _mm_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) _aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MINGW_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) __mingw_aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) __mingw_aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_POSIX_MEMALIGN
# include <vcl_cstdlib.h>
# define VNL_SSE_ALLOC(n,s,a) memalign(a,n*s)
# define VNL_SSE_FREE(v,n,s) free(v)
#else //SSE2 disabled or no aligned allocation support; fall back to the slower unaligned intrinsics
# define VNL_SSE_HEAP_STORE(pf) _mm_storeu_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_loadu_##pf
# if VNL_CONFIG_THREAD_SAFE
#   define VNL_SSE_ALLOC(n,s,a) new char[n*s]
#   define VNL_SSE_FREE(v,n,s) delete [] static_cast<char*>(v)
# else
#   define VNL_SSE_ALLOC(n,s,a) vnl_alloc::allocate((n == 0) ? 8 : (n * s))
#   define VNL_SSE_FREE(v,n,s) if (v) vnl_alloc::deallocate(v, (n == 0) ? 8 : (n * s))
# endif
#endif


// Stack memory can be aligned -> use SSE aligned store
#ifndef VNL_SSE_STACK_STORE
# define VNL_SSE_STACK_STORE(pf) _mm_store_##pf 
#endif

// Heap memory can be aligned -> use SSE aligned load & store
#ifndef VNL_SSE_HEAP_STORE
# define VNL_SSE_HEAP_STORE(pf) _mm_store_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_load_##pf
#endif
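
// A minimal sketch of how the macros above combine (hypothetical helper, not part of
// vnl; assumes VNL_CONFIG_ENABLE_SSE2): a heap load that is aligned or unaligned
// depending on the branch taken above, followed by a spill to a 16-byte aligned
// stack buffer.
#if VNL_CONFIG_ENABLE_SSE2
VNL_SSE_FORCE_INLINE double vnl_sse_example_sum2(const double* p)
{
  __m128d v = VNL_SSE_HEAP_LOAD(pd)(p);  // expands to _mm_load_pd or _mm_loadu_pd
  VNL_SSE_STACK_ALIGNED(16) double tmp[2];
  VNL_SSE_STACK_STORE(pd)(tmp, v);       // _mm_store_pd, or _mm_storeu_pd without stack alignment support
  return tmp[0] + tmp[1];                // p[0] + p[1]
}
#endif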

//! Custom memory allocation function to force 16 byte alignment of data 
VNL_SSE_FORCE_INLINE void* vnl_sse_alloc(unsigned n, unsigned size)
{
  return VNL_SSE_ALLOC(n,size,16);
}

//! Custom memory deallocation function to free 16 byte aligned data
VNL_SSE_FORCE_INLINE void vnl_sse_dealloc(void* mem, unsigned n, unsigned size)
{
  VNL_SSE_FREE(mem,n,size); 
}
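
// A minimal usage sketch (hypothetical helper, not part of vnl): pair the two wrappers
// above to obtain and release a heap buffer (16-byte aligned whenever one of the
// aligned allocators above was selected).
inline void vnl_sse_example_aligned_buffer(unsigned n)
{
  double* p = static_cast<double*>(vnl_sse_alloc(n, sizeof(double)));
  for (unsigned i = 0; i < n; ++i)
    p[i] = 0.0;
  vnl_sse_dealloc(p, n, sizeof(double)); // must be released through the matching wrapper
}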

//avoid inlining when debugging 
#ifndef NDEBUG
#undef VNL_SSE_FORCE_INLINE
#define VNL_SSE_FORCE_INLINE 
#endif
  
//! Bog standard (no SSE) implementation for non-SSE-enabled hardware and any type
//  which doesn't have a template specialisation.
template <class T>
class vnl_sse {
  public:

    static VNL_SSE_FORCE_INLINE void element_product(const T* x, const T* y, T* r, unsigned n)
    {
      for(unsigned i = 0; i < n; ++i)
        r[i] = x[i] * y[i];
    }

    static VNL_SSE_FORCE_INLINE T dot_product(const T* x, const T* y, unsigned n)
    {
      T sum(0);
      for(unsigned i = 0; i < n; ++i)
        sum += x[i] * y[i];
      return sum;
    }

    static VNL_SSE_FORCE_INLINE T euclid_dist_sq(const T* x, const T* y, unsigned n) 
    {
      //IMS: Unable to optimise this any further for MSVC compiler
      T sum(0);
    #ifdef VCL_VC_6
      for (unsigned i=0; i<n; ++i)
      {
        const T diff = x[i] - y[i];
        sum += diff*diff;
      }
    #else
      --x;
      --y;
      while (n!=0)
      {
        const T diff = x[n] - y[n];
        sum += diff*diff;
        --n;
      }
    #endif
      return sum;
    }
    
    static VNL_SSE_FORCE_INLINE void vector_x_matrix(const T* v, const T* m, T* r, unsigned rows, unsigned cols)
    {
      for (unsigned int j=0; j<cols; ++j) {
        T som(0);
        for (unsigned int i=0; i<rows; ++i)
          som += (m+i*cols)[j] * v[i];
        r[j] = som;
      }
    }
    
    static VNL_SSE_FORCE_INLINE void matrix_x_vector(const T* m, const T* v, T* r, unsigned rows, unsigned cols)
    {
      for (unsigned int i=0; i<rows; ++i) {
        T som(0);
        for (unsigned int j=0; j<cols; ++j)
          som += (m+i*cols)[j] * v[j];
        r[i] = som;
      }
    }
    
    static VNL_SSE_FORCE_INLINE T sum(const T* v, unsigned n)
    {
      T tot(0);
      for (unsigned i = 0; i < n; ++i)
        tot += *v++;
      return tot;
    }
    
    static VNL_SSE_FORCE_INLINE T max(const T* v, unsigned n)
    {
      T tmp = v[0];
      for (unsigned i=1; i<n; ++i)
        if (v[i] > tmp)
          tmp = v[i];
      return tmp;
    }
    
    static VNL_SSE_FORCE_INLINE T min(const T* v, unsigned n)
    {
      T tmp = v[0];
      for (unsigned i=1; i<n; ++i)
        if (v[i] < tmp)
          tmp = v[i];
      return tmp;
    }
    
};
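
// A minimal usage sketch (hypothetical helper and values, not part of vnl): element
// types without a specialisation below simply use the generic fallback directly.
inline bool vnl_sse_example_generic()
{
  int a[3] = {1, 2, 3}, b[3] = {4, 5, 6};
  int dot  = vnl_sse<int>::dot_product(a, b, 3);     // 1*4 + 2*5 + 3*6 = 32
  int dist = vnl_sse<int>::euclid_dist_sq(a, b, 3);  // (-3)^2 * 3      = 27
  return dot == 32 && dist == 27;
}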

#if VNL_CONFIG_ENABLE_SSE2

//! SSE2 implementation for double precision floating point (64 bit)
VCL_DEFINE_SPECIALIZATION
class vnl_sse<double> {
  public:
    
    static VNL_SSE_FORCE_INLINE void element_product(const double* x, const double* y, double* r, unsigned n) 
    {      
      switch(n % 4) 
      {
        // do scalar (single value) load, multiply and store for end elements 
        case 3: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));   
        case 2: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
        case 1: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
        case 0: ;
      }      
      
      //load, multiply and store two doubles at a time
      //loop unroll to handle 4
      for(int i = n-4; i >= 0; i-=4)
      {        
        VNL_SSE_HEAP_STORE(pd)(r+i,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i)));
        VNL_SSE_HEAP_STORE(pd)(r+i+2,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i+2),VNL_SSE_HEAP_LOAD(pd)(y+i+2)));
      }
    }
    
    static VNL_SSE_FORCE_INLINE double dot_product(const double* x, const double* y, unsigned n)
    {
      double ret;
      __m128d sum;   
      if(n%2) 
      { 
        // handle single element at end of odd sized vectors
        n--; sum = _mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
      }
      else
        sum = _mm_setzero_pd();  
      
      for(int i = n-2; i >= 0; i-=2)
        sum = _mm_add_pd(_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), VNL_SSE_HEAP_LOAD(pd)(y+i)),sum);

      // sum will contain 2 accumulated values, need to add them together
      sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
      _mm_store_sd(&ret,sum);
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE double euclid_dist_sq(const double* x, const double* y, unsigned n) 
    {
      double ret;
      __m128d sum,a;
      
      if(n%2) 
      { 
        // handle single element at end of odd sized vectors
          n--; a = _mm_sub_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
          sum = _mm_mul_sd(a,a);
      }
      else
        sum = _mm_setzero_pd();        
      
      for ( int i = n-2; i >= 0; i-=2 )
      {        
        a = _mm_sub_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i));        
        sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
      }

      // sum will contain 2 accumulated values, need to add them together
      sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
      _mm_store_sd(&ret,sum);
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE void vector_x_matrix(const double* v, const double* m, double* r, unsigned rows, unsigned cols)
    {
      __m128d accum, x,y,z,w;

      //calculate if there are any left-over rows/columns
      unsigned r_left = rows%4; 
      unsigned r_nice = rows - r_left;
      unsigned c_left = cols%2;
      unsigned c_nice = cols - c_left;

      //handle 2 matrix columns at a time
      for (unsigned j = 0; j < c_nice; j+=2)
      {

        //handle 4 matrix rows at a time
        accum = _mm_setzero_pd();
        unsigned i = 0;
        while( i < r_nice )
        {
          //load vector data so that
          // y = (v0,v1) , w = (v2,v3)
          y = VNL_SSE_HEAP_LOAD(pd)(v+i);
          w = VNL_SSE_HEAP_LOAD(pd)(v+i+2);

          _mm_prefetch((const char*)(v+i+4), _MM_HINT_NTA);
          
          // after shuffling
          // x = (v0, v0)
          // y = (v1, v1)
          // z = (v2, v2)
          // w = (v3, v3)
          x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
          y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
          z = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(0,0));
          w = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(1,1));
          
          // multiply the two matrix columns
          // i.e.  x = ( v0 * m00, v0 * m01)
          //       y = ( v1 * m10, v1 * m11)
          //       z = ( v2 * m20, v2 * m21)
          //       w = ( v3 * m30, v3 * m31)
          x = _mm_mul_pd(x,_mm_loadu_pd(i++*cols+m+j));
          y = _mm_mul_pd(y,_mm_loadu_pd(i++*cols+m+j));
          z = _mm_mul_pd(z,_mm_loadu_pd(i++*cols+m+j));
          w = _mm_mul_pd(w,_mm_loadu_pd(i++*cols+m+j));
          
          //now sum both columns
          accum = _mm_add_pd(x,accum);
          accum = _mm_add_pd(y,accum);
          accum = _mm_add_pd(z,accum);
          accum = _mm_add_pd(w,accum);
          
          //accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30, 
          //               v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31 )
        }

        // handle left-over rows
        switch(r_left)
        {
          case 3: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
          case 2: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
          case 1: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); 
          case 0: ;
        }
        
        //store the 2 values of the result vector
        //use stream to avoid polluting the cache
        _mm_stream_pd(r+j,accum);
      }

      // handle the left over columns
      if( c_left ) 
      {
        accum = _mm_setzero_pd();
        for (unsigned int i=0; i<rows; ++i)
          accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(m+i*cols+cols-1),_mm_load_sd(v+i)),accum);
        _mm_store_sd(r+cols-1, accum);
      }
    }
    
    static VNL_SSE_FORCE_INLINE void matrix_x_vector(const double* m, const double* v, double* r, unsigned rows, unsigned cols)
    {
      __m128d accum, x,y,mxy1,mxy2;

      //calculate if there are any left-over rows/columns
      unsigned r_left = rows%2;
      unsigned r_nice = rows - r_left;
      unsigned c_left = cols%2;
      unsigned c_nice = cols - c_left;
      
      //handle 2 matrix rows at a time
      for (unsigned i = 0; i < r_nice; i+=2)
      {

        //handle 2 matrix columns at a time
        accum = _mm_setzero_pd();
        const double *r1 = m+i*cols, *r2 = m+(i+1)*cols;
        unsigned j = 0;
        for (; j < c_nice; j+=2)
        {
          // load  vector data so that
          //  y = (v0, v1)
          y = VNL_SSE_HEAP_LOAD(pd)(v+j);
          
          //shuffle so that
          //  x = (v0,v0) y = (v1,v1)
          x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
          y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));

          //load the matrix data so that
          // mxy1 = (m00,m01), mxy2 = (m10,m11)
          mxy1 = _mm_loadu_pd(r1+j);
          mxy2 = _mm_loadu_pd(r2+j);
                    
          //unpack matrix data to achieve
          //  (v0,v0) * (m00,m10)  
          //  (v1,v1) * (m01,m11)
          x = _mm_mul_pd(x,_mm_unpacklo_pd(mxy1,mxy2));
          y = _mm_mul_pd(y,_mm_unpackhi_pd(mxy1,mxy2));

          //now sum the results
          accum = _mm_add_pd(x,accum);
          accum = _mm_add_pd(y,accum);

          //accum is now ( v0 * m00 + v1 * m01,
          //               v0 * m10 + v1 * m11 )
        }
        // handle the left over columns
