📄 vnl_sse.h

📁 DTMK软件开发包,此为开源软件,是一款很好的医学图像开发资源.
💻 H
📖 第 1 页 / 共 2 页
字号:
上一页 12
        if(c_left)
          accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+j),_mm_set_pd(*(r1+j),*(r2+j))), accum);   

        //store the 2 values of the result vector
        //use stream to avoid polluting the cache
        _mm_stream_pd(r+i,accum);
      } 

      // handle the left over rows
      if( r_left ) 
      {
        accum = _mm_setzero_pd();
        const double* p = m+(rows-1)*cols;
        for (unsigned int j=0; j<cols; ++j)
          accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(p+j),_mm_load_sd(v+j)),accum);
        _mm_store_sd(r+rows-1, accum);        
      }
    }
    
    static VNL_SSE_FORCE_INLINE double sum(const double* x, unsigned n) 
    {         
      double ret;
      // decision logic for odd sized vectors
      __m128d sum = n%2 ? _mm_load_sd(x+--n) : _mm_setzero_pd();              
      
      //sum two elements at a time, sum will contain two running totals 
      for(int i = n-2; i >= 0; i-=2)
        sum = _mm_add_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),sum);

      // sum will contain 2 accumulated values, need to add them together
      sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
      _mm_store_sd(&ret,sum);
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE double max(const double* x, unsigned n)
    {
      double ret;
      // decision logic for odd sized vectors
      __m128d max = n%2 ? _mm_load_sd(x+--n) : _mm_setzero_pd();

      //handle two elements at a time, max will contain two max values 
      for (int i=n-2; i>=0; i-=2)
        max = _mm_max_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), max);

      // need to store max's two values
      max = _mm_max_sd(max,_mm_unpackhi_pd(max,_mm_setzero_pd()));
      _mm_store_sd(&ret,max);
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE double min(const double* x, unsigned n)
    {
      double ret;
      __m128d min = _mm_set1_pd(DBL_MAX);

      // hand last element if odd sized vector
      if(n%2)
        min = _mm_min_sd(min,_mm_load_sd(x+--n));

      //handle two elements at a time, min will contain two min values 
      for (int i=n-2; i>=0; i-=2)
        min = _mm_min_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), min);

      // need to store min's two values
      min = _mm_min_sd(min,_mm_unpackhi_pd(min,_mm_setzero_pd()));
      _mm_store_sd(&ret,min);
      return ret;
    }
};

//! SSE2 implementation for single precision floating point (32 bit)
VCL_DEFINE_SPECIALIZATION
class vnl_sse<float> {
  public:
    
    static VNL_SSE_FORCE_INLINE void element_product(const float* x, const float* y, float* r, unsigned n) 
    { 
      switch(n % 4) 
      {
        // do scalar (single value) load, multiply and store for end elements 
        case 3: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));   
        case 2: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
        case 1: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
        case 0: ;
      }

      //load, multiply and store four floats at a time
      for(int i = n-4; i >= 0; i-=4)        
        VNL_SSE_HEAP_STORE(ps)(r+i,_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i)));   
    }
    
    static VNL_SSE_FORCE_INLINE float dot_product(const float* x, const float* y, unsigned n)
    { 
      float ret;
      __m128 sum = _mm_setzero_ps();  
      switch(n % 4) 
      {
        // handle elements at end of vectors with sizes not divisable by 4
        case 3: n--; sum = _mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n));
        case 2: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
        case 1: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
        case 0: ;
      }
          
      for(int i = n-4; i >= 0; i-=4)
        sum = _mm_add_ps(_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), VNL_SSE_HEAP_LOAD(ps)(y+i)),sum);

      // sum will contain 4 accumulated values, need to add them together    
      sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
      sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
      
      _mm_store_ss(&ret,sum);
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE float euclid_dist_sq(const float* x, const float* y, unsigned n) 
    {
      float ret;
      __m128 sum,a;
      sum = a = _mm_setzero_ps();
      switch(n % 4) 
      {
        // handle elements at end of vectors with sizes not divisable by 4
        case 3: --n; a = _mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n));    
        case 2: --n; a = _mm_shuffle_ps(_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)), a ,_MM_SHUFFLE(1,0,0,1));    
        case 1: --n; a = _mm_move_ss(a,_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
                sum = _mm_mul_ps(a,a);  
        case 0: ;
      }
      
      for ( int i = n-4; i >= 0; i-=4 )
      {
        a = _mm_sub_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i));    
        sum = _mm_add_ps(_mm_mul_ps(a,a),sum);
      }

      // sum will contain 4 accumulated values, need to add them together      
      sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
      sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));

      _mm_store_ss(&ret,sum);
      return ret;
    }

    static VNL_SSE_FORCE_INLINE void vector_x_matrix(const float* v, const float* m, float* r, unsigned rows, unsigned cols)
    {
      __m128 accum, x,y,z,w;

      //calculate if there are any left-over rows/columns
      unsigned r_left = rows%4;
      unsigned r_nice = rows - r_left;
      unsigned c_left = cols%4;
      unsigned c_nice = cols - c_left;
      
      //handle 4 matrix columns at a time
      for (unsigned j = 0; j < c_nice; j+=4)
      {
        //handle 4 matrix rows at a time
        accum = _mm_setzero_ps();
        unsigned i = 0;
        while ( i < r_nice )
        {          
          //load vector data so that
          // w = (v0,v1,v2,v3)
          w = VNL_SSE_HEAP_LOAD(ps)(v+i);

          // after shuffling
          // x = (v0, v0, v0, v0)
          // y = (v1, v1, v1, v1)
          // z = (v2, v2, v2, v2)
          // w = (v3, v3, v3, v3)
          x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
          y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
          z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
          w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));

          // multipy the four matrix columns
          // i.e.  x = ( v0 * m00, v0 * m01, v0 * m02, v0 * m03)
          //       y = ( v1 * m10, v1 * m11, v1 * m12, v1 * m13)
          //       z = ( v2 * m20, v2 * m21, v2 * m22, v2 * m23)
          //       w = ( v3 * m30, v3 * m31, v3 * m32, v3 * m33)
          x = _mm_mul_ps(x,_mm_loadu_ps(m+i++*cols+j));
          y = _mm_mul_ps(y,_mm_loadu_ps(m+i++*cols+j));
          z = _mm_mul_ps(z,_mm_loadu_ps(m+i++*cols+j));
          w = _mm_mul_ps(w,_mm_loadu_ps(m+i++*cols+j));

          //now sum the four columns
          accum = _mm_add_ps(x,accum);
          accum = _mm_add_ps(y,accum);
          accum = _mm_add_ps(z,accum);
          accum = _mm_add_ps(w,accum);

          //accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30, 
          //               v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31,
          //               v0 * m02 + v1 * m12 + v2 * m22 + v3 * m32,
          //               v0 * m03 + v1 * m13 + v2 * m23 + v3 * m33 )
        }

        // handle left-over rows
        switch(r_left)
        {
          case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
          case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
          case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); 
          case 0: ;
        }
        
        //store the 4 values of the result vector
        //use stream to avoid polluting the cache
        _mm_stream_ps(r+j,accum);
      }

      // handle the left over columns
      for(; c_left > 0; --c_left) {
        accum = _mm_setzero_ps();
        for (unsigned int i=0; i<rows; ++i)
          accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(m+i*cols+cols-c_left), _mm_load_ss(v+i)),accum);
        _mm_store_ss(r+cols-c_left,accum);
      }
    }
    
    static VNL_SSE_FORCE_INLINE void matrix_x_vector(const float* m, const float* v, float* r, unsigned rows, unsigned cols)
    {
      __m128 accum, x,y,z,w,mxy1,mxy2,mzw1,mzw2, mr1,mr2,mr3,mr4;

      //calculate if there are any left-over rows/columns
      unsigned r_left = rows%4;
      unsigned r_nice = rows - r_left;
      unsigned c_left = cols%4;
      unsigned c_nice = cols - c_left;

      //handle 4 matrix rows at a time
      for (unsigned i = 0; i < r_nice; i+=4)
      {
        //handle 4 matrix columns at a time
        accum = _mm_setzero_ps();
        const float *r1 = m+i*cols, *r2 = m+(i+1)*cols,
                    *r3 = m+(i+2)*cols, *r4 = m+(i+3)*cols;
        unsigned j = 0;
        for(; j < c_nice; j+=4)
        {
          // load  vector data so that
          //  w = (v0, v1, v2, v3)
          w = VNL_SSE_HEAP_LOAD(ps)(v+j);

          // after shuffling
          // x = (v0, v0, v0, v0)
          // y = (v1, v1, v1, v1)
          // z = (v2, v2, v2, v2)
          // w = (v3, v3, v3, v3)
          x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
          y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
          z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
          w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));

          // load form first two rows of the matrix
          //i.e. mr1 = (m00, m01, m02, m03)
          //     mr2 = (m10, m11, m12, m13)
          mr1 = _mm_loadu_ps(r1+j);
          mr2 = _mm_loadu_ps(r2+j);
          
          //unpack into xy and zw parts
          // i.e mxy1 = (m00, m10, m01, m11)
          //     mxy1 = (m02, m12, m03, m13)
          mxy1 = _mm_unpacklo_ps(mr1,mr2);
          mzw1 = _mm_unpackhi_ps(mr1,mr2);
          
          //similarly for the next two rows
          mr3 = _mm_loadu_ps(r3+j);
          mr4 = _mm_loadu_ps(r4+j);

          //unpack into xy and zw parts
          // i.e mxy2 = (m20, m30, m21, m31)
          //     mxy2 = (m22, m32, m23, m33)
          mxy2 = _mm_unpacklo_ps(mr3,mr4);
          mzw2 = _mm_unpackhi_ps(mr3,mr4);
                    
          // move around matrix data and multiply so that
          // x = (v0,v0,v0,v0) * (m00,m10,m20,m30)
          // y = (v1,v1,v1,v1) * (m01,m11,m21,m31)
          // z = (v2,v2,v2,v2) * (m02,m12,m22,m32)
          // w = (v3,v3,v3,v3) * (m03,m13,m23,m33)
          x = _mm_mul_ps(x,_mm_movelh_ps(mxy1,mxy2));
          y = _mm_mul_ps(y,_mm_movehl_ps(mxy1,mxy2));
          z = _mm_mul_ps(z,_mm_movelh_ps(mzw1,mzw2));
          w = _mm_mul_ps(w,_mm_movehl_ps(mzw1,mzw2));

          //now sum the four results
          accum = _mm_add_ps(x,accum);
          accum = _mm_add_ps(y,accum);
          accum = _mm_add_ps(z,accum);
          accum = _mm_add_ps(w,accum);
          
          //accum is now ( v0 * m00 + v1 * m01 + v2 * m02 + v3 * m03, 
          //               v0 * m10 + v1 * m11 + v2 * m12 + v3 * m13, 
          //               v0 * m20 + v1 * m21 + v2 * m22 + v3 * m23, 
          //               v0 * m30 + v1 * m31 + v2 * m32 + v3 * m33 )
        }
        
        // handle the left over columns
        switch(c_left)
        {
          case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r1+j),*(r2+j),*(r3+j),*(r4+j))), accum); j++;
          case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r1+j),*(r2+j),*(r3+j),*(r4+j))), accum); j++;
          case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r1+j),*(r2+j),*(r3+j),*(r4+j))), accum); 
          case 0: ;
        }
        //store the 4 values of the result vector
        //use stream to avoid polluting the cache
        _mm_stream_ps(r+i,accum);
      }  

      // handle the left over rows
      for(; r_left > 0; --r_left) {
        accum = _mm_setzero_ps();
        const float* p = m+(rows-r_left)*cols; 
        for (unsigned int j=0; j<cols; ++j)
          accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(p+j), _mm_load_ss(v+j)),accum);
        _mm_store_ss(r+rows-r_left,accum);
      }
    }
    
    static VNL_SSE_FORCE_INLINE float sum(const float* x, unsigned n) 
    {         
      float ret;
      __m128 sum = _mm_setzero_ps();              
      switch(n % 4) 
      { // handle vector sizes which aren't divisible by 4
        case 3: sum = _mm_load_ss(x+--n);   
        case 2: sum = _mm_shuffle_ps(_mm_load_ss(x+--n), sum ,_MM_SHUFFLE(1,0,0,1));    
        case 1: sum = _mm_move_ss(sum,_mm_load_ss(x+--n));
        case 0: ;
      }

      //sum four elements at a time, sum will contain four running totals 
      for(int i = n-4; i >= 0; i-=4)
        sum = _mm_add_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),sum);

      // sum will contain 4 accumulated values, need to add them together      
      sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
      sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
      _mm_store_ss(&ret,sum);
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE float max(const float* x, unsigned n)
    {
      float ret;
      __m128 max = _mm_setzero_ps();              
      switch(n % 4) 
      { // handle vector sizes which aren't divisible by 4
        case 3: max = _mm_load_ss(x+--n);   
        case 2: max = _mm_shuffle_ps(_mm_load_ss(x+--n), max ,_MM_SHUFFLE(1,0,0,1));    
        case 1: max = _mm_move_ss(max,_mm_load_ss(x+--n));
        case 0: ;
      }
      
      //handle four elements at a time, max will contain four max values 
      for(int i = n-4; i >= 0; i-=4)
        max = _mm_max_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), max);

      // need compare max's four values
      max = _mm_max_ps(max,_mm_movehl_ps(_mm_setzero_ps(),max));
      max = _mm_max_ss(max,_mm_shuffle_ps(max,max,_MM_SHUFFLE(3,2,1,1)));
      _mm_store_ss(&ret,max);   
      
      return ret;
    }
    
    static VNL_SSE_FORCE_INLINE float min(const float* x, unsigned n)
    {
      float ret;
      __m128 min = _mm_set1_ps(FLT_MAX);
      
      switch(n%4)
      { // handle vector sizes which aren't divisible by 4
        case 3: min = _mm_min_ss(min,_mm_load_ss(x+--n));   
        case 2: min = _mm_min_ss(min,_mm_load_ss(x+--n));    
        case 1: min = _mm_min_ss(min,_mm_load_ss(x+--n));
        case 0: ;
      }

      //handle four elements at a time, min will contain four min values 
      for(int i = n-4; i >= 0; i-=4)
        min = _mm_min_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), min);


      // need compare min's four values
      min = _mm_min_ps(min,_mm_movehl_ps(_mm_setzero_ps(),min));
      min = _mm_min_ss(min,_mm_shuffle_ps(min,min,_MM_SHUFFLE(3,2,1,1)));
      _mm_store_ss(&ret,min);   
      
      return ret;
    }
};

#endif

#endif //vnl_sse_h_
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -