blocksearch_kc.cpp

来自「H.264完整的C语言代码和DCT的代码」· C++ 代码 · 共 345 行
CPP
345 行
#include "idb_kernelc.hpp"   
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"   

KERNELDEF(blocksearch, "mpeg_sc/blocksearch_kc.uc");

// UBYTE4_DIV4 divides a ubyte4 by four (right shift by two).
//
// The argument is viewed as a half2, shifted by `minus_2`, converted to int
// and AND-ed with `shift_mask` before being converted back to ubyte4.  Both
// `minus_2` and `shift_mask` are not macro parameters: they must be in scope
// wherever the macro is expanded.
#define UBYTE4_DIV4( packed ) \
  ubyte4( int( shift( half2( packed ), minus_2 ) ) & shift_mask )

// COMPUTE_SAD does the sad calculation for a complete macroblock against one
// vertical position in the current reference column.  Two different reference
// variables are passed in because the compare can straddle the rows in some
// cases.
//
// m      - macroblock expand
// moff   - offset into the macroblock expand (for picking one of the two macroblocks)
// r1     - first reference row
// r1_off - offset into the first reference row
// r2     - second reference row
// r2_off - offset into the second reference row
// valid  - non-zero if the row should be considered valid (allows mv_sad and
//            mv_xy to be updated). This is used at the top and bottom of the image.
// xoff   - Horizontal offset of the current search (used for motion vector).
// yoff   - Vertical offset of the current search (used for motion vector).
// mv_sad - Lowest current sad of the given macroblock.
// mv_xy  - Motion vector of the lowest sad.
//
// Word layout: the even macroblock words (moff+0,2,4,6) are compared against
// r1[r1_off+0,2,4,6] and the odd words (moff+1,3,5,7) against
// r2[r2_off+0,2,4,6].  All eight words are taken from `m`; earlier revisions
// read the odd words from the enclosing-scope `mb` regardless of what was
// passed as `m`.
//
// The macro expands to a sequence of statements (not an expression) and uses
// these names from the expansion site in addition to its parameters:
//   sad4, t1, t2, sad_total, sad_less   - scratch variables
//   tree_sum1, tree_sum2, tree_sum3     - permutation patterns for commucperm
//
// The best-match update only happens when the new total is lower than mv_sad
// AND `valid` is non-zero.  The motion vector is packed as
// (yoff & 0xffff) in the upper 16 bits and (xoff & 0xffff) in the lower 16.
//
// NOTES: I didn't put in anything special to limit the chance of saturation of
// the ubyte4's when summing the rows of the macroblock.

#define COMPUTE_SAD( m, moff, r1, r1_off, r2, r2_off, valid, xoff, yoff, mv_sad, mv_xy ) \
  sad4 = \
      ( ( abd( m[(moff)],   r1[(r1_off)] )   + abd( m[(moff)+1], r2[(r2_off)] ) ) + \
        ( abd( m[(moff)+2], r1[(r1_off)+2] ) + abd( m[(moff)+3], r2[(r2_off)+2] ) ) ); \
  t1 = shuffled( sad4, 0x88318820 ); \
  sad_total = half2(hi(t1)) + half2(lo(t1)); \
  sad4 = \
      ( ( abd( m[(moff)+4], r1[(r1_off)+4] ) + abd( m[(moff)+5], r2[(r2_off)+4] ) ) + \
        ( abd( m[(moff)+6], r1[(r1_off)+6] ) + abd( m[(moff)+7], r2[(r2_off)+6] ) ) ); \
  t1 = shuffled( sad4, 0x88318820 ); \
  sad_total = sad_total + half2(hi(t1)) + half2(lo(t1)); \
  t2 = shuffled( sad_total, 0x88883120 ); \
  sad_total = half2(hi(t2)) + half2(lo(t2)); \
  sad_total = sad_total + commucperm( tree_sum1, sad_total ); \
  sad_total = sad_total + commucperm( tree_sum2, sad_total ); \
  sad_total = sad_total + commucperm( tree_sum3, sad_total ); \
  \
  sad_less = itocc( ( int( sad_total ) < int( mv_sad ) ) & int(valid) ); \
  mv_sad = select( sad_less, sad_total, mv_sad ); \
  mv_xy  = select( sad_less, shift( int(yoff) & 0xffff, 16 ) | ( int(xoff) & 0xffff ), mv_xy );

// COMPUTE_ALL_MB runs COMPUTE_SAD twice against the same reference position:
// once for the macroblock stored at mb[0..7] and once for the one stored at
// mb[8..15].  The mb[0..7] compare is reported with a horizontal offset of
// -16+dx, the mb[8..15] compare with dx.  Results are folded into
// mv_sad[0]/mv_xy[0] and mv_sad[1]/mv_xy[1] respectively.
//
// ref_a     - first reference row
// ref_a_off - offset into the first reference row
// ref_b     - second reference row
// ref_b_off - offset into the second reference row
// ok        - non-zero if the position may update mv_sad / mv_xy (used at the
//               top and bottom of the image)
// dx        - horizontal offset of the current search (for the motion vector)
// dy        - vertical offset of the current search (for the motion vector)
//
// mb, mv_sad and mv_xy are taken from the expansion site, not passed in.

#define COMPUTE_ALL_MB( ref_a, ref_a_off, ref_b, ref_b_off, ok, dx, dy ) \
  COMPUTE_SAD( mb, 0,  ref_a, ref_a_off, ref_b, ref_b_off, ok, -16+dx, dy, mv_sad[0], mv_xy[0] ); \
  COMPUTE_SAD( mb, 8,  ref_a, ref_a_off, ref_b, ref_b_off, ok, dx, dy, mv_sad[1], mv_xy[1] );

//  COMPUTE_SAD( mb, 16, r1, r1_off, r2, r2_off, 16+xoff, yoff, mv_sad[2], mv_xy[2] );

// VERTICAL_SHIFT rotates the entire current column up (towards the top of the
// image).  This eliminates the topmost row.
//
// The six register groups form a chain, processed in this order for each of
// the four even/odd column pairs:
//
//   r0[even] <- r0[odd] <- r1[even] <- r1[odd] <- r2[even] <- r2[odd]
//
// For every link, the value held where node0 is true is first replaced (via
// select) by the value of the next register in the chain, and the result is
// then permuted with commucperm( rotate1, ... ).  The last link (r2[odd]) has
// nothing to pull from and is only permuted.
//
// Statement order matters: each register must be updated before the register
// it reads from is itself overwritten.
//
// NOTE(review): presumably rotate1 (0x07654321) makes every cluster take the
// value of its successor, so the substituted node0 value wraps into the last
// cluster -- confirm against the commucperm documentation.
//
// Uses r0, r1, r2, node0 and rotate1 from the expansion site.

#define VERTICAL_SHIFT \
  r0[0] = commucperm( rotate1, select( node0, r0[1], r0[0] ) ); \
  r0[2] = commucperm( rotate1, select( node0, r0[3], r0[2] ) ); \
  r0[4] = commucperm( rotate1, select( node0, r0[5], r0[4] ) ); \
  r0[6] = commucperm( rotate1, select( node0, r0[7], r0[6] ) ); \
  \
  r0[1] = commucperm( rotate1, select( node0, r1[0], r0[1] ) ); \
  r0[3] = commucperm( rotate1, select( node0, r1[2], r0[3] ) ); \
  r0[5] = commucperm( rotate1, select( node0, r1[4], r0[5] ) ); \
  r0[7] = commucperm( rotate1, select( node0, r1[6], r0[7] ) ); \
  \
  r1[0] = commucperm( rotate1, select( node0, r1[1], r1[0] ) ); \
  r1[2] = commucperm( rotate1, select( node0, r1[3], r1[2] ) ); \
  r1[4] = commucperm( rotate1, select( node0, r1[5], r1[4] ) ); \
  r1[6] = commucperm( rotate1, select( node0, r1[7], r1[6] ) ); \
  \
  r1[1] = commucperm( rotate1, select( node0, r2[0], r1[1] ) ); \
  r1[3] = commucperm( rotate1, select( node0, r2[2], r1[3] ) ); \
  r1[5] = commucperm( rotate1, select( node0, r2[4], r1[5] ) ); \
  r1[7] = commucperm( rotate1, select( node0, r2[6], r1[7] ) ); \
  \
  r2[0] = commucperm( rotate1, select( node0, r2[1], r2[0] ) ); \
  r2[2] = commucperm( rotate1, select( node0, r2[3], r2[2] ) ); \
  r2[4] = commucperm( rotate1, select( node0, r2[5], r2[4] ) ); \
  r2[6] = commucperm( rotate1, select( node0, r2[7], r2[6] ) ); \
  \
  r2[1] = commucperm( rotate1, r2[1] ); \
  r2[3] = commucperm( rotate1, r2[3] ); \
  r2[5] = commucperm( rotate1, r2[5] ); \
  r2[7] = commucperm( rotate1, r2[7] );

// COMPUTE_VERT (defined further below, after the PASS_MB_STATE and
// PASS_R_STATE helper macros it uses) does the 4 overlapping macroblock SADs
// that can be computed for a single rotation of the reference column.

// PASS_MB_STATE applies `x %= x` to every piece of per-macroblock state:
// all 16 words of mb, plus mv_sad[0..1] and mv_xy[0..1].
//
// NOTE(review): `%=` is a KernelC-specific construct whose semantics are not
// visible in this file; going by the macro name it presumably passes the
// value through unchanged as a hint to the kernel scheduler -- confirm
// against the KernelC documentation before editing.
//
// Uses mb, mv_sad and mv_xy from the expansion site.
#define PASS_MB_STATE \
  mb[0] %= mb[0]; \
  mb[1] %= mb[1]; \
  mb[2] %= mb[2]; \
  mb[3] %= mb[3]; \
  mb[4] %= mb[4]; \
  mb[5] %= mb[5]; \
  mb[6] %= mb[6]; \
  mb[7] %= mb[7]; \
  mb[8] %= mb[8]; \
  mb[9] %= mb[9]; \
  mb[10] %= mb[10]; \
  mb[11] %= mb[11]; \
  mb[12] %= mb[12]; \
  mb[13] %= mb[13]; \
  mb[14] %= mb[14]; \
  mb[15] %= mb[15];  \
  \
  mv_sad[0] %= mv_sad[0]; \
  mv_sad[1] %= mv_sad[1]; \
  \
  mv_xy[0] %= mv_xy[0]; \
  mv_xy[1] %= mv_xy[1];


// PASS_R_STATE(row) applies `x %= x` to the eight words of one stored
// reference row.  `row` is token-pasted onto `r`, so PASS_R_STATE(0) touches
// r0[0..7], PASS_R_STATE(1) touches r1[0..7], and so on.
//
// NOTE(review): the meaning of `%=` in KernelC is not visible in this file;
// presumably a pass-through/scheduling hint -- confirm before editing.
#define PASS_R_STATE(row) \
  r##row[0] %= r##row[0]; \
  r##row[1] %= r##row[1]; \
  r##row[2] %= r##row[2]; \
  r##row[3] %= r##row[3]; \
  r##row[4] %= r##row[4]; \
  r##row[5] %= r##row[5]; \
  r##row[6] %= r##row[6]; \
  r##row[7] %= r##row[7];


// COMPUTE_VERT performs the four overlapping macroblock compares available
// for one rotation of the reference column, then rotates the column up by
// one step with VERTICAL_SHIFT.
//
// xoff  - horizontal offset of the current search
// yoff  - vertical offset of the current search; the four compares are
//           reported at -16+yoff, -8+yoff, yoff and 8+yoff
// valid - validity flag used ONLY for the third compare (r1 against r1).
//           The first two compares always use row0_valid and the last one
//           row2_valid, both taken from the expansion site.
//
// Compare positions, in order:
//   1. r0[+0] / r0[+1]   (row0_valid)
//   2. r0[+1] / r1[+0]   (row0_valid)
//   3. r1[+0] / r1[+1]   (valid)
//   4. r1[+1] / r2[+0]   (row2_valid)
//
// The PASS_* invocations and barrier() are interleaved with the compares;
// keep the statement order as is.
#define COMPUTE_VERT( xoff, yoff, valid ) \
  PASS_MB_STATE \
  PASS_R_STATE(0) \
  COMPUTE_ALL_MB( r0, 0, r0, 1, row0_valid, xoff, -16+yoff ) \
  PASS_R_STATE(1) \
  COMPUTE_ALL_MB( r0, 1, r1, 0, row0_valid, xoff, -8+yoff  ) \
  barrier(); \
  PASS_MB_STATE \
  COMPUTE_ALL_MB( r1, 0, r1, 1, int( valid ), xoff, yoff   ) \
  PASS_R_STATE(2) \
  COMPUTE_ALL_MB( r1, 1, r2, 0, row2_valid, xoff, 8+yoff   ) \
  VERTICAL_SHIFT

// Blocksearch
//
// Motion-estimation kernel: for every macroblock read from `mblocks` it
// searches the three stored reference rows for the position with the lowest
// sum of absolute differences and writes the winning motion vector followed
// by its sad to `motions`.
//
// row0,row1,row2 - The rows of the reference image.
// mblocks - The row of macroblocks to be checked against the image.
// motions - The motion vectors of the mblocks (one per mblock).  For each
//           macroblock two values are written: the motion vector, then the
//           sad.  Only the node where cid() == 0 writes.
// location - bit 0 is set for top row, bit 1 is set for bottom row
//
// All input data is divided by four (UBYTE4_DIV4) as it is read.
//
// *The mblocks stream should be the same length as all of the row streams.
//

kernel blocksearch( istream<ubyte4> row0, istream<ubyte4> row1, istream<ubyte4> row2, 
                    istream<ubyte4> mblocks, costream<half2> motions, 
                    uc<int>& location )
{
  // stored rows
  expand<ubyte4> r0( 8 );
  expand<ubyte4> r1( 8 );
  expand<ubyte4> r2( 8 );
  // copy of r0[0], r0[2], r0[4], r0[6] taken before the vertical rotations
  array<ubyte4> r_save( 4 );

  // permutation pattern used by VERTICAL_SHIFT
  uc<int> rotate1   = 0x07654321;

  // permutation patterns used by the three reduction steps in COMPUTE_SAD
  uc<int> tree_sum1 = 0x67452301;
  uc<int> tree_sum2 = 0x44660022;
  uc<int> tree_sum3 = 0x00004444;

  half2 mv;

  // stored macroblocks
  // mb[0..7] holds the most recently read macroblock, mb[8..15] the previous one
  expand<ubyte4> mb( 16 );

  // best motion vectors
  // (sized 3, but only entries 0 and 1 are used in this kernel)
  expand<int> mv_xy( 3 );
  expand<half2> mv_sad( 3 );

  // mask and shift amount that make the right shift by 2 (divide by 4)
  // in UBYTE4_DIV4 work for ubyte4's
  int shift_mask = 0xff3fff3f;
  int minus_2    = 0 - 2;

  // prime the pipeline: read the first macroblock ...
  mblocks >> mb[0]; mb[0] = UBYTE4_DIV4( mb[0] );
  mblocks >> mb[1]; mb[1] = UBYTE4_DIV4( mb[1] );
  mblocks >> mb[2]; mb[2] = UBYTE4_DIV4( mb[2] );
  mblocks >> mb[3]; mb[3] = UBYTE4_DIV4( mb[3] );
  mblocks >> mb[4]; mb[4] = UBYTE4_DIV4( mb[4] );
  mblocks >> mb[5]; mb[5] = UBYTE4_DIV4( mb[5] );
  mblocks >> mb[6]; mb[6] = UBYTE4_DIV4( mb[6] );
  mblocks >> mb[7]; mb[7] = UBYTE4_DIV4( mb[7] );

  // ... and the first eight words of each reference row
  row0 >> r0[0]; r0[0] = UBYTE4_DIV4( r0[0] );
  row0 >> r0[1]; r0[1] = UBYTE4_DIV4( r0[1] );
  row0 >> r0[2]; r0[2] = UBYTE4_DIV4( r0[2] );
  row0 >> r0[3]; r0[3] = UBYTE4_DIV4( r0[3] );
  row0 >> r0[4]; r0[4] = UBYTE4_DIV4( r0[4] );
  row0 >> r0[5]; r0[5] = UBYTE4_DIV4( r0[5] );
  row0 >> r0[6]; r0[6] = UBYTE4_DIV4( r0[6] );
  row0 >> r0[7]; r0[7] = UBYTE4_DIV4( r0[7] );

  row1 >> r1[0]; r1[0] = UBYTE4_DIV4( r1[0] );
  row1 >> r1[1]; r1[1] = UBYTE4_DIV4( r1[1] );
  row1 >> r1[2]; r1[2] = UBYTE4_DIV4( r1[2] );
  row1 >> r1[3]; r1[3] = UBYTE4_DIV4( r1[3] );
  row1 >> r1[4]; r1[4] = UBYTE4_DIV4( r1[4] );
  row1 >> r1[5]; r1[5] = UBYTE4_DIV4( r1[5] );
  row1 >> r1[6]; r1[6] = UBYTE4_DIV4( r1[6] );
  row1 >> r1[7]; r1[7] = UBYTE4_DIV4( r1[7] );

  row2 >> r2[0]; r2[0] = UBYTE4_DIV4( r2[0] );
  row2 >> r2[1]; r2[1] = UBYTE4_DIV4( r2[1] );
  row2 >> r2[2]; r2[2] = UBYTE4_DIV4( r2[2] );
  row2 >> r2[3]; r2[3] = UBYTE4_DIV4( r2[3] );
  row2 >> r2[4]; r2[4] = UBYTE4_DIV4( r2[4] );
  row2 >> r2[5]; r2[5] = UBYTE4_DIV4( r2[5] );
  row2 >> r2[6]; r2[6] = UBYTE4_DIV4( r2[6] );
  row2 >> r2[7]; r2[7] = UBYTE4_DIV4( r2[7] );

  // true only where cid() == 0; used by VERTICAL_SHIFT and to gate the output
  cc node0 = itocc( cid( ) == 0 );

  // row0 is usable unless bit 0 (top row) of location is set,
  // row2 is usable unless bit 1 (bottom row) is set
  int row0_valid = ( ( commclperm( 8, 0, location ) & 0x1 ) == 0 );
  int row2_valid = ( ( commclperm( 8, 0, location ) & 0x2 ) == 0 );

  // if both the top and bottom rows are invalid we can only search
  // vertical offsets of zero - store this in master_vert
  // (8 vertical steps if either row is valid, otherwise 1)

  int master_vert;

  master_vert = select( itocc( row0_valid | row2_valid ), 8, 1 );

  // start with the largest sad so the first valid compare wins
  mv_sad[0] = 0x7fff;
  mv_sad[1] = 0x7fff;

  mv_xy[0]  = 0;
  mv_xy[1]  = 0;

  loop_stream( row0 ) {
   
    // rotate the stored macroblock and read the next one

    mb[8]  = mb[0]; mb[9]  = mb[1]; mb[10] = mb[2]; mb[11] = mb[3];
    mb[12] = mb[4]; mb[13] = mb[5]; mb[14] = mb[6]; mb[15] = mb[7];
    // the best match found so far travels with the macroblock
    mv_sad[1] = mv_sad[0]; mv_xy[1] = mv_xy[0];

    mblocks >> mb[0]; mb[0] = UBYTE4_DIV4( mb[0] );
    mblocks >> mb[1]; mb[1] = UBYTE4_DIV4( mb[1] );
    mblocks >> mb[2]; mb[2] = UBYTE4_DIV4( mb[2] );
    mblocks >> mb[3]; mb[3] = UBYTE4_DIV4( mb[3] );
    mblocks >> mb[4]; mb[4] = UBYTE4_DIV4( mb[4] );
    mblocks >> mb[5]; mb[5] = UBYTE4_DIV4( mb[5] );
    mblocks >> mb[6]; mb[6] = UBYTE4_DIV4( mb[6] );
    mblocks >> mb[7]; mb[7] = UBYTE4_DIV4( mb[7] );

    // the newly read macroblock has no best match yet
    mv_sad[0] = 0x7fff; mv_xy[0] = 0;

    // four horizontal steps per macroblock; horz_off advances by 4 each step
    uc<int> horz_cnt = 4;
    int horz_off = 0;

    loop_count( horz_cnt ) { 
      uc<int> vert_cnt;
      int vert_off = 0;
      int dummy;

      // NOTE(review): presumably this form of commclperm copies master_vert
      // into the uc variable vert_cnt (the returned value is unused) -- confirm.
      dummy = commclperm( 0, master_vert, 0, vert_cnt );

      // save the "top" row data which gets
      // destroyed by the vertical rotations

      r_save[0] = r0[0]; r_save[1] = r0[2];
      r_save[2] = r0[4]; r_save[3] = r0[6];

      // Do all the vertical compares for the current
      // reference column.

      // All ones on the first vertical step only, so that the centre compare
      // at vertical offset zero is always allowed, even when row2 is invalid.
      int no_vert_offset = 0xffffffff;
      loop_count( vert_cnt ) {
        // scratch variables required by COMPUTE_SAD
        ubyte4 sad4;
        double<ubyte4> t1;
        double<half2>  t2;

        half2 sad_total;
        cc sad_less;

        COMPUTE_VERT( horz_off, vert_off, row2_valid | no_vert_offset );

        vert_off = vert_off + 1;
        no_vert_offset = 0;
      }

      // These copies do a couple of things:
      //
      //  - Restores the rows after vertical shifting (using
      //      the r_save values).
      //  - Does a horizontal shift of the image to make
      //      room for the next row.
      //
      // NOTE(review): the restore takes row data from the neighbouring
      // registers of the shift chain, which appears to rely on the vertical
      // loop having run its full 8 steps -- confirm the behaviour when
      // master_vert is 1.

      r2[1] = r2[2]; r2[3] = r2[4]; r2[5] = r2[6]; 
      r2[0] = r1[3]; r2[2] = r1[5]; r2[4] = r1[7]; 
      r1[1] = r1[2]; r1[3] = r1[4]; r1[5] = r1[6]; 
      r1[0] = r0[3]; r1[2] = r0[5]; r1[4] = r0[7]; 
      r0[1] = r0[2]; r0[3] = r0[4]; r0[5] = r0[6]; 
      r0[0] = r_save[1]; r0[2] = r_save[2]; r0[4] = r_save[3];

      horz_off = horz_off + 4;

      // read the next byte4 column for the reference
      row0 >> r0[6]; r0[6] = UBYTE4_DIV4( r0[6] );
      row0 >> r0[7]; r0[7] = UBYTE4_DIV4( r0[7] );
      row1 >> r1[6]; r1[6] = UBYTE4_DIV4( r1[6] );
      row1 >> r1[7]; r1[7] = UBYTE4_DIV4( r1[7] );
      row2 >> r2[6]; r2[6] = UBYTE4_DIV4( r2[6] );
      row2 >> r2[7]; r2[7] = UBYTE4_DIV4( r2[7] );
    }

    // output the motion vector of the current macroblock
    // (entry 1, i.e. the older of the two stored macroblocks)

    // Shift to make MV's 1/2 pixel values
    mv = shift(half2(mv_xy[1]), 1);
    motions(node0) << mv;
    motions(node0) << mv_sad[1]; 
  }

  // there is a single block leftover ... write it out.
  // (entry 0, the macroblock read during the last loop iteration)

  // Shift to make MV's 1/2 pixel values
  mv = shift(half2(mv_xy[0]), 1);
  motions(node0) << mv;
  motions(node0) << mv_sad[0];
  flush( motions, 0 );
}
blocksearch_kc.cpp - 源码说明

本页面展示了「H.264完整的C语言代码和DCT的代码」中的 blocksearch_kc.cpp 源码文件，采用 C++ 编程语言编写，共 345 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与264相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?