// me_fns_kc.cpp
#include "idb_kernelc.hpp"
#include "me_kc.hpp"
#include "idb_kernelc2.hpp"
// Shuffle control words used by extract_ref_MB.  One is selected from the
// low two bits of the horizontal position new_x (x + range_x): SHUF0 when
// (new_x & 3) == 0, SHUF1 for 1, SHUF2 for 2 and SHUF3 for 3.
// NOTE(review): the byte-level meaning of each constant is defined by the
// shuffle operation, which is not visible in this file - confirm against
// its documentation before changing any value.
#define SHUF0 0x38281808
#define SHUF1 0x28180883
#define SHUF2 0x18088382
#define SHUF3 0x08838281
// Tests whether the candidate offset (test_x, test_y) lies inside the
// search window; all four bounds are inclusive.
//
// The bounds are offsets relative to the macroblock currently being
// processed, not absolute positions within the reference frame, so the
// top and left bounds may be negative.  For example, with a 16x16 search
// range and a macroblock somewhere in the middle of the top row, the
// bounds would be (0, 16, -16, 16).
//
// Returns -1 (all bits set) when the point is in range.  The result is the
// bitwise complement of the OR of the four violation tests, so the 0
// result for an out-of-range point relies on a true comparison evaluating
// to an all-ones mask.
inline int check_xy(int top_range, int bottom_range,
                    int left_range, int right_range,
                    int test_x, int test_y)
{
  // One violation flag per edge of the window.
  int above = test_y < top_range;
  int below = test_y > bottom_range;
  int left_of = test_x < left_range;
  int right_of = test_x > right_range;

  // In range only when no flag is raised:
  // ~a & ~b & ~c & ~d  ==  ~(a | b | c | d)
  int violated = (above | below) | (left_of | right_of);
  int in_range = ~violated;
  return(in_range);
}
// Save a macroblock to an array in the scratchpad.
//
// save_arr  - destination array; elements 0..7 are overwritten.
// mb0..mb7  - the eight words to store.  These names are not declared in
//             this file, so presumably MB_PARAM_VAL(ubyte4, mb) expands to
//             them as by-value ubyte4 parameters (macro defined elsewhere -
//             confirm).
inline void save_MB(array<ubyte4>& save_arr,
MB_PARAM_VAL(ubyte4, mb))
{
// Word i of the macroblock is written to slot i.
save_arr[0] = mb0;
save_arr[1] = mb1;
save_arr[2] = mb2;
save_arr[3] = mb3;
save_arr[4] = mb4;
save_arr[5] = mb5;
save_arr[6] = mb6;
save_arr[7] = mb7;
}
// Retrieve a saved macroblock from the scratchpad.
//
// save_arr  - source array; elements 0..7 are read (the layout written by
//             save_MB: slot i holds word i).
// mb0..mb7  - outputs receiving the eight words.  These names are not
//             declared in this file, so presumably MB_PARAM_REF(ubyte4, mb)
//             expands to them as ubyte4 reference parameters (macro defined
//             elsewhere - confirm).
inline void load_MB(array<ubyte4>& save_arr,
MB_PARAM_REF(ubyte4, mb))
{
// Slot i of the array is read back into word i.
mb0 = save_arr[0];
mb1 = save_arr[1];
mb2 = save_arr[2];
mb3 = save_arr[3];
mb4 = save_arr[4];
mb5 = save_arr[5];
mb6 = save_arr[6];
mb7 = save_arr[7];
}
// Fetches the macroblock located at (x, y) in the search region, reading
// four whole words per row group with no byte shuffling.  The low two bits
// of the horizontal position are not used here, so the position is
// presumably expected to be a multiple of 4 - confirm with callers.
//
// search_region    - array holding the search region words.
// start_idx        - added to the word column derived from x before the
//                    column is wrapped.
// x, y             - position of the wanted macroblock; range_x / range_y
//                    are added to them before indexing.
// range_x, range_y - search range; range_x also determines the number of
//                    words per search-region row.
// sr_row           - out: word offset of the selected row group.
// sr_col           - out: wrapped word column of the first fetched word.
// rot_perm         - out: cluster permutation applied to the fetched words.
// ref_mb0..ref_mb7 - out: the fetched macroblock words (names presumably
//                    declared by MB_PARAM_REF, defined elsewhere).
inline void extract_ref_MB4(array<ubyte4>& search_region,
                            int start_idx,
                            int x, int y,
                            int range_x, int range_y,
                            int& sr_row, int& sr_col, int& rot_perm,
                            MB_PARAM_REF(ubyte4, ref_mb))
{
  // Position inside the search region.
  int pos_x = x + range_x;
  int pos_y = y + range_y;

  // Words per search-region row; also the distance between row groups.
  // NOTE(review): shift() with a negative count is presumably a right
  // shift - confirm.
  int stride = shift(range_x, -1) + 4;

  // Four consecutive word columns starting at the one derived from pos_x.
  int raw0 = shift(pos_x, -2) + start_idx;
  int raw1 = raw0 + 1;
  int raw2 = raw0 + 2;
  int raw3 = raw0 + 3;

  // Wrap each column back by stride once it reaches stride.
  int col0 = select(itocc(raw0 >= stride), raw0 - stride, raw0);
  int col1 = select(itocc(raw1 >= stride), raw1 - stride, raw1);
  int col2 = select(itocc(raw2 >= stride), raw2 - stride, raw2);
  int col3 = select(itocc(raw3 >= stride), raw3 - stride, raw3);
  sr_col = col0;

  // Row group: clusters whose id is below (pos_y & 7) use the next group.
  int row_grp = shift(pos_y, -3);
  cc use_next = itocc(cid() < (pos_y & 0x7));
  row_grp = select(use_next, row_grp + 1, row_grp);
  sr_row = lo(row_grp * stride);

  // Rotation applied to every fetched word so that the first row of the
  // search region ends up in cluster 0.
  rot_perm = (pos_y + cid()) & 0x7;

  int idx0 = sr_row + col0;
  int idx1 = sr_row + col1;
  int idx2 = sr_row + col2;
  int idx3 = sr_row + col3;

  // Rows 0-7 in the clusters.
  ref_mb0 = commclperm(rot_perm, search_region[idx0]);
  ref_mb1 = commclperm(rot_perm, search_region[idx1]);
  ref_mb2 = commclperm(rot_perm, search_region[idx2]);
  ref_mb3 = commclperm(rot_perm, search_region[idx3]);

  // Rows 8-15 in the clusters: same columns, one row group further on.
  ref_mb4 = commclperm(rot_perm, search_region[idx0 + stride]);
  ref_mb5 = commclperm(rot_perm, search_region[idx1 + stride]);
  ref_mb6 = commclperm(rot_perm, search_region[idx2 + stride]);
  ref_mb7 = commclperm(rot_perm, search_region[idx3 + stride]);
}
// Get the MB located at (x,y) in the search region.
//
// This variant reads five consecutive words per row group and recombines
// them through a shuffle, so the horizontal position does not have to be a
// multiple of 4.
//
// search_region    - array holding the search region words.
// start_idx        - added to the word column derived from x before the
//                    column is wrapped.
// x, y             - position of the wanted macroblock; range_x / range_y
//                    are added to them before indexing.
// range_x, range_y - search range; range_x also determines num_cols.
// ref_mb0..ref_mb7 - outputs receiving the macroblock words (names
//                    presumably declared by MB_PARAM_REF, defined elsewhere).
inline void extract_ref_MB(array<ubyte4>& search_region,
int start_idx,
int x, int y,
int range_x, int range_y,
MB_PARAM_REF(ubyte4, ref_mb))
{
// Calculate starting index into search_region
int num_cols, sr_row, sr_row_idx;
int new_x = x + range_x;
int new_y = y + range_y;
// Words per search-region row, also used below as the row-group stride.
// NOTE(review): shift() with a negative count is presumably a right shift.
num_cols = shift(range_x, -1) + 4;
// Five consecutive word columns starting at the one derived from new_x.
int sr_col0 = shift(new_x, -2) + start_idx;
int sr_col1 = sr_col0 + 1;
int sr_col2 = sr_col1 + 1;
int sr_col3 = sr_col2 + 1;
int sr_col4 = sr_col3 + 1;
// Wrap each column back by num_cols once it reaches num_cols.
sr_col0 = select(itocc(sr_col0 >= num_cols), sr_col0 - num_cols, sr_col0);
sr_col1 = select(itocc(sr_col1 >= num_cols), sr_col1 - num_cols, sr_col1);
sr_col2 = select(itocc(sr_col2 >= num_cols), sr_col2 - num_cols, sr_col2);
sr_col3 = select(itocc(sr_col3 >= num_cols), sr_col3 - num_cols, sr_col3);
sr_col4 = select(itocc(sr_col4 >= num_cols), sr_col4 - num_cols, sr_col4);
// Clusters whose id is below (new_y & 7) read from the next row group.
cc row_inc = itocc(cid() < (new_y & 0x7));
sr_row = shift(new_y, -3);
sr_row = select(row_inc, sr_row + 1, sr_row);
sr_row_idx = lo(sr_row * num_cols);
// Indices 0-4: the five columns of the selected row group.
int sr_idx0 = sr_row_idx + sr_col0;
int sr_idx1 = sr_row_idx + sr_col1;
int sr_idx2 = sr_row_idx + sr_col2;
int sr_idx3 = sr_row_idx + sr_col3;
int sr_idx4 = sr_row_idx + sr_col4;
// Indices 5-9: the same five columns, num_cols words further on.
int sr_idx5 = sr_idx0 + num_cols;
int sr_idx6 = sr_idx1 + num_cols;
int sr_idx7 = sr_idx2 + num_cols;
int sr_idx8 = sr_idx3 + num_cols;
int sr_idx9 = sr_idx4 + num_cols;
// Calculate shuffle permutation necessary to extract the correct 16
// bytes out of a total of 5 32-bit words (to account for horizontal
// indices that are not a multiple of 4).
// The control word is picked from the low two bits of new_x:
// 0 -> SHUF0, 1 -> SHUF1, 2 -> SHUF2, 3 -> SHUF3.
cc x_nlsb = itocc((new_x & 2) != 0);
cc x_lsb = itocc((new_x & 1) != 0);
byte4 ctrl = select(x_nlsb,
select(x_lsb, SHUF3, SHUF2),
select(x_lsb, SHUF1, SHUF0));
ubyte4 word0_hi, word1_lo, word1_hi;
ubyte4 word2_lo, word2_hi, word3_lo, word3_hi;
ubyte4 dummy;
// Get rows 0-7 in the clusters.
// Each shuffled input word yields two parts: one completes the previous
// output word (wordN_lo) and the other starts the next (wordN_hi).  The
// unused part of the first and last input word goes to dummy.
// NOTE(review): presumably hi_lo() receives the two halves of the shuffle
// result - confirm.
hi_lo(word0_hi, dummy) = shuffled(search_region[sr_idx0], ctrl);
hi_lo(word1_hi, word0_lo) = shuffled(search_region[sr_idx1], ctrl);
hi_lo(word2_hi, word1_lo) = shuffled(search_region[sr_idx2], ctrl);
hi_lo(word3_hi, word2_lo) = shuffled(search_region[sr_idx3], ctrl);
hi_lo(dummy, word3_lo) = shuffled(search_region[sr_idx4], ctrl);
// Merge the two parts of each output word.
ref_mb0 = word0_hi | word0_lo;
ref_mb1 = word1_hi | word1_lo;
ref_mb2 = word2_hi | word2_lo;
ref_mb3 = word3_hi | word3_lo;
// Get rows 8-15 in the clusters
// Same split-and-merge as above, reusing the temporaries.
hi_lo(word0_hi, dummy) = shuffled(search_region[sr_idx5], ctrl);
hi_lo(word1_hi, word0_lo) = shuffled(search_region[sr_idx6], ctrl);
hi_lo(word2_hi, word1_lo) = shuffled(search_region[sr_idx7], ctrl);
hi_lo(word3_hi, word2_lo) = shuffled(search_region[sr_idx8], ctrl);
hi_lo(dummy, word3_lo) = shuffled(search_region[sr_idx9], ctrl);
ref_mb4 = word0_hi | word0_lo;
ref_mb5 = word1_hi | word1_lo;
ref_mb6 = word2_hi | word2_lo;
ref_mb7 = word3_hi | word3_lo;
// rotate values so that the first row of search region is in
// cluster 0
int rot_perm = (new_y + cid()) & 0x7;
ref_mb0 = commclperm(rot_perm, ref_mb0);
ref_mb1 = commclperm(rot_perm, ref_mb1);
ref_mb2 = commclperm(rot_perm, ref_mb2);
ref_mb3 = commclperm(rot_perm, ref_mb3);
ref_mb4 = commclperm(rot_perm, ref_mb4);
ref_mb5 = commclperm(rot_perm, ref_mb5);
ref_mb6 = commclperm(rot_perm, ref_mb6);
ref_mb7 = commclperm(rot_perm, ref_mb7);
}
// Moves the reference macroblock 4 pixels (one word column) to the right
// within the search region.  The macroblock is assumed to start out
// aligned to a 4 pixel horizontal boundary.
//
// search_region    - array holding the search region words.
// sr_row           - word offset of the current row group.
// sr_col           - in/out: word column of the macroblock's first word;
//                    advanced by one, wrapping to 0 at the row width.
// range_x          - search range; determines the number of words per row.
// rot_perm         - cluster permutation applied to the newly fetched words.
// ref_mb0..ref_mb7 - in/out: macroblock words (names presumably declared by
//                    MB_PARAM_REF, defined elsewhere).
inline void shift_ref_MB4(array<ubyte4>& search_region,
                          int sr_row, int& sr_col,
                          int range_x,
                          int rot_perm,
                          MB_PARAM_REF(ubyte4, ref_mb))
{
  // Words per search-region row; also the distance between row groups.
  int stride = shift(range_x, -1) + 4;

  // Column of the word entering on the right, wrapped back by stride once
  // it reaches stride.
  int incoming_col = sr_col + 4;
  incoming_col = select(itocc(incoming_col >= stride),
                        incoming_col - stride,
                        incoming_col);
  int incoming_idx = sr_row + incoming_col;

  // First row group: slide the words left by one and fetch the new last word.
  ref_mb0 = ref_mb1;
  ref_mb1 = ref_mb2;
  ref_mb2 = ref_mb3;
  ref_mb3 = commclperm(rot_perm, search_region[incoming_idx]);

  // Second row group: same, reading one row group (stride words) further on.
  ref_mb4 = ref_mb5;
  ref_mb5 = ref_mb6;
  ref_mb6 = ref_mb7;
  ref_mb7 = commclperm(rot_perm, search_region[incoming_idx + stride]);

  // Advance the first column by one word, wrapping to 0 at the row width.
  int advanced = sr_col + 1;
  sr_col = select(itocc(advanced == stride), 0, advanced);
}
// Compare the SAD of the MB at the location (test_x, test_y) in the
// search region with the previous best, and save (test_x, test_y) if
// its SAD is better.
//
// ref_mb0..ref_mb7 - candidate reference macroblock words.
// mb0..mb7         - current macroblock words (both sets of names are
//                    presumably declared by MB_PARAM_VAL, defined elsewhere).
// test_x, test_y   - location being tested.
// in_range         - mask ANDed with the comparison result; a zero value
//                    prevents the candidate from being accepted.
// mv_x, mv_y       - in/out: location of the best match so far.
// mv_sad           - in/out: SAD of the best match so far.
//
// The candidate replaces the previous best only when its SAD is strictly
// smaller, so ties keep the earlier result.
inline void compare_MB(MB_PARAM_VAL(ubyte4, ref_mb),
MB_PARAM_VAL(ubyte4, mb),
int test_x, int test_y, int in_range,
int& mv_x, int& mv_y, uint& mv_sad)
{
// Take absolute differences of 8-bit values
ubyte4 diff0 = abd(ref_mb0, mb0);
ubyte4 diff1 = abd(ref_mb1, mb1);
ubyte4 diff2 = abd(ref_mb2, mb2);
ubyte4 diff3 = abd(ref_mb3, mb3);
ubyte4 diff4 = abd(ref_mb4, mb4);
ubyte4 diff5 = abd(ref_mb5, mb5);
ubyte4 diff6 = abd(ref_mb6, mb6);
ubyte4 diff7 = abd(ref_mb7, mb7);
// Convert to 16-bit values and start summing in each cluster, and
// convert to 32-bit value at the end
// sadA0/sadA1 are temporaries reused for each pair of difference words;
// sadB0/sadB1 hold the sum of two words each, sadC0/sadC1 of four each.
// NOTE(review): the shuffle constant 0x88318820 presumably widens the four
// bytes into two pairs of 16-bit lanes - confirm against the shuffle
// documentation.
double<uhalf2> sadA0, sadA1;
uhalf2 sadB0, sadB1, sadC0, sadC1;
uint sad;
// Words 0-3.
sadA0 = shuffled(uhalf2(diff0), 0x88318820);
sadA1 = shuffled(uhalf2(diff1), 0x88318820);
sadB0 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadA0 = shuffled(uhalf2(diff2), 0x88318820);
sadA1 = shuffled(uhalf2(diff3), 0x88318820);
sadB1 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadC0 = sadB0 + sadB1;
// Words 4-7.
sadA0 = shuffled(uhalf2(diff4), 0x88318820);
sadA1 = shuffled(uhalf2(diff5), 0x88318820);
sadB0 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadA0 = shuffled(uhalf2(diff6), 0x88318820);
sadA1 = shuffled(uhalf2(diff7), 0x88318820);
sadB1 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadC1 = sadB0 + sadB1;
// Combine both halves and reduce to a single 32-bit per-cluster sum.
double<uhalf2> final_sad = shuffled(sadC0 + sadC1, 0x88883120);
sad = uint(hi(final_sad) + lo(final_sad));
// accumulate across clusters
// Three exchange-and-add steps.
// NOTE(review): each hex digit of the permutation constants presumably
// names the source cluster for one destination cluster; if so, the steps
// exchange neighbours, then pairs, then halves, leaving the total in every
// cluster - confirm against the commucperm documentation.
uc<int> tree_sum1 = 0x67452301;
uc<int> tree_sum2 = 0x44660022;
uc<int> tree_sum3 = 0x00004444;
sad = sad + commucperm(tree_sum1, sad);
sad = sad + commucperm(tree_sum2, sad);
sad = sad + commucperm(tree_sum3, sad);
// Save current loc as best match if SAD is less than previous best
// The comparison result is ANDed with in_range, so an out-of-range
// candidate (in_range == 0) never replaces the current best.
cc better = itocc((sad < mv_sad) & in_range);
mv_x = select(better, test_x, mv_x);
mv_y = select(better, test_y, mv_y);
mv_sad = select(better, sad, mv_sad);
}
// (Keyboard-shortcut help text from the web code viewer was here; it is not part of the source file.)