📄 me_fast_jitter2_kc.i
字号:
outside = test_y > bottom_range;
in_range = in_range & ~outside;
outside = test_x < left_range;
in_range = in_range & ~outside;
outside = test_x > right_range;
in_range = in_range & ~outside;
return(in_range);
}
// Stash the eight words of a macroblock (mb0..mb7) into entries 0..7 of
// save_arr, in argument order, so they can later be restored with load_MB.
inline void save_MB(array<ubyte4>& save_arr,
                    ubyte4 mb0, ubyte4 mb1, ubyte4 mb2, ubyte4 mb3,
                    ubyte4 mb4, ubyte4 mb5, ubyte4 mb6, ubyte4 mb7)
{
    // First group of four words -> entries 0..3.
    save_arr[0] = mb0;    save_arr[1] = mb1;
    save_arr[2] = mb2;    save_arr[3] = mb3;

    // Second group of four words -> entries 4..7.
    save_arr[4] = mb4;    save_arr[5] = mb5;
    save_arr[6] = mb6;    save_arr[7] = mb7;
}
// Restore the eight words of a macroblock from entries 0..7 of save_arr into
// the reference parameters mb0..mb7 (the inverse of save_MB).
inline void load_MB(array<ubyte4>& save_arr,
                    ubyte4& mb0, ubyte4& mb1, ubyte4& mb2, ubyte4& mb3,
                    ubyte4& mb4, ubyte4& mb5, ubyte4& mb6, ubyte4& mb7)
{
    // Entries 0..3 -> first group of four words.
    mb0 = save_arr[0];    mb1 = save_arr[1];
    mb2 = save_arr[2];    mb3 = save_arr[3];

    // Entries 4..7 -> second group of four words.
    mb4 = save_arr[4];    mb5 = save_arr[5];
    mb6 = save_arr[6];    mb7 = save_arr[7];
}
// Fetch a reference macroblock (two rows of four words) from search_region
// for the candidate position (x + range_x, y + range_y), without any
// sub-word byte realignment.
//
// NOTE(review): because no byte shuffle is applied here (unlike
// extract_ref_MB), this presumably expects x + range_x to be a multiple of 4
// -- confirm against callers.
//
// Parameters:
//   search_region  array the words are read from; treated as rows of
//                  num_cols words, with columns wrapping modulo num_cols.
//   start_idx      column offset added to the computed column index.
//   x, y           candidate displacement.
//   range_x/y      bias added to x / y before indexing.
//   sr_row         (out) word offset of the selected row: row number * num_cols.
//   sr_col         (out) wrapped column index of the first word fetched.
//   rot_perm       (out) (new_y + cid()) & 7, the value passed to commclperm.
//   ref_mb0..3     (out) four consecutive columns of the selected row.
//   ref_mb4..7     (out) the same columns one row (num_cols words) further on.
//
// Assumptions to confirm: shift(v, -n) is a right shift by n; select(c, a, b)
// yields a where c holds and b otherwise; cid() is this cluster's index;
// commclperm(p, v) fetches v from another cluster chosen by p.
inline void extract_ref_MB4(array<ubyte4>& search_region,
int start_idx,
int x, int y,
int range_x, int range_y,
int& sr_row, int& sr_col, int& rot_perm,
ubyte4& ref_mb0, ubyte4& ref_mb1, ubyte4& ref_mb2, ubyte4& ref_mb3, ubyte4& ref_mb4, ubyte4& ref_mb5, ubyte4& ref_mb6, ubyte4& ref_mb7)
{
int num_cols, sr_row_num;
// Bias the displacement by the search range before indexing.
int new_x = x + range_x;
int new_y = y + range_y;
// Number of word columns per stored row.
num_cols = shift(range_x, -1) + 4;
// Four consecutive word columns starting at new_x / 4 (plus start_idx).
int sr_col0 = shift(new_x, -2) + start_idx;
int sr_col1 = sr_col0 + 1;
int sr_col2 = sr_col1 + 1;
int sr_col3 = sr_col2 + 1;
// Wrap each column back into [0, num_cols) with a single subtraction.
sr_col0 = select(itocc(sr_col0 >= num_cols), sr_col0 - num_cols, sr_col0);
sr_col1 = select(itocc(sr_col1 >= num_cols), sr_col1 - num_cols, sr_col1);
sr_col2 = select(itocc(sr_col2 >= num_cols), sr_col2 - num_cols, sr_col2);
sr_col3 = select(itocc(sr_col3 >= num_cols), sr_col3 - num_cols, sr_col3);
sr_col = sr_col0;
// Clusters whose id is below (new_y & 7) read from the next row number.
cc row_inc = itocc(cid() < (new_y & 0x7));
sr_row_num = shift(new_y, -3);
sr_row_num = select(row_inc, sr_row_num + 1, sr_row_num);
// Convert the row number into a word offset.
sr_row = lo(sr_row_num * num_cols);
int sr_idx0 = sr_row + sr_col0;
int sr_idx1 = sr_row + sr_col1;
int sr_idx2 = sr_row + sr_col2;
int sr_idx3 = sr_row + sr_col3;
// Second set of four words: same columns, one stored row further on.
int sr_idx4 = sr_idx0 + num_cols;
int sr_idx5 = sr_idx1 + num_cols;
int sr_idx6 = sr_idx2 + num_cols;
int sr_idx7 = sr_idx3 + num_cols;
ref_mb0 = search_region[sr_idx0];
ref_mb1 = search_region[sr_idx1];
ref_mb2 = search_region[sr_idx2];
ref_mb3 = search_region[sr_idx3];
ref_mb4 = search_region[sr_idx4];
ref_mb5 = search_region[sr_idx5];
ref_mb6 = search_region[sr_idx6];
ref_mb7 = search_region[sr_idx7];
// Permute the fetched words across clusters using (new_y + cid()) mod 8.
// NOTE(review): presumably this rotates rows so each cluster ends up with
// the row matching its own macroblock row -- confirm.
rot_perm = (new_y + cid()) & 0x7;
ref_mb0 = commclperm(rot_perm, ref_mb0);
ref_mb1 = commclperm(rot_perm, ref_mb1);
ref_mb2 = commclperm(rot_perm, ref_mb2);
ref_mb3 = commclperm(rot_perm, ref_mb3);
ref_mb4 = commclperm(rot_perm, ref_mb4);
ref_mb5 = commclperm(rot_perm, ref_mb5);
ref_mb6 = commclperm(rot_perm, ref_mb6);
ref_mb7 = commclperm(rot_perm, ref_mb7);
}
// Fetch a reference macroblock (two rows of four words) from search_region
// for the candidate position (x + range_x, y + range_y), at an arbitrary
// horizontal offset.
//
// Five consecutive word columns are read per row. Each word is passed through
// shuffled() with a control word chosen from the low two bits of new_x, and
// the resulting pieces of adjacent words are OR-ed together to build the four
// output words of that row.
//
// Parameters:
//   search_region  array the words are read from; treated as rows of
//                  num_cols words, with columns wrapping modulo num_cols.
//   start_idx      column offset added to the computed column index.
//   x, y           candidate displacement.
//   range_x/y      bias added to x / y before indexing.
//   ref_mb0..3     (out) the four assembled words of the selected row.
//   ref_mb4..7     (out) the same for the row num_cols words further on.
//
// Assumptions to confirm: shift(v, -n) is a right shift by n; select(c, a, b)
// yields a where c holds and b otherwise; cid() is this cluster's index;
// commclperm(p, v) fetches v from another cluster chosen by p.
inline void extract_ref_MB(array<ubyte4>& search_region,
int start_idx,
int x, int y,
int range_x, int range_y,
ubyte4& ref_mb0, ubyte4& ref_mb1, ubyte4& ref_mb2, ubyte4& ref_mb3, ubyte4& ref_mb4, ubyte4& ref_mb5, ubyte4& ref_mb6, ubyte4& ref_mb7)
{
int num_cols, sr_row, sr_row_idx;
// Bias the displacement by the search range before indexing.
int new_x = x + range_x;
int new_y = y + range_y;
// Number of word columns per stored row.
num_cols = shift(range_x, -1) + 4;
// Five consecutive word columns: four output words can straddle five inputs.
int sr_col0 = shift(new_x, -2) + start_idx;
int sr_col1 = sr_col0 + 1;
int sr_col2 = sr_col1 + 1;
int sr_col3 = sr_col2 + 1;
int sr_col4 = sr_col3 + 1;
// Wrap each column back into [0, num_cols) with a single subtraction.
sr_col0 = select(itocc(sr_col0 >= num_cols), sr_col0 - num_cols, sr_col0);
sr_col1 = select(itocc(sr_col1 >= num_cols), sr_col1 - num_cols, sr_col1);
sr_col2 = select(itocc(sr_col2 >= num_cols), sr_col2 - num_cols, sr_col2);
sr_col3 = select(itocc(sr_col3 >= num_cols), sr_col3 - num_cols, sr_col3);
sr_col4 = select(itocc(sr_col4 >= num_cols), sr_col4 - num_cols, sr_col4);
// Clusters whose id is below (new_y & 7) read from the next row number.
cc row_inc = itocc(cid() < (new_y & 0x7));
sr_row = shift(new_y, -3);
sr_row = select(row_inc, sr_row + 1, sr_row);
// Convert the row number into a word offset.
sr_row_idx = lo(sr_row * num_cols);
int sr_idx0 = sr_row_idx + sr_col0;
int sr_idx1 = sr_row_idx + sr_col1;
int sr_idx2 = sr_row_idx + sr_col2;
int sr_idx3 = sr_row_idx + sr_col3;
int sr_idx4 = sr_row_idx + sr_col4;
// Second set of five words: same columns, one stored row further on.
int sr_idx5 = sr_idx0 + num_cols;
int sr_idx6 = sr_idx1 + num_cols;
int sr_idx7 = sr_idx2 + num_cols;
int sr_idx8 = sr_idx3 + num_cols;
int sr_idx9 = sr_idx4 + num_cols;
// Pick one of four shuffle control words from the two low bits of new_x.
// NOTE(review): the constants presumably encode a byte offset of 0..3 for
// splitting each word into a high and a low part -- confirm against the
// shuffled() control-word encoding.
cc x_nlsb = itocc((new_x & 2) != 0);
cc x_lsb = itocc((new_x & 1) != 0);
byte4 ctrl = select(x_nlsb,
select(x_lsb, 0x08838281, 0x18088382),
select(x_lsb, 0x28180883, 0x38281808));
ubyte4 word0_lo, word0_hi, word1_lo, word1_hi;
ubyte4 word2_lo, word2_hi, word3_lo, word3_hi;
ubyte4 dummy;
// First row: input word k supplies the "hi" piece of output word k and the
// "lo" piece of output word k-1; the unused end pieces go to dummy.
hi_lo(word0_hi, dummy) = shuffled(search_region[sr_idx0], ctrl);
hi_lo(word1_hi, word0_lo) = shuffled(search_region[sr_idx1], ctrl);
hi_lo(word2_hi, word1_lo) = shuffled(search_region[sr_idx2], ctrl);
hi_lo(word3_hi, word2_lo) = shuffled(search_region[sr_idx3], ctrl);
hi_lo(dummy, word3_lo) = shuffled(search_region[sr_idx4], ctrl);
// Merge the two pieces of each output word.
ref_mb0 = word0_hi | word0_lo;
ref_mb1 = word1_hi | word1_lo;
ref_mb2 = word2_hi | word2_lo;
ref_mb3 = word3_hi | word3_lo;
// Second row: same procedure, reusing the temporaries.
hi_lo(word0_hi, dummy) = shuffled(search_region[sr_idx5], ctrl);
hi_lo(word1_hi, word0_lo) = shuffled(search_region[sr_idx6], ctrl);
hi_lo(word2_hi, word1_lo) = shuffled(search_region[sr_idx7], ctrl);
hi_lo(word3_hi, word2_lo) = shuffled(search_region[sr_idx8], ctrl);
hi_lo(dummy, word3_lo) = shuffled(search_region[sr_idx9], ctrl);
ref_mb4 = word0_hi | word0_lo;
ref_mb5 = word1_hi | word1_lo;
ref_mb6 = word2_hi | word2_lo;
ref_mb7 = word3_hi | word3_lo;
// Permute the assembled words across clusters using (new_y + cid()) mod 8.
// NOTE(review): presumably this rotates rows so each cluster ends up with
// the row matching its own macroblock row -- confirm.
int rot_perm = (new_y + cid()) & 0x7;
ref_mb0 = commclperm(rot_perm, ref_mb0);
ref_mb1 = commclperm(rot_perm, ref_mb1);
ref_mb2 = commclperm(rot_perm, ref_mb2);
ref_mb3 = commclperm(rot_perm, ref_mb3);
ref_mb4 = commclperm(rot_perm, ref_mb4);
ref_mb5 = commclperm(rot_perm, ref_mb5);
ref_mb6 = commclperm(rot_perm, ref_mb6);
ref_mb7 = commclperm(rot_perm, ref_mb7);
}
// Slide a reference macroblock one word column along within the search
// region: the existing words move down one slot in each row
// (mb0 <- mb1 <- mb2 <- mb3 and mb4 <- mb5 <- mb6 <- mb7) and a new last
// column is loaded into ref_mb3 / ref_mb7.
//
// Parameters:
//   search_region  array the new column is read from.
//   sr_row         word offset of the first row.
//   sr_col         (in/out) column of the current first word; advanced by one
//                  on return, wrapping to 0 when it reaches num_cols.
//   range_x        horizontal range; num_cols is shift(range_x, -1) + 4.
//   rot_perm       permutation selector passed to commclperm for the new words.
//   ref_mb0..7     (in/out) the macroblock words being slid.
//
// NOTE(review): sr_row, sr_col and rot_perm match the outputs of
// extract_ref_MB4, so this is presumably meant to follow a call to it --
// confirm with callers.
inline void shift_ref_MB4(array<ubyte4>& search_region,
int sr_row, int& sr_col,
int range_x,
int rot_perm,
ubyte4& ref_mb0, ubyte4& ref_mb1, ubyte4& ref_mb2, ubyte4& ref_mb3, ubyte4& ref_mb4, ubyte4& ref_mb5, ubyte4& ref_mb6, ubyte4& ref_mb7)
{
// Slide the first row down one slot; ref_mb3 is refilled below.
ref_mb0 = ref_mb1;
ref_mb1 = ref_mb2;
ref_mb2 = ref_mb3;
// Slide the second row down one slot; ref_mb7 is refilled below.
ref_mb4 = ref_mb5;
ref_mb5 = ref_mb6;
ref_mb6 = ref_mb7;
int num_cols = shift(range_x, -1) + 4;
// The incoming column is four past the old first column, wrapped into range.
int sr_col_idx = sr_col + 4;
sr_col_idx = select(itocc(sr_col_idx >= num_cols),
sr_col_idx - num_cols,
sr_col_idx);
int sr_idx = sr_row + sr_col_idx;
// Load the new word for each row and apply the same cluster permutation.
ref_mb3 = commclperm(rot_perm, search_region[sr_idx]);
ref_mb7 = commclperm(rot_perm, search_region[sr_idx + num_cols]);
// Advance the caller's column cursor, wrapping at num_cols.
sr_col = sr_col + 1;
sr_col = select(itocc(sr_col == num_cols), 0, sr_col);
}
// Compare a reference macroblock against the current macroblock and, if the
// candidate is better, record it as the new best motion vector.
//
// A score `sad` is accumulated from the eight per-word abd() results, then
// combined across clusters in three commucperm steps. The candidate replaces
// the current best when (sad < mv_sad) & in_range is true.
//
// Parameters:
//   ref_mb0..7  reference (search-region) words.
//   mb0..7      current macroblock words.
//   test_x/y    displacement of this candidate.
//   in_range    mask/flag AND-ed into the "better" test; a zero value blocks
//               the update.
//   mv_x, mv_y  (in/out) best displacement so far.
//   mv_sad      (in/out) score of the best displacement so far.
//
// NOTE(review): abd is presumably a per-byte absolute difference, making
// `sad` a sum of absolute differences -- confirm against the KernelC
// intrinsic reference.
inline void compare_MB(ubyte4 ref_mb0, ubyte4 ref_mb1, ubyte4 ref_mb2, ubyte4 ref_mb3, ubyte4 ref_mb4, ubyte4 ref_mb5, ubyte4 ref_mb6, ubyte4 ref_mb7,
ubyte4 mb0, ubyte4 mb1, ubyte4 mb2, ubyte4 mb3, ubyte4 mb4, ubyte4 mb5, ubyte4 mb6, ubyte4 mb7,
int test_x, int test_y, int in_range,
int& mv_x, int& mv_y, uint& mv_sad)
{
// Per-word differences between reference and current block.
ubyte4 diff0 = abd(ref_mb0, mb0);
ubyte4 diff1 = abd(ref_mb1, mb1);
ubyte4 diff2 = abd(ref_mb2, mb2);
ubyte4 diff3 = abd(ref_mb3, mb3);
ubyte4 diff4 = abd(ref_mb4, mb4);
ubyte4 diff5 = abd(ref_mb5, mb5);
ubyte4 diff6 = abd(ref_mb6, mb6);
ubyte4 diff7 = abd(ref_mb7, mb7);
double<uhalf2> sadA0, sadA1;
uhalf2 sadB0, sadB1, sadC0, sadC1;
uint sad;
// NOTE(review): shuffle control 0x88318820 presumably spreads the four
// bytes of each diff word into 16-bit lanes of a hi/lo pair so they can be
// added without overflow -- confirm against the control-word encoding.
// First row (diff0..diff3) -> sadC0.
sadA0 = shuffled(uhalf2(diff0), 0x88318820);
sadA1 = shuffled(uhalf2(diff1), 0x88318820);
sadB0 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadA0 = shuffled(uhalf2(diff2), 0x88318820);
sadA1 = shuffled(uhalf2(diff3), 0x88318820);
sadB1 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadC0 = sadB0 + sadB1;
// Second row (diff4..diff7) -> sadC1.
sadA0 = shuffled(uhalf2(diff4), 0x88318820);
sadA1 = shuffled(uhalf2(diff5), 0x88318820);
sadB0 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadA0 = shuffled(uhalf2(diff6), 0x88318820);
sadA1 = shuffled(uhalf2(diff7), 0x88318820);
sadB1 = (hi(sadA0) + lo(sadA0)) + (hi(sadA1) + lo(sadA1));
sadC1 = sadB0 + sadB1;
// Fold the two 16-bit lanes of the combined sum into one scalar.
double<uhalf2> final_sad = shuffled(sadC0 + sadC1, 0x88883120);
sad = uint(hi(final_sad) + lo(final_sad));
// Three add-and-exchange steps across clusters.
// NOTE(review): the permutation constants look like one nibble per cluster
// naming its partner (pairs, then stride 2, then stride 4), which would be
// an eight-cluster tree reduction -- confirm which clusters end up holding
// the full total.
uc<int> tree_sum1 = 0x67452301;
uc<int> tree_sum2 = 0x44660022;
uc<int> tree_sum3 = 0x00004444;
sad = sad + commucperm(tree_sum1, sad);
sad = sad + commucperm(tree_sum2, sad);
sad = sad + commucperm(tree_sum3, sad);
// Keep this candidate only if it beats the current best and is in range.
cc better = itocc((sad < mv_sad) & in_range);
mv_x = select(better, test_x, mv_x);
mv_y = select(better, test_y, mv_y);
mv_sad = select(better, sad, mv_sad);
}
#line 87 "D:\\working\\im_apps\\h264\\me_kc.hpp"
#line 88 "D:\\working\\im_apps\\h264\\me_kc.hpp"
#line 90 "D:\\working\\im_apps\\h264\\me_kc.hpp"
#line 4 "D:\\working\\im_apps\\h264\\me_fast_jitter2_kc.cpp"
#line 1 "D:/working/tools/isim/isimexe/blank_headers\\idb_kernelc2.hpp"
#line 5 "D:\\working\\im_apps\\h264\\me_fast_jitter2_kc.cpp"
;
// Motion-estimation refinement kernel. For each macroblock read from
// `mblocks`, it reads a predicted motion vector and its score from
// `motions_in`, tests nine candidate displacements on a 3x3 grid (offsets
// -2, 0, +2 in x and y around the prediction), and writes the best vector
// and score to `motions_out`.
//
// Streams / parameters:
//   row0, row1, row2  search-region data; each stream fills two of the six
//                     12-word rows of `search_region`.
//   mblocks           current macroblock data, eight words per macroblock.
//   motions_in        predicted vector (half2) followed by its score.
//   motions_out       refined vector followed by its score.
//   uc_margin         packed margins: four 8-bit fields taken as top, bottom,
//                     left, right from the lowest byte upward.
//   uc_mblks          macroblock count; shift(mblks, 4) is used as a column
//                     count.
//
// Assumptions to confirm: shift(v, n) shifts left for positive n and right
// for negative n; select(c, a, b) yields a where c holds; stream reads
// conditioned on a cc only take effect where the condition holds.
kernel me_fast_jitter2(istream<ubyte4> row0,
istream<ubyte4> row1,
istream<ubyte4> row2,
istream<ubyte4> mblocks,
cistream<half2> motions_in,
costream<half2> motions_out,
uc<int>& uc_margin,
uc<int>& uc_mblks)
{
synch();
// Bring the microcontroller parameters into cluster variables.
int margin = commclperm(8, 0, uc_margin);
int mblks = commclperm(8, 0, uc_mblks);
// Unpack the four byte-wide margin fields.
int top_margin = margin & 0xFF;
int bottom_margin = shift(margin, -8) & 0xFF;
int left_margin = shift(margin, -16) & 0xFF;
int right_margin = shift(margin, -24) & 0xFF;
// clzero: true only where cid() == 0; cc_true: always true.
cc clzero = itocc(cid() == 0);
cc cc_true = itocc(0 == 0);
cc dummy_cc;
ubyte4 mb0, mb1, mb2, mb3, mb4, mb5, mb6, mb7;
ubyte4 ref_mb0, ref_mb1, ref_mb2, ref_mb3, ref_mb4, ref_mb5, ref_mb6, ref_mb7;
half2 mv_xy;
int mv_x, mv_y, x, y, test_x, test_y;
cc wrap;
uint mv_sad;
int in_range;
// Allowed displacement window: up to 16 up / left_margin left,
// up to 15 down / right, each limited by the available margin.
int top_range = 0 - select(itocc(top_margin < 16), top_margin, 16);
int bottom_range = select(itocc(bottom_margin < 15), bottom_margin, 15);
int left_range = 0 - left_margin;
int num_cols_to_right = shift(mblks, 4) + right_margin - 16;
int right_range = select(itocc(num_cols_to_right < 15), num_cols_to_right, 15);
// 72 words = 6 rows x 12 word columns (rows start at 0, 12, ..., 60).
array<ubyte4> search_region(72);
// Number of word columns to preload before the first macroblock.
int i = shift(left_margin + right_range + 1, -2);
uc<int> loopcnt;
// NOTE(review): this form of commclperm also writes loopcnt, presumably
// with the preload count computed above -- confirm.
i = commclperm(0, i, 0, loopcnt);
// Column at which preloading starts.
int idx = shift(16 - left_margin, -2);
// Preload: one word column of all six rows per iteration.
loop_count(loopcnt) pipeline(1) {
row0(ALL, cc_true) >> search_region[0+idx]; row0(ALL, cc_true) >> search_region[12+idx]; row1(ALL, cc_true) >> search_region[24+idx]; row1(ALL, cc_true) >> search_region[36+idx]; row2(ALL, cc_true) >> search_region[48+idx]; row2(ALL, cc_true) >> search_region[60+idx]; idx = idx + 1;
}
int num_cols_left_to_load = shift(mblks, 4);
// One iteration per macroblock in `mblocks`.
loop_stream(mblocks) {
// Stop consuming the row streams once all columns have been loaded.
cc do_read = itocc(num_cols_left_to_load > 0);
// Load the next four word columns into the circular buffer.
loopcnt = 4;
loop_count(loopcnt) pipeline(1) {
row0(ALL, do_read) >> search_region[0+idx]; row0(ALL, do_read) >> search_region[12+idx]; row1(ALL, do_read) >> search_region[24+idx]; row1(ALL, do_read) >> search_region[36+idx]; row2(ALL, do_read) >> search_region[48+idx]; row2(ALL, do_read) >> search_region[60+idx]; idx = idx + 1;;
}
// Wrap the write column at 12, the row width of search_region.
cc wrap_idx = itocc(idx == 12);
idx = select(wrap_idx, 0, idx);
// Read the macroblock; note the interleaved order (mb0, mb4, mb1, mb5, ...).
mblocks >> mb0 >> mb4 >> mb1 >> mb5;
mblocks >> mb2 >> mb6 >> mb3 >> mb7;
// Cluster 0 reads the predicted vector and score, then shares them.
motions_in(clzero, dummy_cc) >> mv_xy;
motions_in(clzero, dummy_cc) >> mv_sad;
mv_xy = commclperm(0, mv_xy);
mv_sad = commclperm(0, mv_sad);
// NOTE(review): presumably an arithmetic right shift by one, halving the
// incoming vector; it is shifted back by one before output -- confirm.
mv_xy = shifta(mv_xy, -1);
// Split the packed half2 into separate y and x integers.
hi_lo(mv_y, mv_x) = shuffled(int(mv_xy), 0x75643120);
// Start the 3x3 candidate scan at (-2, -2) relative to the prediction.
x = mv_x - 2;
y = mv_y - 2;
i = 0;
loopcnt = 9;
loop_count(loopcnt) pipeline(1) {
// NOTE(review): `%=` is a KernelC construct; presumably it takes the
// current x / y for this iteration -- confirm its exact meaning.
test_x %= x;
test_y %= y;
in_range = check_xy(top_range, bottom_range, left_range, right_range, test_x, test_y);
extract_ref_MB(search_region, idx, test_x, test_y, 16, 16, ref_mb0, ref_mb1, ref_mb2, ref_mb3, ref_mb4, ref_mb5, ref_mb6, ref_mb7);
compare_MB(ref_mb0, ref_mb1, ref_mb2, ref_mb3, ref_mb4, ref_mb5, ref_mb6, ref_mb7, mb0, mb1, mb2, mb3, mb4, mb5, mb6, mb7, test_x, test_y, in_range, mv_x, mv_y, mv_sad);
// Step x by 2; after every third candidate, rewind x by 4 and step y by 2.
i = i + 1;
wrap = itocc(i == 3);
i = select(wrap, 0, i);
x = select(wrap, x - 4, x + 2);
y = select(wrap, y + 2, y);
}
// Repack (mv_y, mv_x) into a half2, undo the earlier shift, and emit
// the vector and score from cluster 0.
mv_xy = shift(half2(shift(mv_y, 16) | (mv_x & 0xffff)), 1);
motions_out(clzero) << mv_xy;
motions_out(clzero) << mv_sad;
// Update the horizontal window for the next macroblock: the left limit
// widens by 16 but is clamped at -16.
left_range = select(itocc(left_range > -16), left_range - 16, left_range);
left_range = select(itocc(left_range < -16), -16, left_range);
// The room to the right shrinks by 16 each time, capped at 15.
num_cols_to_right = num_cols_to_right - 16;
right_range = select(itocc(num_cols_to_right < 15), num_cols_to_right, 15);
num_cols_left_to_load = num_cols_left_to_load - 16;
}
flush(motions_out, 0);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -