// me_fast_jitter1_kc.cpp
#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "me_kc.hpp"
#include "idb_kernelc2.hpp"
// Upper bounds on the size of the search region (S.R.) around a macroblock.
// RANGE_X / RANGE_Y cap the reach towards the left / top; the *_MINUS_1
// variants cap the reach towards the right / bottom (see the *_range
// computations inside the kernel).
#define RANGE_X 16
#define RANGE_X_MINUS_1 15
// NOTE: If you change RANGE_Y, then you also have to change
// LOAD_4_SEARCH_COLS in me_kc.hpp
#define RANGE_Y 16
#define RANGE_Y_MINUS_1 15
// S.R. == search region
// Input parameters:
//
// uc_margin:
//
// Each byte is an unsigned value that indicates how many blocks of
// "margin" are available on the outskirts of the S.R. In other words,
// if mblocks is an entire row of macroblocks, the left and right
// margins would be '0', since we cannot search any further in those
// directions. Note that macroblocks that are not on the edges will
// not necessarily care about the margin values. The top and bottom
// margins are the same for each macroblock to be matched.
// The order of the bytes, from LSByte to MSByte, is: top, bottom,
// left, right.
// Associates the kernel name with its microcode file.
// NOTE(review): KERNELDEF is defined outside this file; presumably it
// registers the kernel with the host-side runtime -- confirm.
KERNELDEF(me_fast_jitter1, KERNELS_DIR "me_fast_jitter1_kc.uc");
// me_fast_jitter1: motion-estimation refinement pass with a granularity of
// one pixel. For every macroblock read from `mblocks`, the previous motion
// vector is read from `motions_in`, the 3x3 neighbourhood of candidate
// positions around that vector is evaluated against the search region, and
// the resulting vector/SAD, the matching reference luminance block and the
// load indices for the corresponding chrominance block are written out.
//
// Streams:
//   row0, row1, row2 - input data used to fill the search region (consumed
//                      by LOAD_4_SEARCH_COLS; presumably the three rows of
//                      reference macroblocks -- confirm against me_kc.hpp)
//   mblocks          - macroblocks to be matched (8 ubyte4 words per
//                      macroblock are read per loop iteration)
//   motions_in       - previous motion vector followed by its SAD, read
//                      conditionally on cluster 0
//   motions_out      - refined motion vector followed by its SAD, written
//                      conditionally on cluster 0
//   refyblks         - Y values of the best-matching reference block
//   crcbindices      - 8 load indices per macroblock for the chrominance
//                      block that corresponds to the best match
// Microcontroller parameters:
//   uc_margin        - packed margins (top, bottom, left, right from LSByte
//                      to MSByte)
//   uc_offsets       - packed x (low 16 bits) and y (high 16 bits) offset
//   uc_mblks         - number of macroblocks in mblocks
//   uc_mb_width      - number of macroblocks in a row
kernel me_fast_jitter1(istream<ubyte4> row0,
istream<ubyte4> row1,
istream<ubyte4> row2,
istream<ubyte4> mblocks,
cistream<half2> motions_in,
costream<half2> motions_out,
ostream<ubyte4> refyblks,
ostream<uint> crcbindices,
uc<int>& uc_margin,
uc<int>& uc_offsets, // x, y offset of current block
uc<int>& uc_mblks, // num MBs in mblocks
uc<int>& uc_mb_width) // number of macroblocks in row
{
// read uc parameter
// NOTE(review): commclperm(8, 0, uc_x) is presumably the idiom for
// broadcasting a microcontroller value to every cluster -- confirm.
synch();
int margin = commclperm(8, 0, uc_margin);
int mblks = commclperm(8, 0, uc_mblks);
uint offsets = uint(commclperm(8, 0, uc_offsets));
uint mb_width = uint(commclperm(8, 0, uc_mb_width));
// init constants
// Unpack the four margin bytes (shift() with a negative count is used as a
// right shift throughout this kernel).
int top_margin = margin & 0xFF;
int bottom_margin = shift(margin, -8) & 0xFF;
int left_margin = shift(margin, -16) & 0xFF;
int right_margin = shift(margin, -24) & 0xFF;
// Unpack the x/y offset: x in the low 16 bits, y in the high 16 bits.
uint xoffset = offsets & 0xffff;
uint yoffset = shift(offsets, -16);
// Base coordinates used for the chrominance load indices: the offsets are
// halved, and each cluster adds its own cluster id to the x coordinate.
uint crefx = shift(xoffset, -1) + uint(cid());
uint crefy = shift(yoffset, -1);
// mb_width * 64; multiplied by the 8-row group index when the chrominance
// index is formed below.
uint crowlen = shift(mb_width, 6);
// Condition code that is set only on cluster 0; used to gate the motion
// vector stream accesses.
cc clzero = itocc(cid() == 0);
// Always-true condition code for the unconditional priming loads.
cc cc_true = itocc(0 == 0);
// Receives the second condition-code argument of the conditional reads on
// motions_in; it is not used afterwards.
cc dummy_cc;
// Current macroblock
ubyte4 MB_EXPAND(mb);
// Macroblock from S.R. to compare to.
ubyte4 MB_EXPAND(ref_mb);
// best motion vector and block sum
half2 mv_xy;
int mv_x, mv_y, x, y, test_x, test_y;
cc wrap;
uint mv_sad;
int in_range;
// The *_range variables contain the size of the S.R. for the
// current macroblock that is being matched. At most, it can be
// RANGE_X or RANGE_Y. However, the value of the *_range variables
// can be smaller depending on the values of the *_margin variables
// and whether the current macroblock is on an edge or not.
// top_range and left_range are stored as negative displacements.
// NOTE(review): the clamps below read as min(margin, RANGE) assuming
// select(c, a, b) yields a when c is set -- confirm.
int top_range = 0 - select(itocc(top_margin < RANGE_Y), top_margin, RANGE_Y);
int bottom_range = select(itocc(bottom_margin < RANGE_Y_MINUS_1), bottom_margin, RANGE_Y_MINUS_1);
int left_range = 0 - left_margin;
// num_cols_to_right is the maximum size of the S.R. to the right
// (mblks * 16 + right_margin - 16).
int num_cols_to_right = shift(mblks, 4) + right_margin - 16;
int right_range = select(itocc(num_cols_to_right < RANGE_X_MINUS_1), num_cols_to_right, RANGE_X_MINUS_1);
// This array holds a 3x3 MB S.R.
array<ubyte4> search_region(72);
// All valid MB columns, except the right-most column, are read in
// before the main loop to prime the array. The final right-most
// column for each MB is read in during the main loop.
// Number of priming iterations: (left_margin + right_range + 1) / 4, since
// each LOAD_4_SEARCH_COLS invocation handles four columns (per its name).
int i = shift(left_margin + right_range + 1, -2);
uc<int> loopcnt;
// NOTE(review): this four-argument commclperm form appears to hand the
// per-cluster value i to the microcontroller variable loopcnt -- confirm.
i = commclperm(0, i, 0, loopcnt);
// Starting index into search_region, in units of four columns.
int idx = shift(RANGE_X - left_margin, -2);
loop_count(loopcnt) pipeline(1) {
LOAD_4_SEARCH_COLS(row0, row1, row2, cc_true, search_region, idx)
}
// Counts down by 16 per macroblock; in-loop loads only happen while this
// is still positive.
int num_cols_left_to_load = shift(mblks, 4);
// Main loop: one iteration per macroblock in mblocks.
loop_stream(mblocks) {
// read next MB column in S.R.
cc do_read = itocc(num_cols_left_to_load > 0);
loopcnt = 4;
loop_count(loopcnt) pipeline(1) {
LOAD_4_SEARCH_COLS(row0, row1, row2, do_read, search_region, idx);
}
// Wrap the index back to 0 once it reaches 12.
// NOTE(review): idx is presumably advanced inside LOAD_4_SEARCH_COLS,
// since it is not modified elsewhere in this kernel -- confirm.
cc wrap_idx = itocc(idx == 12);
idx = select(wrap_idx, 0, idx);
// read next MB to process
// (the words arrive in the order 0, 4, 1, 5, 2, 6, 3, 7)
mblocks >> mb0 >> mb4 >> mb1 >> mb5;
mblocks >> mb2 >> mb6 >> mb3 >> mb7;
// Read previous motion vectors.
// Only cluster 0 performs the conditional read; the values are then passed
// through commclperm(0, ...), presumably to copy cluster 0's value to all
// clusters -- confirm.
motions_in(clzero, dummy_cc) >> mv_xy;
motions_in(clzero, dummy_cc) >> mv_sad;
mv_xy = commclperm(0, mv_xy);
mv_sad = commclperm(0, mv_sad);
// Shift to make MV's full pixel values
mv_xy = shifta(mv_xy, -1);
// Split the packed vector into separate y (high) and x (low) components.
// NOTE(review): the shuffle pattern 0x75643120 presumably also handles the
// widening of each 16-bit half -- confirm.
hi_lo(mv_y, mv_x) = shuffled(int(mv_xy), 0x75643120);
// Loop 3: granularity = 1
// Visit the 3x3 grid of candidates centred on the previous vector, starting
// at (mv_x - 1, mv_y - 1) and walking row by row.
x = mv_x - 1;
y = mv_y - 1;
i = 0;
loopcnt = 9;
loop_count(loopcnt) pipeline(1) {
// NOTE(review): `%=` is presumably a KernelC-specific assignment form here
// (copying the loop-carried x / y), not C++ modulo-assignment -- confirm.
test_x %= x;
test_y %= y;
in_range = check_xy(top_range, bottom_range, left_range, right_range, test_x, test_y);
extract_ref_MB(search_region, idx, test_x, test_y, RANGE_X, RANGE_Y, MB_EXPAND(ref_mb));
// compare_MB receives mv_x, mv_y and mv_sad; presumably it updates them when
// the candidate is in range and has a lower SAD -- confirm in me_kc.hpp.
compare_MB(MB_EXPAND(ref_mb), MB_EXPAND(mb), test_x, test_y, in_range, mv_x, mv_y, mv_sad);
// i counts the column within the current row of the 3x3 grid; after the
// third column, x steps back by two and y advances to the next row.
i = i + 1;
wrap = itocc(i == 3);
i = select(wrap, 0, i);
x = select(wrap, x - 2, x + 1);
y = select(wrap, y + 1, y);
}
// Output motion vectors. Shift to make MV's 1/2 pixel values
// (y is packed into the high 16 bits, x into the low 16 bits).
mv_xy = shift(half2(shift(mv_y, 16) | (mv_x & 0xffff)), 1);
motions_out(clzero) << mv_xy;
motions_out(clzero) << mv_sad;
// Output Y vals for best match
// (written in the same 0, 4, 1, 5, 2, 6, 3, 7 word order as the input MB)
extract_ref_MB(search_region, idx, mv_x, mv_y, RANGE_X, RANGE_Y, MB_EXPAND(ref_mb));
refyblks << ref_mb0 << ref_mb4 << ref_mb1 << ref_mb5;
refyblks << ref_mb2 << ref_mb6 << ref_mb3 << ref_mb7;
// Output load indices for chrominance block that corresponds to
// the best match
// add one if deltas are negative to get correct biasing when
// using shift by 1 in place of divide by 2
int dx = mv_x + select(itocc(mv_x < 0), 1, 0);
int dy = mv_y + select(itocc(mv_y < 0), 1, 0);
uint fetchx = crefx + uint(shifta(dx, -1));
uint fetchy = crefy + uint(shifta(dy, -1));
// x part of the index: (fetchx / 8) * 64 + (fetchx mod 8).
uint fetchxoffset = shift(shift(fetchx, -3), 6) + (fetchx & 0x7);
// Advance the base x coordinate by 8 for the next macroblock.
crefx = crefx + 8;
// Emit one index for each of 8 consecutive rows.
loopcnt = 8;
loop_count(loopcnt) pipeline(1) {
// y part of the index: (fetchy / 8) * crowlen + (fetchy mod 8) * 8.
uint fetchyoffset = lo(shift(fetchy, -3) * crowlen) + shift(fetchy & 0x7, 3);
fetchy = fetchy + 1;
uint c_out = fetchxoffset + fetchyoffset;
crcbindices << c_out;
}
// Update the horizontal ranges for the next macroblock, which lies 16
// columns further right: the left range grows by 16 until it is clamped at
// -RANGE_X, and the right range is recomputed from the remaining columns.
left_range = select(itocc(left_range > -RANGE_X), left_range - 16, left_range);
left_range = select(itocc(left_range < -RANGE_X), -RANGE_X, left_range);
num_cols_to_right = num_cols_to_right - 16;
right_range = select(itocc(num_cols_to_right < RANGE_X_MINUS_1), num_cols_to_right, RANGE_X_MINUS_1);
num_cols_left_to_load = num_cols_left_to_load - 16;
}
// Flush the conditional output stream.
// NOTE(review): the second argument (0) is presumably the padding value --
// confirm.
flush(motions_out, 0);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -