📄 filter_block_kc.h
字号:
// The row of 4x4 blocks for each lane are named as followed://// Neighbor Current block// --------------------------------------// | left || cur0 | cur1 | cur2 | cur3 ||// --------------------------------------//// Within each block, the naming convention is://// -----------------------// -----------------------// || p3 | p2 | p1 | p0 ||// -----------------------// || p3 | p2 | p1 | p0 ||// -----------------------// || p3 | p2 | p1 | p0 ||// -----------------------// || p3 | p2 | p1 | p0 ||// -----------------------// -----------------------//// So, there are 4vec uint8x4's to store each 4x4 block.( // Input: boundary strengths for each block vec uint8x4 bs_3210(in), // Inputs: alpha(in), beta(in), and tc0 for each block vec uint8x4 cur0_alpha(in), vec uint8x4 cur1_alpha(in), vec uint8x4 cur2_alpha(in), vec uint8x4 cur3_alpha(in), vec uint8x4 cur0_beta(in), vec uint8x4 cur1_beta(in), vec uint8x4 cur2_beta(in), vec uint8x4 cur3_beta(in), vec uint16x2 cur0_tc0(in), vec uint16x2 cur1_tc0(in), vec uint16x2 cur2_tc0(in), vec uint16x2 cur3_tc0(in), // Input/Output: block to the left of this block (p0(in), p1(in), p2 updated) vec uint8x4 left_p3(in), vec uint8x4 left_p2(in), vec uint8x4 left_p1(in), vec uint8x4 left_p0(in), // Input/Outputs: the current blocks vec uint8x4 cur0_p3(in), vec uint8x4 cur0_p2(in), vec uint8x4 cur0_p1(in), vec uint8x4 cur0_p0(in), vec uint8x4 cur1_p3(in), vec uint8x4 cur1_p2(in), vec uint8x4 cur1_p1(in), vec uint8x4 cur1_p0(in), vec uint8x4 cur2_p3(in), vec uint8x4 cur2_p2(in), vec uint8x4 cur2_p1(in), vec uint8x4 cur2_p0(in), vec uint8x4 cur3_p3(in), vec uint8x4 cur3_p2(in), vec uint8x4 cur3_p1(in), vec uint8x4 cur3_p0(in), // Input vec bool32x1 dont_filter_edges(in), // Output pixels vec uint8x4 new_left_p2(out), vec uint8x4 new_left_p1(out), vec uint8x4 new_left_p0(out), // Input/Outputs: the current blocks vec uint8x4 new_cur0_p3(out), vec uint8x4 new_cur0_p2(out), vec uint8x4 new_cur0_p1(out), vec uint8x4 new_cur0_p0(out), vec uint8x4 new_cur1_p3(out), vec uint8x4 new_cur1_p2(out), vec uint8x4 new_cur1_p1(out), vec uint8x4 new_cur1_p0(out), vec uint8x4 new_cur2_p3(out), vec uint8x4 new_cur2_p2(out), vec uint8x4 new_cur2_p1(out), vec uint8x4 new_cur2_p0(out), vec uint8x4 new_cur3_p3(out), vec uint8x4 new_cur3_p2(out), vec uint8x4 new_cur3_p1(out), vec uint8x4 new_cur3_p0(out) )//------------------------------------------------------------------{ // ************************************************************* // Process left-most edge // Do both BS4 and BS123 cases, select the right one // ************************************************************* vec uint32x1 cur0_bs; vec uint8x4 left_p2_bs4, left_p1_bs4, left_p0_bs4; vec uint8x4 left_p1_bs123, left_p0_bs123; vec uint8x4 cur0_p3_bs4, cur0_p2_bs4, cur0_p1_bs4; vec uint8x4 cur0_p3_bs123, cur0_p2_bs123; vec bool8x4 bs_is_4; vec uint32x1 bs0_is_not_0, bs1_is_not_0, bs2_is_not_0, bs3_is_not_0; bs0_is_not_0 = (vec uint32x1)spi_vshuffledu_lo(0x10101010, spi_vne8(bs_3210, 0), 0); bs1_is_not_0 = (vec uint32x1)spi_vshuffledu_hi(0x10101010, spi_vne8(bs_3210, 0), 0); bs2_is_not_0 = (vec uint32x1)spi_vshuffledu_lo(0x32323232, spi_vne8(bs_3210, 0), 0); bs3_is_not_0 = (vec uint32x1)spi_vshuffledu_hi(0x32323232, spi_vne8(bs_3210, 0), 0); cur0_bs = (vec uint32x1)spi_vshuffleu(0x08080800, (vec uint32x1)bs_3210, 0); bs_is_4 = (vec bool8x4)spi_veq32(cur0_bs, 4); filter_luma_edge_bs4(cur0_alpha, cur0_beta, //cur0_tc0, left_p3, left_p2, left_p1, left_p0, cur0_p3, cur0_p2, cur0_p1, cur0_p0, dont_filter_edges, left_p2_bs4, left_p1_bs4, left_p0_bs4, cur0_p3_bs4, cur0_p2_bs4, cur0_p1_bs4); filter_luma_edge_bs123(bs0_is_not_0, cur0_alpha, cur0_beta, cur0_tc0, left_p2, left_p1, left_p0, cur0_p3, cur0_p2, cur0_p1, dont_filter_edges, left_p1_bs123, left_p0_bs123, cur0_p3_bs123, cur0_p2_bs123); left_p2 = spi_vselect8(bs_is_4, left_p2_bs4, left_p2); left_p1 = spi_vselect8(bs_is_4, left_p1_bs4, left_p1_bs123); left_p0 = spi_vselect8(bs_is_4, left_p0_bs4, left_p0_bs123); cur0_p3 = spi_vselect8(bs_is_4, cur0_p3_bs4, cur0_p3_bs123); cur0_p2 = spi_vselect8(bs_is_4, cur0_p2_bs4, cur0_p2_bs123); cur0_p1 = spi_vselect8(bs_is_4, cur0_p1_bs4, cur0_p1); // ************************************************************* // Process second edge // Only need to do bs<4 case // ************************************************************* filter_luma_edge_bs123(bs1_is_not_0, cur1_alpha, cur1_beta, cur1_tc0, cur0_p2, cur0_p1, cur0_p0, cur1_p3, cur1_p2, cur1_p1, dont_filter_edges, cur0_p1, cur0_p0, cur1_p3, cur1_p2); // ************************************************************* // Process third edge // Only need to do bs<4 case // ************************************************************* filter_luma_edge_bs123(bs2_is_not_0, cur2_alpha, cur2_beta, cur2_tc0, cur1_p2, cur1_p1, cur1_p0, cur2_p3, cur2_p2, cur2_p1, dont_filter_edges, cur1_p1, cur1_p0, cur2_p3, cur2_p2); // ************************************************************* // Process fourth edge // Only need to do bs<4 case // ************************************************************* filter_luma_edge_bs123(bs3_is_not_0, cur3_alpha, cur3_beta, cur3_tc0, cur2_p2, cur2_p1, cur2_p0, cur3_p3, cur3_p2, cur3_p1, dont_filter_edges, cur2_p1, cur2_p0, cur3_p3, cur3_p2); // Copy the modified pixels into output values. new_left_p2 = left_p2; new_left_p1 = left_p1; new_left_p0 = left_p0; new_cur0_p3 = cur0_p3; new_cur0_p2 = cur0_p2; new_cur0_p1 = cur0_p1; new_cur0_p0 = cur0_p0; new_cur1_p3 = cur1_p3; new_cur1_p2 = cur1_p2; new_cur1_p1 = cur1_p1; new_cur1_p0 = cur1_p0; new_cur2_p3 = cur2_p3; new_cur2_p2 = cur2_p2; new_cur2_p1 = cur2_p1; new_cur2_p0 = cur2_p0; new_cur3_p3 = cur3_p3; new_cur3_p2 = cur3_p2; new_cur3_p1 = cur3_p1; new_cur3_p0 = cur3_p0;}//--------------------------------------------------------------------inline kernel void filter_chroma_edge//--------------------------------------------------------------------// KernelC inline functions to perform deblocking filter on one chroma// 4x4 block edge. It is agnostic to whether the edge is a vertical// or horizontal edge. Unlike the luma filter function, the chroma// filtering is simpler and can be all done in parallel, and does not// need to be split into three sequential phases. The data layour and// variables are organized similar to filter_luma_edge(), so refer to// the comments for that function for more details.//--------------------------------------------------------------------( // boundary strength (need one value for each group of two pixel lines) vec uint16x2 bs(in), // as per the standard(in), replicated for each byte vec uint8x4 alpha(in), vec uint8x4 beta(in), vec int8x4 tc0(in), // Output: p0 is updated vec uint8x4 new_p0(out), // Output: q0 is updated vec uint8x4 new_q0(out), // Input: transposed version of neighboring block vec uint8x4 p0(in), vec uint8x4 p1(in), // Input: transposed version of the current block vec uint8x4 q0(in), vec uint8x4 q1(in), // Input vec bool32x1 dont_filter_edges(in) )//--------------------------------------------------------------------{ // For storing outputs of filtering processes. These will be // muxed with select()s later based on the actual value of bs. vec uint8x4 p0_out_bs123; vec uint8x4 q0_out_bs123; vec uint8x4 p0_out_bs4; vec uint8x4 q0_out_bs4; // Extract each {q1, q0, p0, p1} sequence into one transposed word vec uint8x4 t_3, t_2, t_1, t_0; vec bool8x4 bs4; vec bool8x4 filt_samp_flag; vec int8x4 tC; vec int8x4 dot_prod; vec int8x4 delta; transpose_block(q1, q0, p0, p1, t_0, t_1, t_2, t_3); bs4 = (vec bool8x4)spi_veq16((bs & 0x00070007p2), 0x00040004p2); // Calculate filt_samp_flag filt_samp_flag = ((vec bool8x4)spi_vne16(bs, 0p2) & spi_vlt8u(spi_vabd8u(p0, q0), alpha) & spi_vlt8u(spi_vabd8u(p1, p0), beta) & spi_vlt8u(spi_vabd8u(q1, q0), beta) & (vec bool8x4)spi_vnot32(dont_filter_edges)); /////////////////////////////////////////////////////////////////////////// // filtering for p0 and q0 for bs < 4 /////////////////////////////////////////////////////////////////////////// tC = tc0 + 0x01010101p4; // Calculate delta matrix_mult_int4x4(t_3, t_2, t_1, t_0, (vec int8x4)0x01fc04ff, (vec int32x1)4, (vec int32x1)-3, dot_prod); clip3_8i(-tC, tC, dot_prod, delta); // filtered p0 and q0 p0_out_bs123 = spi_vadds8ui(p0, delta); q0_out_bs123 = spi_vsubs8ui(q0, delta); /////////////////////////////////////////////////////////////////////////// // filtering for p0, q0 for bs == 4 /////////////////////////////////////////////////////////////////////////// // filtered p0 and q0 matrix_mult_uint4x4(t_3, t_2, t_1, t_0, (vec uint8x4)0x02010001, (vec uint32x1)2, (vec int32x1)-2, p0_out_bs4); matrix_mult_uint4x4(t_3, t_2, t_1, t_0, (vec uint8x4)0x01000102, (vec uint32x1)2, (vec int32x1)-2, q0_out_bs4); /////////////////////////////////////////////////////////////////////////// // select correct output /////////////////////////////////////////////////////////////////////////// new_p0 = spi_vselect8(filt_samp_flag, spi_vselect8(bs4, p0_out_bs4, p0_out_bs123), p0); new_q0 = spi_vselect8(filt_samp_flag, spi_vselect8(bs4, q0_out_bs4, q0_out_bs123), q0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -