📄 filter_block_kc.h
字号:
// addsui8 and for subsui8 upper 8 bits are zeroed using shuffle. // filtered q1 // (P0 + Q0 + -4*Q1 + 2*Q2 + 1)>>2 //dot_prod_q1 = spi_vshifta16(p0_r23 + q0_r23 - q1_r23_x4 + q2_r23_x2 + 0x00010001p2, -2); //dot_prod_q1 = spi_vshifta16(p0_q0_p1_r23 - q1_r23_x4 + q2_r23_x2, -2); dot_prod_q1 = spi_vmula16i(q2_r23, 0x00020002p2, spi_vmula16i(q1_r23, 0xFFFCFFFCp2, p0_q0_p1_r23)); dot_prod_q1 = spi_vshifta16(dot_prod_q1, -2); clip3_16i(-(vec int16x2)tc0, (vec int16x2)tc0, dot_prod_q1, tmp_clip16x2); q1_out_bs123 = spi_vadds8ui(q1, (vec int8x4)tmp_clip16x2); // Since ap_lt_beta/aq_lt_beta can be true in bytes 1 and 3, // do a shuffle to sign extend bytes 0, 2 into bytes 1,3. tC = (vec uint16x2)spi_vsub8i(spi_vsub8i((vec int8x4)tc0, (vec int8x4)ap_lt_beta), (vec int8x4)aq_lt_beta); //tC = (tc0 + // (vec uint16x2)spi_vselect8(ap_lt_beta, 0x00010001p4, 0p4) + // (vec uint16x2)spi_vselect8(aq_lt_beta, 0x00010001p4, 0p4)); // If filt_samp_flag is 0, tC to 0 to avoid p0,q0 pixels // getting modified. tC = (vec uint16x2)spi_vselect8(filt_samp_flag, (vec uint8x4)tC, 0p4); // Calculate delta // (P1 + -4*P0 + 4*Q0 + -1*Q1 + 4)>>3 //dot_prod_delta = spi_vshifta16(p1_r23 - p0_r23_x4 + q0_r23_x4 - q1_r23 + 0x00040004p2, -3); dot_prod_delta = spi_vshifta16(spi_vshifta16(q0_r23 - p0_r23, 2) + p1_r23 - q1_r23 + 0x00040004p2, -3); clip3_16i(-(vec int16x2)tC, (vec int16x2)tC, dot_prod_delta, delta); // filtered p0 and q0 p0_out_bs123 = spi_vadds8ui(p0, (vec int8x4)delta); // Unlike addsui8, result of subsui8 can be positive, hence might // not get saturated. Force upper bytes of delta to be '0', while using // with subs. q0_out_bs123 = spi_vsubs8ui(q0, (vec int8x4)spi_vshuffleu(0x08020800, delta, 0)); // filtered p1 // (2*P2 + -4*P1 + P0 + Q0 + 1)>>2 //dot_prod_p1 = spi_vshifta16(p2_r23_x2 - p1_r23_x4 + p0_r23 + q0_r23 + 0x00010001p2, -2); //dot_prod_p1 = spi_vshifta16(p2_r23_x2 - p1_r23_x4 + p0_q0_p1_r23, -2); dot_prod_p1 = spi_vmula16i(p2_r23, 0x00020002p2, spi_vmula16i(p1_r23, 0xFFFCFFFCp2, p0_q0_p1_r23)); dot_prod_p1 = spi_vshifta16(dot_prod_p1, -2); clip3_16i(-(vec int16x2)tc0, (vec int16x2)tc0, dot_prod_p1, tmp_clip16x2); p1_out_bs123 = spi_vadds8ui(p1, (vec int8x4)tmp_clip16x2); // Conditionally update new_p1 = spi_vselect8((vec uint8x4)(filt_samp_flag&ap_lt_beta), p1_out_bs123, p1); //new_p0 = spi_vselect8((vec uint8x4)(filt_samp_flag), p0_out_bs123, p0); //new_q0 = spi_vselect8((vec uint8x4)(filt_samp_flag), q0_out_bs123, q0); new_q1 = spi_vselect8((vec uint8x4)(filt_samp_flag&aq_lt_beta), q1_out_bs123, q1); new_p0 = p0_out_bs123; new_q0 = q0_out_bs123;}//------------------------------------------------------------------inline kernel void filter_luma_edge_bs4//------------------------------------------------------------------// KernelC inline functions to perform deblocking filter on // luma vertical edges b,c,d (i.e. cover the bs==4 cases only). //// Each cluster does the vertical edges for one 4x4 block. The // inputs come in as packed 8-bit data for two blocks,// four rows per word as shown below. The output new_p's and new_q's// end up packed the same way. // --------------------------------------------// --------------------------------------------// || p3 | p2 | p1 | p0 || q0 | q1 | q2 | q3 ||// --------------------------------------------// || p3 | p2 | p1 | p0 || q0 | q1 | q2 | q3 ||// --------------------------------------------// || p3 | p2 | p1 | p0 || q0 | q1 | q2 | q3 ||// --------------------------------------------// || p3 | p2 | p1 | p0 || q0 | q1 | q2 | q3 ||// --------------------------------------------// --------------------------------------------//------------------------------------------------------------------( // as per the standard, replicated for each byte vec uint8x4 alpha(in), vec uint8x4 beta(in), //vec int8x4 tc0(in), // Input: block to the left of this block vec uint8x4 p3(in), vec uint8x4 p2(in), vec uint8x4 p1(in), vec uint8x4 p0(in), // Input: the current block vec uint8x4 q0(in), vec uint8x4 q1(in), vec uint8x4 q2(in), vec uint8x4 q3(in), // Input vec bool32x1 dont_filter_edges(in), // Output: p2,p1,p0 are updated vec uint8x4 new_p2(out), vec uint8x4 new_p1(out), vec uint8x4 new_p0(out), // Output: q0(out), q1(out), q2 are updated vec uint8x4 new_q0(out), vec uint8x4 new_q1(out), vec uint8x4 new_q2(out) )//------------------------------------------------------------------{ /////////////////////////////////////////////////////////////////////////// // Preliminaries /////////////////////////////////////////////////////////////////////////// // For storing outputs of filtering processes. These will be // muxed with select()s later based on the actual value of bs. vec uint8x4 p0_out_bs4, p1_out_bs4, p2_out_bs4; vec uint8x4 q0_out_bs4, q1_out_bs4, q2_out_bs4; // For accumulating dot products for filtering vec uint8x4 dot_prod_u, dot_prod2_u; //vec int8x4 dot_prod_i; vec uint8x4 abd_p0_q0; vec bool8x4 p1_term; vec bool8x4 filt_samp_flag; vec uint8x4 ap; vec bool8x4 ap_lt_beta; vec uint8x4 aq; vec bool8x4 aq_lt_beta; vec uint16x2 p0_r23, p1_r23, p2_r23, p3_r23; vec uint16x2 q0_r23, q1_r23, q2_r23, q3_r23; vec bool8x4 alpha_term; vec bool8x4 filter_p_bs4; vec bool8x4 filter_q_bs4; //vec uint16x2 p3_r23_x2; //vec uint16x2 p2_r23_x2; //vec uint16x2 p1_r23_x2; //vec uint16x2 p0_r23_x2; //vec uint16x2 q0_r23_x2; //vec uint16x2 q1_r23_x2; //vec uint16x2 q2_r23_x2; //vec uint16x2 q3_r23_x2; // Calculate filt_samp_flag abd_p0_q0 = spi_vabd8u(p0, q0); p1_term = spi_vlt8u(spi_vabd8u(p1, p0), beta); // New language doesn't seem to like using logical ands // in expression below. filt_samp_flag = (spi_vlt8u(abd_p0_q0, alpha) & spi_vlt8u(spi_vabd8u(q1, q0), beta) & (vec bool8x4)spi_vnot32(dont_filter_edges)) & p1_term; // Calculate ap and aq ap = spi_vabd8u(p2, p0); ap_lt_beta = spi_vlt8u(ap, beta); aq = spi_vabd8u(q2, q0); aq_lt_beta = spi_vlt8u(aq, beta); // ********************** // Unpack input data // ********************** p0_r23 = (vec uint16x2)p0; p1_r23 = (vec uint16x2)p1; p2_r23 = (vec uint16x2)p2; p3_r23 = (vec uint16x2)p3; q0_r23 = (vec uint16x2)q0; q1_r23 = (vec uint16x2)q1; q2_r23 = (vec uint16x2)q2; q3_r23 = (vec uint16x2)q3; // ******************************************************* // filtering for p2, p1, p0, q0, q1, q2 for bs == 4 // ******************************************************* alpha_term = spi_vlt8u(abd_p0_q0, ((vec uint8x4)spi_vshift8(alpha, -2) + (vec uint8x4)0x00020002)); filter_p_bs4 = ap_lt_beta & alpha_term; filter_q_bs4 = aq_lt_beta & alpha_term; //p3_r23_x2 = spi_vshift16(p3_r23,1); //p2_r23_x2 = spi_vshift16(p2_r23,1); //p1_r23_x2 = spi_vshift16(p1_r23,1); //p0_r23_x2 = spi_vshift16(p0_r23,1); //q0_r23_x2 = spi_vshift16(q0_r23,1); //q1_r23_x2 = spi_vshift16(q1_r23,1); //q2_r23_x2 = spi_vshift16(q2_r23,1); //q3_r23_x2 = spi_vshift16(q3_r23,1); // filtered p0 and q0 // (P2 + 2*P1 + 2*P0 + 2*Q0 + Q1 + 4)>>3 //dot_prod_u = (vec uint8x4)spi_vshift16(p2_r23 + p1_r23_x2 + // p0_r23_x2 + q0_r23_x2 + // q1_r23 + (vec uint16x2)0x00040004, -3); dot_prod_u = (vec uint8x4)spi_vshift16(spi_vsuma8u(p2, q1, 0x00040004p2) + spi_vshift16(spi_vsuma8u(p1, p0, (vec uint16x2)q0), 1), -3); // (2*P1 + P0 + Q1 + 2)>>2 //dot_prod2_u = (vec uint8x4)spi_vshift16(p1_r23_x2 + p0_r23 + // q1_r23 + (vec uint16x2)0x00020002, -2); dot_prod2_u = (vec uint8x4)spi_vshift16(spi_vsuma8u(p1, p1, spi_vsuma8u(p0, q1, 0x00020002p2)), -2); p0_out_bs4 = spi_vselect8(filter_p_bs4, dot_prod_u, dot_prod2_u); // (P1 + 2*P0 + 2*Q0 + 2*Q1 + Q2 + 4)>>3 //dot_prod_u = (vec uint8x4)spi_vshift16(p1_r23 + p0_r23_x2 + q0_r23_x2 + // q1_r23_x2 + q2_r23 + (vec uint16x2)0x00040004, -3); dot_prod_u = (vec uint8x4)spi_vshift16(spi_vshift16(spi_vsuma8u(p0, q0, (vec uint16x2)q1), 1) + spi_vsuma8u(p1, q2, 0x00040004p2), -3); // (P1 + Q0 + 2*Q1 + 2)>>2 //dot_prod2_u = (vec uint8x4)spi_vshift16(p1_r23 + q0_r23 + q1_r23_x2 + // (vec uint16x2)0x00020002, -2); dot_prod2_u = (vec uint8x4)spi_vshift16(spi_vsuma8u(p1, q0, spi_vsuma8u(q1, q1, 0x00020002p2)), -2); q0_out_bs4 = spi_vselect8(filter_q_bs4, dot_prod_u, dot_prod2_u); // filtered p1 and q1 // (P2 + P1 + P0 + Q0 + 2)>>2 //p1_out_bs4 = (vec uint8x4)spi_vshift16(p2_r23 + p1_r23 + p0_r23 + // q0_r23 + (vec uint16x2)0x00020002, -2); p1_out_bs4 = (vec uint8x4)spi_vshift16(spi_vsuma8u(p2, p1, spi_vsuma8u(p0, q0, 0x00020002p2)), -2); // (P0 + Q0 + Q1 + Q2 + 2)>>2 //q1_out_bs4 = (vec uint8x4)spi_vshift16(p0_r23 + q0_r23 + q1_r23 + // q2_r23 + (vec uint16x2)0x00020002, -2); q1_out_bs4 = (vec uint8x4)spi_vshift16(spi_vsuma8u(q2, q1, spi_vsuma8u(p0, q0, 0x00020002p2)), -2); // filtered p2 and q2 // (2*P3 + 3*P2 + P1 + P0 + Q0 + 4)>>3 //p2_out_bs4 = (vec uint8x4)spi_vshift16(p3_r23_x2 + p2_r23_x2 + p2_r23 + p1_r23 + // p0_r23 + q0_r23 + (vec uint16x2)0x00040004, -3); p2_out_bs4 = (vec uint8x4)spi_vshift16(spi_vsuma8u(p2, p1, spi_vsuma8u(p0, q0, 0x00020002p2)) + spi_vsuma8u(p3, p2, spi_vsuma8u(p3, p2, 0x00020002p2)), -3); // (P0 + Q0 + Q1 + 3*Q2 + 2*Q3 + 4)>>3 //q2_out_bs4 = (vec uint8x4)spi_vshift16(p0_r23 + q0_r23 + q1_r23 + q2_r23 + q2_r23_x2 + // q3_r23_x2 + (vec uint16x2)0x00040004, -3); q2_out_bs4 = (vec uint8x4)spi_vshift16(spi_vsuma8u(q2, q1, spi_vsuma8u(p0, q0, 0x00020002p2)) + spi_vsuma8u(q3, q2, spi_vsuma8u(q3, q2, 0x00020002p2)), -3); /////////////////////////////////////////////////////////////////////////// // select correct output /////////////////////////////////////////////////////////////////////////// new_p0 = spi_vselect8((filt_samp_flag), p0_out_bs4, p0); new_p1 = spi_vselect8((filt_samp_flag&filter_p_bs4), p1_out_bs4, p1); new_p2 = spi_vselect8((filt_samp_flag&filter_p_bs4), p2_out_bs4, p2); new_q0 = spi_vselect8((filt_samp_flag), q0_out_bs4, q0); new_q1 = spi_vselect8((filt_samp_flag&filter_q_bs4), q1_out_bs4, q1); new_q2 = spi_vselect8((filt_samp_flag&filter_q_bs4), q2_out_bs4, q2);}//------------------------------------------------------------------inline kernel void filter_luma_edges//------------------------------------------------------------------// KernelC inline function to process 4 vertical edges in // sequence. Each cluster is doing a different 4x4 block from// a different row. Each quad of 4 clusters is doing a different// macroblock.// // The input data to this function is the block to the// left of the current macroblock as well as the 4 blocks from// this row in the current macroblock. Data comes in as packed// 8-bit data (4 rows per word).
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -