📄 filter_block_kc.h
字号:
// CoeffP, CoeffQ: 4x1 vectors// add_val: Scalar// shift_val: Amount to shift dotproduct results// by for scaling (must be a negative// shift amount to scale down)// Outputs: none// Return Value: SumVec: 4x1 vector//------------------------------------------------------------------// Signed data, coefficients and resultsinline kernel void matrix_mult_int4x4(vec uint8x4 t_3(in), vec uint8x4 t_2(in), vec uint8x4 t_1(in), vec uint8x4 t_0(in), vec int8x4 coeff(in), vec int32x1 add_val(in), vec int32x1 shift_val(in), vec int8x4 ret_val(out)){ ret_val = spi_vclip16i(spi_vshifta16(spi_vclip32i(spi_vdotpa8ui(t_3, coeff, add_val), spi_vdotpa8ui(t_2, coeff, add_val)), shift_val), spi_vshifta16(spi_vclip32i(spi_vdotpa8ui(t_1, coeff, add_val), spi_vdotpa8ui(t_0, coeff, add_val)), shift_val));}// Unsigned data, coefficients and resultsinline kernel void matrix_mult_uint4x4(vec uint8x4 t_3(in), vec uint8x4 t_2(in), vec uint8x4 t_1(in), vec uint8x4 t_0(in), vec uint8x4 coeff(in), vec uint32x1 add_val(in), vec int32x1 shift_val(in), vec uint8x4 ret_val(out)){ ret_val = spi_vclip16u(spi_vshift16(spi_vclip32u(spi_vdotpa8u(t_3, coeff, add_val), spi_vdotpa8u(t_2, coeff, add_val)), shift_val), spi_vshift16(spi_vclip32u(spi_vdotpa8u(t_1, coeff, add_val), spi_vdotpa8u(t_0, coeff, add_val)), shift_val));}//------------------------------------------------------------------inline kernel void transpose_block//------------------------------------------------------------------// KernelC inline function to transpose the elements of a 4x4 block.// It accepts four elements, each four bytes wide. Transpose, so that// the first output gets the first byte from each input, the second// output gets the second byte from each input, etc.//--------------------------------------------------------------------( // Input: block to be transformed vec uint8x4 data0(in), vec uint8x4 data1(in), vec uint8x4 data2(in), vec uint8x4 data3(in), // Output: transposed block vec uint8x4 t_data0(out), vec uint8x4 t_data1(out), vec uint8x4 t_data2(out), vec uint8x4 t_data3(out) )//--------------------------------------------------------------------{ vec uint8x4 tmp_data0, tmp_data1, tmp_data2, tmp_data3; tmp_data1 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1); tmp_data0 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1); tmp_data3 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3); tmp_data2 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3); t_data2 = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2); t_data0 = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2); t_data3 = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)tmp_data1, (vec uint32x1)tmp_data3); t_data1 = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)tmp_data1, (vec uint32x1)tmp_data3);}//------------------------------------------------------------------inline kernel void transpose_block_from_sparse//------------------------------------------------------------------// KernelC inline function to transpose the elements of a // sparse 4x4 block producing only 2 words as output. This // is useful in luma deblocking as transposed block has only// 2 pixels packed into a word in half word positions (bytes 0 and 2). // See below for full 4x4 transpose.//--------------------------------------------------------------------( // Input: block to be transformed vec uint8x4 data0(in), vec uint8x4 data1(in), vec uint8x4 data2(in), vec uint8x4 data3(in), // Output: transposed block vec uint8x4 t_data0(out), vec uint8x4 t_data2(out) )//--------------------------------------------------------------------{ vec uint8x4 tmp_data0, tmp_data1, tmp_data2, tmp_data3; tmp_data1 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1); tmp_data0 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1); tmp_data3 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3); tmp_data2 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3); t_data2 = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2); t_data0 = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2);}//------------------------------------------------------------------inline kernel void transpose_block_to_sparse//------------------------------------------------------------------// KernelC inline function to transpose the elements of a // 2 input words into sparse 4x4 block as output. This // is useful in luma deblocking as transposed block has only// 2 pixels packed into a word in half word positions (bytes 0 and 2). // See above for full 4x4 transpose.//--------------------------------------------------------------------( // Input: block to be transformed vec uint8x4 data0(in), vec uint8x4 data1(in), // Output: transposed block vec uint8x4 t_data0(out), vec uint8x4 t_data1(out), vec uint8x4 t_data2(out), vec uint8x4 t_data3(out) )//--------------------------------------------------------------------{ t_data1 = (vec uint8x4)spi_vshuffledu_hi(0x88108854, (vec uint32x1)data1, (vec uint32x1)data0); t_data0 = (vec uint8x4)spi_vshuffledu_lo(0x88108854, (vec uint32x1)data1, (vec uint32x1)data0); t_data3 = (vec uint8x4)spi_vshuffledu_hi(0x88328876, (vec uint32x1)data1, (vec uint32x1)data0); t_data2 = (vec uint8x4)spi_vshuffledu_lo(0x88328876, (vec uint32x1)data1, (vec uint32x1)data0);}inline kernel void edge_filt_int_clip(vec int16x2 sum_r01(in), vec int16x2 sum_r23(in), vec int32x1 shift_val(in), vec int8x4 ret_val(out)){ ret_val = spi_vclip16i(spi_vshifta16(sum_r01, shift_val), spi_vshifta16(sum_r23, shift_val));}inline kernel void edge_filt_uint_clip(vec uint16x2 sum_r01(in), vec uint16x2 sum_r23(in), vec int32x1 shift_val(in), vec uint8x4 ret_val(out)){ ret_val = spi_vclip16u(spi_vshift16(sum_r01, shift_val), spi_vshift16(sum_r23, shift_val));}//------------------------------------------------------------------inline kernel void filter_luma_edge_bs123//------------------------------------------------------------------// KernelC inline functions to perform deblocking filter on // luma vertical edges b,c,d (i.e. cover the bs<4 cases only). //// Each cluster does the vertical edges for one 4x4 block. The // inputs come in as packed 8-bit data for two blocks,// four rows per word as shown below. The output new_p's and new_q's// end up packed the same way. // --------------------------------------------// --------------------------------------------// || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||// --------------------------------------------// || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||// --------------------------------------------// || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||// --------------------------------------------// || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||// --------------------------------------------// --------------------------------------------//------------------------------------------------------------------( // Boundary Strength vec uint32x1 bs_is_not_0(in), // as per the standard(in), replicated for each byte(in) vec uint8x4 alpha(in), vec uint8x4 beta(in), vec uint16x2 tc0(in), // Input: block to the left of this block(in) vec uint8x4 p2(in), vec uint8x4 p1(in), vec uint8x4 p0(in), // Input: the current block(in) vec uint8x4 q0(in), vec uint8x4 q1(in), vec uint8x4 q2(in), // Input vec bool32x1 dont_filter_edges(in), // Output: p1,p0 are updated vec uint8x4 new_p1(out), vec uint8x4 new_p0(out), // Output: q0(out), q1 are updated(out) vec uint8x4 new_q0(out), vec uint8x4 new_q1(out) )//------------------------------------------------------------------{ /////////////////////////////////////////////////////////////////////////// // Preliminaries /////////////////////////////////////////////////////////////////////////// // For storing outputs of filtering processes. These will be // muxed with select()s later based on the actual value of bs. vec uint8x4 p0_out_bs123, p1_out_bs123; vec uint8x4 q0_out_bs123, q1_out_bs123; // Calculate filt_samp_flag vec uint8x4 abd_p0_q0; vec bool8x4 p1_term; vec bool8x4 filt_samp_flag; vec uint8x4 ap; vec bool8x4 ap_lt_beta; vec uint8x4 aq; vec bool8x4 aq_lt_beta; vec int16x2 p0_r23, p1_r23, p2_r23; vec int16x2 q0_r23, q1_r23, q2_r23; //vec int16x2 p2_r23_x2; //vec int16x2 p1_r23_x4; //vec int16x2 p0_r23_x4; //vec int16x2 q0_r23_x4; //vec int16x2 q1_r23_x4; //vec int16x2 q2_r23_x2; vec int16x2 p0_q0_p1_r23; vec int16x2 dot_prod_q1; vec int8x4 tmp_clip8x4; vec int16x2 tmp_clip16x2; vec uint16x2 tC; vec int16x2 dot_prod_delta; vec int16x2 delta; vec int16x2 dot_prod_p1; abd_p0_q0 = spi_vabd8u(p0, q0); p1_term = spi_vlt8u(spi_vabd8u(p1, p0), beta); filt_samp_flag = ((vec bool8x4)bs_is_not_0 & spi_vlt8u(abd_p0_q0, alpha) & spi_vlt8u(spi_vabd8u(q1, q0), beta) & (vec bool8x4)spi_vnot32(dont_filter_edges)) & p1_term; // Calculate ap and aq, although ap is only valid for the MB edge // clusters, and will have to be updated in Phase II ap = spi_vabd8u(p2, p0); ap_lt_beta = spi_vlt8u(ap, beta); aq = spi_vabd8u(q2, q0); aq_lt_beta = spi_vlt8u(aq, beta); // Upper bytes of p0 and q0 are guaranteed to be 0, so // suma works well in this scenario. p0_q0_p1_r23 = spi_vsuma8u(p0, q0, 0x00010001p2); // ********************** // Unpack input data // ********************** // In this implementation of loop filter top 2 bytes of // packed pixel are not used. p0_r23 = (vec int16x2)p0; p1_r23 = (vec int16x2)p1; p2_r23 = (vec int16x2)p2; q0_r23 = (vec int16x2)q0; q1_r23 = (vec int16x2)q1; q2_r23 = (vec int16x2)q2; // *********************************************************************** // filtering for p1, p0, q0, q1 for bs < 4 // *********************************************************************** //p2_r23_x2 = spi_vshift16(p2_r23,1); //p1_r23_x4 = spi_vshift16(p1_r23,2); //p0_r23_x4 = spi_vshift16(p0_r23,2); //q0_r23_x4 = spi_vshift16(q0_r23,2); //q1_r23_x4 = spi_vshift16(q1_r23,2); //q2_r23_x2 = spi_vshift16(q2_r23,1); // Arithmetic below is a little tricky because there is no explicit // translation of 16 bit results into 8 bits. 16 bit results generated by // by the math are cliiped to tC, since tC value is guranteed to be // 8 bits, delta values will be clipped to be 8 bits. For negative // delta values upper 8 bits are automatically zeroed out because of
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -