📄 filter_block_kc.h

📁 deblocking 在SPI DSP平台优化好的代码,超级强
💻 H
📖 第 1 页 / 共 4 页
字号:
//                CoeffP, CoeffQ: 4x1 vectors//                add_val:         Scalar//                shift_val:       Amount to shift dotproduct results//                                by for scaling (must be a negative//                                shift amount to scale down)//  Outputs:      none//  Return Value: SumVec:         4x1 vector//------------------------------------------------------------------// Signed data, coefficients and resultsinline kernel void matrix_mult_int4x4(vec uint8x4 t_3(in), vec uint8x4 t_2(in),                                      vec uint8x4 t_1(in), vec uint8x4 t_0(in),                                       vec int8x4 coeff(in), vec int32x1 add_val(in),                                      vec int32x1 shift_val(in), vec int8x4 ret_val(out)){    ret_val = spi_vclip16i(spi_vshifta16(spi_vclip32i(spi_vdotpa8ui(t_3, coeff, add_val),                                                      spi_vdotpa8ui(t_2, coeff, add_val)),                                         shift_val),                           spi_vshifta16(spi_vclip32i(spi_vdotpa8ui(t_1, coeff, add_val),                                                      spi_vdotpa8ui(t_0, coeff, add_val)),                                         shift_val));}// Unsigned data, coefficients and resultsinline kernel void matrix_mult_uint4x4(vec uint8x4 t_3(in), vec uint8x4 t_2(in),                                        vec uint8x4 t_1(in), vec uint8x4 t_0(in),                                        vec uint8x4 coeff(in), vec uint32x1 add_val(in),                                        vec int32x1 shift_val(in), vec uint8x4 ret_val(out)){    ret_val = spi_vclip16u(spi_vshift16(spi_vclip32u(spi_vdotpa8u(t_3, coeff, add_val),                                                     spi_vdotpa8u(t_2, coeff, add_val)),                                        shift_val),                           spi_vshift16(spi_vclip32u(spi_vdotpa8u(t_1, coeff, add_val),                                                     spi_vdotpa8u(t_0, coeff, add_val)),                                        shift_val));}//------------------------------------------------------------------inline kernel void transpose_block//------------------------------------------------------------------// KernelC inline function to transpose the elements of a 4x4 block.// It accepts four elements, each four bytes wide.  Transpose, so that// the first output gets the first byte from each input, the second// output gets the second byte from each input, etc.//--------------------------------------------------------------------( // Input: block to be transformed vec uint8x4 data0(in), vec uint8x4 data1(in),  vec uint8x4 data2(in), vec uint8x4 data3(in),  // Output: transposed block vec uint8x4 t_data0(out), vec uint8x4 t_data1(out),  vec uint8x4 t_data2(out), vec uint8x4 t_data3(out) )//--------------------------------------------------------------------{    vec uint8x4 tmp_data0, tmp_data1, tmp_data2, tmp_data3;    tmp_data1 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1);    tmp_data0 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1);    tmp_data3 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3);    tmp_data2 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3);    t_data2   = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2);    t_data0   = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2);    t_data3   = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)tmp_data1, (vec uint32x1)tmp_data3);    t_data1   = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)tmp_data1, (vec uint32x1)tmp_data3);}//------------------------------------------------------------------inline kernel void transpose_block_from_sparse//------------------------------------------------------------------// KernelC inline function to transpose the elements of a // sparse 4x4 block producing only 2 words as output. This // is useful in luma deblocking as transposed block has only// 2 pixels packed into a word in half word positions (bytes 0 and 2). // See below for full 4x4 transpose.//--------------------------------------------------------------------( // Input: block to be transformed vec uint8x4 data0(in), vec uint8x4 data1(in),  vec uint8x4 data2(in), vec uint8x4 data3(in),  // Output: transposed block vec uint8x4 t_data0(out), vec uint8x4 t_data2(out) )//--------------------------------------------------------------------{    vec uint8x4 tmp_data0, tmp_data1, tmp_data2, tmp_data3;    tmp_data1 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1);    tmp_data0 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data0, (vec uint32x1)data1);    tmp_data3 = (vec uint8x4)spi_vshuffledu_hi(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3);    tmp_data2 = (vec uint8x4)spi_vshuffledu_lo(0x76325410, (vec uint32x1)data2, (vec uint32x1)data3);    t_data2 = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2);    t_data0 = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)tmp_data0, (vec uint32x1)tmp_data2);}//------------------------------------------------------------------inline kernel void transpose_block_to_sparse//------------------------------------------------------------------// KernelC inline function to transpose the elements of a // 2 input words into sparse 4x4 block as output. This // is useful in luma deblocking as transposed block has only// 2 pixels packed into a word in half word positions (bytes 0 and 2). // See above for full 4x4 transpose.//--------------------------------------------------------------------( // Input: block to be transformed vec uint8x4 data0(in),  vec uint8x4 data1(in),  // Output: transposed block vec uint8x4 t_data0(out), vec uint8x4 t_data1(out),  vec uint8x4 t_data2(out), vec uint8x4 t_data3(out) )//--------------------------------------------------------------------{    t_data1 = (vec uint8x4)spi_vshuffledu_hi(0x88108854, (vec uint32x1)data1, (vec uint32x1)data0);    t_data0 = (vec uint8x4)spi_vshuffledu_lo(0x88108854, (vec uint32x1)data1, (vec uint32x1)data0);    t_data3 = (vec uint8x4)spi_vshuffledu_hi(0x88328876, (vec uint32x1)data1, (vec uint32x1)data0);    t_data2 = (vec uint8x4)spi_vshuffledu_lo(0x88328876, (vec uint32x1)data1, (vec uint32x1)data0);}inline kernel void edge_filt_int_clip(vec int16x2 sum_r01(in), vec int16x2 sum_r23(in),                                vec int32x1 shift_val(in), vec int8x4 ret_val(out)){    ret_val = spi_vclip16i(spi_vshifta16(sum_r01, shift_val),                           spi_vshifta16(sum_r23, shift_val));}inline kernel void edge_filt_uint_clip(vec uint16x2 sum_r01(in), vec uint16x2 sum_r23(in),                                        vec int32x1 shift_val(in), vec uint8x4 ret_val(out)){    ret_val = spi_vclip16u(spi_vshift16(sum_r01, shift_val),                           spi_vshift16(sum_r23, shift_val));}//------------------------------------------------------------------inline kernel void filter_luma_edge_bs123//------------------------------------------------------------------//    KernelC inline functions to perform deblocking filter on //    luma vertical edges b,c,d (i.e. cover the bs<4 cases only).  ////    Each cluster does the vertical edges for one 4x4 block.  The //    inputs come in as packed 8-bit data for two blocks,//    four rows per word as shown below.  The output new_p's and new_q's//    end up packed the same way.  //    --------------------------------------------//    --------------------------------------------//    || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||//    --------------------------------------------//    || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||//    --------------------------------------------//    || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||//    --------------------------------------------//    || XX | p2 | p1 | p0 || q0 | q1 | q2 | XX ||//    --------------------------------------------//    --------------------------------------------//------------------------------------------------------------------( // Boundary Strength vec uint32x1 bs_is_not_0(in),  // as per the standard(in),  replicated for each byte(in)  vec uint8x4 alpha(in), vec uint8x4 beta(in), vec uint16x2 tc0(in),  // Input:  block to the left of this block(in)  vec uint8x4 p2(in), vec uint8x4 p1(in), vec uint8x4 p0(in), // Input:  the current block(in)  vec uint8x4 q0(in), vec uint8x4 q1(in), vec uint8x4 q2(in), // Input vec bool32x1 dont_filter_edges(in), // Output: p1,p0 are updated vec uint8x4 new_p1(out), vec uint8x4 new_p0(out), // Output: q0(out), q1 are updated(out)  vec uint8x4 new_q0(out), vec uint8x4 new_q1(out) )//------------------------------------------------------------------{    ///////////////////////////////////////////////////////////////////////////    // Preliminaries    ///////////////////////////////////////////////////////////////////////////    // For storing outputs of filtering processes.  These will be    // muxed with select()s later based on the actual value of bs.    vec uint8x4 p0_out_bs123, p1_out_bs123;    vec uint8x4 q0_out_bs123, q1_out_bs123;        // Calculate filt_samp_flag    vec uint8x4 abd_p0_q0;    vec bool8x4 p1_term;    vec bool8x4 filt_samp_flag;    vec uint8x4 ap;    vec bool8x4 ap_lt_beta;    vec uint8x4 aq;    vec bool8x4 aq_lt_beta;    vec int16x2 p0_r23, p1_r23, p2_r23;    vec int16x2 q0_r23, q1_r23, q2_r23;    //vec int16x2 p2_r23_x2;    //vec int16x2 p1_r23_x4;    //vec int16x2 p0_r23_x4;    //vec int16x2 q0_r23_x4;    //vec int16x2 q1_r23_x4;    //vec int16x2 q2_r23_x2;    vec int16x2 p0_q0_p1_r23;    vec int16x2 dot_prod_q1;    vec int8x4 tmp_clip8x4;    vec int16x2 tmp_clip16x2;    vec uint16x2 tC;    vec int16x2 dot_prod_delta;    vec int16x2 delta;    vec int16x2 dot_prod_p1;    abd_p0_q0 = spi_vabd8u(p0, q0);    p1_term   = spi_vlt8u(spi_vabd8u(p1, p0), beta);        filt_samp_flag = ((vec bool8x4)bs_is_not_0 &                      spi_vlt8u(abd_p0_q0, alpha) &                      spi_vlt8u(spi_vabd8u(q1, q0), beta) &                      (vec bool8x4)spi_vnot32(dont_filter_edges)) & p1_term;        // Calculate ap and aq, although ap is only valid for the MB edge    // clusters, and will have to be updated in Phase II    ap = spi_vabd8u(p2, p0);    ap_lt_beta = spi_vlt8u(ap, beta);    aq = spi_vabd8u(q2, q0);    aq_lt_beta = spi_vlt8u(aq, beta);    // Upper bytes of p0 and q0 are guaranteed to be 0, so    // suma works well in this scenario.    p0_q0_p1_r23 = spi_vsuma8u(p0, q0, 0x00010001p2);        // **********************    // Unpack input data    // **********************    // In this implementation of loop filter top 2 bytes of     // packed pixel are not used.    p0_r23 = (vec int16x2)p0;    p1_r23 = (vec int16x2)p1;    p2_r23 = (vec int16x2)p2;        q0_r23 = (vec int16x2)q0;    q1_r23 = (vec int16x2)q1;    q2_r23 = (vec int16x2)q2;        // ***********************************************************************    // filtering for p1, p0, q0, q1 for bs < 4    // ***********************************************************************        //p2_r23_x2 = spi_vshift16(p2_r23,1);    //p1_r23_x4 = spi_vshift16(p1_r23,2);    //p0_r23_x4 = spi_vshift16(p0_r23,2);    //q0_r23_x4 = spi_vshift16(q0_r23,2);    //q1_r23_x4 = spi_vshift16(q1_r23,2);    //q2_r23_x2 = spi_vshift16(q2_r23,1);    // Arithmetic below is a little tricky because there is no explicit    // translation of 16 bit results into 8 bits. 16 bit results generated by    // by the math are cliiped to tC, since tC value is guranteed to be    // 8 bits, delta values will be clipped to be 8 bits. For negative    // delta values upper 8 bits are automatically zeroed out because of
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -