📄 deblock_luma_kc.sc

📁 deblocking 在SPI DSP平台优化好的代码,超级强
💻 SC
📖 第 1 页 / 共 4 页
字号:
    vec uint32x1 top_in_idxc, bot_out_idxc, top_out_idxc, left_out_idxc, cur_in_idxc;    vec uint8x4 topac_2, topac_3, botac_2, botac_3;    vec uint8x4 b_tc1_a_tc0_v, b_tc1_a_tc0_h;    vec uint8x4 tmp1_8x4u, tmp0_8x4u;    vec uint8x4 data_right;    vec uint32x1 in_idx_c;    vec bool32x1 prev_prcoess_mb;    vec uint32x1 loop_ctr;    vec uint32x1 abtc_idx;    vec uint8x4 data_bottom;    int32x1 s_filter_first_mb_vert_edges;    int32x1 s_filter_last_mb_horz_edges;    uint32x1 s_no_of_iter;    uint32x1 orig_s_no_of_iter;    int32x1 i;    // Offset to reach out_frame_top in out_frame_inter stream    vec int32x1 out_frame_top_offset;        in_idx_c = 0;    prev_prcoess_mb = 0;    loop_ctr = 0;    abtc_idx = 0;        data_prev_c_0 = 0p4;    data_prev_c_1 = 0p4;    data_prev_c_2 = 0p4;    data_prev_c_3 = 0p4;    t_left_0 = t_left_1 = t_left_2 = t_left_3 = 0p4;    bota_0 = bota_1 = bota_2 = bota_3 = 0p4;    s_filter_first_mb_vert_edges = spi_shuffledu_hi(0x88888832,                                                    s_packed_fmbve_lmbhe_iter, 0);    s_filter_last_mb_horz_edges  = spi_shuffledu_lo(0x88888832,                                                    s_packed_fmbve_lmbhe_iter, 0);    s_no_of_iter = spi_shuffleu(0x08080800, s_packed_fmbve_lmbhe_iter, 0);        strip_size_p2 = s_no_of_iter;    orig_s_no_of_iter = s_no_of_iter;    in_pitch = strip_size_p2 << 2;    out_frame_top_offset = spi_vshift32(spi_vadd32i(strip_size_p2, 1), 3);    cid_6_7_14_15 = spi_vle32u(6U, spi_laneid()&7);    cid_lt_8 = spi_vlt32u(spi_laneid(), 8);        // Permuations for cluster communication    perm_get_left_c  = ((spi_laneid() - 1) & 0x1) + (spi_laneid() & 0xe);    //0 - 10, 1 - 11, 2 - 0, 3 - 1, 4 - 14, 5 - 15, 6 - 4, 7 - 5,     //8 - 2, 9 - 3, 10 - 8, 11 - 9, 12 - 6, 13 - 7, 14 - 12, 15 - 13     perm_get_top_c   = ((spi_laneid()&0x2) == 2) ?        spi_laneid()-2 : (spi_laneid()+10)&0xF;    //0 - 2, 1 - 3, 2 - 8, 3 - 9, 4 - 6, 5 - 7, 6 - 12, 7 - 13    //8 - 10, 9 - 11, 10 - 0, 11 - 1, 12 - 14, 13 - 15, 14 - 4, 15 - 5    perm_get_bottom_c = (((spi_laneid()&0x2) == 2) ?                         spi_laneid()+6 : spi_laneid()+2)&0xF;    perm_get_right_c  = ((spi_laneid() + 1) & 0x1) + (spi_laneid() & 0xe);    // Conditions for selecting subsets of 4x4 blocks (clusters)    left_edge_c   = spi_veq32((spi_laneid() & 1), 0);    right_edge_c  = spi_veq32((spi_laneid() & 1), 1);    top_edge_c    = spi_veq32((spi_laneid() & 2), 0);    bottom_edge_c = spi_veq32((spi_laneid() & 2), 2);        cid_b2  = spi_vne32((spi_laneid() & 4), 0);    cid_b1  = spi_vne32((spi_laneid() & 2), 0);    cid_b0  = spi_vne32((spi_laneid() & 1), 0);    perm_b3 = spi_laneid() ^ 0x08;    perm_b2 = spi_laneid() ^ 0x04;    perm_b1 = spi_laneid() ^ 0x02;    perm_b0 = spi_laneid() ^ 0x01;        while(s_no_of_iter){        vec bool32x1 dont_filter_horz_edges;        vec bool32x1 dont_filter_vert_edges;        vec bool32x1 process_mb;                // Create a lag of 2 MBs between MB processed by clusters        // 0-7 and MB processed by clusters 8-15.        process_mb = (spi_vgt32u(loop_ctr, 1U) & (spi_vnot32(cid_lt_8))) |            (spi_vlt32u(loop_ctr, (strip_size_p2 - 2U)) & (cid_lt_8));                // Do not filter vertical edges if this MB is the first in strip and        // not the first in a MB row or if this MB is the padding MB used for        // wave front approach in 16 clusters.        dont_filter_vert_edges = ((((spi_veq32(loop_ctr, 0) & (cid_lt_8)) |                                    (spi_veq32(loop_ctr, 2) & spi_vnot32(cid_lt_8))) &                                   spi_eq32(s_filter_first_mb_vert_edges, 0)) |                                  spi_vnot32(process_mb));        dont_filter_horz_edges = ((((spi_veq32(loop_ctr, (strip_size_p2 - 3U)) & (cid_lt_8)) |                                    (spi_veq32(loop_ctr, (strip_size_p2 - 1U)) & spi_vnot32(cid_lt_8))) &                                   spi_eq32(s_filter_last_mb_horz_edges, 0)) |                                  spi_vnot32(process_mb));        // in_idx_c * 12        abtc_idx = (in_idx_c<<3) + (in_idx_c<<2);                // If chroma is not processed in the same kernel as luma,         // skip over the BS/Alpha/Beta/tC0 entries related to Chroma.        spi_read(bs_a_b_tc_str, tmp0_8x4u);        spi_read(bs_a_b_tc_str, b_tc1_a_tc0_v);        spi_read(bs_a_b_tc_str, tmp1_8x4u);        spi_read(bs_a_b_tc_str, b_tc1_a_tc0_h);                // Process Luma blocks        ////////////////////////////////////////////////////        // Load data        ////////////////////////////////////////////////////        // Load 2 rows of 16 pixels into each cluster. Input        // Data is organized such that first 8 clusters will get        // top MB row and clusters 8-15 get the data from bottom        // MB row. Each row of pixels in LRF are separated by         // pitch.        // FFD doesn't seem like reading past the size of         // stream. Would this be an issue with hardware.        // FIXME, may be this is not needed in hardware        cur_in_idx = spi_vselect32(process_mb, in_idx_c, 0)<<2;        // Input strip contains extra MB on both left and right side        // of the strip. Ignore the padded MB while reading        spi_array_read(in_frame, cur0_3210_0, cur_in_idx+4U);        spi_array_read(in_frame, cur1_3210_0, cur_in_idx+5U);        spi_array_read(in_frame, cur2_3210_0, cur_in_idx+6U);        spi_array_read(in_frame, cur3_3210_0, cur_in_idx+7U);                spi_array_read(in_frame, cur0_3210_1, cur_in_idx+in_pitch+4U);        spi_array_read(in_frame, cur1_3210_1, cur_in_idx+in_pitch+5U);        spi_array_read(in_frame, cur2_3210_1, cur_in_idx+in_pitch+6U);        spi_array_read(in_frame, cur3_3210_1, cur_in_idx+in_pitch+7U);                 // ****************************************************        // Transpose horizontals to verticals        // ****************************************************        // Each cluster has only 2 lines per block to process. These        // 2 lines are transposed to 4 sparse lines. Bytes 0 and 2        // are used.        transpose_block_to_sparse(cur0_3210_0, cur0_3210_1,                                  t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3);        transpose_block_to_sparse(cur1_3210_0, cur1_3210_1,                                  t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3);        transpose_block_to_sparse(cur2_3210_0, cur2_3210_1,                                  t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3);        transpose_block_to_sparse(cur3_3210_0, cur3_3210_1,                                  t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3);        // ****************************************************        // Filter vertical edges        // ****************************************************        // Load packed BS/Alpha/Beta/tC0 for vertical edge        //load4(bs_a_b_tc_str, abtc_idx, bs, alpha_10, beta_10, tc0_3210);        spi_read(bs_a_b_tc_str, bs);        spi_read(bs_a_b_tc_str, alpha_10);        spi_read(bs_a_b_tc_str, beta_10);        spi_read(bs_a_b_tc_str, tc0_3210);                 alpha1 = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)alpha_10, 0);        alpha0 = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)alpha_10, 0);                 beta1 = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)beta_10, 0);        beta0 = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)beta_10, 0);                tc0_2 = (vec uint16x2)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)tc0_3210, 0);        tc0_0 = (vec uint16x2)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)tc0_3210, 0);                tc0_3 = (vec uint16x2)spi_vshuffledu_hi(0xB931B931, (vec uint32x1)tc0_3210, 0);        tc0_1 = (vec uint16x2)spi_vshuffledu_lo(0xB931B931, (vec uint32x1)tc0_3210, 0);        // Filter vertical edges.        // New langauge internal kernel functions doesn't support io type        // vectors. Same variables are used for input and output arguments.        filter_luma_edges(bs,                          alpha0, alpha1, alpha1, alpha1,                          beta0, beta1, beta1, beta1,                          tc0_0, tc0_1, tc0_2, tc0_3,                          t_left_0, t_left_1, t_left_2, t_left_3,                          t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3,                          t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3,                          t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3,                          t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3,                          dont_filter_vert_edges,                          t_left_1, t_left_2, t_left_3,                          t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3,                          t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3,                          t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3,                          t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3                          );                // ****************************************************        // Transpose and output left blocks:        // ****************************************************        transpose_block_from_sparse(t_left_0, t_left_1, t_left_2, t_left_3,                                    left_0_3, left_1_3);                // Vertical edge filtering could have modified left blocks,         // some of these left block pixels are also bottom pixels of        // previous MB(in clusters 6,7,14,15). Update these pixels to        // ensure bottom MB row of current strip can indeed get updated        // Top values from top MB row of current strip.        bot0 = left_0_3;         bot1 = left_1_3;                // Each cluster 2 rows from left block, swap these with neighboring        // clusters and transpose to create 2 vertical lines per cluster.        bot1_tmp = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)bot0, (vec uint32x1)bot1);        bot0_tmp = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)bot0, (vec uint32x1)bot1);                bot1 = spi_vselect8((vec uint8x4)cid_b0, bot1_tmp, bot0_tmp);        bot0 = spi_vselect8((vec uint8x4)cid_b0, bot0_tmp, bot1_tmp);                bot1 = (vec uint8x4)spi_vperm32(perm_b0, (vec uint32x1)bot1, 0);        bot1_tmp = bot1;        bot0_tmp = bot0;                bot1 = spi_vselect8((vec uint8x4)cid_b0, bot0_tmp, bot1_tmp);        bot0 = spi_vselect8((vec uint8x4)cid_b0, bot1_tmp, bot0_tmp);                bota_1u = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)bot0, 0);        bota_0u = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)bot0, 0);        bota_3u = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)bot1, 0);        bota_2u = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)bot1, 0);        // This update should only happen for clusters 6,7,14,15.        bota_0 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_0u, bota_0);        bota_1 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_1u, bota_1);        bota_2 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_2u, bota_2);        bota_3 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_3u, bota_3);                // ****************************************************        // 8-cluster vertical to horizontal transpose: After        // the transpose each cluster will have 2 columns of pixels        // of the full MB in place of 2 rows cluster originally had.         // In addition to transpose, upper bytes 1 and 3 of each word        // are to be padded with zeros.        // ****************************************************        transpose_macroblock16x2(cid_b2, cid_b1, cid_b0, perm_b2, perm_b1, perm_b0,                                 t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3,                                 t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3,                                 t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3,                                 t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3,                                 cur0_0, cur0_1, cur0_2, cur0_3,                                 cur1_0, cur1_1, cur1_2, cur1_3,                                 cur2_0, cur2_1, cur2_2, cur2_3,                                 cur3_0, cur3_1, cur3_2, cur3_3);                //Read top from the stream which was stored        //as bottom in previous strip.        top_in_idx = spi_vselect32((cid_lt_8 & spi_vle32u(2U, loop_ctr)), loop_ctr - 2U, loop_ctr)<<2;        spi_array_read(in_out_frame_bot, topa_0, top_in_idx);        spi_array_read(in_out_frame_bot, topa_1, top_in_idx+1U);        spi_array_read(in_out_frame_bot, topa_2, top_in_idx+2U);        spi_array_read(in_out_frame_bot, topa_3, top_in_idx+3U);                // Bottoms written out by clusters 8-15 in previous strip        // are used as Top for current strip in clusters 0-7. Swap        // the bottoms.        topa_0 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_0, 0);        topa_1 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_1, 0);        topa_2 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_2, 0);        topa_3 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_3, 0);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -