deblock_luma_kc.sc
        // Write bottoms out only after the corresponding array entries
        // are read as Tops. Instead of writing a dummy entry at index
        // 0, write after strip_size. Arrays passed into the kernel should
        // be declared large enough to support this.
        bot_out_idx = spi_vselect32(prev_prcoess_mb, in_idx_c - 1U, strip_size_p2)<<2;
        spi_array_write(in_out_frame_bot, bota_0, bot_out_idx);
        spi_array_write(in_out_frame_bot, bota_1, bot_out_idx+1U);
        spi_array_write(in_out_frame_bot, bota_2, bot_out_idx+2U);
        spi_array_write(in_out_frame_bot, bota_3, bot_out_idx+3U);

        // ****************************************************
        // Filter horizontal edges:
        // ****************************************************
        // Each cluster will process all 4 horizontal edges of
        // 2 columns of an MB. 2 clusters together will process
        // 4 horizontal edges.
        //load4(bs_a_b_tc_str, abtc_idx+4, bs, alpha_10, beta_10, tc0_3210);
        spi_read(bs_a_b_tc_str, bs);
        spi_read(bs_a_b_tc_str, alpha_10);
        spi_read(bs_a_b_tc_str, beta_10);
        spi_read(bs_a_b_tc_str, tc0_3210);

        alpha1 = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)alpha_10, 0);
        alpha0 = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)alpha_10, 0);
        beta1  = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)beta_10, 0);
        beta0  = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)beta_10, 0);
        tc0_2  = (vec uint16x2)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)tc0_3210, 0);
        tc0_0  = (vec uint16x2)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)tc0_3210, 0);
        tc0_3  = (vec uint16x2)spi_vshuffledu_hi(0xB931B931, (vec uint32x1)tc0_3210, 0);
        tc0_1  = (vec uint16x2)spi_vshuffledu_lo(0xB931B931, (vec uint32x1)tc0_3210, 0);

        // Filter horizontal edges
        filter_luma_edges(bs,
                          alpha0, alpha1, alpha1, alpha1,
                          beta0, beta1, beta1, beta1,
                          tc0_0, tc0_1, tc0_2, tc0_3,
                          topa_0, topa_1, topa_2, topa_3,
                          cur0_0, cur0_1, cur0_2, cur0_3,
                          cur1_0, cur1_1, cur1_2, cur1_3,
                          cur2_0, cur2_1, cur2_2, cur2_3,
                          cur3_0, cur3_1, cur3_2, cur3_3,
                          dont_filter_horz_edges,
                          topa_1, topa_2, topa_3,
                          cur0_0, cur0_1, cur0_2, cur0_3,
                          cur1_0, cur1_1, cur1_2, cur1_3,
                          cur2_0, cur2_1, cur2_2, cur2_3,
                          cur3_0, cur3_1, cur3_2, cur3_3);

        // Write top out as is. This top will be merged into the current
        // and top strips by the post data munge kernel.
        //top_out_idx = spi_vselect32(process_mb, in_idx_c, strip_size_p2)<<2;
        // Add out_frame_inter offset to reach out_frame_top
        top_out_idx = spi_vadd32i(out_frame_top_offset,
                                  spi_vselect32(process_mb, in_idx_c, strip_size_p2)<<2);
        // out_frame_top is merged to be a substream of out_frame_inter
        spi_array_write(out_frame_inter/*top*/, topa_0, top_out_idx);
        spi_array_write(out_frame_inter/*top*/, topa_1, top_out_idx+1U);
        spi_array_write(out_frame_inter/*top*/, topa_2, top_out_idx+2U);
        spi_array_write(out_frame_inter/*top*/, topa_3, top_out_idx+3U);

        // Set bottom to the bottom pixels of the current MB. Bottom
        // is written out only after the next MB vertical edges are
        // done (see above).
        bota_0 = cur3_0;
        bota_1 = cur3_1;
        bota_2 = cur3_2;
        bota_3 = cur3_3;
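        // For reference, the per-sample operation that the filter_luma_edges
        // call above vectorizes is the normative H.264 luma edge filter. A
        // scalar sketch of the bS < 4 path (Clip1/Clip3 as in the spec;
        // p2..p0 / q0..q2 straddle the edge; ap/aq are the |p2-p0| < beta
        // and |q2-q0| < beta flags):
        //
        //   if (abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta) {
        //       int tc    = tc0 + ap + aq;
        //       int delta = Clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
        //       P0 = Clip1(p0 + delta);
        //       Q0 = Clip1(q0 - delta);
        //       if (ap) P1 = p1 + Clip3(-tc0, tc0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1);
        //       if (aq) Q1 = q1 + Clip3(-tc0, tc0, (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1);
        //   }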
        // ****************************************************
        // Store data
        // ****************************************************
        // Get the current MB data back into the original order
        // where each cluster has 2 rows worth of data.
        transpose_macroblock16x2(cid_b2, cid_b1, cid_b0,
                                 perm_b2, perm_b1, perm_b0,
                                 cur0_0, cur0_1, cur0_2, cur0_3,
                                 cur1_0, cur1_1, cur1_2, cur1_3,
                                 cur2_0, cur2_1, cur2_2, cur2_3,
                                 cur3_0, cur3_1, cur3_2, cur3_3,
                                 t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3,
                                 t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3,
                                 t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3,
                                 t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3);
        transpose_block_from_sparse(t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3, cur0_3210_0, cur0_3210_1);
        transpose_block_from_sparse(t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3, cur1_3210_0, cur1_3210_1);
        transpose_block_from_sparse(t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3, cur2_3210_0, cur2_3210_1);
        transpose_block_from_sparse(t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3, cur3_3210_0, cur3_3210_1);

        // Set up the lefts for the next iteration
        t_left_0 = t_cur3_0;
        t_left_1 = t_cur3_1;
        t_left_2 = t_cur3_2;
        t_left_3 = t_cur3_3;

        // Left pixels are written out only if the previous MB was
        // actually processed (not a wavefront padded MB).
        left_out_idx = (spi_vselect32(prev_prcoess_mb, in_idx_c-1U, strip_size_p2)<<3)+6U;
        spi_array_write(out_frame_inter, left_0_3, left_out_idx);
        spi_array_write(out_frame_inter, left_1_3, left_out_idx+1U);

        // Write the first 3 blocks; the last block will be written out
        // as left in the next iteration.
        cur_out_idx = spi_vselect32(process_mb, in_idx_c, strip_size_p2)<<3;
        spi_array_write(out_frame_inter, cur0_3210_0, cur_out_idx);
        spi_array_write(out_frame_inter, cur0_3210_1, cur_out_idx+1U);
        spi_array_write(out_frame_inter, cur1_3210_0, cur_out_idx+2U);
        spi_array_write(out_frame_inter, cur1_3210_1, cur_out_idx+3U);
        spi_array_write(out_frame_inter, cur2_3210_0, cur_out_idx+4U);
        spi_array_write(out_frame_inter, cur2_3210_1, cur_out_idx+5U);

        in_idx_c = spi_vselect32(process_mb, in_idx_c + 1U, in_idx_c);
        loop_ctr = loop_ctr + 1U;
        prev_prcoess_mb = process_mb;
        s_no_of_iter = (int32x1)s_no_of_iter - 1;
    }

    // Write the left out before exiting the loop
    transpose_block_from_sparse(t_left_0, t_left_1, t_left_2, t_left_3, left_0_3, left_1_3);
    left_out_idx = (spi_vselect32(prev_prcoess_mb, (in_idx_c-1U), strip_size_p2)<<3)+6U;
    spi_array_write(out_frame_inter, left_0_3, left_out_idx);
    spi_array_write(out_frame_inter, left_1_3, left_out_idx+1U);

    // Write the bottoms out for the last MB processed.
    bot_out_idx = spi_vselect32(prev_prcoess_mb, (in_idx_c - 1U), strip_size_p2)<<2;
    spi_array_write(in_out_frame_bot, bota_0, bot_out_idx);
    spi_array_write(in_out_frame_bot, bota_1, bot_out_idx+1U);
    spi_array_write(in_out_frame_bot, bota_2, bot_out_idx+2U);
    spi_array_write(in_out_frame_bot, bota_3, bot_out_idx+3U);

    post_deblock_data_munge_inline(out_frame_inter,
                                   //out_frame_top,
                                   spi_sub32i(orig_s_no_of_iter, 2),
                                   in_frame,
                                   post_data_munge_out,
                                   s_filter_first_mb_vert_edges,
                                   s_filter_last_mb_horz_edges);
}

// All data munging functions exist to convert between the data format that
// is efficient for DMAing and the one that is efficient for actual
// processing. The deblocking kernel is written assuming data in the LRF is
// available in a processing-efficient format with minimum use of receives.
// But if data were DMAed directly in that processing-efficient format, EMIF
// utilization would be very poor. To bridge these needs, data is DMAed in a
// DMA-efficient format and data munging kernels then convert it into the
// processing-efficient format. This is reversed when data is written back
// to external memory. These kernels also handle the merging of top strip
// pixels, modified by current strip deblocking, with the top strip. The top
// strip can be written out only after this merge is complete.
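// A rough, plain-C sketch of the conversion idea (the real layouts are fixed
// by the DMA descriptors and the kernels in this file; this hypothetical
// helper only shows the shape of the transform). DMA-efficient data arrives
// as whole raster rows; the processing-efficient layout deals rows
// round-robin across clusters so each cluster owns an equal share of rows.
// Assumes rows is a multiple of n_clusters.
static void munge_rows_to_clusters(const unsigned char *dma_buf,
                                   unsigned char *lrf_buf,
                                   int rows, int row_bytes, int n_clusters)
{
    /* Row r in DMA order becomes slot (r / n_clusters) owned by cluster
     * (r % n_clusters); swapping src and dst gives the reverse munge
     * used when writing back to external memory. */
    for (int r = 0; r < rows; r++) {
        int cluster = r % n_clusters;
        int slot    = r / n_clusters;
        for (int b = 0; b < row_bytes; b++)
            lrf_buf[(cluster * (rows / n_clusters) + slot) * row_bytes + b] =
                dma_buf[r * row_bytes + b];
    }
}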
//--------------------------------------------------------------------
// FunctionName: post_deblock_data_munge
//--------------------------------------------------------------------
// Merge the bottom pixels of each MB row updated during the processing
// of the MB row below. Since each strip actually consists of 2 MB rows,
// pixels of both the current strip and the previous strip get updated
// during the filtering of the current MB strip. Merge these updated
// pixels into the current and top strips.
//--------------------------------------------------------------------
inline kernel void post_deblock_data_munge_inline(
    // Input/output pixel data
    stream uint8x4 in_frame(array_io),
    //stream uint8x4 in_frame_top_inter(array_io),
    // No of iterations
    uint32x1 no_of_iter(in),
    stream uint8x4 out_frame(array_io),
    // Input/output pixel data for TOP
    stream uint8x4 in_out_frame_top(array_io),
    uint32x1 s_filter_first_mb_vert_edges(in),
    uint32x1 s_filter_last_mb_horz_edges(in)
    )
{
    vec uint8x4 topa_0, topa_1, topa_2, topa_3;
    vec uint8x4 top0_32, top0_10, top1_32, top1_10, top0, top1;
    vec uint8x4 top0_32_1, top0_10_1, top1_32_1, top1_10_1;
    vec uint8x4 cur0_0, cur0_1, cur1_0, cur1_1;
    vec uint8x4 cur2_0, cur2_1, cur3_0, cur3_1;
    vec uint8x4 top0_0, top0_1, top1_0, top1_1;
    vec uint8x4 top2_0, top2_1, top3_0, top3_1;
    vec uint8x4 pud0_0, pud0_1, pud1_0, pud1_1;
    vec uint8x4 pud2_0, pud2_1, pud3_0, pud3_1;

    // Transpose control
    vec bool32x1 cid_b2;
    vec bool32x1 cid_b1;
    vec bool32x1 cid_b0;
    vec uint32x1 perm_b3;
    vec uint32x1 perm_b2;
    vec uint32x1 perm_b1;
    vec uint32x1 perm_b0;
    vec uint32x1 perm_cid_a9;
    vec uint32x1 perm_cid_aB;
    vec uint32x1 perm_cid_aD;
    vec bool32x1 cid_6_7_14_15;
    vec bool32x1 cid_6_7;
    vec bool32x1 cid_14_15;
    vec uint32x1 in_out_idx;
    vec uint32x1 in_idx;
    vec uint32x1 pitch;
    vec uint32x1 in_fti_idx;
    vec uint8x4 left_pad_0, left_pad_1;
    vec uint8x4 right_pad_0, right_pad_1;
    // Offset to reach in_frame_top_inter in the in_frame stream
    vec int32x1 in_frame_topi_offset;

    // Ignore the extra MB padded on the left
    in_out_idx = 4;
    in_idx = 0;

    cid_b2 = spi_vne32((spi_laneid() & 4), 0);
    cid_b1 = spi_vne32((spi_laneid() & 2), 0);
    cid_b0 = spi_vne32((spi_laneid() & 1), 0);
    perm_b3 = spi_laneid() ^ 0x08;
    perm_b2 = spi_laneid() ^ 0x04;
    perm_b1 = spi_laneid() ^ 0x02;
    perm_b0 = spi_laneid() ^ 0x01;
    perm_cid_a9 = spi_laneid() & 0x9;
    perm_cid_aB = spi_laneid() & 0xB;
    perm_cid_aD = spi_laneid() & 0xD;
    cid_6_7_14_15 = spi_vle32u(6, spi_laneid()&7);
    cid_6_7 = spi_veq32(spi_laneid(), 6) | spi_veq32(spi_laneid(), 7);
    cid_14_15 = spi_veq32(spi_laneid(), 14) | spi_veq32(spi_laneid(), 15);

    pitch = spi_shift32(spi_add32i(no_of_iter, 2), 2);
    // Offset to reach in_frame_top_inter in the in_frame stream
    in_frame_topi_offset = spi_vshift32(spi_vadd32i(no_of_iter, 3), 3);
    in_fti_idx = in_frame_topi_offset;

    // Pad left and right for both out_frame and in_out_frame_top
    if (s_filter_first_mb_vert_edges != 0){
        // Load top intermediates
        spi_array_read(in_frame/*_top_inter*/, topa_0, in_fti_idx);
        spi_array_read(in_frame/*_top_inter*/, topa_1, in_fti_idx + (vec uint32x1)1);
        spi_array_read(in_frame/*_top_inter*/, topa_2, in_fti_idx + (vec uint32x1)2);
        spi_array_read(in_frame/*_top_inter*/, topa_3, in_fti_idx + (vec uint32x1)3);

        // Each cluster has 4 lines of the top MB (2 pixels per row).
        // Rearrange the pixels such that each cluster will have 2 full
        // rows (4 pixels) to write out. Swap data with neighboring
        // clusters.
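        // (perm_b0 = laneid ^ 1 pairs each cluster with its immediate
        // neighbor, and cid_b0 is true on odd clusters, so the vselect/vperm
        // sequence below exchanges 2-pixel halves between the two clusters
        // of each pair and stitches them into full 4-pixel rows.)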
        top0_32 = (vec uint8x4)spi_vperm32(perm_b0,
                      (vec uint32x1)spi_vselect8((vec uint8x4)cid_b0, topa_0, topa_2), 0);
        top1_32 = (vec uint8x4)spi_vperm32(perm_b0,
                      (vec uint32x1)spi_vselect8((vec uint8x4)cid_b0, topa_1, topa_3), 0);
        top0_10 = spi_vselect8((vec uint8x4)cid_b0, topa_2, topa_0);
        top1_10 = spi_vselect8((vec uint8x4)cid_b0, topa_3, topa_1);
        top0_32_1 = spi_vselect8((vec uint8x4)cid_b0, top0_10, top0_32);
        top0_10_1 = spi_vselect8((vec uint8x4)cid_b0, top0_32, top0_10);