deblock_luma_kc.sc
        // Write bottoms out only after the corresponding array entries
        // are read as Tops. Instead of writing a dummy entry at index
        // 0, write after strip_size. Arrays passed into the kernel should
        // be declared large enough to support this.
        bot_out_idx = spi_vselect32(prev_prcoess_mb, in_idx_c - 1U, strip_size_p2)<<2;
        spi_array_write(in_out_frame_bot, bota_0, bot_out_idx);
        spi_array_write(in_out_frame_bot, bota_1, bot_out_idx+1U);
        spi_array_write(in_out_frame_bot, bota_2, bot_out_idx+2U);
        spi_array_write(in_out_frame_bot, bota_3, bot_out_idx+3U);

        // ****************************************************
        // Filter horizontal edges:
        // ****************************************************
        // Each cluster will process all 4 horizontal edges of
        // 2 columns of an MB. 2 clusters together will process
        // 4 horizontal edges.
        //load4(bs_a_b_tc_str, abtc_idx+4, bs, alpha_10, beta_10, tc0_3210);
        spi_read(bs_a_b_tc_str, bs);
        spi_read(bs_a_b_tc_str, alpha_10);
        spi_read(bs_a_b_tc_str, beta_10);
        spi_read(bs_a_b_tc_str, tc0_3210);

        alpha1 = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)alpha_10, 0);
        alpha0 = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)alpha_10, 0);
        beta1  = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)beta_10, 0);
        beta0  = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)beta_10, 0);
        tc0_2  = (vec uint16x2)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)tc0_3210, 0);
        tc0_0  = (vec uint16x2)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)tc0_3210, 0);
        tc0_3  = (vec uint16x2)spi_vshuffledu_hi(0xB931B931, (vec uint32x1)tc0_3210, 0);
        tc0_1  = (vec uint16x2)spi_vshuffledu_lo(0xB931B931, (vec uint32x1)tc0_3210, 0);

        // Filter horizontal edges
        filter_luma_edges(bs,
                          alpha0, alpha1, alpha1, alpha1,
                          beta0, beta1, beta1, beta1,
                          tc0_0, tc0_1, tc0_2, tc0_3,
                          topa_0, topa_1, topa_2, topa_3,
                          cur0_0, cur0_1, cur0_2, cur0_3,
                          cur1_0, cur1_1, cur1_2, cur1_3,
                          cur2_0, cur2_1, cur2_2, cur2_3,
                          cur3_0, cur3_1, cur3_2, cur3_3,
                          dont_filter_horz_edges,
                          topa_1, topa_2, topa_3,
                          cur0_0, cur0_1, cur0_2, cur0_3,
                          cur1_0, cur1_1, cur1_2, cur1_3,
                          cur2_0, cur2_1, cur2_2, cur2_3,
                          cur3_0, cur3_1, cur3_2, cur3_3);

        // Write top out as is. This top will be merged into the current
        // and top strips by the post data munge kernel.
        //top_out_idx = spi_vselect32(process_mb, in_idx_c, strip_size_p2)<<2;
        // Add out_frame_inter offset to reach out_frame_top
        top_out_idx = spi_vadd32i(out_frame_top_offset,
                                  spi_vselect32(process_mb, in_idx_c, strip_size_p2)<<2);
        // out_frame_top is merged to be a substream of out_frame_inter
        spi_array_write(out_frame_inter/*top*/, topa_0, top_out_idx);
        spi_array_write(out_frame_inter/*top*/, topa_1, top_out_idx+1U);
        spi_array_write(out_frame_inter/*top*/, topa_2, top_out_idx+2U);
        spi_array_write(out_frame_inter/*top*/, topa_3, top_out_idx+3U);

        // Set bottom to the bottom pixels of the current MB. Bottom
        // is written out only after the next MB vertical edges are
        // done (see above).
        bota_0 = cur3_0;
        bota_1 = cur3_1;
        bota_2 = cur3_2;
        bota_3 = cur3_3;
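        // For reference, the per-sample operation that the filter_luma_edges
        // call above vectorizes is the normative H.264 luma edge filter. A
        // scalar sketch of the bS < 4 path (Clip1/Clip3 as in the spec;
        // p2..p0 / q0..q2 straddle the edge; ap/aq are the |p2-p0| < beta
        // and |q2-q0| < beta flags):
        //
        //   if (abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta) {
        //       int tc    = tc0 + ap + aq;
        //       int delta = Clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
        //       P0 = Clip1(p0 + delta);
        //       Q0 = Clip1(q0 - delta);
        //       if (ap) P1 = p1 + Clip3(-tc0, tc0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1);
        //       if (aq) Q1 = q1 + Clip3(-tc0, tc0, (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1);
        //   }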
        // ****************************************************
        // Store data
        // ****************************************************
        // Get the current MB data back into the original order
        // where each cluster has 2 rows worth of data.
        transpose_macroblock16x2(cid_b2, cid_b1, cid_b0,
                                 perm_b2, perm_b1, perm_b0,
                                 cur0_0, cur0_1, cur0_2, cur0_3,
                                 cur1_0, cur1_1, cur1_2, cur1_3,
                                 cur2_0, cur2_1, cur2_2, cur2_3,
                                 cur3_0, cur3_1, cur3_2, cur3_3,
                                 t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3,
                                 t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3,
                                 t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3,
                                 t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3);
        transpose_block_from_sparse(t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3, cur0_3210_0, cur0_3210_1);
        transpose_block_from_sparse(t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3, cur1_3210_0, cur1_3210_1);
        transpose_block_from_sparse(t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3, cur2_3210_0, cur2_3210_1);
        transpose_block_from_sparse(t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3, cur3_3210_0, cur3_3210_1);

        // Set up the lefts for the next iteration
        t_left_0 = t_cur3_0;
        t_left_1 = t_cur3_1;
        t_left_2 = t_cur3_2;
        t_left_3 = t_cur3_3;

        // Left pixels are written out only if the previous MB was
        // actually processed (not a wavefront padded MB).
        left_out_idx = (spi_vselect32(prev_prcoess_mb, in_idx_c-1U, strip_size_p2)<<3)+6U;
        spi_array_write(out_frame_inter, left_0_3, left_out_idx);
        spi_array_write(out_frame_inter, left_1_3, left_out_idx+1U);

        // Write the first 3 blocks; the last block will be written out
        // as left in the next iteration.
        cur_out_idx = spi_vselect32(process_mb, in_idx_c, strip_size_p2)<<3;
        spi_array_write(out_frame_inter, cur0_3210_0, cur_out_idx);
        spi_array_write(out_frame_inter, cur0_3210_1, cur_out_idx+1U);
        spi_array_write(out_frame_inter, cur1_3210_0, cur_out_idx+2U);
        spi_array_write(out_frame_inter, cur1_3210_1, cur_out_idx+3U);
        spi_array_write(out_frame_inter, cur2_3210_0, cur_out_idx+4U);
        spi_array_write(out_frame_inter, cur2_3210_1, cur_out_idx+5U);

        in_idx_c = spi_vselect32(process_mb, in_idx_c + 1U, in_idx_c);
        loop_ctr = loop_ctr + 1U;
        prev_prcoess_mb = process_mb;
        s_no_of_iter = (int32x1)s_no_of_iter - 1;
    }

    // Write the left out before exiting the loop
    transpose_block_from_sparse(t_left_0, t_left_1, t_left_2, t_left_3, left_0_3, left_1_3);
    left_out_idx = (spi_vselect32(prev_prcoess_mb, (in_idx_c-1U), strip_size_p2)<<3)+6U;
    spi_array_write(out_frame_inter, left_0_3, left_out_idx);
    spi_array_write(out_frame_inter, left_1_3, left_out_idx+1U);

    // Write the bottoms out for the last MB processed.
    bot_out_idx = spi_vselect32(prev_prcoess_mb, (in_idx_c - 1U), strip_size_p2)<<2;
    spi_array_write(in_out_frame_bot, bota_0, bot_out_idx);
    spi_array_write(in_out_frame_bot, bota_1, bot_out_idx+1U);
    spi_array_write(in_out_frame_bot, bota_2, bot_out_idx+2U);
    spi_array_write(in_out_frame_bot, bota_3, bot_out_idx+3U);

    post_deblock_data_munge_inline(out_frame_inter,
                                   //out_frame_top,
                                   spi_sub32i(orig_s_no_of_iter, 2),
                                   in_frame,
                                   post_data_munge_out,
                                   s_filter_first_mb_vert_edges,
                                   s_filter_last_mb_horz_edges);
}

// All data munging functions exist to convert between the data format that
// is efficient for DMAing and the one that is efficient for actual
// processing. The deblocking kernel is written assuming data in the LRF is
// available in a processing-efficient format with minimum use of receives.
// But if data were DMAed directly in that processing-efficient format, EMIF
// utilization would be very poor. To bridge these needs, data is DMAed in a
// DMA-efficient format and data munging kernels then convert it into the
// processing-efficient format. This is reversed when data is written back
// to external memory. These kernels also handle the merging of top strip
// pixels, modified by current strip deblocking, with the top strip. The top
// strip can be written out only after this merge is complete.
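// A rough, plain-C sketch of the conversion idea (the real layouts are fixed
// by the DMA descriptors and the kernels in this file; this hypothetical
// helper only shows the shape of the transform). DMA-efficient data arrives
// as whole raster rows; the processing-efficient layout deals rows
// round-robin across clusters so each cluster owns an equal share of rows.
// Assumes rows is a multiple of n_clusters.
static void munge_rows_to_clusters(const unsigned char *dma_buf,
                                   unsigned char *lrf_buf,
                                   int rows, int row_bytes, int n_clusters)
{
    /* Row r in DMA order becomes slot (r / n_clusters) owned by cluster
     * (r % n_clusters); swapping src and dst gives the reverse munge
     * used when writing back to external memory. */
    for (int r = 0; r < rows; r++) {
        int cluster = r % n_clusters;
        int slot    = r / n_clusters;
        for (int b = 0; b < row_bytes; b++)
            lrf_buf[(cluster * (rows / n_clusters) + slot) * row_bytes + b] =
                dma_buf[r * row_bytes + b];
    }
}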
//--------------------------------------------------------------------
// FunctionName: post_deblock_data_munge
//--------------------------------------------------------------------
// Merge the bottom pixels of each MB row updated during the processing
// of the MB row below. Since each strip actually consists of 2 MB rows,
// pixels of both the current strip and the previous strip get updated
// during the filtering of the current MB strip. Merge these updated
// pixels into the current and top strips.
//--------------------------------------------------------------------
inline kernel void post_deblock_data_munge_inline(
    // Input/output pixel data
    stream uint8x4 in_frame(array_io),
    //stream uint8x4 in_frame_top_inter(array_io),
    // No of iterations
    uint32x1 no_of_iter(in),
    stream uint8x4 out_frame(array_io),
    // Input/output pixel data for TOP
    stream uint8x4 in_out_frame_top(array_io),
    uint32x1 s_filter_first_mb_vert_edges(in),
    uint32x1 s_filter_last_mb_horz_edges(in)
    )
{
    vec uint8x4 topa_0, topa_1, topa_2, topa_3;
    vec uint8x4 top0_32, top0_10, top1_32, top1_10, top0, top1;
    vec uint8x4 top0_32_1, top0_10_1, top1_32_1, top1_10_1;
    vec uint8x4 cur0_0, cur0_1, cur1_0, cur1_1;
    vec uint8x4 cur2_0, cur2_1, cur3_0, cur3_1;
    vec uint8x4 top0_0, top0_1, top1_0, top1_1;
    vec uint8x4 top2_0, top2_1, top3_0, top3_1;
    vec uint8x4 pud0_0, pud0_1, pud1_0, pud1_1;
    vec uint8x4 pud2_0, pud2_1, pud3_0, pud3_1;

    // Transpose control
    vec bool32x1 cid_b2;
    vec bool32x1 cid_b1;
    vec bool32x1 cid_b0;
    vec uint32x1 perm_b3;
    vec uint32x1 perm_b2;
    vec uint32x1 perm_b1;
    vec uint32x1 perm_b0;
    vec uint32x1 perm_cid_a9;
    vec uint32x1 perm_cid_aB;
    vec uint32x1 perm_cid_aD;
    vec bool32x1 cid_6_7_14_15;
    vec bool32x1 cid_6_7;
    vec bool32x1 cid_14_15;
    vec uint32x1 in_out_idx;
    vec uint32x1 in_idx;
    vec uint32x1 pitch;
    vec uint32x1 in_fti_idx;
    vec uint8x4 left_pad_0, left_pad_1;
    vec uint8x4 right_pad_0, right_pad_1;
    // Offset to reach in_frame_top_inter in the in_frame stream
    vec int32x1 in_frame_topi_offset;

    // Ignore the extra MB padded on the left
    in_out_idx = 4;
    in_idx = 0;

    cid_b2 = spi_vne32((spi_laneid() & 4), 0);
    cid_b1 = spi_vne32((spi_laneid() & 2), 0);
    cid_b0 = spi_vne32((spi_laneid() & 1), 0);
    perm_b3 = spi_laneid() ^ 0x08;
    perm_b2 = spi_laneid() ^ 0x04;
    perm_b1 = spi_laneid() ^ 0x02;
    perm_b0 = spi_laneid() ^ 0x01;
    perm_cid_a9 = spi_laneid() & 0x9;
    perm_cid_aB = spi_laneid() & 0xB;
    perm_cid_aD = spi_laneid() & 0xD;
    cid_6_7_14_15 = spi_vle32u(6, spi_laneid()&7);
    cid_6_7 = spi_veq32(spi_laneid(), 6) | spi_veq32(spi_laneid(), 7);
    cid_14_15 = spi_veq32(spi_laneid(), 14) | spi_veq32(spi_laneid(), 15);

    pitch = spi_shift32(spi_add32i(no_of_iter, 2), 2);
    // Offset to reach in_frame_top_inter in the in_frame stream
    in_frame_topi_offset = spi_vshift32(spi_vadd32i(no_of_iter, 3), 3);
    in_fti_idx = in_frame_topi_offset;

    // Pad left and right for both out_frame and in_out_frame_top
    if (s_filter_first_mb_vert_edges != 0){
        // Load top intermediates
        spi_array_read(in_frame/*_top_inter*/, topa_0, in_fti_idx);
        spi_array_read(in_frame/*_top_inter*/, topa_1, in_fti_idx + (vec uint32x1)1);
        spi_array_read(in_frame/*_top_inter*/, topa_2, in_fti_idx + (vec uint32x1)2);
        spi_array_read(in_frame/*_top_inter*/, topa_3, in_fti_idx + (vec uint32x1)3);

        // Each cluster has 4 lines of the top MB (2 pixels per row).
        // Rearrange the pixels such that each cluster will have 2 full
        // rows (4 pixels) to write out. Swap data with neighboring
        // clusters.
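        // (perm_b0 = laneid ^ 1 pairs each cluster with its immediate
        // neighbor, and cid_b0 is true on odd clusters, so the vselect/vperm
        // sequence below exchanges 2-pixel halves between the two clusters
        // of each pair and stitches them into full 4-pixel rows.)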
        top0_32 = (vec uint8x4)spi_vperm32(perm_b0,
                      (vec uint32x1)spi_vselect8((vec uint8x4)cid_b0, topa_0, topa_2), 0);
        top1_32 = (vec uint8x4)spi_vperm32(perm_b0,
                      (vec uint32x1)spi_vselect8((vec uint8x4)cid_b0, topa_1, topa_3), 0);
        top0_10 = spi_vselect8((vec uint8x4)cid_b0, topa_2, topa_0);
        top1_10 = spi_vselect8((vec uint8x4)cid_b0, topa_3, topa_1);
        top0_32_1 = spi_vselect8((vec uint8x4)cid_b0, top0_10, top0_32);
        top0_10_1 = spi_vselect8((vec uint8x4)cid_b0, top0_32, top0_10);