📄 deblock_luma_kc.sc
字号:
vec uint32x1 top_in_idxc, bot_out_idxc, top_out_idxc, left_out_idxc, cur_in_idxc; vec uint8x4 topac_2, topac_3, botac_2, botac_3; vec uint8x4 b_tc1_a_tc0_v, b_tc1_a_tc0_h; vec uint8x4 tmp1_8x4u, tmp0_8x4u; vec uint8x4 data_right; vec uint32x1 in_idx_c; vec bool32x1 prev_prcoess_mb; vec uint32x1 loop_ctr; vec uint32x1 abtc_idx; vec uint8x4 data_bottom; int32x1 s_filter_first_mb_vert_edges; int32x1 s_filter_last_mb_horz_edges; uint32x1 s_no_of_iter; uint32x1 orig_s_no_of_iter; int32x1 i; // Offset to reach out_frame_top in out_frame_inter stream vec int32x1 out_frame_top_offset; in_idx_c = 0; prev_prcoess_mb = 0; loop_ctr = 0; abtc_idx = 0; data_prev_c_0 = 0p4; data_prev_c_1 = 0p4; data_prev_c_2 = 0p4; data_prev_c_3 = 0p4; t_left_0 = t_left_1 = t_left_2 = t_left_3 = 0p4; bota_0 = bota_1 = bota_2 = bota_3 = 0p4; s_filter_first_mb_vert_edges = spi_shuffledu_hi(0x88888832, s_packed_fmbve_lmbhe_iter, 0); s_filter_last_mb_horz_edges = spi_shuffledu_lo(0x88888832, s_packed_fmbve_lmbhe_iter, 0); s_no_of_iter = spi_shuffleu(0x08080800, s_packed_fmbve_lmbhe_iter, 0); strip_size_p2 = s_no_of_iter; orig_s_no_of_iter = s_no_of_iter; in_pitch = strip_size_p2 << 2; out_frame_top_offset = spi_vshift32(spi_vadd32i(strip_size_p2, 1), 3); cid_6_7_14_15 = spi_vle32u(6U, spi_laneid()&7); cid_lt_8 = spi_vlt32u(spi_laneid(), 8); // Permuations for cluster communication perm_get_left_c = ((spi_laneid() - 1) & 0x1) + (spi_laneid() & 0xe); //0 - 10, 1 - 11, 2 - 0, 3 - 1, 4 - 14, 5 - 15, 6 - 4, 7 - 5, //8 - 2, 9 - 3, 10 - 8, 11 - 9, 12 - 6, 13 - 7, 14 - 12, 15 - 13 perm_get_top_c = ((spi_laneid()&0x2) == 2) ? spi_laneid()-2 : (spi_laneid()+10)&0xF; //0 - 2, 1 - 3, 2 - 8, 3 - 9, 4 - 6, 5 - 7, 6 - 12, 7 - 13 //8 - 10, 9 - 11, 10 - 0, 11 - 1, 12 - 14, 13 - 15, 14 - 4, 15 - 5 perm_get_bottom_c = (((spi_laneid()&0x2) == 2) ? spi_laneid()+6 : spi_laneid()+2)&0xF; perm_get_right_c = ((spi_laneid() + 1) & 0x1) + (spi_laneid() & 0xe); // Conditions for selecting subsets of 4x4 blocks (clusters) left_edge_c = spi_veq32((spi_laneid() & 1), 0); right_edge_c = spi_veq32((spi_laneid() & 1), 1); top_edge_c = spi_veq32((spi_laneid() & 2), 0); bottom_edge_c = spi_veq32((spi_laneid() & 2), 2); cid_b2 = spi_vne32((spi_laneid() & 4), 0); cid_b1 = spi_vne32((spi_laneid() & 2), 0); cid_b0 = spi_vne32((spi_laneid() & 1), 0); perm_b3 = spi_laneid() ^ 0x08; perm_b2 = spi_laneid() ^ 0x04; perm_b1 = spi_laneid() ^ 0x02; perm_b0 = spi_laneid() ^ 0x01; while(s_no_of_iter){ vec bool32x1 dont_filter_horz_edges; vec bool32x1 dont_filter_vert_edges; vec bool32x1 process_mb; // Create a lag of 2 MBs between MB processed by clusters // 0-7 and MB processed by clusters 8-15. process_mb = (spi_vgt32u(loop_ctr, 1U) & (spi_vnot32(cid_lt_8))) | (spi_vlt32u(loop_ctr, (strip_size_p2 - 2U)) & (cid_lt_8)); // Do not filter vertical edges if this MB is the first in strip and // not the first in a MB row or if this MB is the padding MB used for // wave front approach in 16 clusters. dont_filter_vert_edges = ((((spi_veq32(loop_ctr, 0) & (cid_lt_8)) | (spi_veq32(loop_ctr, 2) & spi_vnot32(cid_lt_8))) & spi_eq32(s_filter_first_mb_vert_edges, 0)) | spi_vnot32(process_mb)); dont_filter_horz_edges = ((((spi_veq32(loop_ctr, (strip_size_p2 - 3U)) & (cid_lt_8)) | (spi_veq32(loop_ctr, (strip_size_p2 - 1U)) & spi_vnot32(cid_lt_8))) & spi_eq32(s_filter_last_mb_horz_edges, 0)) | spi_vnot32(process_mb)); // in_idx_c * 12 abtc_idx = (in_idx_c<<3) + (in_idx_c<<2); // If chroma is not processed in the same kernel as luma, // skip over the BS/Alpha/Beta/tC0 entries related to Chroma. spi_read(bs_a_b_tc_str, tmp0_8x4u); spi_read(bs_a_b_tc_str, b_tc1_a_tc0_v); spi_read(bs_a_b_tc_str, tmp1_8x4u); spi_read(bs_a_b_tc_str, b_tc1_a_tc0_h); // Process Luma blocks //////////////////////////////////////////////////// // Load data //////////////////////////////////////////////////// // Load 2 rows of 16 pixels into each cluster. Input // Data is organized such that first 8 clusters will get // top MB row and clusters 8-15 get the data from bottom // MB row. Each row of pixels in LRF are separated by // pitch. // FFD doesn't seem like reading past the size of // stream. Would this be an issue with hardware. // FIXME, may be this is not needed in hardware cur_in_idx = spi_vselect32(process_mb, in_idx_c, 0)<<2; // Input strip contains extra MB on both left and right side // of the strip. Ignore the padded MB while reading spi_array_read(in_frame, cur0_3210_0, cur_in_idx+4U); spi_array_read(in_frame, cur1_3210_0, cur_in_idx+5U); spi_array_read(in_frame, cur2_3210_0, cur_in_idx+6U); spi_array_read(in_frame, cur3_3210_0, cur_in_idx+7U); spi_array_read(in_frame, cur0_3210_1, cur_in_idx+in_pitch+4U); spi_array_read(in_frame, cur1_3210_1, cur_in_idx+in_pitch+5U); spi_array_read(in_frame, cur2_3210_1, cur_in_idx+in_pitch+6U); spi_array_read(in_frame, cur3_3210_1, cur_in_idx+in_pitch+7U); // **************************************************** // Transpose horizontals to verticals // **************************************************** // Each cluster has only 2 lines per block to process. These // 2 lines are transposed to 4 sparse lines. Bytes 0 and 2 // are used. transpose_block_to_sparse(cur0_3210_0, cur0_3210_1, t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3); transpose_block_to_sparse(cur1_3210_0, cur1_3210_1, t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3); transpose_block_to_sparse(cur2_3210_0, cur2_3210_1, t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3); transpose_block_to_sparse(cur3_3210_0, cur3_3210_1, t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3); // **************************************************** // Filter vertical edges // **************************************************** // Load packed BS/Alpha/Beta/tC0 for vertical edge //load4(bs_a_b_tc_str, abtc_idx, bs, alpha_10, beta_10, tc0_3210); spi_read(bs_a_b_tc_str, bs); spi_read(bs_a_b_tc_str, alpha_10); spi_read(bs_a_b_tc_str, beta_10); spi_read(bs_a_b_tc_str, tc0_3210); alpha1 = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)alpha_10, 0); alpha0 = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)alpha_10, 0); beta1 = (vec uint8x4)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)beta_10, 0); beta0 = (vec uint8x4)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)beta_10, 0); tc0_2 = (vec uint16x2)spi_vshuffledu_hi(0xA820A820, (vec uint32x1)tc0_3210, 0); tc0_0 = (vec uint16x2)spi_vshuffledu_lo(0xA820A820, (vec uint32x1)tc0_3210, 0); tc0_3 = (vec uint16x2)spi_vshuffledu_hi(0xB931B931, (vec uint32x1)tc0_3210, 0); tc0_1 = (vec uint16x2)spi_vshuffledu_lo(0xB931B931, (vec uint32x1)tc0_3210, 0); // Filter vertical edges. // New langauge internal kernel functions doesn't support io type // vectors. Same variables are used for input and output arguments. filter_luma_edges(bs, alpha0, alpha1, alpha1, alpha1, beta0, beta1, beta1, beta1, tc0_0, tc0_1, tc0_2, tc0_3, t_left_0, t_left_1, t_left_2, t_left_3, t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3, t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3, t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3, t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3, dont_filter_vert_edges, t_left_1, t_left_2, t_left_3, t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3, t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3, t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3, t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3 ); // **************************************************** // Transpose and output left blocks: // **************************************************** transpose_block_from_sparse(t_left_0, t_left_1, t_left_2, t_left_3, left_0_3, left_1_3); // Vertical edge filtering could have modified left blocks, // some of these left block pixels are also bottom pixels of // previous MB(in clusters 6,7,14,15). Update these pixels to // ensure bottom MB row of current strip can indeed get updated // Top values from top MB row of current strip. bot0 = left_0_3; bot1 = left_1_3; // Each cluster 2 rows from left block, swap these with neighboring // clusters and transpose to create 2 vertical lines per cluster. bot1_tmp = (vec uint8x4)spi_vshuffledu_lo(0x75643120, (vec uint32x1)bot0, (vec uint32x1)bot1); bot0_tmp = (vec uint8x4)spi_vshuffledu_hi(0x75643120, (vec uint32x1)bot0, (vec uint32x1)bot1); bot1 = spi_vselect8((vec uint8x4)cid_b0, bot1_tmp, bot0_tmp); bot0 = spi_vselect8((vec uint8x4)cid_b0, bot0_tmp, bot1_tmp); bot1 = (vec uint8x4)spi_vperm32(perm_b0, (vec uint32x1)bot1, 0); bot1_tmp = bot1; bot0_tmp = bot0; bot1 = spi_vselect8((vec uint8x4)cid_b0, bot0_tmp, bot1_tmp); bot0 = spi_vselect8((vec uint8x4)cid_b0, bot1_tmp, bot0_tmp); bota_1u = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)bot0, 0); bota_0u = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)bot0, 0); bota_3u = (vec uint8x4)spi_vshuffledu_hi(0x88318820, (vec uint32x1)bot1, 0); bota_2u = (vec uint8x4)spi_vshuffledu_lo(0x88318820, (vec uint32x1)bot1, 0); // This update should only happen for clusters 6,7,14,15. bota_0 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_0u, bota_0); bota_1 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_1u, bota_1); bota_2 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_2u, bota_2); bota_3 = spi_vselect8((vec uint8x4)cid_6_7_14_15, bota_3u, bota_3); // **************************************************** // 8-cluster vertical to horizontal transpose: After // the transpose each cluster will have 2 columns of pixels // of the full MB in place of 2 rows cluster originally had. // In addition to transpose, upper bytes 1 and 3 of each word // are to be padded with zeros. // **************************************************** transpose_macroblock16x2(cid_b2, cid_b1, cid_b0, perm_b2, perm_b1, perm_b0, t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3, t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3, t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3, t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3, cur0_0, cur0_1, cur0_2, cur0_3, cur1_0, cur1_1, cur1_2, cur1_3, cur2_0, cur2_1, cur2_2, cur2_3, cur3_0, cur3_1, cur3_2, cur3_3); //Read top from the stream which was stored //as bottom in previous strip. top_in_idx = spi_vselect32((cid_lt_8 & spi_vle32u(2U, loop_ctr)), loop_ctr - 2U, loop_ctr)<<2; spi_array_read(in_out_frame_bot, topa_0, top_in_idx); spi_array_read(in_out_frame_bot, topa_1, top_in_idx+1U); spi_array_read(in_out_frame_bot, topa_2, top_in_idx+2U); spi_array_read(in_out_frame_bot, topa_3, top_in_idx+3U); // Bottoms written out by clusters 8-15 in previous strip // are used as Top for current strip in clusters 0-7. Swap // the bottoms. topa_0 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_0, 0); topa_1 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_1, 0); topa_2 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_2, 0); topa_3 = (vec uint8x4)spi_vperm32(perm_b3, (vec uint32x1)topa_3, 0);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -