📄 deblock_luma_kc.sc
字号:
// -------------------------------------------------------------------// ?2005 Stream Processors, Inc. All rights reserved.// This Software is the property of Stream Processors, Inc. (SPI) and// is Proprietary and Confidential. It has been provided under// license for solely use in evaluating and/or developing code for a// stream processor device. Any use of the Software to develop code// for a semiconductor device not manufactured by or for SPI is// prohibited. Unauthorized use of this Software is strictly// prohibited.//// THIS SOFTWARE IS PROVIDED "AS IS". NO WARRANTIES ARE GIVEN,// WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING WARRANTIES OR// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE,// NONINFRINGEMENT AND TITLE. RECIPIENT SHALL HAVE THE SOLE// RESPONSIBILITY FOR THE ADEQUATE PROTECTION AND BACK-UP OF ITS DATA// USED IN CONNECTION WITH THIS SOFTWARE. IN NO EVENT WILL SPI BE// LIABLE FOR ANY CONSEQUENTIAL DAMAGES WHATSOEVER, INCLUDING LOSS OF// DATA OR USE, LOST PROFITS OR ANY INCIDENTAL OR SPECIAL DAMAGES,// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS// SOFTWARE, WHETHER IN ACTION OF CONTRACT OR TORT, INCLUDING// NEGLIGENCE. 
SPI FURTHER DISCLAIMS ANY LIABILITY WHATSOEVER FOR// INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD// PARTY.// -------------------------------------------------------------------//--------------------------------------------------------------------// File: $File: //depot/main/software/apps/spi_h264e_b/deblock_luma_kc.sc $ // Revision: $Revision: #27 $ // Last Modified: $DateTime: 2007/06/12 14:24:56 $ //// KernelC version of in-loop deblocking filter for luma and chroma block edges,// as per the H.264 standard.//--------------------------------------------------------------------#include "spi_common.h"#include "deblock_kc.h"#include "filter_block_kc.h"#define ROUND_CONSTANT 0x00020002p2inline kernel void post_deblock_data_munge_inline( // Input/output pixel data stream uint8x4 in_frame(array_io), //stream uint8x4 in_frame_top_inter(array_io), // No of iterations uint32x1 no_of_iter(in), stream uint8x4 out_frame(array_io), // Input/output pixel data for TOP stream uint8x4 in_out_frame_top(array_io), uint32x1 s_filter_first_mb_vert_edges(in), uint32x1 s_filter_last_mb_horz_edges(in) );//--------------------------------------------------------------------// FunctionName: DeblockMB_Luma//--------------------------------------------------------------------// This kernel is capable of processing both luma and chroma components of// a MB. Combining chroma processing with luma processing allows better// ALU utilization and also minimizes the impact of long critical path// that is part of chroma processing loop.// Luma Edges// ===============// The kernel processes a strip of macroblocks, where the strip consists// of consecutive macroblocks from a pair of rows. In order to utilize// all clusters effectively 2 MBs belonging to the neighboring rows are// processed across 16 clusters. 
Because of inherent dependencies in// H.264 BP deblocking, MBs belonging to bottom row of the strip are// processed with 2 iterations delay compared to MBs in the same position// in top row.//// it0 it1 it2 it3 it4...// -------------------------------------------// |TMB0|TMB1|TMB2|.... |...|TMBn-1| x | x |// |------------------------------------------// | x | x |BMB0|BMB1|BMB2|.... |...|BMBn-1|// |------------------------------------------// As shown in the diagram above, BMB0 is processed in clusters 8-15 while// TMB2 is being processed by clusters 0-7 in the same iteration.// Blocks marked with 'x' are dummy iterations at the start and end of// a strip, needed to pipe up and down the wavefront scheme.//// If the last macroblock (call it LastMB) is on the right edge of// the image, no special boundary conditions need to be handled. If// the LastMB is in the interior of the image, then only the vertical// edges are filtered for that macroblock. In this case, when the// strip next to the current one is processed (i.e., the strip that// contains the macroblocks from LastMB+1 to LastMB+1+StripSize),// LastMB has to be loaded again and this time only the horizontal// edges will be filtered. The reason boundary cases are handled// this way is so that the StreamC code can be written to process// strips in vertical order, rather than in row-major order. That// is, one can process the strips containing the macroblocks at// macroblock coordinates (0,0) to (N,0), then (0,2) to (N,2), etc.,// and then process MB strips (N,0) to (2N-1,0), then (N,2) to// (2N-1,2), and so on. (Notice the overlap of the Nth macroblock on// each row in each pair of strips).//// The kernel also requires another stream of pixels that contains the// macroblocks to the top pair of rows of the current row. 
By processing// strips in vertical order, strip (pair of rows) processed in previous// kernel invocation can be reused as top for current iteration.//// The basic data layout is that each cluster gets 2 lines belonging to// a macro block - 32 pixels each. These 2 lines consist of 4 vertical edges// of size 2 pixels. bs/alpha/beta/tc0 is pre-arranged by bs calculation// kernel such that each cluster reads 4+4 words from bs stream to get// all relevant parameters needed for processing 4 vertical and 4 horizontal// edges.//// The flow of the kernel is to first filter the vertical edges, then// filter the horizontal edges. Each call to the filter function// (FilterLumaEdges) will filter all the edges in the vertical or// horizontal directions (four edges for Luma in each direction).// Initial data layout is optimal for vertical filtering of edges.// After the vertical filtering is done, data is transposed among// 8 clusters such that each cluster will have 2 consecutive columns// in place of the 2 rows it has read. This transpose allows FilterLumaEdges// to be reused for horizontal filtering too.//// The FilterLumaEdges function requires the data arranged so that the four// pixels immediately on the block edge are in one 32-bit word, the// pixels one away from the edge are in another 32-bit word, and so// on. In the diagram below, these inputs will essentially be axax,// bxbx, etc. These inputs will then be updated by FilterLumaEdges to// contain the final results of filtering that edge. Since each// cluster is processing edges of size 2 pixels, word entries below have only// 2 bytes with valid pixels and other 2 bytes have dummy data. //// -----------// |dcba|abcd|// |xxxx|xxxx|// |dcba|abcd|// |xxxx|xxxx|// -----------// Chroma Edges// ===============// Chroma edge processing is structured such that each cluster will process// one 4x4 block belonging to a color component (cb or cr). 
This means, each// cluster will filter 1 vertical and 1 horizontal edge per iteration.// Chroma edges too follow the same wavefront mechanism used for luma edges.// // Since each cluster is processing only 1 block, pixels modified by one cluster// during the filtering need to be sent to neighboring clusters in order to// handle the dependencies inherent in H.264 deblocking. Left pixels modified// while deblocking a vertical edge need to be used as top pixels for// filtering horizontal edges of the bottom left block. After the vertical edge is// filtered, receives are used to send updated pixels to neighboring clusters// as top pixels for the bottom left block.////// Streams:// bs_a_b_tc_str - Packed bs, Alpha, Beta and tC0 for each edge// in_frame - Luma input stream, arranged such that each cluster// can read 2 consecutive rows of input strip.// out_frame_inter - Luma output stream, updated by current V,H edge// filtering.// in_out_frame_bot - Luma In/Out stream for storing and reading the filtered// pixels of bottom blocks of each MB.// out_frame_top - Output array stream to store updated luma pixels of top MB// row. This will be merged with Top MB strip in data munging// kernels.// in_out_framec - Chroma input stream, arranged such that each cluster// gets one 4x4 block.// in_out_frame_botc - Chroma In/Out stream for storing and reading the filtered// pixels of bottom blocks of each MB.// out_frame_topc - Output array stream to store updated Chroma pixels of top MB// row. 
This will be merged with Top MB strip in post data munging// kernels.//--------------------------------------------------------------------kernel void deblock_mb_luma( // Packed BS/Alpha/Beta/tC0 stream stream uint8x4 bs_a_b_tc_str(seq_in), int32x1 s_packed_fmbve_lmbhe_iter(in), // Input/output pixel data stream uint8x4 in_frame(array_io), stream uint8x4 out_frame_inter(array_io), // Input/output pixel data for TOP stream uint8x4 in_out_frame_bot(array_io), stream uint8x4 post_data_munge_out(array_io) ){ vec uint32x1 strip_size_p2; // In each lane, pixels belonging to 2 rows are separated // full strip size. vec uint32x1 in_pitch; // Transpose control vec bool32x1 cid_b2; vec bool32x1 cid_b1; vec bool32x1 cid_b0; vec uint32x1 perm_b3; vec uint32x1 perm_b2; vec uint32x1 perm_b1; vec uint32x1 perm_b0; vec bool32x1 cid_6_7_14_15; vec bool32x1 cid_lt_8; // Variables for current blocks vec uint8x4 cur0_0, cur0_1, cur0_2, cur0_3; vec uint8x4 cur1_0, cur1_1, cur1_2, cur1_3; vec uint8x4 cur2_0, cur2_1, cur2_2, cur2_3; vec uint8x4 cur3_0, cur3_1, cur3_2, cur3_3; vec uint8x4 cur0_3210_0, cur0_3210_1; vec uint8x4 cur1_3210_0, cur1_3210_1; vec uint8x4 cur2_3210_0, cur2_3210_1; vec uint8x4 cur3_3210_0, cur3_3210_1; // Variables for transposed current blocks vec uint8x4 t_cur0_0, t_cur0_1, t_cur0_2, t_cur0_3; vec uint8x4 t_cur1_0, t_cur1_1, t_cur1_2, t_cur1_3; vec uint8x4 t_cur2_0, t_cur2_1, t_cur2_2, t_cur2_3; vec uint8x4 t_cur3_0, t_cur3_1, t_cur3_2, t_cur3_3; vec uint8x4 left_0_3; vec uint8x4 left_1_3; // Variables for Left blocks vec uint8x4 t_left_0, t_left_1, t_left_2, t_left_3; // "Top" blocks //vec uint8x4 top_0, top_1, top_2, top_3; vec uint8x4 topa_0, topa_1, topa_2, topa_3; vec uint8x4 bota_0, bota_1, bota_2, bota_3; vec uint8x4 bota_0u, bota_1u, bota_2u, bota_3u; vec uint8x4 bot0, bot1, bot1_tmp, bot0_tmp; vec uint32x1 top_in_idx, bot_out_idx, top_out_idx; vec uint32x1 left_out_idx, cur_out_idx, cur_in_idx; vec uint8x4 bs; vec uint8x4 alpha0, alpha1, beta0, beta1; 
vec uint16x2 tc0_0, tc0_1, tc0_2, tc0_3; vec uint8x4 alpha_10, beta_10, tc0_3210; // Chroma // Permuations for cluster communication vec uint32x1 perm_get_left_c; //0 - 10, 1 - 11, 2 - 0, 3 - 1, 4 - 14, 5 - 15, 6 - 4, 7 - 5, //8 - 2, 9 - 3, 10 - 8, 11 - 9, 12 - 6, 13 - 7, 14 - 12, 15 - 13 vec uint32x1 perm_get_top_c; //0 - 2, 1 - 3, 2 - 8, 3 - 9, 4 - 6, 5 - 7, 6 - 12, 7 - 13 //8 - 10, 9 - 11, 10 - 0, 11 - 1, 12 - 14, 13 - 15, 14 - 4, 15 - 5 vec uint32x1 perm_get_bottom_c; vec uint32x1 perm_get_right_c; // Conditions for selecting subsets of 4x4 blocks (clusters) vec bool32x1 left_edge_c; vec bool32x1 right_edge_c; vec bool32x1 top_edge_c; vec bool32x1 bottom_edge_c; vec uint8x4 data_c_0, data_c_1, data_c_2, data_c_3; vec uint8x4 data_prev_c_0, data_prev_c_1, data_prev_c_2, data_prev_c_3; vec uint16x2 bs_v_c, bs_h_c; vec uint8x4 alpha_v_c, beta_v_c, alpha_h_c, beta_h_c; vec int8x4 tc0_v_c, tc0_h_c; vec uint8x4 q0_v, q1_v, q2_v, q3_v, q0_prev, q1_prev, q2_prev, q3_prev; vec uint8x4 p0_v, p1_v, p0_h, p1_h;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -