📄 deblock_sc.sc
字号:
// -------------------------------------------------------------------// ?2006 Stream Processors, Inc. All rights reserved.// This Software is the property of Stream Processors, Inc. (SPI) and// is Proprietary and Confidential. It has been provided under// license for solely use in evaluating and/or developing code for a// stream processor device. Any use of the Software to develop code// for a semiconductor device not manufactured by or for SPI is// prohibited. Unauthorized use of this Software is strictly// prohibited.//// THIS SOFTWARE IS PROVIDED "AS IS". NO WARRANTIES ARE GIVEN,// WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING WARRANTIES OR// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE,// NONINFRINGEMENT AND TITLE. RECIPIENT SHALL HAVE THE SOLE// RESPONSIBILITY FOR THE ADEQUATE PROTECTION AND BACK-UP OF ITS DATA// USED IN CONNECTION WITH THIS SOFTWARE. IN NO EVENT WILL SPI BE// LIABLE FOR ANY CONSEQUENTIAL DAMAGES WHATSOEVER, INCLUDING LOSS OF// DATA OR USE, LOST PROFITS OR ANY INCIDENTAL OR SPECIAL DAMAGES,// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS// SOFTWARE, WHETHER IN ACTION OF CONTRACT OR TORT, INCLUDING// NEGLIGENCE. 
SPI FURTHER DISCLAIMS ANY LIABILITY WHATSOEVER FOR// INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD// PARTY.// -------------------------------------------------------------------//--------------------------------------------------------------------// File: $File: //depot/main/software/apps/spi_h264e_b/deblock_sc.sc $// Revision: $Revision: #47 $// Last Modified: $DateTime: 2007/06/10 18:16:41 $//// Description:// Stream implementation of deblocking filter.//// Current version fully supports Baseline profile//--------------------------------------------------------------------#include <assert.h>#include "spi_common.h"#include "mb_info.h"#include "encoder_context.h"#include "encoder_tables.h"#include "deblock_kc.h"// Macro used to define streams#ifndef MAX_STRIP_SIZE#define MAX_STRIP_SIZE 44#endif // MAX_STRIP_SIZE// Maximum frame width deblocking can handle#define MAX_DEBLOCK_FRM_WIDTH 1920#define MAX_DEBLOCK_FRM_WIDTHC (MAX_DEBLOCK_FRM_WIDTH/2)// 2 Extra MBs for padding#define Y_STRIP_LEN_HDEC (((MAX_STRIP_SIZE+1+2)*2*8*8)/4)#define Y_STRIP_LEN_QDEC ((2*(MAX_STRIP_SIZE+1+2)*2*4*4)/4)#define Y_STRIP_LEN (((MAX_STRIP_SIZE+1+2)*2*16*16)/4)#define UV_STRIP_LEN (((MAX_STRIP_SIZE+1+2)*2*2*8*8)/4)#define Y_INTER_STRIP_LEN (((MAX_STRIP_SIZE+1)+3)*8*SPI_LANES)#define Y_TOP_INTER_STRIP_LEN (((MAX_STRIP_SIZE+1)+3)*4*SPI_LANES)#define Y_BOT_INTER_STRIP_LEN Y_TOP_INTER_STRIP_LEN#define UV_INTER_STRIP_LEN (((MAX_STRIP_SIZE+1)+3)*4*SPI_LANES) #define UV_TOP_INTER_STRIP_LEN (((MAX_STRIP_SIZE+1)+3)*1*SPI_LANES)#define UV_BOT_INTER_STRIP_LEN (((MAX_STRIP_SIZE+1)+3)*2*SPI_LANES)#define MV_INFO_STRIP_LEN ((MAX_STRIP_SIZE+1) * BLOCKS_PER_MB * 2)#define LF_MB_INFO_STRIP_LEN ((MAX_STRIP_SIZE+1) * SPI_LANES)#define MB_INFO_STRIP_LEN ((MAX_STRIP_SIZE+1) * BLOCKS_PER_MB * 2)#define BS_ABTC_STR_LEN ((MAX_STRIP_SIZE+1+3)*SPI_LANES*12)#define Y_FRMTOP_STRIP_LEN (16*(MAX_DEBLOCK_FRM_WIDTH+2*16)/4)#define U_FRMTOP_STRIP_LEN (16*(MAX_DEBLOCK_FRM_WIDTHC+2*8)/4)#define 
V_FRMTOP_STRIP_LEN U_FRMTOP_STRIP_LEN#define Y_FRMTOP_CF_STRIP_LEN Y_FRMTOP_STRIP_LEN#define UV_FRMTOP_CF_STRIP_LEN U_FRMTOP_STRIP_LEN#define Y_FRMTOP_STRIP_LEN_HLF (16*(MAX_DEBLOCK_FRM_WIDTH/2+2*8)/4)#define Y_FRMTOP_STRIP_LEN_QT (16*(MAX_DEBLOCK_FRM_WIDTH/4+2*4)/4)#define ROUND_UPTO_NEXT2nX(x, n) (((x) + (1<<(n)) - 1) & ~((1<<(n)) - 1))// Function to allocate and initialize deblocking context at the start of a sequence// encoding. It is assumed that frame height and width would stay the same// through all frames in a sequence.void init_deblock_context(encoder_context_t *p_enc){ yuv_frame_t *frame = p_enc->pRecFrame; int image_width = frame->image_width; int image_height = frame->image_height; int rec_width = frame->width; int rec_height = frame->height; int mb_numx = image_width>>4; int mb_numy = image_height>>4; int num_chroma_pix = (rec_width * rec_height)/4; // All tables used by deblocking collapsed into single // memory block. All tables are sized to in multiple of // words. int *p_deblk_tbls = spi_malloc(DEBLK_TBLS_SIZE * sizeof(int)); int *p_y_idx = spi_malloc(32*2 * sizeof(int)); int *p_uv_idx = spi_malloc(32*2 * sizeof(int)); int *p_lf_mb_info_idx = spi_malloc(16 * sizeof(int)); int *p_mb_info_idx = spi_malloc(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4) * sizeof(int)); int *p_mb_info_idx_bot = spi_malloc(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4) * sizeof(int)); int i; if (! 
(p_y_idx && p_uv_idx && p_lf_mb_info_idx && p_mb_info_idx && p_mb_info_idx_bot)){ spi_error_printf(" Failed to allocate memory for deblock context %s:%d \n", __FILE__, __LINE__); } // Copy tables into unified table memory memcpy(p_deblk_tbls + DEBLK_IDXA_TBL_OFFSET, IndexATable, sizeof(IndexATable)); memcpy(p_deblk_tbls + DEBLK_IDXB_TBL_OFFSET, IndexBTable, sizeof(IndexBTable)); memcpy(p_deblk_tbls + DEBLK_QP2CHR_TBL_OFFSET, QP_TO_CHROMA_MAPPING, sizeof(QP_TO_CHROMA_MAPPING)); for (i = 0; i < 16; i++){ p_y_idx[i] = i*2*rec_width; p_y_idx[16+i] = (i*2+1)*rec_width; p_uv_idx[i] = i*(rec_width/2); p_uv_idx[16+i] = i*(rec_width/2) + num_chroma_pix; } //Populate MB Info indices for (i = 0; i < (MAX_STRIP_SIZE+1); i++){ int rec_size = sizeof(S_BLK_MB_INFO_COMPRESSED); p_mb_info_idx[4*i] = 16*i*rec_size; p_mb_info_idx[4*i+1] = 16*i*rec_size + 8*rec_size; p_mb_info_idx[4*i+2] = 16*i*rec_size + 16*mb_numx*rec_size; p_mb_info_idx[4*i+3] = 16*i*rec_size + 16*mb_numx*rec_size + 8*rec_size; } // Generate bottom index streams for frames whose height is not // a multiple of 32. if (mb_numy % 2){ for (i = 0; i < 16; i++){ p_y_idx[32+i] = (i >= 8) ? 0 : (mb_numy-1)*rec_width*16 + i*2*rec_width; p_y_idx[48+i] = (i >= 8) ? 0 : (mb_numy-1)*rec_width*16 + (i*2+1)*rec_width; // If frame height is not a 32 multiple, bottom // most row will load inconsequential data from 0th row // of the frame. And this load should also 0 as base offset // to avoid using negative values in index stream. Note bot // stream is used only when height is not multiple of 32!! p_uv_idx[32+i] = (i >= 8) ? 
0 : ((mb_numy-1)*(rec_width/2)*8 + i*(rec_width/2)); p_uv_idx[48+i] = p_uv_idx[32+i] + num_chroma_pix; } for (i = 0; i < (MAX_STRIP_SIZE+1); i++){ int rec_size = sizeof(S_BLK_MB_INFO_COMPRESSED); p_mb_info_idx_bot[4*i] = 16*i*rec_size; p_mb_info_idx_bot[4*i+1] = 16*i*rec_size + 8*rec_size; p_mb_info_idx_bot[4*i+2] = 16*i*rec_size; p_mb_info_idx_bot[4*i+3] = 16*i*rec_size + 8*rec_size; } } // Update context structure with allocated pointers p_enc->deblock_ctxt.p_deblk_tbls = p_deblk_tbls; p_enc->deblock_ctxt.p_y_idx = p_y_idx; p_enc->deblock_ctxt.p_uv_idx = p_uv_idx; p_enc->deblock_ctxt.p_lf_mb_info_idx = p_lf_mb_info_idx; p_enc->deblock_ctxt.p_mb_info_idx = p_mb_info_idx; p_enc->deblock_ctxt.p_mb_info_idx_bot = p_mb_info_idx_bot; // Flush cache to ensure all the data is written back to // ext mem. spi_flush_entire_data_cache(); return;}void free_deblock_context(encoder_context_t *p_enc){ spi_free(p_enc->deblock_ctxt.p_deblk_tbls); spi_free(p_enc->deblock_ctxt.p_y_idx); spi_free(p_enc->deblock_ctxt.p_uv_idx); spi_free(p_enc->deblock_ctxt.p_lf_mb_info_idx); spi_free(p_enc->deblock_ctxt.p_mb_info_idx); spi_free(p_enc->deblock_ctxt.p_mb_info_idx_bot);}//--------------------------------------------------------------------// StreamC function for deblocking the whole frame in H.264 encoder// Deblocking for decoder needs some of the flags below at slice/MB// level. 
Also data structures used in decoder are different from encoder//-------------------------------------------------------------------- void deblock_frame_sc( encoder_context_t *p_enc ){ S_BLK_MB_INFO_COMPRESSED *p_blk_mb_info = p_enc->p_blk_mb_info; int disable_deblocking_filter_idc = p_enc->loopfilter_params.disable_flag; int slice_alpha_c0_offset = p_enc->loopfilter_params.alpha_c0_offset; int slice_beta_offset = p_enc->loopfilter_params.beta_offset; yuv_frame_t *frame = p_enc->pRecFrame; // Input & Output int image_width = frame->image_width; int image_height = frame->image_height; int rec_width = frame->width; int rec_height = frame->height; int mb_numx = image_width>>4; int mb_numy = image_height>>4; // py should point 0,0 of unpadded frame int offset_to00_y = rec_width*(rec_height - image_height)/2 + (rec_width - image_width)/2; int offset_to00_u = rec_width*(rec_height - image_height)/8 + (rec_width - image_width)/4; unsigned char *py = frame->y + offset_to00_y; unsigned char *u = frame->u + offset_to00_u; unsigned char *v; int alpha_ofs = slice_alpha_c0_offset; int beta_ofs = slice_beta_offset; int disable_filter = disable_deblocking_filter_idc; // For better SCRT performance pack these offsets into a single // word. 
int packed_alpha_beta_disfil = (alpha_ofs << 24) | ((beta_ofs & 0xFF) << 16) | (disable_filter << 8); int *p_deblk_tbls = p_enc->deblock_ctxt.p_deblk_tbls; int *p_y_idx = p_enc->deblock_ctxt.p_y_idx; int *p_uv_idx = p_enc->deblock_ctxt.p_uv_idx; int *p_lf_mb_info_idx = p_enc->deblock_ctxt.p_lf_mb_info_idx; int *p_mb_info_idx = p_enc->deblock_ctxt.p_mb_info_idx; int *p_mb_info_idx_bot = p_enc->deblock_ctxt.p_mb_info_idx_bot; // Do not access p_enc directly below this line int dummy = (p_enc = NULL) == NULL; // set p_enc to NULL int s, x, y, i; int num_chroma_pix = (rec_width * rec_height)/4; ////////////////////////////////////////////////////// // Setup data: Y, U, and V planes, MB context, and lookup tables ////////////////////////////////////////////////////// // Lookup tables stream uint8x4 deblock_tbls_str((6*NUM_QP/4)*SPI_LANES); //stream uint8x4 QP2ChromaMappingStr((NUM_QP/4)*SPI_LANES); //stream uint8x4 IndexATableStr((NUM_QP*4/4)*SPI_LANES); //stream uint8x4 IndexBTableStr((NUM_QP/4)*SPI_LANES); // Throughout the code, the actual number of macroblocks a kernel // will process will actually be strip_size+1 (except for maybe the // last strip on each row). This is because we have to account // for the extra boundary macroblock that needs to be processed in // order to stitch together the filtering of two adjacent strips // (See comments below on the "Main stream loop"). So we have to // use strip_size+1 in many places. int strip_size = MAX_STRIP_SIZE; int num_strips; // Declare streams based on maximum strip size stream uint8x4 frame_strip_y(Y_STRIP_LEN); stream uint8x4 frame_strip_top_y(Y_STRIP_LEN); stream uint8x4 frame_strip_uv(UV_STRIP_LEN); stream uint8x4 frame_strip_top_uv(UV_STRIP_LEN); //stream R_BLK_MB_INFO mb_info_strip(MB_INFO_STRIP_LEN, DEBLK_MBINFO_LRF_OFFSET); // mb_info_strip and a_b_tc_str and short_mb_info_strip are combined into // a single stream without any overlapping. 
stream uint32x1 mb_info_strip(MB_INFO_STRIP_LEN * MBINFO_SIZE_IN_WORDS); stream R_DEBLK_COMP_MB_INFO short_mb_info_strip(MB_INFO_STRIP_LEN); stream uint8x4 bs_a_b_tc_str(BS_ABTC_STR_LEN); stream uint8x4 frame_strip_bot_y_inter(Y_BOT_INTER_STRIP_LEN); stream uint8x4 frame_strip_bot_uv_inter(UV_BOT_INTER_STRIP_LEN); // Temporary streams - not needed across iterations // Combined y_inter and topy_inter into a single stream to reduce // SCRT overhead. stream uint8x4 frame_strip_y_inter(Y_INTER_STRIP_LEN + Y_TOP_INTER_STRIP_LEN); // Declare Index streams used for deriving input/output data // Hardware needs even the index streams to be sized in multiples // of SPI_LANES. // First 16 entries are used for non-last row of the frame // and last 16 entries are used for last row. stream int32x1 y_idx_str(32); //2 is sufficient stream int32x1 y_idx_str_bot(32); //2 is sufficient stream int32x1 uv_idx_str(32); //4 is sufficient stream int32x1 uv_idx_str_bot(32); //4 is sufficient stream int32x1 lf_mb_info_idx_str(16); // Make SPC happy by having each index point to 4 mb_infos instead // of 16 earlier. stream int32x1 mb_info_idx_str(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4)); stream int32x1 mb_info_idx_str_bot(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4)); // Streams used for padding/colflattening of top and bottom rows // of each frame. This padding is done across the whole width of // the frame after all strips are deblocked. All streams below are // setup to support max frame width of 1920. stream uint8x4 top_row_y(Y_FRMTOP_STRIP_LEN, 64); stream uint8x4 top_row_u(U_FRMTOP_STRIP_LEN, 64 + Y_FRMTOP_STRIP_LEN*4); stream uint8x4 top_row_v(V_FRMTOP_STRIP_LEN, 64 + Y_FRMTOP_STRIP_LEN*4 + U_FRMTOP_STRIP_LEN*4); ////////////////////////////////////////////////////// // Main stream loop ////////////////////////////////////////////////////// // // The computation is strip-mined such that a strip is a // consecutive group of macroblocks all on the same row. 
The // StreamC code is written to process strips in vertical order, // rather than in row-major order. That is, the loop processes // the strips containing the macroblocks at macroblock coordinates // (0,0) to (N,0), then (0,1) to (N,1), etc. At this point, it // goes back to the top and processes MB strips (N,0) to (2N-1,0), // then (N,1) to (2N-1,1), and so on. (Notice the overlap of the // Nth macroblock on each row in each pair of strips). // // This overlap is needed because of the nature of the deblocking // algorithm described in the H.264 standard. In order to // conform to the standard, if the last macroblock (call it // LastStripMB) is on the right edge of the image, no special // boundary conditions need to be handled. If LastStripMB is in // the interior of the image, then only the vertical edges are // filtered for that macroblock. In this case, when the strip // next to the current one is processed (i.e., the strip that // contains the macroblocks from LastStripMB to // LastStripMB+strip_size), only the horizontal edges will be // filtered for LastStripMB. But notice that LastStripMB must be // processed by the kernel twice. This is slightly inefficient, // but if the strips are large enough (say > 25 // macroblocks/strip) this has a negligible impact on performance.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -