📄 deblock_sc.sc

📁 deblocking 在SPI DSP平台优化好的代码,超级强
💻 SC
📖 第 1 页 / 共 2 页
字号:
12 下一页
// -------------------------------------------------------------------// ?2006 Stream Processors, Inc.  All rights reserved.// This Software is the property of Stream Processors, Inc. (SPI) and// is Proprietary and Confidential.  It has been provided under// license for solely use in evaluating and/or developing code for a// stream processor device.  Any use of the Software to develop code// for a semiconductor device not manufactured by or for SPI is// prohibited.  Unauthorized use of this Software is strictly// prohibited.//// THIS SOFTWARE IS PROVIDED "AS IS".  NO WARRANTIES ARE GIVEN,// WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING WARRANTIES OR// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE,// NONINFRINGEMENT AND TITLE.  RECIPIENT SHALL HAVE THE SOLE// RESPONSIBILITY FOR THE ADEQUATE PROTECTION AND BACK-UP OF ITS DATA// USED IN CONNECTION WITH THIS SOFTWARE. IN NO EVENT WILL SPI BE// LIABLE FOR ANY CONSEQUENTIAL DAMAGES WHATSOEVER, INCLUDING LOSS OF// DATA OR USE, LOST PROFITS OR ANY INCIDENTAL OR SPECIAL DAMAGES,// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS// SOFTWARE, WHETHER IN ACTION OF CONTRACT OR TORT, INCLUDING// NEGLIGENCE.  
SPI FURTHER DISCLAIMS ANY LIABILITY WHATSOEVER FOR// INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD// PARTY.// -------------------------------------------------------------------//--------------------------------------------------------------------//  File:              $File: //depot/main/software/apps/spi_h264e_b/deblock_sc.sc $//  Revision:          $Revision: #47 $//  Last Modified:     $DateTime: 2007/06/10 18:16:41 $////  Description://     Stream implementation of deblocking filter.////  Current version fully supports Baseline profile//--------------------------------------------------------------------#include <assert.h>#include "spi_common.h"#include "mb_info.h"#include "encoder_context.h"#include "encoder_tables.h"#include "deblock_kc.h"// Macro used to define streams#ifndef MAX_STRIP_SIZE#define MAX_STRIP_SIZE 44#endif // MAX_STRIP_SIZE// Maximum frame width deblocking can handle#define MAX_DEBLOCK_FRM_WIDTH  1920#define MAX_DEBLOCK_FRM_WIDTHC (MAX_DEBLOCK_FRM_WIDTH/2)// 2 Extra MBs for padding#define Y_STRIP_LEN_HDEC        (((MAX_STRIP_SIZE+1+2)*2*8*8)/4)#define Y_STRIP_LEN_QDEC        ((2*(MAX_STRIP_SIZE+1+2)*2*4*4)/4)#define Y_STRIP_LEN             (((MAX_STRIP_SIZE+1+2)*2*16*16)/4)#define UV_STRIP_LEN            (((MAX_STRIP_SIZE+1+2)*2*2*8*8)/4)#define Y_INTER_STRIP_LEN       (((MAX_STRIP_SIZE+1)+3)*8*SPI_LANES)#define Y_TOP_INTER_STRIP_LEN   (((MAX_STRIP_SIZE+1)+3)*4*SPI_LANES)#define Y_BOT_INTER_STRIP_LEN   Y_TOP_INTER_STRIP_LEN#define UV_INTER_STRIP_LEN      (((MAX_STRIP_SIZE+1)+3)*4*SPI_LANES) #define UV_TOP_INTER_STRIP_LEN  (((MAX_STRIP_SIZE+1)+3)*1*SPI_LANES)#define UV_BOT_INTER_STRIP_LEN  (((MAX_STRIP_SIZE+1)+3)*2*SPI_LANES)#define MV_INFO_STRIP_LEN       ((MAX_STRIP_SIZE+1) * BLOCKS_PER_MB * 2)#define LF_MB_INFO_STRIP_LEN    ((MAX_STRIP_SIZE+1) * SPI_LANES)#define MB_INFO_STRIP_LEN       ((MAX_STRIP_SIZE+1) * BLOCKS_PER_MB * 2)#define BS_ABTC_STR_LEN         ((MAX_STRIP_SIZE+1+3)*SPI_LANES*12)#define Y_FRMTOP_STRIP_LEN      
(16*(MAX_DEBLOCK_FRM_WIDTH+2*16)/4)#define U_FRMTOP_STRIP_LEN      (16*(MAX_DEBLOCK_FRM_WIDTHC+2*8)/4)#define V_FRMTOP_STRIP_LEN      U_FRMTOP_STRIP_LEN#define Y_FRMTOP_CF_STRIP_LEN   Y_FRMTOP_STRIP_LEN#define UV_FRMTOP_CF_STRIP_LEN  U_FRMTOP_STRIP_LEN#define Y_FRMTOP_STRIP_LEN_HLF  (16*(MAX_DEBLOCK_FRM_WIDTH/2+2*8)/4)#define Y_FRMTOP_STRIP_LEN_QT  (16*(MAX_DEBLOCK_FRM_WIDTH/4+2*4)/4)#define ROUND_UPTO_NEXT2nX(x, n) (((x) + (1<<(n)) - 1) & ~((1<<(n)) - 1))// Function to allocate and initialize deblocking context at the start of a sequence// encoding. It is assumed that frame height and width would stay the same// through all frames in a sequence.void init_deblock_context(encoder_context_t *p_enc){    yuv_frame_t *frame = p_enc->pRecFrame;    int image_width = frame->image_width;    int image_height = frame->image_height;    int rec_width = frame->width;    int rec_height = frame->height;    int mb_numx = image_width>>4;    int mb_numy = image_height>>4;    int num_chroma_pix = (rec_width * rec_height)/4;    // All tables used by deblocking collapsed into single    // memory block. All tables are sized to in multiple of    // words.    int *p_deblk_tbls = spi_malloc(DEBLK_TBLS_SIZE * sizeof(int));    int *p_y_idx = spi_malloc(32*2 * sizeof(int));    int *p_uv_idx = spi_malloc(32*2 * sizeof(int));    int *p_lf_mb_info_idx = spi_malloc(16 * sizeof(int));    int *p_mb_info_idx = spi_malloc(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4) * sizeof(int));    int *p_mb_info_idx_bot = spi_malloc(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4) * sizeof(int));    int i;    if (! 
(p_y_idx && p_uv_idx && p_lf_mb_info_idx && p_mb_info_idx &&           p_mb_info_idx_bot)){        spi_error_printf(" Failed to allocate memory for deblock context %s:%d \n",                         __FILE__, __LINE__);    }        // Copy tables into unified table memory    memcpy(p_deblk_tbls + DEBLK_IDXA_TBL_OFFSET, IndexATable, sizeof(IndexATable));    memcpy(p_deblk_tbls + DEBLK_IDXB_TBL_OFFSET, IndexBTable, sizeof(IndexBTable));    memcpy(p_deblk_tbls + DEBLK_QP2CHR_TBL_OFFSET, QP_TO_CHROMA_MAPPING, sizeof(QP_TO_CHROMA_MAPPING));        for (i = 0; i < 16; i++){        p_y_idx[i]     = i*2*rec_width;        p_y_idx[16+i]  = (i*2+1)*rec_width;        p_uv_idx[i]    = i*(rec_width/2);        p_uv_idx[16+i] = i*(rec_width/2) + num_chroma_pix;    }    //Populate MB Info indices    for (i = 0; i < (MAX_STRIP_SIZE+1); i++){        int rec_size = sizeof(S_BLK_MB_INFO_COMPRESSED);        p_mb_info_idx[4*i]   = 16*i*rec_size;        p_mb_info_idx[4*i+1] = 16*i*rec_size + 8*rec_size;        p_mb_info_idx[4*i+2] = 16*i*rec_size + 16*mb_numx*rec_size;        p_mb_info_idx[4*i+3] = 16*i*rec_size + 16*mb_numx*rec_size + 8*rec_size;    }        // Generate bottom index streams for frames whose height is not    // a multiple of 32.    if (mb_numy % 2){        for (i = 0; i < 16; i++){            p_y_idx[32+i] = (i >= 8) ? 0 : (mb_numy-1)*rec_width*16 + i*2*rec_width;            p_y_idx[48+i] = (i >= 8) ? 0 : (mb_numy-1)*rec_width*16 + (i*2+1)*rec_width;            // If frame height is not a 32 multiple, bottom            // most row will load inconsequential data from 0th row            // of the frame. And this load should also 0 as base offset            // to avoid using negative values in index stream. Note bot            // stream is used only when height is not multiple of 32!!            p_uv_idx[32+i] = (i >= 8) ? 
0 : ((mb_numy-1)*(rec_width/2)*8 + i*(rec_width/2));            p_uv_idx[48+i] = p_uv_idx[32+i] + num_chroma_pix;        }        for (i = 0; i < (MAX_STRIP_SIZE+1); i++){            int rec_size = sizeof(S_BLK_MB_INFO_COMPRESSED);            p_mb_info_idx_bot[4*i]   = 16*i*rec_size;            p_mb_info_idx_bot[4*i+1] = 16*i*rec_size + 8*rec_size;            p_mb_info_idx_bot[4*i+2] = 16*i*rec_size;            p_mb_info_idx_bot[4*i+3] = 16*i*rec_size + 8*rec_size;        }    }    // Update context structure with allocated pointers    p_enc->deblock_ctxt.p_deblk_tbls        = p_deblk_tbls;    p_enc->deblock_ctxt.p_y_idx             = p_y_idx;    p_enc->deblock_ctxt.p_uv_idx            = p_uv_idx;    p_enc->deblock_ctxt.p_lf_mb_info_idx    = p_lf_mb_info_idx;    p_enc->deblock_ctxt.p_mb_info_idx       = p_mb_info_idx;    p_enc->deblock_ctxt.p_mb_info_idx_bot   = p_mb_info_idx_bot;        // Flush cache to ensure all the data is written back to    // ext mem.    spi_flush_entire_data_cache();        return;}void free_deblock_context(encoder_context_t *p_enc){    spi_free(p_enc->deblock_ctxt.p_deblk_tbls);    spi_free(p_enc->deblock_ctxt.p_y_idx);    spi_free(p_enc->deblock_ctxt.p_uv_idx);    spi_free(p_enc->deblock_ctxt.p_lf_mb_info_idx);    spi_free(p_enc->deblock_ctxt.p_mb_info_idx);    spi_free(p_enc->deblock_ctxt.p_mb_info_idx_bot);}//--------------------------------------------------------------------// StreamC function for deblocking the whole frame in H.264 encoder// Deblocking for decoder needs some of the flags below at slice/MB// level. 
Also data structures used in decoder are different from encoder//-------------------------------------------------------------------- void deblock_frame_sc(                      encoder_context_t *p_enc                      ){    S_BLK_MB_INFO_COMPRESSED *p_blk_mb_info  = p_enc->p_blk_mb_info;    int disable_deblocking_filter_idc  = p_enc->loopfilter_params.disable_flag;    int slice_alpha_c0_offset          = p_enc->loopfilter_params.alpha_c0_offset;    int slice_beta_offset              = p_enc->loopfilter_params.beta_offset;    yuv_frame_t *frame                 = p_enc->pRecFrame; // Input & Output    int image_width = frame->image_width;    int image_height = frame->image_height;    int rec_width = frame->width;    int rec_height = frame->height;    int mb_numx = image_width>>4;    int mb_numy = image_height>>4;    // py should point 0,0 of unpadded frame    int offset_to00_y = rec_width*(rec_height - image_height)/2 + (rec_width - image_width)/2;    int offset_to00_u = rec_width*(rec_height - image_height)/8 + (rec_width - image_width)/4;    unsigned char *py = frame->y + offset_to00_y;    unsigned char *u  = frame->u + offset_to00_u;    unsigned char *v;        int alpha_ofs  = slice_alpha_c0_offset;    int beta_ofs   = slice_beta_offset;    int disable_filter = disable_deblocking_filter_idc;    // For better SCRT performance pack these offsets into a single    // word.    
int packed_alpha_beta_disfil =        (alpha_ofs << 24) | ((beta_ofs & 0xFF) << 16) | (disable_filter << 8);    int *p_deblk_tbls = p_enc->deblock_ctxt.p_deblk_tbls;    int *p_y_idx = p_enc->deblock_ctxt.p_y_idx;    int *p_uv_idx = p_enc->deblock_ctxt.p_uv_idx;    int *p_lf_mb_info_idx = p_enc->deblock_ctxt.p_lf_mb_info_idx;    int *p_mb_info_idx = p_enc->deblock_ctxt.p_mb_info_idx;    int *p_mb_info_idx_bot = p_enc->deblock_ctxt.p_mb_info_idx_bot;    // Do not access p_enc directly below this line    int dummy = (p_enc = NULL) == NULL;   // set p_enc to NULL        int s, x, y, i;    int num_chroma_pix = (rec_width * rec_height)/4;    //////////////////////////////////////////////////////    // Setup data: Y, U, and V planes, MB context, and lookup tables    //////////////////////////////////////////////////////        // Lookup tables    stream uint8x4 deblock_tbls_str((6*NUM_QP/4)*SPI_LANES);    //stream uint8x4 QP2ChromaMappingStr((NUM_QP/4)*SPI_LANES);    //stream uint8x4 IndexATableStr((NUM_QP*4/4)*SPI_LANES);    //stream uint8x4 IndexBTableStr((NUM_QP/4)*SPI_LANES);        // Throughout the code, the actual number of macroblocks a kernel    // will process will actually be strip_size+1 (except for maybe the    // last strip on each row).  This is because we have to account    // for the extra boundary macroblock that needs to be processed in    // order to stitch together the filtering of two adjacent strips    // (See comments below on the "Main stream loop").  So we have to    // use strip_size+1 in many places.    
int strip_size = MAX_STRIP_SIZE;    int num_strips;    // Declare streams based on maximum strip size        stream uint8x4 frame_strip_y(Y_STRIP_LEN);    stream uint8x4 frame_strip_top_y(Y_STRIP_LEN);    stream uint8x4 frame_strip_uv(UV_STRIP_LEN);    stream uint8x4 frame_strip_top_uv(UV_STRIP_LEN);        //stream R_BLK_MB_INFO mb_info_strip(MB_INFO_STRIP_LEN, DEBLK_MBINFO_LRF_OFFSET);    // mb_info_strip and a_b_tc_str and short_mb_info_strip are combined into    // a single stream without any overlapping.    stream uint32x1 mb_info_strip(MB_INFO_STRIP_LEN * MBINFO_SIZE_IN_WORDS);    stream R_DEBLK_COMP_MB_INFO short_mb_info_strip(MB_INFO_STRIP_LEN);    stream uint8x4 bs_a_b_tc_str(BS_ABTC_STR_LEN);        stream uint8x4 frame_strip_bot_y_inter(Y_BOT_INTER_STRIP_LEN);    stream uint8x4 frame_strip_bot_uv_inter(UV_BOT_INTER_STRIP_LEN);        // Temporary streams - not needed across iterations    // Combined y_inter and topy_inter into a single stream to reduce    // SCRT overhead.    stream uint8x4 frame_strip_y_inter(Y_INTER_STRIP_LEN + Y_TOP_INTER_STRIP_LEN);        // Declare Index streams used for deriving input/output data    // Hardware needs even the index streams to be sized in multiples    // of SPI_LANES.    // First 16 entries are used for non-last row of the frame    // and last 16 entries are used for last row.    stream int32x1 y_idx_str(32); //2 is sufficient    stream int32x1 y_idx_str_bot(32); //2 is sufficient    stream int32x1 uv_idx_str(32); //4 is sufficient    stream int32x1 uv_idx_str_bot(32); //4 is sufficient    stream int32x1 lf_mb_info_idx_str(16);    // Make SPC happy by having each index point to 4 mb_infos instead    // of 16 earlier.    stream int32x1 mb_info_idx_str(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4));    stream int32x1 mb_info_idx_str_bot(ROUND_UPTO_NEXT2nX(4*MAX_STRIP_SIZE+4, 4));            // Streams used for padding/colflattening of top and bottom rows    // of each frame. 
This padding is done across the whole width of    // the frame after all strips are deblocked. All streams below are    // setup to support max frame width of 1920.    stream uint8x4 top_row_y(Y_FRMTOP_STRIP_LEN, 64);    stream uint8x4 top_row_u(U_FRMTOP_STRIP_LEN, 64 + Y_FRMTOP_STRIP_LEN*4);    stream uint8x4 top_row_v(V_FRMTOP_STRIP_LEN, 64 + Y_FRMTOP_STRIP_LEN*4 +                             U_FRMTOP_STRIP_LEN*4);    //////////////////////////////////////////////////////    // Main stream loop    //////////////////////////////////////////////////////    //    //  The computation is strip-mined such that a strip is a    //  consecutive group of macroblocks all on the same row.  The    //  StreamC code is written to process strips in vertical order,    //  rather than in row-major order.  That is, the loop processes    //  the strips contain the macroblocks at macroblock coordinates    //  (0,0) to (N,0), then (0,1) to (N,1), etc.  At this point, it    //  goes back to the top and process MB strips (N,0) to (2N-1,0),    //  then (N,1) to (2N-1,1), and so on.  (Notice the overlap of the    //  Nth macroblock on each row in each pair of strips).    //    //  This overlap is needed because of the nature of the deblocking    //  algorithm described in the H.264 standard.  In order to    //  conform to the standard, if the last macroblock (call it    //  LastStripMB) is on the right edge of the image, no special    //  boundary conditions need to be handled.  If LastStripMB is in    //  the interior of the image, than only the vertical edges are    //  filtered for that macroblock.  In this case, when the strip    //  next to the current one is processed (i.e., the strip that    //  contains the macroblocks from LastStripMB to    //  LastStripMB+strip_size), only the horizontal edges will be    //  filtered for LastStripMB.  But notice that LastStripMB must be    //  processed by the kernel twice.  
This is slightly inefficient,    //  but if the strips are large enough (say > 25    //  macroblocks/strip) this has a negligible impact on performance.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -