📄 h264dsputil.c
字号:
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */

#include <stdint.h>

#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
#include "simple_idct.h"
#include "faandct.h"
#include "h263.h"
#include "snow.h"

#ifdef JZ4740_MXU_OPT
#include "jzmedia.h"
#endif

#ifdef JZ4740_MXU_OPT
//put_pixels2_mxu (need not op by MXU)

/**
 * Copy a block 4 bytes wide and h rows high using MXU word accesses.
 *
 * src may have any alignment: the two aligned words surrounding it are
 * loaded and S32ALN picks the four wanted bytes out of the pair.
 *
 * NOTE(review): S32LDD / S32ALN / S32SDIV are macros from jzmedia.h, which is
 * not visible here. dst is biased by -stride before the loop, which suggests
 * S32SDIV advances dst by stride before storing -- confirm against the MXU
 * manual. dst is presumably required to be 32-bit aligned -- TODO confirm.
 *
 * @param dst    destination block
 * @param src    source block, any byte alignment
 * @param stride byte step between rows (added to the source address per row)
 * @param h      number of rows
 */
static void put_pixels4_mxu(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    /* Go through uintptr_t: casting a pointer straight to uint32_t is not
     * a portable pointer->integer conversion. */
    const uintptr_t src_addr = (uintptr_t)src;
    uint32_t src_aln, src_rs;
    int i;

    src_aln = (uint32_t)(src_addr & ~(uintptr_t)3);   /* word-aligned base   */
    src_rs  = 4 - (uint32_t)(src_addr & 3);           /* S32ALN select value */
    dst -= stride;
    for (i = 0; i < h; i++) {
        S32LDD(xr1, src_aln, 0);
        S32LDD(xr2, src_aln, 4);
        src_aln += stride;
        S32ALN(xr1, xr2, xr1, src_rs);    //xr1 <- src[3:0]
        S32SDIV(xr1, dst, stride, 0);
    }
}

/**
 * Copy a block 8 bytes wide and h rows high using MXU word accesses.
 *
 * Three aligned words are loaded per row and combined pairwise with S32ALN
 * so that src may have any alignment.
 *
 * NOTE(review): both dst and the aligned source address are biased by
 * -stride before the loop, which suggests S32LDIV / S32SDIV advance their
 * address operand by stride before the access -- confirm against jzmedia.h.
 *
 * @param dst    destination block
 * @param src    source block, any byte alignment
 * @param stride byte step between rows
 * @param h      number of rows
 */
static void put_pixels8_mxu(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    const uintptr_t src_addr = (uintptr_t)src;
    uint32_t src_aln, src_rs;
    int i;

    src_aln = (uint32_t)(src_addr & ~(uintptr_t)3);
    src_rs  = 4 - (uint32_t)(src_addr & 3);
    dst     -= stride;
    src_aln -= stride;
    for (i = 0; i < h; i++) {
        S32LDIV(xr1, src_aln, stride, 0);
        S32LDD(xr2, src_aln, 4);
        S32LDD(xr3, src_aln, 8);
        S32ALN(xr1, xr2, xr1, src_rs);    //xr1 <- src[3:0]
        S32ALN(xr2, xr3, xr2, src_rs);    //xr2 <- src[7:4]
        S32SDIV(xr1, dst, stride, 0);
        S32STD(xr2, dst, 4);
    }
}

/**
 * Copy a block 16 bytes wide and h rows high using MXU word accesses.
 *
 * Five aligned words are loaded per row and combined pairwise with S32ALN
 * so that src may have any alignment.
 *
 * NOTE(review): same assumptions about S32LDIV / S32SDIV address update as
 * in put_pixels8_mxu -- confirm against jzmedia.h.
 *
 * @param dst    destination block
 * @param src    source block, any byte alignment
 * @param stride byte step between rows
 * @param h      number of rows
 */
static void put_pixels16_mxu(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    const uintptr_t src_addr = (uintptr_t)src;
    uint32_t src_aln, src_rs;
    int i;

    src_aln = (uint32_t)(src_addr & ~(uintptr_t)3);
    src_rs  = 4 - (uint32_t)(src_addr & 3);
    dst     -= stride;
    src_aln -= stride;
    for (i = 0; i < h; i++) {
        S32LDIV(xr1, src_aln, stride, 0);
        S32LDD(xr2, src_aln, 4);
        S32LDD(xr3, src_aln, 8);
        S32LDD(xr4, src_aln, 12);
        S32LDD(xr5, src_aln, 16);
        S32ALN(xr1, xr2, xr1, src_rs);    //xr1 <- src[3:0]
        S32ALN(xr2, xr3, xr2, src_rs);    //xr2 <- src[7:4]
        S32ALN(xr3, xr4, xr3, src_rs);    //xr3 <- src[11:8]
        S32ALN(xr4, xr5, xr4, src_rs);    //xr4 <- src[15:12]
        S32SDIV(xr1, dst, stride, 0);
        S32STD(xr2, dst, 4);
        S32STD(xr3, dst, 8);
        S32STD(xr4, dst, 12);
    }
}
#endif

/**
 * Generate the C pixel block helpers for one store operation.
 *
 * For a given OPNAME (function name prefix) and OP (a two-argument
 * "destination, value" macro) this expands to:
 *  - OPNAME_pixels2_c / _pixels4_c / _pixels8_c: for each of h rows, apply OP
 *    to the first 2 / 4 / 8 bytes of block using the bytes read from pixels;
 *    both pointers advance by line_size per row.
 *  - OPNAME_pixels8_l2 / _pixels4_l2 / _pixels2_l2 / _pixels16_l2: for each of
 *    h rows, apply OP to dst using rnd_avg32() of the corresponding words of
 *    src1 and src2, each source and dst having its own stride.
 *  - OPNAME_pixels16_c, produced by CALL_2X_PIXELS from OPNAME_pixels8_c.
 *
 * NOTE(review): AV_RN16 / AV_RN32, rnd_avg32 and CALL_2X_PIXELS are defined
 * outside this file section; block / dst are written through uint16_t* /
 * uint32_t* casts, so they are presumably expected to be suitably aligned --
 * TODO confirm.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)

/* "put" overwrites the destination, "avg" blends it with the new value. */
#define op_put(a, b) a = b
#define op_avg(a, b) a = rnd_avg32(a, b)
PIXOP2(put, op_put)
PIXOP2(avg, op_avg)
#undef op_avg
#undef op_put

#ifdef JZ4740_MXU_OPT
/**
 * MXU version of the H.264 luma loop filter working across rows
 * (neighbouring samples are xstride bytes apart).
 *
 * The edge is handled as four groups of 4 bytes. For every group i, six
 * 32-bit words are loaded from consecutive rows (labelled p2, p1, p0, q0, q1,
 * q2 by the inline comments, offsets -3..+2 rows relative to pix) and four
 * words are stored back (p1, p0, q0, q1). A group whose tc0[i] is negative is
 * skipped and pix simply advances by 4 bytes.
 *
 * NOTE(review): all arithmetic is expressed through jzmedia.h register
 * macros (xr0..xr15) whose definitions are not visible here. The per-line
 * comments record the scalar formula each step is meant to implement; verify
 * against the MXU manual before reordering anything, since later steps
 * consume registers written by earlier ones.
 *
 * @param pix     first byte of the row labelled q0
 * @param xstride distance in bytes between successive rows
 * @param alpha   threshold compared with FFABS(p0 - q0)
 * @param beta    threshold compared with FFABS(p1 - p0), FFABS(q1 - q0),
 *                FFABS(p2 - p0) and FFABS(q2 - q0)
 * @param tc0     four clip values, one per 4-byte group; negative means skip
 */
static void h264_v_loop_filter_luma_mxu(uint8_t *pix, int xstride, int alpha, int beta, int8_t *tc0)
{
    int i;
    uint8_t *tpix;

    S32I2M (xr14, beta);
    S32SFL (xr0, xr14, xr14, xr14, ptn0);
    S32SFL (xr0, xr14, xr14, xr14, ptn3);    // xr14: beta
    S32I2M (xr13, alpha);
    S32SFL (xr0, xr13, xr13, xr13, ptn0);
    S32SFL (xr0, xr13, xr13, xr13, ptn3);    // xr13: alpha

    for( i = 0; i < 4; i++ ) {
        int t0 = tc0[i];
        if( t0 < 0 ) {
            pix += 4;
            continue;
        }
        S32I2M (xr15, t0);
        S32SFL (xr0, xr15, xr15, xr15, ptn0);
        S32SFL (xr0, xr15, xr15, xr15, ptn3);    // xr15: tc0[i]

        tpix = pix - 4*xstride;
        S32LDIV(xr12, tpix, xstride, 0);    //-3, p2
        S32LDIV(xr11, tpix, xstride, 0);    //-2, p1
        S32LDIV(xr10, tpix, xstride, 0);    //-1, p0
        S32LDIV(xr9,  tpix, xstride, 0);    // 0, q0
        S32LDIV(xr1,  tpix, xstride, 0);    // 1, q1
        S32LDIV(xr2,  tpix, xstride, 0);    // 2, q2

        Q8ABD(xr3, xr10, xr9 );    // FFABS (p0 - q0)
        Q8ABD(xr4, xr11, xr10);    // FFABS (p1 - p0)
        Q8ABD(xr5, xr1,  xr9 );    // FFABS (q1 - q0)

        // FFABS(p0 - q0) - alpha, FFABS(p1 - p0) - beta
        Q8ADDE_SS (xr6, xr3, xr13, xr7);    // FFABS(p0 - q0) - alpha
        Q8ADDE_SS (xr3, xr4, xr14, xr4);    // FFABS(p1 - p0) - beta
        Q16SLR (xr6, xr6, xr7, xr7, 15);    // 1: < 0 (FFABS( p0 - q0 ) < alpha)
        Q16SLR (xr3, xr3, xr4, xr4, 15);    // 1: < 0 (FFABS( p1 - p0 ) < beta)
        Q16SAT (xr4, xr3, xr4);             // xr4: 1: < 0 (FFABS( p1 - p0 ) < beta)
        Q16SAT (xr3, xr6, xr7);             // xr3: 1: < 0 (FFABS( p0 - q0 ) < alpha)

        // FFABS(q1 - q0) - beta
        Q8ADDE_SS (xr6, xr5, xr14, xr7);    // FFABS(q1 - q0) - beta
        Q16SLR (xr6, xr6, xr7, xr7, 15);    // 1: < 0 (FFABS( q1 - q0 ) < beta)
        Q16SAT (xr5, xr6, xr7);             // xr5: 1: < 0 (FFABS( q1 - q0 ) < beta)

        // if( FFABS( p0 - q0 ) < alpha && ..)
        Q8MADL_AA (xr0, xr3, xr4, xr3);
        Q8ABD(xr6, xr12, xr10 );            // FFABS (p2 - p0)
        Q8ABD(xr7, xr2,  xr9 );             // FFABS (q2 - q0)
        Q8MADL_AA (xr0, xr3, xr5, xr3);     // xr3: (FFABS( p0 - q0 ) < alpha
                                            // && FFABS( p1 - p0 ) < beta && ...)
        Q8ADDE_SS (xr4, xr6, xr14, xr5);    // FFABS(p2 - p0) - beta
        Q8ADDE_SS (xr6, xr7, xr14, xr7);    // FFABS(q2 - q0) - beta
        Q8MADL_AA (xr0, xr3, xr15, xr15);   // xr15: new clip value
        Q16SLR (xr4, xr4, xr5, xr5, 15);    // 1: < 0 FFABS(p2 - p0) - beta
        Q16SLR (xr6, xr6, xr7, xr7, 15);    // 1: < 0 FFABS(q2 - q0) - beta
        Q16SAT (xr4, xr4, xr5);             // xr4: 1: < 0 (FFABS( p2 - p0 ) < beta)
        Q16SAT (xr5, xr6, xr7);             // xr5: 1: < 0 (FFABS( q2 - q0 ) < beta)

        // calculate clip value
        Q8ADD_AA (xr6, xr4, xr5);           // pre-calculate for tc++, tc++
        Q8MADL_AA (xr0, xr6, xr3, xr6);     // xr6: new clip differ
        Q8MADL_AA (xr0, xr4, xr15, xr4);    // xr4: new clip value for p1
        Q8MADL_AA (xr0, xr5, xr15, xr5);    // xr5: new clip value for q1
        Q8ADD_AA (xr3, xr6, xr15);          // xr3: new clip value for p0,q0

        Q8AVGR (xr6, xr9, xr10);            // (p0 + q0 + 1) >> 1
        Q8AVG (xr12, xr6, xr12);            // (p2 + (p0 + q0 + 1) >> 1) >> 1
        Q8AVG (xr2, xr6, xr2);              // (q2 + (p0 + q0 + 1) >> 1) >> 1
        Q8ADDE_SS (xr12, xr12, xr11, xr7);  // (p2 + (p0 + q0 + 1) >> 1) >> 1 - p1
        Q8ADDE_SS (xr2, xr2, xr1, xr8);     // (q2 + (p0 + q0 + 1) >> 1) >> 1 - q1
        Q8ADDE_SS (xr6, xr0, xr4, xr15);    //-tc

        //av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
        D16MAX (xr12, xr12, xr6);
        D16MAX (xr7, xr7, xr15);
        Q8ADDE_AA (xr6, xr0, xr4, xr15);    //tc
        D16MIN (xr12, xr12, xr6);
        D16MIN (xr7, xr7, xr15);
        Q8ADDE_SS (xr6, xr0, xr5, xr15);    //-tc
        S32SFL (xr0, xr12, xr7, xr12, ptn1);

        //av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
        D16MAX (xr2, xr2, xr6);
        D16MAX (xr8, xr8, xr15);
        Q8ADDE_AA (xr6, xr0, xr5, xr15);    //tc
        D16MIN (xr2, xr2, xr6);
        D16MIN (xr8, xr8, xr15);

        // p1 + ...; q1 + ...
        Q8ADD_AA (xr4, xr12, xr11);         // p1 + ... //final p1
        S32SFL (xr0, xr2, xr8, xr2, ptn1);
        Q8ADD_AA (xr5, xr2, xr1);           // q1 + ... //final q1

        // (q0 - p0 ) << 2 + (p1 - q1)
        Q8ADDE_SS (xr2, xr9, xr10, xr12);   // q0 - p0
        Q16SLL (xr2, xr2, xr12, xr12, 2);   // (q0 - p0) << 2
        Q8ACCE_SS (xr2, xr11, xr1, xr12);   // (q0 - p0) << 2 + (p1 - q1)

        // -tc, tc
        Q8ADDE_AA (xr11, xr0, xr3, xr15);   //+tc (xr11, xr15)
        Q8ADDE_SS (xr1, xr0, xr3, xr6);     //-tc (xr1, xr6)

        // i_delta = av_clip (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3 ....)
        Q16SAR (xr2, xr2, xr12, xr12, 2);   // ((q0 - p0) << 2 + (p1 - q1) + 0) >> 2
        D16AVGR(xr2, xr2, xr0);
        D16AVGR(xr12, xr12, xr0);           // ((q0 - p0) << 2 + (p1 - q1) + 1) >> 1
        D16MAX (xr2, xr2, xr1);
        D16MAX (xr12, xr12, xr6);
        D16MIN (xr2, xr2, xr11);
        D16MIN (xr12, xr12, xr15);
        Q16ADD_SS_WW (xr1, xr0, xr2, xr0);    // xr1 = -xr2
        Q16ADD_SS_WW (xr11, xr0, xr12, xr0);  // xr11 = -xr12

        // pix[-xstride]=...; pix[0]=...;
        Q8ACCE_AA (xr2, xr0, xr10, xr12);   // (p0 + i_delta)
        Q8ACCE_AA (xr1, xr0, xr9, xr11);    // (q0 - i_delta)
        tpix = pix - 2*xstride;
        Q16SAT (xr10, xr2, xr12);           // final p0
        Q16SAT (xr9, xr1, xr11);            // final q0

        // store
        S32STD (xr4, tpix, 0);              //-2, p1
        S32SDIV(xr10, tpix, xstride, 0);    //-1, p0
        S32SDIV(xr9, tpix, xstride, 0);     // 0, q0
        S32SDIV(xr5, tpix, xstride, 0);     // 1, q1
        pix += 4;
    }
}

static void h264_h_loop_filter_luma_mxu(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    int i, d, t0;
    uint8_t *tpix;
    unsigned int t1, t2, t3, t4;

    S32I2M (xr14, beta);
    S32SFL (xr0, xr14, xr14, xr14, ptn0);
    S32SFL (xr0, xr14, xr14, xr14, ptn3);    // xr14: beta
    S32I2M (xr13, alpha);
    S32SFL (xr0, xr13, xr13, xr13, ptn0);
    S32SFL (xr0, xr13, xr13, xr13, ptn3);    // xr13: alpha

    for( i = 0; i < 4; i++ ) {
        t0 = tc0[i];
        if( t0 < 0 ) {
            pix += 4*stride;
            continue;
        }
        S32I2M (xr15, t0);
        S32SFL (xr0, xr15, xr15, xr15, ptn0);
        /* NOTE(review): the reviewed text ends here, mid-function; the rest
         * of this body was not visible and is left untouched. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -