📄 quant_non_intra.c
/* quant_non_intra.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "altivec_quantize.h"

#if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(quant_non_intra)
#include <stdlib.h>
#include <string.h>
#endif

#include "vectorize.h"
#include <math.h>
#include "../mjpeg_logging.h"

#include "../../mpeg2enc/syntaxconsts.h"
#include "../../mpeg2enc/quantize_precomp.h"

/* #define AMBER_ENABLE */
#include "amber.h"

#ifdef HAVE_ALTIVEC_H
/* include last to ensure AltiVec type semantics, especially for bool. */
#include <altivec.h>
#endif

/*
 * The original C version would start-over from the beginning each time
 * clipping occurred (until saturated) which resulted in the possibility of
 * most dst[] values being re-calculated many times. This version, if clipping
 * is detected, restarts calculating from the current block. Once it's finished
 * it will re-calculate blocks that need it starting with block 0.
 */

#define QUANT_NON_INTRA_PDECL \
    struct QuantizerWorkSpace *wsp, \
    int16_t *src, int16_t *dst, \
    int q_scale_type, int dctsatlim, int *nonsat_mquant \

#define QUANT_NON_INTRA_ARGS \
    wsp, src, dst, q_scale_type, dctsatlim, nonsat_mquant

#define QUANT_NON_INTRA_PFMT \
    "wsp=0x%X, src=0x%X, dst=0x%X, q_scale_type=%d, dctsatlim=%d, " \
    "nonsat_mquant=0x%X"

int quant_non_intra_altivec(QUANT_NON_INTRA_PDECL)
{
    int mquant = *nonsat_mquant;
    int i, j, N, nzblockbits, last_block, recalc_blocks;
    vector unsigned short *pqm;
    vector unsigned short *inter_q_mat = wsp->inter_q_mat;
    signed short *ps, *pd;
    vector unsigned short zero, four;
    vector float one;
    vector unsigned short qmA, qmB;          /* quant matrix */
    vector signed short srcA, srcB;          /* source */
    vector signed short dstA, dstB;          /* destination */
    vector float sA0, sA1, sB0, sB1;         /* dividend */
    vector float dA0, dA1, dB0, dB1;         /* divisor */
    vector float reA0, reA1, reB0, reB1;     /* reciprocal */
    vector float qtA0, qtA1, qtB0, qtB1;     /* quotient */
    vector float rmA0, rmA1, rmB0, rmB1;     /* remainder */
    vector bool short selA, selB;            /* bool selector */
    vector bool short nz;                    /* non-zero */
    vector unsigned short max;               /* max value */
    vector unsigned short t1, t2, t3, t4;

    /* vuv & vu are used to share values between vector and scalar code.
     * vu lives on the stack and vuv is a vector register. Using vuv
     * instead of vu.v allows control over when read/writes to vu are done.
     */
    vector unsigned short vuv;
    union {
        /* do not use v, load vu into vuv for vector access. */
        vector unsigned short v;
        struct {
            unsigned short mquant;
            unsigned short clipvalue;
            unsigned int nz;
        } s;
    } vu;

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->inter_q_mat))
        mjpeg_error_exit1("quant_non_intra: wsp->inter_q_mat %% 16 != 0, (%d)",
            wsp->inter_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
        mjpeg_error_exit1("quant_non_intra: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
        mjpeg_error_exit1("quant_non_intra: dst %% 16 != 0, (%d)", dst);
#endif /* }}} */

#define QUANT_NON_INTRA_AB /* {{{ */ \
    qmA = vec_ld(0, pqm); \
    pqm++; \
    qmB = vec_ld(0, pqm); \
    pqm++; \
    srcA = vec_ld(0, ps); \
    ps += 8; \
    srcB = vec_ld(0, ps); \
    ps += 8; \
    \
    /* calculate divisor */ \
    vu16(dA0) = vec_mergeh(zero, qmA); \
    vu16(dA1) = vec_mergel(zero, qmA); \
    vu16(dB0) = vec_mergeh(zero, qmB); \
    vu16(dB1) = vec_mergel(zero, qmB); \
    vuv = vec_ld(0, (unsigned short*)&vu); \
    vuv = vec_splat(vuv, 0); /* splat mquant */ \
    vu32(dA0) = vec_mulo(vu16(dA0), vuv); \
    vu32(dA1) = vec_mulo(vu16(dA1), vuv); \
    vu32(dB0) = vec_mulo(vu16(dB0), vuv); \
    vu32(dB1) = vec_mulo(vu16(dB1), vuv); \
    dA0 = vec_ctf(vu32(dA0), 0); \
    dA1 = vec_ctf(vu32(dA1), 0); \
    dB0 = vec_ctf(vu32(dB0), 0); \
    dB1 = vec_ctf(vu32(dB1), 0); \
    reA0 = vec_re(dA0); \
    reA1 = vec_re(dA1); \
    reB0 = vec_re(dB0); \
    reB1 = vec_re(dB1); \
    \
    /* refinement #1 */ \
    vfp(t1) = vec_nmsub(reA0, vfp(dA0), vfp(one)); \
    vfp(t2) = vec_nmsub(reA1, vfp(dA1), vfp(one)); \
    vfp(t3) = vec_nmsub(reB0, vfp(dB0), vfp(one)); \
    vfp(t4) = vec_nmsub(reB1, vfp(dB1), vfp(one)); \
    reA0 = vec_madd(reA0, vfp(t1), reA0); \
    reA1 = vec_madd(reA1, vfp(t2), reA1); \
    reB0 = vec_madd(reB0, vfp(t3), reB0); \
    reB1 = vec_madd(reB1, vfp(t4), reB1); \
    \
    /* refinement #2 */ \
    vfp(t1) = vec_nmsub(reA0, vfp(dA0), vfp(one)); \
    vfp(t2) = vec_nmsub(reA1, vfp(dA1), vfp(one)); \
    vfp(t3) = vec_nmsub(reB0, vfp(dB0), vfp(one)); \
    vfp(t4) = vec_nmsub(reB1, vfp(dB1), vfp(one)); \
    reA0 = vec_madd(reA0, vfp(t1), reA0); \
    reA1 = vec_madd(reA1, vfp(t2), reA1); \
    reB0 = vec_madd(reB0, vfp(t3), reB0); \
    reB1 = vec_madd(reB1, vfp(t4), reB1); \
    \
    /* (sA0,sB0) = abs(ps[n],ps[n+1]) << 4 {{{ */ \
    vs16(t1) = vec_subs(vs16(zero), srcA); \
    vs16(t2) = vec_subs(vs16(zero), srcB); \
    vs16(t3) = vec_max(srcA, vs16(t1)); \
    vs16(t4) = vec_max(srcB, vs16(t2)); \
    four = vec_splat_u16(4); \
    vu16(t1) = vec_sl(vu16(t3), four); \
    vu16(t2) = vec_sl(vu16(t4), four); \
    /* }}} */ \
    \
    vu16(sA0) = vec_mergeh(zero, vu16(t1)); \
    vu16(sA1) = vec_mergel(zero, vu16(t1)); \
    vu16(sB0) = vec_mergeh(zero, vu16(t2)); \
    vu16(sB1) = vec_mergel(zero, vu16(t2)); \
    vfp(sA0) = vec_ctf(vu32(sA0), 0); \
    vfp(sA1) = vec_ctf(vu32(sA1), 0); \
    vfp(sB0) = vec_ctf(vu32(sB0), 0); \
    vfp(sB1) = vec_ctf(vu32(sB1), 0); \
    \
    /* calculate quotient */ \
    vfp(qtA0) = vec_madd(vfp(sA0), reA0, vfp(zero)); \
    vfp(qtA1) = vec_madd(vfp(sA1), reA1, vfp(zero)); \
    vfp(qtB0) = vec_madd(vfp(sB0), reB0, vfp(zero)); \
    vfp(qtB1) = vec_madd(vfp(sB1), reB1, vfp(zero)); \
    \
    /* calculate remainder */ \
    vfp(rmA0) = vec_nmsub(vfp(dA0), vfp(qtA0), vfp(sA0)); \
    vfp(rmA1) = vec_nmsub(vfp(dA1), vfp(qtA1), vfp(sA1)); \
    vfp(rmB0) = vec_nmsub(vfp(dB0), vfp(qtB0), vfp(sB0)); \
    vfp(rmB1) = vec_nmsub(vfp(dB1), vfp(qtB1), vfp(sB1)); \
    \
    /* round quotient with remainder */ \
    vfp(qtA0) = vec_madd(vfp(rmA0), reA0, vfp(qtA0)); \
    vfp(qtA1) = vec_madd(vfp(rmA1), reA1, vfp(qtA1)); \
    vfp(qtB0) = vec_madd(vfp(rmB0), reB0, vfp(qtB0)); \
    vfp(qtB1) = vec_madd(vfp(rmB1), reB1, vfp(qtB1)); \
    \
    /* convert to integer */ \
    vu32(qtA0) = vec_ctu(vfp(qtA0), 0); \
    vu32(qtA1) = vec_ctu(vfp(qtA1), 0); \
    vu32(qtB0) = vec_ctu(vfp(qtB0), 0); \
    vu32(qtB1) = vec_ctu(vfp(qtB1), 0); \
    \
    vu16(dstA) = vec_pack(vu32(qtA0), vu32(qtA1)); \
    vu16(dstB) = vec_pack(vu32(qtB0), vu32(qtB1)); \
    \
    /* test for non-zero values */ \
    selA = vec_cmpgt(vu16(dstA), zero); \
    selB = vec_cmpgt(vu16(dstB), zero); \
    nz = vec_or(nz, selA); \
    nz = vec_or(nz, selB); \
    /* }}} */

#define SIGN_AND_STORE /* {{{ */ \
    /* sign dst blocks */ \
    selA = vec_cmpgt(vs16(zero), srcA); \
    selB = vec_cmpgt(vs16(zero), srcB); \
    vs16(t1) = vec_subs(vs16(zero), dstA); \
    vs16(t2) = vec_subs(vs16(zero), dstB); \
    dstA = vec_sel(dstA, vs16(t1), selA); \
    dstB = vec_sel(dstB, vs16(t2), selB); \
    \
    /* store dst blocks */ \
    vec_st(dstA, 0, pd); \
    pd += 8; \
    vec_st(dstB, 0, pd); \
    pd += 8; \
    /* }}} */

#define UPDATE_NZBLOCKBITS /* {{{ */ \
    /* quasi-count the non-zero values and store to vu.s.nz */ \
    vs32(nz) = vec_sums(vs32(nz), vs32(zero)); \
    vu32(nz) = vec_splat(vu32(nz), 3); \
    vuv = vec_ld(0, (unsigned short*)&vu); \
    /* vuv = ( vuv(mquant, clipvalue), nz, (), () ) */ \
    vu32(vuv) = vec_mergeh(vu32(vuv), vu32(nz)); \
    vec_st(vuv, 0, (unsigned short*)&vu); /* store for scalar access */ \
    nzblockbits |= ((!!vu.s.nz) << i); /* set non-zero block bit */ \
    /* }}} */

    AMBER_START;
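/*
 * Illustrative scalar sketch (hypothetical, not part of the original file):
 * roughly the per-coefficient work that one pass of QUANT_NON_INTRA_AB and
 * SIGN_AND_STORE vectorizes, assuming the divisor is inter_q[i] * mquant and
 * the dividend is |src[i]| << 4.  The AltiVec code replaces the division with
 * vec_re() plus two Newton-Raphson refinements (re' = re + re * (1 - d * re))
 * and a remainder correction before truncating with vec_ctu(); plain integer
 * division stands in for that here.  Saturation against dctsatlim and the
 * per-block restart behaviour described in the header comment belong to the
 * (elided) outer loop and are not shown.
 */
static int quant_non_intra_block_sketch(const int16_t *src, int16_t *dst,
                                        const unsigned short *inter_q,
                                        int mquant)
{
    int i, nonzero = 0;

    for (i = 0; i < 64; i++) {
        int a = (src[i] < 0) ? -src[i] : src[i];   /* |src[i]| */
        int d = inter_q[i] * mquant;               /* divisor, as built by vec_mulo */
        int q = (a << 4) / d;                      /* truncating quotient */

        dst[i] = (int16_t)((src[i] < 0) ? -q : q); /* restore the sign */
        nonzero |= q;                              /* any non-zero level marks the block */
    }
    return nonzero != 0;                           /* non-zero block flag */
}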