📄 timgfilterdct.cpp
字号:
/*
* Copyright (c) 2002-2006 Milan Cutka
* Copyright (c) 2002 Tom Barry. All rights reserved.
* trbarry@trbarry.com
* idct, fdct, quantization and dequantization routines from XviD
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "stdafx.h"
#include "Tconfig.h"
#include "TimgFilterDCT.h"
#include "TdctSettings.h"
extern "C" void Skl_IDct16_MMX (short *block);
extern "C" void Skl_IDct16_SSE (short *block);
extern "C" void fdct_mmx_skal (short *block);
extern "C" void fdct_xmm_skal (short *block);
/*
 * Constructs the DCT filter.
 * Chooses the forward (fdct) and inverse (idct) transform routines from
 * Tconfig::cpu_flags, resets the cached settings that process() compares
 * against, and allocates the work area of 64 shorts.
 */
TimgFilterDCT::TimgFilterDCT(IffdshowBase *Ideci,Tfilters *Iparent):TimgFilter(Ideci,Iparent)
{
// The SSE2 routines are only referenced when the compiler defines __SSE2__.
#ifdef __SSE2__
if (Tconfig::cpu_flags&FF_CPU_SSE2)
{
fdct=fdct_sse2;
idct=idct_sse2;
}
#endif
// The Skal MMX/MMXEXT assembly routines are excluded when WIN64 is defined.
#ifndef WIN64
// This "else" chains the MMXEXT test to the SSE2 test above, so it must only
// be emitted when that test was compiled in.
#ifdef __SSE2__
else
#endif
if (Tconfig::cpu_flags&FF_CPU_MMXEXT)
{
fdct=fdct_xmm_skal;
idct=Skl_IDct16_SSE;
}
else if (Tconfig::cpu_flags&FF_CPU_MMX)
{
fdct=fdct_mmx_skal;
idct=Skl_IDct16_MMX;
}
#endif
// NOTE(review): with WIN64 defined and __SSE2__ undefined this "else" has no
// preceding "if" - presumably that combination is never built; confirm.
else
{
// Plain C fallback; idct_c_init() fills the clipping table (iclip) first.
fdct=fdct_c;
idct=idct_c;
idct_c_init();
}
// Initial cache values: oldMode=-1 makes the first process() call see a mode
// change (for any mode other than -1), which forces the factors to be built.
oldfac[0]=INT_MAX;oldMode=-1;oldmatrix[0]=0;
// Room for 64 shorts, requested through aligned_malloc with 16 as second argument.
pWorkArea=(short*)aligned_malloc(64*sizeof(short),16);
}
// Releases the work area that the constructor obtained with aligned_malloc().
TimgFilterDCT::~TimgFilterDCT()
{
    aligned_free(this->pWorkArea);
}
void TimgFilterDCT::multiply(void)
{
const char * const factors8=(const char*)&factors[0][0];
*(__m64*)(pWorkArea+0*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+0*8+0),*(__m64*)(factors8+0*16 )),3);
*(__m64*)(pWorkArea+0*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+0*8+4),*(__m64*)(factors8+0*16+8)),3);
*(__m64*)(pWorkArea+1*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+1*8+0),*(__m64*)(factors8+1*16 )),3);
*(__m64*)(pWorkArea+1*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+1*8+4),*(__m64*)(factors8+1*16+8)),3);
*(__m64*)(pWorkArea+2*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+2*8+0),*(__m64*)(factors8+2*16 )),3);
*(__m64*)(pWorkArea+2*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+2*8+4),*(__m64*)(factors8+2*16+8)),3);
*(__m64*)(pWorkArea+3*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+3*8+0),*(__m64*)(factors8+3*16 )),3);
*(__m64*)(pWorkArea+3*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+3*8+4),*(__m64*)(factors8+3*16+8)),3);
*(__m64*)(pWorkArea+4*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+4*8+0),*(__m64*)(factors8+4*16 )),3);
*(__m64*)(pWorkArea+4*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+4*8+4),*(__m64*)(factors8+4*16+8)),3);
*(__m64*)(pWorkArea+5*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+5*8+0),*(__m64*)(factors8+5*16 )),3);
*(__m64*)(pWorkArea+5*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+5*8+4),*(__m64*)(factors8+5*16+8)),3);
*(__m64*)(pWorkArea+6*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+6*8+0),*(__m64*)(factors8+6*16 )),3);
*(__m64*)(pWorkArea+6*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+6*8+4),*(__m64*)(factors8+6*16+8)),3);
*(__m64*)(pWorkArea+7*8+0)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+7*8+0),*(__m64*)(factors8+7*16 )),3);
*(__m64*)(pWorkArea+7*8+4)=_mm_srai_pi16(_mm_mullo_pi16(*(__m64*)(pWorkArea+7*8+4),*(__m64*)(factors8+7*16+8)),3);
}
/*
 * H.263-style quantisation of 64 coefficients, in place.
 *
 * coeff  64 coefficients; each is replaced by its quantised level.
 * quant  quantiser. With m = |c| - quant/2, a coefficient becomes 0 when
 *        m < 2*quant, otherwise sign(c) * ((m * mult) >> 16) where mult is
 *        65536/(2*quant) + 1 (a fixed-point reciprocal of 2*quant).
 * The third parameter is unused.
 */
void TimgFilterDCT::quant_h263_inter(int16_t * coeff,const uint32_t quant, const uint16_t *)
{
#define SCALEBITS 16
#define FIX(X) ((1L << SCALEBITS) / (X) + 1)
    static const uint32_t multipliers[32] =
    {
        0, FIX(2), FIX(4), FIX(6),
        FIX(8), FIX(10), FIX(12), FIX(14),
        FIX(16), FIX(18), FIX(20), FIX(22),
        FIX(24), FIX(26), FIX(28), FIX(30),
        FIX(32), FIX(34), FIX(36), FIX(38),
        FIX(40), FIX(42), FIX(44), FIX(46),
        FIX(48), FIX(50), FIX(52), FIX(54),
        FIX(56), FIX(58), FIX(60), FIX(62)
    };
#undef FIX
    // The table covers quant 0..31 only. For larger quantisers evaluate the
    // same formula directly instead of reading past the end of the table
    // ((1<<16)/(2*quant) equals (1<<15)/quant in integer arithmetic).
    const uint32_t mult = quant < 32 ? multipliers[quant]
                                     : uint32_t((1UL << (SCALEBITS - 1)) / quant + 1);
    const uint16_t quant_m_2 = uint16_t(quant << 1);
    const uint16_t quant_d_2 = uint16_t(quant >> 1);
    for (int i = 0; i < 64; i++) {
        const bool negative = coeff[i] < 0;
        // Quantise the magnitude; the sign is restored at the end.
        int16_t acLevel = int16_t((negative ? -coeff[i] : coeff[i]) - quant_d_2);
        if (acLevel < quant_m_2) {
            // Dead zone: too small to reach level 1.
            coeff[i] = 0;
            continue;
        }
        acLevel = int16_t((acLevel * mult) >> SCALEBITS);
        coeff[i] = negative ? int16_t(-acLevel) : acLevel;
    }
#undef SCALEBITS
}
/*
 * H.263-style dequantisation of 64 levels, in place.
 *
 * data   64 quantised levels; each non-zero level L is replaced by
 *        L * 2*quant -/+ quant_add (minus for negative L, plus for positive),
 *        saturated to [-2048, 2047]. Zero levels stay zero.
 * quant  quantiser; quant_add is quant when quant is odd, quant-1 otherwise.
 * The third parameter is unused.
 */
void TimgFilterDCT::dequant_h263_inter(int16_t * data,const uint32_t quant,const uint16_t *)
{
    const uint16_t quant_m_2 = uint16_t(quant << 1);
    const uint16_t quant_add = uint16_t (quant & 1 ? quant : quant - 1);
    for (int i = 0; i < 64; i++) {
        // Reconstruct in 32 bits: storing the product in an int16_t first
        // would let an out-of-range value wrap around and slip past the
        // saturation test below.
        const int32_t level = data[i];
        if (level == 0)
            continue;
        if (level < 0) {
            const int32_t rec = level * quant_m_2 - quant_add;
            data[i] = int16_t(rec >= -2048 ? rec : -2048);
        } else {
            const int32_t rec = level * quant_m_2 + quant_add;
            data[i] = int16_t(rec <= 2047 ? rec : 2047);
        }
    }
}
/*
 * Coefficient processing selected for mode 1 in process(): quantises the
 * work area with the H.263 quantiser and immediately reconstructs it with
 * the matching dequantiser, both using the current quant.
 */
void TimgFilterDCT::h263(void)
{
    int16_t * const block=pWorkArea;
    const uint32_t q=quant;
    quant_h263_inter(block,q);
    dequant_h263_inter(block,q);
}
/*
 * MPEG-style (matrix based) quantisation of 64 coefficients, in place.
 *
 * coeff                64 coefficients; each is replaced by its quantised level.
 * quant                quantiser; mult is 131072/(2*quant) + 1, a fixed-point
 *                      reciprocal of 2*quant with 17 fractional bits.
 * mpeg_quant_matrices  64 per-coefficient divisors. Every entry that is paired
 *                      with a non-zero coefficient is used as a divisor and
 *                      therefore must not be 0.
 *
 * A non-zero coefficient c becomes
 *   sign(c) * ((((|c| << 4) + m/2) / m) * mult >> 17)   with m = matrix entry.
 */
void TimgFilterDCT::quant_mpeg_inter(int16_t * coeff,const uint32_t quant,const uint16_t * mpeg_quant_matrices)
{
#define SCALEBITS 17
#define FIX(X) ((1UL << SCALEBITS) / (X) + 1)
    static const uint32_t multipliers[32] =
    {
        0, FIX(2), FIX(4), FIX(6),
        FIX(8), FIX(10), FIX(12), FIX(14),
        FIX(16), FIX(18), FIX(20), FIX(22),
        FIX(24), FIX(26), FIX(28), FIX(30),
        FIX(32), FIX(34), FIX(36), FIX(38),
        FIX(40), FIX(42), FIX(44), FIX(46),
        FIX(48), FIX(50), FIX(52), FIX(54),
        FIX(56), FIX(58), FIX(60), FIX(62)
    };
#undef FIX
    // The table covers quant 0..31 only. For larger quantisers evaluate the
    // same formula directly instead of reading past the end of the table
    // ((1<<17)/(2*quant) equals (1<<16)/quant in integer arithmetic).
    const uint32_t mult = quant < 32 ? multipliers[quant]
                                     : uint32_t((1UL << (SCALEBITS - 1)) / quant + 1);
    const uint16_t *inter_matrix = mpeg_quant_matrices;
    for (int i = 0; i < 64; i++) {
        const int16_t c = coeff[i];
        if (c == 0)
            continue;                       // zero stays zero, no division needed
        const bool negative = c < 0;
        // Quantise the magnitude; the sign is restored at the end.
        uint32_t level = negative ? -c : c;
        level = ((level << 4) + (inter_matrix[i] >> 1)) / inter_matrix[i];
        level = (level * mult) >> SCALEBITS;
        const int16_t q = int16_t(level);
        coeff[i] = negative ? int16_t(-q) : q;
    }
#undef SCALEBITS
}
/*
 * MPEG-style (matrix based) dequantisation of 64 levels, in place.
 *
 * data                 64 quantised levels. A non-zero level L is replaced by
 *                      sign(L) * (((2*|L| + 1) * m * quant) >> 4), m being the
 *                      matrix entry; negative results are limited to -2048 and
 *                      positive results to 2047. Zero levels stay zero.
 * quant                quantiser.
 * mpeg_quant_matrices  64 per-coefficient multipliers.
 *
 * Afterwards the low bit of data[63] is toggled when the XOR of all 64
 * outputs has a low bit of 0 (mismatch control).
 */
void TimgFilterDCT::dequant_mpeg_inter(int16_t * data,const uint32_t quant,const uint16_t * mpeg_quant_matrices)
{
    const uint16_t * const matrix = mpeg_quant_matrices;
    uint32_t parity = 0;
    for (int k = 0; k < 64; k++) {
        const int16_t v = data[k];
        if (v != 0) {
            const bool negative = v < 0;
            const uint32_t magnitude = negative ? uint32_t(-v) : uint32_t(v);
            const uint32_t scaled = ((2 * magnitude + 1) * matrix[k] * quant) >> 4;
            if (negative)
                data[k] = int16_t(scaled <= 2048 ? -int32_t(scaled) : -2048);
            else
                data[k] = int16_t(scaled <= 2047 ? scaled : 2047);
        }
        parity ^= data[k];
    }
    /* mismatch control */
    if (!(parity & 1))
        data[63] ^= 1;
}
/*
 * Coefficient processing selected for mode 2 in process(): quantises the
 * work area with the MPEG quantiser and immediately reconstructs it with the
 * matching dequantiser. Both use the current quant and read the factors
 * array as the 64-entry quantisation matrix.
 */
void TimgFilterDCT::mpeg(void)
{
    const uint16_t * const matrix=(const uint16_t*)&factors[0][0];
    quant_mpeg_inter(pWorkArea,quant,matrix);
    dequant_mpeg_inter(pWorkArea,quant,matrix);
}
/*
 * Filters the first plane of the picture block by block and passes the
 * picture on to the next filter.
 *
 * For every complete 8x8 block: the 8-bit pixels are widened to 16 bits in
 * pWorkArea, transformed with fdct, modified by the routine chosen from
 * cfg->mode (0: multiply, 1: h263, 2: mpeg), transformed back with idct and
 * stored with unsigned saturation. Columns and rows that do not fill a
 * complete block are copied unchanged. Pictures smaller than 8x8 are
 * delivered without being processed.
 *
 * it    position of this filter in the queue; the picture goes to ++it.
 * pict  picture to process.
 * cfg0  settings, used as TdctSettings.
 * Returns the result of parent->deliverSample().
 */
HRESULT TimgFilterDCT::process(TfilterQueue::iterator it,TffPict &pict,const TfilterSettingsVideo *cfg0)
{
    const TdctSettings *cfg=(const TdctSettings*)cfg0;
    init(pict,cfg->full,cfg->half);
    if (pictRect.dx>=8 && pictRect.dy>=8)
    {
        // Select the coefficient routine again only when the mode changed.
        bool modechange=oldMode!=cfg->mode;
        if (modechange)
            // NOTE(review): an unrecognised mode selects multiply but leaves
            // oldMode != 0, so the factors are not rebuilt below - confirm
            // that modes other than 0..2 cannot occur.
            switch (oldMode=cfg->mode)
            {
                case 1:processDct=&TimgFilterDCT::h263;break;
                case 2:processDct=&TimgFilterDCT::mpeg;break;
                default:
                case 0:processDct=&TimgFilterDCT::multiply;break;
            }
        // Mode 0: factors[i][j] = fac[i]/1000 * fac[j]/1000 * 8; multiply()
        // shifts the product right by 3 to remove the "* 8" again.
        if (oldMode==0 && (modechange || memcmp(oldfac,&cfg->fac0,sizeof(oldfac))!=0))
        {
            memcpy(oldfac,&cfg->fac0,sizeof(oldfac));
            for (int i=0;i<=7;i++)
                for (int j=0;j<=7;j++)
                    factors[i][j]=(short)((oldfac[i]/1000.0) * (oldfac[j]/1000.0) * 8);
        }
        // Mode 2: factors holds the quantisation matrix, every entry limited to
        // 1..255. The change test has to be memcmp: memcpy returns its
        // (never null) destination, which made this condition always true and
        // rebuilt the matrix on every frame.
        if (oldMode==2 && (modechange || memcmp(oldmatrix,&cfg->matrix0,sizeof(oldmatrix))!=0))
        {
            memcpy(oldmatrix,&cfg->matrix0,sizeof(oldmatrix));
            const unsigned char *m=(const unsigned char*)&cfg->matrix0;
            for (int i=0;i<8;i++)
                for (int j=0;j<8;j++)
                    factors[i][j]=(short)limit<int>(*m++,1,255);
        }
        quant=cfg->quant;
        // Only the first plane pointer is requested from source and destination.
        const unsigned char *srcY;
        getCur(FF_CSPS_MASK_YUV_PLANAR,pict,cfg->full,&srcY,NULL,NULL,NULL);
        unsigned char *dstY;
        getNext(csp1,pict,cfg->full,&dstY,NULL,NULL,NULL);
        // Width covered by complete 8-pixel blocks; the remaining right-hand
        // columns are copied unchanged.
        unsigned int cycles=dx1[0]&~7;
        if (dx1[0]&7)
            TffPict::copy(dstY+cycles,stride2[0],srcY+cycles,stride1[0],dx1[0]&7,dy1[0]);
        __m64 m0=_mm_setzero_si64();
        const stride_t stride1_0=stride1[0],stride2_0=stride2[0];
        // Walk over complete 8-row bands only. "y+8<=dy1[0]" keeps the eighth
        // row of the band inside the picture; the previous bound
        // "y<=dy1[0]-7" admitted one band too many when the height is
        // 7 modulo 8 and left srcY/dstY 8 rows too low for the copy below.
        for (unsigned int y=0;y+8<=dy1[0];srcY+=8*stride1_0,dstY+=8*stride2_0,y+=8)
        {
            const unsigned char *srcLn=srcY;unsigned char *dstLn=dstY,*dstLnEnd=dstLn+cycles;
            for (;dstLn<dstLnEnd;srcLn+=8,dstLn+=8)
            {
                // Load 8 rows of 8 pixels and zero-extend them to 16 bits.
                __m64 mm0=*(__m64*)(srcLn+0*stride1_0);
                __m64 mm2=*(__m64*)(srcLn+1*stride1_0);
                *(__m64*)(pWorkArea+ 0)=_mm_unpacklo_pi8(mm0,m0);
                *(__m64*)(pWorkArea+ 4)=_mm_unpackhi_pi8(mm0,m0);
                *(__m64*)(pWorkArea+ 8)=_mm_unpacklo_pi8(mm2,m0);
                *(__m64*)(pWorkArea+12)=_mm_unpackhi_pi8(mm2,m0);
                mm0=*(__m64*)(srcLn+2*stride1_0);
                mm2=*(__m64*)(srcLn+3*stride1_0);
                *(__m64*)(pWorkArea+16)=_mm_unpacklo_pi8(mm0,m0);
                *(__m64*)(pWorkArea+20)=_mm_unpackhi_pi8(mm0,m0);
                *(__m64*)(pWorkArea+24)=_mm_unpacklo_pi8(mm2,m0);
                *(__m64*)(pWorkArea+28)=_mm_unpackhi_pi8(mm2,m0);
                mm0=*(__m64*)(srcLn+4*stride1_0);
                mm2=*(__m64*)(srcLn+5*stride1_0);
                *(__m64*)(pWorkArea+32)=_mm_unpacklo_pi8(mm0,m0);
                *(__m64*)(pWorkArea+36)=_mm_unpackhi_pi8(mm0,m0);
                *(__m64*)(pWorkArea+40)=_mm_unpacklo_pi8(mm2,m0);
                *(__m64*)(pWorkArea+44)=_mm_unpackhi_pi8(mm2,m0);
                mm0=*(__m64*)(srcLn+6*stride1_0);
                mm2=*(__m64*)(srcLn+7*stride1_0);
                *(__m64*)(pWorkArea+48)=_mm_unpacklo_pi8(mm0,m0);
                *(__m64*)(pWorkArea+52)=_mm_unpackhi_pi8(mm0,m0);
                *(__m64*)(pWorkArea+56)=_mm_unpacklo_pi8(mm2,m0);
                *(__m64*)(pWorkArea+60)=_mm_unpackhi_pi8(mm2,m0);
                // Forward transform, modify the coefficients, transform back.
                fdct(pWorkArea);
                (this->*processDct)();
                idct(pWorkArea);
                // Pack back to 8 bits with unsigned saturation and store.
                *(__m64*)(dstLn+0*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+0*8),*(__m64*)(pWorkArea+0*8+4));
                *(__m64*)(dstLn+1*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+1*8),*(__m64*)(pWorkArea+1*8+4));
                *(__m64*)(dstLn+2*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+2*8),*(__m64*)(pWorkArea+2*8+4));
                *(__m64*)(dstLn+3*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+3*8),*(__m64*)(pWorkArea+3*8+4));
                *(__m64*)(dstLn+4*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+4*8),*(__m64*)(pWorkArea+4*8+4));
                *(__m64*)(dstLn+5*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+5*8),*(__m64*)(pWorkArea+5*8+4));
                *(__m64*)(dstLn+6*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+6*8),*(__m64*)(pWorkArea+6*8+4));
                *(__m64*)(dstLn+7*stride2_0)=_mm_packs_pu16(*(__m64*)(pWorkArea+7*8),*(__m64*)(pWorkArea+7*8+4));
            }
        }
        // Leave MMX state before any further code runs.
        _mm_empty();
        // Rows below the last complete band are copied unchanged; srcY/dstY
        // point at the first of them after the loop.
        if (dy1[0]&7)
            TffPict::copy(dstY,stride2[0],srcY,stride1[0],dx1[0],dy1[0]&7);
    }
    return parent->deliverSample(++it,pict);
}
// Clipping table used by the C inverse DCT, and a pointer to its middle so
// that it can be indexed with values from -512 to 511.
short TimgFilterDCT::iclip[1024];
short *TimgFilterDCT::iclp;

/*
 * Fills the clipping table: iclp[v] is v limited to [-256, 255] for every v
 * from -512 to 511.
 */
void TimgFilterDCT::idct_c_init(void)
{
    iclp = iclip + 512;
    for (int slot = 0; slot < 1024; slot++)
        iclip[slot] = (short)limit(slot - 512,-256,255);
}
void TimgFilterDCT::idct_c(short *block)
{
/*
* idct_int32_init() must be called before the first call to this
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -