// simpleresize.cpp
// From "H.264 decoder program converted from FFMPEG, builds under VC.." · C++ code · 1,176 lines total · page 1 of 5
// CPP
// 1,176 lines
// Simple (faster) resize for avisynth
// Copyright (C) 2002 Tom Barry
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
//
// Also, this program is "Philanthropy-Ware". That is, if you like it and
// feel the need to reward or inspire the author then please feel free (but
// not obligated) to consider joining or donating to the Electronic Frontier
// Foundation. This will help keep cyber space free of barbed wire and bullsh*t.
//
// See their web page at www.eff.org
// Changes:
//
// Jan 2003 0.3.3.0 Avisynth 2.5 YV12 support, AvisynthPluginit2
// Feb 2002 0.3.0.0 Added InterlacedResize support
// Jan 2002 0.2.0.0 Some rather ineffectual P3 SSE optimizations
// Jan 2002 0.1.0.0 First release
#include "stdafx.h"
#include "SimpleResize.h"
#include "Tconfig.h"
#include "simd.h"
// MSVC: save the current warning state, then silence C4700/C4701
// (use of uninitialized / potentially uninitialized local variables) for the code below.
// NOTE(review): the matching "#pragma warning(pop)" is not visible in this part of the file - confirm it exists.
#pragma warning(push)
#pragma warning(disable: 4700 4701)
// Packed constants used by the resize loops. Each is defined as two identical 64-bit
// halves (128 bits in total). __align16 is a project macro; presumably it requests
// 16-byte alignment for the variable - confirm against its definition.
__align16(const int64_t,SimpleResize::YMask[2] ) = {0x00ff00ff00ff00ffLL,0x00ff00ff00ff00ffLL}; // 0x00ff in every 16-bit word: keeps only the low byte of each word (luma)
__align16(const int64_t,SimpleResize::FPround1[2]) = {0x0080008000800080LL,0x0080008000800080LL}; // 0x0080 in every 16-bit word: rounding term for word-sized values
__align16(const int64_t,SimpleResize::FPround2[2]) = {0x0000008000000080LL,0x0000008000000080LL}; // 0x00000080 in every 32-bit dword: rounding term for dword-sized values
// Constructs a resizer that converts frames of vi.width x vi.height into
// frames of _width x _height.
//
//   vi          source video description; must report either YUY2 or YV12
//   _width      destination width in pixels
//   _height     destination height in pixels
//   _hWarp      horizontal warp factor (the original author used 1.15 for his sample)
//   _vWarp      vertical warp factor (the original author used 0.95 for his sample)
//   _Interlaced true to interpolate between lines two rows apart
//
// The constructor does not throw. On success 'ok' is set to true. On any
// failure (unsupported colour space or a failed allocation) it returns early
// with 'ok' left false; callers must check 'ok' before calling GetFrame().
// Buffers allocated before a failure are released by the destructor.
SimpleResize::SimpleResize(const VideoInfo &vi,unsigned int _width, unsigned int _height, double _hWarp, double _vWarp, bool _Interlaced)
{
    ok = false;
    DoYV12 = false;     // give the flag a defined value on every early-return path
    minslope = 0.30;    // don't overstretch
    oldwidth = vi.width;
    oldheight = vi.height;
    newwidth = _width;
    newheight = _height;
    hWarp = _hWarp;     // 1.15 remember I used hw=1.15, vw=.95 for sample
    vWarp = _vWarp;     // .95
    Interlaced = _Interlaced;
    SSE2enabled = (Tconfig::cpu_flags & FF_CPU_SSE2) != 0;
    SSEMMXenabled = (Tconfig::cpu_flags & FF_CPU_MMXEXT) != 0;

    // Null everything first so the destructor can run safely after an early return.
    vOffsetsUV=vWeightsUV=hControl=vWorkY=vWorkUV=vOffsets=vWeights=NULL;

    if (vi.IsYUY2)
    {
        DoYV12 = false;
    }
    else if (vi.IsYV12)
    {
        // The Avisynth original raised
        //   "InterlacedResize: Interlace not supported for YV12 yet"
        // when Interlaced was set; this port deliberately accepts the combination.
        DoYV12 = true;
        // Per-output-row chroma tables, 4 bytes per row.
        vOffsetsUV = (int*) aligned_malloc(newheight*4,128);
        vWeightsUV = (int*) aligned_malloc(newheight*4,128);
        if (!vOffsetsUV || !vWeightsUV)
        {
            return;
        }
    }
    else
    {
        return;         // unsupported colour space
    }

    // 2 qwords, 2 offsets, and prefetch slack
    hControl = (int*) aligned_malloc(newwidth*12+128, 128);   // aligned for P4 cache line
    vWorkY   = (int*) aligned_malloc(2*oldwidth+128, 128);
    vWorkUV  = (int*) aligned_malloc(oldwidth+128, 128);
    vOffsets = (int*) aligned_malloc(newheight*4, 128);
    vWeights = (int*) aligned_malloc(newheight*4, 128);

    // Every buffer must be checked: the table initialisation and the frame
    // loops use all of them, so one failed allocation would be dereferenced later.
    if (!hControl || !vWorkY || !vWorkUV || !vOffsets || !vWeights)
    {
        return;
    }

    if (DoYV12)
    {
        InitTables_YV12();
    }
    else
    {
        InitTables();
    }
    ok = true;
}
// Releases every work buffer and lookup table the constructor managed to
// allocate. Pointers that are still NULL (never allocated, or allocation
// failed) are skipped.
SimpleResize::~SimpleResize()
{
    // Chroma row tables (only allocated for YV12 input).
    if (vOffsetsUV != NULL)
    {
        aligned_free(vOffsetsUV);
    }
    if (vWeightsUV != NULL)
    {
        aligned_free(vWeightsUV);
    }

    // Horizontal control table.
    if (hControl != NULL)
    {
        aligned_free(hControl);
    }

    // Per-line work buffers.
    if (vWorkY != NULL)
    {
        aligned_free(vWorkY);
    }
    if (vWorkUV != NULL)
    {
        aligned_free(vWorkUV);
    }

    // Row offset / weight tables.
    if (vOffsets != NULL)
    {
        aligned_free(vOffsets);
    }
    if (vWeights != NULL)
    {
        aligned_free(vWeights);
    }
}
// Resizes one frame from *src into *dst.
// YUY2 input is handled by a single GetFrame_YUY2 call; YV12 input is
// handled plane by plane (Y, then U, then V) through GetFrame_YV12.
void SimpleResize::GetFrame(const PVideoFrame *src,PVideoFrame *dst)
{
    if (!DoYV12)
    {
        GetFrame_YUY2(src, dst, PLANAR_Y);
        return;
    }

    // Planar input: process the three planes in Y, U, V order.
    const int planes[3] = { PLANAR_Y, PLANAR_U, PLANAR_V };
    for (int i = 0; i < 3; ++i)
    {
        GetFrame_YV12(src, dst, planes[i]);
    }
}
// YV12: resize a single plane (luma or chroma, selected by Planar_Type)
void SimpleResize::GetFrame_YV12(const PVideoFrame *src, PVideoFrame *dst, int Planar_Type)
{
int vWeight1[4];
int vWeight2[4];
const BYTE* srcp = src->ptr[Planar_Type];
const BYTE* srcp2W = srcp;
BYTE* dstp=dst->ptr[Planar_Type];
BYTE* dstp2 = dst->ptr[Planar_Type];
// BYTE* dstp = dst->GetWritePtr(Planar_Type);
const stride_t src_pitch = src->pitch[Planar_Type];
const stride_t dst_pitch = dst->pitch[Planar_Type];
const int src_row_size = src->rowSize[Planar_Type];
const int row_size = dst->rowSize[Planar_Type];
const int height = dst->height[Planar_Type];
const int* pControl = &hControl[0];
const unsigned char* srcp1;
const unsigned char* srcp2;
int* vWorkYW = vWorkY;
int* vOffsetsW = (Planar_Type == PLANAR_Y)
? vOffsets
: vOffsetsUV;
int* vWeightsW = (Planar_Type == PLANAR_Y)
? vWeights
: vWeightsUV;
// Just in case things are not aligned right, maybe turn off sse2
#ifdef __SSE2__
__m128i xmm0,xmm5,xmm6,xmm7,xmm1,xmm2,xmm3,xmm4;
#endif
__m64 mm5,mm6,mm0,mm7,mm1,mm2,mm3,mm4;
for (int y = 0; y < height; y++)
{
vWeight1[0] = vWeight1[1] = vWeight1[2] = vWeight1[3] =
(256-vWeightsW[y]) << 16 | (256-vWeightsW[y]);
vWeight2[0] = vWeight2[1] = vWeight2[2] = vWeight2[3] =
vWeightsW[y] << 16 | vWeightsW[y];
srcp1 = srcp + vOffsetsW[y] * src_pitch;
if (Interlaced)
{
srcp2 = (y < height-2)
? srcp1 + 2 * src_pitch
: srcp1;
}
else
{
srcp2 = (y < height-1)
? srcp1 + src_pitch
: srcp1;
}
int ecx= src_row_size,ebx;
ecx>>=3; // 8 bytes a time
const unsigned char *esi= srcp1; // top of 2 src lines to get
const unsigned char *edx= srcp2; // next "
unsigned char *edi= (unsigned char*)vWorkYW; // luma work destination line
int eax=0;
// Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
// This first loop is not the performance bottleneck anyway but it is trivial to tune
// using SSE2 if we have proper alignment.
#ifdef __SSE2__
if (SSE2enabled==0) // is SSE2 supported?
goto vMaybeSSEMMX; // n, can't do anyway
if (ecx< 2) // we have at least 16 byts, 2 qwords?
goto vMaybeSSEMMX; // n, don't bother
if ((intptr_t(esi)|intptr_t(edx))&0xf) // both src rows 16 byte aligned?
goto vMaybeSSEMMX; // n, don't use sse2
ecx>>=1; // do 16 bytes at a time instead
ecx--; // jigger loop ct
//align 16
movdqu (xmm0, FPround1);
movdqu (xmm5, vWeight1);
movdqu (xmm6, vWeight2);
pxor (xmm7, xmm7);
//align 16
vLoopSSE2_Fetch:
prefetcht0 (esi+eax*2+16);
prefetcht0 (edx+eax*2+16);
vLoopSSE2:
movdqu (xmm1, esi+eax); // top of 2 lines to interpolate
movdqu (xmm3, edx+eax); // 2nd of 2 lines
movdqa (xmm2, xmm1);
movdqa (xmm4, xmm3);
punpcklbw (xmm1, xmm7); // make words
punpckhbw (xmm2, xmm7); // "
punpcklbw (xmm3, xmm7); // "
punpckhbw (xmm4, xmm7); // "
pmullw (xmm1, xmm5); // mult by top weighting factor
pmullw (xmm2, xmm5); // "
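// Keyboard shortcuts
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -
// Show shortcuts: ?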