simpleresize.cpp

来自「从FFMPEG转换而来的H264解码程序,VC下编译..」· C++ 代码 · 共 1,176 行 · 第 1/5 页

CPP
1,176
字号
//  Simple (faster) resize for avisynth
//      Copyright (C) 2002 Tom Barry
//
//      This program is free software; you can redistribute it and/or modify
//      it under the terms of the GNU General Public License as published by
//      the Free Software Foundation; either version 2 of the License, or
//      (at your option) any later version.
//
//      This program is distributed in the hope that it will be useful,
//      but WITHOUT ANY WARRANTY; without even the implied warranty of
//      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//      GNU General Public License for more details.
//
//      You should have received a copy of the GNU General Public License
//      along with this program; if not, write to the Free Software
//      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
//
//  Also, this program is "Philanthropy-Ware".  That is, if you like it and
//  feel the need to reward or inspire the author then please feel free (but
//  not obligated) to consider joining or donating to the Electronic Frontier
//  Foundation. This will help keep cyber space free of barbed wire and bullsh*t.
//
//  See their web page at www.eff.org

// Changes:
//
// Jan 2003 0.3.3.0 Avisynth 2.5 YV12 support, AvisynthPluginInit2
// Feb 2002 0.3.0.0 Added InterlacedResize support
// Jan 2002 0.2.0.0 Some rather ineffectual P3 SSE optimizations
// Jan 2002 0.1.0.0 First release

#include "stdafx.h"
#include "SimpleResize.h"
#include "Tconfig.h"
#include "simd.h"

#pragma warning(push)
#pragma warning(disable: 4700 4701)

// Constant tables used by the SIMD resize loops. Each one is two 64-bit values
// (16 bytes in total) so it can be loaded into a 128-bit register in one go;
// GetFrame_YV12 loads FPround1 with movdqu.
// NOTE(review): __align16 is a project macro defined elsewhere - presumably it
// requests 16-byte alignment for the variable; confirm against its definition.
//
//   YMask    - 0x00ff in every 16-bit word.
//   FPround1 - 0x0080 in every 16-bit word; 0x80 is half of the 256 scale that
//              the vertical weights in GetFrame_YV12 are expressed in.
//   FPround2 - 0x00000080 in every 32-bit dword.
__align16(const int64_t,SimpleResize::YMask[2]   ) = {0x00ff00ff00ff00ffLL,0x00ff00ff00ff00ffLL}; // keeps only luma
__align16(const int64_t,SimpleResize::FPround1[2]) = {0x0080008000800080LL,0x0080008000800080LL}; // round words
__align16(const int64_t,SimpleResize::FPround2[2]) = {0x0000008000000080LL,0x0000008000000080LL}; // round dwords

// Sets up a resizer that converts vi.width x vi.height frames into
// _width x _height frames.
//
//   vi          - source description; this constructor reads width, height,
//                 IsYUY2 and IsYV12 from it.
//   _width      - destination width, stored in newwidth.
//   _height     - destination height, stored in newheight.
//   _hWarp      - stored in hWarp.
//   _vWarp      - stored in vWarp.
//   _Interlaced - stored in Interlaced.
//
// The result is reported through the `ok` member: it is true only when the
// colour space is YUY2 or YV12, every work buffer was allocated and the lookup
// tables were initialised. On any failure the constructor returns early with
// ok == false; buffers that were already allocated stay owned by the object
// and are released by the destructor.
SimpleResize::SimpleResize(const VideoInfo &vi,unsigned int _width, unsigned int _height, double _hWarp, double _vWarp, bool _Interlaced)
{
 ok = false;
 DoYV12 = false;                 // defined value even when we bail out below

 minslope = 0.30;                // don't overstretch
 oldwidth = vi.width;
 oldheight = vi.height;
 newwidth = _width;
 newheight = _height;
 hWarp = _hWarp;                 // 1.15 remember I used hw=1.15, vw=.95 for sample
 vWarp = _vWarp;                 // .95
 Interlaced = _Interlaced;
 SSE2enabled = (Tconfig::cpu_flags & FF_CPU_SSE2) ? true : false;
 SSEMMXenabled = (Tconfig::cpu_flags & FF_CPU_MMXEXT) ? true : false;

 // Start with every buffer pointer cleared so the destructor can tell which
 // ones were actually allocated, whatever exit path is taken.
 vOffsetsUV = vWeightsUV = hControl = vWorkY = vWorkUV = vOffsets = vWeights = NULL;

 if (vi.IsYUY2)
 {
  DoYV12 = false;
 }
 else if (vi.IsYV12)
 {
  if (Interlaced)
  {
   ;//env->ThrowError("InterlacedResize: Interlace not supported for YV12 yet");
  }
  DoYV12 = true;

  // Separate per-output-row offset/weight tables for the chroma planes.
  vOffsetsUV = (int*) aligned_malloc(newheight*4, 128);
  vWeightsUV = (int*) aligned_malloc(newheight*4, 128);
  if (!vOffsetsUV || !vWeightsUV)
  {
   return;
  }
 }
 else
 {
  // Neither YUY2 nor YV12: unsupported, leave ok == false.
  return;
 }

 // 2 qwords, 2 offsets, and prefetch slack
 hControl = (int*) aligned_malloc(newwidth*12+128, 128);   // aligned for P4 cache line
 vWorkY   = (int*) aligned_malloc(2*oldwidth+128, 128);
 vWorkUV  = (int*) aligned_malloc(oldwidth+128, 128);
 vOffsets = (int*) aligned_malloc(newheight*4, 128);
 vWeights = (int*) aligned_malloc(newheight*4, 128);

 // Every one of these buffers is dereferenced later (for example vOffsets and
 // vWorkY in GetFrame_YV12), so a single failed allocation must leave the
 // object in the "not ok" state rather than only checking two of them.
 if (!hControl || !vWorkY || !vWorkUV || !vOffsets || !vWeights)
 {
  return;
 }

 if (DoYV12)
 {
  InitTables_YV12();
 }
 else
 {
  InitTables();
 }
 ok = true;
}

// Releases the work buffers obtained in the constructor. Pointers that are
// still null (never allocated) are skipped.
SimpleResize::~SimpleResize()
{
 // Same release order as the buffers are listed here.
 int* const buffers[] =
  {
   vOffsetsUV,
   vWeightsUV,
   hControl,
   vWorkY,
   vWorkUV,
   vOffsets,
   vWeights
  };
 const int bufferCount = (int)(sizeof(buffers) / sizeof(buffers[0]));

 for (int i = 0; i < bufferCount; i++)
  {
   if (buffers[i])
    {
     aligned_free(buffers[i]);
    }
  }
}

// Resizes one frame from src into dst, dispatching on the colour space chosen
// in the constructor: when DoYV12 is clear a single GetFrame_YUY2 call is made,
// otherwise GetFrame_YV12 is run once per plane in Y, U, V order.
void SimpleResize::GetFrame(const PVideoFrame *src,PVideoFrame *dst)
{
        if (!DoYV12)
        {
                GetFrame_YUY2(src, dst, PLANAR_Y);
                return;
        }

        const int planes[3] = { PLANAR_Y, PLANAR_U, PLANAR_V };
        for (int i = 0; i < 3; i++)
        {
                GetFrame_YV12(src, dst, planes[i]);
        }
}

// YV12 Luma
void SimpleResize::GetFrame_YV12(const PVideoFrame *src, PVideoFrame *dst, int Planar_Type)
{
        int vWeight1[4];
        int vWeight2[4];

        const BYTE* srcp = src->ptr[Planar_Type];
        const BYTE* srcp2W = srcp;
        BYTE* dstp=dst->ptr[Planar_Type];
        BYTE* dstp2 = dst->ptr[Planar_Type];

        //      BYTE* dstp = dst->GetWritePtr(Planar_Type);
        const stride_t src_pitch = src->pitch[Planar_Type];
        const stride_t dst_pitch = dst->pitch[Planar_Type];
        const int src_row_size = src->rowSize[Planar_Type];
        const int row_size = dst->rowSize[Planar_Type];
        const int height = dst->height[Planar_Type];

        const int* pControl = &hControl[0];
        const unsigned char* srcp1;
        const unsigned char* srcp2;
        int* vWorkYW = vWorkY;

        int* vOffsetsW = (Planar_Type == PLANAR_Y)
                ? vOffsets
                : vOffsetsUV;

        int* vWeightsW = (Planar_Type == PLANAR_Y)
                ? vWeights
                : vWeightsUV;

        // Just in case things are not aligned right, maybe turn off sse2
        #ifdef __SSE2__
        __m128i xmm0,xmm5,xmm6,xmm7,xmm1,xmm2,xmm3,xmm4;
        #endif
        __m64 mm5,mm6,mm0,mm7,mm1,mm2,mm3,mm4;
        for (int y = 0; y < height; y++)
        {

                vWeight1[0] = vWeight1[1] = vWeight1[2] = vWeight1[3] =
                        (256-vWeightsW[y]) << 16 | (256-vWeightsW[y]);
                vWeight2[0] = vWeight2[1] = vWeight2[2] = vWeight2[3] =
                        vWeightsW[y] << 16 | vWeightsW[y];

                srcp1 = srcp + vOffsetsW[y] * src_pitch;

                if (Interlaced)
                {
                        srcp2 = (y < height-2)
                                ? srcp1 + 2 * src_pitch
                                : srcp1;
                }
                else
                {
                        srcp2 = (y < height-1)
                                ? srcp1 + src_pitch
                                : srcp1;
                }

                        int             ecx= src_row_size,ebx;
                        ecx>>=3;                                  // 8 bytes a time
                        const unsigned char *esi= srcp1;                              // top of 2 src lines to get
                        const unsigned char *edx= srcp2;                              // next "
                        unsigned char *edi= (unsigned char*)vWorkYW;                    // luma work destination line
                        int eax=0;

// Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
// This first loop is not the performance bottleneck anyway but it is trivial to tune
// using SSE2 if we have proper alignment.
#ifdef __SSE2__
                        if (SSE2enabled==0)                  // is SSE2 supported?
                         goto vMaybeSSEMMX;                            // n, can't do anyway

                        if (ecx< 2)                                  // we have at least 16 byts, 2 qwords?
                         goto vMaybeSSEMMX;                            // n, don't bother

                        if ((intptr_t(esi)|intptr_t(edx))&0xf)                               // both src rows 16 byte aligned?
                         goto vMaybeSSEMMX;                    // n, don't use sse2

                        ecx>>=1;                                // do 16 bytes at a time instead
                        ecx--;                                  // jigger loop ct
                        //align       16
                        movdqu  (xmm0, FPround1);
                        movdqu  (xmm5, vWeight1);
                        movdqu  (xmm6, vWeight2);
                        pxor    (xmm7, xmm7);

                        //align   16
        vLoopSSE2_Fetch:
                        prefetcht0 (esi+eax*2+16);
                        prefetcht0 (edx+eax*2+16);

        vLoopSSE2:
                        movdqu  (xmm1, esi+eax); // top of 2 lines to interpolate
                        movdqu  (xmm3, edx+eax); // 2nd of 2 lines
                        movdqa  (xmm2, xmm1);
                        movdqa  (xmm4, xmm3);

                        punpcklbw (xmm1, xmm7);                    // make words
                        punpckhbw (xmm2, xmm7);                    // "
                        punpcklbw (xmm3, xmm7);                    // "
                        punpckhbw (xmm4, xmm7);                    // "

                        pmullw  (xmm1, xmm5);                              // mult by top weighting factor
                        pmullw  (xmm2, xmm5);              // "

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?