📄 tomsmocomp.cpp
字号:
Tsimd::pavgb (mm6, mm0); // 3 pixels of valid new luma averag, plus garbage
pand (mm0, YMask); // keep only old luma, 0Y0Y0Y0Y
psllw (mm6, 8); // now only new luma as y0y0y0y0
por (mm6, mm0); // all luma yYyYyYyY
// now chroma bytes
movq (mm1, mm2);
Tsimd::pavgb (mm1, esi+4); // chroma 4 bytes to right
psrlw (mm2, 8); // only orig chroma as 0V0U0V0U
packuswb (mm2, mm2); // now xxxxVUVU
psrlw (mm1, 8); // only new chroma as 0?0?0v0u
packuswb (mm1, mm1); // now new chroma bytes xxxx??vu
punpcklwd (mm2,mm1); // ??VUvuVU, merged old and new chroma
// merge final luma & chroma bytes
movq (mm7, mm6); // copy of luma, ?YyYyYyY
punpcklbw (mm6, mm2); // low bytes vyuYVyUY
punpckhbw (mm7, mm2); // hi bytes ???YVyUY
Tsimd::movntq(edi, mm6); // store low qword
Tsimd::movntq(edi+8, mm7); // store high qword
}
// now expand the last 8 into 16 bytes
movq (mm0, esi); // orig luma, in register as VYUYVYUY
movq (mm2, mm0); // keep a copy while we got it
movq (mm6, mm0); // and another to work on now
psrlq (mm6, 16); // pretend most of it came from +2 offs (remem revers order)
Tsimd::pavgb ( mm6, mm0); // 3 pixels of valid new luma averag, plus garbage
pand (mm0, YMask); // keep only old luma, 0Y0Y0Y0Y
psllw (mm6, 8); // now only new luma as ?0y0y0y0
por (mm6, mm0); // all luma ?YyYyYyY
// now final chroma bytes
movq (mm1, mm2);
psrlq (mm1, 32); // pretend most of it came from +4 offs (remem revers order)
Tsimd::pavgb (mm1, mm2); // 2 pixels of valid new chroma averag, plus garbage
psrlw (mm2, 8); // only orig chroma as 0V0U0V0U
packuswb (mm2, mm2); // now xxxxVUVU
psrlw (mm1, 8); // only new chroma as 0?0?0v0u
packuswb (mm1, mm1); // now new chroma bytes xxxx??vu
punpcklwd (mm2,mm1); // ??VUvuVU, merged old and new chroma
// merge final luma & chroma bytes
movq( mm7, mm6); // copy of luma, ?YyYyYyY
punpcklbw (mm6, mm2); // low bytes vyuYVyUY
punpckhbw (mm7, mm2); // hi bytes ???YVyUY
movq (edi, mm6); // store low qword
movq (edi+8, mm7); // store high qword
*(edi+13)=*(esi+5);
*(short*)(edi+14)=*(short*)(esi+6);
}
// Per-byte "keep the better candidate" step used by Avisynth_DblResizeH.
//
// Inputs:
//   mm0, mm1 - the two pixel groups forming a new candidate pair
//   mm6      - best average found so far (updated in place)
//   mm7      - absolute difference belonging to the best average so far (updated in place)
// Scratch / clobbered: mm1, mm2, mm3, mm4 (mm0 is only used as a source operand here).
//
// For every byte lane the candidate avg(mm0,mm1) replaces the value in mm6 when
// |mm0-mm1| is less than or equal to the stored difference in mm7; mm7 is updated
// with the winning difference.
//
// NOTE(review): assumes movq/psubusb/por/pxor/pcmpeqb/pand take (destination, source)
// and behave like the MMX instructions of the same name, and that the third argument
// of Tsimd::v_pavgb is a scratch register - confirm against the Tsimd definitions.
static __forceinline void isBobBetter(__m64 &mm0,__m64 &mm1,__m64 &mm2,__m64 &mm3,__m64 &mm4,__m64 &mm6,__m64 &mm7)
{
movq (mm2, mm0 );
Tsimd::v_pavgb(mm2, mm1, mm3, ShiftMask) ; /* mm2 = candidate average of mm0 and mm1 */
movq (mm3, mm0 );
psubusb(mm3, mm1 ); /* saturating mm0-mm1: zero where mm1 >= mm0 */
psubusb(mm1, mm0 ); /* saturating mm1-mm0: zero where mm0 >= mm1 */
por (mm3, mm1 ); /* abs diff: one of the two halves is always zero */
movq (mm1, mm3 ); /* keep copy of the candidate's abs diff */
psubusb(mm3, mm7 ); /* nonzero where the new diff is bigger than the best so far, else 0 */
pxor (mm4, mm4 ); /* mm4 = all zero, used as the comparison constant */
pcmpeqb(mm3, mm4 ); /* now ff where new is better (or equal), else 00 */
pcmpeqb(mm4, mm3 ); /* inverse mask: ff where old is better, else 00 */
pand (mm1, mm3 ); /* keep only better new avg and abs */
pand (mm2, mm3 );
pand (mm6, mm4 ); /* keep old avg and abs only where old is still better */
pand (mm7, mm4 );
por (mm6, mm2 ); /* and merge new & old vals keeping best */
por (mm7, mm1 );
}
// Horizontally doubles the lines of a field: every 8 source bytes produce 16
// destination bytes (the inner loop advances the source by 8 and the destination by 16).
//
// The first two and the last two lines are expanded with DblResizeH_1. Every other
// line is built from a 5-line source window (2 lines above .. 2 lines below the
// line being produced): new luma bytes are chosen per byte among several pair
// averages by smallest absolute difference, new chroma is chosen the same way from
// a 3-line window, and the new bytes are interleaved with the original ones.
// The last 8 source bytes of each line use a simpler same-line average.
//
// Parameters:
//   src_pit   - byte distance between consecutive source lines
//   dst_pit   - byte distance between consecutive destination lines
//   rowsize   - source bytes per line; (rowsize-8)/8 groups go through the main
//               loop and the final 8 bytes are handled separately
//               NOTE(review): presumably a multiple of 8 and at least 8 - confirm with callers
//   srcp      - first source line
//   dstp      - first destination line
//   FldHeight - number of lines to expand
//               NOTE(review): values below 2 make the (FldHeight-2) offset negative;
//               presumably callers never pass that - verify
//
// NOTE(review): the pixel layout described in the comments (packed data seen in a
// register as VYUYVYUY, luma neighbours 2 bytes apart, chroma neighbours 4 bytes
// apart) comes from the original comments and the byte offsets used below; YMask and
// ShiftMask are defined outside this block. The movq/pand/psllw/... helpers and
// Tsimd:: functions are assumed to behave like the MMX/SSE instructions they are
// named after - confirm against their definitions.
static void Avisynth_DblResizeH(stride_t src_pit, stride_t dst_pit, int rowsize, const BYTE* srcp, BYTE* dstp, int FldHeight) // horizontally expand lines
{
const BYTE* pSrc = srcp; // top of the 5-line window, i.e. 2 lines above the line being produced
BYTE* pDest = dstp + 2*dst_pit; // the main loop starts producing at output line 2
int y;
int ct = (rowsize-8) >> 3; // 8-byte groups handled by the inner loop; the last 8 bytes are done after it
DblResizeH_1(rowsize, srcp, dstp); // expand top line
DblResizeH_1(rowsize, srcp + src_pit, dstp + dst_pit); // expand 2nd line
DblResizeH_1(rowsize, srcp+(FldHeight-2) * src_pit,dstp + (FldHeight-2) * dst_pit); // expand next to last line
DblResizeH_1(rowsize, srcp+(FldHeight-1) * src_pit, dstp + (FldHeight-1) * dst_pit); // expand last line
// FldHeight-4 iterations: lines 2 .. FldHeight-3, the ones that have 2 neighbours above and below
for (y=1; y <= FldHeight-4; y++, pSrc+=src_pit,pDest+=dst_pit)
{
// Loop general reg usage (variables are named after the registers of the original asm)
//
// eax - (unused)
// ebx - src 2 lines above curr
// ecx - loop ctr
// edx - src_pit
// edi - dest
// esi - src pixels, 1 line up (ebx + edx)
// now loop and get the middle qwords
unsigned char *edi= pDest;
const unsigned char *ebx= pSrc;
stride_t edx= src_pit;
__m64 mm0,mm1,mm6,mm7,mm2,mm4,mm3;
for (int ecx=ct;ecx>0;ebx+=8,edi+=16,ecx--)
{
// Assume our pixels are laid out as follows with x the calc'd value
// and the other pixels are from the current field, top, curr, and lower lines.
//
// i j 2 lines above
// a b above line
// c x d curr line
// e f next line
// m n 2 lines below
//
// we calc the middle x value as one of:
// x = avg(i,n), avg(j,m), avg(a,f), avg(b,e) or avg(c,d)
// selected per byte for the smallest absolute difference of the pair
// (the original comments call this SAD = sum of absolute diff).
// Candidates are tried in that order; a later candidate wins ties (see isBobBetter).
//
// we do this first for luma. Chroma is handled afterwards with a smaller window.
const unsigned char *esi=ebx+edx; // one line above the current line
// i,n
movq (mm0, ebx); // value i from top left
movq (mm1, ebx+4*edx+2); // value n from bottom right
movq (mm6, mm0);
// pavgb mm6, mm1 // avg(i,n), also best so far
Tsimd::v_pavgb(mm6, mm1, mm7, ShiftMask); // avg(i,n), also best so far (mm7 is overwritten below, so it is free here)
movq (mm7, mm0);
psubusb (mm7, mm1);
psubusb (mm1, mm0);
por (mm7, mm1); // abs diff (SAD), also best so far
// m,j
movq (mm0, ebx+2); // value j from top right
movq (mm1, ebx+4*edx); // value m from bottom left
isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);
// a,f
movq (mm0, esi); // value a from top left
movq (mm1, esi+2*edx+2); // value f from bottom right
isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);
// e,b
movq (mm0, esi+2); // value b from top right
movq (mm1, esi+2*edx); // value e from bottom left
isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);
// c,d
movq (mm0, esi+edx); // value c from left
movq (mm1, esi+edx+2); // value d from right
// We will also clip what we've got so far to avoid artifacts:
// the best diagonal result is forced into the range [min(c,d), max(c,d)]
movq (mm2, mm0);
// pminub mm2, mm1
Tsimd::v_pminub(mm2, mm1, mm4); // mm2 = min(c,d)
// pmaxub mm6, mm2 // clip our current results so far to be above this
Tsimd::pmaxub (mm6, mm2);
movq (mm2, mm0);
Tsimd::pmaxub (mm2, mm1); // mm2 = max(c,d)
// pminub mm6, mm2 // clip our current results so far to be below this
Tsimd::v_pminub (mm6, mm2, mm4);
isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);
// we have now created 4 new luma values for pixel offsets 1,3,5,7. But we need to merge these
// with luma values for offsets 0,2,4,6 and some chroma info. First the other luma.
movq (mm0, esi+edx); // orig luma, in register as VYUYVYUY
movq (mm2, mm0); // keep a copy while we got it
pand (mm0, YMask); // keep only old luma, 0Y0Y0Y0Y
psllw( mm6, 8); // now only new luma as y0y0y0y0
por (mm6, mm0); // all luma yYyYyYyY
__m64 SaveLuma;
movq (SaveLuma, mm6); // and save it for later, mm6/mm7 are reused for chroma
// get chroma, but we only use 3 lines for chroma instead of 5
// (chroma neighbours are 4 bytes apart instead of 2)
// a,f - Chroma
movq (mm0, esi); // value a from top left
movq (mm1, esi+2*edx+4); // value f from bottom right
movq (mm6, mm0 );
// pavgb mm6, mm1 // avg(a,f), also best so far
Tsimd::v_pavgb(mm6, mm1, mm7, ShiftMask);
movq (mm7, mm0);
psubusb (mm7, mm1);
psubusb (mm1, mm0);
por (mm7, mm1); // abs diff (SAD), also best so far
// e,b - Chroma
movq (mm0, esi+4); // value b from top right
movq (mm1, esi+2*edx); // value e from bottom left
isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);
// c,d - Chroma
movq (mm0, esi+edx); // value c from left
movq (mm1, esi+edx+4); // value d from right
// We will also clip what we've got so far to avoid artifacts (same range clamp as for luma)
movq (mm2, mm0);
// pminub mm2, mm1
Tsimd::v_pminub (mm2, mm1, mm4); // mm2 = min(c,d)
// pmaxub mm6, mm2 // clip our current results so far to be above this
Tsimd::pmaxub (mm6, mm2);
movq (mm2, mm0);
Tsimd::pmaxub (mm2, mm1); // mm2 = max(c,d)
// pminub mm6, mm2 // clip our current results so far to be below this
Tsimd::v_pminub (mm6, mm2, mm4);
isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);
movq (mm2, esi+edx); // another copy of orig Chroma from c
psrlw (mm2, 8); // only orig chroma as 0V0U0V0U
packuswb (mm2, mm2); // now xxxxVUVU
psrlw (mm6, 8); // only new chroma as 0v0u0v0u
packuswb (mm6, mm6); // now new chroma bytes xxxxvuvu
punpcklwd (mm2,mm6); // vuVUvuVU, merged old and new chroma
// merge luma & chroma
movq (mm6, SaveLuma); // get luma again
movq (mm7, mm6); // copy of luma
punpcklbw (mm6, mm2); // low bytes vyuYVyUY
punpckhbw (mm7, mm2); // hi bytes vyuYVyUY
// movntq qword ptr[eax+edx], mm0
Tsimd::movntq (edi, mm6); // store low qword
Tsimd::movntq (edi+8, mm7); // store high qword
}
// done with our horizontal loop but we have 8 more bytes of input to process
// (ebx/edi now point at the last 8 source bytes / last 16 destination bytes of the line).
// we will be fairly lazy with this and for now just use the current line, first luma
const unsigned char *esi= ebx+edx;
movq (mm0, esi+edx); // orig luma, in register as VYUYVYUY
movq (mm2, mm0); // keep a copy while we got it
movq (mm6, mm0); // and another to work on now
psrlq( mm6, 16 ); // pretend most of it came from +2 offs (remember reversed order)
Tsimd::pavgb( mm6, mm0); // 3 pixels of valid new luma average, plus garbage
pand (mm0, YMask ); // keep only old luma, 0Y0Y0Y0Y
psllw( mm6, 8 ); // now only new luma as ?0y0y0y0
por (mm6, mm0 ); // all luma ?YyYyYyY
// now final chroma bytes
movq (mm1, mm2);
psrlq (mm1, 32); // pretend most of it came from +4 offs (remember reversed order)
Tsimd::pavgb (mm1, mm2); // 2 pixels of valid new chroma average, plus garbage
psrlw (mm2, 8); // only orig chroma as 0V0U0V0U
packuswb (mm2, mm2); // now xxxxVUVU
psrlw (mm1, 8); // only new chroma as 0?0?0v0u
packuswb (mm1, mm1); // now new chroma bytes xxxx??vu
punpcklwd (mm2,mm1); // ??VUvuVU, merged old and new chroma
// merge final luma & chroma bytes
movq (mm7, mm6); // copy of luma, ?YyYyYyY
punpcklbw (mm6, mm2); // low bytes vyuYVyUY
punpckhbw (mm7, mm2); // hi bytes ???YVyUY
movq (edi, mm6); // store low qword
movq (edi+8, mm7); // store high qword
// the top three output bytes (13..15) were marked as garbage above (no right-hand
// neighbour to average with), so overwrite them with the last source bytes
*(edi+13)=*(esi+edx+5); // last U source byte
*(short*)(edi+14)=*(short*)(esi+edx+6); // last 2 YV source bytes
}
}
};
struct TomsMoComp :public ItomsMoComp
{
private:
// Symbolic names for the plane numbers 0, 1 and 2.
// NOTE(review): presumably used as the `plane` argument / array index of
// TVideoFrame (whose arrays have three entries) - confirm at the call sites.
enum
{
PLANAR_Y=0,
PLANAR_U=1,
PLANAR_V=2
};
// Plain per-plane frame description: one pointer, row size, height and pitch
// for each of three planes, plus accessors that simply index those arrays.
// No bounds checking is done on `plane`.
// NOTE(review): the accessor names presumably mirror the Avisynth VideoFrame
// API so filter code can be reused unchanged - confirm.
struct TVideoFrame
{
    // Plane pointers, returned by GetReadPtr / GetWritePtr.
    unsigned char* ptr[3];
    // Per-plane values returned by GetRowSize.
    unsigned int dx[3];
    // Per-plane values returned by GetHeight.
    unsigned int dy[3];
    // Per-plane values returned by GetPitch.
    stride_t stride[3];

    // Read-only view of the plane pointer.
    const unsigned char* GetReadPtr(int plane)
    {
        return ptr[plane];
    }

    // Writable view of the same plane pointer.
    unsigned char* GetWritePtr(int plane)
    {
        return ptr[plane];
    }

    stride_t GetPitch(int plane)
    {
        return stride[plane];
    }

    unsigned int GetRowSize(int plane)
    {
        return dx[plane];
    }

    unsigned int GetHeight(int plane)
    {
        return dy[plane];
    }
};
// Raw pointer alias for a frame description.
// NOTE(review): the type itself implies no ownership - confirm who allocates and frees these.
typedef TVideoFrame *PVideoFrame;
// NOTE(review): presumably the current input, the output and the previous input frame - confirm.
PVideoFrame src;
PVideoFrame dst;
PVideoFrame prevSrc;
uint8_t* pWorkArea; // passed to Avisynth_DblResizeH as the destination buffer in ProcessFrame
const uint8_t* pWeaveSrcP;
const uint8_t* pCopySrcP;
int TopFirst;
int searchEffort,oldSearchEffort;
bool IsYUY2,Use_Vertical_Filter; // Use_Vertical_Filter gates the horizontal-expand pass in ProcessFrame when SearchEffort==-2
bool SSE2enabled,SSEMMXenabled,_3DNOWenabled; // SSEMMXenabled selects the TdblResize<Tmmxext> variant in ProcessFrame
TsearchLoopFc searchLoopFc;
int PrevFrame;
int PrevInFrame;
int dropFrame;
stride_t src_pitch; // passed as the source line pitch to Avisynth_DblResizeH in ProcessFrame
static void Fieldcopy(void *dest, const void *src, size_t count, int rows, stride_t dst_pitch, stride_t src_pitch)
{
BYTE* pDest = (BYTE*) dest;
const BYTE* pSrc = (const BYTE*) src;
for (int i=0; i < rows; i++)
{
memcpy(pDest, pSrc, count);
pSrc += src_pitch;
pDest += dst_pitch;
}
}
void ProcessFrame(int SearchEffort,const unsigned char *srcp,unsigned char *dstp,stride_t dst_pitch,int rowsize,int FldHeight,int height)
{
unsigned char *pWeaveDest;const unsigned char *pWeaveSrc;
unsigned char *pCopyDest;const unsigned char *pCopySrc;
if (SearchEffort==-2) // note - DBL_Resize carried ast TopFirst == 0 hrtrtr
{
SearchEffort = -1; // pretend it is -1 after this
pWeaveDest = dstp+dst_pitch; // odd dest lines
pCopyDest = dstp; // even dest lines
if (Use_Vertical_Filter)
{
if (SSEMMXenabled)
TdblResize<Tmmxext>::Avisynth_DblResizeH(src_pitch, rowsize,rowsize/2, srcp, pWorkArea,FldHeight);// go H expand lines into even output lines
//else if (_3DNOWenabled)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -