📄 tomsmocomp.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 4 页
字号:
     Tsimd::pavgb   (mm6, mm0);    // 3 pixels of valid new luma averag, plus garbage
     pand (mm0, YMask);    // keep only old luma, 0Y0Y0Y0Y
     psllw (mm6, 8);     // now only new luma as y0y0y0y0
     por  (mm6, mm0);    // all luma yYyYyYyY

     // now  chroma bytes
     movq (mm1, mm2);
     Tsimd::pavgb (mm1, esi+4); // chroma 4 bytes to right
     psrlw (mm2, 8);     // only orig chroma as 0V0U0V0U
     packuswb (mm2, mm2);    // now xxxxVUVU
     psrlw (mm1, 8);     // only new chroma as 0?0?0v0u
     packuswb (mm1, mm1);    // now new chroma bytes xxxx??vu
     punpcklwd (mm2,mm1);    // ??VUvuVU, merged old and new chroma

     // merge final luma & chroma bytes

     movq (mm7, mm6);    // copy of luma, ?YyYyYyY
     punpcklbw (mm6, mm2);    // low bytes vyuYVyUY
     punpckhbw (mm7, mm2);    // hi bytes ???YVyUY

     Tsimd::movntq(edi, mm6); // store low qword
     Tsimd::movntq(edi+8, mm7); // store high qword
    }

   // now expand the last 8 into 16 bytes
   movq (mm0, esi);  // orig luma, in register as VYUYVYUY
   movq (mm2, mm0);    // keep a copy while we got it
   movq (mm6, mm0);    // and another to work on now

   psrlq (mm6, 16);     // pretend most of it came from +2 offs (remem revers order)
   Tsimd::pavgb (  mm6, mm0);    // 3 pixels of valid new luma averag, plus garbage
   pand (mm0, YMask);    // keep only old luma, 0Y0Y0Y0Y
   psllw (mm6, 8);     // now only new luma as ?0y0y0y0
   por  (mm6, mm0);    // all luma ?YyYyYyY

   // now final chroma bytes
   movq (mm1, mm2);
   psrlq (mm1, 32);     // pretend most of it came from +4 offs (remem revers order)
   Tsimd::pavgb   (mm1, mm2);    // 2 pixels of valid new chroma averag, plus garbage
   psrlw (mm2, 8);     // only orig chroma as 0V0U0V0U
   packuswb (mm2, mm2);    // now xxxxVUVU
   psrlw (mm1, 8);     // only new chroma as 0?0?0v0u
   packuswb (mm1, mm1);    // now new chroma bytes xxxx??vu
   punpcklwd (mm2,mm1);    // ??VUvuVU, merged old and new chroma

   // merge final luma & chroma bytes

   movq( mm7, mm6);    // copy of luma, ?YyYyYyY
   punpcklbw (mm6, mm2);    // low bytes vyuYVyUY
   punpckhbw (mm7, mm2);    // hi bytes ???YVyUY

   movq (edi, mm6);    // store low qword
   movq (edi+8, mm7);     // store high qword

   *(edi+13)=*(esi+5);
   *(short*)(edi+14)=*(short*)(esi+6);
  }
 // Compares one more candidate pixel pair (mm0, mm1) against the running
 // result and keeps, byte by byte, whichever has the smaller absolute
 // difference.
 //   in : mm0, mm1 - the two candidate inputs (mm1 is overwritten)
 //   io : mm6      - best average so far
 //        mm7      - absolute difference that belongs to mm6
 //   tmp: mm2, mm3, mm4 - overwritten (candidate average and the two masks)
 static __forceinline void isBobBetter(__m64 &mm0,__m64 &mm1,__m64 &mm2,__m64 &mm3,__m64 &mm4,__m64 &mm6,__m64 &mm7)
  {
   // Candidate average of the pair; mm3 is only handed to v_pavgb as scratch here.
   movq(mm2, mm0);
   Tsimd::v_pavgb(mm2, mm1, mm3, ShiftMask);

   // Per-byte |mm0 - mm1|, assembled from two saturating subtractions.
   movq(mm3, mm0);
   psubusb(mm3, mm1);
   psubusb(mm1, mm0);
   por(mm3, mm1);
   movq(mm1, mm3);            // second copy, merged into mm7 at the end

   // Two complementary byte masks. A tie counts as a win for the candidate,
   // because the saturating subtraction below is zero when the candidate
   // difference does not exceed the stored one.
   psubusb(mm3, mm7);
   pxor(mm4, mm4);
   pcmpeqb(mm3, mm4);         // mm3: ff where the candidate wins, 00 elsewhere
   pcmpeqb(mm4, mm3);         // mm4: ff where the stored result wins, 00 elsewhere

   // Best average: candidate bytes under mm3, stored bytes under mm4.
   pand(mm2, mm3);
   pand(mm6, mm4);
   por(mm6, mm2);

   // Matching absolute difference, selected with the same masks.
   pand(mm1, mm3);
   pand(mm7, mm4);
   por(mm7, mm1);
  }

 // Horizontally expands a field: every 8 source bytes become 16 destination bytes.
 //
 //   src_pit   - byte step between consecutive source rows
 //   dst_pit   - byte step between consecutive destination rows
 //   rowsize   - number of source bytes handled per row (the inner loop runs
 //               (rowsize-8)>>3 times, then one final 8-byte group is done separately)
 //   srcp/dstp - first source / destination row
 //   FldHeight - number of rows
 //
 // Rows 0, 1, FldHeight-2 and FldHeight-1 are handed to DblResizeH_1. Every other
 // row is built from five source rows (two above .. two below): for each new byte
 // several diagonal/horizontal pixel pairs are tried via isBobBetter and the
 // average of the pair with the smallest absolute difference is kept.
 // NOTE(review): the offsets below (FldHeight-2, 4*src_pit) presumably require
 // FldHeight >= 4 - confirm against the caller.
 static void Avisynth_DblResizeH(stride_t src_pit, stride_t dst_pit, int rowsize, const BYTE* srcp, BYTE* dstp, int FldHeight)     // horizontally expand the source lines into the output lines
  {
   const BYTE* pSrc = srcp;
   BYTE* pDest = dstp + 2*dst_pit;   // first destination row produced by the main loop
   int y;
   int ct = (rowsize-8) >> 3;        // number of full 8-byte groups; the last group is handled after the inner loop

   DblResizeH_1(rowsize, srcp, dstp);  // expand top line
   DblResizeH_1(rowsize, srcp + src_pit, dstp + dst_pit);  // expand 2nd line
   DblResizeH_1(rowsize, srcp+(FldHeight-2) * src_pit,dstp + (FldHeight-2) * dst_pit);  // expand next to last line
   DblResizeH_1(rowsize, srcp+(FldHeight-1) * src_pit, dstp + (FldHeight-1) * dst_pit);  // expand last line

   // pSrc always points two rows above the row being written through pDest.
   for (y=1; y <= FldHeight-4; y++, pSrc+=src_pit,pDest+=dst_pit)
    {
     // Variable names are kept from the original assembly version:
     //
     // eax - (unused here)
     // ebx - src, 2 lines above the current line
     // ecx - loop counter
     // edx - src_pit (source row step)
     // edi - dest
     // esi - src pixels, 1 line above the current line

     // now loop and get the middle qwords
     unsigned char *edi= pDest;
     const unsigned char *ebx= pSrc;
     stride_t edx= src_pit;
     __m64 mm0,mm1,mm6,mm7,mm2,mm4,mm3;
     for (int ecx=ct;ecx>0;ebx+=8,edi+=16,ecx--)
      {
       // Assume our pixels are laid out as follows, with x the calculated value
       // and the other pixels taken from the current field: the two lines above,
       // the current line, and the two lines below.
       //
       //    i   j   2 lines above
       //    a   b   above line
       //        c x d   curr line
       //        e   f   next line
       //        m   n   2 lines below
       //
       // The middle value x is the average of one of the pairs
       //  (i,n), (j,m), (a,f), (b,e) or (c,d),
       // choosing per byte the pair with the smallest absolute difference
       // (called SAD in the original notes).
       //
       // This is done first for luma; chroma uses the same scheme further down,
       // but with only three source lines and a 4-byte horizontal offset.

       const unsigned char *esi=ebx+edx;

       // i,n
       movq (mm0, ebx);   // value i from top left
       movq (mm1, ebx+4*edx+2); // value n from bottom right
       movq (mm6, mm0);
       //  original asm: pavgb mm6, mm1     // avg(i,n), also best so far
       Tsimd::v_pavgb(mm6, mm1, mm7, ShiftMask); // avg(i,n), also best so far (mm7 is scratch here)
       movq (mm7, mm0);
       psubusb (mm7, mm1);
       psubusb (mm1, mm0);
       por  (mm7, mm1);     // abs diff, also best so far

       // m,j
       movq    (mm0, ebx+2);  // value j from top right
       movq    (mm1, ebx+4*edx); // value m from bottom left
       isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);

       // a,f
       movq    (mm0, esi);   // value a from top left
       movq    (mm1, esi+2*edx+2); // value f from bottom right
       isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);


       // e,b
       movq    (mm0, esi+2);  // value b from top right
       movq    (mm1, esi+2*edx); // value e from bottom left
       isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);

       // c,d
       movq    (mm0, esi+edx); // value c from left
       movq    (mm1, esi+edx+2); // value d from right

       // Clip the best value found so far into the range [min(c,d), max(c,d)]
       // to avoid artifacts, before (c,d) itself is tried as a candidate.

       movq (mm2, mm0);
       //  original asm: pminub mm2, mm1
       Tsimd::v_pminub(mm2, mm1, mm4);
       //  original asm: pmaxub mm6, mm2   // clip our current results so far to be above this
       Tsimd::pmaxub (mm6, mm2);
       movq (mm2, mm0);
       Tsimd::pmaxub (mm2, mm1);
       //  original asm: pminub mm6, mm2   // clip our current results so far to be below this
       Tsimd::v_pminub (mm6, mm2, mm4);

       isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);

       // We have now created 4 new luma values for pixel offsets 1,3,5,7, but they
       // still have to be merged with the luma values for offsets 0,2,4,6 and with
       // the chroma. First the other luma.
       movq (mm0, esi+edx); // orig luma, in register as VYUYVYUY
       movq (mm2, mm0);    // keep a copy while we have it
       pand (mm0, YMask);    // keep only old luma, 0Y0Y0Y0Y
       psllw( mm6, 8);     // now only new luma as y0y0y0y0
       por  (mm6, mm0);    // all luma yYyYyYyY
       __m64 SaveLuma;
       movq (SaveLuma, mm6);   // and save it for later

       // get chroma, but only 3 lines are used for chroma instead of 5


       // a,f - Chroma
       movq (mm0, esi);   // value a from top left
       movq (mm1, esi+2*edx+4); // value f from bottom right
       movq (mm6, mm0 );
       //  original asm: pavgb mm6, mm1     // avg(a,f), also best so far
       Tsimd::v_pavgb(mm6, mm1, mm7, ShiftMask);
       movq (mm7, mm0);
       psubusb (mm7, mm1);
       psubusb (mm1, mm0);
       por  (mm7, mm1);     // abs diff, also best so far

       // e,b - Chroma
       movq    (mm0, esi+4);  // value b from top right
       movq    (mm1, esi+2*edx); // value e from bottom left
       isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);

       // c,d - Chroma
       movq    (mm0, esi+edx); // value c from left
       movq    (mm1, esi+edx+4); // value d from right

       // Same clipping as for luma: keep the result within [min(c,d), max(c,d)]
       movq (mm2, mm0);
       //  original asm: pminub mm2, mm1
       Tsimd::v_pminub (mm2, mm1, mm4);
       //  original asm: pmaxub mm6, mm2   // clip our current results so far to be above this
       Tsimd::pmaxub (mm6, mm2);
       movq (mm2, mm0);
       Tsimd::pmaxub (mm2, mm1);
       //  original asm: pminub mm6, mm2   // clip our current results so far to be below this
       Tsimd::v_pminub (mm6, mm2, mm4);

       isBobBetter(mm0,mm1,mm2,mm3,mm4,mm6,mm7);

       movq  (mm2, esi+edx); // another copy of orig chroma from c
       psrlw (mm2, 8);     // only orig chroma as 0V0U0V0U
       packuswb (mm2, mm2);    // now xxxxVUVU
       psrlw (mm6, 8);     // only new chroma as 0v0u0v0u
       packuswb (mm6, mm6);    // now new chroma bytes xxxxvuvu
       punpcklwd (mm2,mm6);    // vuVUvuVU, merged old and new chroma

       // merge luma & chroma
       movq (mm6, SaveLuma);   // get luma again
       movq (mm7, mm6);    // copy of luma
       punpcklbw (mm6, mm2);    // low bytes vyuYVyUY
       punpckhbw (mm7, mm2);    // hi bytes vyuYVyUY

       //  original asm: movntq qword ptr[eax+edx], mm0
       Tsimd::movntq (edi, mm6); // store low qword
       Tsimd::movntq (edi+8, mm7); // store high qword
      }

     // Done with the horizontal loop, but 8 more bytes of input remain.
     // These are handled the simple way, using only the current line. First luma.
     const unsigned char *esi= ebx+edx;
     movq (mm0, esi+edx); // orig luma, in register as VYUYVYUY
     movq (mm2, mm0);    // keep a copy while we have it
     movq (mm6, mm0);    // and another to work on now

     psrlq( mm6, 16  );   // shift so it looks as if loaded from offset +2 (byte order in the register is reversed)
     Tsimd::pavgb(   mm6, mm0);    // 3 pixels of valid new luma average, plus garbage
     pand (mm0, YMask );  // keep only old luma, 0Y0Y0Y0Y
     psllw( mm6, 8    ); // now only new luma as ?0y0y0y0
     por  (mm6, mm0   ); // all luma ?YyYyYyY

   // now final chroma bytes
     movq (mm1, mm2);
     psrlq (mm1, 32);     // shift so it looks as if loaded from offset +4 (byte order in the register is reversed)
     Tsimd::pavgb   (mm1, mm2);    // 2 pixels of valid new chroma average, plus garbage
     psrlw (mm2, 8);     // only orig chroma as 0V0U0V0U
     packuswb (mm2, mm2);    // now xxxxVUVU
     psrlw (mm1, 8);     // only new chroma as 0?0?0v0u
     packuswb (mm1, mm1);    // now new chroma bytes xxxx??vu
     punpcklwd (mm2,mm1);    // ??VUvuVU, merged old and new chroma

   // merge final luma & chroma bytes

     movq (mm7, mm6);    // copy of luma, ?YyYyYyY
     punpcklbw (mm6, mm2);    // low bytes vyuYVyUY
     punpckhbw (mm7, mm2);    // hi bytes ???YVyUY

     movq (edi, mm6);    // store low qword
     movq (edi+8, mm7);     // store high qword

     // Overwrite the three garbage bytes at the end of the row with the
     // corresponding original source bytes.
     *(edi+13)=*(esi+edx+5);    // last U source byte
     *(short*)(edi+14)=*(short*)(esi+edx+6);    // last 2 YV source bytes
    }
  }
};

struct TomsMoComp :public ItomsMoComp
{
private:
 enum
  {
   PLANAR_Y=0,
   PLANAR_U=1,
   PLANAR_V=2
  };
 // Plain description of a frame with up to three planes (indexed with the
 // PLANAR_* values): one base pointer, row size, height and stride per plane.
 // Nothing in this struct allocates or frees the plane memory.
 struct TVideoFrame
  {
   unsigned char* ptr[3];   // base pointer of each plane
   unsigned int dx[3];      // per-plane value returned by GetRowSize()
   unsigned int dy[3];      // per-plane value returned by GetHeight()
   stride_t stride[3];      // per-plane value returned by GetPitch()

   // `plane` is used directly as an array index and is not range-checked.
   // The read-only accessors are const so they are usable through a const
   // frame as well; GetWritePtr stays non-const because it hands out
   // writable memory.
   const unsigned char* GetReadPtr(int plane) const {return ptr[plane];}
   unsigned char* GetWritePtr(int plane) {return ptr[plane];}
   stride_t GetPitch(int plane) const {return stride[plane];}
   unsigned int GetRowSize(int plane) const {return dx[plane];}
   unsigned int GetHeight(int plane) const {return dy[plane];}
  };
 typedef TVideoFrame *PVideoFrame;
 PVideoFrame src;
 PVideoFrame dst;
 PVideoFrame prevSrc;
 uint8_t* pWorkArea;
 const uint8_t* pWeaveSrcP;
 const uint8_t* pCopySrcP;

 int TopFirst;
 int searchEffort,oldSearchEffort;
 bool IsYUY2,Use_Vertical_Filter;
 bool SSE2enabled,SSEMMXenabled,_3DNOWenabled;
 TsearchLoopFc searchLoopFc;

 int PrevFrame;
 int PrevInFrame;
 int dropFrame;

 stride_t src_pitch;

 static void Fieldcopy(void *dest, const void *src, size_t count, int rows, stride_t dst_pitch, stride_t src_pitch)
  {
  BYTE* pDest = (BYTE*) dest;
  const BYTE* pSrc = (const BYTE*) src;
  for (int i=0; i < rows; i++)
   {
    memcpy(pDest, pSrc, count);
    pSrc += src_pitch;
    pDest += dst_pitch;
   }
 }

 void ProcessFrame(int SearchEffort,const unsigned char *srcp,unsigned char *dstp,stride_t dst_pitch,int rowsize,int FldHeight,int height)
  {
   unsigned char *pWeaveDest;const unsigned char *pWeaveSrc;
   unsigned char *pCopyDest;const unsigned char *pCopySrc;
   if (SearchEffort==-2)    // note - DBL_Resize carried ast TopFirst == 0 hrtrtr
    {
     SearchEffort = -1;    // pretend it is -1 after this
     pWeaveDest = dstp+dst_pitch; // odd dest lines
     pCopyDest = dstp;    // even dest lines
     if (Use_Vertical_Filter)
      {
       if (SSEMMXenabled)
        TdblResize<Tmmxext>::Avisynth_DblResizeH(src_pitch, rowsize,rowsize/2, srcp, pWorkArea,FldHeight);// go H expand lines into even output lines
       //else if (_3DNOWenabled)
💿 文件大小 8073 K
👤 上传用户 sinba
📂 所属分类压缩解压
🏷️ 相关标签

#FFMPEG #H264 #VC #转换
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -