⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dsp_mmx.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#include <stdlib.h>
#include "dsp.h"
#include "csimd.h"

using namespace csimd;

static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                  ogg_uint32_t ReconPixelsPerLine)
{
    __m64 mm7=_mm_setzero_si64();

    for (int i=0;i<8;i++)
     {
      __m64 mm0,mm1,mm2,mm3;
      movq        (FiltPtr, mm0);       /* mm0 = FiltPtr */
      movq        (ReconPtr, mm1);       /* mm1 = ReconPtr */
      movq        (mm0, mm2);      /* dup to prepare for up conversion */
      movq        (mm1, mm3);      /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
      punpcklbw   (mm7, mm0);      /* mm0 = INT16(FiltPtr) */
      punpcklbw   (mm7, mm1);      /* mm1 = INT16(ReconPtr) */
      punpckhbw   (mm7, mm2);      /* mm2 = INT16(FiltPtr) */
      punpckhbw   (mm7, mm3);      /* mm3 = INT16(ReconPtr) */
    /* start calculation */
      psubw       (mm1, mm0);      /* mm0 = FiltPtr - ReconPtr */
      psubw       (mm3, mm2);      /* mm2 = FiltPtr - ReconPtr */
      movq        (mm0,  DctInputPtr);      /* write answer out */
      movq        (mm2, 4+DctInputPtr);     /* write answer out */
    /* Increment pointers */
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr+=ReconPixelsPerLine;
     }
}

static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
                      ogg_uint32_t PixelsPerLine)
{
 __m64 mm1 = _mm_set1_pi16(0x00080);

 __m64 mm7=_mm_setzero_si64();
 for (int i=0;i<8;i++)
  {
   __m64 mm0,mm2;
      movq        (FiltPtr, mm0);       /* mm0 = FiltPtr */
      movq        (mm0, mm2);      /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
      punpcklbw   (mm7, mm0);      /* mm0 = INT16(FiltPtr) */
      punpckhbw   (mm7, mm2);      /* mm2 = INT16(FiltPtr) */
    /* start calculation */
      psubw       (mm1, mm0);      /* mm0 = FiltPtr - 128 */
      psubw       (mm1, mm2);      /* mm2 = FiltPtr - 128 */
      movq        (mm0,  DctInputPtr);      /* write answer out */
      movq        (mm2, 4+DctInputPtr);      /* write answer out */
    /* Increment pointers */
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
   }
}

static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                     ogg_uint32_t PixelsPerLine,
                     ogg_uint32_t ReconPixelsPerLine)
{
 __m64 mm7=_mm_setzero_si64();

 for (int i=0;i<8;i++)
  {
   __m64 mm0,mm1,mm2,mm3,mm4,mm5;
      movq        (FiltPtr, mm0);       /* mm0 = FiltPtr */
      movq        (ReconPtr1, mm1);       /* mm1 = ReconPtr1 */
      movq        (ReconPtr2, mm4);       /* mm1 = ReconPtr2 */
      movq        (mm0, mm2);      /* dup to prepare for up conversion */
      movq        (mm1, mm3);      /* dup to prepare for up conversion */
      movq        (mm4, mm5);      /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
      punpcklbw   (mm7, mm0);      /* mm0 = INT16(FiltPtr) */
      punpcklbw   (mm7, mm1);      /* mm1 = INT16(ReconPtr1) */
      punpcklbw   (mm7, mm4);      /* mm1 = INT16(ReconPtr2) */
      punpckhbw   (mm7, mm2);      /* mm2 = INT16(FiltPtr) */
      punpckhbw   (mm7, mm3);      /* mm3 = INT16(ReconPtr1) */
      punpckhbw   (mm7, mm5);     /* mm3 = INT16(ReconPtr2) */
    /* average ReconPtr1 and ReconPtr2 */
      paddw       (mm4, mm1);      /* mm1 = ReconPtr1 + ReconPtr2 */
      paddw       (mm5, mm3);      /* mm3 = ReconPtr1 + ReconPtr2 */
      psrlw       (1, mm1);         /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
      psrlw       (1, mm3);         /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
      psubw       (mm1, mm0);      /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
      psubw       (mm3, mm2);      /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
      movq        (mm0,  DctInputPtr);      /* write answer out */
      movq        (mm2, 4+DctInputPtr);      /* write answer out */
    /* Increment pointers */
      DctInputPtr+=8;
      FiltPtr+=PixelsPerLine;
      ReconPtr1+=ReconPixelsPerLine;
      ReconPtr2+=ReconPixelsPerLine;
   }
}

static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;
  __m64 mm6=_mm_setzero_si64(),mm7=mm6,mm0,mm1,mm2,mm3;
      movq        (Src1, mm0);      	/* take 8 bytes */
      movq        (Src2, mm1);

      movq        (mm0, mm2);
      psubusb     (mm1, mm0);      	/* A - B */
      psubusb     (mm2, mm1);     	/* B - A */
      por         (mm1, mm0);           	/* and or gives abs difference */

      movq        (mm0, mm1);

      punpcklbw   (mm6, mm0);            /* ; unpack low four bytes to higher precision */
      punpckhbw   (mm7, mm1);            /* ; unpack high four bytes to higher precision */

      movq        (mm0, mm2);
      movq        (mm1, mm3);
      psrlq       (32, mm2 );      	/* fold and add */
      psrlq       (32, mm3 );
      paddw       (mm2, mm0);
      paddw       (mm3, mm1);
      movq        (mm0, mm2);
      movq        (mm1, mm3);
      psrlq       (16, mm2 );
      psrlq       (16, mm3 );
      paddw       (mm2, mm0);
      paddw       (mm3, mm1);

      psubusw     (mm0, mm1);
      paddw       (mm0, mm1);      	/* mm1 = max(mm1, mm0) */
      movd        (mm1, (int&)MaxSad);

  return MaxSad&0xffff;
}

static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
		                    ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __m64 mm3=_mm_setzero_si64(),mm4=mm3,mm5=mm3,mm6=mm3,mm7=mm3,mm0,mm1,mm2;
  int i;
  for (i=0;i<4;i++,Src1+=stride,Src2+=stride)
   {
      movq        (Src1, mm0);      	/* take 8 bytes */
      movq        (Src2, mm1);      	/* take 8 bytes */

      movq        (mm0, mm2);
      psubusb     (mm1, mm0);      	/* A - B */
      psubusb     (mm2, mm1);     	/* B - A */
      por         (mm1, mm0);           	/* and or gives abs difference */
      movq        (mm0, mm1);

      punpcklbw   (mm3, mm0);     	/* unpack to higher precision for accumulation */
      paddw       (mm0, mm4);     	/* accumulate difference... */
      punpckhbw   (mm3, mm1);     	/* unpack high four bytes to higher precision */
      paddw       (mm1, mm5);     	/* accumulate difference... */
     }

  for (i=0;i<4;i++,Src1+=stride,Src2+=stride)
   {
      movq        (Src1, mm0);      	/* take 8 bytes */
      movq        (Src2, mm1);      	/* take 8 bytes */

      movq        (mm0, mm2);
      psubusb     (mm1, mm0);      	/* A - B */
      psubusb     (mm2, mm1);     	/* B - A */
      por         (mm1, mm0);           	/* and or gives abs difference */
      movq        (mm0, mm1);

      punpcklbw   (mm3, mm0);     	/* unpack to higher precision for accumulation */
      paddw       (mm0, mm6);     	/* accumulate difference... */
      punpckhbw   (mm3, mm1);     	/* unpack high four bytes to higher precision */
      paddw       (mm1, mm7);     	/* accumulate difference... */
   }

      psubusw     (mm6, mm7);
      paddw       (mm6, mm7);      	/* mm7 = max(mm7, mm6) */
      psubusw     (mm4, mm5);
      paddw       (mm4, mm5);      	/* mm5 = max(mm5, mm4) */
      psubusw     (mm5, mm7);
      paddw       (mm5, mm7);      	/* mm7 = max(mm5, mm7) */
      movq        (mm7, mm6);
      psrlq       (32, mm6 );
      psubusw     (mm6, mm7);
      paddw       (mm6, mm7);      	/* mm7 = max(mm5, mm7) */
      movq        (mm7, mm6);
      psrlq       (16, mm6 );
      psubusw     (mm6, mm7);
      paddw       (mm6, mm7);      	/* mm7 = max(mm5, mm7) */
      movd        (mm7, (int&)MaxSad);

  return MaxSad&0xffff;
}

static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
		       	    unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t  DiffVal;

  __m64 mm6=_mm_setzero_si64(),mm7=mm6,mm0,mm1,mm2;
  for (int i=0;i<8;i++,ptr1+=stride1,ptr2+=stride2)
   {
      movq        (ptr1, mm0);      	/* take 8 bytes */
      movq        (ptr2, mm1);
      movq        (mm0, mm2);

      psubusb     (mm1, mm0);      	/* A - B */
      psubusb     (mm2, mm1);     	/* B - A */
      por         (mm1, mm0);           	/* and or gives abs difference */
      movq        (mm0, mm1);

      punpcklbw   (mm6, mm0);     	/* unpack to higher precision for accumulation */
      paddw       (mm0, mm7);     	/* accumulate difference... */
      punpckhbw   (mm6, mm1);     	/* unpack high four bytes to higher precision */
      paddw       (mm1, mm7);     	/* accumulate difference... */
    }

      movq        (mm7, mm0     );
      psrlq       (32, mm7       );
      paddw       (mm0, mm7     );
      movq        (mm7, mm0     );
      psrlq       (16, mm7       );
      paddw       (mm0, mm7     );
      movd        (mm7, (int&)DiffVal        );
  return DiffVal&0xffff;;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -