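/*
 * Download-page header captured together with this file (not part of the
 * program text):
 *   Site:        Chongchong download site (resource downloads, resource
 *                albums, about us)
 *   File:        dsp_mmxext.cpp
 *   Collection:  H.264 decoder converted from FFmpeg, built with Visual C++
 *   Language:    C++
 */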
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function: MMXEXT versions of the SAD and inter-error DSP routines

 ********************************************************************/

#include <stdlib.h>
#include "dsp.h"
#include "csimd.h"

using namespace csimd;

/*
 * Walks 8 rows of two byte blocks, 8 bytes per row, folding one psadbw
 * result per row into an accumulator, and returns the low 32 bits of that
 * accumulator.
 *
 *   ptr1, stride1  first block and the byte step between its rows
 *   ptr2, stride2  second block and the byte step between its rows
 *
 * NOTE(review): the csimd helpers are assumed to behave like the MMX
 * instructions they are named after, taking (source, destination) operands
 * -- confirm against csimd.h.
 */
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
                                    unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t total;
  __m64 acc = _mm_setzero_si64();   /* running sum over all rows */

  for (int rows_left = 8; rows_left > 0; --rows_left)
   {
    __m64 row_a, row_b;

    movq (ptr1, row_a);             /* 8 bytes of the first block */
    movq (ptr2, row_b);             /* 8 bytes of the second block */
    psadbw (row_b, row_a);          /* row_a now holds this row's result */
    paddw (row_a, acc);             /* fold it into the running sum */

    ptr1 += stride1;
    ptr2 += stride2;
   }

  /* movd writes through an int reference, so view the unsigned result as int. */
  movd (acc, reinterpret_cast<int&>(total));
  return total;
}

/*
 * Threshold-signature variant of the 8x8 block comparison.  The loop is the
 * same 8-row psadbw accumulation as the plain version; the thres argument
 * is accepted but never read, so no early exit takes place here.
 *
 *   ptr1, stride1  first block and the byte step between its rows
 *   ptr2, stride2  second block and the byte step between its rows
 *   thres          unused by this implementation
 *
 * NOTE(review): the csimd helpers are assumed to behave like the MMX
 * instructions they are named after, taking (source, destination) operands
 * -- confirm against csimd.h.
 */
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
                                          unsigned char *ptr2, ogg_uint32_t stride2,
                                          ogg_uint32_t thres)
{
  ogg_uint32_t total;
  __m64 acc = _mm_setzero_si64();   /* holds the result while looping */

  int row = 0;
  while (row < 8)
   {
    __m64 row_a, row_b;

    movq (ptr1, row_a);             /* 8 bytes of the first block */
    movq (ptr2, row_b);             /* 8 bytes of the second block */
    psadbw (row_b, row_a);          /* row_a now holds this row's result */
    paddw (row_a, acc);             /* fold it into the running sum */

    ptr1 += stride1;
    ptr2 += stride2;
    ++row;
   }

  /* movd writes through an int reference, so view the unsigned result as int. */
  movd (acc, reinterpret_cast<int&>(total));
  return total;
}

/*
 * Compares an 8x8 source block against the pavgb combination of two
 * reference blocks: each row of the two references is merged with pavgb,
 * compared to the source row with psadbw, and the per-row results are
 * summed.  Returns the low 32 bits of the accumulator.
 *
 *   SrcData, SrcStride   source block and the byte step between its rows
 *   RefDataPtr1/2        the two reference blocks
 *   RefStride            byte step between rows, shared by both references
 *   thres                unused by this implementation
 *
 * NOTE(review): the csimd helpers are assumed to behave like the MMX
 * instructions they are named after, taking (source, destination) operands
 * -- confirm against csimd.h.
 */
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                              unsigned char *RefDataPtr1,
                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                              ogg_uint32_t thres)
{
  ogg_uint32_t total;
  __m64 acc = _mm_setzero_si64();   /* running sum over all rows */

  for (int rows_left = 8; rows_left > 0; --rows_left)
   {
    __m64 src_row, ref_a, ref_b;

    movq (SrcData, src_row);        /* 8 source bytes */
    movq (RefDataPtr1, ref_a);
    movq (RefDataPtr2, ref_b);
    pavgb (ref_b, ref_a);           /* ref_a <- combination of both references */
    psadbw (ref_a, src_row);        /* src_row <- this row's result */
    paddw (src_row, acc);           /* fold it into the running sum */

    SrcData     += SrcStride;
    RefDataPtr1 += RefStride;
    RefDataPtr2 += RefStride;
   }

  /* movd writes through an int reference, so view the unsigned result as int. */
  movd (acc, reinterpret_cast<int&>(total));
  return total;
}

/*
 * Splits an 8-byte row into two 4-byte halves (offsets 0 and 4), runs
 * psadbw on each half of Src1 against the matching half of Src2, keeps the
 * larger of the two via pmaxsw, and returns the low 16 bits of the result.
 *
 * NOTE(review): the csimd helpers are assumed to behave like the MMX
 * instructions they are named after, taking (source, destination) operands
 * -- confirm against csimd.h.
 */
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t larger;
  __m64 first_a, first_b;     /* bytes 0..3 of Src1 / Src2 */
  __m64 second_a, second_b;   /* bytes 4..7 of Src1 / Src2 */

  /* Fetch all four half-rows up front; the loads do not depend on each other. */
  movd (Src1, first_a);
  movd (Src2, first_b);
  movd (Src1 + 4, second_a);
  movd (Src2 + 4, second_b);

  psadbw (first_a, first_b);      /* first_b  <- result for the first half */
  psadbw (second_a, second_b);    /* second_b <- result for the second half */

  pmaxsw (first_b, second_b);     /* keep the larger half-row value */
  movd (second_b, reinterpret_cast<int&>(larger));

  return larger & 0xffff;
}

/*
 * Builds per-column sums of byte-wise absolute differences between two 8x8
 * blocks, kept separately for rows 0..3 and rows 4..7, then reduces all 16
 * word lanes with pmaxsw and returns the low 16 bits of the winner.
 *
 *   Src1, Src2  the two blocks
 *   stride      byte step between rows, shared by both blocks
 *
 * NOTE(review): the csimd helpers are assumed to behave like the MMX
 * instructions they are named after, taking (source, destination) operands
 * -- confirm against csimd.h.
 */
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
                                        ogg_uint32_t stride)
{
  ogg_uint32_t peak_bits;

  __m64 zero = _mm_setzero_si64();      /* zero bytes for widening to words */
  __m64 top_lo = zero, top_hi = zero;   /* rows 0..3: columns 0..3 / 4..7 */
  __m64 bot_lo = zero, bot_hi = zero;   /* rows 4..7: columns 0..3 / 4..7 */

  for (int row = 0; row < 8; ++row)
   {
    /* The first four rows feed one accumulator pair, the last four the other. */
    __m64 &acc_lo = (row < 4) ? top_lo : bot_lo;
    __m64 &acc_hi = (row < 4) ? top_hi : bot_hi;
    __m64 diff_lo, diff_hi, saved;

    movq (Src1, diff_lo);               /* 8 bytes of the first block */
    movq (Src2, diff_hi);               /* 8 bytes of the second block */

    movq (diff_lo, saved);
    psubusb (diff_hi, diff_lo);         /* A - B, saturating */
    psubusb (saved, diff_hi);           /* B - A, saturating */
    por (diff_hi, diff_lo);             /* OR of the two gives |A - B| */
    movq (diff_lo, diff_hi);

    punpcklbw (zero, diff_lo);          /* low four bytes widened to words */
    paddw (diff_lo, acc_lo);
    punpckhbw (zero, diff_hi);          /* high four bytes widened to words */
    paddw (diff_hi, acc_hi);

    Src1 += stride;
    Src2 += stride;
   }

  /* Collapse the four accumulators into bot_hi, lane by lane. */
  pmaxsw (top_lo, top_hi);
  pmaxsw (bot_lo, bot_hi);
  pmaxsw (top_hi, bot_hi);

  /* Then fold the four lanes of bot_hi down into its lowest word. */
  __m64 shifted;
  movq (bot_hi, shifted);
  psrlq (32, shifted);
  pmaxsw (shifted, bot_hi);
  movq (bot_hi, shifted);
  psrlq (16, shifted);
  pmaxsw (shifted, bot_hi);

  movd (bot_hi, reinterpret_cast<int&>(peak_bits));
  return peak_bits & 0xffff;
}

/*
 * Error metric between an 8x8 source block and the pavgb combination of two
 * reference blocks.  Per row, source and combined reference are widened to
 * words and subtracted; the differences are summed (16-bit lanes) and their
 * squares are summed (32-bit lanes).  The return value is
 * (sum_of_squares << 6) - sum * sum.
 *
 *   SrcData, SrcStride   source block and the byte step between its rows
 *   RefDataPtr1/2        the two reference blocks
 *   RefStride            byte step between rows, shared by both references
 *
 * NOTE(review): the csimd helpers are assumed to behave like the MMX
 * instructions they are named after, taking (source, destination) operands
 * -- confirm against csimd.h.
 */
static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                              unsigned char *RefDataPtr1,
                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  __m64 zero    = _mm_setzero_si64();   /* zero bytes for widening to words */
  __m64 sum_acc = zero;                 /* word lanes: sum of differences */
  __m64 sq_acc  = zero;                 /* dword lanes: sum of squared differences */
  __m64 scratch;

  for (int rows_left = 8; rows_left > 0; --rows_left)
   {
    __m64 src_lo, src_hi, pred_lo, pred_hi, ref_a;

    movq (SrcData, src_lo);             /* 8 source bytes */

    movq (RefDataPtr1, ref_a);
    movq (RefDataPtr2, pred_lo);
    pavgb (ref_a, pred_lo);             /* pred_lo <- combination of both references */

    movq (src_lo, src_hi);
    movq (pred_lo, pred_hi);

    punpcklbw (zero, src_lo);           /* low halves widened to words */
    punpcklbw (zero, pred_lo);
    punpckhbw (zero, src_hi);           /* high halves widened to words */
    punpckhbw (zero, pred_hi);

    psubsw (pred_lo, src_lo);           /* src_lo/src_hi now hold the differences */
    psubsw (pred_hi, src_hi);

    paddw (src_lo, sum_acc);
    paddw (src_hi, sum_acc);

    pmaddwd (src_lo, src_lo);           /* square and pair-wise add */
    pmaddwd (src_hi, src_hi);

    paddd (src_lo, sq_acc);
    paddd (src_hi, sq_acc);

    SrcData     += SrcStride;
    RefDataPtr1 += RefStride;
    RefDataPtr2 += RefStride;
   }

  /* Fold the four word lanes of sum_acc into its lowest word. */
  movq (sum_acc, scratch);
  psrlq (32, sum_acc);
  paddw (scratch, sum_acc);
  movq (sum_acc, scratch);
  psrlq (16, sum_acc);
  paddw (scratch, sum_acc);

  int sum_bits;
  movd (sum_acc, sum_bits);
  /* Only the low word is meaningful; narrowing to 16 bits takes the place of
     the sign-extending move used by the assembly this was derived from. */
  ogg_int16_t XSum = sum_bits;

  /* Fold the two dword lanes of sq_acc into its lowest dword. */
  ogg_uint32_t XXSum;
  movq (sq_acc, scratch);
  psrlq (32, sq_acc);
  paddd (scratch, sq_acc);
  movd (sq_acc, reinterpret_cast<int&>(XXSum));

  /* Compute and return population variance as mis-match metric. */
  return (XXSum << 6) - XSum * XSum;
}

/*
 * Points six entries of the supplied DspFunctions table at the MMXEXT
 * routines defined in this file.  No other member of *funcs is written.
 */
void dsp_i386_mmxext_init(DspFunctions *funcs)
{
  /* 8x8 block comparisons */
  funcs->sad8x8           = sad8x8__mmxext;
  funcs->sad8x8_thres     = sad8x8_thres__mmxext;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;

  /* row / column maxima */
  funcs->row_sad8         = row_sad8__mmxext;
  funcs->col_sad8x8       = col_sad8x8__mmxext;

  /* inter-prediction error against two combined references */
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
}





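/*
 * Code-viewer keyboard shortcuts captured from the download page (not part
 * of the program text):
 *   Copy code           Ctrl + C
 *   Search code         Ctrl + F
 *   Full-screen mode    F11
 *   Toggle theme        Ctrl + Shift + D
 *   Show shortcuts      ?
 *   Increase font size  Ctrl + =
 *   Decrease font size  Ctrl + -
 */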