⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mc_sse2.c

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 C
字号:
/*
 *      Copyright (C) 2003-2005 Gabest
 *      http://www.gabest.org
 *
 *  This Program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This Program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *  http://www.gnu.org/copyleft/gpl.html
 *
 *  Based on Intel's AP-942
 *
 */

#include <inttypes.h>
#include "attributes.h"
#include "../../simd.h"
#include "mpeg2.h"
#include "mpeg2_internal.h"

#ifdef __SSE2__

static const __m128i const_1_16_bytes=_mm_set1_epi16(1);

static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
                const int edi= eax+eax;
                const int ebx= edi+eax;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4){
                __m128i xmm0,xmm1,xmm2,xmm3;
                movdqu (xmm0,  edx    );
                movdqu (xmm1,  edx+eax);
                movdqu (xmm2,  edx+edi);
                movdqu (xmm3,  edx+ebx);
                movdqa (ecx, xmm0     );
                movdqa (ecx+eax, xmm1 );
                movdqa (ecx+edi, xmm2 );
                movdqa (ecx+ebx, xmm3 );
        }
}

static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
                const int edi= eax+eax;
		const int ebx= edi+eax;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4){
                __m128d xmm0,xmm1,xmm2,xmm3;
		movlpd (xmm0, edx);
		movlpd (xmm1, edx+eax);
		movlpd (xmm2, edx+edi);
		movlpd (xmm3, edx+ebx);
		movlpd (ecx, xmm0);
		movlpd (ecx+eax, xmm1 );
		movlpd (ecx+edi, xmm2);
		movlpd (ecx+ebx, xmm3 );
        }
}

static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
                const int edi= eax+eax;
		const int ebx= edi+eax;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4){
                __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
		movdqu (xmm0, edx);
		movdqu (xmm1, edx+1);
		movdqu (xmm2, edx+eax);
		movdqu (xmm3, edx+eax+1);
		movdqu (xmm4, edx+edi);
		movdqu( xmm5, edx+edi+1);
		movdqu( xmm6, edx+ebx );
		movdqu( xmm7, edx+ebx+1 );
		pavgb (xmm0, xmm1);
		pavgb (xmm2, xmm3);
		pavgb (xmm4, xmm5);
		pavgb (xmm6, xmm7);
		movdqa (ecx, xmm0);
		movdqa (ecx+eax, xmm2);
		movdqa (ecx+edi, xmm4);
		movdqa (ecx+ebx, xmm6);
        }
}

static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
                const int edi= eax+eax;
		const int ebx= edi+eax;
                __m128i xmm0,xmm1,xmm2,xmm3;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4){
		movlpd (xmm0, edx);
		movlpd (xmm1, edx+1);
		movhpd (xmm0, edx+eax);
		movhpd (xmm1, edx+eax+1);
		movlpd (xmm2, edx+edi);
		movlpd (xmm3, edx+edi+1);
		movhpd (xmm2, edx+ebx);
		movhpd (xmm3, edx+ebx+1);
		pavgb (xmm0, xmm1);
		pavgb (xmm2, xmm3);
		movlpd (ecx, xmm0);
		movhpd (ecx+eax, xmm0);
		movlpd (ecx+edi, xmm2);
		movhpd (ecx+ebx, xmm2);
        }
}

static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
                const int edi= eax+eax;
		const int ebx= edi+eax;
                __m128i xmm0;
                movdqu (xmm0, edx);
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4){
                __m128i xmm1,xmm2,xmm3,xmm4;
		movdqu (xmm1, edx+eax);
		movdqu (xmm2, edx+edi );
		movdqu (xmm3, edx+ebx );
		movdqu (xmm4, edx+edi*2 );
		pavgb (xmm0, xmm1 );
		pavgb (xmm1, xmm2 );
		pavgb (xmm2, xmm3 );
		pavgb (xmm3, xmm4 );
		movdqa (ecx, xmm0 );
		movdqa (ecx+eax, xmm1 );
		movdqa (ecx+edi, xmm2 );
		movdqa (ecx+ebx, xmm3 );
		movdqa (xmm0, xmm4 );
        }
}

static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
                const int edi= eax+eax;
		const int ebx= edi+eax;
                __m128i xmm0;
                //movhpd (xmm0,edx);
                //movlpd (xmm0,edx+eax);
		movlpd (xmm0, edx);

        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4){
                __m128i xmm1,xmm2,xmm3,xmm4;
		movlpd (xmm1, edx+eax );
		movlpd (xmm2, edx+edi );
		movlpd (xmm3, edx+ebx );
		movlpd (xmm4, edx+edi*2 );
		pavgb (xmm0, xmm1 );
		pavgb (xmm1, xmm2);
		pavgb (xmm2, xmm3 );
		pavgb (xmm3, xmm4 );
		movlpd (ecx, xmm0 );
		movlpd (ecx+eax, xmm1 );
		movlpd (ecx+edi, xmm2 );
		movlpd (ecx+ebx, xmm3 );
		movdqa (xmm0, xmm4 );
        }
}

static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref ;
                uint8_t  *ecx= dest;
                int eax= stride;
                int esi= height;
                int edi= eax+eax;
                __m128i xmm7,xmm0,xmm1,xmm4,xmm5,xmm2,xmm3;
                movdqa (xmm7, const_1_16_bytes );
                movdqu (xmm0, edx );
                movdqu (xmm1, edx+1 );
        for (;esi;edx+= edi,ecx+= edi,esi-= 2)
         {
                movdqu (xmm2, edx+eax );
                movdqu (xmm3, edx+eax+1 );
                movdqu (xmm4, edx+edi );
                movdqu (xmm5, edx+edi+1 );
                pavgb (xmm0, xmm1 );
                pavgb (xmm2, xmm3 );
                movdqa( xmm1, xmm5 );
                pavgb (xmm5, xmm4 );
                psubusb( xmm2, xmm7 );
                pavgb (xmm0, xmm2 );
                pavgb (xmm2, xmm5);
                movdqa (ecx, xmm0);
                movdqa (xmm0, xmm4);
                movdqa (ecx+eax, xmm2);
        }
}

static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int eax= stride;
                int esi= height;
                int edi= eax+eax;
                __m128i xmm7,xmm0,xmm2,xmm1,xmm3,xmm4,xmm5;
		movdqa (xmm7, const_1_16_bytes);
		movlpd (xmm0, edx);
		movlpd (xmm1, edx+1);
        for (;esi;edx+= edi,ecx+= edi,esi-= 2)
         {
		movlpd (xmm2, edx+eax);
		movlpd (xmm3, edx+eax+1);
		movlpd (xmm4, edx+edi);
		movlpd (xmm5, edx+edi+1);
		pavgb (xmm0, xmm1 );
		pavgb (xmm2, xmm3 );
		movdqa( xmm1, xmm5 );
		pavgb (xmm5, xmm4 );
		psubusb( xmm2, xmm7 );
		pavgb (xmm0, xmm2 );
		pavgb (xmm2, xmm5);
		movlpd (ecx, xmm0);
		movdqa (xmm0, xmm4);
		movlpd (ecx+eax, xmm2);
        }
}

static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
       		int ebx= edi+eax;

        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4)
         {
                __m128i xmm0,xmm1,xmm2,xmm3;
		movdqu (xmm0, edx);
		movdqu (xmm1, edx+eax );
		movdqu (xmm2, edx+edi);
		movdqu (xmm3, edx+ebx );
		pavgb (xmm0, ecx);
		pavgb (xmm1, ecx+eax);
		pavgb (xmm2, ecx+edi);
		pavgb (xmm3, ecx+ebx);
		movdqa (ecx, xmm0);
		movdqa (ecx+eax, xmm1 );
		movdqa (ecx+edi, xmm2);
		movdqa (ecx+ebx, xmm3 );
         }
}

static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
                int ebx= edi+eax;

                __m128i xmm0,xmm1,xmm2,xmm3;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4)
         {
		movlpd (xmm0, edx);
		movhpd (xmm0, edx+eax );
		movlpd (xmm2, edx+edi);
		movhpd (xmm2, edx+ebx );
		movlpd (xmm1, ecx);
		movhpd (xmm1, ecx+eax);
		movlpd (xmm3, ecx+edi);
		movhpd (xmm3, ecx+ebx);
		pavgb (xmm0, xmm1);
		pavgb (xmm2, xmm3);
		movlpd (ecx, xmm0);
		movhpd (ecx+eax, xmm0);
		movlpd (ecx+edi, xmm2);
		movhpd (ecx+ebx, xmm2);
         }
}

static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
                int ebx= edi+eax;

                __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4)
         {
		movdqu (xmm0, edx);
		movdqu (xmm1, edx+1);
		movdqu (xmm2, edx+eax);
		movdqu (xmm3, edx+eax+1);
		movdqu (xmm4, edx+edi);
		movdqu (xmm5, edx+edi+1);
		movdqu (xmm6, edx+ebx);
		movdqu (xmm7, edx+ebx+1);
		pavgb (xmm0, xmm1);
		pavgb (xmm2, xmm3);
		pavgb (xmm4, xmm5);
		pavgb (xmm6, xmm7);
		pavgb (xmm0, ecx);
		pavgb (xmm2, ecx+eax);
		pavgb (xmm4, ecx+edi);
		pavgb (xmm6, ecx+ebx);
		movdqa (ecx, xmm0);
		movdqa (ecx+eax, xmm2);
		movdqa (ecx+edi, xmm4);
		movdqa (ecx+ebx, xmm6);
         }
}

static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
                int ebx= edi+eax;

                __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5;
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4)
         {
		movlpd (xmm0, edx);
		movlpd (xmm1, edx+1);
		movhpd (xmm0, edx+eax);
		movhpd (xmm1, edx+eax+1);
		movlpd (xmm2, edx+edi);
		movlpd (xmm3, edx+edi+1);
		movhpd (xmm2, edx+ebx);
		movhpd (xmm3, edx+ebx+1);
		movlpd (xmm4, ecx);
		movhpd (xmm4, ecx+eax);
		movlpd (xmm5, ecx+edi);
		movhpd (xmm5, ecx+ebx);
		pavgb (xmm0, xmm1);
		pavgb (xmm2, xmm3);
		pavgb (xmm0, xmm4);
		pavgb (xmm2, xmm5);
		movlpd (ecx, xmm0);
		movhpd (ecx+eax, xmm0);
		movlpd (ecx+edi, xmm2);
		movhpd (ecx+ebx, xmm2);
        }
}

static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
		int ebx= edi+eax;
                __m128i xmm0,xmm1,xmm2,xmm3,xmm4;

                movdqu (xmm0,edx);
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4)
         {
		movdqu (xmm1, edx+eax );
		movdqu (xmm2, edx+edi );
		movdqu (xmm3, edx+ebx );
		movdqu (xmm4, edx+edi*2 );
		pavgb (xmm0, xmm1 );
		pavgb (xmm1, xmm2 );
		pavgb (xmm2, xmm3 );
		pavgb (xmm3, xmm4 );
		pavgb (xmm0, ecx);
		pavgb (xmm1, ecx+eax );
		pavgb (xmm2, ecx+edi);
		pavgb (xmm3, ecx+ebx );
		movdqa (ecx, xmm0 );
		movdqa (ecx+eax, xmm1 );
		movdqa (ecx+edi, xmm2 );
		movdqa (ecx+ebx, xmm3 );
		movdqa (xmm0, xmm4 );
         }
}

static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
		int ebx= edi+eax;
                __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5;
                movhpd (xmm0, edx );
                movlpd (xmm0, edx+eax );
        for (;esi;edx+=edi*2,ecx+=edi*2,esi-=4)
         {
		movlhps (xmm1, xmm0);
		movlpd (xmm1, edx+edi );
		movlhps (xmm2, xmm1);
		movlpd (xmm2, edx+ebx );
		movlhps (xmm3, xmm2);
		movlpd (xmm3, edx+edi*2 );
		movhpd (xmm4, ecx );
		movlpd (xmm4, ecx+eax );
		movhpd (xmm5, ecx+edi );
		movlpd (xmm5, ecx+ebx );
		pavgb (xmm0, xmm1 );
		pavgb (xmm2, xmm3);
		pavgb (xmm0, xmm4);
		pavgb (xmm2, xmm5);
		movhpd (ecx, xmm0 );
		movlpd (ecx+eax, xmm0 );
		movhpd (ecx+edi, xmm2 );
		movlpd (ecx+ebx, xmm2);
		movdqa (xmm0, xmm3 );
         }
}

static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
                __m128i xmm7,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5;
                movdqa (xmm7, const_1_16_bytes );
                movdqu (xmm0, edx );
                movdqu (xmm1, edx+1 );
        for (;esi;edx+=edi,ecx+=edi,esi-=2)
          {
		movdqu (xmm2, edx+eax );
		movdqu (xmm3, edx+eax+1 );
		movdqu (xmm4, edx+edi );
		movdqu (xmm5, edx+edi+1 );
		pavgb (xmm0, xmm1 );
		pavgb (xmm2, xmm3 );
		movdqa (xmm1, xmm5 );
		pavgb (xmm5, xmm4 );
		psubusb (xmm2, xmm7 );
		pavgb (xmm0, xmm2 );
		pavgb (xmm2, xmm5);
		pavgb (xmm0, ecx );
		pavgb (xmm2, ecx+eax);
		movdqa (ecx, xmm0);
		movdqa (xmm0, xmm4);
		movdqa (ecx+eax, xmm2);
         }
}

static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
                const uint8_t *edx= ref;
                uint8_t *ecx= dest;
                int esi= height;
                int eax= stride;
                int edi= eax+eax;
                __m128i xmm7,xmm0,xmm2,xmm1,xmm3,xmm4;
                movdqa (xmm7, const_1_16_bytes );
                movhpd (xmm0, edx );
                movlpd (xmm0, edx+eax );
                movhpd (xmm2, edx+1 );
                movlpd (xmm2, edx+eax+1 );
        for (;esi;edx+=edi,ecx+=edi,esi-=2)
         {

                movhpd (xmm1, edx+eax );
                movlpd (xmm1, edx+edi );
                movhpd (xmm3, edx+eax+1 );
                movlpd (xmm3, edx+edi+1 );
                pavgb (xmm0, xmm1 );
                pavgb (xmm2, xmm3 );
                psubusb( xmm0, xmm7 );
                pavgb (xmm0, xmm2 );
                movhpd( xmm4, ecx);
                movlpd( xmm4, ecx+eax);
                pavgb (xmm0, xmm4 );
                movhpd (ecx, xmm0 );
                movlpd (ecx+eax, xmm0 );
                movdqa (xmm0, xmm1 );
                movdqa (xmm2, xmm3 );
        }
}

mpeg2_mc_t mpeg2_mc_sse2 =
{
        {MC_put_o_16_sse2, MC_put_x_16_sse2, MC_put_y_16_sse2, MC_put_xy_16_sse2,
        MC_put_o_8_sse2,  MC_put_x_8_sse2,  MC_put_y_8_sse2,  MC_put_xy_8_sse2},
        {MC_avg_o_16_sse2, MC_avg_x_16_sse2, MC_avg_y_16_sse2, MC_avg_xy_16_sse2,
        MC_avg_o_8_sse2,  MC_avg_x_8_sse2,  MC_avg_y_8_sse2,  MC_avg_xy_8_sse2}
};

#endif //__SSE2__

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -