⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vp3dsp_sse2.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 3 页
字号:
/*
 * Copyright (C) 2004 the ffmpeg project
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/**
 * @file vp3dsp_sse2.cpp
 * SSE2-optimized functions cribbed from the original VP3 source code.
 */

#include "csimd.h"
#include "inttypes.h"

/*
 * Lane-select masks used by ff_vp3_idct_sse2 while it regroups the
 * dequantized coefficients.
 *
 * Each row is one 16-byte vector of eight 16-bit lanes, written lowest lane
 * first.  A lane holds 0xFFFF where the matching word survives a pand and
 * 0x0000 where it is cleared.  Rows are addressed by byte offset from the
 * start of the table (row index * 16).  The diagram at the end of every row
 * shows the same mask with the highest lane on the left.
 */
static __align16(const unsigned short, SSE2_dequant_const[]) =
{
    /* +0  */ 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,  /* -- -- -- -- -- FF FF -- */
    /* +16 */ 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,  /* -- -- FF FF -- -- -- -- */
    /* +32 */ 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,  /* -- -- -- -- -- FF FF FF */
    /* +48 */ 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000,  /* -- -- -- -- FF -- -- -- */
    /* +64 */ 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,  /* -- -- -- FF FF -- -- -- */
    /* +80 */ 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000,  /* -- -- FF -- -- -- -- FF */
    /* +96 */ 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000   /* -- -- -- -- FF FF -- -- */
};

/*
 * A single 16-byte vector, stored as four 32-bit words, in which every one of
 * the eight 16-bit lanes has the value 8.
 * NOTE(review): presumably the rounding bias added before the final shift of
 * the IDCT -- the code that reads this table is not visible here; confirm.
 */
static __align16(const unsigned int, Eight[]) =
{
    0x00080008, 0x00080008, 0x00080008, 0x00080008
};

static __align16(const unsigned short,SSE2_idct_data[7 * 8]) =
{
    64277,64277,64277,64277,64277,64277,64277,64277,
    60547,60547,60547,60547,60547,60547,60547,60547,
    54491,54491,54491,54491,54491,54491,54491,54491,
    46341,46341,46341,46341,46341,46341,46341,46341,
    36410,36410,36410,36410,36410,36410,36410,36410,
    25080,25080,25080,25080,25080,25080,25080,25080,
    12785,12785,12785,12785,12785,12785,12785,12785
};


extern "C" void ff_vp3_idct_sse2(int16_t * input_data, int16_t * qtbl, int16_t * output)
{
    unsigned char *input_bytes = (unsigned char *)input_data;
    unsigned char *dequant_const_bytes = (unsigned char *)SSE2_dequant_const;

   __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;

#define eax input_bytes
#define ebx ((unsigned char*)qtbl)/*dequant_matrix_bytes*/
#define ecx dequant_const_bytes

//#define SSE2_Dequantize() {
    movdqu((eax), xmm0);

    pmullw((ebx), xmm0);          /* xmm0 = 07 06 05 04 03 02 01 00 */
    movdqu((eax + 16), xmm1);

    pmullw((ebx + 16), xmm1);     /* xmm1 = 17 16 15 14 13 12 11 10 */
    xmm3= _mm_shufflelo_epi16(xmm0,0x078);// pshuflw(xmm0, xmm3, 0x078);    /* xmm3 = 07 06 05 04 01 03 02 00 */

    movdqu(xmm1, xmm2);            /* xmm2 = 17 16 15 14 13 12 11 10 */
    movdqu((ecx), xmm7);          /* xmm7 = -- -- -- -- -- FF FF -- */

    movdqu((eax + 32), xmm4);
    movdqu((eax + 64), xmm5);

    pmullw((ebx + 32), xmm4);     /* xmm4 = 27 26 25 24 23 22 21 20 */
    pmullw((ebx + 64), xmm5);     /* xmm5 = 47 46 45 44 43 42 41 40 */

    movdqu((ecx + 16), xmm6);     /* xmm6 = -- -- FF FF -- -- -- -- */
    pand(xmm2, xmm7);              /* xmm7 = -- -- -- -- -- 12 11 -- */

    pand(xmm4, xmm6);              /* xmm6 = -- -- 25 24 -- -- -- -- */
    pxor(xmm7, xmm2);              /* xmm2 = 17 16 15 14 13 -- -- 10 */

    pxor(xmm6, xmm4);              /* xmm4 = 27 26 -- -- 23 22 21 20 */
    pslldq(4, xmm7);               /* xmm7 = -- -- -- 12 11 -- -- -- */

    pslldq(2, xmm6);               /* xmm6 = -- 25 24 -- -- -- -- -- */
    por(xmm6, xmm7);               /* xmm7 = -- 25 24 12 11 -- -- -- */

    movdqu((ecx + 32), xmm0);     /* xmm0 = -- -- -- -- -- FF FF FF */
    movdqu((ecx + 48), xmm6);     /* xmm6 = -- -- -- -- FF -- -- -- */

    pand(xmm3, xmm0);              /* xmm0 = -- -- -- -- -- 03 02 00 */
    pand(xmm5, xmm6);              /* xmm6 = -- -- -- -- 43 -- -- -- */

    pxor(xmm0, xmm3);              /* xmm3 = 07 06 05 04 01 -- -- -- */
    pxor(xmm6, xmm5);              /* xmm5 = 47 46 45 44 -- 42 41 40 */

    por(xmm7, xmm0);               /* xmm0 = -- 25 24 12 11 03 02 00 */
    pslldq(8, xmm6);               /* xmm6 = 43 -- -- -- -- -- -- -- */

    por(xmm6, xmm0);               /* xmm0 = 43 25 24 12 11 03 02 00 */
    /* 02345 in use */

    movdqu((ecx + 64 ), xmm1);    /* xmm1 = -- -- -- FF FF -- -- -- */
    xmm5=_mm_shufflelo_epi16(xmm5,0x0b4);// pshuflw(xmm5, xmm5, 0x0B4);    /* xmm5 = 47 46 45 44 42 -- 41 40 */

    movdqu(xmm1, xmm7);            /* xmm7 = -- -- -- FF FF -- -- -- */
    movdqu(xmm1, xmm6);            /* xmm6 = -- -- -- FF FF -- -- -- */

    movdqu(xmm0, (eax));          /* write  43 25 24 12 11 03 02 00 */
    xmm4=_mm_shufflehi_epi16(xmm4,0x0c2);// pshufhw(xmm4, xmm4, 0x0C2);    /* xmm4 = 27 -- -- 26 23 22 21 20 */

    pand(xmm4, xmm7);              /* xmm7 = -- -- -- 26 23 -- -- -- */
    pand(xmm5, xmm1);              /* xmm1 = -- -- -- 44 42 -- -- -- */

    pxor(xmm7, xmm4);              /* xmm4 = 27 -- -- -- -- 22 21 20 */
    pxor(xmm1, xmm5);              /* xmm5 = 47 46 45 -- -- -- 41 40 */

    xmm2=_mm_shufflelo_epi16(xmm2,0x0c6);//pshuflw(xmm2, xmm2, 0x0C6);    /* xmm2 = 17 16 15 14 13 10 -- -- */
    movdqu(xmm6, xmm0);            /* xmm0 = -- -- -- FF FF -- -- -- */

    pslldq(2, xmm7);               /* xmm7 = -- -- 26 23 -- -- -- -- */
    pslldq(6, xmm1);               /* xmm1 = 44 42 -- -- -- -- -- -- */

    psrldq(2, xmm0);               /* xmm0 = -- -- -- -- FF FF -- -- */
    pand(xmm3, xmm6);              /* xmm6 = -- -- -- 04 01 -- -- -- */

    pand(xmm2, xmm0);              /* xmm0 = -- -- -- -- 13 10 -- -- */
    pxor(xmm6, xmm3);              /* xmm3 = 07 06 05 -- -- -- -- -- */

    pxor(xmm0, xmm2);              /* xmm2 = 17 16 15 14 -- -- -- -- */
    psrldq(6, xmm6);               /* xmm0 = -- -- -- -- -- -- 04 01 */

    por(xmm7, xmm1);               /* xmm1 = 44 42 26 23 -- -- -- -- */
    por(xmm6, xmm0);               /* xmm1 = -- -- -- -- 13 10 04 01 */
    /* 12345 in use */
    por(xmm0, xmm1);               /* xmm1 = 44 42 26 23 13 10 04 01 */
    xmm4=_mm_shufflelo_epi16(xmm4,0x093);//pshuflw(xmm4, xmm4, 0x093);    /* xmm4 = 27 -- -- -- 22 21 20 -- */

    xmm4=_mm_shufflehi_epi16(xmm4,0x093);//pshufhw(xmm4, xmm4, 0x093);    /* xmm4 = -- -- -- 27 22 21 20 -- */
    movdqu(xmm1, (eax + 16));     /* write  44 42 26 23 13 10 04 01 */

    xmm3=_mm_shufflehi_epi16(xmm3,0x0d2);//pshufhw(xmm3, xmm3, 0x0D2);    /* xmm3 = 07 05 -- 06 -- -- -- -- */
    movdqu((ecx + 64), xmm0);     /* xmm0 = -- -- -- FF FF -- -- -- */

    pand(xmm3, xmm0);              /* xmm0 = -- -- -- 06 -- -- -- -- */
    psrldq(12, xmm3);              /* xmm3 = -- -- -- -- -- -- 07 05 */

    psrldq(8, xmm0);               /* xmm0 = -- -- -- -- -- -- -- 06 */

    movdqu((ecx + 64), xmm6);     /* xmm6 = -- -- -- FF FF -- -- -- */
    movdqu((ecx + 96), xmm7);     /* xmm7 = -- -- -- -- FF FF -- -- */

    pand(xmm4, xmm6);              /* xmm6 = -- -- -- 27 22 -- -- -- */
    pxor(xmm6, xmm4);              /* xmm4 = -- -- -- -- -- 21 20 -- */

    por(xmm6, xmm3);               /* xmm3 = -- -- -- 27 22 -- 07 05 */
    pand(xmm4, xmm7);              /* xmm7 = -- -- -- -- -- 21 -- -- */

    por(xmm7, xmm0);               /* xmm0 = -- -- -- -- -- 21 -- 06 */
    pxor(xmm7, xmm4);              /* xmm4 = -- -- -- -- -- -- 20 -- */

    movdqu((ecx + 16 ), xmm6);    /* xmm6 = -- -- FF FF -- -- -- -- */
    movdqu((ecx + 64 ), xmm1);    /* xmm1 = -- -- -- FF FF -- -- -- */

    pand(xmm2, xmm6);              /* xmm6 = -- -- 15 14 -- -- -- -- */
    pand(xmm6, xmm1);              /* xmm1 = -- -- -- 14 -- -- -- -- */

    pxor(xmm6, xmm2);              /* xmm2 = 17 16 -- -- -- -- -- -- */
    pxor(xmm1, xmm6);              /* xmm6 = -- -- 15 -- -- -- -- -- */

    psrldq(4, xmm1);               /* xmm1 = -- -- -- -- -- 14 -- -- */

    psrldq(8, xmm6);               /* xmm6 = -- -- -- -- -- -- 15 -- */
    por(xmm1, xmm3);               /* xmm3 = -- -- -- 27 22 14 07 05 */

    por(xmm6, xmm0);               /* xmm0 = -- -- -- -- -- 21 15 06 */
    xmm5=_mm_shufflehi_epi16(xmm5,0x0e1);//pshufhw(xmm5, xmm5, 0x0E1);    /* xmm5 = 47 46 -- 45 -- -- 41 40 */

    movdqu((ecx + 64), xmm1);     /* xmm1 = -- -- -- FF FF -- -- -- */
    xmm5=_mm_shufflelo_epi16(xmm5,0x072);//pshuflw(xmm5, xmm5, 0x072);    /* xmm5 = 47 46 -- 45 41 -- 40 -- */

    movdqu(xmm1, xmm6);            /* xmm6 = -- -- -- FF FF -- -- -- */
    pand(xmm5, xmm1);              /* xmm1 = -- -- -- 45 41 -- -- -- */

    pxor(xmm1, xmm5);              /* xmm5 = 47 46 -- -- -- -- 40 -- */
    pslldq(4, xmm1);               /* xmm1 = -- 45 41 -- -- -- -- -- */

    xmm5=_mm_shuffle_epi32(xmm5,0x09c);//pshufd(xmm5, xmm5, 0x09C);     /* xmm5 = -- -- -- -- 47 46 40 -- */
    por(xmm1, xmm3);               /* xmm3 = -- 45 41 27 22 14 07 05 */

    movdqu((eax + 96), xmm1);     /* xmm1 = 67 66 65 64 63 62 61 60 */
    pmullw((ebx + 96), xmm1);

    movdqu((ecx), xmm7);          /* xmm7 = -- -- -- -- -- FF FF -- */

    psrldq(8, xmm6);               /* xmm6 = -- -- -- -- -- -- -- FF */
    pand(xmm5, xmm7);              /* xmm7 = -- -- -- -- -- 46 40 -- */

    pand(xmm1, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 60 */
    pxor(xmm7, xmm5);              /* xmm5 = -- -- -- -- 47 -- -- -- */

    pxor(xmm6, xmm1);              /* xmm1 = 67 66 65 64 63 62 61 -- */
    pslldq(2, xmm5);               /* xmm5 = -- -- -- 47 -- -- -- -- */

    pslldq(14, xmm6);              /* xmm6 = 60 -- -- -- -- -- -- -- */
    por(xmm5, xmm4);               /* xmm4 = -- -- -- 47 -- -- 20 -- */

    por(xmm6, xmm3);               /* xmm3 = 60 45 41 27 22 14 07 05 */
    pslldq(6, xmm7);               /* xmm7 = -- -- 46 40 -- -- -- -- */

    movdqu(xmm3, (eax+32));       /* write  60 45 41 27 22 14 07 05 */
    por(xmm7, xmm0);               /* xmm0 = -- -- 46 40 -- 21 15 06 */
    /* 0, 1, 2, 4 in use */
    movdqu((eax + 48), xmm3);     /* xmm3 = 37 36 35 34 33 32 31 30 */
    movdqu((eax + 80), xmm5);     /* xmm5 = 57 56 55 54 53 52 51 50 */

    pmullw((ebx + 48), xmm3);
    pmullw((ebx + 80), xmm5);

    movdqu((ecx + 64), xmm6);     /* xmm6 = -- -- -- FF FF -- -- -- */
    movdqu((ecx + 64), xmm7);     /* xmm7 = -- -- -- FF FF -- -- -- */

    psrldq(8, xmm6);               /* xmm6 = -- -- -- -- -- -- -- FF */
    pslldq(8, xmm7);               /* xmm7 = FF -- -- -- -- -- -- -- */

    pand(xmm3, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 30 */
    pand(xmm5, xmm7);              /* xmm7 = 57 -- -- -- -- -- -- -- */

    pxor(xmm6, xmm3);              /* xmm3 = 37 36 35 34 33 32 31 -- */
    pxor(xmm7, xmm5);              /* xmm5 = __ 56 55 54 53 52 51 50 */

    pslldq(6, xmm6);               /* xmm6 = -- -- -- -- 30 -- -- -- */
    psrldq(2, xmm7);               /* xmm7 = -- 57 -- -- -- -- -- -- */

    por(xmm7, xmm6);               /* xmm6 = -- 57 -- -- 30 -- -- -- */
    movdqu((ecx), xmm7);          /* xmm7 = -- -- -- -- -- FF FF -- */

    por(xmm6, xmm0);               /* xmm0 = -- 57 46 40 30 21 15 06 */
    psrldq(2, xmm7);               /* xmm7 = -- -- -- -- -- -- FF FF */

    movdqu(xmm2, xmm6);            /* xmm6 = 17 16 -- -- -- -- -- -- */
    pand(xmm1, xmm7);              /* xmm7 = -- -- -- -- -- -- 61 -- */

    pslldq(2, xmm6);               /* xmm6 = 16 -- -- -- -- -- -- -- */
    psrldq(14, xmm2);              /* xmm2 = -- -- -- -- -- -- -- 17 */

    pxor(xmm7, xmm1);              /* xmm1 = 67 66 65 64 63 62 -- -- */
    pslldq(12, xmm7);              /* xmm7 = 61 -- -- -- -- -- -- -- */

    psrldq(14, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 16 */
    por(xmm6, xmm4);               /* xmm4 = -- -- -- 47 -- -- 20 16 */

    por(xmm7, xmm0);               /* xmm0 = 61 57 46 40 30 21 15 06 */
    movdqu((ecx), xmm6);          /* xmm6 = -- -- -- -- -- FF FF -- */

    psrldq(2, xmm6);               /* xmm6 = -- -- -- -- -- -- FF FF */
    movdqu(xmm0, (eax+48));       /* write  61 57 46 40 30 21 15 06 */
    /* 1, 2, 3, 4, 5 in use */
    movdqu((ecx), xmm0);          /* xmm0 = -- -- -- -- -- FF FF -- */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -