/* mc-c.c.svn-base — SVN pristine copy of x264's mc-c.c (scraped from a web code viewer) */
/*****************************************************************************
 * mc.c: h264 encoder library (Motion Compensation)
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/

#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>

#include "x264.h" /* DECLARE_ALIGNED */
#include "common/mc.h"
#include "common/clip1.h"
#include "mc.h"

/* NASM functions: hand-written assembly routines linked in from .asm files,
 * only declared here.  Per their use in the MC_* helpers below, the
 * x264_pixel_avg_* functions combine (average) two source blocks into dst;
 * the x264_mc_copy_* functions are plain width-N block copies.
 * Common argument order: dst, i_dst_stride, src..., i_height. */
extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );

/* NOTE(review): everything from here on is compiled out by this #if 0; the
 * matching #endif lies beyond this chunk — confirm against the full file
 * before re-enabling any of the inline-MMX fallback below. */
#if 0

/* Declare a file-scope uint64_t constant whose assembly symbol name equals its
 * C name so the inline asm below can reference it by name.  gcc >= 3.3 has
 * attribute "used" (prevents discarding an apparently-unreferenced symbol);
 * older gcc only has "unused" (just silences the warning). */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif

/* Four packed 16-bit 0x0010 words: the +16 rounding term added before the
 * ">> 5" normalization in MMX_FILTERTAP_P3. */
USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;

/* MMZ = 0 */
#define MMX_ZERO( MMZ ) \
    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )

/* MMV = 64-bit constant NAME (symbol emitted via USED_UINT64) */
#define MMX_INIT( MMV, NAME ) \
    asm volatile( "movq " #NAME ", " #MMV "\n" :: )

/* Pack MMP's four words to unsigned bytes (MMZ must be zero) and store the
 * low 4 bytes at dst. */
#define MMX_SAVE_4P( MMP, MMZ, dst ) \
    asm volatile( "packuswb " #MMZ "," #MMP "\n" \
                  "movd " #MMP ", (%0)" :: "r"(dst) )

/* Load 4 bytes from pix and zero-extend them to 4 words in MMP
 * (MMZ must be zero). */
#define MMX_LOAD_4P( MMP, MMZ, pix ) \
    asm volatile( "movd (%0), " #MMP "\n" \
                  "punpcklbw " #MMZ ", " #MMP "\n" : : "r"(pix) )

/* Load four 4-pixel rows (stride i_pix) as words into MMP1..MMP4. */
#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
    MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
    MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )

/* Load two 4-pixel rows (stride i_pix) as words into MMP1, MMP2. */
#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )

/* Pack the 8 words in MMP1:MMP2 to 8 unsigned bytes and store at dst. */
#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
    asm volatile( "packuswb " #MMP2 "," #MMP1 "\n" \
                  "movq " #MMP1 ", (%0)\n" :: "r"(dst) )

/* Load 8 bytes from pix and zero-extend: low 4 as words in MMP1,
 * high 4 as words in MMP2 (MMZ must be zero). */
#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
    asm volatile( "movq (%0) , " #MMP1 "\n" \
                  "movq " #MMP1 ", " #MMP2 "\n" \
                  "punpcklbw " #MMZ ", " #MMP1 "\n" \
                  "punpckhbw " #MMZ ", " #MMP2 "\n" : : "r"(pix) )

/* Load two 8-pixel rows (stride i_pix) as words into MMP1:MMP2 and MMP3:MMP4. */
#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
    MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )

/* Butterfly word/dword interleave steps of the 4x4 word transpose;
 * t is a scratch register (receives the high halves). */
#define SBUTTERFLYwd(a,b,t )\
    asm volatile( "movq " #a ", " #t " \n\t" \
                  "punpcklwd " #b ", " #a " \n\t" \
                  "punpckhwd " #b ", " #t " \n\t" :: )

#define SBUTTERFLYdq(a,b,t )\
    asm volatile( "movq " #a ", " #t " \n\t" \
                  "punpckldq " #b ", " #a " \n\t" \
                  "punpckhdq " #b ", " #t " \n\t" :: )

/* 4x4 word-matrix transpose built from the butterflies above.
 * input ABCD output ADTC ( or 0?31-2->0123 ) */
#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
    SBUTTERFLYwd( MMA, MMB, MMT ); \
    SBUTTERFLYwd( MMC, MMD, MMB ); \
    SBUTTERFLYdq( MMA, MMC, MMD ); \
    SBUTTERFLYdq( MMT, MMB, MMC )

/* first pass MM0 = MM0 -5*MM1   (sub once, then sub 4*MM1; clobbers MM1) */
#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
    asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psubw " #MMP1 "," #MMP0 "\n" :: ) \
/* second pass MM0 = MM0 + 20*(MM2+MM3)  (4x + 16x the sum; clobbers MM2) */
#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
    asm volatile( "paddw " #MMP3 "," #MMP2 "\n" \
                  \
                  "psllw $2, " #MMP2 "\n" \
                  "paddw " #MMP2 "," #MMP0 "\n" \
                  "psllw $2, " #MMP2 "\n" \
                  "paddw " #MMP2 "," #MMP0 "\n" :: )

/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5
 * (MMV holds the packed rounding constant x264_w0x10; clobbers MM1) */
#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
    asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psubw " #MMP1 "," #MMP0 "\n" \
                  \
                  "paddw " #MMP2 "," #MMP0 "\n" \
                  "paddw " #MMV "," #MMP0 "\n" \
                  "psraw $5, " #MMP0 "\n" :: )

/* Two-lane variant of FILTERTAP_P1: MM0 -= 5*MM1 and MM2 -= 5*MM3
 * (clobbers MM1, MM3). */
#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
    asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
                  "psubw " #MMP3 "," #MMP2 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psllw $2, " #MMP3 "\n" \
                  "psubw " #MMP1 "," #MMP0 "\n" \
                  "psubw " #MMP3 "," #MMP2 "\n" :: )

/* second pass MM0 = MM0 + 20*(MM1+MM2)  — two lanes:
 * MM0 += 20*(MM1+MM2) and MM3 += 20*(MM4+MM5) (clobbers MM1, MM4). */
#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
    asm volatile( "paddw " #MMP2 "," #MMP1 "\n" \
                  "paddw " #MMP5 "," #MMP4 "\n" \
                  \
                  "psllw $2, " #MMP1 "\n" \
                  "psllw $2, " #MMP4 "\n" \
                  "paddw " #MMP1 "," #MMP0 "\n" \
                  "paddw " #MMP4 "," #MMP3 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psllw $2, " #MMP4 "\n" \
                  "paddw " #MMP1 "," #MMP0 "\n" \
                  "paddw " #MMP4 "," #MMP3 "\n" :: )

/* Load/store 1, 2, or 4 qwords at stride i_dst (dst cast to byte pointer). */
#define MMX_LOAD_1r( m1, dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \

#define MMX_SAVE_1r( m1, dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \

#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )

#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )

#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
    asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
    asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )

#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )

/* Scalar H.264 6-tap (1,-5,20,20,-5,1) half-pel filter sampled along stride
 * i_pix_next, centered between pix[0] and pix[i_pix_next].
 * Unnormalized: no rounding or >>5 here — callers do that. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];
}

/* Same 6-tap filter with an implicit stride of 1 (horizontal). */
static inline int x264_tapfilter1( uint8_t *pix )
{
    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];
}

typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );

/* Macro to define NxM functions */
/* mc I+H: quarter-pel between integer-pel (src+off) and the horizontal
 * half-pel plane; result is the average of the two. */
#define MC_IH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
    \
    mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     src+(off), i_src_stride, \
                                     tmp, width, i_height ); \
}

/* mc I+V: quarter-pel between integer-pel (src+off) and the vertical
 * half-pel plane. */
#define MC_IV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
    \
    mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     src+(off), i_src_stride, \
                                     tmp, width, i_height ); \
}

/* mc H+V: quarter-pel as the average of a vertical half-pel plane (at off1)
 * and a horizontal half-pel plane (at off2). */
#define MC_HV( name, cpu, width, height, off1, off2 ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \
    mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}

/* mc C+H: average of the center (h+v filtered) half-pel plane and a
 * horizontal half-pel plane at off. */
#define MC_CH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
    mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}

/* mc C+V: average of the center half-pel plane and a vertical half-pel
 * plane at off. */
#define MC_CV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
    mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}

/*****************************************************************************
 * MC with width == 4 (height <= 8)
 *****************************************************************************/

/* Horizontal half-pel filter, width 4.  Works 4 rows at a time: transposes
 * 4x12 input pixels into srct so the column-wise MMX tap filter can be used,
 * filters into tmp, then transposes back and packs into dst.
 * NOTE(review): h4 = i_height/4, so rows beyond the last multiple of 4 are
 * not produced — assumes i_height is a multiple of 4 (confirm at call sites). */
static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    const int h4 = i_height / 4;
    uint8_t  srct[4*8*3];   /* transposed source: 3 groups of 4 columns x 8 bytes */
    uint64_t tmp[4];        /* one filtered row (4 words) per entry */
    int y;

    src -= 2;   /* filter window starts 2 pixels left of the output position */

    MMX_ZERO( %%mm7 );
    MMX_INIT( %%mm6, x264_w0x10 );

    for( y = 0; y < h4; y++ )
    {
        int i;

        /* Preload data and transpose them */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );

        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );

        /* we read 2 more bytes than needed */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );

        /* tap filter: MM0 = (c0 -5*c1 +20*(c2+c3) -5*c4 +c5 +16) >> 5 */
        for( i = 0; i < 4; i++ )
        {
            MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
            MMX_FILTERTAP_P1( %%mm0, %%mm1 );
            MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );

            MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
            MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
            MMX_SAVE_1r( %%mm0, &tmp[i] );
        }

        /* transpose back to row order and pack words to bytes */
        MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
        MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
        MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
        MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );

        src += 4 * i_src;
        dst += 4 * i_dst;
    }
}

/* Vertical half-pel filter, width 4: per output row, loads the 6 source rows
 * of the tap window and computes (r0 -5*r1 +20*(r2+r3) -5*r4 +r5 +16) >> 5. */
static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    int y;

    src -= 2 * i_src;   /* window starts 2 rows above the output row */

    MMX_ZERO( %%mm7 );
    MMX_INIT( %%mm6, x264_w0x10 );

    for( y = 0; y < i_height; y++ )
    {
        MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );
        MMX_FILTERTAP_P1( %%mm0, %%mm1 );
        MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );

        MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );
        MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );
        MMX_SAVE_4P( %%mm0, %%mm7, dst );

        src += i_src;
        dst += i_dst;
    }
}

/* Center half-pel filter, width 4 (plain C): first a vertical 6-tap pass
 * into 16-bit intermediates (9 columns: 4 outputs + 5 of filter overlap),
 * then a horizontal 6-tap pass with combined rounding (+512, >>10) and a
 * clip to [0,255] via x264_mc_clip1. */
static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    int i, x, y;

    for( y = 0; y < i_height; y++ )
    {
        int16_t tap[5+4];

        for( i = 0; i < 5+4; i++ )
        {
            tap[i] = x264_tapfilter( &src[-2+i], i_src_stride );
        }

        for( x = 0; x < 4; x++ )
        {
            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
        }

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/* Width-4 quarter-pel MC entry points.  NOTE(review): the xyNM suffix
 * presumably encodes the quarter-pel fractional position (x=N/4, y=M/4) —
 * verify against the dispatch table in the rest of the file. */
MC_IH( mc_xy10, mmxext, 4, 8, 0 )
MC_IH( mc_xy30, mmxext, 4, 8, 1 )
MC_IV( mc_xy01, mmxext, 4, 8, 0 )
MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride )
MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 )
MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 )
MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride )
MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride )
/* (web-viewer keyboard-shortcut help removed — not part of the source file;
 * the remainder of mc-c.c, including the #endif matching the #if 0 above,
 * lies beyond this chunk) */