/* mc-c.c.svn-base — SVN pristine copy of x264's mc-c.c (scraped from a web code viewer) */
/*****************************************************************************
 * mc.c: h264 encoder library (Motion Compensation)
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/

#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>

#include "x264.h" /* DECLARE_ALIGNED */
#include "common/mc.h"
#include "common/clip1.h"
#include "mc.h"

/* NASM functions: hand-written assembly routines linked in from .asm files,
 * only declared here.  Per their use in the MC_* helpers below, the
 * x264_pixel_avg_* functions combine (average) two source blocks into dst;
 * the x264_mc_copy_* functions are plain width-N block copies.
 * Common argument order: dst, i_dst_stride, src..., i_height. */
extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );

/* NOTE(review): everything from here on is compiled out by this #if 0; the
 * matching #endif lies beyond this chunk — confirm against the full file
 * before re-enabling any of the inline-MMX fallback below. */
#if 0

/* Declare a file-scope uint64_t constant whose assembly symbol name equals its
 * C name so the inline asm below can reference it by name.  gcc >= 3.3 has
 * attribute "used" (prevents discarding an apparently-unreferenced symbol);
 * older gcc only has "unused" (just silences the warning). */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_UINT64(foo) \
    static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif

/* Four packed 16-bit 0x0010 words: the +16 rounding term added before the
 * ">> 5" normalization in MMX_FILTERTAP_P3. */
USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;

/* MMZ = 0 */
#define MMX_ZERO( MMZ ) \
    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )

/* MMV = 64-bit constant NAME (symbol emitted via USED_UINT64) */
#define MMX_INIT( MMV, NAME ) \
    asm volatile( "movq " #NAME ", " #MMV "\n" :: )

/* Pack MMP's four words to unsigned bytes (MMZ must be zero) and store the
 * low 4 bytes at dst. */
#define MMX_SAVE_4P( MMP, MMZ, dst ) \
    asm volatile( "packuswb " #MMZ "," #MMP "\n" \
                  "movd " #MMP ", (%0)" :: "r"(dst) )

/* Load 4 bytes from pix and zero-extend them to 4 words in MMP
 * (MMZ must be zero). */
#define MMX_LOAD_4P( MMP, MMZ, pix ) \
    asm volatile( "movd (%0), " #MMP "\n" \
                  "punpcklbw " #MMZ ", " #MMP "\n" : : "r"(pix) )

/* Load four 4-pixel rows (stride i_pix) as words into MMP1..MMP4. */
#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
    MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
    MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )

/* Load two 4-pixel rows (stride i_pix) as words into MMP1, MMP2. */
#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )

/* Pack the 8 words in MMP1:MMP2 to 8 unsigned bytes and store at dst. */
#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
    asm volatile( "packuswb " #MMP2 "," #MMP1 "\n" \
                  "movq " #MMP1 ", (%0)\n" :: "r"(dst) )

/* Load 8 bytes from pix and zero-extend: low 4 as words in MMP1,
 * high 4 as words in MMP2 (MMZ must be zero). */
#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
    asm volatile( "movq (%0) , " #MMP1 "\n" \
                  "movq " #MMP1 ", " #MMP2 "\n" \
                  "punpcklbw " #MMZ ", " #MMP1 "\n" \
                  "punpckhbw " #MMZ ", " #MMP2 "\n" : : "r"(pix) )

/* Load two 8-pixel rows (stride i_pix) as words into MMP1:MMP2 and MMP3:MMP4. */
#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
    MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
    MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )

/* Butterfly word/dword interleave steps of the 4x4 word transpose;
 * t is a scratch register (receives the high halves). */
#define SBUTTERFLYwd(a,b,t )\
    asm volatile( "movq " #a ", " #t " \n\t" \
                  "punpcklwd " #b ", " #a " \n\t" \
                  "punpckhwd " #b ", " #t " \n\t" :: )

#define SBUTTERFLYdq(a,b,t )\
    asm volatile( "movq " #a ", " #t " \n\t" \
                  "punpckldq " #b ", " #a " \n\t" \
                  "punpckhdq " #b ", " #t " \n\t" :: )

/* 4x4 word-matrix transpose built from the butterflies above.
 * input ABCD output ADTC ( or 0?31-2->0123 ) */
#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
    SBUTTERFLYwd( MMA, MMB, MMT ); \
    SBUTTERFLYwd( MMC, MMD, MMB ); \
    SBUTTERFLYdq( MMA, MMC, MMD ); \
    SBUTTERFLYdq( MMT, MMB, MMC )

/* first pass MM0 = MM0 -5*MM1   (sub once, then sub 4*MM1; clobbers MM1) */
#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
    asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psubw " #MMP1 "," #MMP0 "\n" :: ) \
/* second pass MM0 = MM0 + 20*(MM2+MM3)  (4x + 16x the sum; clobbers MM2) */
#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
    asm volatile( "paddw " #MMP3 "," #MMP2 "\n" \
                  \
                  "psllw $2, " #MMP2 "\n" \
                  "paddw " #MMP2 "," #MMP0 "\n" \
                  "psllw $2, " #MMP2 "\n" \
                  "paddw " #MMP2 "," #MMP0 "\n" :: )

/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5
 * (MMV holds the packed rounding constant x264_w0x10; clobbers MM1) */
#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
    asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psubw " #MMP1 "," #MMP0 "\n" \
                  \
                  "paddw " #MMP2 "," #MMP0 "\n" \
                  "paddw " #MMV "," #MMP0 "\n" \
                  "psraw $5, " #MMP0 "\n" :: )

/* Two-lane variant of FILTERTAP_P1: MM0 -= 5*MM1 and MM2 -= 5*MM3
 * (clobbers MM1, MM3). */
#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
    asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
                  "psubw " #MMP3 "," #MMP2 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psllw $2, " #MMP3 "\n" \
                  "psubw " #MMP1 "," #MMP0 "\n" \
                  "psubw " #MMP3 "," #MMP2 "\n" :: )

/* second pass MM0 = MM0 + 20*(MM1+MM2)  — two lanes:
 * MM0 += 20*(MM1+MM2) and MM3 += 20*(MM4+MM5) (clobbers MM1, MM4). */
#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
    asm volatile( "paddw " #MMP2 "," #MMP1 "\n" \
                  "paddw " #MMP5 "," #MMP4 "\n" \
                  \
                  "psllw $2, " #MMP1 "\n" \
                  "psllw $2, " #MMP4 "\n" \
                  "paddw " #MMP1 "," #MMP0 "\n" \
                  "paddw " #MMP4 "," #MMP3 "\n" \
                  "psllw $2, " #MMP1 "\n" \
                  "psllw $2, " #MMP4 "\n" \
                  "paddw " #MMP1 "," #MMP0 "\n" \
                  "paddw " #MMP4 "," #MMP3 "\n" :: )

/* Load/store 1, 2, or 4 qwords at stride i_dst (dst cast to byte pointer). */
#define MMX_LOAD_1r( m1, dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \

#define MMX_SAVE_1r( m1, dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \

#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )

#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )

#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
    asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
    asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )

#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
    asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )

/* Scalar H.264 6-tap (1,-5,20,20,-5,1) half-pel filter sampled along stride
 * i_pix_next, centered between pix[0] and pix[i_pix_next].
 * Unnormalized: no rounding or >>5 here — callers do that. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];
}

/* Same 6-tap filter with an implicit stride of 1 (horizontal). */
static inline int x264_tapfilter1( uint8_t *pix )
{
    return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];
}

typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );

/* Macro to define NxM functions */
/* mc I+H: quarter-pel between integer-pel (src+off) and the horizontal
 * half-pel plane; result is the average of the two. */
#define MC_IH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
    \
    mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     src+(off), i_src_stride, \
                                     tmp, width, i_height ); \
}

/* mc I+V: quarter-pel between integer-pel (src+off) and the vertical
 * half-pel plane. */
#define MC_IV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
    \
    mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     src+(off), i_src_stride, \
                                     tmp, width, i_height ); \
}

/* mc H+V: quarter-pel as the average of a vertical half-pel plane (at off1)
 * and a horizontal half-pel plane (at off2). */
#define MC_HV( name, cpu, width, height, off1, off2 ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \
    mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}

/* mc C+H: average of the center (h+v filtered) half-pel plane and a
 * horizontal half-pel plane at off. */
#define MC_CH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
    mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}

/* mc C+V: average of the center half-pel plane and a vertical half-pel
 * plane at off. */
#define MC_CV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
    DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
    DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
    \
    mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
    mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
    x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
                                     tmp1, width, tmp2, width, \
                                     i_height ); \
}

/*****************************************************************************
 * MC with width == 4 (height <= 8)
 *****************************************************************************/

/* Horizontal half-pel filter, width 4.  Works 4 rows at a time: transposes
 * 4x12 input pixels into srct so the column-wise MMX tap filter can be used,
 * filters into tmp, then transposes back and packs into dst.
 * NOTE(review): h4 = i_height/4, so rows beyond the last multiple of 4 are
 * not produced — assumes i_height is a multiple of 4 (confirm at call sites). */
static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    const int h4 = i_height / 4;
    uint8_t  srct[4*8*3];   /* transposed source: 3 groups of 4 columns x 8 bytes */
    uint64_t tmp[4];        /* one filtered row (4 words) per entry */
    int y;

    src -= 2;   /* filter window starts 2 pixels left of the output position */

    MMX_ZERO( %%mm7 );
    MMX_INIT( %%mm6, x264_w0x10 );

    for( y = 0; y < h4; y++ )
    {
        int i;

        /* Preload data and transpose them */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );

        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );

        /* we read 2 more bytes than needed */
        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );

        /* tap filter: MM0 = (c0 -5*c1 +20*(c2+c3) -5*c4 +c5 +16) >> 5 */
        for( i = 0; i < 4; i++ )
        {
            MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
            MMX_FILTERTAP_P1( %%mm0, %%mm1 );
            MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );

            MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
            MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
            MMX_SAVE_1r( %%mm0, &tmp[i] );
        }

        /* transpose back to row order and pack words to bytes */
        MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 );  /* 0123 */
        MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
        MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
        MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
        MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );

        src += 4 * i_src;
        dst += 4 * i_dst;
    }
}

/* Vertical half-pel filter, width 4: per output row, loads the 6 source rows
 * of the tap window and computes (r0 -5*r1 +20*(r2+r3) -5*r4 +r5 +16) >> 5. */
static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
    int y;

    src -= 2 * i_src;   /* window starts 2 rows above the output row */

    MMX_ZERO( %%mm7 );
    MMX_INIT( %%mm6, x264_w0x10 );

    for( y = 0; y < i_height; y++ )
    {
        MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );
        MMX_FILTERTAP_P1( %%mm0, %%mm1 );
        MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );

        MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );
        MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );
        MMX_SAVE_4P( %%mm0, %%mm7, dst );

        src += i_src;
        dst += i_dst;
    }
}

/* Center half-pel filter, width 4 (plain C): first a vertical 6-tap pass
 * into 16-bit intermediates (9 columns: 4 outputs + 5 of filter overlap),
 * then a horizontal 6-tap pass with combined rounding (+512, >>10) and a
 * clip to [0,255] via x264_mc_clip1. */
static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
    int i, x, y;

    for( y = 0; y < i_height; y++ )
    {
        int16_t tap[5+4];

        for( i = 0; i < 5+4; i++ )
        {
            tap[i] = x264_tapfilter( &src[-2+i], i_src_stride );
        }

        for( x = 0; x < 4; x++ )
        {
            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
        }

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/* Width-4 quarter-pel MC entry points.  NOTE(review): the xyNM suffix
 * presumably encodes the quarter-pel fractional position (x=N/4, y=M/4) —
 * verify against the dispatch table in the rest of the file. */
MC_IH( mc_xy10, mmxext, 4, 8, 0 )
MC_IH( mc_xy30, mmxext, 4, 8, 1 )
MC_IV( mc_xy01, mmxext, 4, 8, 0 )
MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride )
MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 )
MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 )
MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride )
MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride )
/* (web-viewer keyboard-shortcut help removed — not part of the source file;
 * the remainder of mc-c.c, including the #endif matching the #if 0 above,
 * lies beyond this chunk) */