📄 mc.c
字号:
/***************************************************************************** * mc.c: h264 encoder library (Motion Compensation) ***************************************************************************** * Copyright (C) 2003 Laurent Aimar * $Id: mc.c,v 1.1 2003/11/09 23:25:04 fenrir Exp $ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. *****************************************************************************/#include <stdlib.h>#include <stdio.h>#include <string.h>#include <stdint.h>#include "../mc.h"#include "../clip1.h"#include "mc.h"#define UNUSED_UINT64( foo ) \ static const uint64_t foo __asm__ (#foo) __attribute__((unused))UNUSED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ){ return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];}static inline int x264_tapfilter1( uint8_t *pix ){ return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];}static inline void pixel_avg_w4( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, uint8_t *src2, int i_src2_stride, int i_height ){ int x, y; for( y = 0; y < i_height; y++ ) { for( x = 0; x < 4; x++ ) { dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; } dst += i_dst_stride; src1 += i_src1_stride; src2 += i_src2_stride; }}static inline void pixel_avg_w8( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, uint8_t *src2, int i_src2_stride, int i_height ){ int y; for( y = 0; y < i_height; y++ ) { asm volatile( "movq (%1), %%mm0\n" "movq (%2), %%mm1\n" "pavgb %%mm1, %%mm0\n" "movq %%mm0, (%0)\n" : : "r"(dst), "r"(src1), "r"(src2) ); dst += i_dst_stride; src1 += i_src1_stride; src2 += i_src2_stride; }}static inline void pixel_avg_w16( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, uint8_t *src2, int i_src2_stride, int i_height ){ int y; for( y = 0; y < i_height; y++ ) { asm volatile( "movq (%1), %%mm0\n" "movq 8(%1), %%mm2\n" "movq (%2), %%mm1\n" "movq 8(%2), %%mm3\n" "pavgb %%mm1, %%mm0\n" "movq %%mm0, (%0)\n" "pavgb %%mm3, %%mm2\n" "movq %%mm2, 8(%0)\n" : : "r"(dst), "r"(src1), "r"(src2) ); dst += i_dst_stride; src1 += i_src1_stride; src2 += i_src2_stride; }}typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );/***************************************************************************** * MC with width == 4 (height <= 8) *****************************************************************************/static void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ int y; for( y = 0; y < i_height; y++ ) { memcpy( dst, src, 4 ); src += i_src_stride; dst += i_dst_stride; }}static inline void mc_hh_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ int x, y; for( y = 0; y < i_height; y++ ) { for( x = 0; x < 4; x++ ) { dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + 16 ) >> 5 ); } src += i_src_stride; dst += i_dst_stride; }}static inline void mc_hv_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ int y; src -= 2 * i_src_stride; asm volatile( "pxor %%mm7, %%mm7\n" "movq x264_w0x10, %%mm4\n" : : ); for( y = 0; y < i_height; y++ ) { asm volatile( "leal (%0, %1), %%eax\n" "movd (%0), %%mm0\n" /* load pix-2 */ "punpcklbw %%mm7, %%mm0\n" "movd (%%eax),%%mm1\n" /* load pix-1 */ "punpcklbw %%mm7, %%mm1\n" "psubw %%mm1, %%mm0\n" "psllw $2, %%mm1\n" "psubw %%mm1, %%mm0\n" "movd (%%eax,%1),%%mm1\n" /* load pix */ "punpcklbw %%mm7, %%mm1\n" "psllw $2, %%mm1\n" "paddw %%mm1, %%mm0\n" "psllw $2, %%mm1\n" "paddw %%mm1, %%mm0\n" "movd (%%eax,%1,2),%%mm1\n" /* load pix+1 */ "punpcklbw %%mm7, %%mm1\n" "psllw $2, %%mm1\n" "paddw %%mm1, %%mm0\n" "psllw $2, %%mm1\n" "paddw %%mm1, %%mm0\n" "movd (%0,%1,4),%%mm1\n" /* load pix+2 */ "punpcklbw %%mm7, %%mm1\n" "psubw %%mm1, %%mm0\n" "psllw $2, %%mm1\n" "psubw %%mm1, %%mm0\n" "movd (%%eax,%1,4),%%mm1\n" /* load pix+3 */ "punpcklbw %%mm7, %%mm1\n" "paddw %%mm1, %%mm0\n" "paddw %%mm4, %%mm0\n" "psraw $5, %%mm0\n" "packuswb %%mm7, %%mm0\n" "movd %%mm0, (%2)\n" : : "r"(src), "r"(i_src_stride), "r"(dst) : "%eax" ); src += i_src_stride; dst += i_dst_stride; }}static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){#if 0 uint8_t *out; uint8_t *pix; int x, y; for( x = 0; x < 4; x++ ) { int tap[6]; pix = &src[x]; out = &dst[x]; tap[0] = x264_tapfilter1( &pix[-2*i_src_stride] ); tap[1] = x264_tapfilter1( &pix[-1*i_src_stride] ); tap[2] = x264_tapfilter1( &pix[ 0*i_src_stride] ); tap[3] = x264_tapfilter1( &pix[ 1*i_src_stride] ); tap[4] = x264_tapfilter1( &pix[ 2*i_src_stride] ); for( y = 0; y < i_height; y++ ) { tap[5] = x264_tapfilter1( &pix[ 3*i_src_stride] ); *out = x264_mc_clip1( ( tap[0] - 5*tap[1] + 20 * tap[2] + 20 * tap[3] -5*tap[4] + tap[5] + 512 ) >> 10 ); /* Next line */ pix += i_src_stride; out += i_dst_stride; tap[0] = tap[1]; tap[1] = tap[2]; tap[2] = tap[3]; tap[3] = tap[4]; tap[4] = tap[5]; } }#else int i, x, y; for( y = 0; y < i_height; y++ ) { int16_t tap[5+4]; for( i = 0; i < 5+4; i++ ) { tap[i] = x264_tapfilter( &src[-2+i], i_src_stride ); } for( x = 0; x < 4; x++ ) { dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 ); } src += i_src_stride; dst += i_dst_stride; }#endif}/* mc I+H */static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp[4*8]; mc_hh_w4( src, i_src_stride, tmp, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height );}static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp[4*8]; mc_hh_w4( src, i_src_stride, tmp, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, tmp, 4, i_height );}/* mc I+V */static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp[4*8]; mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, tmp, 4, i_height );}static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp[4*8]; mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 4, i_height );}/* H+V */static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hv_w4( src, i_src_stride, tmp1, 4, i_height ); mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy31_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height ); mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy13_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hv_w4( src, i_src_stride, tmp1, 4, i_height ); mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hv_w4( src+1, i_src_stride, tmp1, 4, i_height ); mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); mc_hv_w4( src, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height );}/***************************************************************************** * MC with width == 8 (height <= 16) *****************************************************************************/static void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ int y; for( y = 0; y < i_height; y++ ) { memcpy( dst, src, 8 ); src += i_src_stride; dst += i_dst_stride; }}static inline void mc_hh_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ){ int x, y; for( y = 0; y < i_height; y++ ) { for( x = 0; x < 8; x++ ) { dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + 16 ) >> 5 ); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -