📄 mc.c
字号:
/* NOTE(review): the next four lines are the tail of a function whose
 * beginning lies before the visible part of this file. */
        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/*
 * mc_hv_w8: 6-tap filter applied down the columns of an 8-pixel-wide block,
 * written with MMX inline assembly.
 *
 *   src, i_src_stride  source pixels and distance in bytes between rows
 *   dst, i_dst_stride  destination pixels and distance in bytes between rows
 *   i_height           number of output rows
 *
 * src is first moved up by two rows, so for every output row the asm reads
 * the six source rows at offsets -2 .. +3 relative to the caller's src.
 * The rows are widened to 16-bit words (mm0 = low four pixels, mm2 = high
 * four pixels) and combined with weights 1, -5, 20, 20, -5, 1 using shifts
 * and add/subtract.  mm4 is then added, the sums are shifted right
 * arithmetically by 5, packed back to bytes with unsigned saturation and
 * eight bytes are stored to dst.
 *
 * mm7 is zeroed once and used for the byte->word unpacking.  mm4 is loaded
 * once from the external symbol x264_w0x10, which is not defined in the
 * visible part of the file (presumably four words of 0x10, i.e. the
 * rounding term for the >>5 -- confirm).
 *
 * NOTE(review): the per-row asm stores through (%2) but declares no output
 * operand and no "memory" clobber, and it relies on mm4/mm7 surviving
 * between separate asm statements.  Addresses are formed in the 32-bit
 * register %eax, so this only assembles for 32-bit x86.
 */
static inline void mc_hv_w8( uint8_t *src, int i_src_stride,
                             uint8_t *dst, int i_dst_stride, int i_height )
{
    int y;

    src -= 2 * i_src_stride;

    asm volatile(
        "pxor %%mm7, %%mm7\n"
        "movq x264_w0x10, %%mm4\n"
        : : );

    for( y = 0; y < i_height; y++ )
    {
        asm volatile(
            /* eax = %0 + stride, so rows are reachable as (%0), (eax),
             * (eax,%1), (eax,%1,2), (%0,%1,4) and (eax,%1,4) */
            "leal (%0, %1), %%eax\n"

            "movq (%0), %%mm0\n"                /* load pix-2 */
            "movq %%mm0, %%mm2\n"
            "punpcklbw %%mm7, %%mm0\n"
            "punpckhbw %%mm7, %%mm2\n"

            "movq (%%eax),%%mm1\n"              /* load pix-1 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            /* subtract 1x then 4x: weight -5 */
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            "psubw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "psubw %%mm3, %%mm2\n"

            "movq (%%eax,%1),%%mm1\n"           /* load pix */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            /* add 4x then 16x: weight 20 */
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"

            "movq (%%eax,%1,2),%%mm1\n"         /* load pix+1 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            /* add 4x then 16x: weight 20 */
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"

            "movq (%0,%1,4),%%mm1\n"            /* load pix+2 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            /* subtract 1x then 4x: weight -5 */
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            "psubw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "psubw %%mm3, %%mm2\n"

            "movq (%%eax,%1,4),%%mm1\n"         /* load pix+3 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "paddw %%mm1, %%mm0\n"
            "paddw %%mm3, %%mm2\n"

            /* add mm4, shift right by 5, pack to bytes with saturation */
            "paddw %%mm4, %%mm0\n"
            "psraw $5, %%mm0\n"
            "paddw %%mm4, %%mm2\n"
            "psraw $5, %%mm2\n"
            "packuswb %%mm2, %%mm0\n"
            "movq %%mm0, (%2)\n"
            : : "r"(src), "r"(i_src_stride), "r"(dst) : "%eax" );

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/*
 * mc_hc_w8: two-pass 6-tap filter for an 8-pixel-wide block.
 *
 *   src, i_src_stride  source pixels and distance in bytes between rows
 *   dst, i_dst_stride  destination pixels and distance in bytes between rows
 *   i_height           number of output rows
 *
 * Pass 1 (per output row): column sums with weights 1, -5, 20, 20, -5, 1
 * over source rows -2 .. +3 are computed for 13 neighbouring columns
 * starting two pixels to the left of src, and kept unrounded as 16-bit
 * values in tap[].  The asm fills tap[0..11] (8 columns with movq, then 4
 * more with movd); tap[12] comes from x264_tapfilter(), which is defined
 * outside the visible part of the file (presumably the same column filter
 * in C -- confirm).
 *
 * Pass 2: the same six weights are applied across tap[x..x+5], 512 is added,
 * the result is shifted right by 10 and passed through x264_mc_clip1()
 * (defined elsewhere; presumably clamps to the 8-bit pixel range -- confirm).
 *
 * NOTE(review): "addl $8, %0" modifies an operand that is declared as
 * input-only, and tap[] is written through %2 without an output operand or
 * a "memory" clobber, so the compiler is not told about either effect.
 * Addresses are formed in the 32-bit register %eax (32-bit x86 only).
 */
static inline void mc_hc_w8( uint8_t *src, int i_src_stride,
                             uint8_t *dst, int i_dst_stride, int i_height )
{
    int x, y;

    /* mm7 = 0, used by the asm below for byte->word unpacking */
    asm volatile(
        "pxor %%mm7, %%mm7\n"
        : : );

    for( y = 0; y < i_height; y++ )
    {
        int16_t tap[5+8];

        /* first 8 */
        asm volatile(
            /* %0 = src - 2 rows - 2 columns; eax = %0 + stride */
            "leal (%0, %1), %%eax\n"

            "movq (%0), %%mm0\n"                /* load pix-2 */
            "movq %%mm0, %%mm2\n"
            "punpcklbw %%mm7, %%mm0\n"
            "punpckhbw %%mm7, %%mm2\n"

            "movq (%%eax),%%mm1\n"              /* load pix-1 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            "psubw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "psubw %%mm3, %%mm2\n"

            "movq (%%eax,%1),%%mm1\n"           /* load pix */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"

            "movq (%%eax,%1,2),%%mm1\n"         /* load pix+1 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "paddw %%mm3, %%mm2\n"

            "movq (%0,%1,4),%%mm1\n"            /* load pix+2 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            "psubw %%mm3, %%mm2\n"
            "psllw $2, %%mm3\n"
            "psubw %%mm3, %%mm2\n"

            "movq (%%eax,%1,4),%%mm1\n"         /* load pix+3 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "paddw %%mm1, %%mm0\n"
            "paddw %%mm3, %%mm2\n"

            /* store the eight unrounded sums: tap[0..3] and tap[4..7] */
            "movq %%mm0, (%2)\n"
            "movq %%mm2, 8(%2)\n"

            /* step both row pointers 8 pixels to the right and repeat the
             * filter for four more columns (movd loads 4 bytes) */
            "addl $8, %%eax\n"
            "addl $8, %0\n"

            "movd (%0), %%mm0\n"                /* load pix-2 */
            "punpcklbw %%mm7, %%mm0\n"

            "movd (%%eax),%%mm1\n"              /* load pix-1 */
            "punpcklbw %%mm7, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"

            "movd (%%eax,%1),%%mm1\n"           /* load pix */
            "punpcklbw %%mm7, %%mm1\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"

            "movd (%%eax,%1,2),%%mm1\n"         /* load pix+1 */
            "punpcklbw %%mm7, %%mm1\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "paddw %%mm1, %%mm0\n"

            "movd (%0,%1,4),%%mm1\n"            /* load pix+2 */
            "punpcklbw %%mm7, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"

            "movd (%%eax,%1,4),%%mm1\n"         /* load pix+3 */
            "punpcklbw %%mm7, %%mm1\n"
            "paddw %%mm1, %%mm0\n"

            /* store four more sums: tap[8..11] */
            "movq %%mm0, 16(%2)\n"
            : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax" );

        /* last one */
        tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );

        /* second pass: filter across the 13 column sums */
        for( x = 0; x < 8; x++ )
        {
            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
        }

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/*
 * The mc_xyAB_w8 functions below all share one shape: one or two filtered
 * 8-wide planes are produced into stack buffers with a row stride of 8, and
 * pixel_avg_w8() then combines two planes into dst.  mc_hh_w8() and
 * pixel_avg_w8() are defined outside the visible part of the file; from the
 * calls here pixel_avg_w8 takes (dst, dst stride, plane 1, stride 1,
 * plane 2, stride 2, height) and presumably averages the two planes.
 * The suffix AB presumably encodes the quarter-pel x/y offset -- confirm.
 *
 * The temporaries hold 8*16 bytes, so these functions assume
 * i_height <= 16.
 */

/* mc I+H */
/* combines the unfiltered source with mc_hh_w8( src ) */
static void mc_xy10_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];

    mc_hh_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );
}

/* combines the source shifted one pixel right (src+1) with mc_hh_w8( src ) */
static void mc_xy30_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];

    mc_hh_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, tmp, 8, i_height );
}

/* mc I+V */
/* combines the unfiltered source with mc_hv_w8( src ) */
static void mc_xy01_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];

    mc_hv_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, tmp, 8, i_height );
}

/* combines the source shifted one row down with mc_hv_w8( src ) */
static void mc_xy03_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp[8*16];

    mc_hv_w8( src, i_src_stride, tmp, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 8, i_height );
}

/* H+V */
/* combines mc_hv_w8( src ) with mc_hh_w8( src ) */
static void mc_xy11_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hv_w8( src+1 ) with mc_hh_w8( src ) */
static void mc_xy31_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hv_w8( src ) with mc_hh_w8( one row below src ) */
static void mc_xy13_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hv_w8( src+1 ) with mc_hh_w8( one row below src ) */
static void mc_xy33_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hv_w8( src+1, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hc_w8( src ) with mc_hh_w8( src ) */
static void mc_xy21_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hc_w8( src ) with mc_hv_w8( src ) */
static void mc_xy12_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hv_w8( src, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hc_w8( src ) with mc_hv_w8( src+1 ) */
static void mc_xy32_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hv_w8( src+1, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/* combines mc_hc_w8( src ) with mc_hh_w8( one row below src ) */
static void mc_xy23_w8( uint8_t *src, int i_src_stride,
                        uint8_t *dst, int i_dst_stride, int i_height )
{
    uint8_t tmp1[8*16];
    uint8_t tmp2[8*16];

    mc_hc_w8( src, i_src_stride, tmp1, 8, i_height );
    mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height );
    pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height );
}

/*****************************************************************************
 * MC with width == 16 (height <= 16)
 *****************************************************************************/

/* Copies i_height rows of 16 bytes from src to dst, advancing each pointer
 * by its own stride after every row. */
static void mc_copy_w16( uint8_t *src, int i_src_stride,
                         uint8_t *dst, int i_dst_stride, int i_height )
{
    int y;

    for( y = 0; y < i_height; y++ )
    {
        memcpy( dst, src, 16 );

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/* For each of the 16 pixels in each of i_height rows, stores
 * x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + 16 ) >> 5 ) to dst.
 * Both helpers are defined outside the visible part of the file;
 * x264_tapfilter1 is presumably the 6-tap filter along the row -- confirm. */
static inline void mc_hh_w16( uint8_t *src, int i_src_stride,
                              uint8_t *dst, int i_dst_stride, int i_height )
{
    int x, y;

    for( y = 0; y < i_height; y++ )
    {
        for( x = 0; x < 16; x++ )
        {
            dst[x] = x264_mc_clip1( ( x264_tapfilter1( &src[x] ) + 16 ) >> 5 );
        }

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

/* 16-wide column filter built from two mc_hv_w8() calls: one for the left
 * 8 columns and one for the right 8 columns. */
static inline void mc_hv_w16( uint8_t *src, int i_src_stride,
                              uint8_t *dst, int i_dst_stride, int i_height )
{
    mc_hv_w8( src, i_src_stride, dst, i_dst_stride, i_height );
    mc_hv_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height );
}

/* 16-wide counterpart of mc_hc_w8: the visible part zeroes mm7, declares a
 * 5+16 entry tap buffer per row and begins the same load/unpack/weight
 * sequence as mc_hc_w8. */
static inline void mc_hc_w16( uint8_t *src, int i_src_stride,
                              uint8_t *dst, int i_dst_stride, int i_height )
{
    int x, y;

    asm volatile(
        "pxor %%mm7, %%mm7\n"
        : : );

    for( y = 0; y < i_height; y++ )
    {
        int16_t tap[5+16];

        asm volatile(
            "leal (%0, %1), %%eax\n"

            "movq (%0), %%mm0\n"                /* load pix-2 */
            "movq %%mm0, %%mm2\n"
            "punpcklbw %%mm7, %%mm0\n"
            "punpckhbw %%mm7, %%mm2\n"

            "movq (%%eax),%%mm1\n"              /* load pix-1 */
            "movq %%mm1, %%mm3\n"
            "punpcklbw %%mm7, %%mm1\n"
            "punpckhbw %%mm7, %%mm3\n"
            "psubw %%mm1, %%mm0\n"
            "psllw $2, %%mm1\n"
            "psubw %%mm1, %%mm0\n"
            /* NOTE(review): this copy of the file is cut off here, in the middle of the asm statement. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -