/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: pixel.c,v 1.1 2003/11/09 23:25:05 fenrir Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include "../pixel.h"
#include "pixel.h"

/* XXX Remarks: the SBUTTERFLY and LBUTTERFLY2 macros come from ffmpeg */

#define __align8 __attribute__ ((aligned (8)))

static int pixel_sad_16x16_mmxext( uint8_t *pix1, int i_stride_pix1,
                                   uint8_t *pix2, int i_stride_pix2 )
{
    int i_sum = 0;
    int i;

    asm volatile( "pxor %%mm0, %%mm0" : : );

    for( i = 0; i < 16; i++ )
    {
        asm volatile(
            "movq   (%0), %%mm1\n"      /* mm1 : pix1[0-7] */
            "movq   (%1), %%mm2\n"      /* mm2 : pix2[0-7] */
            "movq  8(%0), %%mm3\n"      /* mm3 : pix1[8-15] */
            "movq  8(%1), %%mm4\n"      /* mm4 : pix2[8-15] */
            "psadbw %%mm2, %%mm1\n"
            "psadbw %%mm4, %%mm3\n"
            "paddw  %%mm1, %%mm0\n"
            "paddw  %%mm3, %%mm0\n"
            : : "r"(pix1), "r"(pix2) );

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    asm volatile( "movd %%mm0, %0\n" : "=r"(i_sum) : );

    return i_sum;
}

static int pixel_sad_16x8_mmxext( uint8_t *pix1, int i_stride_pix1,
                                  uint8_t *pix2, int i_stride_pix2 )
{
    int i_sum = 0;
    int i;

    asm volatile( "pxor %%mm0, %%mm0" : : );

    for( i = 0; i < 8; i++ )
    {
        asm volatile(
            "movq   (%0), %%mm1\n"      /* mm1 : pix1[0-7] */
            "movq   (%1), %%mm2\n"      /* mm2 : pix2[0-7] */
            "movq  8(%0), %%mm3\n"      /* mm3 : pix1[8-15] */
            "movq  8(%1), %%mm4\n"      /* mm4 : pix2[8-15] */
            "psadbw %%mm2, %%mm1\n"
            "psadbw %%mm4, %%mm3\n"
            "paddw  %%mm1, %%mm0\n"
            "paddw  %%mm3, %%mm0\n"
            : : "r"(pix1), "r"(pix2) );

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    asm volatile( "movd %%mm0, %0\n" : "=r"(i_sum) : );

    return i_sum;
}

static int pixel_sad_8x16_mmxext( uint8_t *pix1, int i_stride_pix1,
                                  uint8_t *pix2, int i_stride_pix2 )
{
    int i_sum = 0;
    int i;

    for( i = 0; i < 16; i++ )
    {
        int tmp;
        asm volatile(
            "movq   (%1), %%mm0\n"      /* mm0 : pix1[0-7] */
            "movq   (%2), %%mm2\n"      /* mm2 : pix2[0-7] */
            "psadbw %%mm2, %%mm0\n"
            "movd  %%mm0, %0\n"
            : "=r"(tmp) : "r"(pix1), "r"(pix2) );

        i_sum += tmp;
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    return i_sum;
}

static int pixel_sad_8x8_mmxext( uint8_t *pix1, int i_stride_pix1,
                                 uint8_t *pix2, int i_stride_pix2 )
{
    int i_sum = 0;
    int i;

    for( i = 0; i < 8; i++ )
    {
        int tmp;
        asm volatile(
            "movq   (%1), %%mm0\n"      /* mm0 : pix1[0-7] */
            "movq   (%2), %%mm2\n"      /* mm2 : pix2[0-7] */
            "psadbw %%mm2, %%mm0\n"
            "movd  %%mm0, %0\n"
            : "=r"(tmp) : "r"(pix1), "r"(pix2) );

        i_sum += tmp;
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    return i_sum;
}

static int pixel_sad_4x4_mmxext( uint8_t *pix1, int i_stride_pix1,
                                 uint8_t *pix2, int i_stride_pix2 )
{
    int i_sum = 0;
    int i;

    for( i = 0; i < 4; i++ )
    {
        int tmp;
        asm volatile(
            "movd   (%1), %%mm0\n"      /* mm0 : pix1[0-3] */
            "movd   (%2), %%mm1\n"      /* mm1 : pix2[0-3] */
            "psadbw %%mm1, %%mm0\n"
            "movd  %%mm0, %0\n"
            : "=r"(tmp) : "r"(pix1), "r"(pix2) );

        i_sum += tmp;
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    return i_sum;
}
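/* For reference only (not part of the original file): a portable C sketch of
 * the sum of absolute differences that the psadbw-based routines above
 * compute, parameterized over block size.  The function name is hypothetical;
 * it mirrors what each fixed-size SAD above does row by row. */
static inline int pixel_sad_wxh_c( uint8_t *pix1, int i_stride_pix1,
                                   uint8_t *pix2, int i_stride_pix2,
                                   int i_width, int i_height )
{
    int i_sum = 0;
    int x, y;

    for( y = 0; y < i_height; y++ )
    {
        for( x = 0; x < i_width; x++ )
            i_sum += abs( pix1[x] - pix2[x] );  /* |a - b| per pixel */
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    return i_sum;
}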
static void pixel_sub_4x4_mmx( int16_t diff[4][4],
                               uint8_t *pix1, int i_pix1,
                               uint8_t *pix2, int i_pix2 )
{
    int y;

    asm volatile( "pxor %%mm7, %%mm7\n" : : );

    for( y = 0; y < 4; y++ )
    {
        asm volatile(
            "movd       (%1), %%mm0\n"
            "punpcklbw %%mm7, %%mm0\n"
            "movd       (%2), %%mm1\n"
            "punpcklbw %%mm7, %%mm1\n"
            "psubw     %%mm1, %%mm0\n"
            "movq      %%mm0, (%0)\n"
            : : "r"(&diff[y][0]), "r"(pix1), "r"(pix2)
            : "memory" );   /* clobber: the store to diff[] must stay visible to C code */

        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}

/* We could do a lot better */
static void pixel_sub_8x8_mmx( int16_t diff[4][4][4],
                               uint8_t *pix1, int i_pix1,
                               uint8_t *pix2, int i_pix2 )
{
    pixel_sub_4x4_mmx( diff[0], &pix1[0],          i_pix1, &pix2[0],          i_pix2 );
    pixel_sub_4x4_mmx( diff[1], &pix1[4],          i_pix1, &pix2[4],          i_pix2 );
    pixel_sub_4x4_mmx( diff[2], &pix1[4*i_pix1],   i_pix1, &pix2[4*i_pix2],   i_pix2 );
    pixel_sub_4x4_mmx( diff[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 );
}

static void pixel_sub_16x16_mmx( int16_t diff[16][4][4],
                                 uint8_t *pix1, int i_pix1,
                                 uint8_t *pix2, int i_pix2 )
{
    pixel_sub_8x8_mmx( &diff[ 0], &pix1[0],          i_pix1, &pix2[0],          i_pix2 );
    pixel_sub_8x8_mmx( &diff[ 4], &pix1[8],          i_pix1, &pix2[8],          i_pix2 );
    pixel_sub_8x8_mmx( &diff[ 8], &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 );
    pixel_sub_8x8_mmx( &diff[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
}

static void pixel_add_4x4_mmx( uint8_t *dst, int i_dst, int16_t diff[4][4] )
{
    int y;

    asm volatile( "pxor %%mm7, %%mm7" : : );

    for( y = 0; y < 4; y++ )
    {
        asm volatile(
            "movd       (%0), %%mm0\n"  /* load 4 pixels */
            "movq       (%1), %%mm1\n"  /* load 4 diffs (16 bits each) */
            "punpcklbw %%mm7, %%mm0\n"
            "paddsw    %%mm1, %%mm0\n"
            "packuswb  %%mm7, %%mm0\n"
            "movd      %%mm0, (%0)\n"
            : : "r"(dst), "r"(&diff[y][0])
            : "memory" );   /* clobber: we write back through dst */

        dst += i_dst;
    }
}

/* TODO: do 8 pixels at once */
static void pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t diff[4][4][4] )
{
    pixel_add_4x4_mmx( &dst[0],         i_dst, diff[0] );
    pixel_add_4x4_mmx( &dst[4],         i_dst, diff[1] );
    pixel_add_4x4_mmx( &dst[0+4*i_dst], i_dst, diff[2] );
    pixel_add_4x4_mmx( &dst[4+4*i_dst], i_dst, diff[3] );
}

static void pixel_add_16x16_mmx( uint8_t *dst, int i_dst, int16_t diff[16][4][4] )
{
    pixel_add_8x8_mmx( &dst[0],         i_dst, &diff[ 0] );
    pixel_add_8x8_mmx( &dst[8],         i_dst, &diff[ 4] );
    pixel_add_8x8_mmx( &dst[0+8*i_dst], i_dst, &diff[ 8] );
    pixel_add_8x8_mmx( &dst[8+8*i_dst], i_dst, &diff[12] );
}

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

#define SBUTTERFLYwd(a,b,t)\
    "movq      " #a ", " #t " \n\t"\
    "punpcklwd " #b ", " #a " \n\t"\
    "punpckhwd " #b ", " #t " \n\t"

#define SBUTTERFLYdq(a,b,t)\
    "movq      " #a ", " #t " \n\t"\
    "punpckldq " #b ", " #a " \n\t"\
    "punpckhdq " #b ", " #t " \n\t"

static inline int pixel_satd_4x4_mmxext( uint8_t *pix1, int i_pix1,
                                         uint8_t *pix2, int i_pix2 )
{
    int i_satd;
    int16_t diff[4][4] __align8;

    pixel_sub_4x4_mmx( diff, pix1, i_pix1, pix2, i_pix2 );

    asm volatile(
        /* load the diffs; we could fold pixel_sub_4x4_mmx in here,
         * but it would be less readable */
        "movq    (%1), %%mm0\n"
        "movq   8(%1), %%mm1\n"
        "movq  16(%1), %%mm2\n"
        "movq  24(%1), %%mm3\n"

        /* hadamard h */
        LBUTTERFLY2( %%mm0, %%mm1, %%mm2, %%mm3 )
        LBUTTERFLY2( %%mm0, %%mm2, %%mm1, %%mm3 )

        /* transpose 4x4 (in: mm0,1,2,3  out: mm0,3,4,2) */
        SBUTTERFLYwd( %%mm0, %%mm1, %%mm4 )
        SBUTTERFLYwd( %%mm2, %%mm3, %%mm1 )
        SBUTTERFLYdq( %%mm0, %%mm2, %%mm3 )
        SBUTTERFLYdq( %%mm4, %%mm1, %%mm2 )

        /* hadamard v */
        LBUTTERFLY2( %%mm0, %%mm3, %%mm4, %%mm2 )
        LBUTTERFLY2( %%mm0, %%mm4, %%mm3, %%mm2 )

        /* sum of abs */
        "pxor    %%mm7, %%mm7\n"
        "psubw   %%mm0, %%mm7\n"
        "pmaxsw  %%mm7, %%mm0\n"

        "pxor    %%mm7, %%mm7\n"
        "psubw   %%mm3, %%mm7\n"
        "pmaxsw  %%mm7, %%mm3\n"
        "paddusw %%mm3, %%mm0\n"

        "pxor    %%mm7, %%mm7\n"
        "psubw   %%mm4, %%mm7\n"
        "pmaxsw  %%mm7, %%mm4\n"
        "paddusw %%mm4, %%mm0\n"

        "pxor    %%mm7, %%mm7\n"
        "psubw   %%mm2, %%mm7\n"
        "pmaxsw  %%mm7, %%mm2\n"
        "paddusw %%mm2, %%mm0\n"

        /* last sum: fold the four 16-bit partial sums together */
        "movq    %%mm0, %%mm1\n"
        "psrlq     $32, %%mm0\n"
        "paddusw %%mm1, %%mm0\n"
        "movq    %%mm0, %%mm1\n"
        "psrlq     $16, %%mm0\n"
        "paddusw %%mm1, %%mm0\n"

        "movd    %%mm0, %0\n"
        : "=r"(i_satd) : "r"(diff) );

    return ( i_satd & 0xffff ) / 2;
}
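/* For reference only (not part of the original file): a portable C sketch of
 * the 4x4 SATD computed above, i.e. a 2-D Hadamard transform of the residual
 * followed by a sum of absolute coefficients, halved.  The coefficient
 * ordering differs from the MMX butterflies, but the absolute sum is the
 * same.  The function name is hypothetical. */
static inline int pixel_satd_4x4_c( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    int tmp[4][4];
    int i_satd = 0;
    int x, y;

    for( y = 0; y < 4; y++ )    /* horizontal 1-D hadamard of the residual */
    {
        int d0 = pix1[0] - pix2[0], d1 = pix1[1] - pix2[1];
        int d2 = pix1[2] - pix2[2], d3 = pix1[3] - pix2[3];
        int s01 = d0 + d1, s23 = d2 + d3;
        int t01 = d0 - d1, t23 = d2 - d3;

        tmp[y][0] = s01 + s23;
        tmp[y][1] = s01 - s23;
        tmp[y][2] = t01 + t23;
        tmp[y][3] = t01 - t23;

        pix1 += i_pix1;
        pix2 += i_pix2;
    }
    for( x = 0; x < 4; x++ )    /* vertical 1-D hadamard, then sum |coeff| */
    {
        int s01 = tmp[0][x] + tmp[1][x], s23 = tmp[2][x] + tmp[3][x];
        int t01 = tmp[0][x] - tmp[1][x], t23 = tmp[2][x] - tmp[3][x];

        i_satd += abs( s01 + s23 ) + abs( s01 - s23 )
                + abs( t01 + t23 ) + abs( t01 - t23 );
    }
    return i_satd / 2;
}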
static int pixel_satd_4x8_mmxext( uint8_t *pix1, int i_pix1,
                                  uint8_t *pix2, int i_pix2 )
{
    return pixel_satd_4x4_mmxext( &pix1[0],        i_pix1, &pix2[0],        i_pix2 ) +
           pixel_satd_4x4_mmxext( &pix1[4*i_pix1], i_pix1, &pix2[4*i_pix2], i_pix2 );
}

static int pixel_satd_8x4_mmxext( uint8_t *pix1, int i_pix1,
                                  uint8_t *pix2, int i_pix2 )
{
    return pixel_satd_4x4_mmxext( &pix1[0], i_pix1, &pix2[0], i_pix2 ) +
           pixel_satd_4x4_mmxext( &pix1[4], i_pix1, &pix2[4], i_pix2 );
}

static int pixel_satd_8x8_mmxext( uint8_t *pix1, int i_pix1,
                                  uint8_t *pix2, int i_pix2 )
{
    return pixel_satd_4x4_mmxext( &pix1[0],          i_pix1, &pix2[0],          i_pix2 ) +
           pixel_satd_4x4_mmxext( &pix1[4],          i_pix1, &pix2[4],          i_pix2 ) +
           pixel_satd_4x4_mmxext( &pix1[4*i_pix1],   i_pix1, &pix2[4*i_pix2],   i_pix2 ) +
           pixel_satd_4x4_mmxext( &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 );
}

static int pixel_satd_16x8_mmxext( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    return pixel_satd_8x8_mmxext( &pix1[0], i_pix1, &pix2[0], i_pix2 ) +
           pixel_satd_8x8_mmxext( &pix1[8], i_pix1, &pix2[8], i_pix2 );
}

static int pixel_satd_8x16_mmxext( uint8_t *pix1, int i_pix1,
                                   uint8_t *pix2, int i_pix2 )
{
    return pixel_satd_8x8_mmxext( &pix1[0],        i_pix1, &pix2[0],        i_pix2 ) +
           pixel_satd_8x8_mmxext( &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 );
}

static int pixel_satd_16x16_mmxext( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
{
    return pixel_satd_8x8_mmxext( &pix1[0],          i_pix1, &pix2[0],          i_pix2 ) +
           pixel_satd_8x8_mmxext( &pix1[8],          i_pix1, &pix2[8],          i_pix2 ) +
           pixel_satd_8x8_mmxext( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 ) +
           pixel_satd_8x8_mmxext( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
}

/****************************************************************************
 * x264_pixel_mmxext_init:
 ****************************************************************************/
void x264_pixel_mmxext_init( x264_pixel_function_t *pixf )
{
    pixf->sad[PIXEL_16x16] = pixel_sad_16x16_mmxext;
    pixf->sad[PIXEL_16x8 ] = pixel_sad_16x8_mmxext;
    pixf->sad[PIXEL_8x16 ] = pixel_sad_8x16_mmxext;
    pixf->sad[PIXEL_8x8  ] = pixel_sad_8x8_mmxext;
    pixf->sad[PIXEL_4x4  ] = pixel_sad_4x4_mmxext;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_mmxext;
    pixf->satd[PIXEL_16x8 ] = pixel_satd_16x8_mmxext;
    pixf->satd[PIXEL_8x16 ] = pixel_satd_8x16_mmxext;
    pixf->satd[PIXEL_8x8  ] = pixel_satd_8x8_mmxext;
    pixf->satd[PIXEL_8x4  ] = pixel_satd_8x4_mmxext;
    pixf->satd[PIXEL_4x8  ] = pixel_satd_4x8_mmxext;
    pixf->satd[PIXEL_4x4  ] = pixel_satd_4x4_mmxext;

    pixf->sub4x4   = pixel_sub_4x4_mmx;
    pixf->sub8x8   = pixel_sub_8x8_mmx;
    pixf->sub16x16 = pixel_sub_16x16_mmx;

    pixf->add4x4   = pixel_add_4x4_mmx;
    pixf->add8x8   = pixel_add_8x8_mmx;
    pixf->add16x16 = pixel_add_16x16_mmx;
}
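/* Illustrative usage sketch, not part of the original file.  The
 * X264_CPU_MMXEXT flag and i_cpu_flags are assumed names standing in for
 * whatever CPU detection the surrounding encoder provides, so the example
 * is left disabled. */
#if 0
void pixel_init_example( x264_pixel_function_t *pixf, uint32_t i_cpu_flags )
{
    if( i_cpu_flags & X264_CPU_MMXEXT )     /* assumed capability flag */
        x264_pixel_mmxext_init( pixf );

    /* then, e.g., a 16x16 motion-estimation cost between two planes:
     *   int i_cost = pixf->sad[PIXEL_16x16]( p_cur, i_stride_cur,
     *                                        p_ref, i_stride_ref );
     */
}
#endif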