📄 checkasm.c.svn-base
字号:
#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include "common/common.h"#include "common/cpu.h"#ifdef HAVE_MMXEXT#include "common/i386/pixel.h"#include "common/i386/dct.h"#include "common/i386/mc.h"#endif#ifdef ARCH_PPC#include "common/ppc/pixel.h"#include "common/ppc/mc.h"#endif/* buf1, buf2: initialised to random data and shouldn't write into them */uint8_t * buf1, * buf2;/* buf3, buf4: used to store output */uint8_t * buf3, * buf4;/* buf5: temp */uint8_t * buf5;#define report( name ) { \ if( used_asm ) \ fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \ if( !ok ) ret = -1; \}static int check_pixel( int cpu_ref, int cpu_new ){ x264_pixel_function_t pixel_c; x264_pixel_function_t pixel_ref; x264_pixel_function_t pixel_asm; x264_predict_t predict_16x16[4+3]; x264_predict_t predict_8x8c[4+3]; x264_predict_t predict_4x4[9+3]; x264_predict8x8_t predict_8x8[9+3]; DECLARE_ALIGNED( uint8_t, edge[33], 8 ); int ret = 0, ok, used_asm; int i, j; x264_pixel_init( 0, &pixel_c ); x264_pixel_init( cpu_ref, &pixel_ref ); x264_pixel_init( cpu_new, &pixel_asm ); x264_predict_16x16_init( 0, predict_16x16 ); x264_predict_8x8c_init( 0, predict_8x8c ); x264_predict_8x8_init( 0, predict_8x8 ); x264_predict_4x4_init( 0, predict_4x4 ); x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );#define TEST_PIXEL( name ) \ for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ { \ used_asm = 1; \ res_c = pixel_c.name[i]( buf1, 32, buf2, 16 ); \ res_asm = pixel_asm.name[i]( buf1, 32, buf2, 16 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ } \ } \ } \ report( "pixel " #name " :" ); TEST_PIXEL( sad ); TEST_PIXEL( ssd ); TEST_PIXEL( satd ); TEST_PIXEL( sa8d );#define TEST_PIXEL_X( N ) \ for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \ { \ int res_c[4]={0}, res_asm[4]={0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ used_asm = 1; \ res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 32 ); \ res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 32 ); \ res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 32 ); \ if(N==4) \ { \ res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 32 ); \ pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 32, res_asm ); \ } \ else \ pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 32, res_asm ); \ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ { \ ok = 0; \ fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \ i, res_c[0], res_c[1], res_c[2], res_c[3], \ res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \ } \ } \ } \ report( "pixel sad_x"#N" :" ); TEST_PIXEL_X(3); TEST_PIXEL_X(4);#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ int res_c[3], res_asm[3]; \ used_asm = 1; \ memcpy( buf3, buf2, 1024 ); \ for( i=0; i<3; i++ ) \ { \ pred[i]( buf3+40, ##__VA_ARGS__ ); \ res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \ } \ pixel_asm.name( buf1+40, i8x8 ? edge : buf3+40, res_asm ); \ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \ res_c[0], res_c[1], res_c[2], \ res_asm[0], res_asm[1], res_asm[2] ); \ } \ } ok = 1; used_asm = 0; TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 ); TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 ); TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 ); TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge ); report( "intra satd_x3 :" ); if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) { float res_c, res_a; ok = 1; x264_cpu_restore( cpu_new ); res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 ); res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 ); if( fabs(res_c - res_a) > 1e-8 ) { ok = 0; fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a ); } report( "ssim :" ); } ok = 1; used_asm = 0; for( i=0; i<4; i++ ) if( pixel_asm.ads[i] != pixel_ref.ads[i] ) { uint16_t res_a[32], res_c[32]; uint16_t sums[72]; int dc[4]; for( j=0; j<72; j++ ) sums[j] = rand() & 0x3fff; for( j=0; j<4; j++ ) dc[j] = rand() & 0x3fff; used_asm = 1; pixel_c.ads[i]( dc, sums, 32, res_c, 32 ); pixel_asm.ads[i]( dc, sums, 32, res_a, 32 ); if( memcmp(res_a, res_c, sizeof(res_c)) ) ok = 0; } report( "esa ads:" ); return ret;}static int check_dct( int cpu_ref, int cpu_new ){ x264_dct_function_t dct_c; x264_dct_function_t dct_ref; x264_dct_function_t dct_asm; int ret = 0, ok, used_asm; int16_t dct1[16][4][4] __attribute__((aligned(16))); int16_t dct2[16][4][4] __attribute__((aligned(16))); x264_dct_init( 0, &dct_c ); x264_dct_init( cpu_ref, &dct_ref); x264_dct_init( cpu_new, &dct_asm );#define TEST_DCT( name, t1, t2, size ) \ if( dct_asm.name != dct_ref.name ) \ { \ used_asm = 1; \ dct_c.name( t1, buf1, buf2 ); \ dct_asm.name( t2, buf1, buf2 ); \ if( memcmp( t1, t2, size ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ } ok = 1; used_asm = 0; TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 ); TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 ); TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 ); report( "sub_dct4 :" ); ok = 1; used_asm = 0; TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 ); TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 ); report( "sub_dct8 :" );#undef TEST_DCT /* copy coefs because idct8 modifies them in place */ memcpy( buf5, dct1, 512 );#define TEST_IDCT( name ) \ if( dct_asm.name != dct_ref.name ) \ { \ used_asm = 1; \ memcpy( buf3, buf1, 32*32 ); \ memcpy( buf4, buf1, 32*32 ); \ memcpy( dct1, buf5, 512 ); \ memcpy( dct2, buf5, 512 ); \ dct_c.name( buf3, (void*)dct1 ); \ dct_asm.name( buf4, (void*)dct2 ); \ if( memcmp( buf3, buf4, 32*32 ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ } ok = 1; used_asm = 0; TEST_IDCT( add4x4_idct ); TEST_IDCT( add8x8_idct ); TEST_IDCT( add16x16_idct ); report( "add_idct4 :" ); ok = 1; used_asm = 0; TEST_IDCT( add8x8_idct8 ); TEST_IDCT( add16x16_idct8 ); report( "add_idct8 :" );#undef TEST_IDCT ok = 1; used_asm = 0; if( dct_asm.dct4x4dc != dct_ref.dct4x4dc ) { int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; used_asm = 1; dct_c.dct4x4dc( dct1 ); dct_asm.dct4x4dc( dct2 ); if( memcmp( dct1, dct2, 32 ) ) { ok = 0; fprintf( stderr, " - dct4x4dc : [FAILED]\n" ); } } if( dct_asm.dct4x4dc != dct_ref.dct4x4dc ) { int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; used_asm = 1; dct_c.idct4x4dc( dct1 ); dct_asm.idct4x4dc( dct2 ); if( memcmp( dct1, dct2, 32 ) ) { ok = 0; fprintf( stderr, " - idct4x4dc : [FAILED]\n" ); } } report( "(i)dct4x4dc :" ); ok = 1; used_asm = 0; if( dct_asm.dct2x2dc != dct_ref.dct2x2dc ) { int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; used_asm = 1; dct_c.dct2x2dc( dct1 ); dct_asm.dct2x2dc( dct2 ); if( memcmp( dct1, dct2, 4*2 ) ) { ok = 0; fprintf( stderr, " - dct2x2dc : [FAILED]\n" ); } } if( dct_asm.idct2x2dc != dct_ref.idct2x2dc ) { int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; used_asm = 1; dct_c.idct2x2dc( dct1 ); dct_asm.idct2x2dc( dct2 ); if( memcmp( dct1, dct2, 4*2 ) ) { ok = 0; fprintf( stderr, " - idct2x2dc : [FAILED]\n" ); } } report( "(i)dct2x2dc :" ); return ret;}static int check_mc( int cpu_ref, int cpu_new ){ x264_mc_functions_t mc_c; x264_mc_functions_t mc_ref; x264_mc_functions_t mc_a; uint8_t *src = &buf1[2*32+2]; uint8_t *src2[4] = { &buf1[2*32+2], &buf1[7*32+2], &buf1[12*32+2], &buf1[17*32+2] }; uint8_t *dst1 = &buf3[2*32+2]; uint8_t *dst2 = &buf4[2*32+2]; int dx, dy, i, j, w; int ret = 0, ok, used_asm; x264_mc_init( 0, &mc_c ); x264_mc_init( cpu_ref, &mc_ref ); x264_mc_init( cpu_new, &mc_a );#define MC_TEST_LUMA( w, h ) \ if( mc_a.mc_luma != mc_ref.mc_luma ) \ { \ used_asm = 1; \ memset(buf3, 0xCD, 1024); \ memset(buf4, 0xCD, 1024); \ mc_c.mc_luma( src2, 32, dst1, 16, dx, dy, w, h ); \ mc_a.mc_luma( src2, 32, dst2, 16, dx, dy, w, h ); \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ ok = 0; \ } \ }#define MC_TEST_CHROMA( w, h ) \ if( mc_a.mc_chroma != mc_ref.mc_chroma ) \ { \ used_asm = 1; \ memset(buf3, 0xCD, 1024); \ memset(buf4, 0xCD, 1024); \ mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h ); \ mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\ for( j=0; j<h; j++ ) \ for( i=w; i<4; i++ ) \ dst2[i+j*16] = dst1[i+j*16]; \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ ok = 0; \ } \ } ok = 1; used_asm = 0; for( dy = 0; dy < 4; dy++ ) for( dx = 0; dx < 4; dx++ ) { MC_TEST_LUMA( 16, 16 ); MC_TEST_LUMA( 16, 8 ); MC_TEST_LUMA( 8, 16 ); MC_TEST_LUMA( 8, 8 ); MC_TEST_LUMA( 8, 4 ); MC_TEST_LUMA( 4, 8 ); MC_TEST_LUMA( 4, 4 ); } report( "mc luma :" ); ok = 1; used_asm = 0; for( dy = -1; dy < 9; dy++ ) for( dx = -1; dx < 9; dx++ ) { MC_TEST_CHROMA( 8, 8 ); MC_TEST_CHROMA( 8, 4 ); MC_TEST_CHROMA( 4, 8 ); MC_TEST_CHROMA( 4, 4 ); MC_TEST_CHROMA( 4, 2 ); MC_TEST_CHROMA( 2, 4 ); MC_TEST_CHROMA( 2, 2 ); } report( "mc chroma :" );#undef MC_TEST_LUMA#undef MC_TEST_CHROMA#define MC_TEST_AVG( name, ... ) \ for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \ { \ memcpy( buf3, buf1, 1024 ); \ memcpy( buf4, buf1, 1024 ); \ if( mc_a.name[i] != mc_ref.name[i] ) \ { \ used_asm = 1; \ mc_c.name[i]( buf3, 32, buf2, 16, ##__VA_ARGS__ ); \ mc_a.name[i]( buf4, 32, buf2, 16, ##__VA_ARGS__ ); \ if( memcmp( buf3, buf4, 1024 ) ) \
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -