transfrm_x86.c

来自「Motion JPEG编解码器源代码」· C语言 代码 · 共 284 行

C
284
字号
/* transfrm.c,  forward / inverse transformation     In compiler (gcc) embdeed assembly language...              *//* Modifications and enhancements (C) 2000-2003 Andrew Stevens *//* These modifications are free software; you can redistribute it *  and/or modify it under the terms of the GNU General Public License *  as published by the Free Software Foundation; either version 2 of *  the License, or (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU *  General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. * */#include <config.h>#include <math.h>#include "mjpeg_types.h"#include "mjpeg_logging.h"#include "attributes.h"#include "mmx.h"#include "simd.h"/* Routines written in assembler */extern void fdct_mmx( int16_t * blk );extern void init_fdct_sse(void);extern void fdct_sse( int16_t * blk );extern void fdct_test( int16_t * blk );extern void idct_mmx( int16_t * blk );extern void idct_sse( int16_t * blk );extern void idct_test( int16_t * blk );extern void add_pred_mmx (uint8_t *pred, uint8_t *cur,						  int lx, int16_t *blk);extern  void sub_pred_mmx (uint8_t *pred, uint8_t *cur,						  int lx, int16_t *blk);static inline voidmmx_sum_4_word_accs( mmx_t *accs, int32_t *res ){	movq_m2r( *accs, mm1 );	movq_r2r( mm1, mm3 );	movq_r2r( mm1, mm2 );	/* Generate sign extensions for mm1 words! */	psraw_i2r( 15, mm3 );	punpcklwd_r2r( mm3, mm1 );	punpckhwd_r2r( mm3, mm2 );	paddd_r2r( mm1, mm2 );	movq_r2r( mm2, mm3);	psrlq_i2r( 32, mm2);	paddd_r2r( mm2, mm3);	movd_r2m( mm3, *res );}static inline voidsum_sumsq_8bytes( uint8_t *cur_lum_mb, 				  uint8_t *pred_lum_mb,				  mmx_t *sumtop_accs,				  mmx_t *sumbot_accs,				  mmx_t *sumsqtop_accs,				  mmx_t *sumsqbot_accs,				  mmx_t *sumxprod_accs,				  int stride	){	pxor_r2r(mm0,mm0);	/* Load pixels from top field into mm1.w,mm2.w	 */	movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );	movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );		/* mm3 := mm1 mm4 := mm2	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]	*/ 		movq_r2r( mm1, mm3 );	punpcklbw_r2r( mm0, mm1 );	movq_r2r( mm2, mm4 );	punpcklbw_r2r( mm0, mm2 );	psubw_r2r( mm2, mm1 );		/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]	 */	punpckhbw_r2r( mm0, mm3 );	punpckhbw_r2r( mm0, mm4 );	psubw_r2r( mm4, mm3 );	/* sumtop_accs->w[0..3] += mm1.w[0..3];	   sumtop_accs->w[0..3] += mm3.w[0..3];	   mm6 = mm1; mm7 = mm3;	*/	movq_m2r( *sumtop_accs, mm5 );	paddw_r2r( mm1, mm5 );	paddw_r2r( mm3, mm5 );	movq_r2r( mm1, mm6 );	movq_r2r( mm3, mm7 );	movq_r2m( mm5, *sumtop_accs );	/* 	   *sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3];	   *sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3];	*/	pmaddwd_r2r( mm1, mm1 );	movq_m2r( *sumsqtop_accs, mm5 );	pmaddwd_r2r( mm3, mm3 );	paddd_r2r( mm1, mm5 );	paddd_r2r( mm3, mm5 );	movq_r2m( mm5, *sumsqtop_accs );		/* Load pixels from bot field into mm1.w,mm2.w	 */	movq_m2r( *((mmx_t*)(cur_lum_mb+stride)), mm1 );	movq_m2r( *((mmx_t*)(pred_lum_mb+stride)), mm2 );		/* mm2 := mm1 mm4 := mm2	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]	*/ 		movq_r2r( mm1, mm3 );	punpcklbw_r2r( mm0, mm1 );	movq_r2r( mm2, mm4 );	punpcklbw_r2r( mm0, mm2 );	psubw_r2r( mm2, mm1 );		/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]	 */	punpckhbw_r2r( mm0, mm3 );	punpckhbw_r2r( mm0, mm4 );	psubw_r2r( mm4, mm3 );	/* 	   sumbot_accs->w[0..3] += mm1.w[0..3];	   sumbot_accs->w[0..3] += mm3.w[0..3];	   mm2 := mm1; mm4 := mm3;	*/	movq_m2r( *sumbot_accs, mm5 );	paddw_r2r( mm1, mm5 );	movq_r2r( mm1, mm2 );	paddw_r2r( mm3, mm5 );	movq_r2r( mm3, mm4 );	movq_r2m( mm5, *sumbot_accs );	/* 	   *sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3];	   *sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3];	*/	pmaddwd_r2r( mm1, mm1 );	movq_m2r( *sumsqbot_accs, mm5 );	pmaddwd_r2r( mm3, mm3 );	paddd_r2r( mm1, mm5 );	paddd_r2r( mm3, mm5 );	movq_r2m( mm5, *sumsqbot_accs );			/* Accumulate cross-product 	 *sum_xprod_acc += mm1.w[0..3] * mm6[0..3];	 *sum_xprod_acc += mm3.w[0..3] * mm7[0..3];	 */	movq_m2r( *sumxprod_accs, mm5 );	pmaddwd_r2r( mm6, mm2);	pmaddwd_r2r( mm7, mm4);	paddd_r2r( mm2, mm5 );	paddd_r2r( mm4, mm5 );	movq_r2m( mm5, *sumxprod_accs );	emms();}int field_dct_best_mmx( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb, int stride){	/*	 * calculate prediction error (cur-pred) for top (blk0)	 * and bottom field (blk1)	 */	double r,d;	int rowoffs = 0;	int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;	int j;	int dct_type;	int topvar, botvar;	mmx_t sumtop_accs, sumbot_accs;	mmx_t sumsqtop_accs, sumsqbot_accs, sumxprod_accs;	int32_t sumtop_acc, sumbot_acc;	int32_t sumsqtop_acc, sumsqbot_acc, sumxprod_acc;	pxor_r2r(mm0,mm0);	movq_r2m( mm0, *(&sumtop_accs) );	movq_r2m( mm0, *(&sumbot_accs) );	movq_r2m( mm0, *(&sumsqtop_accs) );	movq_r2m( mm0, *(&sumsqbot_accs) );	movq_r2m( mm0, *(&sumxprod_accs) );		sumtop = sumsqtop = sumbot = sumsqbot = sumbottop = 0;	sumtop_acc = sumbot_acc = sumsqtop_acc = sumsqbot_acc = sumxprod_acc = 0; 	for (j=0; j<8; j++)	{#ifdef ORIGINAL_CODE		for (i=0; i<16; i++)		{			register int toppix = 				cur_lum_mb[rowoffs+i] - pred_lum_mb[rowoffs+i];			register int botpix = 				cur_lum_mb[rowoffs+width+i] - pred_lum_mb[rowoffs+width+i];			sumtop += toppix;			sumsqtop += toppix*toppix;			sumbot += botpix;			sumsqbot += botpix*botpix;			sumbottop += toppix*botpix;		}#endif		sum_sumsq_8bytes( &cur_lum_mb[rowoffs], &pred_lum_mb[rowoffs],						  &sumtop_accs, &sumbot_accs,						  &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs,						  stride);		sum_sumsq_8bytes( &cur_lum_mb[rowoffs+8], &pred_lum_mb[rowoffs+8],						  &sumtop_accs, &sumbot_accs,						  &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs,						  stride);		rowoffs += (stride<<1);	}	mmx_sum_4_word_accs( &sumtop_accs, &sumtop );	mmx_sum_4_word_accs( &sumbot_accs, &sumbot );	emms();	sumsqtop = sumsqtop_accs.d[0] + sumsqtop_accs.d[1];	sumsqbot = sumsqbot_accs.d[0] + sumsqbot_accs.d[1];	sumbottop = sumxprod_accs.d[0] + sumxprod_accs.d[1];	/* Calculate Variances top and bottom.  If they're of similar	 sign estimate correlation if its good use frame DCT otherwise	 use field.	*/	r = 0.0;	topvar = sumsqtop-sumtop*sumtop/128;	botvar = sumsqbot-sumbot*sumbot/128;	if ( !((topvar <= 0) ^ (botvar <= 0)) )	{		d = ((double) topvar) * ((double)botvar);		r = (sumbottop-(sumtop*sumbot)/128);		if (r>0.5*sqrt(d))			return 0; /* frame DCT */		else			return 1; /* field DCT */	}	else		return 1; /* field DCT */	return dct_type;}void init_x86_transform(){        char *opt_type1="";        int flags = cpu_accel();	pfdct = fdct_mmx;	pidct = idct_mmx;	padd_pred = add_pred_mmx;	psub_pred = sub_pred_mmx;	pfield_dct_best = field_dct_best_mmx;        if( flags & ACCEL_X86_SSE ) {            init_fdct_sse();            pfdct = fdct_sse;            pidct = idct_sse;            opt_type1 = "SSE and ";        }	mjpeg_info( "SETTING %sMMX for TRANSFORM!",opt_type1);}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?