📄 altivec_mlib.c
        int i0 = 0, i1 = 16;

        asm ("lvsl 4,%0,%1\n"
             : : "b" (ref_block), "b" (i0));
        for (i = 0; i < 16; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%1,%4\n"
                 "lvx 2,%2,%3\n"
                 "lvx 3,%2,%4\n"
                 "vperm 5,0,1,4\n"
                 "vperm 6,2,3,4\n"
                 "vavgub 5,5,6\n"
                 "stvx 5,%0,%3\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1));
            i0 += frame_stride, i1 += frame_stride;
        }
    } else {
        int i0 = 0;

        for (i = 0; i < 16; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%2,%3\n"
                 "vavgub 0,0,1\n"
                 "stvx 0,%0,%3\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0));
            i0 += frame_stride;
        }
    }
}

void mlib_VideoInterpY_U8_U8_16x8 (uint8_t *curr_block,
                                   const uint8_t *ref_block,
                                   int32_t frame_stride, int32_t field_stride)
{
    int i;

    ASSERT (((int) curr_block & 15) == 0);

    if (((int) ref_block & 15) != 0) {
        /* Misaligned reference: extract each row with lvsl/vperm, then
           average it with the row field_stride below. */
        int i0 = 0, i1 = 16;

        asm ("lvsl 4,%0,%1\n"
             : : "b" (ref_block), "b" (i0));
        for (i = 0; i < 8; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%1,%4\n"
                 "lvx 2,%2,%3\n"
                 "lvx 3,%2,%4\n"
                 "vperm 5,0,1,4\n"
                 "vperm 6,2,3,4\n"
                 "vavgub 5,5,6\n"
                 "stvx 5,%0,%3\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1));
            i0 += frame_stride, i1 += frame_stride;
        }
    } else {
        /* Aligned reference: load both rows, average, store. */
        int i0 = 0;

        for (i = 0; i < 8; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%2,%3\n"
                 "vavgub 0,0,1\n"
                 "stvx 0,%0,%3\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0));
            i0 += frame_stride;
        }
    }
}

void mlib_VideoInterpY_U8_U8_8x8 (uint8_t *curr_block,
                                  const uint8_t *ref_block,
                                  int32_t frame_stride, int32_t field_stride)
{
    int i;

    ASSERT (((int) curr_block & 7) == 0);

    if (((((int) ref_block ^ (int) curr_block) | field_stride) & 15) != 0) {
        const int i0 = 0, i1 = 16, i2 = 4;

        /* Build permute vectors that realign the reference rows to
           curr_block's alignment; each 8-byte result row is then stored
           with two stvewx word stores (offsets 0 and 4). */
        asm ("lvsl 4,%1,%3\n"
             "lvsl 5,%1,%4\n"
             "lvsl 6,%2,%3\n"
             "lvsl 7,%2,%4\n"
             "lvsr 8,%0,%3\n"
             "lvsr 9,%0,%4\n"
             "vperm 4,4,4,8\n"
             "vperm 5,5,5,9\n"
             "vperm 6,6,6,8\n"
             "vperm 7,7,7,9\n"
             : : "b" (curr_block), "b" (ref_block),
                 "b" (ref_block + field_stride), "b" (i0),
                 "b" (i0 + frame_stride));
        for (i = 0; i < 4; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%1,%4\n"
                 "lvx 2,%2,%3\n"
                 "lvx 3,%2,%4\n"
                 "vperm 8,0,1,4\n"
                 "vperm 9,2,3,6\n"
                 "vavgub 8,8,9\n"
                 "stvewx 8,%0,%3\n"
                 "stvewx 8,%0,%5\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1),
                     "b" (i2));
            curr_block += frame_stride, ref_block += frame_stride;
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%1,%4\n"
                 "lvx 2,%2,%3\n"
                 "lvx 3,%2,%4\n"
                 "vperm 8,0,1,5\n"
                 "vperm 9,2,3,7\n"
                 "vavgub 8,8,9\n"
                 "stvewx 8,%0,%3\n"
                 "stvewx 8,%0,%5\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1),
                     "b" (i2));
            curr_block += frame_stride, ref_block += frame_stride;
        }
    } else {
        int i0 = 0, i1 = 4;

        for (i = 0; i < 8; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%2,%3\n"
                 "vavgub 0,0,1\n"
                 "stvewx 0,%0,%3\n"
                 "stvewx 0,%0,%4\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1));
            i0 += frame_stride, i1 += frame_stride;
        }
    }
}

void mlib_VideoInterpY_U8_U8_8x4 (uint8_t *curr_block,
                                  const uint8_t *ref_block,
                                  int32_t frame_stride, int32_t field_stride)
{
    int i;

    ASSERT (((int) curr_block & 7) == 0);

    if (((((int) ref_block ^ (int) curr_block) | field_stride) & 15) != 0) {
        const int i0 = 0, i1 = 16, i2 = 4;

        asm ("lvsl 4,%1,%3\n"
             "lvsl 5,%1,%4\n"
             "lvsl 6,%2,%3\n"
             "lvsl 7,%2,%4\n"
             "lvsr 8,%0,%3\n"
             "lvsr 9,%0,%4\n"
             "vperm 4,4,4,8\n"
             "vperm 5,5,5,9\n"
             "vperm 6,6,6,8\n"
             "vperm 7,7,7,9\n"
             : : "b" (curr_block), "b" (ref_block),
                 "b" (ref_block + field_stride), "b" (i0),
                 "b" (i0 + frame_stride));
        for (i = 0; i < 2; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%1,%4\n"
                 "lvx 2,%2,%3\n"
                 "lvx 3,%2,%4\n"
                 "vperm 8,0,1,4\n"
                 "vperm 9,2,3,6\n"
                 "vavgub 8,8,9\n"
                 "stvewx 8,%0,%3\n"
                 "stvewx 8,%0,%5\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1),
                     "b" (i2));
            curr_block += frame_stride, ref_block += frame_stride;
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%1,%4\n"
                 "lvx 2,%2,%3\n"
                 "lvx 3,%2,%4\n"
                 "vperm 8,0,1,5\n"
                 "vperm 9,2,3,7\n"
                 "vavgub 8,8,9\n"
                 "stvewx 8,%0,%3\n"
                 "stvewx 8,%0,%5\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1),
                     "b" (i2));
            curr_block += frame_stride, ref_block += frame_stride;
        }
    } else {
        int i0 = 0, i1 = 4;

        for (i = 0; i < 4; i++) {
            asm ("lvx 0,%1,%3\n"
                 "lvx 1,%2,%3\n"
                 "vavgub 0,0,1\n"
                 "stvewx 0,%0,%3\n"
                 "stvewx 0,%0,%4\n"
                 : : "b" (curr_block), "b" (ref_block),
                     "b" (ref_block + field_stride), "b" (i0), "b" (i1));
            i0 += frame_stride, i1 += frame_stride;
        }
    }
}

void mlib_VideoInterpXY_U8_U8_16x16 (uint8_t *curr_block,
                                     const uint8_t *ref_block,
                                     int32_t frame_stride,
                                     int32_t field_stride)
{
    int i;
    int i0 = 0, i1 = 16;

    ASSERT (((int) curr_block & 15) == 0);

    /* Half-pel in both x and y: the lvsl vector extracts each row, the
       same vector plus one selects the row shifted by one byte; the rows
       are averaged horizontally and then vertically. */
    asm ("vspltisb 0,1\n"
         "lvsl 4,%0,%1\n"
         "vaddubs 5,4,0\n"
         : : "b" (ref_block), "b" (i0));
    for (i = 0; i < 16; i++) {
        asm ("lvx 0,%1,%3\n"
             "lvx 1,%2,%3\n"
             "lvx 2,%1,%4\n"
             "lvx 3,%2,%4\n"
             "vperm 6,0,2,4\n"
             "vperm 7,0,2,5\n"
             "vperm 8,1,3,4\n"
             "vperm 9,1,3,5\n"
             "vavgub 6,6,7\n"
             "vavgub 8,8,9\n"
             "vavgub 6,6,8\n"
             "stvx 6,%0,%3\n"
             : : "b" (curr_block), "b" (ref_block),
                 "b" (ref_block + field_stride), "b" (i0), "b" (i1));
        i0 += frame_stride, i1 += frame_stride;
    }
}

void mlib_VideoInterpXY_U8_U8_16x8 (uint8_t *curr_block,
                                    const uint8_t *ref_block,
                                    int32_t frame_stride,
                                    int32_t field_stride)
{
    int i;
    int i0 = 0, i1 = 16;

    ASSERT (((int) curr_block & 15) == 0);

    asm ("vspltisb 0,1\n"
         "lvsl 4,%0,%1\n"
         "vaddubs 5,4,0\n"
         : : "b" (ref_block), "b" (i0));
    for (i = 0; i < 8; i++) {
        asm ("lvx 0,%1,%3\n"
             "lvx 1,%2,%3\n"
             "lvx 2,%1,%4\n"
             "lvx 3,%2,%4\n"
             "vperm 6,0,2,4\n"
             "vperm 7,0,2,5\n"
             "vperm 8,1,3,4\n"
             "vperm 9,1,3,5\n"
             "vavgub 6,6,7\n"
             "vavgub 8,8,9\n"
             "vavgub 6,6,8\n"
             "stvx 6,%0,%3\n"
             : : "b" (curr_block), "b" (ref_block),
                 "b" (ref_block + field_stride), "b" (i0), "b" (i1));
        i0 += frame_stride, i1 += frame_stride;
    }
}

void mlib_VideoInterpXY_U8_U8_8x8 (uint8_t *curr_block,
                                   const uint8_t *ref_block,
                                   int32_t frame_stride, int32_t field_stride)
{
    mlib_VideoInterpXY_U8_U8 (curr_block, ref_block, 8, 8, frame_stride,
                              field_stride);
}

void mlib_VideoInterpXY_U8_U8_8x4 (uint8_t *curr_block,
                                   const uint8_t *ref_block,
                                   int32_t frame_stride, int32_t field_stride)
{
    mlib_VideoInterpXY_U8_U8 (curr_block, ref_block, 8, 4, frame_stride,
                              field_stride);
}

void mlib_ClearCoeffs (int16_t *coeffs)
{
    /* Zero the 64 coefficients (128 bytes) with eight 16-byte stores of v0. */
    asm ("vspltish 0,0\n"
         "stvx 0,%0,%1\n"
         "addi %1,%1,32\n"
         "stvx 0,%0,%2\n"
         "addi %2,%2,32\n"
         "stvx 0,%0,%1\n"
         "addi %1,%1,32\n"
         "stvx 0,%0,%2\n"
         "addi %2,%2,32\n"
         "stvx 0,%0,%1\n"
         "addi %1,%1,32\n"
         "stvx 0,%0,%2\n"
         "addi %2,%2,32\n"
         "stvx 0,%0,%1\n"
         "stvx 0,%0,%2\n"
         : : "b" (coeffs), "b" (0), "b" (16));
}

/***************************************************************
 *
 * Copyright:   (c) Copyright Motorola Inc. 1998
 *
 * Date:        April 17, 1998
 *
 * Function:    IDCT
 *
 * Description: Scaled Chen (III) algorithm for IDCT
 *              Arithmetic is 16-bit fixed point.
 *
 * Inputs:      input - Pointer to input data (short), which
 *              must be between -2048 and +2047.
 *              It is assumed that the allocated array
 *              has been 128-bit aligned and contains
 *              8x8 short elements.
 *
 * Outputs:     output - Pointer to output area for the transformed
 *              data. The output values are between -255
 *              and 255. It is assumed that a 128-bit
 *              aligned 8x8 array of short has been
 *              pre-allocated.
 *
 * Return:      None
 *
 ***************************************************************/

static const int16_t SpecialConstants[8] __attribute__ ((aligned (16))) = {
    23170, 13573, 6518, 21895, -23170, -21895, 32, 0
};

static const int16_t PreScale[64] __attribute__ ((aligned (16))) = {
    16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725,
    22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521,
    21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692,
    19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722,
};

void mlib_VideoIDCTAdd_U8_S16 (uint8_t *output, const int16_t *input,
                               int32_t stride)
{
    ASSERT (((int) output & 7) == 0);

    /* Load constants, input data, and prescale factors.  Do prescaling. */
    asm ("vspltish 31,0\n"
         "lvx 24,0,%1\n"
         "vspltish 23,4\n"
         "addi 5,0,0\n"
         "vsplth 29,24,4\n"
         "lvx 0,%0,5\n"
         "addi 6,0,16\n"
         "vsplth 28,24,3\n"
         "lvx 0+16,%2,5\n"
         "addi 7,0,32\n"
         "vsplth 27,24,2\n"
         "lvx 1,%0,6\n"
         "addi 8,0,48\n"
         "vsplth 26,24,1\n"
         "lvx 1+16,%2,6\n"
         "addi 5,0,64\n"
         "vsplth 25,24,0\n"
         "lvx 2,%0,7\n"
         "addi 6,0,80\n"
         "vslh 0,0,23\n"
         "lvx 2+16,%2,7\n"
         "addi 7,0,96\n"
         "vslh 1,1,23\n"
         "lvx 3,%0,8\n"
         "vslh 2,2,23\n"
         "lvx 3+16,%2,8\n"
         "addi 8,0,112\n"
         "vslh 3,3,23\n"
         "lvx 4,%0,5\n"
         "vsplth 30,24,5\n"
         "lvx 5,%0,6\n"
         "vsplth 24,24,6\n"
         "lvx 6,%0,7\n"
         "vslh 4,4,23\n"
         "lvx 7,%0,8\n"
         "vslh 5,5,23\n"
         "vmhraddshs 0,0,0+16,31\n"
         "vslh 6,6,23\n"
         "vmhraddshs 4,4,0+16,31\n"
         "vslh 7,7,23\n"
         : : "b" (input), "b" (SpecialConstants), "b" (PreScale)
         : "cc", "r5", "r6", "r7", "r8", "memory");
    asm ("vmhraddshs 1,1,1+16,31\n"
         "vmhraddshs 5,5,3+16,31\n"
         "vmhraddshs 2,2,2+16,31\n"
         "vmhraddshs 6,6,2+16,31\n"
         "vmhraddshs 3,3,3+16,31\n"
         "vmhraddshs 7,7,1+16,31\n"
         "vmhraddshs 11,27,7,1\n"
         "vmhraddshs 19,27,1,31\n"
         "vmhraddshs 12,26,6,2\n"
         "vmhraddshs 13,30,3,5\n"
         "vmhraddshs 17,28,5,3\n"
         "vsubshs 18,19,7\n");
    /* Second stage. */
    asm ("vmhraddshs 19,26,2,31\n"
         "vaddshs 15,0,4\n"
         "vsubshs 10,0,4\n"
         "vsubshs 14,19,6\n"
         "vaddshs 16,18,13\n"
         "vsubshs 13,18,13\n"
         "vsubshs 18,11,17\n"
         "vaddshs 11,11,17\n");
    /* Third stage. */
    asm ("vaddshs 17,15,12\n"
         "vsubshs 12,15,12\n"
         "vaddshs 15,10,14\n"
         "vsubshs 10,10,14\n"
         "vsubshs 14,18,13\n"
         "vaddshs 13,18,13\n");
    /* Fourth stage. */
    asm ("vmhraddshs 2,25,14,10\n"
         "vsubshs 4,12,16\n"
         "vmhraddshs 1,25,13,15\n"
         "vaddshs 0,17,11\n"
         "vmhraddshs 5,29,14,10\n"
         "vmrghh 0+8,0,4\n"
         "vaddshs 3,12,16\n"
         "vmrglh 1+8,0,4\n"
         "vmhraddshs 6,29,13,15\n"
         "vmrghh 2+8,1,5\n"
         "vsubshs 7,17,11\n");
    /* Transpose the matrix again. */
    asm ("vmrglh 3+8,1,5\n"
         "vmrghh 4+8,2,6\n"
         "vmrglh 5+8,2,6\n"
         "vmrghh 6+8,3,7\n"
         "vmrglh 7+8,3,7\n"
         "vmrghh 0+16,0+8,4+8\n"
         "vmrglh 1+16,0+8,4+8\n"
         "vmrghh 2+16,1+8,5+8\n"
         "vmrglh 3+16,1+8,5+8\n"
         "vmrghh 4+16,2+8,6+8\n"
         "vmrglh 5+16,2+8,6+8\n"
         "vmrghh 6+16,3+8,7+8\n"
         "vmrglh 7+16,3+8,7+8\n"
         "vmrglh 1,0+16,4+16\n"
         "vmrglh 7,3+16,7+16\n"
         "vmrglh 3,1+16,5+16\n"
         "vmrghh 2,1+16,5+16\n"
         "vmhraddshs 11,27,7,1\n"
         "vmrghh 6,3+16,7+16\n"
         "vmhraddshs 19,27,1,31\n"
         "vmrglh 5,2+16,6+16\n"
         "vmhraddshs 12,26,6,2\n"
         "vmrghh 0,0+16,4+16\n"
         "vmhraddshs 13,30,3,5\n"
         "vmrghh 4,2+16,6+16\n"
         "vmhraddshs 17,28,5,3\n"
         "vsubshs 18,19,7\n");
    /* Add a rounding bias for the final shift.  v0 is added into every
       vector, so the bias propagates from here. */
    asm ("vaddshs 0,0,24\n");
    /* Second stage. */
    asm ("vmhraddshs 19,26,2,31\n"
         "vaddshs 15,0,4\n"
         "vsubshs 10,0,4\n"
         "vsubshs 14,19,6\n"
         "vaddshs 16,18,13\n"
         "vsubshs 13,18,13\n"
         "vsubshs 18,11,17\n"
         "vaddshs 11,11,17\n");
    /* Third stage. */
    asm ("vaddshs 17,15,12\n"
         "vsubshs 12,15,12\n"
         "vaddshs 15,10,14\n"
         "vsubshs 10,10,14\n"
         "vsubshs 14,18,13\n"
         "vaddshs 13,18,13\n");
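
/*
 * Illustrative usage sketch (not part of the original altivec_mlib.c
 * listing, which breaks off above): it shows how a caller might prepare
 * the aligned buffers that mlib_ClearCoeffs and mlib_VideoIDCTAdd_U8_S16
 * expect, per the ASSERTs and the Motorola header comment.  The function
 * name idct_add_example and the chosen coefficient value are assumptions
 * for demonstration only.
 */
#include <stdint.h>
#include <string.h>

extern void mlib_ClearCoeffs (int16_t *coeffs);
extern void mlib_VideoIDCTAdd_U8_S16 (uint8_t *output, const int16_t *input,
                                      int32_t stride);

static void idct_add_example (void)
{
    /* The coefficient array must be 128-bit aligned; the output block must
       be at least 8-byte aligned (see the ASSERT in the IDCT function). */
    static int16_t coeffs[64] __attribute__ ((aligned (16)));
    static uint8_t block[8 * 8] __attribute__ ((aligned (16)));

    mlib_ClearCoeffs (coeffs);            /* zero all 64 coefficients       */
    coeffs[0] = 64;                       /* DC-only input, within -2048..+2047 */
    memset (block, 0, sizeof (block));    /* destination 8x8 block          */

    mlib_VideoIDCTAdd_U8_S16 (block, coeffs, 8 /* row stride in bytes */);
}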