📄 rgb2rgb_template.c

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 C
📖 第 1 页 / 共 5 页
字号:
		asm volatile(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%"REG_a", 2)	\n\t"
			PREFETCH" 32(%2, %%"REG_a")	\n\t"
			PREFETCH" 32(%3, %%"REG_a")	\n\t"
			"movq (%2, %%"REG_a"), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%"REG_a"), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%"REG_a",2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%"REG_a",2), %%mm5	\n\t" // Y(8)
			"movq %%mm3, %%mm4		\n\t" // Y(0)
			"movq %%mm5, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

			"add $8, %%"REG_a"		\n\t"
			"cmp %4, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			: "%"REG_a
		);
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)					\
	y1 = yc[n];					\
	y2 = yc2[n];					\
	u = uc[n];					\
	v = vc[n];					\
	asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));	\
	asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));	\
	asm("unpkbl %1, %0" : "=r"(u) : "r"(u));	\
	asm("unpkbl %1, %0" : "=r"(v) : "r"(v));	\
	yuv1 = (u << 8) + (v << 24);			\
	yuv2 = yuv1 + y2;				\
	yuv1 += y1;					\
	qdst[n] = yuv1;					\
	qdst2[n] = yuv2;

		int i;
		uint64_t *qdst = (uint64_t *) dst;
		uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
		const uint32_t *yc = (uint32_t *) ysrc;
		const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
		const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
		for(i = 0; i < chromWidth; i += 8){
			uint64_t y1, y2, yuv1, yuv2;
			uint64_t u, v;
			/* Prefetch */
			asm("ldq $31,64(%0)" :: "r"(yc));
			asm("ldq $31,64(%0)" :: "r"(yc2));
			asm("ldq $31,64(%0)" :: "r"(uc));
			asm("ldq $31,64(%0)" :: "r"(vc));

			pl2yuy2(0);
			pl2yuy2(1);
			pl2yuy2(2);
			pl2yuy2(3);

			yc += 4;
			yc2 += 4;
			uc += 4;
			vc += 4;
			qdst += 4;
			qdst2 += 4;
		}
		y++;
		ysrc += lumStride;
		dst += dstStride;

#elif __WORDSIZE >= 64
		int i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			uint64_t k, l;
			k = yc[0] + (uc[0] << 8) +
			    (yc[1] << 16) + (vc[0] << 24);
			l = yc[2] + (uc[1] << 8) +
			    (yc[3] << 16) + (vc[1] << 24);
			*ldst++ = k + (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		int i, *idst = (int32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
			*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
			    (yc[1] << 8) + (vc[0] << 0);
#else
			*idst++ = yc[0] + (uc[0] << 8) +
			    (yc[1] << 16) + (vc[0] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}

/**
 * Pack planar Y/U/V input into YUY2 by delegating to yuvPlanartoyuy2 with
 * one chroma row shared by every two luma rows.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16.
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride)
{
	/* Luma rows per chroma row. Chroma rows are reused as-is.
	 * FIXME: interpolate chroma instead. */
	const int lumRowsPerChromaRow = 2;

	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        lumRowsPerChromaRow);
}

/**
 * Interleave planar Y, U and V rows into packed UYVY (memory byte order
 * U Y V Y).
 *
 * Every output row consumes width luma bytes and width/2 bytes from each
 * chroma plane. The chroma pointers advance only after a luma row y for
 * which (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, i.e. one chroma
 * row is reused for vertLumPerChroma consecutive luma rows. The test is a
 * bit mask, so vertLumPerChroma is expected to be a power of two.
 *
 * None of the paths handles a tail: the MMX loop consumes 8 chroma samples
 * (16 pixels) per iteration and the 64-bit C loop consumes 2, so widths that
 * are not a multiple of that granularity read and write past the row end.
 *
 * @param ysrc,usrc,vsrc   source planes
 * @param dst              packed destination
 * @param width,height     picture size in luma samples
 * @param lumStride        byte distance between luma rows
 * @param chromStride      byte distance between chroma rows
 * @param dstStride        byte distance between destination rows
 * @param vertLumPerChroma luma rows per chroma row
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride, stride_t vertLumPerChroma)
{
	long y;
	const long chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
		asm volatile(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%"REG_a", 2)	\n\t"
			PREFETCH" 32(%2, %%"REG_a")	\n\t"
			PREFETCH" 32(%3, %%"REG_a")	\n\t"
			"movq (%2, %%"REG_a"), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%"REG_a"), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%"REG_a",2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%"REG_a",2), %%mm5	\n\t" // Y(8)
			"movq %%mm0, %%mm4		\n\t" // UVUV UVUV(0)
			"movq %%mm2, %%mm6		\n\t" // UVUV UVUV(8)
			"punpcklbw %%mm3, %%mm0		\n\t" // UYVY UYVY(0)
			"punpckhbw %%mm3, %%mm4		\n\t" // UYVY UYVY(4)
			"punpcklbw %%mm5, %%mm2		\n\t" // UYVY UYVY(8)
			"punpckhbw %%mm5, %%mm6		\n\t" // UYVY UYVY(12)

			MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

			"add $8, %%"REG_a"		\n\t"
			"cmp %4, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
			/* The loop stores through dst, which is only an input operand,
			 * so the compiler must be told that memory is modified. */
			: "memory", "%"REG_a
		);
#else
//FIXME adapt the alpha asm code from yv12->yuy2

/* The 64-bit packing below lays bytes out correctly only on little-endian
 * hosts; big-endian builds use the 32-bit path, which has an explicit
 * WORDS_BIGENDIAN variant. */
#if __WORDSIZE >= 64 && !defined(WORDS_BIGENDIAN)
		long i;
		uint64_t *ldst = (uint64_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i += 2){
			/* Widen to 64 bits before shifting: in int arithmetic a luma
			 * value >= 128 shifted by 24 sets the sign bit, and the
			 * sign-extended k would then corrupt the upper half (l). */
			const uint64_t k = (uint64_t)uc[0] | ((uint64_t)yc[0] << 8) |
			    ((uint64_t)vc[0] << 16) | ((uint64_t)yc[1] << 24);
			const uint64_t l = (uint64_t)uc[1] | ((uint64_t)yc[2] << 8) |
			    ((uint64_t)vc[1] << 16) | ((uint64_t)yc[3] << 24);
			*ldst++ = k | (l << 32);
			yc += 4;
			uc += 2;
			vc += 2;
		}

#else
		long i;
		uint32_t *idst = (uint32_t *) dst;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		for(i = 0; i < chromWidth; i++){
			/* Unsigned 32-bit arithmetic avoids shifting into the sign bit. */
#ifdef WORDS_BIGENDIAN
			*idst++ = ((uint32_t)uc[0] << 24) | ((uint32_t)yc[0] << 16) |
			    ((uint32_t)vc[0] << 8) | (uint32_t)yc[1];
#else
			*idst++ = (uint32_t)uc[0] | ((uint32_t)yc[0] << 8) |
			    ((uint32_t)vc[0] << 16) | ((uint32_t)yc[1] << 24);
#endif
			yc += 2;
			uc++;
			vc++;
		}
#endif
#endif
		/* Step to the next chroma row only after the last luma row that shares it. */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}

/**
 * Interleave planar Y, U and V rows into packed VYUY (memory byte order
 * V Y U Y).
 *
 * Every output row consumes 2*(width/2) luma bytes and width/2 bytes from
 * each chroma plane. The chroma pointers advance only after a luma row y for
 * which (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so one chroma row
 * is reused for vertLumPerChroma consecutive luma rows (the mask test
 * expects a power of two).
 *
 * @param ysrc,usrc,vsrc   source planes
 * @param dst              packed destination
 * @param width,height     picture size in luma samples
 * @param lumStride        byte distance between luma rows
 * @param chromStride      byte distance between chroma rows
 * @param dstStride        byte distance between destination rows
 * @param vertLumPerChroma luma rows per chroma row
 */
static inline void RENAME(yuvPlanartovyuy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride, stride_t vertLumPerChroma)
{
	long y;
	const long chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
		long i;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		uint8_t *out = dst;
		for(i = 0; i < chromWidth; i++){
			/* Store byte by byte: the layout is then the same on little-
			 * and big-endian hosts, no 8-bit value is shifted into the
			 * sign bit of an int, and dst needs no 32-bit alignment. */
			out[0] = vc[0];
			out[1] = yc[0];
			out[2] = uc[0];
			out[3] = yc[1];
			out += 4;
			yc += 2;
			uc++;
			vc++;
		}
		/* Step to the next chroma row only after the last luma row that shares it. */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
}

/**
 * Interleave planar Y, U and V rows into packed YVYU (memory byte order
 * Y V Y U).
 *
 * Every output row consumes 2*(width/2) luma bytes and width/2 bytes from
 * each chroma plane. The chroma pointers advance only after a luma row y for
 * which (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so one chroma row
 * is reused for vertLumPerChroma consecutive luma rows (the mask test
 * expects a power of two).
 *
 * @param ysrc,usrc,vsrc   source planes
 * @param dst              packed destination
 * @param width,height     picture size in luma samples
 * @param lumStride        byte distance between luma rows
 * @param chromStride      byte distance between chroma rows
 * @param dstStride        byte distance between destination rows
 * @param vertLumPerChroma luma rows per chroma row
 */
static inline void RENAME(yuvPlanartoyvyu)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride, stride_t vertLumPerChroma)
{
	long y;
	const long chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
		long i;
		const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
		uint8_t *out = dst;
		for(i = 0; i < chromWidth; i++){
			/* Store byte by byte: the layout is then the same on little-
			 * and big-endian hosts, no 8-bit value is shifted into the
			 * sign bit of an int, and dst needs no 32-bit alignment. */
			out[0] = yc[0];
			out[1] = vc[0];
			out[2] = yc[1];
			out[3] = uc[0];
			out += 4;
			yc += 2;
			uc++;
			vc++;
		}
		/* Step to the next chroma row only after the last luma row that shares it. */
		if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
}

/**
 * Pack planar Y/U/V input into UYVY by delegating to yuvPlanartouyvy with
 * one chroma row shared by every two luma rows.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16.
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride)
{
	/* Luma rows per chroma row. Chroma rows are reused as-is.
	 * FIXME: interpolate chroma instead. */
	const int lumRowsPerChromaRow = 2;

	RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        lumRowsPerChromaRow);
}

/**
 * Pack planar Y/U/V input into YVYU by delegating to yuvPlanartoyvyu with
 * one chroma row shared by every two luma rows.
 */
static inline void RENAME(yv12toyvyu)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride)
{
	/* Luma rows per chroma row. Chroma rows are reused as-is.
	 * FIXME: interpolate chroma instead. */
	const int lumRowsPerChromaRow = 2;

	RENAME(yuvPlanartoyvyu)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        lumRowsPerChromaRow);
}

/**
 * Pack planar Y/U/V input into VYUY by delegating to yuvPlanartovyuy with
 * one chroma row shared by every two luma rows.
 */
static inline void RENAME(yv12tovyuy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride)
{
	/* Luma rows per chroma row. Chroma rows are reused as-is.
	 * FIXME: interpolate chroma instead. */
	const int lumRowsPerChromaRow = 2;

	RENAME(yuvPlanartovyuy)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        lumRowsPerChromaRow);
}

/**
 * Pack planar Y/U/V input into YUY2 by delegating to yuvPlanartoyuy2 with
 * one chroma row per luma row (no vertical chroma sharing).
 *
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t dstStride)
{
	/* Every luma row has its own chroma row. */
	const int lumRowsPerChromaRow = 1;

	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
	                        width, height,
	                        lumStride, chromStride, dstStride,
	                        lumRowsPerChromaRow);
}

/**
 * Split packed YUYV input (byte order Y U Y V) into separate Y, U and V
 * planes.
 *
 * Source rows are consumed in pairs: the first row of each pair yields luma
 * and chroma, the second yields luma only and its chroma bytes are dropped.
 * The chroma planes therefore get one row per two source rows, taken
 * unfiltered from the even rows.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16:
 * the loop steps y by 2, and the MMX path handles 8 chroma samples
 * (16 pixels, 32 source bytes) per iteration without a tail.
 *
 * @param src          packed source
 * @param ydst         luma destination plane
 * @param udst,vdst    chroma destination planes
 * @param width,height picture size in luma samples
 * @param lumStride    byte distance between ydst rows
 * @param chromStride  byte distance between udst/vdst rows
 * @param srcStride    byte distance between src rows
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	long width, long height,
	stride_t lumStride, stride_t chromStride, stride_t srcStride)
{
	long y;
	const long chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* Even row: extract luma and both chroma planes.
		 * mm7 is loaded with the per-word low-byte mask here and is reused
		 * by the odd-row asm block below. */
		asm volatile(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%"REG_a", 4)	\n\t"
			"movq (%0, %%"REG_a", 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"

			"movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%"REG_a")	\n\t"
			MOVNTQ" %%mm2, (%2, %%"REG_a")	\n\t"

			"add $8, %%"REG_a"		\n\t"
			"cmp %4, %%"REG_a"		\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%"REG_a
		);

		ydst += lumStride;
		src  += srcStride;

		/* Odd row: luma only. This block does not initialise mm7; it relies
		 * on the mask left there by the block above. The udst/vdst operands
		 * are passed but not referenced by the instructions. */
		asm volatile(
			"xor %%"REG_a", %%"REG_a"	\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%"REG_a", 4)	\n\t"
			"movq (%0, %%"REG_a", 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
			MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"

			"add $8, %%"REG_a"		\n\t"
			"cmp %4, %%"REG_a"		\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
			: "memory", "%"REG_a
		);
#else
		long i;
		/* Even row: bytes 0 and 2 of each 4-byte group are luma, byte 1 is U, byte 3 is V. */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			udst[i] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+2];
			vdst[i] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* Odd row: keep luma only, the chroma bytes are skipped. */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			ydst[2*i+1] 	= src[4*i+2];
		}
#endif
		/* One chroma row was written for the pair; ydst/src step past the odd row. */
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
/* EMMS/SFENCE are macros defined outside this chunk; presumably they leave
 * MMX state and order the MOVNTQ stores before returning (confirm against
 * their definitions). */
asm volatile(   EMMS" \n\t"
        	SFENCE" \n\t"
        	:::"memory");
#endif
}

/**
 * Copy the luma plane from ysrc to ydst.
 *
 * Only the Y plane is handled: usrc, vsrc, udst, vdst and chromStride are
 * not used and the chroma destination planes are left untouched
 * (XXX: implement upscaling for U,V).
 *
 * lumStride is applied to both the source and the destination plane. When
 * it equals width the plane is contiguous and is copied in one call;
 * otherwise rows are copied one at a time so padded planes are handled
 * instead of copying padding bytes as if they were pixels.
 *
 * @param ysrc         source luma plane
 * @param ydst         destination luma plane
 * @param width,height luma plane size; nothing is copied if either is <= 0
 * @param lumStride    byte distance between luma rows (source and destination)
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
	uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	long width, long height, stride_t lumStride, stride_t chromStride)
{
	long y;

	/* A non-positive size would turn into a huge size_t inside memcpy. */
	if(width <= 0 || height <= 0)
		return;

	/* Y Plane */
	if(lumStride == width)
	{
		memcpy(ydst, ysrc, (size_t)width*(size_t)height);
	}
	else
	{
		for(y=0; y<height; y++)
		{
			memcpy(ydst, ysrc, (size_t)width);
			ysrc += lumStride;
			ydst += lumStride;
		}
	}

	/* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, stride_t srcStride, stride_t dstStride)
{
	long x,y;

	dst[0]= src[0];

	// first line
	for(x=0; x<srcWidth-1; x++){
		dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
		dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
	}
	dst[2*srcWidth-1]= src[srcWidth-1];

	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const long mmxSize= srcWidth&~15;
		asm volatile(
			"mov %4, %%"REG_a"		\n\t"
			"1:				\n\t"
			"movq (%0, %%"REG_a"), %%mm0	\n\t"
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -