dct.cpp

来自「MPEG4音频视频压缩编码(含G.711/ACC/H.261等)」· C++ 代码 · 共 2,018 行 · 第 1/5 页
CPP
2,018 行
		out += stride;	}}void dcfill(int DC, u_char* out, int stride){	int t;	u_int dc = DC;	dc = LIMIT(dc, t);	dc |= dc << 8;	dc |= dc << 16;#ifdef INT_64	INT_64 xdc = dc;	xdc |= xdc << 32;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;	out += stride;	*(INT_64 *)out = xdc;#else	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;	out += stride;	*(u_int*)out = dc;	*(u_int*)(out + 4) = dc;#endif}#endif/* * This routine mixes the DC & AC components of an 8x8 block of * pixels.  This routine is called for every block decoded so it * needs to be efficient.  It tries to do as many pixels in parallel * as will fit in a word.  The one complication is that it has to * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow * & overflow are only possible if both terms have the same sign and * are indicated by the result having a different sign than the terms. * Note that underflow is more worrisome than overflow since it results * in bright white dots in a black field. * The DC term and sum are biased by 128 so a negative number has the * 2^7 bit = 0.  The AC term is not biased so a negative number has * the 2^7 bit = 1.  So underflow is indicated by (DC & AC & sum) != 0; */#define MIX_LOGIC(sum, a, b, omask, uflo) \{ \	sum = a + b; \	uflo = (a ^ b) & (a ^ sum) & omask; \	if (uflo) { \		if ((b = uflo & a) != 0) { \			/* integer overflows */ \			b |= b >> 1; \			b |= b >> 2; \			b |= b >> 4; \			sum |= b; \		} \		if ((uflo &=~ b) != 0) { \			/* integer underflow(s) */ \			uflo |= uflo >> 1; \			uflo |= uflo >> 2; \			uflo |= uflo >> 4; \			sum &= ~uflo; \		} \	} \}/* * Table of products of 8-bit scaled coefficients * and idct coefficients (there are only 33 unique * coefficients so we index via a compact ID). */extern "C" u_char multab[];/* * Array of coefficient ID's used to index multab. */extern "C" u_int dct_basis[64][64 / sizeof(u_int)];/*FIXME*/#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))#ifdef INT_64/*FIXME assume little-endian */#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)#define DID4PIX#define PSTORE ((INT_64*)p)[0] = pix#define PIXDEF INT_64 pix = 0; int v, oflo = 0#else#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)#define DID4PIX pix0 = pix; pix = 0#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix#define PIXDEF	u_int pix0, pix = 0; int v, oflo = 0#endif#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)#if MMX_DCT_ENABLED/* this code invokes mmx version of rdct.  This routine uses the algorithm * described by C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast * 1-D DCT Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, * Speech, and Signal Processing 1989.  The algorithm itself is slower but more * accurate than the AAN algorithm used in the standard version of this routine. */char *output_buf[8];       // array of pointers to shortchar output_space[8][8];   // space that assembly code can write tovoid#ifdef INT_64rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)#elserdct(register short *bp, u_int m0, u_int m1, u_char* p,     int stride, const int* qt)#endif{  int workspace[68];  short bpt[8][8];  short qtmx[64];  for (int i = 0; i < 64; i++) {    qtmx[i] = (short)qt[i];    bpt[i & 7][i >> 3] = *(bp + i);  }  domidct8x8llmW(&bpt[0][0], qtmx, workspace, p, stride);}#else/* The following code is the standard implementation of the DCT *//* * A 2D Inverse DCT based on a column-row decomposition using * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8 * Pennebaker & Mitchell (i.e., the pink JPEG book).  This figure * is the forward transform; reverse the flowgraph for the inverse * (you need to draw a picture).  The outputs are DFT coefficients * and need to be scaled according to Eq [4-3]. * * The input coefficients are, counter to tradition, in column-order. * The bit mask indicates which coefficients are non-zero.  If the * corresponding bit is zero, then the coefficient is assumed zero * and the input coefficient is not referenced and need not be defined. * The 8-bit outputs are computed in row order and placed in the * output array pointed to by p, with each of the eight 8-byte lines * offset by "stride" bytes. * * qt is the inverse quantization table in column order.  These * coefficients are the product of the inverse quantization factor, * specified by the jpeg quantization table, and the first multiplier * in the inverse DCT flow graph.  This first multiplier is actually * biased by the row term so that the columns are pre-scaled to * eliminate the first row multiplication stage. * * The output is biased by 128, i.e., [-128,127] is mapped to [0,255], * which is relevant to jpeg. */void#ifdef INT_64rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)#elserdct(register short *bp, u_int m0, u_int m1, u_char* p,     int stride, const int* qt)#endif{	/*	 * First pass is 1D transform over the columns.  Note that	 * the coefficients are stored in column order, so even	 * though we're accessing the columns, we access them	 * in a row-like fashion.  We use a column-row decomposition	 * instead of a row-column decomposition so we can splice	 * pixels in an efficient, row-wise order in the second	 * stage.	 *	 * The inverse quantization is folded together with the	 * first stage multiplier.  This reduces the number of	 * multiplies (per 8-pt transform) from 11 to 5 (ignoring	 * the inverse quantization which must be done anyway).	 *	 * Because we compute appropriately scaled column DCTs,	 * the row DCTs require only 5 multiplies per row total.	 * So the total number of multiplies for the 8x8 DCT is 80	 * (ignoring the checks for zero coefficients).	 */	int tmp[64];	int* tp = tmp;	int i;	for (i = 8; --i >= 0; ) {		if ((m0 & 0xfe) == 0) {			/* AC terms all zero */			int v = M(0) ? qt[0] * bp[0] : 0;			tp[0] = v;			tp[1] = v;			tp[2] = v;			tp[3] = v;			tp[4] = v;			tp[5] = v;			tp[6] = v;			tp[7] = v;		} else {			int t0, t1, t2, t3, t4, t5, t6, t7;			if ((m0 & 0xaa) == 0)				t4 = t5 = t6 = t7 = 0;			else {				t0 = M(5) ? qt[5] * bp[5] : 0;				t2 = M(1) ? qt[1] * bp[1] : 0;				t6 = M(7) ? qt[7] * bp[7] : 0;				t7 = M(3) ? qt[3] * bp[3] : 0;				t4 = t0 - t7;				t1 = t2 + t6;				t6 = t2 - t6;				t7 += t0;				t5 = t1 - t7;				t7 += t1;				t2 = FP_MUL(-A5, t4 + t6);				t4 = FP_MUL(-A2, t4);				t0 = t4 + t2;				t1 = FP_MUL(A3, t5);				t2 += FP_MUL(A4, t6);				t4 = -t0;				t5 = t1 - t0;				t6 = t1 + t2;				t7 += t2;			}#ifdef notdef			if ((m0 & 0x55) == 0)				t0 = t1 = t2 = t3 = 0;			else {#endif				t0 = M(0) ? qt[0] * bp[0] : 0;				t1 = M(4) ? qt[4] * bp[4] : 0;				int x0 = t0 + t1;				int x1 = t0 - t1;				t0 = M(2) ? qt[2] * bp[2] : 0;				t3 = M(6) ? qt[6] * bp[6] : 0;				t2 = t0 - t3;				t3 += t0;				t2 = FP_MUL(A1, t2);				t3 += t2;				t0 = x0 + t3;				t1 = x1 + t2;				t2 = x1 - t2;				t3 = x0 - t3;#ifdef notdef			}#endif			tp[0] = t0 + t7;			tp[7] = t0 - t7;			tp[1] = t1 + t6;			tp[6] = t1 - t6;			tp[2] = t2 + t5;			tp[5] = t2 - t5;			tp[3] = t3 + t4;			tp[4] = t3 - t4;		}		tp += 8;		bp += 8;		qt += 8;		m0 >>= 8;#ifndef INT_64		m0 |= m1 << 24;		m1 >>= 8;#endif	}	tp -= 64;	/*	 * Second pass is 1D transform over the rows.  Note that	 * the coefficients are stored in column order, so even	 * though we're accessing the rows, we access them	 * in a column-like fashion.	 *	 * The pass above already computed the first multiplier	 * in the flow graph.	 */	for (i = 8; --i >= 0; ) {		int t0, t1, t2, t3, t4, t5, t6, t7;		t0 = tp[8*5];		t2 = tp[8*1];		t6 = tp[8*7];		t7 = tp[8*3];#ifdef notdef		if ((t0 | t2 | t6 | t7) == 0) {			t4 = t5 = 0;		} else#endif{			t4 = t0 - t7;			t1 = t2 + t6;			t6 = t2 - t6;			t7 += t0;			t5 = t1 - t7;			t7 += t1;			t2 = FP_MUL(-A5, t4 + t6);			t4 = FP_MUL(-A2, t4);			t0 = t4 + t2;			t1 = FP_MUL(A3, t5);			t2 += FP_MUL(A4, t6);			t4 = -t0;			t5 = t1 - t0;			t6 = t1 + t2;			t7 += t2;		}		t0 = tp[8*0];		t1 = tp[8*4];		int x0 = t0 + t1;		int x1 = t0 - t1;		t0 = tp[8*2];		t3 = tp[8*6];		t2 = t0 - t3;		t3 += t0;		t2 = FP_MUL(A1, t2);		t3 += t2;		t0 = x0 + t3;		t1 = x1 + t2;		t2 = x1 - t2;		t3 = x0 - t3;		PIXDEF;		DOJPIX(t0 + t7, 0);		DOJPIX(t1 + t6, 1);		DOJPIX(t2 + t5, 2);		DOJPIX(t3 + t4, 3);		DID4PIX;		DOJPIX(t3 - t4, 4);		DOJPIX(t2 - t5, 5);		DOJPIX(t1 - t6, 6);		DOJPIX(t0 - t7, 7);		if (oflo & ~0xff) {			int t;			pix = 0;			DOJPIXLIMIT(t0 + t7, 0);			DOJPIXLIMIT(t1 + t6, 1);			DOJPIXLIMIT(t2 + t5, 2);			DOJPIXLIMIT(t3 + t4, 3);			DID4PIX;			DOJPIXLIMIT(t3 - t4, 4);			DOJPIXLIMIT(t2 - t5, 5);			DOJPIXLIMIT(t1 - t6, 6);			DOJPIXLIMIT(t0 - t7, 7);		}		PSTORE;		++tp;		p += stride;	}}#endif/* * Inverse 2-D transform, similar to routine above (see comment above), * but more appropriate for H.261 instead of JPEG.  This routine does * not bias the output by 128, and has an additional argument which is * an input array which gets summed together with the inverse-transform. * For example, this allows motion-compensation to be folded in here, * saving an extra traversal of the block.  The input pointer can be * null, if motion-compensation is not needed.
dct.cpp - 源码说明

本页面展示了「MPEG4音频视频压缩编码(含G.711/ACC/H.261等)」中的 dct.cpp 源码文件，采用 C++ 编程语言编写，共 2,018 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与MPEG4相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?