📄 dct.cxx

📁 sloedgy open sip stack source code
💻 CXX
📖 第 1 页 / 共 3 页
字号:
	dc = LIMIT(dc, t);
	dc |= dc << 8;
	dc |= dc << 16;
#ifdef INT_64
	INT_64 xdc = dc;
	xdc |= xdc << 32;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
#else
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
#endif
}

/*
 * This routine mixes the DC & AC components of an 8x8 block of
 * pixels.  This routine is called for every block decoded so it
 * needs to be efficient.  It tries to do as many pixels in parallel
 * as will fit in a word.  The one complication is that it has to
 * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow
 * & overflow are only possible if both terms have the same sign and
 * are indicated by the result having a different sign than the terms.
 * Note that underflow is more worrisome than overflow since it results
 * in bright white dots in a black field.
 * The DC term and sum are biased by 128 so a negative number has the
 * 2^7 bit = 0.  The AC term is not biased so a negative number has
 * the 2^7 bit = 1.  So underflow is indicated by (DC & AC & sum) != 0;
 */
#define MIX_LOGIC(sum, a, b, omask, uflo) \
{ \
	sum = a + b; \
	uflo = (a ^ b) & (a ^ sum) & omask; \
	if (uflo) { \
		if ((b = uflo & a) != 0) { \
			/* integer overflows */ \
			b |= b >> 1; \
			b |= b >> 2; \
			b |= b >> 4; \
			sum |= b; \
		} \
		if ((uflo &=~ b) != 0) { \
			/* integer underflow(s) */ \
			uflo |= uflo >> 1; \
			uflo |= uflo >> 2; \
			uflo |= uflo >> 4; \
			sum &= ~uflo; \
		} \
	} \
}
/*
 * Table of products of 8-bit scaled coefficients
 * and idct coefficients (there are only 33 unique
 * coefficients so we index via a compact ID).
 */
extern "C" u_char multab[];
/*
 * Array of coefficient ID's used to index multab.
 */
extern "C" u_int dct_basis[64][64 / sizeof(u_int)];

/*XXX*/
#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))

void
bv_rdct1(int dc, short* bp, int acx, u_char* out, int stride)
{
	u_int omask = 0x80808080;
	u_int uflo;
	u_int* vp = dct_basis[acx];
	int s = LIMIT_512(bp[acx]);
	s = (s >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt = (char*)&multab[s << 7];

	dc |= dc << 8;
	dc |= dc << 16;
	for (int k = 8; --k >= 0; ) {
		u_int v = *vp++;
		u_int m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)out = v;
		v = *vp++;
		m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)(out + 4) = v;
		out += stride;
	}
}

/* XXX this version has to be exact */
void
bv_rdct2(int dc, short* bp, int ac0, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	const char* mt = (const char*)&multab[s0 << 7];
	const u_int* vp0 = dct_basis[ac0];

	dc |= dc << 8;
	dc |= dc << 16;
	u_int omask = 0x80808080;
	u_int uflo;
	for (int k = 8; --k >= 0; ) {
		u_int v, m, i;
		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[0] << SHIFT(24) | in[1] << SHIFT(16) |
		    in[2] << SHIFT(8) | in[3] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)out = m;

		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[4] << SHIFT(24) | in[5] << SHIFT(16) |
		    in[6] << SHIFT(8) | in[7] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)(out + 4) = m;
		out += stride;
		in += stride;
	}
}

/* XXX this version has to be exact */
void
bv_rdct3(int dc, short* bp, int ac0, int ac1, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt0 = (char*)&multab[s0 << 7];

	int s1 = LIMIT_512(bp[ac1]);
	s1 = (s1 >> 2) & 0xff;
	char* mt1 = (char*)&multab[s1 << 7];

	u_int* vp0 = dct_basis[ac0];
	u_int* vp1 = dct_basis[ac1];

	for (int k = 8; --k >= 0; ) {
		int t;
		u_int v0 = *vp0++;
		u_int v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[0] + dc;
		u_int m = LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[1] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[2] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[3] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)out = m;

		v0 = *vp0++;
		v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[4] + dc;
		m = 0;
		m |= LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[5] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[6] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[7] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)(out + 4) = m;

		out += stride;
		in += stride;
	}
}

#ifdef INT_64
/*XXX assume little-endian */
#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
#define DID4PIX 
#define PSTORE ((INT_64*)p)[0] = pix
#define PIXDEF INT_64 pix = 0; int v, oflo = 0
#else
#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
#define DID4PIX pix0 = pix; pix = 0
#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
#define PIXDEF	u_int pix0, pix = 0; int v, oflo = 0
#endif
#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)

/*
 * A 2D Inverse DCT based on a column-row decomposition using
 * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8
 * Pennebaker & Mitchell (i.e., the pink JPEG book).  This figure
 * is the forward transform; reverse the flowgraph for the inverse
 * (you need to draw a picture).  The outputs are DFT coefficients
 * and need to be scaled according to Eq [4-3].
 *
 * The input coefficients are, counter to tradition, in column-order.
 * The bit mask indicates which coefficients are non-zero.  If the
 * corresponding bit is zero, then the coefficient is assumed zero
 * and the input coefficient is not referenced and need not be defined.
 * The 8-bit outputs are computed in row order and placed in the
 * output array pointed to by p, with each of the eight 8-byte lines
 * offset by "stride" bytes.
 *
 * qt is the inverse quantization table in column order.  These
 * coefficients are the product of the inverse quantization factor,
 * specified by the jpeg quantization table, and the first multiplier
 * in the inverse DCT flow graph.  This first multiplier is actually
 * biased by the row term so that the columns are pre-scaled to
 * eliminate the first row multiplication stage.
 *
 * The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
 * which is relevant to jpeg.
 */
void
#ifdef INT_64
rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
#else
rdct(register short *bp, u_int m0, u_int m1, u_char* p,
     int stride, const int* qt)
#endif
{
	/*
	 * First pass is 1D transform over the columns.  Note that
	 * the coefficients are stored in column order, so even
	 * though we're accessing the columns, we access them
	 * in a row-like fashion.  We use a column-row decomposition
	 * instead of a row-column decomposition so we can splice
	 * pixels in an efficient, row-wise order in the second
	 * stage.
	 *
	 * The inverse quantization is folded together with the
	 * first stage multiplier.  This reduces the number of
	 * multiplies (per 8-pt transform) from 11 to 5 (ignoring
	 * the inverse quantization which must be done anyway).
	 *
	 * Because we compute appropriately scaled column DCTs,
	 * the row DCTs require only 5 multiplies per row total.
	 * So the total number of multiplies for the 8x8 DCT is 80
	 * (ignoring the checks for zero coefficients).
	 */
	int tmp[64];
	int* tp = tmp;
	int i;
	for (i = 8; --i >= 0; ) {
		if ((m0 & 0xfe) == 0) {
			/* AC terms all zero */
			int v = M(0) ? qt[0] * bp[0] : 0;
			tp[0] = v;
			tp[1] = v;
			tp[2] = v;
			tp[3] = v;
			tp[4] = v;
			tp[5] = v;
			tp[6] = v;
			tp[7] = v;
		} else {
			int t0, t1, t2, t3, t4, t5, t6, t7;

			if ((m0 & 0xaa) == 0)
				t4 = t5 = t6 = t7 = 0;
			else {
				t0 = M(5) ? qt[5] * bp[5] : 0;
				t2 = M(1) ? qt[1] * bp[1] : 0;
				t6 = M(7) ? qt[7] * bp[7] : 0;
				t7 = M(3) ? qt[3] * bp[3] : 0;

				t4 = t0 - t7;
				t1 = t2 + t6;
				t6 = t2 - t6;
				t7 += t0;

				t5 = t1 - t7;
				t7 += t1;

				t2 = FP_MUL(-A5, t4 + t6);
				t4 = FP_MUL(-A2, t4);
				t0 = t4 + t2;
				t1 = FP_MUL(A3, t5);
				t2 += FP_MUL(A4, t6);
			
				t4 = -t0;
				t5 = t1 - t0;
				t6 = t1 + t2;
				t7 += t2;
			}

#ifdef notdef
			if ((m0 & 0x55) == 0)
				t0 = t1 = t2 = t3 = 0;
			else {
#endif
				t0 = M(0) ? qt[0] * bp[0] : 0;
				t1 = M(4) ? qt[4] * bp[4] : 0;
				int x0 = t0 + t1;
				int x1 = t0 - t1;

				t0 = M(2) ? qt[2] * bp[2] : 0;
				t3 = M(6) ? qt[6] * bp[6] : 0;
				t2 = t0 - t3;
				t3 += t0;

				t2 = FP_MUL(A1, t2);
				t3 += t2;

				t0 = x0 + t3;
				t1 = x1 + t2;
				t2 = x1 - t2;
				t3 = x0 - t3;
#ifdef notdef
			}
#endif

			tp[0] = t0 + t7;
			tp[7] = t0 - t7;
			tp[1] = t1 + t6;
			tp[6] = t1 - t6;
			tp[2] = t2 + t5;
			tp[5] = t2 - t5;
			tp[3] = t3 + t4;
			tp[4] = t3 - t4;
		}
		tp += 8;
		bp += 8;
		qt += 8;

		m0 >>= 8;
#ifndef INT_64
		m0 |= m1 << 24;
		m1 >>= 8;
#endif
	}
	tp -= 64;
	/*
	 * Second pass is 1D transform over the rows.  Note that
	 * the coefficients are stored in column order, so even
	 * though we're accessing the rows, we access them
	 * in a column-like fashion.
	 *
	 * The pass above already computed the first multiplier
	 * in the flow graph.
	 */
	for (i = 8; --i >= 0; ) {
		int t0, t1, t2, t3, t4, t5, t6, t7;

		t0 = tp[8*5];
		t2 = tp[8*1];
		t6 = tp[8*7];
		t7 = tp[8*3];

#ifdef notdef
		if ((t0 | t2 | t6 | t7) == 0) {
			t4 = t5 = 0;
		} else 
#endif
{
			t4 = t0 - t7;
			t1 = t2 + t6;
			t6 = t2 - t6;
			t7 += t0;

			t5 = t1 - t7;
			t7 += t1;

			t2 = FP_MUL(-A5, t4 + t6);
			t4 = FP_MUL(-A2, t4);
			t0 = t4 + t2;
			t1 = FP_MUL(A3, t5);
			t2 += FP_MUL(A4, t6);
			
			t4 = -t0;
			t5 = t1 - t0;
			t6 = t1 + t2;
			t7 += t2;
		}

		t0 = tp[8*0];
		t1 = tp[8*4];
		int x0 = t0 + t1;
		int x1 = t0 - t1;

		t0 = tp[8*2];
		t3 = tp[8*6];
		t2 = t0 - t3;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -