📄 dct.cpp
字号:
out += stride; }}void dcfill(int DC, u_char* out, int stride){ int t; u_int dc = DC; dc = LIMIT(dc, t); dc |= dc << 8; dc |= dc << 16;#ifdef INT_64 INT_64 xdc = dc; xdc |= xdc << 32; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc; out += stride; *(INT_64 *)out = xdc;#else *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc; out += stride; *(u_int*)out = dc; *(u_int*)(out + 4) = dc;#endif}#endif/* * This routine mixes the DC & AC components of an 8x8 block of * pixels. This routine is called for every block decoded so it * needs to be efficient. It tries to do as many pixels in parallel * as will fit in a word. The one complication is that it has to * deal with overflow (sum > 255) and underflow (sum < 0). Underflow * & overflow are only possible if both terms have the same sign and * are indicated by the result having a different sign than the terms. * Note that underflow is more worrisome than overflow since it results * in bright white dots in a black field. * The DC term and sum are biased by 128 so a negative number has the * 2^7 bit = 0. The AC term is not biased so a negative number has * the 2^7 bit = 1. So underflow is indicated by (DC & AC & sum) != 0; */#define MIX_LOGIC(sum, a, b, omask, uflo) \{ \ sum = a + b; \ uflo = (a ^ b) & (a ^ sum) & omask; \ if (uflo) { \ if ((b = uflo & a) != 0) { \ /* integer overflows */ \ b |= b >> 1; \ b |= b >> 2; \ b |= b >> 4; \ sum |= b; \ } \ if ((uflo &=~ b) != 0) { \ /* integer underflow(s) */ \ uflo |= uflo >> 1; \ uflo |= uflo >> 2; \ uflo |= uflo >> 4; \ sum &= ~uflo; \ } \ } \}/* * Table of products of 8-bit scaled coefficients * and idct coefficients (there are only 33 unique * coefficients so we index via a compact ID). */extern "C" u_char multab[];/* * Array of coefficient ID's used to index multab. */extern "C" u_int dct_basis[64][64 / sizeof(u_int)];/*FIXME*/#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))#ifdef INT_64/*FIXME assume little-endian */#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)#define DID4PIX#define PSTORE ((INT_64*)p)[0] = pix#define PIXDEF INT_64 pix = 0; int v, oflo = 0#else#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)#define DID4PIX pix0 = pix; pix = 0#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix#define PIXDEF u_int pix0, pix = 0; int v, oflo = 0#endif#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)#if MMX_DCT_ENABLED/* this code invokes mmx version of rdct. This routine uses the algorithm * described by C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast * 1-D DCT Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, * Speech, and Signal Processing 1989. The algorithm itself is slower but more * accurate than the AAN algorithm used in the standard version of this routine. */char *output_buf[8]; // array of pointers to shortchar output_space[8][8]; // space that assembly code can write tovoid#ifdef INT_64rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)#elserdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const int* qt)#endif{ int workspace[68]; short bpt[8][8]; short qtmx[64]; for (int i = 0; i < 64; i++) { qtmx[i] = (short)qt[i]; bpt[i & 7][i >> 3] = *(bp + i); } domidct8x8llmW(&bpt[0][0], qtmx, workspace, p, stride);}#else/* The following code is the standard implementation of the DCT *//* * A 2D Inverse DCT based on a column-row decomposition using * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8 * Pennebaker & Mitchell (i.e., the pink JPEG book). This figure * is the forward transform; reverse the flowgraph for the inverse * (you need to draw a picture). The outputs are DFT coefficients * and need to be scaled according to Eq [4-3]. * * The input coefficients are, counter to tradition, in column-order. * The bit mask indicates which coefficients are non-zero. If the * corresponding bit is zero, then the coefficient is assumed zero * and the input coefficient is not referenced and need not be defined. * The 8-bit outputs are computed in row order and placed in the * output array pointed to by p, with each of the eight 8-byte lines * offset by "stride" bytes. * * qt is the inverse quantization table in column order. These * coefficients are the product of the inverse quantization factor, * specified by the jpeg quantization table, and the first multiplier * in the inverse DCT flow graph. This first multiplier is actually * biased by the row term so that the columns are pre-scaled to * eliminate the first row multiplication stage. * * The output is biased by 128, i.e., [-128,127] is mapped to [0,255], * which is relevant to jpeg. */void#ifdef INT_64rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)#elserdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const int* qt)#endif{ /* * First pass is 1D transform over the columns. Note that * the coefficients are stored in column order, so even * though we're accessing the columns, we access them * in a row-like fashion. We use a column-row decomposition * instead of a row-column decomposition so we can splice * pixels in an efficient, row-wise order in the second * stage. * * The inverse quantization is folded together with the * first stage multiplier. This reduces the number of * multiplies (per 8-pt transform) from 11 to 5 (ignoring * the inverse quantization which must be done anyway). * * Because we compute appropriately scaled column DCTs, * the row DCTs require only 5 multiplies per row total. * So the total number of multiplies for the 8x8 DCT is 80 * (ignoring the checks for zero coefficients). */ int tmp[64]; int* tp = tmp; int i; for (i = 8; --i >= 0; ) { if ((m0 & 0xfe) == 0) { /* AC terms all zero */ int v = M(0) ? qt[0] * bp[0] : 0; tp[0] = v; tp[1] = v; tp[2] = v; tp[3] = v; tp[4] = v; tp[5] = v; tp[6] = v; tp[7] = v; } else { int t0, t1, t2, t3, t4, t5, t6, t7; if ((m0 & 0xaa) == 0) t4 = t5 = t6 = t7 = 0; else { t0 = M(5) ? qt[5] * bp[5] : 0; t2 = M(1) ? qt[1] * bp[1] : 0; t6 = M(7) ? qt[7] * bp[7] : 0; t7 = M(3) ? qt[3] * bp[3] : 0; t4 = t0 - t7; t1 = t2 + t6; t6 = t2 - t6; t7 += t0; t5 = t1 - t7; t7 += t1; t2 = FP_MUL(-A5, t4 + t6); t4 = FP_MUL(-A2, t4); t0 = t4 + t2; t1 = FP_MUL(A3, t5); t2 += FP_MUL(A4, t6); t4 = -t0; t5 = t1 - t0; t6 = t1 + t2; t7 += t2; }#ifdef notdef if ((m0 & 0x55) == 0) t0 = t1 = t2 = t3 = 0; else {#endif t0 = M(0) ? qt[0] * bp[0] : 0; t1 = M(4) ? qt[4] * bp[4] : 0; int x0 = t0 + t1; int x1 = t0 - t1; t0 = M(2) ? qt[2] * bp[2] : 0; t3 = M(6) ? qt[6] * bp[6] : 0; t2 = t0 - t3; t3 += t0; t2 = FP_MUL(A1, t2); t3 += t2; t0 = x0 + t3; t1 = x1 + t2; t2 = x1 - t2; t3 = x0 - t3;#ifdef notdef }#endif tp[0] = t0 + t7; tp[7] = t0 - t7; tp[1] = t1 + t6; tp[6] = t1 - t6; tp[2] = t2 + t5; tp[5] = t2 - t5; tp[3] = t3 + t4; tp[4] = t3 - t4; } tp += 8; bp += 8; qt += 8; m0 >>= 8;#ifndef INT_64 m0 |= m1 << 24; m1 >>= 8;#endif } tp -= 64; /* * Second pass is 1D transform over the rows. Note that * the coefficients are stored in column order, so even * though we're accessing the rows, we access them * in a column-like fashion. * * The pass above already computed the first multiplier * in the flow graph. */ for (i = 8; --i >= 0; ) { int t0, t1, t2, t3, t4, t5, t6, t7; t0 = tp[8*5]; t2 = tp[8*1]; t6 = tp[8*7]; t7 = tp[8*3];#ifdef notdef if ((t0 | t2 | t6 | t7) == 0) { t4 = t5 = 0; } else#endif{ t4 = t0 - t7; t1 = t2 + t6; t6 = t2 - t6; t7 += t0; t5 = t1 - t7; t7 += t1; t2 = FP_MUL(-A5, t4 + t6); t4 = FP_MUL(-A2, t4); t0 = t4 + t2; t1 = FP_MUL(A3, t5); t2 += FP_MUL(A4, t6); t4 = -t0; t5 = t1 - t0; t6 = t1 + t2; t7 += t2; } t0 = tp[8*0]; t1 = tp[8*4]; int x0 = t0 + t1; int x1 = t0 - t1; t0 = tp[8*2]; t3 = tp[8*6]; t2 = t0 - t3; t3 += t0; t2 = FP_MUL(A1, t2); t3 += t2; t0 = x0 + t3; t1 = x1 + t2; t2 = x1 - t2; t3 = x0 - t3; PIXDEF; DOJPIX(t0 + t7, 0); DOJPIX(t1 + t6, 1); DOJPIX(t2 + t5, 2); DOJPIX(t3 + t4, 3); DID4PIX; DOJPIX(t3 - t4, 4); DOJPIX(t2 - t5, 5); DOJPIX(t1 - t6, 6); DOJPIX(t0 - t7, 7); if (oflo & ~0xff) { int t; pix = 0; DOJPIXLIMIT(t0 + t7, 0); DOJPIXLIMIT(t1 + t6, 1); DOJPIXLIMIT(t2 + t5, 2); DOJPIXLIMIT(t3 + t4, 3); DID4PIX; DOJPIXLIMIT(t3 - t4, 4); DOJPIXLIMIT(t2 - t5, 5); DOJPIXLIMIT(t1 - t6, 6); DOJPIXLIMIT(t0 - t7, 7); } PSTORE; ++tp; p += stride; }}#endif/* * Inverse 2-D transform, similar to routine above (see comment above), * but more appropriate for H.261 instead of JPEG. This routine does * not bias the output by 128, and has an additional argument which is * an input array which gets summed together with the inverse-transform. * For example, this allows motion-compensation to be folded in here, * saving an extra traversal of the block. The input pointer can be * null, if motion-compensation is not needed.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -