📄 dct.cxx
字号:
dc = LIMIT(dc, t);
dc |= dc << 8;
dc |= dc << 16;
#ifdef INT_64
INT_64 xdc = dc;
xdc |= xdc << 32;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
out += stride;
*(INT_64 *)out = xdc;
#else
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
out += stride;
*(u_int*)out = dc;
*(u_int*)(out + 4) = dc;
#endif
}
/*
* This routine mixes the DC & AC components of an 8x8 block of
* pixels. This routine is called for every block decoded so it
* needs to be efficient. It tries to do as many pixels in parallel
* as will fit in a word. The one complication is that it has to
* deal with overflow (sum > 255) and underflow (sum < 0). Underflow
* & overflow are only possible if both terms have the same sign and
* are indicated by the result having a different sign than the terms.
* Note that underflow is more worrisome than overflow since it results
* in bright white dots in a black field.
* The DC term and sum are biased by 128 so a negative number has the
* 2^7 bit = 0. The AC term is not biased so a negative number has
* the 2^7 bit = 1. So underflow is indicated by (DC & AC & sum) != 0;
*/
#define MIX_LOGIC(sum, a, b, omask, uflo) \
{ \
sum = a + b; \
uflo = (a ^ b) & (a ^ sum) & omask; \
if (uflo) { \
if ((b = uflo & a) != 0) { \
/* integer overflows */ \
b |= b >> 1; \
b |= b >> 2; \
b |= b >> 4; \
sum |= b; \
} \
if ((uflo &=~ b) != 0) { \
/* integer underflow(s) */ \
uflo |= uflo >> 1; \
uflo |= uflo >> 2; \
uflo |= uflo >> 4; \
sum &= ~uflo; \
} \
} \
}
/*
* Table of products of 8-bit scaled coefficients
* and idct coefficients (there are only 33 unique
* coefficients so we index via a compact ID).
*/
extern "C" u_char multab[];
/*
* Array of coefficient ID's used to index multab.
*/
extern "C" u_int dct_basis[64][64 / sizeof(u_int)];
/*XXX*/
#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))
void
bv_rdct1(int dc, short* bp, int acx, u_char* out, int stride)
{
u_int omask = 0x80808080;
u_int uflo;
u_int* vp = dct_basis[acx];
int s = LIMIT_512(bp[acx]);
s = (s >> 2) & 0xff;
/* 66 unique coefficients require 7 bits */
char* mt = (char*)&multab[s << 7];
dc |= dc << 8;
dc |= dc << 16;
for (int k = 8; --k >= 0; ) {
u_int v = *vp++;
u_int m = mt[v >> 24] << SHIFT(24) |
mt[v >> 16 & 0xff] << SHIFT(16) |
mt[v >> 8 & 0xff] << SHIFT(8) |
mt[v & 0xff] << SHIFT(0);
MIX_LOGIC(v, dc, m, omask, uflo);
*(u_int*)out = v;
v = *vp++;
m = mt[v >> 24] << SHIFT(24) |
mt[v >> 16 & 0xff] << SHIFT(16) |
mt[v >> 8 & 0xff] << SHIFT(8) |
mt[v & 0xff] << SHIFT(0);
MIX_LOGIC(v, dc, m, omask, uflo);
*(u_int*)(out + 4) = v;
out += stride;
}
}
/* XXX this version has to be exact */
void
bv_rdct2(int dc, short* bp, int ac0, u_char* in, u_char* out, int stride)
{
int s0 = LIMIT_512(bp[ac0]);
s0 = (s0 >> 2) & 0xff;
/* 66 unique coefficients require 7 bits */
const char* mt = (const char*)&multab[s0 << 7];
const u_int* vp0 = dct_basis[ac0];
dc |= dc << 8;
dc |= dc << 16;
u_int omask = 0x80808080;
u_int uflo;
for (int k = 8; --k >= 0; ) {
u_int v, m, i;
v = *vp0++;
m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
MIX_LOGIC(v, dc, m, omask, uflo);
i = in[0] << SHIFT(24) | in[1] << SHIFT(16) |
in[2] << SHIFT(8) | in[3] << SHIFT(0);
MIX_LOGIC(m, i, v, omask, uflo);
*(u_int*)out = m;
v = *vp0++;
m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
MIX_LOGIC(v, dc, m, omask, uflo);
i = in[4] << SHIFT(24) | in[5] << SHIFT(16) |
in[6] << SHIFT(8) | in[7] << SHIFT(0);
MIX_LOGIC(m, i, v, omask, uflo);
*(u_int*)(out + 4) = m;
out += stride;
in += stride;
}
}
/* XXX this version has to be exact */
void
bv_rdct3(int dc, short* bp, int ac0, int ac1, u_char* in, u_char* out, int stride)
{
int s0 = LIMIT_512(bp[ac0]);
s0 = (s0 >> 2) & 0xff;
/* 66 unique coefficients require 7 bits */
char* mt0 = (char*)&multab[s0 << 7];
int s1 = LIMIT_512(bp[ac1]);
s1 = (s1 >> 2) & 0xff;
char* mt1 = (char*)&multab[s1 << 7];
u_int* vp0 = dct_basis[ac0];
u_int* vp1 = dct_basis[ac1];
for (int k = 8; --k >= 0; ) {
int t;
u_int v0 = *vp0++;
u_int v1 = *vp1++;
s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[0] + dc;
u_int m = LIMIT(s0, t) << SHIFT(24);
s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[1] + dc;
m |= LIMIT(s0, t) << SHIFT(16);
s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[2] + dc;
m |= LIMIT(s0, t) << SHIFT(8);
s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[3] + dc;
m |= LIMIT(s0, t) << SHIFT(0);
*(u_int*)out = m;
v0 = *vp0++;
v1 = *vp1++;
s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[4] + dc;
m = 0;
m |= LIMIT(s0, t) << SHIFT(24);
s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[5] + dc;
m |= LIMIT(s0, t) << SHIFT(16);
s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[6] + dc;
m |= LIMIT(s0, t) << SHIFT(8);
s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[7] + dc;
m |= LIMIT(s0, t) << SHIFT(0);
*(u_int*)(out + 4) = m;
out += stride;
in += stride;
}
}
#ifdef INT_64
/*XXX assume little-endian */
#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
#define DID4PIX
#define PSTORE ((INT_64*)p)[0] = pix
#define PIXDEF INT_64 pix = 0; int v, oflo = 0
#else
#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
#define DID4PIX pix0 = pix; pix = 0
#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
#define PIXDEF u_int pix0, pix = 0; int v, oflo = 0
#endif
#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)
/*
* A 2D Inverse DCT based on a column-row decomposition using
* Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8
* Pennebaker & Mitchell (i.e., the pink JPEG book). This figure
* is the forward transform; reverse the flowgraph for the inverse
* (you need to draw a picture). The outputs are DFT coefficients
* and need to be scaled according to Eq [4-3].
*
* The input coefficients are, counter to tradition, in column-order.
* The bit mask indicates which coefficients are non-zero. If the
* corresponding bit is zero, then the coefficient is assumed zero
* and the input coefficient is not referenced and need not be defined.
* The 8-bit outputs are computed in row order and placed in the
* output array pointed to by p, with each of the eight 8-byte lines
* offset by "stride" bytes.
*
* qt is the inverse quantization table in column order. These
* coefficients are the product of the inverse quantization factor,
* specified by the jpeg quantization table, and the first multiplier
* in the inverse DCT flow graph. This first multiplier is actually
* biased by the row term so that the columns are pre-scaled to
* eliminate the first row multiplication stage.
*
* The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
* which is relevant to jpeg.
*/
void
#ifdef INT_64
rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
#else
rdct(register short *bp, u_int m0, u_int m1, u_char* p,
int stride, const int* qt)
#endif
{
/*
* First pass is 1D transform over the columns. Note that
* the coefficients are stored in column order, so even
* though we're accessing the columns, we access them
* in a row-like fashion. We use a column-row decomposition
* instead of a row-column decomposition so we can splice
* pixels in an efficient, row-wise order in the second
* stage.
*
* The inverse quantization is folded together with the
* first stage multiplier. This reduces the number of
* multiplies (per 8-pt transform) from 11 to 5 (ignoring
* the inverse quantization which must be done anyway).
*
* Because we compute appropriately scaled column DCTs,
* the row DCTs require only 5 multiplies per row total.
* So the total number of multiplies for the 8x8 DCT is 80
* (ignoring the checks for zero coefficients).
*/
int tmp[64];
int* tp = tmp;
int i;
for (i = 8; --i >= 0; ) {
if ((m0 & 0xfe) == 0) {
/* AC terms all zero */
int v = M(0) ? qt[0] * bp[0] : 0;
tp[0] = v;
tp[1] = v;
tp[2] = v;
tp[3] = v;
tp[4] = v;
tp[5] = v;
tp[6] = v;
tp[7] = v;
} else {
int t0, t1, t2, t3, t4, t5, t6, t7;
if ((m0 & 0xaa) == 0)
t4 = t5 = t6 = t7 = 0;
else {
t0 = M(5) ? qt[5] * bp[5] : 0;
t2 = M(1) ? qt[1] * bp[1] : 0;
t6 = M(7) ? qt[7] * bp[7] : 0;
t7 = M(3) ? qt[3] * bp[3] : 0;
t4 = t0 - t7;
t1 = t2 + t6;
t6 = t2 - t6;
t7 += t0;
t5 = t1 - t7;
t7 += t1;
t2 = FP_MUL(-A5, t4 + t6);
t4 = FP_MUL(-A2, t4);
t0 = t4 + t2;
t1 = FP_MUL(A3, t5);
t2 += FP_MUL(A4, t6);
t4 = -t0;
t5 = t1 - t0;
t6 = t1 + t2;
t7 += t2;
}
#ifdef notdef
if ((m0 & 0x55) == 0)
t0 = t1 = t2 = t3 = 0;
else {
#endif
t0 = M(0) ? qt[0] * bp[0] : 0;
t1 = M(4) ? qt[4] * bp[4] : 0;
int x0 = t0 + t1;
int x1 = t0 - t1;
t0 = M(2) ? qt[2] * bp[2] : 0;
t3 = M(6) ? qt[6] * bp[6] : 0;
t2 = t0 - t3;
t3 += t0;
t2 = FP_MUL(A1, t2);
t3 += t2;
t0 = x0 + t3;
t1 = x1 + t2;
t2 = x1 - t2;
t3 = x0 - t3;
#ifdef notdef
}
#endif
tp[0] = t0 + t7;
tp[7] = t0 - t7;
tp[1] = t1 + t6;
tp[6] = t1 - t6;
tp[2] = t2 + t5;
tp[5] = t2 - t5;
tp[3] = t3 + t4;
tp[4] = t3 - t4;
}
tp += 8;
bp += 8;
qt += 8;
m0 >>= 8;
#ifndef INT_64
m0 |= m1 << 24;
m1 >>= 8;
#endif
}
tp -= 64;
/*
* Second pass is 1D transform over the rows. Note that
* the coefficients are stored in column order, so even
* though we're accessing the rows, we access them
* in a column-like fashion.
*
* The pass above already computed the first multiplier
* in the flow graph.
*/
for (i = 8; --i >= 0; ) {
int t0, t1, t2, t3, t4, t5, t6, t7;
t0 = tp[8*5];
t2 = tp[8*1];
t6 = tp[8*7];
t7 = tp[8*3];
#ifdef notdef
if ((t0 | t2 | t6 | t7) == 0) {
t4 = t5 = 0;
} else
#endif
{
t4 = t0 - t7;
t1 = t2 + t6;
t6 = t2 - t6;
t7 += t0;
t5 = t1 - t7;
t7 += t1;
t2 = FP_MUL(-A5, t4 + t6);
t4 = FP_MUL(-A2, t4);
t0 = t4 + t2;
t1 = FP_MUL(A3, t5);
t2 += FP_MUL(A4, t6);
t4 = -t0;
t5 = t1 - t0;
t6 = t1 + t2;
t7 += t2;
}
t0 = tp[8*0];
t1 = tp[8*4];
int x0 = t0 + t1;
int x1 = t0 - t1;
t0 = tp[8*2];
t3 = tp[8*6];
t2 = t0 - t3;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -