📄 dct.cpp

📁 网络MPEG4IP流媒体开发源代码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Copyright (c) 1994 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the Network Research
 *	Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "bsd-endian.h"
#include "dct.h"

/*
 * Macros for fixed-point (integer) arithmetic.  FP_NBITS gives the number
 * of binary digits past the binary point.  FP_MUL computes the product
 * of two fixed point numbers; each operand is pre-shifted right by 5 bits
 * before the multiply and the product by the remaining FP_NBITS - 10 bits,
 * so the result is again scaled by 2^FP_NBITS.  A fixed point number and
 * an integer can be directly multiplied to give a fixed point number.
 * FP_SCALE converts a floating point number to fixed point (and is used
 * only at startup, not by the dct engine).  FP_NORM converts a fixed
 * point number to scalar by rounding to the closest integer.
 * FP_JNORM is similar except it folds the jpeg bias of 128 into the
 * rounding addition (257 << (FP_NBITS-1) is 128.5 in fixed point).
 */
#define FP_NBITS 15
#define FP_MUL(a, b)	((((a) >> 5) * ((b) >> 5)) >> (FP_NBITS - 10))
#define FP_SCALE(v)	((int)((double)(v) * double(1 << FP_NBITS) + 0.5))
/* The addition is grouped explicitly; '+' already binds tighter than '>>'. */
#define FP_NORM(v)	(((v) + (1 << (FP_NBITS - 1))) >> FP_NBITS)
#define FP_JNORM(v)	(((v) + (257 << (FP_NBITS - 1))) >> FP_NBITS)

/* Bit n of the variable m0, which must be in scope at the point of use. */
#define M(n) ((m0 >> (n)) & 1)

/*
 * This macro stolen from nv.
 */
/*
 * Sick little macro which will limit x to [0..255] with logical ops.
 * t is an int scratch variable.  "t &= ~(t>>31)" clears t when it is
 * negative; "t | ~((t-256) >> 31)" sets every bit when t >= 256.  Both
 * steps rely on >> of a negative int being an arithmetic shift and on
 * int being 32 bits wide.  LIMIT keeps only the low byte of the result.
 */
#define LIMIT8(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
#define LIMIT(x, t) (LIMIT8((x), t) & 0xff)

/* row order */
static const u_char ROWZAG[] = {
	0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63,
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};

/* column order */
const u_char COLZAG[] = {
	0, 8, 1, 2, 9, 16, 24, 17,
	10, 3, 4, 11, 18, 25, 32, 40,
	33, 26, 19, 12, 5, 6, 13, 20,
	27, 34, 41, 48, 56, 49, 42, 35,
	28, 21, 14, 7, 15, 22, 29, 36,
	43, 50, 57, 58, 51, 44, 37, 30,
	23, 31, 38, 45, 52, 59, 60, 53,
	46, 39, 47, 54, 61, 62, 55, 63,
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};

/* Fixed-point multiplier constants. */
#define A1 FP_SCALE(0.7071068)
#define A2 FP_SCALE(0.5411961)
#define A3 A1
#define A4 FP_SCALE(1.3065630)
#define A5 FP_SCALE(0.3826834)

/* The same multipliers as single-precision floating point constants. */
#define FA1 (0.707106781f)
#define FA2 (0.541196100f)
#define FA3 FA1
#define FA4 (1.306562965f)
#define FA5 (0.382683433f)

/*
 * these magic numbers are scaling factors for each coef of the 1-d
 * AA&N DCT.  The scale factor for coef 0 is 1 and coef 1<=n<=7 is
 * cos(n*PI/16)*sqrt(2).  There is also a normalization of sqrt(8).
 * Formally you divide by the scale factor but we multiply by the
 * inverse because it's faster.  So the numbers below are the inverse
 * of what was just described.
 */
#define B0 0.35355339059327376220
#define B1 0.25489778955207958447
#define B2 0.27059805007309849220
#define B3 0.30067244346752264027
#define B4 0.35355339059327376220
#define B5 0.44998811156820785231
#define B6 0.65328148243818826392
#define B7 1.28145772387075308943

/*
 * Output multipliers for AA&N DCT
 * (i.e., first stage multipliers for inverse DCT).
 */
static const double first_stage[8] = { B0, B1, B2, B3, B4, B5, B6, B7, };

/*
 * The first_stage array crossed with itself.  This allows us
 * to embed the first stage multipliers of the row pass by
 * computing scaled versions of the columns.
 */
static const int cross_stage[64] = {
	FP_SCALE(B0 * B0), FP_SCALE(B0 * B1), FP_SCALE(B0 * B2), FP_SCALE(B0 * B3),
	FP_SCALE(B0 * B4), FP_SCALE(B0 * B5), FP_SCALE(B0 * B6), FP_SCALE(B0 * B7),

	FP_SCALE(B1 * B0), FP_SCALE(B1 * B1), FP_SCALE(B1 * B2), FP_SCALE(B1 * B3),
	FP_SCALE(B1 * B4), FP_SCALE(B1 * B5), FP_SCALE(B1 * B6), FP_SCALE(B1 * B7),

	FP_SCALE(B2 * B0), FP_SCALE(B2 * B1), FP_SCALE(B2 * B2), FP_SCALE(B2 * B3),
	FP_SCALE(B2 * B4), FP_SCALE(B2 * B5), FP_SCALE(B2 * B6), FP_SCALE(B2 * B7),

	FP_SCALE(B3 * B0), FP_SCALE(B3 * B1), FP_SCALE(B3 * B2), FP_SCALE(B3 * B3),
	FP_SCALE(B3 * B4), FP_SCALE(B3 * B5), FP_SCALE(B3 * B6), FP_SCALE(B3 * B7),

	FP_SCALE(B4 * B0), FP_SCALE(B4 * B1), FP_SCALE(B4 * B2), FP_SCALE(B4 * B3),
	FP_SCALE(B4 * B4), FP_SCALE(B4 * B5), FP_SCALE(B4 * B6), FP_SCALE(B4 * B7),

	FP_SCALE(B5 * B0), FP_SCALE(B5 * B1), FP_SCALE(B5 * B2), FP_SCALE(B5 * B3),
	FP_SCALE(B5 * B4), FP_SCALE(B5 * B5), FP_SCALE(B5 * B6), FP_SCALE(B5 * B7),

	FP_SCALE(B6 * B0), FP_SCALE(B6 * B1), FP_SCALE(B6 * B2), FP_SCALE(B6 * B3),
	FP_SCALE(B6 * B4), FP_SCALE(B6 * B5), FP_SCALE(B6 * B6), FP_SCALE(B6 * B7),

	FP_SCALE(B7 * B0), FP_SCALE(B7 * B1), FP_SCALE(B7 * B2), FP_SCALE(B7 * B3),
	FP_SCALE(B7 * B4), FP_SCALE(B7 * B5), FP_SCALE(B7 * B6), FP_SCALE(B7 * B7),
};

/* Floating point version of cross_stage. */
static const float f_cross_stage[64] = {
	B0 * B0, B0 * B1, B0 * B2, B0 * B3, B0 * B4, B0 * B5, B0 * B6, B0 * B7,
	B1 * B0, B1 * B1, B1 * B2, B1 * B3, B1 * B4, B1 * B5, B1 * B6, B1 * B7,
	B2 * B0, B2 * B1, B2 * B2, B2 * B3, B2 * B4, B2 * B5, B2 * B6, B2 * B7,
	B3 * B0, B3 * B1, B3 * B2, B3 * B3, B3 * B4, B3 * B5, B3 * B6, B3 * B7,
	B4 * B0, B4 * B1, B4 * B2, B4 * B3, B4 * B4, B4 * B5, B4 * B6, B4 * B7,
	B5 * B0, B5 * B1, B5 * B2, B5 * B3, B5 * B4, B5 * B5, B5 * B6, B5 * B7,
	B6 * B0, B6 * B1, B6 * B2, B6 * B3, B6 * B4, B6 * B5, B6 * B6, B6 * B7,
	B7 * B0, B7 * B1, B7 * B2, B7 * B3, B7 * B4, B7 * B5, B7 * B6, B7 * B7,
};

/*
 * Map a quantization table in natural, row-order,
 * into the qt input expected by rdct().
 *
 * in  - 64 quantizer values.
 * out - receives 64 fixed point values: each quantizer multiplied by
 *       the first-stage factors of its column and row.
 */
void
rdct_fold_q(const int* in, int* out)
{
	for (int i = 0; i < 64; ++i) {
		/*
		 * Fold column and row passes of the dct.
		 * By scaling each column DCT independently,
		 * we pre-bias all the row DCT's so the
		 * first multiplier is already embedded
		 * in the temporary result.  Thanks to
		 * Martin Vetterli for explaining how
		 * to do this.
		 */
		double v = double(in[i]);
		v *= first_stage[i & 7];
		v *= first_stage[i >> 3];
		out[i] = FP_SCALE(v);
	}
}

/*
 * Just like rdct_fold_q() but we divide by the quantizer.
 *
 * in  - 64 quantizer values; a zero entry makes the matching output
 *       the result of a floating point division by zero.
 * out - receives 64 floating point scale factors.
 */
void
fdct_fold_q(const int* in, float* out)
{
	for (int i = 0; i < 64; ++i) {
		double v = first_stage[i >> 3];
		v *= first_stage[i & 7];
		double q = double(in[i]);
		out[i] = float(v / q);
	}
}

/*
 * Add the constant dc to every pixel of an 8x8 block, clamp each sum
 * to [0..255] and store the result.
 *
 * dc     - value added to each pixel.
 * in     - source block; rows are stride bytes apart.
 * out    - destination block, same stride.
 * stride - distance in bytes between successive rows.
 *
 * Both in and out are accessed through word-sized pointers (INT_64 or
 * u_int), so the caller must supply suitably aligned rows.  SPLICE and
 * EXTRACT are not defined in this file; presumably they come from
 * bsd-endian.h.
 */
void dcsum(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;
#ifdef INT_64
		/*XXX assume little-endian */
		INT_64 i = *(INT_64*)in;
		/*
		 * Every byte, including the top one, is masked to 8 bits:
		 * if INT_64 is a signed type, "i >> 56" alone would sign
		 * extend pixel values >= 0x80 and corrupt the sum.
		 */
		INT_64 o = (INT_64)LIMIT(dc + ((i >> 56) & 0xff), t) << 56;
		o |=  (INT_64)LIMIT(dc + ((i >> 48) & 0xff), t) << 48;
		o |=  (INT_64)LIMIT(dc + ((i >> 40) & 0xff), t) << 40;
		o |=  (INT_64)LIMIT(dc + ((i >> 32) & 0xff), t) << 32;
		o |=  (INT_64)LIMIT(dc + ((i >> 24) & 0xff), t) << 24;
		o |=  (INT_64)LIMIT(dc + ((i >> 16) & 0xff), t) << 16;
		o |=  (INT_64)LIMIT(dc + ((i >> 8) & 0xff), t) << 8;
		o |=  (INT_64)LIMIT(dc + (i & 0xff), t);
		*(INT_64*)out = o;
#else
		/* First four pixels of the row. */
		u_int o = 0;
		u_int i = *(u_int*)in;
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)out = o;

		/* Last four pixels of the row. */
		o = 0;
		i = *(u_int*)(in + 4);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)(out + 4) = o;
#endif
		in += stride;
		out += stride;
	}
}

/*
 * Same operation as dcsum(), but the input block is read one byte at
 * a time.  The output is still stored through u_int pointers.
 */
void dcsum2(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;

		/* First four pixels of the row. */
		u_int o = 0;
		SPLICE(o, LIMIT(dc + in[0], t), 24);
		SPLICE(o, LIMIT(dc + in[1], t), 16);
		SPLICE(o, LIMIT(dc + in[2], t), 8);
		SPLICE(o, LIMIT(dc + in[3], t), 0);
		*(u_int*)out = o;

		/* Last four pixels of the row. */
		o = 0;
		SPLICE(o, LIMIT(dc + in[4], t), 24);
		SPLICE(o, LIMIT(dc + in[5], t), 16);
		SPLICE(o, LIMIT(dc + in[6], t), 8);
		SPLICE(o, LIMIT(dc + in[7], t), 0);
		*(u_int*)(out + 4) = o;

		in += stride;
		out += stride;
	}
}

/*
 * Fill an 8x8 block with the single value DC clamped to [0..255].
 *
 * DC     - pixel value before clamping.
 * out    - destination block, written through word-sized pointers.
 * stride - distance in bytes between successive rows.
 */
void dcfill(int DC, u_char* out, int stride)
{
	int t;
	u_int dc = DC;
	dc = LIMIT(dc, t);
	/* Replicate the clamped byte into all four bytes of the word. */
	dc |= dc << 8;
	dc |= dc << 16;
#ifdef INT_64
	INT_64 xdc = dc;
	xdc |= xdc << 32;
	for (int row = 0; row < 8; ++row)
		*(INT_64 *)(out + row * stride) = xdc;
#else
	for (int row = 0; row < 8; ++row) {
		u_char* p = out + row * stride;
		*(u_int*)p = dc;
		*(u_int*)(p + 4) = dc;
	}
#endif
}

/*
 * This routine mixes the DC & AC components of an 8x8 block of
 * pixels.  This routine is called for every block decoded so it
 * needs to be efficient.  It tries to do as many pixels in parallel
 * as will fit in a word.  The one complication is that it has to
 * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow
 * & overflow are only possible if both terms have the same sign and
 * are indicated by the result having a different sign than the terms.
 * Note that underflow is more worrisome than overflow since it results
 * in bright white dots in a black field.
 * The DC term and sum are biased by 128 so a negative number has the
 * 2^7 bit = 0.  The AC term is not biased so a negative number has
 * the 2^7 bit = 1.  So underflow is indicated by (DC & AC & sum) != 0;
 *
 * The macro assigns to sum, b and uflo (b is used as scratch), so all
 * three arguments must be modifiable lvalues.
 */
#define MIX_LOGIC(sum, a, b, omask, uflo) \
{ \
	sum = a + b; \
	uflo = (a ^ b) & (a ^ sum) & omask; \
	if (uflo) { \
		if ((b = uflo & a) != 0) { \
			/* integer overflows */ \
			b |= b >> 1; \
			b |= b >> 2; \
			b |= b >> 4; \
			sum |= b; \
		} \
		if ((uflo &=~ b) != 0) { \
			/* integer underflow(s) */ \
			uflo |= uflo >> 1; \
			uflo |= uflo >> 2; \
			uflo |= uflo >> 4; \
			sum &= ~uflo; \
		} \
	} \
}

/*
 * Table of products of 8-bit scaled coefficients
 * and idct coefficients (there are only 33 unique
 * coefficients so we index via a compact ID).
 *
 * NOTE(review): the routines below say "66 unique coefficients
 * require 7 bits" and index with a stride of 1 << 7; confirm which
 * count is correct against the generator of multab.
 */
extern "C" u_char multab[];

/*
 * Array of coefficient ID's used to index multab.
 */
extern "C" u_int dct_basis[64][64 / sizeof(u_int)];

/*XXX*/
#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))

/*
 * Build an 8x8 block from a DC value and the single AC coefficient
 * bp[acx], using the multab/dct_basis tables.
 *
 * dc     - DC value; it is replicated into every byte of a word.
 * bp     - coefficient block; only bp[acx] is read.
 * acx    - index of the AC coefficient (selects the dct_basis row).
 * out    - destination block, written through u_int pointers.
 * stride - distance in bytes between successive output rows.
 *
 * SHIFT is not defined in this file; presumably it comes from
 * bsd-endian.h.
 * NOTE(review): mt is plain char, so table entries read as negative
 * values where char is signed; confirm against multab's contents.
 */
void
bv_rdct1(int dc, short* bp, int acx, u_char* out, int stride)
{
	u_int omask = 0x80808080;
	u_int uflo;
	u_int* vp = dct_basis[acx];
	int s = LIMIT_512(bp[acx]);
	s = (s >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt = (char*)&multab[s << 7];

	/*
	 * Replicate dc into all four byte lanes.  This is done in
	 * unsigned arithmetic so the shift into bit 31 is well defined;
	 * the resulting bit pattern is unchanged.
	 */
	u_int dcw = (u_int)dc;
	dcw |= dcw << 8;
	dcw |= dcw << 16;

	for (int k = 8; --k >= 0; ) {
		/* First four pixels of the row. */
		u_int v = *vp++;
		u_int m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dcw, m, omask, uflo);
		*(u_int*)out = v;

		/* Last four pixels of the row. */
		v = *vp++;
		m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dcw, m, omask, uflo);
		*(u_int*)(out + 4) = v;

		out += stride;
	}
}

/* XXX this version has to be exact */
/*
 * Like bv_rdct1(), but the DC + AC result is additionally mixed with
 * the pixels of the block at in before being stored to out.
 *
 * in is read one byte at a time; out is written through u_int pointers.
 * Both advance by stride bytes per row.
 */
void
bv_rdct2(int dc, short* bp, int ac0, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	const char* mt = (const char*)&multab[s0 << 7];
	const u_int* vp0 = dct_basis[ac0];

	/*
	 * Replicate dc into all four byte lanes in unsigned arithmetic
	 * (same bit pattern, no signed shift into bit 31).
	 */
	u_int dcw = (u_int)dc;
	dcw |= dcw << 8;
	dcw |= dcw << 16;

	u_int omask = 0x80808080;
	u_int uflo;
	for (int k = 8; --k >= 0; ) {
		u_int v, m, i;

		/* First four pixels: v = dc + ac, then m = in + v. */
		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dcw, m, omask, uflo);
		i = in[0] << SHIFT(24) | in[1] << SHIFT(16) |
		    in[2] << SHIFT(8) | in[3] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)out = m;

		/* Last four pixels. */
		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dcw, m, omask, uflo);
		i = in[4] << SHIFT(24) | in[5] << SHIFT(16) |
		    in[6] << SHIFT(8) | in[7] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)(out + 4) = m;

		out += stride;
		in += stride;
	}
}

/* XXX this version has to be exact */
/*
 * Build an 8x8 block from a DC value and the two AC coefficients
 * bp[ac0] and bp[ac1], added to the pixels of the block at in.
 * Unlike bv_rdct1/bv_rdct2, each pixel is summed as an int and
 * clamped individually with LIMIT before being packed into a word.
 */
void
bv_rdct3(int dc, short* bp, int ac0, int ac1, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt0 = (char*)&multab[s0 << 7];

	int s1 = LIMIT_512(bp[ac1]);
	s1 = (s1 >> 2) & 0xff;
	char* mt1 = (char*)&multab[s1 << 7];

	u_int* vp0 = dct_basis[ac0];
	u_int* vp1 = dct_basis[ac1];
	for (int k = 8; --k >= 0; ) {
		int t;

		/* First four pixels of the row (s0 is reused as scratch). */
		u_int v0 = *vp0++;
		u_int v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[0] + dc;
		u_int m = LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[1] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[2] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[3] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)out = m;

		/* Last four pixels of the row. */
		v0 = *vp0++;
		v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[4] + dc;
		m = 0;
		m |= LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[5] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[6] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[7] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)(out + 4) = m;

		out += stride;
		in += stride;
	}
}

/*
 * Pixel packing helpers.  They expand to statements that use the
 * variables pix, pix0, v, oflo, t, p and in, which must be in scope
 * at the point of use (PIXDEF declares pix, pix0, v and oflo).
 *
 * PSPLICE(v, n) - OR pixel n of a row into the accumulator pix.
 * DID4PIX       - with 32-bit words, save the first word in pix0 and
 *                 restart pix; empty when INT_64 holds the whole row.
 * PSTORE        - store the accumulated row through p.
 */
#ifdef INT_64
/*XXX assume little-endian */
#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
#define DID4PIX
#define PSTORE ((INT_64*)p)[0] = pix
#define PIXDEF INT_64 pix = 0; int v, oflo = 0
#else
#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
#define DID4PIX pix0 = pix; pix = 0
#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
#define PIXDEF	u_int pix0, pix = 0; int v, oflo = 0
#endif

/*
 * DOJPIX/DOPIX/DOPIXIN normalize a fixed point value (with the jpeg
 * bias, plain, or added to in[n]), OR it into oflo and splice it in
 * without clamping.  The *LIMIT variants clamp with LIMIT instead of
 * recording the value in oflo.
 */
#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)

/*
 * A 2D Inverse DCT based on a column-row decomposition using
 * Arai, Agui, and Nakajima's 8pt 1D Inverse DCT, from Fig. 4-8
 * Pennebaker & Mitchell (i.e., the pink JPEG book).  This figure
 * is the forward transform; reverse the flowgraph for the inverse
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -