📄 jidctint.c

📁 jpeg编解码器
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/*
 * jidctint.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
 * must also perform dequantization of the input coefficients.
 *
 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
 * on each row (or vice versa, but it's more convenient to emit a row at
 * a time).  Direct algorithms are also available, but they are much more
 * complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 */

/***************************************************************************
 *
 *      This program has been developed by Intel Corporation.
 *      You have Intel's permission to incorporate this code
 *      into your product, royalty free.  Intel has various
 *      intellectual property rights which it may assert under
 *      certain circumstances, such as if another manufacturer's
 *      processor mis-identifies itself as being "GenuineIntel"
 *      when the CPUID instruction is executed.
 *
 *      Intel specifically disclaims all warranties, express or
 *      implied, and all liability, including consequential and
 *      other indirect damages, for the use of this code,
 *      including liability for infringement of any proprietary
 *      rights, and including the warranties of merchantability
 *      and fitness for a particular purpose.  Intel does not
 *      assume any responsibility for any errors which may
 *      appear in this code nor any responsibility to update it.
 *
 *
 *  Other brands and names are the property of their respective
 *     owners.
 *
 *  Copyright (c) 1997, Intel Corporation.  All rights reserved.
 ***************************************************************************/

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

/* Everything that follows is compiled only when DCT_ISLOW_SUPPORTED is
 * defined.
 */
#ifdef DCT_ISLOW_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif


/*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
 * larger than the true IDCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D IDCT,
 * because the y0 and y4 inputs need not be divided by sqrt(N).
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
 * intermediate INT32 array would be needed.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */

#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  13
#define PASS1_BITS  2
#else
#define CONST_BITS  13
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif

/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 *
 * Each pre-calculated value is the named constant scaled by 2^CONST_BITS
 * and rounded to an integer, e.g. 0.541196100 * 8192 = 4433.48 -> 4433.
 * NOTE(review): FIX() itself is not defined in this part of the file;
 * the #else branch relies on a definition being supplied elsewhere.
 */

#if CONST_BITS == 13
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
#else
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)
#endif


/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 *
 * NOTE(review): MULTIPLY16C16 is not defined in this part of the file;
 * presumably it comes from one of the headers included above -- confirm.
 */

#if BITS_IN_JSAMPLE == 8
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const)  ((var) * (const))
#endif


/* Dequantize a coefficient by multiplying it by the multiplier-table
 * entry; produce an int result.  In this module, both inputs and result
 * are 16 bits or less, so either int or short multiply will work.
*/

#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))


/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 */

/* Map the __int64 spelling onto a 64-bit unsigned type for the packed
 * constants declared below.
 */
#define __int64 unsigned long long

/*
 * Packed 16-bit fixed-point constants used as memory operands by the MMX
 * code.  Every value repeats the same 32-bit pattern in both halves of the
 * 64-bit word.
 *
 * Naming, as can be checked against the FIX_* table (scale 2^13):
 *   - the digits are the leading digits of a FIX_* constant
 *     (054 = 0.541196100 = 0x1151, 117 = 1.175875602 = 0x25a1, ...);
 *   - 'n' marks a negated term, 'p' an added term, and terms written
 *     back to back are summed (054n184 = 0.541 - 1.847 = 0xd630);
 *   - in a two-part name A_B, A occupies the high 16 bits and B the low
 *     16 bits of each 32-bit half, which is the pairing pmaddwd consumes;
 *   - a one-part name repeats one 16-bit value in all four word lanes.
 */
	static const	__int64 fix_029_n089n196	= 0x098ea46e098ea46eLL;
	static const	__int64 fix_n196_n089		= 0xc13be333c13be333LL;
	static const	__int64 fix_205_n256n039	= 0x41b3a18141b3a181LL;
	static const	__int64 fix_n039_n256		= 0xf384adfdf384adfdLL;
	static const	__int64 fix_307n256_n196	= 0x1051c13b1051c13bLL;
	static const	__int64 fix_n256_n196		= 0xadfdc13badfdc13bLL;
	static const	__int64 fix_150_n089n039	= 0x300bd6b7300bd6b7LL;
	static const	__int64 fix_n039_n089		= 0xf384e333f384e333LL;
	static const	__int64 fix_117_117			= 0x25a125a125a125a1LL;
	static const	__int64 fix_054_054p076		= 0x115129cf115129cfLL;
	static const	__int64 fix_054n184_054		= 0xd6301151d6301151LL;

	static const	__int64 fix_054n184 		= 0xd630d630d630d630LL;
	static const	__int64 fix_054				= 0x1151115111511151LL;
	static const	__int64 fix_054p076			= 0x29cf29cf29cf29cfLL;
	static const	__int64 fix_n196p307n256	= 0xd18cd18cd18cd18cLL;
	static const	__int64 fix_n089n039p150	= 0x06c206c206c206c2LL;
	static const	__int64 fix_n256			= 0xadfdadfdadfdadfdLL;
	static const	__int64 fix_n039			= 0xf384f384f384f384LL;
	static const	__int64 fix_n256n039p205	= 0xe334e334e334e334LL;
	static const	__int64 fix_n196			= 0xc13bc13bc13bc13bLL;
	static const	__int64 fix_n089			= 0xe333e333e333e333LL;
	static const	__int64 fix_n089n196p029	= 0xadfcadfcadfcadfcLL;

	/* 0x00000100 (2^8) in each 32-bit lane. */
	static const  __int64 const_0x2xx8		= 0x0000010000000100LL;
	/* 0x08 in each of the eight byte lanes. */
	static const  __int64 const_0x0808		= 0x0808080808080808LL;

/* Forward declaration of the MMX worker that jpeg_idct_islow hands off to. */
__inline void domidct8x8llmW(short *inptr, short *quantptr, int *wsptr,
				   JSAMPARRAY outptr, int output_col);

/*
 * Entry point for the slow-but-accurate integer IDCT.
 *
 * Allocates the inter-pass workspace on the stack and forwards the
 * coefficient block, the component's dct_table, the workspace and the
 * output location to domidct8x8llmW().  cinfo is not used here.
 *
 * The workspace carries 4 ints beyond DCTSIZE2; domidct8x8llmW() rounds
 * the pointer it receives up to a multiple of 8 bytes before using it,
 * so the buffer needs some slack past its nominal size.
 */
GLOBAL(void)
jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
		 JCOEFPTR coef_block,
		 JSAMPARRAY output_buf, JDIMENSION output_col)
{
  
  int workspace[DCTSIZE2+4];	/* buffers data between passes */

  domidct8x8llmW(coef_block, compptr->dct_table, workspace, output_buf, output_col);

}

    
/*
 * MMX implementation of the dequantize + inverse DCT step.
 *
 *   inptr      coefficient block, read as 16-bit words
 *   quantptr   multiplier table, read as 16-bit words at the same offsets
 *   wsptr      scratch buffer; rounded up to an 8-byte boundary below
 *   outptr, output_col   output location (not referenced in the opening
 *                        part of the assembly)
 *
 * The assembly is only compiled when HAVE_MMX_INTEL_MNEMONICS is defined.
 */
__inline void domidct8x8llmW(short *inptr, short *quantptr, int *wsptr,
				   JSAMPARRAY outptr, int output_col)
{

#if defined(HAVE_MMX_INTEL_MNEMONICS)
	__asm{
    
	/* edi = multiplier table, ebx = coefficients, esi = workspace.
	 * esi is rounded up to the next multiple of 8 (add 7, clear the low
	 * three bits) and the aligned value is also copied into eax.
	 */
	mov		edi, quantptr
	mov		ebx, inptr
	mov		esi, wsptr
	add		esi, 0x07		;align wsptr to qword
	and		esi, 0xfffffff8	;align wsptr to qword
	
	mov		eax, esi

	/* Pass 1. */
    
	/* Each movq/pmullw pair loads four 16-bit coefficients and multiplies
	 * them lane-wise by the table words at the same byte offset (pmullw
	 * keeps the low 16 bits of each product), i.e. dequantization.  The
	 * loads below use byte offsets 8*4, 8*12, 8*0 and 8*8 and land in
	 * mm0, mm1, mm6 and mm7.
	 *
	 * punpcklwd/punpckhwd interleave the words of mm0 and mm1 so that
	 * pmaddwd can multiply each word pair by a packed constant pair and
	 * add the two products into one 32-bit lane.
	 *
	 * The pxor / punpck?wd / psrad 3 sequences place each 16-bit word of
	 * mm6 or mm7 in the upper half of a 32-bit lane and then shift right
	 * arithmetically by 3, giving the sign-extended value times 2^13.
	 *
	 * Sums and differences of these 32-bit terms are stored in the
	 * workspace at byte offsets 8*16, 8*17, 8*22 and 8*23.
	 *
	 * The instruction order is hand-interleaved; the p1(a,b) tags in the
	 * trailing comments are the original author's step labels.
	 */
	movq		mm0, [ebx + 8*4]	;p1(1,0)
	pmullw		mm0, [edi + 8*4]	;p1(1,1)

    movq		mm1, [ebx + 8*12]	;p1(2,0)
	pmullw		mm1, [edi + 8*12]	;p1(2,1)

	movq		mm6, [ebx + 8*0]	;p1(5,0)
	pmullw		mm6, [edi + 8*0]	;p1(5,1)

	movq		mm2, mm0				;p1(3,0)
	movq		mm7, [ebx + 8*8]	;p1(6,0)

	punpcklwd	mm0, mm1				;p1(3,1)
	pmullw		mm7, [edi + 8*8]	;p1(6,1)

	movq		mm4, mm0				;p1(3,2)
	punpckhwd	mm2, mm1				;p1(3,4)

	pmaddwd		mm0, fix_054n184_054	;p1(3,3)
	movq		mm5, mm2				;p1(3,5)

	pmaddwd		mm2, fix_054n184_054	;p1(3,6)
	pxor		mm1, mm1	;p1(7,0)

    	pmaddwd		mm4, fix_054_054p076		;p1(4,0)
	punpcklwd   mm1, mm6	;p1(7,1)

	pmaddwd		mm5, fix_054_054p076		;p1(4,1)
	psrad		mm1, 3		;p1(7,2)

	pxor		mm3, mm3	;p1(7,3)

	punpcklwd	mm3, mm7	;p1(7,4)
	
	psrad		mm3, 3		;p1(7,5)
	
	paddd		mm1, mm3	;p1(7,6)
	
	movq		mm3, mm1	;p1(7,7)
	
	paddd		mm1, mm4	;p1(7,8)
	
	psubd		mm3, mm4	;p1(7,9)

	movq		[esi + 8*16], mm1	;p1(7,10)
	pxor		mm4, mm4	;p1(7,12)

	movq		[esi + 8*22], mm3	;p1(7,11)
	punpckhwd	mm4, mm6	;p1(7,13)

	psrad		mm4, 3		;p1(7,14)
	pxor		mm1, mm1	;p1(7,15)
	
	punpckhwd	mm1, mm7	;p1(7,16)
	
	psrad		mm1, 3		;p1(7,17)
	
	paddd		mm4, mm1	;p1(7,18)
	
	movq		mm3, mm4	;p1(7,19)
	
	pxor		mm1, mm1	;p1(8,0)

	paddd		mm3, mm5	;p1(7,20)
	punpcklwd	mm1, mm6	;p1(8,1)

	psubd		mm4, mm5	;p1(7,21)
	psrad		mm1, 3		;p1(8,2)

	movq		[esi + 8*17], mm3	;p1(7,22)
	pxor		mm5, mm5	;p1(8,3)
	
	movq		[esi + 8*23], mm4	;p1(7,23)
	punpcklwd	mm5, mm7	;p1(8,4)

	psrad		mm5, 3		;p1(8,5)
	pxor		mm4, mm4	;p1(8,12)

	psubd		mm1, mm5	;p1(8,6)
	
	punpckhwd	mm4, mm6	;p1(8,13)

	movq		mm3, mm1	;p1(8,7)
	psrad		mm4, 3		;p1(8,14)

	paddd		mm1, mm0	;p1(8,8)
	pxor		mm5, mm5	;p1(8,15)

	psubd		mm3, mm0	;p1(8,9)
	movq		mm0, [ebx + 
8*14]	;p1(9,0)	punpckhwd	mm5, mm7	;p1(8,16)	pmullw		mm0, [edi + 8*14]	;p1(9,1)	movq		[esi + 8*18], mm1	;p1(8,10)	psrad		mm5, 3		;p1(8,17)	movq		[esi + 8*20], mm3	;p1(8,11)	psubd		mm4, mm5	;p1(8,18)	movq		mm3, mm4	;p1(8,19)	movq		mm1, [ebx + 8*6]	;p1(10,0)	paddd		mm3, mm2	;p1(8,20)	pmullw		mm1, [edi + 8*6]	;p1(10,1)	psubd		mm4, mm2	;p1(8,21)	movq		mm5, mm0			;p1(11,1)	movq		[esi + 8*21], mm4	;p1(8,23)	movq		[esi + 8*19], mm3	;p1(8,22)	movq		mm4, mm0			;p1(11,0)	punpcklwd	mm4, mm1			;p1(11,2)	movq		mm2, [ebx + 8*10]	;p1(12,0)	punpckhwd	mm5, mm1			;p1(11,4)		pmullw		mm2, [edi + 8*10]	;p1(12,1)	movq		mm3, [ebx + 8*2]	;p1(13,0)	pmullw		mm3, [edi + 8*2]	;p1(13,1)	movq		mm6, mm2			;p1(14,0)	pmaddwd		mm4, fix_117_117	;p1(11,3)	movq		mm7, mm2			;p1(14,1)		pmaddwd		mm5, fix_117_117	;p1(11,5)	punpcklwd	mm6, mm3			;p1(14,2)	pmaddwd		mm6, fix_117_117	;p1(14,3)	punpckhwd	mm7, mm3			;p1(14,4)		pmaddwd		mm7, fix_117_117	;p1(14,5)	paddd		mm4, mm6			;p1(15,0)	paddd		mm5, mm7			;p1(15,1)   	movq		[esi+8*24], mm4		;p1(15,2)	movq		[esi+8*25], mm5		;p1(15,3)	movq		mm6, mm0				;p1(16,0)	movq		mm7, mm3				;p1(16,3)	punpcklwd	mm6, mm2				;p1(16,1)		punpcklwd	mm7, mm3				;p1(16,4)	pmaddwd		mm6, fix_n039_n089		;p1(16,2)	pmaddwd		mm7, fix_150_n089n039	;p1(16,5)	movq		mm4, mm0				;p1(16,12)	paddd		mm6, [esi+8*24]			;p1(16,6)	punpckhwd	mm4, mm2				;p1(16,13)	paddd		mm6, mm7				;p1(16,7)	pmaddwd		mm4, fix_n039_n089		;p1(16,14)	movq		mm7, mm6				;p1(16,8)	paddd		mm4, [esi+8*25]			;p1(16,18)	movq		mm5, mm3				;p1(16,15)	paddd		mm6, [esi + 8*16]		;p1(16,9)	punpckhwd	mm5, mm3				;p1(16,16)	paddd		mm6, const_0x2xx8		;p1(16,10)		psrad		mm6, 9					;p1(16,11)	pmaddwd		mm5, fix_150_n089n039	;p1(16,17)	paddd		mm4, mm5				;p1(16,19)	movq		mm5, mm4				;p1(16,20)	paddd		mm4, [esi + 8*17]		;p1(16,21)		paddd		mm4, const_0x2xx8		;p1(16,22)		psrad		mm4, 9					;p1(16,23)	packssdw	mm6, mm4				;p1(16,24)		movq		[esi + 8*0], mm6		;p1(16,25)			movq		mm4, [esi + 8*16]		;p1(16,26)		psubd		mm4, mm7				;p1(16,27)	movq		
mm6, [esi + 8*17]		;p1(16,30)		paddd		mm4, const_0x2xx8		;p1(16,28)	movq		mm7, mm1				;p1(17,3)	psrad		mm4, 9					;p1(16,29)	psubd		mm6, mm5				;p1(16,31)	paddd		mm6, const_0x2xx8		;p1(16,32)	punpcklwd	mm7, mm1				;p1(17,4)	pmaddwd		mm7, fix_307n256_n196	;p1(17,5)	psrad		mm6, 9					;p1(16,33)	packssdw	mm4, mm6				;p1(16,34)	movq		[esi + 8*14], mm4		;p1(16,35)	movq		mm6, mm0				;p1(17,0)	movq		mm4, mm0				;p1(17,12)	punpcklwd	mm6, mm2				;p1(17,1)	punpckhwd	mm4, mm2				;p1(17,13)		pmaddwd		mm6, fix_n256_n196		;p1(17,2)	movq		mm5, mm1				;p1(17,15)	paddd		mm6, [esi+8*24]			;p1(17,6)	punpckhwd	mm5, mm1				;p1(17,16)		paddd		mm6, mm7				;p1(17,7)
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -