mmxidct.cpp

来自「从FFMPEG转换而来的H264解码程序,VC下编译..」· C++ 代码 · 共 1,435 行 · 第 1/4 页
CPP
1,435 行
//==========================================================================
//
//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
//  PURPOSE.
//
//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
//
//--------------------------------------------------------------------------


/****************************************************************************
*
*   Module Title :     IDCTPart.c
*
*   Description  :     IDCT with multiple versions based on # of non 0 coeffs
*
*
*****************************************************************************
*/

// Dequantization + inverse discrete cosine transform.

#include "ogg/ogg.h"
#include <math.h>
#include <memory.h>
#include <inttypes.h>
#include "simd.h"

// Constants used in MMX implementation of dequantization and idct.
// All the MMX stuff works with 4 16-bit quantities at a time and
// we create 11 constants of size 4 x 16 bits.
// The first 4 are used to mask the individual 16-bit words within a group
// and are used in the address-shuffling part of the dequantization.
// The last 7 are fixed-point approximations to the cosines of angles
// occurring in the DCT; each of these contains 4 copies of the same value.

// There is only one (statically initialized) instance of this object
// wrapped in an allocator object that forces its starting address
// to be evenly divisible by 32.  Hence the actual object occupies 2.75
// cache lines on a Pentium processor.

// Offsets in bytes used by the assembler code below
// must of course agree with the idctConstants constructor.

#define MaskOffset 0		// 4 masks come in order low word to high
#define CosineOffset 32		// 7 cosines come in order pi/16 * (1 ... 7)
#define EightOffset 88
#define IdctAdjustBeforeShift 8
#pragma warning( disable : 4799 )  // Disable no emms instruction warning!

static ogg_int16_t idctconstants[(4+7+1) * 4];
static const ogg_int16_t idctcosTbl[ 7] =
{
	64277, 60547, 54491, 46341, 36410, 25080, 12785
};

extern "C" void fillidctconstants(void)
{
	int j = 16;
	ogg_int16_t * p;
	do
	{
		idctconstants[ --j] = 0;
	}
	while( j);

	idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;

	j = 1;
	do
	{
		p = idctconstants + ( (j+3) << 2);
		p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
	}
	while( ++j <= 7);

	idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
}

/* Dequantization + inverse DCT.

   Dequantization multiplies user's 16-bit signed indices (range -512 to +511)
   by unsigned 16-bit quantization table entries.
   These table entries are upscaled by 4, max is 30 * 128 * 4 < 2^14.
   Result is scaled signed DCT coefficients (abs value < 2^15).

   In the data stream, the coefficients are sent in order of increasing
   total (horizontal + vertical) frequency.  The exact picture is as follows:

	00 01 05 06  16 17 33 34
	02 04 07 15  20 32 35 52
	03 10 14 21  31 36 51 53
	11 13 22 30  37 50 54 65

	12 23 27 40  47 55 64 66
	24 26 41 46	 56 63 67 74
	25 42 45 57  62 70 73 75
	43 44 60 61  71 72 76 77

   Here the position in the matrix corresponds to the (horiz,vert)
   freqency indices and the octal entry in the matrix is the position
   of the coefficient in the data stream.  Thus the coefficients are sent
   in sort of a diagonal "snake".

   The dequantization stage "uncurls the snake" and stores the expanded
   coefficients in more convenient positions.  These are not exactly the
   natural positions given above but take into account our implementation
   of the idct, which basically requires two one-dimensional idcts and
   two transposes.

   We fold the first transpose into the storage of the expanded coefficients.
   We don't actually do a full transpose because this would require doubling
   the size of the idct buffer; rather, we just transpose each of the 4x4
   subblocks.  Using slightly varying addressing schemes in each of the
   four 4x8 idcts then allows these transforms to be done in place.

   Transposing the 4x4 subblocks in the matrix above gives

	00 02 03 11  16 20 31 37
	01 04 10 13  17 32 36 50
	05 07 14 22  33 35 51 54
	06 15 21 30  34 52 53 65

	12 24 25 43  47 56 62 71
	23 26 42 44  55 63 70 72
	27 41 45 60  64 67 73 76
	40 46 57 61  66 74 75 77

   Finally, we reverse the words in each 4 word group to clarify
   direction of shifts.

	11 03 02 00  37 31 20 16
	13 10 04 01  50 36 32 17
	22 14 07 05	 54 51 35 33
	30 21 15 06	 65 53 52 34

	43 25 24 12	 71 62 56 47
	44 42 26 23  72 70 63 55
	60 45 41 27	 76 73 67 64
	61 57 46 40  77 75 74 66

   This matrix then shows the 16 4x16 destination words in terms of
   the 16 4x16 input words.

   We implement this algorithm by manipulation of mmx registers,
   which seems to be the fastest way to proceed.  It is completely
   hand-written; there does not seem to be enough recurrence to
   reasonably compartmentalize any of it.  Hence the resulting
   program is ugly and bloated.  Furthermore, due to the absence of
   register pressure, it is boring and artless.	 I hate it.

   The idct itself is more interesting.  Since the two-dimensional dct
   basis functions are products of the one-dimesional dct basis functions,
   we can compute an inverse (or forward) dct via two 1-D transforms,
   on rows then on columns.  To exploit MMX parallelism, we actually do
   both operations on columns, interposing a (partial) transpose between
   the two 1-D transforms, the first transpose being done by the expansion
   described above.

   The 8-sample one-dimensional DCT is a standard orthogonal expansion using
   the (unnormalized) basis functions

	b[k]( i) = cos( pi * k * (2i + 1) / 16);

   here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
   To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
   should be multiplied by 1/2.

   The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
   in each direction.  The (unnormalized) basis functions are

	B[k,l]( i, j) = b[k]( i) * b[l]( j);

   this time k and l are the horizontal and vertical frequencies,
   i and j are the horizontal and vertical spatial coordinates;
   all indices vary from 0 ... 7 (as above)
   and there are now 4 cases of normalization.

   Our 1-D idct expansion uses constants C1 ... C7 given by

   	(*)  Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)

   and the following 1-D algorithm transforming I0 ... I7  to  R0 ... R7 :

   A = (C1 * I1) + (C7 * I7)		B = (C7 * I1) - (C1 * I7)
   C = (C3 * I3) + (C5 * I5)		D = (C3 * I5) - (C5 * I3)
   A. = C4 * (A - C)				B. = C4 * (B - D)
   C. = A + C						D. = B + D

   E = C4 * (I0 + I4)				F = C4 * (I0 - I4)
   G = (C2 * I2) + (C6 * I6)		H = (C6 * I2) - (C2 * I6)
   E. = E - G
   G. = E + G

   A.. = F + A.					B.. = B. - H
   F.  = F - A. 				H.  = B. + H

   R0 = G. + C.	R1 = A.. + H.	R3 = E. + D.	R5 = F. + B..
   R7 = G. - C.	R2 = A.. - H.	R4 = E. - D.	R6 = F. - B..

   It is due to Vetterli and Lightenberg and may be found in the JPEG
   reference book by Pennebaker and Mitchell.

   Correctness of the algorithm follows from (*) together with the
   addition formulas for sine and cosine:

	cos( A + B) = cos( A) * cos( B)  -  sin( A) * sin( B)
	sin( A + B) = sin( A) * cos( B)  +  cos( A) * sin( B)

   Note that this implementation absorbs the difference in normalization
   between the 0th and higher frequencies, although the results produced
   are actually twice as big as they should be.  Since we do this for each
   dimension, the 2-D idct results are 4x the desired results.  Finally,
   taking into account that the dequantization multiplies by 4 as well,
   our actual results are 16x too big.  We fix this by shifting the final
   results right by 4 bits.

   High precision version approximates C1 ... C7 to 16 bits.
   Since MMX only provides a signed multiply, C1 ... C5 appear to be
   negative and multiplies involving them must be adjusted to compensate
   for this.  C6 and C7 do not require this adjustment since
   they are < 1/2 and are correctly treated as positive numbers.

   Following macro does four 8-sample one-dimensional idcts in parallel.
   This is actually not such a difficult program to write once you
   make a couple of observations (I of course was unable to make these
   observations until I'd half-written a couple of other versions).

	1. Everything is easy once you are done with the multiplies.
	   This is because, given X and Y in registers, one may easily
	   calculate X+Y and X-Y using just those 2 registers.

	2. You always need at least 2 extra registers to calculate products,
	   so storing 2 temporaries is inevitable.  C. and D. seem to be
	   the best candidates.

	3. The products should be calculated in decreasing order of complexity
	   (which translates into register pressure).  Since C1 ... C5 require
	   adjustment (and C6, C7 do not), we begin by calculating C and D.
*/

/**************************************************************************************
 *
 *		Routine:		BeginIDCT
 *
 *		Description:	The Macro does IDct on 4 1-D Dcts
 *
 *		Input:			None
 *
 *		Output:			None
 *
 *		Return:			None
 *
 *		Special Note:	None
 *
 *		Error:			None
 *
 ***************************************************************************************
 */

template<class Ti,class Tj,class Tc> static __forceinline void BeginIDCT(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J, Tc C)
{
		movq		(r2, I(3)  );

		movq		(r6, C(3) );
		 movq		(r4, r2 );
		movq		(r7, J(5) );
		 pmulhw		(r4, r6		);/* r4 = c3*i3 - i3 */
		movq		(r1, C(5) );
		 pmulhw		(r6, r7		);/* r6 = c3*i5 - i5 */
		movq		(r5, r1);
		 pmulhw		(r1, r2		);/* r1 = c5*i3 - i3 */
		movq		(r3, I(1) );
		 pmulhw		(r5, r7		);/* r5 = c5*i5 - i5 */
		movq		(r0, C(1)	);/* (all registers are in use) */
		 paddw		(r4, r2		);/* r4 = c3*i3 */
		paddw		(r6, r7		);/* r6 = c3*i5 */
		 paddw		(r2, r1		);/* r2 = c5*i3 */
		movq		(r1, J(7) );
		 paddw		(r7, r5		);/* r7 = c5*i5 */
		movq		(r5, r0		);/* r5 = c1 */
		 pmulhw		(r0, r3		);/* r0 = c1*i1 - i1 */
		paddsw		(r4, r7		);/* r4 = C = c3*i3 + c5*i5 */
		 pmulhw		(r5, r1		);/* r5 = c1*i7 - i7 */
		movq		(r7, C(7) );
		 psubsw		(r6, r2		);/* r6 = D = c3*i5 - c5*i3  (done w/r2) */
		paddw		(r0, r3		);/* r0 = c1*i1 */
		 pmulhw		(r3, r7		);/* r3 = c7*i1 */
		movq		(r2, I(2) );
		 pmulhw		(r7, r1		);/* r7 = c7*i7 */
		paddw		(r5, r1		);/* r5 = c1*i7 */
		 movq		(r1, r2		);/* r1 = i2 */
		pmulhw		(r2, C(2)	);/* r2 = c2*i2 - i2 */
		 psubsw		(r3, r5		);/* r3 = B = c7*i1 - c1*i7 */
		movq		(r5, J(6) );
		 paddsw		(r0, r7		);/* r0 = A = c1*i1 + c7*i7 */
		movq		(r7, r5		);/* r7 = i6 */
		 psubsw		(r0, r4		);/* r0 = A - C */
		pmulhw		(r5, C(2)	);/* r5 = c2*i6 - i6 */
		 paddw		(r2, r1		);/* r2 = c2*i2 */
		pmulhw		(r1, C(6)	);/* r1 = c6*i2 */
		 paddsw		(r4, r4		);/* r4 = C + C */
		paddsw		(r4, r0		);/* r4 = C. = A + C */
		 psubsw		(r3, r6		);/* r3 = B - D */
		paddw		(r5, r7		);/* r5 = c2*i6 */
		 paddsw		(r6, r6		);/* r6 = D + D */
		pmulhw		(r7, C(6)	);/* r7 = c6*i6 */
		 paddsw		(r6, r3		);/* r6 = D. = B + D */
		movq		(I(1), r4	);/* save C. at I(1) */
		 psubsw		(r1, r5		);/* r1 = H = c6*i2 - c2*i6 */
		movq		(r4, C(4) );
		 movq		(r5, r3		);/* r5 = B - D */
		pmulhw		(r3, r4		);/* r3 = (c4 - 1) * (B - D) */
		 paddsw		(r7, r2		);/* r7 = G = c6*i6 + c2*i2 */
		movq		(I(2), r6	);/* save D. at I(2) */
		 movq		(r2, r0		);/* r2 = A - C */
		movq		(r6, I(0) );
		 pmulhw		(r0, r4		);/* r0 = (c4 - 1) * (A - C) */
		paddw		(r5, r3		);/* r5 = B. = c4 * (B - D) */

		movq		(r3, J(4) );
		 psubsw		(r5, r1		);/* r5 = B.. = B. - H */
		paddw		(r2, r0		);/* r0 = A. = c4 * (A - C) */
		 psubsw		(r6, r3		);/* r6 = i0 - i4 */
		movq		(r0, r6 );
		 pmulhw		(r6, r4		);/* r6 = (c4 - 1) * (i0 - i4) */
		paddsw		(r3, r3		);/* r3 = i4 + i4 */
		 paddsw		(r1, r1		);/* r1 = H + H */
		paddsw		(r3, r0		);/* r3 = i0 + i4 */
		 paddsw		(r1, r5		);/* r1 = H. = B + H */
		pmulhw		(r4, r3		);/* r4 = (c4 - 1) * (i0 + i4) */
		 paddsw		(r6, r0		);/* r6 = F = c4 * (i0 - i4) */
		psubsw		(r6, r2		);/* r6 = F. = F - A. */
		 paddsw		(r2, r2		);/* r2 = A. + A. */
		movq		(r0, I(1)	);/* r0 = C. */
		 paddsw		(r2, r6		);/* r2 = A.. = F + A. */
		paddw		(r4, r3		);/* r4 = E = c4 * (i0 + i4) */
		 psubsw		(r2, r1		);/* r2 = R2 = A.. - H. */
}
// end BeginIDCT macro (38 cycle(s).

// Two versions of the end of th(e idct depending on whether we're feeding
// into a transpose or dividing the final results by 16 and storing them.

/**************************************************************************************
 *
 *		Routine:		RowIDCT
 *
 *		Description:	The Macro does 1-D IDct on 4 Rows
 *
 *		Input:			None
 *
 *		Output:			None
 *
 *		Return:			None
 *
 *		Special Note:	None
mmxidct.cpp - 源码说明

本页面展示了「从FFMPEG转换而来的H264解码程序,VC下编译..」中的 mmxidct.cpp 源码文件，采用 C++ 编程语言编写，共 1,435 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与FFMPEG相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?