📄 mmxidct.cpp
字号:
//==========================================================================
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
// PURPOSE.
//
// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
//
//--------------------------------------------------------------------------
/****************************************************************************
*
* Module Title : IDCTPart.c
*
* Description : IDCT with multiple versions based on # of non 0 coeffs
*
*
*****************************************************************************
*/
// Dequantization + inverse discrete cosine transform.
#include "ogg/ogg.h"
#include <math.h>
#include <memory.h>
#include <inttypes.h>
#include "simd.h"
// Constants used in MMX implementation of dequantization and idct.
// All the MMX stuff works with 4 16-bit quantities at a time and
// we create 11 constants of size 4 x 16 bits.
// The first 4 are used to mask the individual 16-bit words within a group
// and are used in the address-shuffling part of the dequantization.
// The last 7 are fixed-point approximations to the cosines of angles
// occurring in the DCT; each of these contains 4 copies of the same value.
// There is only one (statically initialized) instance of this object
// wrapped in an allocator object that forces its starting address
// to be evenly divisible by 32. Hence the actual object occupies 2.75
// cache lines on a Pentium processor.
// Offsets in bytes used by the assembler code below
// must of course agree with the idctConstants constructor.
#define MaskOffset 0 // 4 masks come in order low word to high
#define CosineOffset 32 // 7 cosines come in order pi/16 * (1 ... 7)
#define EightOffset 88
#define IdctAdjustBeforeShift 8
#pragma warning( disable : 4799 ) // Disable no emms instruction warning!
static ogg_int16_t idctconstants[(4+7+1) * 4];
static const ogg_int16_t idctcosTbl[ 7] =
{
64277, 60547, 54491, 46341, 36410, 25080, 12785
};
extern "C" void fillidctconstants(void)
{
int j = 16;
ogg_int16_t * p;
do
{
idctconstants[ --j] = 0;
}
while( j);
idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;
j = 1;
do
{
p = idctconstants + ( (j+3) << 2);
p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
}
while( ++j <= 7);
idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
}
/* Dequantization + inverse DCT.
Dequantization multiplies user's 16-bit signed indices (range -512 to +511)
by unsigned 16-bit quantization table entries.
These table entries are upscaled by 4, max is 30 * 128 * 4 < 2^14.
Result is scaled signed DCT coefficients (abs value < 2^15).
In the data stream, the coefficients are sent in order of increasing
total (horizontal + vertical) frequency. The exact picture is as follows:
00 01 05 06 16 17 33 34
02 04 07 15 20 32 35 52
03 10 14 21 31 36 51 53
11 13 22 30 37 50 54 65
12 23 27 40 47 55 64 66
24 26 41 46 56 63 67 74
25 42 45 57 62 70 73 75
43 44 60 61 71 72 76 77
Here the position in the matrix corresponds to the (horiz,vert)
freqency indices and the octal entry in the matrix is the position
of the coefficient in the data stream. Thus the coefficients are sent
in sort of a diagonal "snake".
The dequantization stage "uncurls the snake" and stores the expanded
coefficients in more convenient positions. These are not exactly the
natural positions given above but take into account our implementation
of the idct, which basically requires two one-dimensional idcts and
two transposes.
We fold the first transpose into the storage of the expanded coefficients.
We don't actually do a full transpose because this would require doubling
the size of the idct buffer; rather, we just transpose each of the 4x4
subblocks. Using slightly varying addressing schemes in each of the
four 4x8 idcts then allows these transforms to be done in place.
Transposing the 4x4 subblocks in the matrix above gives
00 02 03 11 16 20 31 37
01 04 10 13 17 32 36 50
05 07 14 22 33 35 51 54
06 15 21 30 34 52 53 65
12 24 25 43 47 56 62 71
23 26 42 44 55 63 70 72
27 41 45 60 64 67 73 76
40 46 57 61 66 74 75 77
Finally, we reverse the words in each 4 word group to clarify
direction of shifts.
11 03 02 00 37 31 20 16
13 10 04 01 50 36 32 17
22 14 07 05 54 51 35 33
30 21 15 06 65 53 52 34
43 25 24 12 71 62 56 47
44 42 26 23 72 70 63 55
60 45 41 27 76 73 67 64
61 57 46 40 77 75 74 66
This matrix then shows the 16 4x16 destination words in terms of
the 16 4x16 input words.
We implement this algorithm by manipulation of mmx registers,
which seems to be the fastest way to proceed. It is completely
hand-written; there does not seem to be enough recurrence to
reasonably compartmentalize any of it. Hence the resulting
program is ugly and bloated. Furthermore, due to the absence of
register pressure, it is boring and artless. I hate it.
The idct itself is more interesting. Since the two-dimensional dct
basis functions are products of the one-dimesional dct basis functions,
we can compute an inverse (or forward) dct via two 1-D transforms,
on rows then on columns. To exploit MMX parallelism, we actually do
both operations on columns, interposing a (partial) transpose between
the two 1-D transforms, the first transpose being done by the expansion
described above.
The 8-sample one-dimensional DCT is a standard orthogonal expansion using
the (unnormalized) basis functions
b[k]( i) = cos( pi * k * (2i + 1) / 16);
here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
should be multiplied by 1/2.
The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
in each direction. The (unnormalized) basis functions are
B[k,l]( i, j) = b[k]( i) * b[l]( j);
this time k and l are the horizontal and vertical frequencies,
i and j are the horizontal and vertical spatial coordinates;
all indices vary from 0 ... 7 (as above)
and there are now 4 cases of normalization.
Our 1-D idct expansion uses constants C1 ... C7 given by
(*) Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)
and the following 1-D algorithm transforming I0 ... I7 to R0 ... R7 :
A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
A. = C4 * (A - C) B. = C4 * (B - D)
C. = A + C D. = B + D
E = C4 * (I0 + I4) F = C4 * (I0 - I4)
G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
E. = E - G
G. = E + G
A.. = F + A. B.. = B. - H
F. = F - A. H. = B. + H
R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
It is due to Vetterli and Lightenberg and may be found in the JPEG
reference book by Pennebaker and Mitchell.
Correctness of the algorithm follows from (*) together with the
addition formulas for sine and cosine:
cos( A + B) = cos( A) * cos( B) - sin( A) * sin( B)
sin( A + B) = sin( A) * cos( B) + cos( A) * sin( B)
Note that this implementation absorbs the difference in normalization
between the 0th and higher frequencies, although the results produced
are actually twice as big as they should be. Since we do this for each
dimension, the 2-D idct results are 4x the desired results. Finally,
taking into account that the dequantization multiplies by 4 as well,
our actual results are 16x too big. We fix this by shifting the final
results right by 4 bits.
High precision version approximates C1 ... C7 to 16 bits.
Since MMX only provides a signed multiply, C1 ... C5 appear to be
negative and multiplies involving them must be adjusted to compensate
for this. C6 and C7 do not require this adjustment since
they are < 1/2 and are correctly treated as positive numbers.
Following macro does four 8-sample one-dimensional idcts in parallel.
This is actually not such a difficult program to write once you
make a couple of observations (I of course was unable to make these
observations until I'd half-written a couple of other versions).
1. Everything is easy once you are done with the multiplies.
This is because, given X and Y in registers, one may easily
calculate X+Y and X-Y using just those 2 registers.
2. You always need at least 2 extra registers to calculate products,
so storing 2 temporaries is inevitable. C. and D. seem to be
the best candidates.
3. The products should be calculated in decreasing order of complexity
(which translates into register pressure). Since C1 ... C5 require
adjustment (and C6, C7 do not), we begin by calculating C and D.
*/
/**************************************************************************************
*
* Routine: BeginIDCT
*
* Description: The Macro does IDct on 4 1-D Dcts
*
* Input: None
*
* Output: None
*
* Return: None
*
* Special Note: None
*
* Error: None
*
***************************************************************************************
*/
template<class Ti,class Tj,class Tc> static __forceinline void BeginIDCT(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J, Tc C)
{
movq (r2, I(3) );
movq (r6, C(3) );
movq (r4, r2 );
movq (r7, J(5) );
pmulhw (r4, r6 );/* r4 = c3*i3 - i3 */
movq (r1, C(5) );
pmulhw (r6, r7 );/* r6 = c3*i5 - i5 */
movq (r5, r1);
pmulhw (r1, r2 );/* r1 = c5*i3 - i3 */
movq (r3, I(1) );
pmulhw (r5, r7 );/* r5 = c5*i5 - i5 */
movq (r0, C(1) );/* (all registers are in use) */
paddw (r4, r2 );/* r4 = c3*i3 */
paddw (r6, r7 );/* r6 = c3*i5 */
paddw (r2, r1 );/* r2 = c5*i3 */
movq (r1, J(7) );
paddw (r7, r5 );/* r7 = c5*i5 */
movq (r5, r0 );/* r5 = c1 */
pmulhw (r0, r3 );/* r0 = c1*i1 - i1 */
paddsw (r4, r7 );/* r4 = C = c3*i3 + c5*i5 */
pmulhw (r5, r1 );/* r5 = c1*i7 - i7 */
movq (r7, C(7) );
psubsw (r6, r2 );/* r6 = D = c3*i5 - c5*i3 (done w/r2) */
paddw (r0, r3 );/* r0 = c1*i1 */
pmulhw (r3, r7 );/* r3 = c7*i1 */
movq (r2, I(2) );
pmulhw (r7, r1 );/* r7 = c7*i7 */
paddw (r5, r1 );/* r5 = c1*i7 */
movq (r1, r2 );/* r1 = i2 */
pmulhw (r2, C(2) );/* r2 = c2*i2 - i2 */
psubsw (r3, r5 );/* r3 = B = c7*i1 - c1*i7 */
movq (r5, J(6) );
paddsw (r0, r7 );/* r0 = A = c1*i1 + c7*i7 */
movq (r7, r5 );/* r7 = i6 */
psubsw (r0, r4 );/* r0 = A - C */
pmulhw (r5, C(2) );/* r5 = c2*i6 - i6 */
paddw (r2, r1 );/* r2 = c2*i2 */
pmulhw (r1, C(6) );/* r1 = c6*i2 */
paddsw (r4, r4 );/* r4 = C + C */
paddsw (r4, r0 );/* r4 = C. = A + C */
psubsw (r3, r6 );/* r3 = B - D */
paddw (r5, r7 );/* r5 = c2*i6 */
paddsw (r6, r6 );/* r6 = D + D */
pmulhw (r7, C(6) );/* r7 = c6*i6 */
paddsw (r6, r3 );/* r6 = D. = B + D */
movq (I(1), r4 );/* save C. at I(1) */
psubsw (r1, r5 );/* r1 = H = c6*i2 - c2*i6 */
movq (r4, C(4) );
movq (r5, r3 );/* r5 = B - D */
pmulhw (r3, r4 );/* r3 = (c4 - 1) * (B - D) */
paddsw (r7, r2 );/* r7 = G = c6*i6 + c2*i2 */
movq (I(2), r6 );/* save D. at I(2) */
movq (r2, r0 );/* r2 = A - C */
movq (r6, I(0) );
pmulhw (r0, r4 );/* r0 = (c4 - 1) * (A - C) */
paddw (r5, r3 );/* r5 = B. = c4 * (B - D) */
movq (r3, J(4) );
psubsw (r5, r1 );/* r5 = B.. = B. - H */
paddw (r2, r0 );/* r0 = A. = c4 * (A - C) */
psubsw (r6, r3 );/* r6 = i0 - i4 */
movq (r0, r6 );
pmulhw (r6, r4 );/* r6 = (c4 - 1) * (i0 - i4) */
paddsw (r3, r3 );/* r3 = i4 + i4 */
paddsw (r1, r1 );/* r1 = H + H */
paddsw (r3, r0 );/* r3 = i0 + i4 */
paddsw (r1, r5 );/* r1 = H. = B + H */
pmulhw (r4, r3 );/* r4 = (c4 - 1) * (i0 + i4) */
paddsw (r6, r0 );/* r6 = F = c4 * (i0 - i4) */
psubsw (r6, r2 );/* r6 = F. = F - A. */
paddsw (r2, r2 );/* r2 = A. + A. */
movq (r0, I(1) );/* r0 = C. */
paddsw (r2, r6 );/* r2 = A.. = F + A. */
paddw (r4, r3 );/* r4 = E = c4 * (i0 + i4) */
psubsw (r2, r1 );/* r2 = R2 = A.. - H. */
}
// end BeginIDCT macro (38 cycle(s).
// Two versions of the end of th(e idct depending on whether we're feeding
// into a transpose or dividing the final results by 16 and storing them.
/**************************************************************************************
*
* Routine: RowIDCT
*
* Description: The Macro does 1-D IDct on 4 Rows
*
* Input: None
*
* Output: None
*
* Return: None
*
* Special Note: None
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -