📄 fdct3dn.cpp
字号:
// z13 = (*tmp7) - (*z3);
#define z13z11 t7t7
#define out5out1 mm4
#define out3out7 z13z11
// 7d) [z13,z11] <= [tmp7,tmp7] - [z3,-z3]
// PFSUB( z13z11, z3mz3); // z13z11 = [z13,z11]
EMIT 0x0f
EMIT 0x0f
EMIT 0xee //((0xc5 & 0x3f) << 3) | 0xc6
EMIT 0x9a
movq out5out1, z2z4; // begin forming out5out1
// dataptr[5] = z13 + (*z2); // phase 6
// dataptr[3] = z13 - (*z2);
// dataptr[1] = z11 + (*z4);
// dataptr[7] = z11 - (*z4);
// 8a) [dataptr5,dataptr1] <= [z13,z11] + [z2,z4]
// 8b) [dataptr3,dataptr7] <= [z13,z11] - [z2,z4]
// PFADD( out5out1, z13z11); // produce [out5,out1]
EMIT 0x0f
EMIT 0x0f
EMIT 0xe5 //((0xc4 & 0x3f) << 3) | 0xc5
EMIT 0x9e
// PFSUB( out3out7, z2z4 ); // produce [out3,out7]
EMIT 0x0f
EMIT 0x0f
EMIT 0xe8 //((0xc5 & 0x3f) << 3) | 0xc0
EMIT 0x9a
movd [edx + 4*1], out5out1; // store out[1]
// movd [edx + 32*1], out5out1; // store out[1]
psrlq out5out1, 32; // out5out1 <= [ 000, out5 ]
movd [edx + 4*7], out3out7; // store out[7]
// movd [edx + 32*7], out3out7; // store out[7]
psrlq out3out7, 32; // out3out7 <= [ 000, out3 ]
movd [edx + 4*5], out5out1; // store out[5]
// movd [edx + 32*5], out5out1; // store out[5]
sub edi, 0x01; // i = i - 1
movd [edx + 4*3], out3out7; // store out[3]
// movd [edx + 32*3], out3out7; // store out[3]
add edx, 32; // 32=4*8, outptr += 8 (floats)
// add edx, 4; // 4=4*1, outptr += 1 (floats)
cmp edi, 0x00; // end for ( i=8; i > 0; i=i-1)
jg tdn_dct_col1; // branch until (edi == 0)
// FEMMS;
// };
// dataptr2 += 8; // advance pointer to next row
// blkptr += 8;
// } // end for (i=0; i<8; i++);
// --------------------------- end of dct_column processing
// output matrix is transposed with respect to input. Before this result
// can be used, the matrix must be transposed.
// descale and transpose the output
// dataptr2 = &data[0];
/*
// To enable precise rounding uncomment this C-code, and the #defines
// near the beginning of this file (important!)
// of course, this code will run much slower!
for (i = 0; i < 8; i++)
for ( j = 0; j < 8; j++)
// scale, round, and transpose output matrix
block[i+(j<<3)] = (short int) floor(dataptr2[j+(i<<3)] * local_aanscales[i+(j<<3)] + 0.5);
*/
// The following loop transposes and descales the 3dn_dct result.
// There is a slight systematic error in the descaling algorithm.
// The x86/x87 float->int convert uses a truncation policy :
// The floating-point value's least significant digit (LSD) is
// rounded to integer of lesser MAGNITUDE. For positive numbers, this
// policy is the same as standard-truncation (+5.x -> +5.0)
// But problems arise with negative numbers : -5.x(float) -> -5(int),
// For negative numbers, the policy is *not* truncation, but rather
// unconditional-roundup.
//
// Since negative#'s and positive#'s are rounded differently, the naive
// "add +0.5" compensation method fails. This rounding issue ruins an
// otherwise "correct" f_DCT algorithm.
//
// To correct the rounding problem, negative numbers are 'precompensated'
// with '-0.5'. Positive numbers continue to be 'precompensated' with '+0.5'.
// This policy produces accurate rounding, but is data-dependent :
// The CPU must examine each input float-value in order to select the
// proper rounding compensation-value (-0.5 or +0.5.)
//
// In practice, data-decision policies are to be avoided.
//
// An alternative "fast-round"-algorithm offers faster performance at the
// expense of accuracy. The float-value is first multiplied by a
// factor:"2^N", where N is an integer. Then, the scaled-float is
// converted to integer using the standard CPU instruction (yes, the one
// with the troublesome truncation-policy.) Now, the scaled-integer is
// summed with a scaled-precompensation value ("+2^(N-1)".) During the
// last step, the compensated-integer is descaled by right shifting
// N-bits, yielding the final result.
//
// The forward(float)-scaling and back(int)-descaling effectively
// "pushes" the rounding-inaccuracy far to the right of the decimal-point,
// where its impact is less. Using this method, negative#s still exhibit
// a slight-shift (the shift is exactly equal to 2^(-N)), but with a
// suitably large N, the shift is negligible.
// For N=15, the shift is 1 part per 32768, or 1/32768 (~0.0000305)
// As a final note, the -0.5/+0.5 (data-dependent) rounding policy can
// be implemented using the partition-compare MMX instructions pcmpXXX.
// Statistical tests for both precise and imprecise rounding policies
// were compared. For N=15, the tests revealed no loss in precision
// for the "fast-round" approach.
// row col-> (source) col-> (destination)
// # 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// - --------------- -----------------
// 0 A I ... A B C D E F G H
// 1 B J ... I J K L M N O P
// 2 C K ...(source) ------> (destination)
// 3 D L ...
//
// 4 E M ...
// 5 F N ...
// 6 G O ...
// 7 H P ...
#define mmRoundup mm7
mov eax, dword ptr [dataptr2];// eax = upper half of source matrix
mov edi, 0x04; // edi = 'i' // for ( i = 4; i > 0; i=i-1 )
lea ebx, dword ptr [local_aanscales];//ebx <= local_aanscales[] (table of constants)
mov ecx, eax; // copy eax -> ecx
movq mmRoundup, [mmMaskRnd]; // "round up" (+0.5,+0.5) mask
add ecx, 128; // ecx = lower half of source matrix
mov edx, dword ptr [block]; // edx <= block[] (output)
sub ebx, 64;
tdn_dct_postproc: // 3d_now_dct post-processing jump-point
// Each loop iteration converts a 2x8 (colxrow) segment. The loop
// executes 4 times to produce the final 8x8 output matrix.
movq mm0, qword ptr [eax+32*0]; // mm0 <= (1,0 : 0) [A,I]
add ebx, 64;
movq mm2, qword ptr [eax+32*1]; // mm2 <= (1,0 : 1) [B,J]
movq mm1, mm0; // mm1 = copy of (1,0 : 0) [A,I]
movq mm3, qword ptr [eax+32*2]; // mm3 <= (1,0 : 2) [C,K]
punpckldq mm0,mm2; // mm0 = ( 0 : 1,0 ) [B,A]
movq mm4, qword ptr [eax+32*3]; // mm4 <= (1,0 : 3) [D,L]
punpckhdq mm1, mm2; // mm1 = ( 1 : 1,0 ) [J,I]
movq mm2, mm3; // mm2 = copy of (1,0 : 2) [C,K]
// PFMULM( mm0, _ebx, 0 ); // mm0 <= scale ( 0 : 1,0 ) [B,A]
EMIT 0x0f
EMIT 0x0f
EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 0
EMIT 0xb4
punpckldq mm2, mm4; // mm2 = ( 0 : 3,2 ) [D,C]
// PFMULM( mm1, _ebx, 0+32 ); // mm1 <= scale ( 1 : 1,0 ) [J,I]
EMIT 0x0f
EMIT 0x0f
EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 0+32
EMIT 0xb4
punpckhdq mm3, mm4; // mm3 = ( 1 : 3,2 ) [L,K]
// PFMULM( mm2, _ebx, 8 ); // mm2 <= scale ( 0 : 3,2 ) [D,C]
EMIT 0x0f
EMIT 0x0f
EMIT 0x53 //(((0xc2 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 8
EMIT 0xb4
// PFMULM( mm3, _ebx, 8+32 ); // mm3 <= scale ( 1: 3,2 ) [L,K]
// PF2ID( mm0, mm0 ); // mm0 <= float -> integer [B,A]
EMIT 0x0f
EMIT 0x0f
EMIT 0x5b //(((0xc3 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 8+32
EMIT 0xb4
EMIT 0x0f
EMIT 0x0f
EMIT 0xc0 //((0xc0 & 0x3f) << 3) | 0xc0
EMIT 0x1d
// PF2ID( mm1, mm1 ); // mm1 <= float -> integer [J,I]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1
EMIT 0x1d
add eax, 8;
// PF2ID( mm2, mm2 ); // mm2 <= float -> integer [D,C]
EMIT 0x0f
EMIT 0x0f
EMIT 0xd2 //((0xc2 & 0x3f) << 3) | 0xc2
EMIT 0x1d
paddd mm0, mmRoundup; // roundup compensation [B,A]
paddd mm1, mmRoundup; // roundup compensation [J,I]
// PF2ID( mm3, mm3 ); // mm3 <= float -> integer [L,K]
EMIT 0x0f
EMIT 0x0f
EMIT 0xdb //((0xc3 & 0x3f) << 3) | 0xc3
EMIT 0x1d
paddd mm2, mmRoundup; // roundup compensation [D,C]
psrad mm0, PSCF_SHIFT; // DESCALE [B,A]
paddd mm3, mmRoundup; // roundup compensation [L,K]
psrad mm2, PSCF_SHIFT; // DESCALE [D,C]
packssdw mm0, mm2; // form [D,C,B,A]
psrad mm1, PSCF_SHIFT; // DESCALE [J,I]
movq qword ptr [edx], mm0; // store [D,C,B,A]
psrad mm3, PSCF_SHIFT; // DESCALE [L,K]
packssdw mm1, mm3; // form [L,K,J,I]
// sub edi, 1; // i = i - 1
// row->0 1
// col ---
// 4 E M ...
// 5 F N ...
// 6 G O ...
// 7 H P ...
movq mm0, qword ptr [ecx+32*0]; // mm0 <= (1,0 : 0) [E,M]
movq qword ptr [edx+16], mm1; // store [L,K,J,I]
movq mm2, qword ptr [ecx+32*1]; // mm2 <= (1,0 : 1) [F,N]
movq mm1, mm0; // mm1 = copy of (1,0 : 0) [E,M]
movq mm3, qword ptr [ecx+32*2]; // mm3 <= (1,0 : 2) [G,O]
punpckldq mm0,mm2; // mm0 = ( 0 : 1,0 ) [F,E]
movq mm4, qword ptr [ecx+32*3]; // mm4 <= (1,0 : 3) [H,P]
punpckhdq mm1, mm2; // mm1 = ( 1 : 1,0 ) [N,M]
movq mm2, mm3; // mm2 = copy of (1,0 : 2) [G,O]
// PFMULM( mm0, _ebx, 16); // mm0 <= scale ( 0 : 1,0 ) [F,E]
EMIT 0x0f
EMIT 0x0f
EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 16
EMIT 0xb4
punpckldq mm2, mm4; // mm2 = ( 0 : 3,2 ) [H,G]
// PFMULM( mm1, _ebx, 16+32 ); // mm1 <= scale ( 1 : 1,0 ) [N,M]
EMIT 0x0f
EMIT 0x0f
EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 16+32
EMIT 0xb4
punpckhdq mm3, mm4; // mm3 = ( 1 : 3,2 ) [P,O]
// PFMULM( mm2, _ebx, 16+8); // mm2 <= scale ( 0 : 3,2 ) [H,G]
EMIT 0x0f
EMIT 0x0f
EMIT 0x53 //(((0xc2 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 16+8
EMIT 0xb4
// PFMULM( mm3, _ebx, 16+8+32 ); // mm3 <= scale ( 1: 3,2 ) [P,O]
// PF2ID( mm0, mm0 ); // mm0 <= float -> integer [F,E]
EMIT 0x0f
EMIT 0x0f
EMIT 0x5b //(((0xc3 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 16+8+32
EMIT 0xb4
EMIT 0x0f
EMIT 0x0f
EMIT 0xc0 //((0xc0 & 0x3f) << 3) | 0xc0
EMIT 0x1d
// PF2ID( mm1, mm1 ); // mm1 <= float -> integer [N,M]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1
EMIT 0x1d
add ecx, 8;
// PF2ID( mm2, mm2 ); // mm2 <= float -> integer [H,G]
EMIT 0x0f
EMIT 0x0f
EMIT 0xd2 //((0xc2 & 0x3f) << 3) | 0xc2
EMIT 0x1d
paddd mm0, mmRoundup; // roundup compensation [F,E]
paddd mm1, mmRoundup; // roundup compensation [N,M]
// PF2ID( mm3, mm3 ); // mm3 <= float -> integer [P,O]
EMIT 0x0f
EMIT 0x0f
EMIT 0xdb //((0xc3 & 0x3f) << 3) | 0xc3
EMIT 0x1d
paddd mm2, mmRoundup; // roundup compensation [H,G]
psrad mm0, PSCF_SHIFT; // DESCALE [F,E]
paddd mm3, mmRoundup; // roundup compensation [P,O]
psrad mm2, PSCF_SHIFT; // DESCALE [H,G]
packssdw mm0, mm2; // form [H,G,F,E]
psrad mm1, PSCF_SHIFT; // DESCALE [N,M]
movq qword ptr [edx+8], mm0; // store [H,G,F,E]
psrad mm3, PSCF_SHIFT; // DESCALE [P,O]
packssdw mm1, mm3; // form [P,O,N,M]
sub edi, 1; // i = i - 1
movq qword ptr [edx+8+16], mm1; // store [P,O,N,M]
add edx, 32;
cmp edi, 0x00; // (i==0) ?
jg tdn_dct_postproc; // end for ( i = 4; i > 0; i = i - 1)
// FEMMS;
EMIT 0x0f
EMIT 0x0e // 0F 0E is the FEMMS instruction the compiler complains about
};
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -