📄 fdct3dn.cpp
字号:
#undef t7t7
#undef z13z11
#undef out5out1
#undef out3out7
/*************************************************************
*
* Pass 2: DCT process columns
*
*************************************************************/
// The 2nd pass uses the same base code as the 1st pass; the two
// code loops differ only in the handling of input/output data.
//
// 1) Since the 1st pass produces float data, the 2nd pass's
// input must accept floating-point data.
// 2) Since the 2nd pass processes its input row-by-row, and the source
// is already located in the temp array, the output is also
// stored row-by-row. (We don't want to allocate an additional
// float[64] array; that would waste cache RAM.)
// Therefore, the final post-processing (descaling) must
// additionally transpose the final output.
//
//
// for (i = 0; i < 8; i++)
// {
// 3_2_1_0 -> x_3_2_1
// mm0, mm1 = 3_2_1_0, mm2, mm3 = x_3_2_1
// mm4 = 7_6_5_4
// Pass-2 loop setup: load the read/write pointers, the loop counter and
// the constant-table base before entering the per-row loop.
mov eax, dword ptr [dataptr]; // eax <= dataptr (row is read via [eax+4*k] below)
pxor mm7, mm7; // mm7 <= 0x0000_0000_0000_0000
mov edx, dword ptr [dataptr2];// edx <= &dataptr[64]
// NOTE(review): the counter update/branch is outside this excerpt; a
// count-down is assumed from the 'i = 8; i > 0' bounds -- confirm.
mov edi, 0x08; // edi = 'i' // for ( i = 8; i > 0; i=i-1 )
lea ebx, dword ptr [CONSTANTS];// ebx <= &CONSTANTS[0]
// tdn_dct_col1 computes the fDCT for 1 input row.
// tdn_dct_col1 stores the FDCT result back into the same row of the
// temp array.
tdn_dct_col1: // 3d_now_dct_col1 loop-point
// 1a) [in7,in6] = [blkptr7, blkptr6]; // dword, dword
// 1b) [in4,in5] = [blkptr4, blkptr5]; //dword, dword
// 1c) [in0,in1] = [blkptr0, blkptr1]; // dword,dword
// 1d) [in3,in2] = [blkptr3, blkptr2]; //dword, dword
#define in7in6 mm2
#define in4in5 mm3
#define in0in1 mm1
#define in3in2 mm0
#define in0in1_2 mm4
#define in3in2_2 mm5
movd in0in1, dword ptr [eax+4*1]; // in0in1 <= [ 000, in1 ]
punpckldq in0in1, qword ptr [eax+4*0]; // in0in1 <= [ in0, in1 ]
movd in4in5, dword ptr [eax+4*5]; // in4in5 <= [ 000, in5 ]
movq in0in1_2, in0in1; // copy in0in1 to in0in1_2
punpckldq in4in5, qword ptr [eax+4*4]; // in4in5 <= [ in4, in5 ]
movq in3in2, qword ptr [eax+4*2]; // get [in3,in2]
movq in7in6, qword ptr [eax+4*6]; // get [in7,in6]
movq in3in2_2, in3in2; // copy in3in2 to in3in2_2
/*
*tmp0 = dataptr[0] + dataptr[7];
*tmp7 = dataptr[0] - dataptr[7];
*tmp1 = dataptr[1] + dataptr[6];
*tmp6 = dataptr[1] - dataptr[6];
*tmp2 = dataptr[2] + dataptr[5];
*tmp5 = dataptr[2] - dataptr[5];
*tmp3 = dataptr[3] + dataptr[4];
*tmp4 = dataptr[3] - dataptr[4];
*/
// tmp0= inptr[0] + inptr[7];
// tmp1= inptr[1] + inptr[6];
// tmp7= inptr[0] - inptr[7];
// tmp6= inptr[1] - inptr[6];
// tmp3= inptr[3] + inptr[4];
// tmp2= inptr[2] + inptr[5];
// tmp4= inptr[3] - inptr[4];
// tmp5= inptr[2] - inptr[5];
#define t0t1 in0in1
#define t3t2 in3in2
#define t4t5 in3in2_2
#define t7t6 in0in1_2
// st1_0 <= [tmp0,tmp1] (float, float)
// st1_1 <= [tmp7,tmp6] (float, float)
// st1_2 <= [tmp3,tmp2] (float, float)
// st1_3 <= [tmp4,tmp5] (float, float)
EMIT 0x0f
EMIT 0x0f
EMIT 0xe2 //((0xc4 & 0x3f) << 3) | 0xc2
EMIT 0x9a
// PFSUB(t7t6, in7in6); // [tmp7,tmp6] <= [in0,in1] - [in7,in6]
add eax, 4*8; // increment inptr+=8 (floats)
EMIT 0x0f
EMIT 0x0f
EMIT 0xeb //((0xc5 & 0x3f) << 3) | 0xc3
EMIT 0x9a
// PFSUB(t4t5, in4in5);// [tmp4,tmp5] <= [in3,in2] - [in4,in5]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc3 //((0xc0 & 0x3f) << 3) | 0xc3
EMIT 0x9e
// PFADD(t3t2, in4in5); // [tmp3,tmp2] <= [in3,in2] + [in4,in5]
movq [tmp7tmp6], t7t6;
EMIT 0x0f
EMIT 0x0f
EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2
EMIT 0x9e
// PFADD(t0t1, in7in6); // [tmp0,tmp1] <= [in0,in1] + [in7,in6]
movq [tmp4tmp5], t4t5;
// movq [tmp3tmp2], t3t2;
// movq [tmp0tmp1], t0t1;
// Even part
// *tmp10 = (*tmp0) + (*tmp3); // phase 2
// *tmp11 = (*tmp1) + (*tmp2);
// *tmp13 = (*tmp0) - (*tmp3);
// *tmp12 = (*tmp1) - (*tmp2);
// 2a) [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
// 2b) [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]
#define t10t11 mm6
#define t13t12 t0t1
movq t10t11, t0t1; // copy t0t1, prepare t10t11 calculation
// PFADD( t10t11, t3t2 ); // [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
EMIT 0x0f
EMIT 0x0f
EMIT 0xf0 //((0xc6 & 0x3f) << 3) | 0xc0
EMIT 0x9e
// PFSUB( t13t12, t3t2 ); // [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc8 //((0xc1 & 0x3f) << 3) | 0xc0
EMIT 0x9a
#define t10mt11 mm2 // mm0, mm4, mm5 not ok
#define t13t13 mm7 // mm1 not ok
// st1_20 <= [tmp10,tmp11]
// st1_21 <= [tmp10,-tmp11] // negated tmp11
// st1_22 <= [tmp13,tmp12]
// st1_23 <= [tmp13,tmp13] // tmp13 duplicated
movq [tmp10tmp11], t10t11;
movq t10mt11, t10t11;
movq [tmp13tmp12], t13t12;
pxor t10mt11, [mmMask00001000]; // t10mt11 = [tmp10, -tmp11]
movq t13t13, t13t12;
// 3a) dataptr2[4] = (*tmp10) - (*tmp11);
// 3b) dataptr2[0] = (*tmp10) + (*tmp11); // phase 3
punpckhdq t13t13, t13t13; // t13t13 = [tmp13, tmp13]
EMIT 0x0f
EMIT 0x0f
EMIT 0xf2 //((0xc6 & 0x3f) << 3) | 0xc2
EMIT 0xae
// PFACC( t10t11, t10mt11 ); // produce t10t11 = [dataptr[4], dataptr[0] ]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1
EMIT 0xae
// PFACC( t13t12, t13t12); // t13t12 = [z1a,z1a] <= [tmp12+tmp13,tmp12+tmp13]
#define z1mz1 t13t12 // [z1, -z1]
#define out4out0 t10t11 // [out4, out0], produced by the PFACC on t10t11 above
#define Z1B_CONST _ebx + 0 // [0.7071, -0.7071]
movd dword ptr [ edx + 4*0 ], out4out0; // dataptr[0] <= final result
// movd dword ptr [ edx + 32*0 ], out4out0; // dataptr[0] <= final result
psrlq out4out0, 32; // [ __, out4 ]
// z1 = ((*tmp12) + (*tmp13)) * ((float ) NC_R_SQRT2); // c4
// 4b) [z1,-z1] <= [z1a,z1a] * [ 0.7071,-0.7071 ];
// PFMULM( z1mz1, Z1B_CONST, 0 ); // z1mz1<= [z1, -z1]
EMIT 0x0f
EMIT 0x0f
EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 + 0 | 0x40)
EMIT 0
EMIT 0xb4
movd dword ptr [ edx + 4*4 ], out4out0; // dataptr[4] <= final result
// movd dword ptr [ edx + 32*4 ], out4out0; // dataptr[4] <= final result
// [dataptr2,dataptr6] <= [tmp13,tmp13] + [z1,-z1]; // pfadd
// PFADD(t13t13, z1mz1 ); // produce t13t13 = [dataptr2,dataptr6]
EMIT 0x0f
EMIT 0x0f
EMIT 0xf9 //((0xc7 & 0x3f) << 3) | 0xc1
EMIT 0x9e
// 5a) dataptr[2] = (*tmp13) + z1; // phase 5
// 5b) dataptr[6] = (*tmp13) - z1;
#define out2out6 t13t13
movd dword ptr [edx + 4*6], out2out6; // dataptr[6] <= final result
// movd dword ptr [edx + 32*6], out2out6; // dataptr[6] <= final result
psrlq out2out6, 32; // [ __, out2 ]
movd dword ptr [edx + 4*2], out2out6; // dataptr[2] <= final result
// movd dword ptr [edx + 32*2], out2out6; // dataptr[2] <= final result
#define t14t16 mm0
#define t7t6_2 mm2 // temp copy of t7t6
#define t15t15 mm1
// *tmp14 = (*tmp4) + (*tmp5); // phase 2
// *tmp15 = (*tmp5) + (*tmp6);
// *tmp16 = (*tmp6) + (*tmp7);
// 6a) [tmp14,tmp16] <= [ (tmp4+tmp5), (tmp7+tmp6) ]; //pfacc
// 6b) [ tmp15] <= [tmp7,tmp6] + [tmp4,tmp5];
movq t14t16, [tmp7tmp6]; // prepare [tmp14,tmp16] generation
movq t15t15, [tmp4tmp5];
movq t7t6_2, t14t16; // make copy of [tmp7,tmp6]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc1 //((0xc0 & 0x3f) << 3) | 0xc1
EMIT 0xae
// PFACC(t14t16, t15t15); // t14t16 <= [ (tmp4+tmp5), (tmp7+tmp6) ]
EMIT 0x0f
EMIT 0x0f
EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2
EMIT 0x9e
// PFADD(t15t15, t7t6_2); // t15t15 <= [(x),tmp5+tmp6]
#define t14mt16 mm2
#define t15t14 mm3
movq t15t14, t14t16; // prepare to generate [tmp15,tmp14]
// movd [tmp17tmp16], t14t16; // produce [tmp17,tmp16] <= [...,t16]
punpckldq t15t15,t15t15; // t15t15 <= [tmp5+tmp6,tmp5+tmp6]
movq t14mt16, t14t16; // prepare to generate [tmp14, -tmp16]
punpckhdq t15t14, t15t15; // mm3 <= [t15,t14]
// movq [tmp15tmp14], t15t14;
// The rotator is modified from fig 4-8 to avoid extra negations.
// (*z5) = ((*tmp14) - (*tmp16)) * ((float ) 0.382683433); // c6
// (*z2) = ((float ) 0.541196100) * (*tmp14) + (*z5); // c2-c6
// (*z4) = ((float ) 1.306562965) * (*tmp16) + (*z5); // c2+c6
// (*z3) = (*tmp15) * ((float ) NC_R_SQRT2); // c4
// form [z5a,z3a] <= [tmp14-tmp16, tmp15]
// 7a) [z2a,z4a] <= [tmp14,tmp16] * [0.5411,1.3066]
// 7b) [z5 ,z3 ] <= [tmp14-tmp16,tmp15] * [0.3827,0.7071]
#define ___t15 t15t15
pxor t14mt16, [mmMask00001000]; // t14mt16 = [tmp14,-tmp16]
psllq ___t15, 32; // create ___t15 = [ t15, 000]
//#define Z2AZ4A_CONST _ebx + 8 // [0.5411, 1.3066]
// NOTE: the active macro below carries no offset; the +8 displacement
// is supplied by the hand-encoded disp8 byte in the EMIT sequence.
#define Z2AZ4A_CONST _ebx // [0.5411, 1.3066]
#define z5az3a ___t15 // [z5a,z3a] <= [t14-t16,t15] <= t14mt16
#define z2az4a t14t16 // [z2a,z4a] = [t14t16]*[0.5411,1.3066]
// PFMULM( z2az4a, Z2AZ4A_CONST, 8 ); // form [z2a,z4a]
// PFACC( z5az3a, t14mt16 ); // [z5a,z3a] = [tmp14-tmp16, tmp15]
// The two instructions above are emitted as raw bytes, in that order:
// 0F 0F 43 08 B4 = PFMUL mm0, [ebx+8] (z2az4a is mm0)
// 0F 0F CA AE = PFACC mm1, mm2 (z5az3a is mm1, t14mt16 is mm2)
EMIT 0x0f
EMIT 0x0f
EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 8
EMIT 0xb4
EMIT 0x0f
EMIT 0x0f
EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2
EMIT 0xae
// NOTE: as with Z2AZ4A_CONST, the active macro has no offset; the +16
// displacement comes from the disp8 byte in the EMIT sequence below.
#define Z5AZ3A_CONST _ebx // [0.3827, 0.7071]
//#define Z5AZ3A_CONST _ebx + 16 // [0.3827, 0.7071]
// PFMULM( z5az3a, Z5AZ3A_CONST, 16 ); // z5az3a = [z5,z3]
// Emitted bytes: 0F 0F 4B 10 B4 = PFMUL mm1, [ebx+16]
EMIT 0x0f
EMIT 0x0f
EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40)
EMIT 16
EMIT 0xb4
#define z5z3 z5az3a
#define z5z5 mm7
#define z3z3 mm6
movq z5z5, z5z3; // start to form z5z5
movq z3z3, z5z3; // start to form z3z3
punpckhdq z5z5,z5z5; // z5z5 = [z5,z5]
punpckldq z3z3,z3z3; // z3z3 = [z3,z3]
// movq [z5z4], z5z5; // produce z5
#define z3mz3 z3z3
#define z2z4 z2az4a
pxor z3mz3, [mmMask00001000]; // z3mz3 = [z3,-z3]
// 7c) [z2,z4] <= [z2a,z4a] + [z5,z5];
// PFADD( z2z4, z5z5 ); // z2az4a <= [z2,z4]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc7 //((0xc0 & 0x3f) << 3) | 0xc7
EMIT 0x9e
#define t7t7 mm5
movq t7t7, [tmp7tmp6]; // begin forming [tmp7,tmp7]
punpckhdq t7t7,t7t7; // t7t7 = [tmp7,tmp7]
// z11 = (*tmp7) + (*z3); // phase 5
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -