/* [code-viewer residue] File: timgfilterdct.cpp -- "Font size:" control label.
* function!
*/
static const int W1=2841; /* 2048*sqrt(2)*cos(1*pi/16) */
static const int W2=2676; /* 2048*sqrt(2)*cos(2*pi/16) */
static const int W3=2408; /* 2048*sqrt(2)*cos(3*pi/16) */
static const int W5=1609; /* 2048*sqrt(2)*cos(5*pi/16) */
static const int W6=1108; /* 2048*sqrt(2)*cos(6*pi/16) */
static const int W7=565; /* 2048*sqrt(2)*cos(7*pi/16) */
short *blk;
long i;
long X0, X1, X2, X3, X4, X5, X6, X7, X8;
for (i = 0; i < 8; i++) /* idct rows */
{
blk = block + (i << 3);
if (!
((X1 = blk[4] << 11) | (X2 = blk[6]) | (X3 = blk[2]) | (X4 =
blk[1]) |
(X5 = blk[7]) | (X6 = blk[5]) | (X7 = blk[3]))) {
blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] =
blk[7] = blk[0] << 3;
continue;
}
X0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
/* first stage */
X8 = W7 * (X4 + X5);
X4 = X8 + (W1 - W7) * X4;
X5 = X8 - (W1 + W7) * X5;
X8 = W3 * (X6 + X7);
X6 = X8 - (W3 - W5) * X6;
X7 = X8 - (W3 + W5) * X7;
/* second stage */
X8 = X0 + X1;
X0 -= X1;
X1 = W6 * (X3 + X2);
X2 = X1 - (W2 + W6) * X2;
X3 = X1 + (W2 - W6) * X3;
X1 = X4 + X6;
X4 -= X6;
X6 = X5 + X7;
X5 -= X7;
/* third stage */
X7 = X8 + X3;
X8 -= X3;
X3 = X0 + X2;
X0 -= X2;
X2 = (181 * (X4 + X5) + 128) >> 8;
X4 = (181 * (X4 - X5) + 128) >> 8;
/* fourth stage */
blk[0] = (short) ((X7 + X1) >> 8);
blk[1] = (short) ((X3 + X2) >> 8);
blk[2] = (short) ((X0 + X4) >> 8);
blk[3] = (short) ((X8 + X6) >> 8);
blk[4] = (short) ((X8 - X6) >> 8);
blk[5] = (short) ((X0 - X4) >> 8);
blk[6] = (short) ((X3 - X2) >> 8);
blk[7] = (short) ((X7 - X1) >> 8);
} /* end for ( i = 0; i < 8; ++i ) IDCT-rows */
for (i = 0; i < 8; i++) /* idct columns */
{
blk = block + i;
/* shortcut */
if (!
((X1 = (blk[8 * 4] << 8)) | (X2 = blk[8 * 6]) | (X3 =
blk[8 *
2]) | (X4 =
blk[8 *
1])
| (X5 = blk[8 * 7]) | (X6 = blk[8 * 5]) | (X7 = blk[8 * 3]))) {
blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
blk[8 * 5] = blk[8 * 6] = blk[8 * 7] =
iclp[(blk[8 * 0] + 32) >> 6];
continue;
}
X0 = (blk[8 * 0] << 8) + 8192;
/* first stage */
X8 = W7 * (X4 + X5) + 4;
X4 = (X8 + (W1 - W7) * X4) >> 3;
X5 = (X8 - (W1 + W7) * X5) >> 3;
X8 = W3 * (X6 + X7) + 4;
X6 = (X8 - (W3 - W5) * X6) >> 3;
X7 = (X8 - (W3 + W5) * X7) >> 3;
/* second stage */
X8 = X0 + X1;
X0 -= X1;
X1 = W6 * (X3 + X2) + 4;
X2 = (X1 - (W2 + W6) * X2) >> 3;
X3 = (X1 + (W2 - W6) * X3) >> 3;
X1 = X4 + X6;
X4 -= X6;
X6 = X5 + X7;
X5 -= X7;
/* third stage */
X7 = X8 + X3;
X8 -= X3;
X3 = X0 + X2;
X0 -= X2;
X2 = (181 * (X4 + X5) + 128) >> 8;
X4 = (181 * (X4 - X5) + 128) >> 8;
/* fourth stage */
blk[8 * 0] = iclp[(X7 + X1) >> 14];
blk[8 * 1] = iclp[(X3 + X2) >> 14];
blk[8 * 2] = iclp[(X0 + X4) >> 14];
blk[8 * 3] = iclp[(X8 + X6) >> 14];
blk[8 * 4] = iclp[(X8 - X6) >> 14];
blk[8 * 5] = iclp[(X0 - X4) >> 14];
blk[8 * 6] = iclp[(X3 - X2) >> 14];
blk[8 * 7] = iclp[(X7 - X1) >> 14];
}
} /* end function idct_int32(block) */
/**
 * Forward 8x8 DCT, portable integer implementation, computed in place.
 *
 * The same 8-point 1-D transform is applied twice: pass 1 reads every row of
 * `block` and stores the result, scaled up by 2**PASS1_BITS, in a local int
 * work buffer; pass 2 transforms every column of that buffer in place and
 * removes the PASS1_BITS scaling. A final loop shifts each value right by
 * 3 bits (with rounding) and writes it back to `block`.
 *
 * The FIX_* constants are the named real values scaled by 2**CONST_BITS.
 *
 * roundRshift() is defined outside this function; it is presumably a
 * rounding arithmetic right shift of (value, bits) -- confirm against its
 * definition.
 *
 * @param block  64 coefficients stored as 8 consecutive rows of 8 values;
 *               read as input and overwritten with the transform output.
 */
void TimgFilterDCT::fdct_c(short *block)
{
    static const int CONST_BITS =13;
    static const int PASS1_BITS =2;
    static const int FIX_0_298631336= 2446; /* FIX(0.298631336) */
    static const int FIX_0_390180644= 3196; /* FIX(0.390180644) */
    static const int FIX_0_541196100= 4433; /* FIX(0.541196100) */
    static const int FIX_0_765366865= 6270; /* FIX(0.765366865) */
    static const int FIX_0_899976223= 7373; /* FIX(0.899976223) */
    static const int FIX_1_175875602= 9633; /* FIX(1.175875602) */
    static const int FIX_1_501321110=12299; /* FIX(1.501321110) */
    static const int FIX_1_847759065=15137; /* FIX(1.847759065) */
    static const int FIX_1_961570560=16069; /* FIX(1.961570560) */
    static const int FIX_2_053119869=16819; /* FIX(2.053119869) */
    static const int FIX_2_562915447=20995; /* FIX(2.562915447) */
    static const int FIX_3_072711026=25172; /* FIX(3.072711026) */
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int tmp10, tmp11, tmp12, tmp13;
    int z1, z2, z3, z4, z5;
    short *blkptr;
    int *dataptr;
    int data[64]; /* int work buffer: pass 1 output / pass 2 in-place storage */
    int i;

    /* Pass 1: process rows. */
    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
    /* furthermore, we scale the results by 2**PASS1_BITS. */
    dataptr = data;
    blkptr = block;
    for (i = 0; i < 8; i++) {
        /* Butterfly: sums feed the even outputs, differences the odd ones. */
        tmp0 = blkptr[0] + blkptr[7];
        tmp7 = blkptr[0] - blkptr[7];
        tmp1 = blkptr[1] + blkptr[6];
        tmp6 = blkptr[1] - blkptr[6];
        tmp2 = blkptr[2] + blkptr[5];
        tmp5 = blkptr[2] - blkptr[5];
        tmp3 = blkptr[3] + blkptr[4];
        tmp4 = blkptr[3] - blkptr[4];

        /* Even part per LL&M figure 1 --- note that published figure is faulty;
         * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
         */
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        /* Scale by multiplying rather than with '<<': these sums can be
         * negative, and left-shifting a negative signed int is undefined
         * behaviour (C, and C++ before C++20). The product is the same value.
         */
        dataptr[0] = (tmp10 + tmp11) * (1 << PASS1_BITS);
        dataptr[4] = (tmp10 - tmp11) * (1 << PASS1_BITS);

        z1 = (tmp12 + tmp13) * FIX_0_541196100;
        dataptr[2] =
            roundRshift(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
        dataptr[6] =
            roundRshift(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);

        /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
         * cK represents cos(K*pi/16).
         * i0..i3 in the paper are tmp4..tmp7 here.
         */
        z1 = tmp4 + tmp7;
        z2 = tmp5 + tmp6;
        z3 = tmp4 + tmp6;
        z4 = tmp5 + tmp7;
        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */

        tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
        tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
        tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
        tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
        z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
        z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
        z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
        z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */

        z3 += z5;
        z4 += z5;

        /* Products carry CONST_BITS fractional bits; shifting by only
         * CONST_BITS - PASS1_BITS leaves the 2**PASS1_BITS scaling in place.
         */
        dataptr[7] = roundRshift(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
        dataptr[5] = roundRshift(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
        dataptr[3] = roundRshift(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
        dataptr[1] = roundRshift(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);

        dataptr += 8; /* advance pointer to next row */
        blkptr += 8;
    }

    /* Pass 2: process columns.
     * We remove the PASS1_BITS scaling, but leave the results scaled up
     * by an overall factor of 8.
     */
    dataptr = data;
    for (i = 0; i < 8; i++) {
        /* Column elements are 8 ints apart in the row-ordered work buffer. */
        tmp0 = dataptr[0] + dataptr[56];
        tmp7 = dataptr[0] - dataptr[56];
        tmp1 = dataptr[8] + dataptr[48];
        tmp6 = dataptr[8] - dataptr[48];
        tmp2 = dataptr[16] + dataptr[40];
        tmp5 = dataptr[16] - dataptr[40];
        tmp3 = dataptr[24] + dataptr[32];
        tmp4 = dataptr[24] - dataptr[32];

        /* Even part per LL&M figure 1 --- note that published figure is faulty;
         * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
         */
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        dataptr[0] = roundRshift(tmp10 + tmp11, PASS1_BITS);
        dataptr[32] = roundRshift(tmp10 - tmp11, PASS1_BITS);

        z1 = (tmp12 + tmp13) * FIX_0_541196100;
        dataptr[16] =
            roundRshift(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
        dataptr[48] =
            roundRshift(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);

        /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
         * cK represents cos(K*pi/16).
         * i0..i3 in the paper are tmp4..tmp7 here.
         */
        z1 = tmp4 + tmp7;
        z2 = tmp5 + tmp6;
        z3 = tmp4 + tmp6;
        z4 = tmp5 + tmp7;
        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */

        tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
        tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
        tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
        tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
        z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
        z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
        z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
        z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */

        z3 += z5;
        z4 += z5;

        dataptr[56] = roundRshift(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
        dataptr[40] = roundRshift(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
        dataptr[24] = roundRshift(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
        dataptr[8] = roundRshift(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);

        dataptr++; /* advance pointer to next column */
    }

    /* descale: shift out the remaining 3 bits (the factor of 8 left by
     * pass 2) and narrow back to short in the caller's block.
     */
    for (i = 0; i < 64; i++)
        block[i] = (short int) roundRshift(data[i], 3);
}
#ifdef __SSE2__
void TimgFilterDCT::fdct_sse2(short *block)
{
static __align16(const unsigned short,fTab1[])=
{
0x4000, 0x4000, 0x58c5, 0x4b42,
0xdd5d, 0xac61, 0xa73b, 0xcdb7,
0x4000, 0x4000, 0x3249, 0x11a8,
0x539f, 0x22a3, 0x4b42, 0xee58,
0x4000, 0xc000, 0x3249, 0xa73b,
0x539f, 0xdd5d, 0x4b42, 0xa73b,
0xc000, 0x4000, 0x11a8, 0x4b42,
0x22a3, 0xac61, 0x11a8, 0xcdb7
};
static __align16(const unsigned short,fTab2[])=
{
0x58c5, 0x58c5, 0x7b21, 0x6862,
0xcff5, 0x8c04, 0x84df, 0xba41,
0x58c5, 0x58c5, 0x45bf, 0x187e,
0x73fc, 0x300b, 0x6862, 0xe782,
0x58c5, 0xa73b, 0x45bf, 0x84df,
0x73fc, 0xcff5, 0x6862, 0x84df,
0xa73b, 0x58c5, 0x187e, 0x6862,
0x300b, 0x8c04, 0x187e, 0xba41
};
static __align16(const unsigned short,fTab3[])=
{
0x539f, 0x539f, 0x73fc, 0x6254,
0xd2bf, 0x92bf, 0x8c04, 0xbe4d,
0x539f, 0x539f, 0x41b3, 0x1712,
0x6d41, 0x2d41, 0x6254, 0xe8ee,
0x539f, 0xac61, 0x41b3, 0x8c04,
0x6d41, 0xd2bf, 0x6254, 0x8c04,
0xac61, 0x539f, 0x1712, 0x6254,
0x2d41, 0x92bf, 0x1712, 0xbe4d
};
static __align16(const unsigned short,fTab4[])=
{
0x4b42, 0x4b42, 0x6862, 0x587e,
0xd746, 0x9dac, 0x979e, 0xc4df,
0x4b42, 0x4b42, 0x3b21, 0x14c3,
0x6254, 0x28ba, 0x587e, 0xeb3d,
0x4b42, 0xb4be, 0x3b21, 0x979e,
0x6254, 0xd746, 0x587e, 0x979e,
0xb4be, 0x4b42, 0x14c3, 0x587e,
0x28ba, 0x9dac, 0x14c3, 0xc4df
};
static __align16(const unsigned short,Fdct_Rnd0[])={ 6,8,8,8, 6,8,8,8};
static __align16(const unsigned short,Fdct_Rnd1[])={ 8,8,8,8, 8,8,8,8};
static __align16(const unsigned short,Fdct_Rnd2[])={10,8,8,8, 8,8,8,8};
struct Tfdct
{
static __forceinline void fLLM_PASS(unsigned char *src,int shift,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
movdqa (xmm0, src+0*16); // In0
movdqa (xmm2, src+2*16); // In2
movdqa (xmm3, xmm0 );
movdqa (xmm4, xmm2 );
movdqa (xmm7, src+7*16); // In7
movdqa (xmm5, src+5*16); // In5
psubsw (xmm0, xmm7); // t7 = In0-In7
paddsw (xmm7, xmm3); // t0 = In0+In7
psubsw (xmm2, xmm5); // t5 = In2-In5
paddsw (xmm5, xmm4); // t2 = In2+In5
movdqa (xmm3, src+3*16); // In3
movdqa (xmm4, src+4*16); // In4
movdqa (xmm1, xmm3);
psubsw (xmm3, xmm4); // t4 = In3-In4
paddsw (xmm4, xmm1); // t3 = In3+In4
movdqa (xmm6, src+6*16); // In6
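/* [code-viewer residue, not part of the source file] Keyboard shortcuts:
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */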