📄 timgfilterdct.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 3 页
字号:
	 * function!
	 */

static const int W1=2841;				/* 2048*sqrt(2)*cos(1*pi/16) */
static const int W2=2676;				/* 2048*sqrt(2)*cos(2*pi/16) */
static const int W3=2408;				/* 2048*sqrt(2)*cos(3*pi/16) */
static const int W5=1609;				/* 2048*sqrt(2)*cos(5*pi/16) */
static const int W6=1108;				/* 2048*sqrt(2)*cos(6*pi/16) */
static const int W7=565;				/* 2048*sqrt(2)*cos(7*pi/16) */

	short *blk;
	long i;
	long X0, X1, X2, X3, X4, X5, X6, X7, X8;


	for (i = 0; i < 8; i++)		/* idct rows */
	{
		blk = block + (i << 3);
		if (!
			((X1 = blk[4] << 11) | (X2 = blk[6]) | (X3 = blk[2]) | (X4 =
																	blk[1]) |
			 (X5 = blk[7]) | (X6 = blk[5]) | (X7 = blk[3]))) {
			blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] =
				blk[7] = blk[0] << 3;
			continue;
		}

		X0 = (blk[0] << 11) + 128;	/* for proper rounding in the fourth stage  */

		/* first stage  */
		X8 = W7 * (X4 + X5);
		X4 = X8 + (W1 - W7) * X4;
		X5 = X8 - (W1 + W7) * X5;
		X8 = W3 * (X6 + X7);
		X6 = X8 - (W3 - W5) * X6;
		X7 = X8 - (W3 + W5) * X7;

		/* second stage  */
		X8 = X0 + X1;
		X0 -= X1;
		X1 = W6 * (X3 + X2);
		X2 = X1 - (W2 + W6) * X2;
		X3 = X1 + (W2 - W6) * X3;
		X1 = X4 + X6;
		X4 -= X6;
		X6 = X5 + X7;
		X5 -= X7;

		/* third stage  */
		X7 = X8 + X3;
		X8 -= X3;
		X3 = X0 + X2;
		X0 -= X2;
		X2 = (181 * (X4 + X5) + 128) >> 8;
		X4 = (181 * (X4 - X5) + 128) >> 8;

		/* fourth stage  */

		blk[0] = (short) ((X7 + X1) >> 8);
		blk[1] = (short) ((X3 + X2) >> 8);
		blk[2] = (short) ((X0 + X4) >> 8);
		blk[3] = (short) ((X8 + X6) >> 8);
		blk[4] = (short) ((X8 - X6) >> 8);
		blk[5] = (short) ((X0 - X4) >> 8);
		blk[6] = (short) ((X3 - X2) >> 8);
		blk[7] = (short) ((X7 - X1) >> 8);

	}							/* end for ( i = 0; i < 8; ++i ) IDCT-rows */



	for (i = 0; i < 8; i++)		/* idct columns */
	{
		blk = block + i;
		/* shortcut  */
		if (!
			((X1 = (blk[8 * 4] << 8)) | (X2 = blk[8 * 6]) | (X3 =
															 blk[8 *
																 2]) | (X4 =
																		blk[8 *
																			1])
			 | (X5 = blk[8 * 7]) | (X6 = blk[8 * 5]) | (X7 = blk[8 * 3]))) {
			blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
				blk[8 * 5] = blk[8 * 6] = blk[8 * 7] =
				iclp[(blk[8 * 0] + 32) >> 6];
			continue;
		}

		X0 = (blk[8 * 0] << 8) + 8192;

		/* first stage  */
		X8 = W7 * (X4 + X5) + 4;
		X4 = (X8 + (W1 - W7) * X4) >> 3;
		X5 = (X8 - (W1 + W7) * X5) >> 3;
		X8 = W3 * (X6 + X7) + 4;
		X6 = (X8 - (W3 - W5) * X6) >> 3;
		X7 = (X8 - (W3 + W5) * X7) >> 3;

		/* second stage  */
		X8 = X0 + X1;
		X0 -= X1;
		X1 = W6 * (X3 + X2) + 4;
		X2 = (X1 - (W2 + W6) * X2) >> 3;
		X3 = (X1 + (W2 - W6) * X3) >> 3;
		X1 = X4 + X6;
		X4 -= X6;
		X6 = X5 + X7;
		X5 -= X7;

		/* third stage  */
		X7 = X8 + X3;
		X8 -= X3;
		X3 = X0 + X2;
		X0 -= X2;
		X2 = (181 * (X4 + X5) + 128) >> 8;
		X4 = (181 * (X4 - X5) + 128) >> 8;

		/* fourth stage  */
		blk[8 * 0] = iclp[(X7 + X1) >> 14];
		blk[8 * 1] = iclp[(X3 + X2) >> 14];
		blk[8 * 2] = iclp[(X0 + X4) >> 14];
		blk[8 * 3] = iclp[(X8 + X6) >> 14];
		blk[8 * 4] = iclp[(X8 - X6) >> 14];
		blk[8 * 5] = iclp[(X0 - X4) >> 14];
		blk[8 * 6] = iclp[(X3 - X2) >> 14];
		blk[8 * 7] = iclp[(X7 - X1) >> 14];
	}

}								/* end function idct_int32(block) */

/**
 * Forward 8x8 DCT in plain integer arithmetic (LL&M scheme, see the
 * per-stage comments below).
 *
 * block: 64 coefficients stored 8 per row; transformed in place.
 *
 * Pass 1 transforms each row of block into a 32-bit scratch buffer, pass 2
 * transforms each column of that buffer in place, and a final loop descales
 * every value by 3 bits and stores it back into block as a short.
 *
 * The FIX_* constants are the named real values multiplied by
 * 2**CONST_BITS and rounded to the nearest integer.
 *
 * NOTE(review): roundRshift() is defined outside this function; it is
 * presumably a right shift with rounding -- confirm against its definition.
 */
void TimgFilterDCT::fdct_c(short *block)
{
 static const int CONST_BITS =13;	/* fractional bits of the FIX_* constants */
 static const int PASS1_BITS =2;	/* extra scaling kept between pass 1 and pass 2 */

 static const int FIX_0_298631336= 2446;	/* FIX(0.298631336) */
 static const int FIX_0_390180644= 3196;	/* FIX(0.390180644) */
 static const int FIX_0_541196100= 4433;	/* FIX(0.541196100) */
 static const int FIX_0_765366865= 6270;	/* FIX(0.765366865) */
 static const int FIX_0_899976223= 7373;	/* FIX(0.899976223) */
 static const int FIX_1_175875602= 9633;	/* FIX(1.175875602) */
 static const int FIX_1_501321110=12299;	/* FIX(1.501321110) */
 static const int FIX_1_847759065=15137;	/* FIX(1.847759065) */
 static const int FIX_1_961570560=16069;	/* FIX(1.961570560) */
 static const int FIX_2_053119869=16819;	/* FIX(2.053119869) */
 static const int FIX_2_562915447=20995;	/* FIX(2.562915447) */
 static const int FIX_3_072711026=25172;	/* FIX(3.072711026) */

	int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	int tmp10, tmp11, tmp12, tmp13;
	int z1, z2, z3, z4, z5;
	short *blkptr;				/* current input row in block */
	int *dataptr;				/* current row (pass 1) / column (pass 2) in data */
	int data[64];				/* 32-bit workspace holding the intermediate results */
	int i;

	/* Pass 1: process rows. */
	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
	/* furthermore, we scale the results by 2**PASS1_BITS. */

	dataptr = data;
	blkptr = block;
	for (i = 0; i < 8; i++) {
		/* Butterfly: sums feed the even part, differences feed the odd part. */
		tmp0 = blkptr[0] + blkptr[7];
		tmp7 = blkptr[0] - blkptr[7];
		tmp1 = blkptr[1] + blkptr[6];
		tmp6 = blkptr[1] - blkptr[6];
		tmp2 = blkptr[2] + blkptr[5];
		tmp5 = blkptr[2] - blkptr[5];
		tmp3 = blkptr[3] + blkptr[4];
		tmp4 = blkptr[3] - blkptr[4];

		/* Even part per LL&M figure 1 --- note that published figure is faulty;
		 * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
		 */

		tmp10 = tmp0 + tmp3;
		tmp13 = tmp0 - tmp3;
		tmp11 = tmp1 + tmp2;
		tmp12 = tmp1 - tmp2;

		/* Scale by multiplication rather than "<<": the sums are frequently
		 * negative and left-shifting a negative signed int is undefined
		 * behaviour.  The product is the same value the shift was meant
		 * to produce.
		 */
		dataptr[0] = (tmp10 + tmp11) * (1 << PASS1_BITS);
		dataptr[4] = (tmp10 - tmp11) * (1 << PASS1_BITS);

		z1 = (tmp12 + tmp13) * FIX_0_541196100;
		dataptr[2] =
			roundRshift(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
		dataptr[6] =
			roundRshift(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);

		/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
		 * cK represents cos(K*pi/16).
		 * i0..i3 in the paper are tmp4..tmp7 here.
		 */

		z1 = tmp4 + tmp7;
		z2 = tmp5 + tmp6;
		z3 = tmp4 + tmp6;
		z4 = tmp5 + tmp7;
		z5 = (z3 + z4) * FIX_1_175875602;	/* sqrt(2) * c3 */

		tmp4 *= FIX_0_298631336;	/* sqrt(2) * (-c1+c3+c5-c7) */
		tmp5 *= FIX_2_053119869;	/* sqrt(2) * ( c1+c3-c5+c7) */
		tmp6 *= FIX_3_072711026;	/* sqrt(2) * ( c1+c3+c5-c7) */
		tmp7 *= FIX_1_501321110;	/* sqrt(2) * ( c1+c3-c5-c7) */
		z1 *= -FIX_0_899976223;	/* sqrt(2) * (c7-c3) */
		z2 *= -FIX_2_562915447;	/* sqrt(2) * (-c1-c3) */
		z3 *= -FIX_1_961570560;	/* sqrt(2) * (-c3-c5) */
		z4 *= -FIX_0_390180644;	/* sqrt(2) * (c5-c3) */

		z3 += z5;
		z4 += z5;

		/* Drop CONST_BITS of fraction but keep PASS1_BITS of extra scaling. */
		dataptr[7] = roundRshift(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
		dataptr[5] = roundRshift(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
		dataptr[3] = roundRshift(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
		dataptr[1] = roundRshift(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);

		dataptr += 8;			/* advance pointer to next row */
		blkptr += 8;
	}

	/* Pass 2: process columns.
	 * We remove the PASS1_BITS scaling, but leave the results scaled up
	 * by an overall factor of 8.
	 */

	dataptr = data;
	for (i = 0; i < 8; i++) {
		/* Same butterfly as pass 1, stepping down a column (stride 8). */
		tmp0 = dataptr[0] + dataptr[56];
		tmp7 = dataptr[0] - dataptr[56];
		tmp1 = dataptr[8] + dataptr[48];
		tmp6 = dataptr[8] - dataptr[48];
		tmp2 = dataptr[16] + dataptr[40];
		tmp5 = dataptr[16] - dataptr[40];
		tmp3 = dataptr[24] + dataptr[32];
		tmp4 = dataptr[24] - dataptr[32];

		/* Even part per LL&M figure 1 --- note that published figure is faulty;
		 * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
		 */

		tmp10 = tmp0 + tmp3;
		tmp13 = tmp0 - tmp3;
		tmp11 = tmp1 + tmp2;
		tmp12 = tmp1 - tmp2;

		dataptr[0] = roundRshift(tmp10 + tmp11, PASS1_BITS);
		dataptr[32] = roundRshift(tmp10 - tmp11, PASS1_BITS);

		z1 = (tmp12 + tmp13) * FIX_0_541196100;
		dataptr[16] =
			roundRshift(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
		dataptr[48] =
			roundRshift(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);

		/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
		 * cK represents cos(K*pi/16).
		 * i0..i3 in the paper are tmp4..tmp7 here.
		 */

		z1 = tmp4 + tmp7;
		z2 = tmp5 + tmp6;
		z3 = tmp4 + tmp6;
		z4 = tmp5 + tmp7;
		z5 = (z3 + z4) * FIX_1_175875602;	/* sqrt(2) * c3 */

		tmp4 *= FIX_0_298631336;	/* sqrt(2) * (-c1+c3+c5-c7) */
		tmp5 *= FIX_2_053119869;	/* sqrt(2) * ( c1+c3-c5+c7) */
		tmp6 *= FIX_3_072711026;	/* sqrt(2) * ( c1+c3+c5-c7) */
		tmp7 *= FIX_1_501321110;	/* sqrt(2) * ( c1+c3-c5-c7) */
		z1 *= -FIX_0_899976223;	/* sqrt(2) * (c7-c3) */
		z2 *= -FIX_2_562915447;	/* sqrt(2) * (-c1-c3) */
		z3 *= -FIX_1_961570560;	/* sqrt(2) * (-c3-c5) */
		z4 *= -FIX_0_390180644;	/* sqrt(2) * (c5-c3) */

		z3 += z5;
		z4 += z5;

		/* Drop both the CONST_BITS fraction and the PASS1_BITS scaling. */
		dataptr[56] = roundRshift(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
		dataptr[40] = roundRshift(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
		dataptr[24] = roundRshift(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
		dataptr[8] = roundRshift(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);

		dataptr++;				/* advance pointer to next column */
	}
	/* descale: remove the remaining factor of 8 and store back as shorts */
	for (i = 0; i < 64; i++)
		block[i] = (short int) roundRshift(data[i], 3);
}
#ifdef __SSE2__
void TimgFilterDCT::fdct_sse2(short *block)
{
 static __align16(const unsigned short,fTab1[])=
  {
   0x4000, 0x4000, 0x58c5, 0x4b42,
   0xdd5d, 0xac61, 0xa73b, 0xcdb7,
   0x4000, 0x4000, 0x3249, 0x11a8,
   0x539f, 0x22a3, 0x4b42, 0xee58,
   0x4000, 0xc000, 0x3249, 0xa73b,
   0x539f, 0xdd5d, 0x4b42, 0xa73b,
   0xc000, 0x4000, 0x11a8, 0x4b42,
   0x22a3, 0xac61, 0x11a8, 0xcdb7
  };
 static __align16(const unsigned short,fTab2[])=
  {
   0x58c5, 0x58c5, 0x7b21, 0x6862,
   0xcff5, 0x8c04, 0x84df, 0xba41,
   0x58c5, 0x58c5, 0x45bf, 0x187e,
   0x73fc, 0x300b, 0x6862, 0xe782,
   0x58c5, 0xa73b, 0x45bf, 0x84df,
   0x73fc, 0xcff5, 0x6862, 0x84df,
   0xa73b, 0x58c5, 0x187e, 0x6862,
   0x300b, 0x8c04, 0x187e, 0xba41
  };
 static __align16(const unsigned short,fTab3[])=
  {
   0x539f, 0x539f, 0x73fc, 0x6254,
   0xd2bf, 0x92bf, 0x8c04, 0xbe4d,
   0x539f, 0x539f, 0x41b3, 0x1712,
   0x6d41, 0x2d41, 0x6254, 0xe8ee,
   0x539f, 0xac61, 0x41b3, 0x8c04,
   0x6d41, 0xd2bf, 0x6254, 0x8c04,
   0xac61, 0x539f, 0x1712, 0x6254,
   0x2d41, 0x92bf, 0x1712, 0xbe4d
  };
 static __align16(const unsigned short,fTab4[])=
  {
   0x4b42, 0x4b42, 0x6862, 0x587e,
   0xd746, 0x9dac, 0x979e, 0xc4df,
   0x4b42, 0x4b42, 0x3b21, 0x14c3,
   0x6254, 0x28ba, 0x587e, 0xeb3d,
   0x4b42, 0xb4be, 0x3b21, 0x979e,
   0x6254, 0xd746, 0x587e, 0x979e,
   0xb4be, 0x4b42, 0x14c3, 0x587e,
   0x28ba, 0x9dac, 0x14c3, 0xc4df
  };

 static __align16(const unsigned short,Fdct_Rnd0[])={ 6,8,8,8, 6,8,8,8};
 static __align16(const unsigned short,Fdct_Rnd1[])={ 8,8,8,8, 8,8,8,8};
 static __align16(const unsigned short,Fdct_Rnd2[])={10,8,8,8, 8,8,8,8};

 struct Tfdct
  {
   static __forceinline void fLLM_PASS(unsigned char *src,int shift,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
    {
     movdqa (xmm0, src+0*16);   // In0
     movdqa (xmm2, src+2*16);   // In2
     movdqa (xmm3, xmm0    );
     movdqa (xmm4, xmm2    );
     movdqa (xmm7, src+7*16);   // In7
     movdqa (xmm5, src+5*16);   // In5

     psubsw (xmm0, xmm7);         // t7 = In0-In7
     paddsw (xmm7, xmm3);         // t0 = In0+In7
     psubsw (xmm2, xmm5);         // t5 = In2-In5
     paddsw (xmm5, xmm4);         // t2 = In2+In5

     movdqa (xmm3, src+3*16);   // In3
     movdqa (xmm4, src+4*16);   // In4
     movdqa (xmm1, xmm3);
     psubsw (xmm3, xmm4);         // t4 = In3-In4
     paddsw (xmm4, xmm1);         // t3 = In3+In4
     movdqa (xmm6, src+6*16);   // In6
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -