/*
 * fdct_mmx.c
 * From "Motion JPEG codec source code" - language: C - 565 lines in total -
 * page 1 of 2.
 */
////////////////////////////////////////////////////////////////////////////////// fdctam32.c - AP922 MMX(3D-Now) forward-DCT// ----------// Intel Application Note AP-922 - fast, precise implementation of DCT// http://developer.intel.com/vtune/cbts/appnotes.htm// ----------// // This routine can use a 3D-Now/MMX enhancement to increase the// accuracy of the fdct_col_4 macro. The dct_col function uses 3D-Now's// PMHULHRW instead of MMX's PMHULHW(and POR). The substitution improves// accuracy very slightly with performance penalty. If the target CPU// does not support 3D-Now, then this function cannot be executed.// // For a fast, precise MMX implementation of inverse-DCT // visit http://www.elecard.com/peter//// v1.0 07/22/2000 (initial release)// // liaor@iname.com http://members.tripod.com/~liaor ///////////////////////////////////////////////////////////////////////////////* * A.Stevens Jul 2000: ported to nasm syntax and disentangled from * from Win**** compiler specific stuff. * All the real work was done above though. * See above for how to optimise quality on 3DNow! CPU's * Nov 2003 changed to PIC for use in shared libraries * * G.Vervoort Jan 2005: ported to inline asm. */#include <config.h>#include "mjpeg_types.h"#include "attributes.h"#include "mmx.h"////////////////////////////////////////////////////////////////////////// constants for the forward DCT// -----------------------------//// Be sure to check that your compiler is aligning all constants to QWORD// (8-byte) memory boundaries! 
Otherwise the unaligned memory access will// severely stall MMX execution.////////////////////////////////////////////////////////////////////////#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy#define SHIFT_FRW_COL BITS_FRW_ACC#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))//concatenated table, for forward DCT transformationstatic const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { 13036, 13036, 13036, 13036, // tg * (2<<16) + 0.5 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5};static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = { 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5};static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table //row0 16384, 16384, 21407, -8867, // w09 w01 w08 w00 16384, 16384, 8867, -21407, // w13 w05 w12 w04 16384, -16384, 8867, 21407, // w11 w03 w10 w02 -16384, 16384, -21407, -8867, // w15 w07 w14 w06 22725, 12873, 19266, -22725, // w22 w20 w18 w16 19266, 4520, -4520, -12873, // w23 w21 w19 w17 12873, 4520, 4520, 19266, // w30 w28 w26 w24 -22725, 19266, -12873, -22725, // w31 w29 w27 w25 //row1 22725, 22725, 29692, -12299, // w09 w01 w08 w00 22725, 22725, 12299, -29692, // w13 w05 w12 w04 22725, -22725, 12299, 29692, // w11 w03 w10 w02 -22725, 22725, -29692, -12299, // w15 w07 w14 w06 31521, 17855, 26722, -31521, // w22 w20 w18 w16 26722, 6270, -6270, -17855, // w23 w21 w19 w17 17855, 6270, 6270, 26722, // w30 w28 w26 w24 -31521, 26722, -17855, -31521, // w31 w29 w27 w25 //row2 21407, 21407, 27969, -11585, // w09 w01 w08 w00 21407, 21407, 11585, -27969, // w13 w05 w12 w04 21407, 
-21407, 11585, 27969, // w11 w03 w10 w02 -21407, 21407, -27969, -11585, // w15 w07 w14 w06 29692, 16819, 25172, -29692, // w22 w20 w18 w16 25172, 5906, -5906, -16819, // w23 w21 w19 w17 16819, 5906, 5906, 25172, // w30 w28 w26 w24 -29692, 25172, -16819, -29692, // w31 w29 w27 w25 //row3 19266, 19266, 25172, -10426, // w09 w01 w08 w00 19266, 19266, 10426, -25172, // w13 w05 w12 w04 19266, -19266, 10426, 25172, // w11 w03 w10 w02 -19266, 19266, -25172, -10426, // w15 w07 w14 w06, 26722, 15137, 22654, -26722, // w22 w20 w18 w16 22654, 5315, -5315, -15137, // w23 w21 w19 w17 15137, 5315, 5315, 22654, // w30 w28 w26 w24 -26722, 22654, -15137, -26722, // w31 w29 w27 w25, //row4 16384, 16384, 21407, -8867, // w09 w01 w08 w00 16384, 16384, 8867, -21407, // w13 w05 w12 w04 16384, -16384, 8867, 21407, // w11 w03 w10 w02 -16384, 16384, -21407, -8867, // w15 w07 w14 w06 22725, 12873, 19266, -22725, // w22 w20 w18 w16 19266, 4520, -4520, -12873, // w23 w21 w19 w17 12873, 4520, 4520, 19266, // w30 w28 w26 w24 -22725, 19266, -12873, -22725, // w31 w29 w27 w25 //row5 19266, 19266, 25172, -10426, // w09 w01 w08 w00 19266, 19266, 10426, -25172, // w13 w05 w12 w04 19266, -19266, 10426, 25172, // w11 w03 w10 w02 -19266, 19266, -25172, -10426, // w15 w07 w14 w06 26722, 15137, 22654, -26722, // w22 w20 w18 w16 22654, 5315, -5315, -15137, // w23 w21 w19 w17 15137, 5315, 5315, 22654, // w30 w28 w26 w24 -26722, 22654, -15137, -26722, // w31 w29 w27 w25 //row6 21407, 21407, 27969, -11585, // w09 w01 w08 w00 21407, 21407, 11585, -27969, // w13 w05 w12 w04 21407, -21407, 11585, 27969, // w11 w03 w10 w02 -21407, 21407, -27969, -11585, // w15 w07 w14 w06, 29692, 16819, 25172, -29692, // w22 w20 w18 w16 25172, 5906, -5906, -16819, // w23 w21 w19 w17 16819, 5906, 5906, 25172, // w30 w28 w26 w24 -29692, 25172, -16819, -29692, // w31 w29 w27 w25, //row7 22725, 22725, 29692, -12299, // w09 w01 w08 w00 22725, 22725, 12299, -29692, // w13 w05 w12 w04 22725, -22725, 12299, 29692, // w11 w03 w10 w02 
-22725, 22725, -29692, -12299, // w15 w07 w14 w06, 31521, 17855, 26722, -31521, // w22 w20 w18 w16 26722, 6270, -6270, -17855, // w23 w21 w19 w17 17855, 6270, 6270, 26722, // w30 w28 w26 w24 -31521, 26722, -17855, -31521 // w31 w29 w27 w25};#define x0 (inp + 0*8)#define x1 (inp + 1*8)#define x2 (inp + 2*8)#define x3 (inp + 3*8)#define x4 (inp + 4*8)#define x5 (inp + 5*8)#define x6 (inp + 6*8)#define x7 (inp + 7*8)#define y0 (out + 0*8)#define y1 (out + 1*8)#define y2 (out + 2*8)#define y3 (out + 3*8)#define y4 (out + 4*8)#define y5 (out + 5*8)#define y6 (out + 6*8)#define y7 (out + 7*8)#define round_frw_row fdct_r_row //////////////////////////////////////////////////////////////////////// // // The high-level pseudocode for the fdct_am32() routine : // // fdct_am32() // { // forward_dct_col03(); // dct_column transform on cols 0-3 // forward_dct_col47(); // dct_column transform on cols 4-7 // for ( j = 0; j < 8; j=j+1 ) // forward_dct_row1(j); // dct_row transform on row #j // } //void fdct_mmx(int16_t *blk){ int16_t *inp, *out; int16_t *table; int i; /* transform the left half of the matrix (4 columns) */ out = inp = blk; /* * for ( i = 0; i < 2; i = i + 1) * the for-loop is executed twice. We are better off unrolling the * loop to avoid branch misprediction. 
* .mmx32_fdct_col03: */ movq_m2r(*x1, mm0); /* 0 ; x1 */ movq_m2r(*x6, mm1); /* 1 ; x6 */ movq_r2r(mm0, mm2); /* 2 ; x1 */ movq_m2r(*x2, mm3); /* 3 ; x2 */ paddsw_r2r(mm1, mm0); /* t1 = x[1] + x[6] */ movq_m2r(*x5, mm4); /* 4 ; x5 */ psllw_i2r(SHIFT_FRW_COL, mm0); /* t1 */ movq_m2r(*x0, mm5); /* 5 ; x0 */ paddsw_r2r(mm3, mm4); /* t2 = x[2] + x[5] */ paddsw_m2r(*x7, mm5); /* t0 = x[0] + x[7] */ psllw_i2r(SHIFT_FRW_COL, mm4); /* t2 */ movq_r2r(mm0, mm6); /* 6 ; t1 */ psubsw_r2r(mm1, mm2); /* 1 ; t6 = x[1] - x[6] */ movq_m2r(*(fdct_tg_all_16 + 4), mm1); /* 1 ; tg_2_16 */ psubsw_r2r(mm4, mm0); /* tm12 = t1 - t2 */ movq_m2r(*x3, mm7); /* x3 */ pmulhw_r2r(mm0, mm1); /* tm12*tg_2_16 */ paddsw_m2r(*x4, mm7); /* t3 = x[3] + x[4] */ psllw_i2r(SHIFT_FRW_COL, mm5); /* t0 */ paddsw_r2r(mm4, mm6); /* 4 ; tp12 = t1 + t2 */ psllw_i2r(SHIFT_FRW_COL, mm7); /* t3 */ movq_r2r(mm5, mm4); /* 4 ; t0 */ psubsw_r2r(mm7, mm5); /* tm03 = t0 - t3 */ paddsw_r2r(mm5, mm1); /* y2 = tm03 + tm12*tg_2_16 */ paddsw_r2r(mm7, mm4); /* 7 ; tp03 = t0 + t3 */ por_m2r(fdct_one_corr, mm1); /* correction y2 +0.5 */ psllw_i2r(SHIFT_FRW_COL+1, mm2); /* t6 */ pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5); /* tm03*tg_2_16 */ movq_r2r(mm4, mm7); /* 7 ; tp03 */ psubsw_m2r(*x5, mm3); /* t5 = x[2] - x[5] */ psubsw_r2r(mm6, mm4); /* y4 = tp03 - tp12 */ movq_r2m(mm1, *y2); /* 1 ; save y2 */ paddsw_r2r(mm6, mm7); /* 6 ; y0 = tp03 + tp12 */ movq_m2r(*x3, mm1); /* 1 ; x3 */ psllw_i2r(SHIFT_FRW_COL+1, mm3); /* t5 */ psubsw_m2r(*x4, mm1); /* t4 = x[3] - x[4] */ movq_r2r(mm2, mm6); /* 6 ; t6 */ movq_r2m(mm4, *y4); /* 4 ; save y4 */ paddsw_r2r(mm3, mm2); /* t6 + t5 */ pmulhw_m2r(*ocos_4_16, mm2); /* tp65 = (t6 + t5)*cos_4_16 */ psubsw_r2r(mm3, mm6); /* 3 ; t6 - t5 */ pmulhw_m2r(*ocos_4_16, mm6); /* tm65 = (t6 - t5)*cos_4_16 */ psubsw_r2r(mm0, mm5); /* 0 ; y6 = tm03*tg_2_16 - tm12 */ por_m2r(fdct_one_corr, mm5); /* correction y6 +0.5 */ psllw_i2r(SHIFT_FRW_COL, mm1); /* t4 */ por_m2r(fdct_one_corr, mm2); /* correction tp65 +0.5 */ 
movq_r2r(mm1, mm4); /* 4 ; t4 */ movq_m2r(*x0, mm3); /* 3 ; x0 */ paddsw_r2r(mm6, mm1); /* tp465 = t4 + tm65 */ psubsw_m2r(*x7, mm3); /* t7 = x[0] - x[7] */ psubsw_r2r(mm6, mm4); /* 6 ; tm465 = t4 - tm65 */ movq_m2r(*(fdct_tg_all_16 + 0), mm0); /* 0 ; tg_1_16 */ psllw_i2r(SHIFT_FRW_COL, mm3); /* t7 */
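/*
 * Keyboard shortcuts (code-viewer page footer, not part of the source file):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 *   Show shortcuts       ?
 */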