📄 text_fdct_mmx.c
字号:
// Fixed-point trigonometric constants for the DCT, one signed 16-bit word each.
// A scaled value above 32767 does not fit in a short and is stored wrapped
// (value - 65536); see the per-constant notes.
// NOTE(review): the assembly that follows reads tg_1_16 / tg_2_16 / tg_3_16 /
// ocos_4_16 / one_corr (no leading underscore) as qwords. Those symbols are
// defined outside this view -- confirm whether these scalar copies are still used.
static const short _tg_1_16 = 13036;   // tan(1*pi/16) * (1<<16) + 0.5
static const short _tg_2_16 = 27146;   // tan(2*pi/16) * (1<<16) + 0.5
static const short _tg_3_16 = -21746;  // tan(3*pi/16) * (1<<16) + 0.5 = 43790, stored as 43790 - 65536
static const short _cos_4_16 = -19195; // cos(4*pi/16) * (1<<16) + 0.5 = 46341, stored as 46341 - 65536
static const short _ocos_4_16 = 23170; // cos(4*pi/16) * (1<<15) + 0.5
static const short _one_corr = 1;      // rounding compensation

// Scratch words named after the intermediate terms used in the column-transform
// comments. None of them is referenced by the assembly visible in this section.
static short t0, t1, t2, t3, t4, t5, t6, t7;
static short tp03, tm03, tp12, tm12, tp65, tm65;
static short tp465, tm465, tp765, tm765;
__asm {
////////////////////////////////////////////////////////////////////////
//
// The high-level pseudocode for the fdct_mm32() routine :
//
// fdct_mm32()
// {
// forward_dct_col03(); // dct_column transform on cols 0-3
// forward_dct_col47(); // dct_column transform on cols 4-7
// for ( j = 0; j < 8; j=j+1 )
// forward_dct_row1(j); // dct_row transform on row #j
// }
;// Column-pass setup: INP is loaded with the pointer held in blk and OUT is
;// then set to the same address.
;// NOTE(review): INP, OUT, TABLEF, tg_all_16 and the x0..x7 / y0..y7 operands
;// used by the column code are defined outside this view (presumably register
;// and addressing macros). If xN/yN address through INP/OUT, OUT == INP means
;// the column transform overwrites its own input -- confirm before editing.
mov INP, dword ptr [blk]; ;// INP = blk: input data is row 0 of blk[]
;// transform the left half of the matrix (4 columns)
lea TABLEF, dword ptr [tg_all_16]; ;// TABLEF = address of tg_all_16
mov OUT, INP; ;// OUT = INP
// lea round_frw_col, dword ptr [r_frw_col]
// for ( i = 0; i < 2; i = i + 1)
// the for-loop is executed twice. We are better off unrolling the
// loop to avoid branch misprediction.
mmx32_fdct_col03: // begin processing columns 0-3
;// Column pass over the first four columns. Every movq moves 64 bits, i.e.
;// four 16-bit words, so each packed instruction applies the same step to
;// four columns in parallel:
;//   paddsw / psubsw  saturating signed word add / subtract
;//   psllw            per-word left shift (input pre-scaling)
;//   pmulhw           signed word multiply, high 16 bits of each product kept
;//   por              ORs the one_corr bits into the result
;// NOTE(review): x0..x7, y0..y7, SHIFT_FRW_COL and the qword constants
;// tg_1_16 / tg_2_16 / tg_3_16 / ocos_4_16 / one_corr are defined outside this
;// view. The notes below assume xN/yN address row N through INP/OUT and that
;// the qword constants replicate the scalar _tg_* / _ocos_4_16 / _one_corr
;// values declared before the __asm block -- confirm.
;// The numeric tag in a trailing comment ("; 7 ;") appears to name the mm
;// register that is loaded or last read on that line.
;// Loads, arithmetic and stores are interleaved (presumably for scheduling).
;// Each yN store comes after the final load of xN, which matters because OUT
;// is set equal to INP before this label; keep that order when editing.
movq mm0, [x1] ; 0 ; x1
;//
movq mm1, [x6] ; 1 ; x6
movq mm2, mm0 ; 2 ; x1
movq mm3, [x2] ; 3 ; x2
paddsw mm0, mm1 ; t1 = x[1] + x[6]
movq mm4, [x5] ; 4 ; x5
psllw mm0, SHIFT_FRW_COL ; t1
movq mm5, [x0] ; 5 ; x0
paddsw mm4, mm3 ; t2 = x[2] + x[5]
paddsw mm5, [x7] ; t0 = x[0] + x[7]
psllw mm4, SHIFT_FRW_COL ; t2
movq mm6, mm0 ; 6 ; t1
psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16
psubsw mm0, mm4 ; tm12 = t1 - t2
movq mm7, [x3] ; 7 ; x3
pmulhw mm1, mm0 ; tm12*tg_2_16
paddsw mm7, [x4] ; t3 = x[3] + x[4]
psllw mm5, SHIFT_FRW_COL ; t0
paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
psllw mm7, SHIFT_FRW_COL ; t3
movq mm4, mm5 ; 4 ; t0
psubsw mm5, mm7 ; tm03 = t0 - t3
paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
por mm1, qword ptr one_corr ; correction y2 +0.5
;// t6 (and t5 below) are shifted one bit further than the other terms --
;// presumably to offset ocos_4_16 being scaled by 1<<15 rather than 1<<16.
psllw mm2, SHIFT_FRW_COL+1 ; t6
pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16
movq mm7, mm4 ; 7 ; tp03
psubsw mm3, [x5] ; t5 = x[2] - x[5]
psubsw mm4, mm6 ; y4 = tp03 - tp12
movq [y2], mm1 ; 1 ; save y2
paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
movq mm1, [x3] ; 1 ; x3
psllw mm3, SHIFT_FRW_COL+1 ; t5
psubsw mm1, [x4] ; t4 = x[3] - x[4]
movq mm6, mm2 ; 6 ; t6
movq [y4], mm4 ; 4 ; save y4
paddsw mm2, mm3 ; t6 + t5
pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*ocos_4_16
psubsw mm6, mm3 ; 3 ; t6 - t5
pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*ocos_4_16
psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
por mm5, qword ptr one_corr ; correction y6 +0.5
psllw mm1, SHIFT_FRW_COL ; t4
por mm2, qword ptr one_corr ; correction tp65 +0.5
movq mm4, mm1 ; 4 ; t4
movq mm3, [x0] ; 3 ; x0
paddsw mm1, mm6 ; tp465 = t4 + tm65
psubsw mm3, [x7] ; t7 = x[0] - x[7]
psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16
psllw mm3, SHIFT_FRW_COL ; t7
movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16
pmulhw mm0, mm1 ; tp465*tg_1_16
movq [y0], mm7 ; 7 ; save y0
pmulhw mm6, mm4 ; tm465*tg_3_16
movq [y6], mm5 ; 5 ; save y6
movq mm7, mm3 ; 7 ; t7
movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16
psubsw mm7, mm2 ; tm765 = t7 - tp65
paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
pmulhw mm5, mm7 ; tm765*tg_3_16
paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
;// Each pmulhw by tg_3_16 is followed by adding the multiplicand back (mm6 here,
;// mm5 below) -- presumably because tg_3_16 is stored wrapped (value - 65536),
;// as the scalar _tg_3_16 is, so the add restores the full product.
paddsw mm6, mm4 ; tm465*tg_3_16
pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16
;//
por mm0, qword ptr one_corr ; correction y1 +0.5
paddsw mm5, mm7 ; tm765*tg_3_16
psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
add INP, 0x08 ; // advance INP by 8 bytes (four 16-bit words) for the next block
movq [y1], mm0 ; 0 ; save y1
paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
movq [y3], mm7 ; 7 ; save y3
psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
movq [y5], mm5 ; 5 ; save y5
;// y7 is still held in mm3 here; it is written to [y7] by the second
;// instruction of mmx32_fdct_col47.
mmx32_fdct_col47: // begin processing columns 4-7
;// Column pass over the second four columns: the same instruction sequence as
;// mmx32_fdct_col03, with two differences visible here:
;//   * the block first stores the y7 result left in mm3 by the previous block;
;//   * every result store uses [yN+8] instead of [yN].
;// NOTE(review): x0..x7 / y0..y7 are defined outside this view. The loads use
;// plain [xN] while the stores add +8, which is consistent with xN addressing
;// through INP (advanced by 8 in the previous block) and yN through OUT
;// (not advanced) -- confirm against the macro definitions.
;// Each [yN+8] store comes after the final load of xN in this block; keep that
;// order when editing, since OUT was set equal to the original INP.
movq mm0, [x1] ; 0 ; x1
;//
movq [y7], mm3 ; 3 ; save y7 (columns 0-3)
;//
movq mm1, [x6] ; 1 ; x6
movq mm2, mm0 ; 2 ; x1
movq mm3, [x2] ; 3 ; x2
paddsw mm0, mm1 ; t1 = x[1] + x[6]
movq mm4, [x5] ; 4 ; x5
psllw mm0, SHIFT_FRW_COL ; t1
movq mm5, [x0] ; 5 ; x0
paddsw mm4, mm3 ; t2 = x[2] + x[5]
paddsw mm5, [x7] ; t0 = x[0] + x[7]
psllw mm4, SHIFT_FRW_COL ; t2
movq mm6, mm0 ; 6 ; t1
psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16
psubsw mm0, mm4 ; tm12 = t1 - t2
movq mm7, [x3] ; 7 ; x3
pmulhw mm1, mm0 ; tm12*tg_2_16
paddsw mm7, [x4] ; t3 = x[3] + x[4]
psllw mm5, SHIFT_FRW_COL ; t0
paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
psllw mm7, SHIFT_FRW_COL ; t3
movq mm4, mm5 ; 4 ; t0
psubsw mm5, mm7 ; tm03 = t0 - t3
paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
por mm1, qword ptr one_corr ; correction y2 +0.5
;// t6 (and t5 below) are shifted one bit further than the other terms --
;// presumably to offset ocos_4_16 being scaled by 1<<15 rather than 1<<16.
psllw mm2, SHIFT_FRW_COL+1 ; t6
pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16
movq mm7, mm4 ; 7 ; tp03
psubsw mm3, [x5] ; t5 = x[2] - x[5]
psubsw mm4, mm6 ; y4 = tp03 - tp12
movq [y2+8], mm1 ; 1 ; save y2
paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
movq mm1, [x3] ; 1 ; x3
psllw mm3, SHIFT_FRW_COL+1 ; t5
psubsw mm1, [x4] ; t4 = x[3] - x[4]
movq mm6, mm2 ; 6 ; t6
movq [y4+8], mm4 ; 4 ; save y4
paddsw mm2, mm3 ; t6 + t5
pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*ocos_4_16
psubsw mm6, mm3 ; 3 ; t6 - t5
pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*ocos_4_16
psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
por mm5, qword ptr one_corr ; correction y6 +0.5
psllw mm1, SHIFT_FRW_COL ; t4
por mm2, qword ptr one_corr ; correction tp65 +0.5
movq mm4, mm1 ; 4 ; t4
movq mm3, [x0] ; 3 ; x0
paddsw mm1, mm6 ; tp465 = t4 + tm65
psubsw mm3, [x7] ; t7 = x[0] - x[7]
psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16
psllw mm3, SHIFT_FRW_COL ; t7
movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16
pmulhw mm0, mm1 ; tp465*tg_1_16
movq [y0+8], mm7 ; 7 ; save y0
pmulhw mm6, mm4 ; tm465*tg_3_16
movq [y6+8], mm5 ; 5 ; save y6
movq mm7, mm3 ; 7 ; t7
movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16
psubsw mm7, mm2 ; tm765 = t7 - tp65
paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
pmulhw mm5, mm7 ; tm765*tg_3_16
paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
;// Each pmulhw by tg_3_16 is followed by adding the multiplicand back (mm6 here,
;// mm5 below) -- presumably because tg_3_16 is stored wrapped (value - 65536),
;// as the scalar _tg_3_16 is, so the add restores the full product.
paddsw mm6, mm4 ; tm465*tg_3_16
pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16
;//
por mm0, qword ptr one_corr ; correction y1 +0.5
paddsw mm5, mm7 ; tm765*tg_3_16
psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
;//
movq [y1+8], mm0 ; 0 ; save y1
paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
movq [y3+8], mm7 ; 7 ; save y3
psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
movq [y5+8], mm5 ; 5 ; save y5
movq [y7+8], mm3 ; 3 ; save y7
// emms;
// } // end of forward_dct_col07()
// done with dct_col transform
////////////////////////////////////////////////////////////////////////
//
// fdct_mmx32_rows() --
// the following subroutine performs the row-transform operation,
//
// The output is stored into blk[], destroying the original
// source data.
// v1.01 - output is range-clipped to {-2048, +2047}
;// Row-pass setup. INP is reloaded from blk because the column pass advanced
;// it by 8 bytes; OUT is then pointed at the same address.
;// NOTE(review): TABLE, round_frw_row, tab_frw_01234567 and r_frw_row are
;// defined outside this view, and the row loop itself starts after the last
;// line visible here -- the roles noted below come from the existing comments.
mov INP, dword ptr [blk]; ;// INP = blk: row 0
mov edi, 0x08; // x = 8 (row counter for the loop described below)
lea TABLE, dword ptr [tab_frw_01234567]; // TABLE = address of tab_frw_01234567, used for row 0
mov OUT, INP; ;// OUT = INP
lea round_frw_row, dword ptr [r_frw_row]; ;// round_frw_row = address of r_frw_row
// for ( x = 8; x > 0; --x ) // transform 1 row per iteration
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -