📄 text_fdct_mmx.c
字号:
// Fixed-point constants for the column pass of the forward DCT.
// Values whose true magnitude exceeds 0.5 in Q16 do not fit a signed 16-bit
// word and are stored wrapped (value - 65536); the assembly below adds the
// multiplicand back after pmulhw where that compensation is needed.
// NOTE(review): the asm references unprefixed names (tg_1_16, tg_2_16,
// tg_3_16, ocos_4_16, one_corr) as qword operands; those are defined outside
// this view - presumably packed copies of these scalars. Confirm.
const static short _tg_1_16 = 13036; // tan(1*pi/16) * (1<<16) + 0.5
const static short _tg_2_16 = 27146; // tan(2*pi/16) * (1<<16) + 0.5
const static short _tg_3_16 =-21746; // tan(3*pi/16) * (1<<16) + 0.5 = 43790, wrapped to signed 16 bits
const static short _cos_4_16 =-19195; // cos(4*pi/16) * (1<<16) + 0.5 = 46341, wrapped to signed 16 bits
const static short _ocos_4_16 = 23170; // cos(4*pi/16) * (1<<15) + 0.5
const static short _one_corr = 1; // rounding compensation
// Scratch names matching the intermediate terms named in the asm comments.
// NOTE(review): none of these are referenced in the visible part of the
// routine (all intermediates live in mm0-mm7) - confirm before removing.
static short t0, t1, t2, t3, t4, t5, t6, t7;
static short tp03, tm03, tp12, tm12, tp65, tm65;
static short tp465, tm465, tp765, tm765;
__asm {
    ////////////////////////////////////////////////////////////////////////
    //
    // The high-level pseudocode for the fdct_mm32() routine :
    //
    // fdct_mm32()
    // {
    //    forward_dct_col03(); // dct_column transform on cols 0-3
    //    forward_dct_col47(); // dct_column transform on cols 4-7
    //    for ( j = 0; j < 8; j=j+1 )
    //       forward_dct_row1(j); // dct_row transform on row #j
    // }

    // NOTE(review): INP, OUT, TABLEF, TABLE, round_frw_row, SHIFT_FRW_COL and
    // the x0..x7 / y0..y7 operands are defined outside this view; presumably
    // register aliases and row-offset address macros based on INP / OUT.
    mov INP, dword ptr [blk]; ;// input data is row 0 of blk[]
    ;// transform the left half of the matrix (4 columns)
    lea TABLEF, dword ptr [tg_all_16];
    mov OUT, INP;
    // lea round_frw_col, dword ptr [r_frw_col]
    // for ( i = 0; i < 2; i = i + 1)
    // the for-loop is executed twice.
    // We are better off unrolling the
    // loop to avoid branch misprediction.
    // mmx32_fdct_col03: // begin processing columns 0-3
    // Each movq handles four 16-bit lanes, i.e. four columns at once.
    // Loads, arithmetic and stores are interleaved by hand; the leading
    // digit in a trailing comment names the mm register being (re)assigned.
    movq mm0, [x1] ; 0 ; x1
    ;//
    movq mm1, [x6] ; 1 ; x6
    movq mm2, mm0 ; 2 ; x1
    movq mm3, [x2] ; 3 ; x2
    paddsw mm0, mm1 ; t1 = x[1] + x[6]
    movq mm4, [x5] ; 4 ; x5
    psllw mm0, SHIFT_FRW_COL ; t1
    movq mm5, [x0] ; 5 ; x0
    paddsw mm4, mm3 ; t2 = x[2] + x[5]
    paddsw mm5, [x7] ; t0 = x[0] + x[7]
    psllw mm4, SHIFT_FRW_COL ; t2
    movq mm6, mm0 ; 6 ; t1
    psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
    movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16
    psubsw mm0, mm4 ; tm12 = t1 - t2
    movq mm7, [x3] ; 7 ; x3
    pmulhw mm1, mm0 ; tm12*tg_2_16
    paddsw mm7, [x4] ; t3 = x[3] + x[4]
    psllw mm5, SHIFT_FRW_COL ; t0
    paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
    psllw mm7, SHIFT_FRW_COL ; t3
    movq mm4, mm5 ; 4 ; t0
    psubsw mm5, mm7 ; tm03 = t0 - t3
    paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
    paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
    // por with one_corr sets bits rather than adding; it is used here as the
    // rounding correction described by the trailing comments.
    por mm1, qword ptr one_corr ; correction y2 +0.5
    // t5/t6 get one extra bit of left shift: they are multiplied by
    // ocos_4_16, which is scaled by 1<<15, while pmulhw keeps the high
    // 16 bits of the product.
    psllw mm2, SHIFT_FRW_COL+1 ; t6
    pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16
    movq mm7, mm4 ; 7 ; tp03
    psubsw mm3, [x5] ; t5 = x[2] - x[5]
    psubsw mm4, mm6 ; y4 = tp03 - tp12
    movq [y2], mm1 ; 1 ; save y2
    paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
    movq mm1, [x3] ; 1 ; x3
    psllw mm3, SHIFT_FRW_COL+1 ; t5
    psubsw mm1, [x4] ; t4 = x[3] - x[4]
    movq mm6, mm2 ; 6 ; t6
    movq [y4], mm4 ; 4 ; save y4
    paddsw mm2, mm3 ; t6 + t5
    pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
    psubsw mm6, mm3 ; 3 ; t6 - t5
    pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
    psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
    por mm5, qword ptr one_corr ; correction y6 +0.5
    psllw mm1, SHIFT_FRW_COL ; t4
    por mm2, qword ptr one_corr ; correction tp65 +0.5
    movq mm4, mm1 ; 4 ; t4
    movq mm3, [x0] ; 3 ; x0
    paddsw mm1, mm6 ; tp465 = t4 + tm65
    psubsw mm3, [x7] ; t7 = x[0] - x[7]
    psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
    movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16
    psllw mm3, SHIFT_FRW_COL ; t7
    movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16
    pmulhw mm0, mm1 ; tp465*tg_1_16
    movq [y0], mm7 ; 7 ; save y0
    pmulhw mm6, mm4 ; tm465*tg_3_16
    movq [y6], mm5 ; 5 ; save y6
    movq mm7, mm3 ; 7 ; t7
    movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16
    psubsw mm7, mm2 ; tm765 = t7 - tp65
    paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
    pmulhw mm5, mm7 ; tm765*tg_3_16
    paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
    // The multiplicand is added back after the tg_3_16 multiply to undo the
    // 16-bit wrap of the constant (see the _tg_3_16 declaration).
    paddsw mm6, mm4 ; tm465*tg_3_16
    pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16
    ;//
    por mm0, qword ptr one_corr ; correction y1 +0.5
    paddsw mm5, mm7 ; tm765*tg_3_16
    psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
    // INP advances by 8 bytes (four 16-bit columns) so the x loads of the
    // next pass read columns 4-7; the remaining stores of this pass still
    // use the unshifted y operands.
    add INP, 0x08 ; // increment pointer
    movq [y1], mm0 ; 0 ; save y1
    paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
    movq [y3], mm7 ; 7 ; save y3
    psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
    movq [y5], mm5 ; 5 ; save y5
    // mmx32_fdct_col47: // begin processing columns 4-7
    // Same instruction sequence as the first pass; results are stored at
    // y+8. The first load of this pass is issued before the last store
    // (y7) of the previous pass.
    movq mm0, [x1] ; 0 ; x1
    ;//
    movq [y7], mm3 ; 3 ; save y7 (columns 0-3)
    ;//
    movq mm1, [x6] ; 1 ; x6
    movq mm2, mm0 ; 2 ; x1
    movq mm3, [x2] ; 3 ; x2
    paddsw mm0, mm1 ; t1 = x[1] + x[6]
    movq mm4, [x5] ; 4 ; x5
    psllw mm0, SHIFT_FRW_COL ; t1
    movq mm5, [x0] ; 5 ; x0
    paddsw mm4, mm3 ; t2 = x[2] + x[5]
    paddsw mm5, [x7] ; t0 = x[0] + x[7]
    psllw mm4, SHIFT_FRW_COL ; t2
    movq mm6, mm0 ; 6 ; t1
    psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
    movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16
    psubsw mm0, mm4 ; tm12 = t1 - t2
    movq mm7, [x3] ; 7 ; x3
    pmulhw mm1, mm0 ; tm12*tg_2_16
    paddsw mm7, [x4] ; t3 = x[3] + x[4]
    psllw mm5, SHIFT_FRW_COL ; t0
    paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
    psllw mm7, SHIFT_FRW_COL ; t3
    movq mm4, mm5 ; 4 ; t0
    psubsw mm5, mm7 ; tm03 = t0 - t3
    paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
    paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
    por mm1, qword ptr one_corr ; correction y2 +0.5
    psllw mm2, SHIFT_FRW_COL+1 ; t6
    pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16
    movq mm7, mm4 ; 7 ; tp03
    psubsw mm3, [x5] ; t5 = x[2] - x[5]
    psubsw mm4, mm6 ; y4 = tp03 - tp12
    movq [y2+8], mm1 ; 1 ; save y2
    paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
    movq mm1, [x3] ; 1 ; x3
    psllw mm3, SHIFT_FRW_COL+1 ; t5
    psubsw mm1, [x4] ; t4 = x[3] - x[4]
    movq mm6, mm2 ; 6 ; t6
    movq [y4+8], mm4 ; 4 ; save y4
    paddsw mm2, mm3 ; t6 + t5
    pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
    psubsw mm6, mm3 ; 3 ; t6 - t5
    pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
    psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
    por mm5, qword ptr one_corr ; correction y6 +0.5
    psllw mm1, SHIFT_FRW_COL ; t4
    por mm2, qword ptr one_corr ; correction tp65 +0.5
    movq mm4, mm1 ; 4 ; t4
    movq mm3, [x0] ; 3 ; x0
    paddsw mm1, mm6 ; tp465 = t4 + tm65
    psubsw mm3, [x7] ; t7 = x[0] - x[7]
    psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
    movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16
    psllw mm3, SHIFT_FRW_COL ; t7
    movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16
    pmulhw mm0, mm1 ; tp465*tg_1_16
    movq [y0+8], mm7 ; 7 ; save y0
    pmulhw mm6, mm4 ; tm465*tg_3_16
    movq [y6+8], mm5 ; 5 ; save y6
    movq mm7, mm3 ; 7 ; t7
    movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16
    psubsw mm7, mm2 ; tm765 = t7 - tp65
    paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
    pmulhw mm5, mm7 ; tm765*tg_3_16
    paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
    paddsw mm6, mm4 ; tm465*tg_3_16
    pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16
    ;//
    por mm0, qword ptr one_corr ; correction y1 +0.5
    paddsw mm5, mm7 ; tm765*tg_3_16
    psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
    ;//
    movq [y1+8], mm0 ; 0 ; save y1
    paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
    movq [y3+8], mm7 ; 7 ; save y3
    psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
    movq [y5+8], mm5 ; 5 ; save y5
    movq [y7+8], mm3 ; 3 ; save y7
    // emms;
    // } // end of forward_dct_col07()
    // done with dct_col transform

    ////////////////////////////////////////////////////////////////////////
    //
    // fdct_mmx32_rows() --
    // the following subroutine performs the row-transform operation,
    //
    // The output is stored into blk[], destroying the original
    // source data.
    // v1.01 - output is range-clipped to {-2048, +2047}

    // Row-pass setup: INP and OUT are both reset to the start of blk, so the
    // row pass works in place on the column-pass output; edi holds the row
    // countdown.
    mov INP, dword ptr [blk]; ;// row 0
    mov edi, 0x08; //x = 8
    lea TABLE, dword ptr [tab_frw_01234567]; // row 0
    mov OUT, INP;
    lea round_frw_row, dword ptr [r_frw_row];
    // for ( x = 8; x > 0; --x ) // transform 1 row per iteration
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -