text_fdct_mmx.c

来自「quicktime linux播放器v1」· C语言代码 · 共 1,793 行 · 第 1/3 页
1,793 行
        const static short _tg_1_16   = 13036;  //tg * (2<<16) + 0.5    const static short _tg_2_16   = 27146;  //tg * (2<<16) + 0.5    const static short _tg_3_16   =-21746;  //tg * (2<<16) + 0.5    const static short _cos_4_16  =-19195;  //cos * (2<<16) + 0.5    const static short _ocos_4_16 = 23170;  //cos * (2<<15) + 0.5    const static short _one_corr  =     1;  //rounding compensation    static short t0, t1, t2, t3, t4, t5, t6, t7;    static short tp03, tm03, tp12, tm12, tp65, tm65;    static short tp465, tm465, tp765, tm765;    __asm {    ////////////////////////////////////////////////////////////////////////    //    // The high-level pseudocode for the fdct_mm32() routine :    //    // fdct_mm32()    // {    //    forward_dct_col03(); // dct_column transform on cols 0-3    //    forward_dct_col47(); // dct_column transform on cols 4-7    //    for ( j = 0; j < 8; j=j+1 )    //      forward_dct_row1(j); // dct_row transform on row #j    // }	mov INP, dword ptr [blk];		;// input data is row 0 of blk[]    ;// transform the left half of the matrix (4 columns)    lea TABLEF, dword ptr [tg_all_16];    mov OUT, INP;//	lea round_frw_col, dword ptr [r_frw_col]    // for ( i = 0; i < 2; i = i + 1)    // the for-loop is executed twice.  
We are better off unrolling the     // loop to avoid branch misprediction.//	mmx32_fdct_col03: // begin processing columns 0-3    movq mm0, [x1] ; 0 ; x1     ;//    movq mm1, [x6] ; 1 ; x6     movq mm2, mm0 ; 2 ; x1    movq mm3, [x2] ; 3 ; x2     paddsw mm0, mm1 ; t1 = x[1] + x[6]    movq mm4, [x5] ; 4 ; x5     psllw mm0, SHIFT_FRW_COL ; t1    movq mm5, [x0] ; 5 ; x0     paddsw mm4, mm3 ; t2 = x[2] + x[5]    paddsw mm5, [x7] ; t0 = x[0] + x[7]     psllw mm4, SHIFT_FRW_COL ; t2    movq mm6, mm0 ; 6 ; t1     psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]    movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16     psubsw mm0, mm4 ; tm12 = t1 - t2    movq mm7, [x3] ; 7 ; x3     pmulhw mm1, mm0 ; tm12*tg_2_16    paddsw mm7, [x4] ; t3 = x[3] + x[4]     psllw mm5, SHIFT_FRW_COL ; t0    paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2     psllw mm7, SHIFT_FRW_COL ; t3    movq mm4, mm5 ; 4 ; t0     psubsw mm5, mm7 ; tm03 = t0 - t3    paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16     paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3    por mm1, qword ptr one_corr ; correction y2 +0.5     psllw mm2, SHIFT_FRW_COL+1 ; t6    pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16     movq mm7, mm4 ; 7 ; tp03    psubsw mm3, [x5] ; t5 = x[2] - x[5]     psubsw mm4, mm6 ; y4 = tp03 - tp12    movq [y2], mm1 ; 1 ; save y2     paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12         movq mm1, [x3] ; 1 ; x3     psllw mm3, SHIFT_FRW_COL+1 ; t5    psubsw mm1, [x4] ; t4 = x[3] - x[4]     movq mm6, mm2 ; 6 ; t6        movq [y4], mm4 ; 4 ; save y4     paddsw mm2, mm3 ; t6 + t5    pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16     psubsw mm6, mm3 ; 3 ; t6 - t5    pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16     psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12    por mm5, qword ptr one_corr ; correction y6 +0.5     psllw mm1, SHIFT_FRW_COL ; t4    por mm2, qword ptr one_corr ; correction tp65 +0.5     movq mm4, mm1 ; 4 ; t4    movq mm3, [x0] ; 3 ; x0     paddsw mm1, mm6 ; tp465 = t4 + tm65    psubsw mm3, [x7] ; t7 = x[0] - 
x[7]     psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65    movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16     psllw mm3, SHIFT_FRW_COL ; t7    movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16     pmulhw mm0, mm1 ; tp465*tg_1_16    movq [y0], mm7 ; 7 ; save y0     pmulhw mm6, mm4 ; tm465*tg_3_16    movq [y6], mm5 ; 5 ; save y6     movq mm7, mm3 ; 7 ; t7    movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16     psubsw mm7, mm2 ; tm765 = t7 - tp65    paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65     pmulhw mm5, mm7 ; tm765*tg_3_16    paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16     paddsw mm6, mm4 ; tm465*tg_3_16    pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16     ;//    por mm0, qword ptr one_corr ; correction y1 +0.5     paddsw mm5, mm7 ; tm765*tg_3_16    psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16     add INP, 0x08   ; // increment pointer    movq [y1], mm0 ; 0 ; save y1     paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465    movq [y3], mm7 ; 7 ; save y3     psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465    movq [y5], mm5 ; 5 ; save y5//  mmx32_fdct_col47: // begin processing columns 4-7    movq mm0, [x1] ; 0 ; x1     ;//    movq [y7], mm3 ; 3 ; save y7 (columns 0-4)     ;//    movq mm1, [x6] ; 1 ; x6     movq mm2, mm0 ; 2 ; x1    movq mm3, [x2] ; 3 ; x2     paddsw mm0, mm1 ; t1 = x[1] + x[6]    movq mm4, [x5] ; 4 ; x5     psllw mm0, SHIFT_FRW_COL ; t1    movq mm5, [x0] ; 5 ; x0     paddsw mm4, mm3 ; t2 = x[2] + x[5]    paddsw mm5, [x7] ; t0 = x[0] + x[7]     psllw mm4, SHIFT_FRW_COL ; t2    movq mm6, mm0 ; 6 ; t1     psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]    movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16     psubsw mm0, mm4 ; tm12 = t1 - t2    movq mm7, [x3] ; 7 ; x3     pmulhw mm1, mm0 ; tm12*tg_2_16    paddsw mm7, [x4] ; t3 = x[3] + x[4]     psllw mm5, SHIFT_FRW_COL ; t0    paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2     psllw mm7, SHIFT_FRW_COL ; t3    movq mm4, mm5 ; 4 ; t0     psubsw mm5, mm7 ; tm03 = t0 - t3    paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16     paddsw mm4, mm7 ; 7 ; tp03 = 
t0 + t3    por mm1, qword ptr one_corr ; correction y2 +0.5     psllw mm2, SHIFT_FRW_COL+1 ; t6    pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16     movq mm7, mm4 ; 7 ; tp03    psubsw mm3, [x5] ; t5 = x[2] - x[5]     psubsw mm4, mm6 ; y4 = tp03 - tp12    movq [y2+8], mm1 ; 1 ; save y2     paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12         movq mm1, [x3] ; 1 ; x3     psllw mm3, SHIFT_FRW_COL+1 ; t5    psubsw mm1, [x4] ; t4 = x[3] - x[4]     movq mm6, mm2 ; 6 ; t6        movq [y4+8], mm4 ; 4 ; save y4     paddsw mm2, mm3 ; t6 + t5    pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16     psubsw mm6, mm3 ; 3 ; t6 - t5    pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16     psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12    por mm5, qword ptr one_corr ; correction y6 +0.5     psllw mm1, SHIFT_FRW_COL ; t4    por mm2, qword ptr one_corr ; correction tp65 +0.5     movq mm4, mm1 ; 4 ; t4    movq mm3, [x0] ; 3 ; x0     paddsw mm1, mm6 ; tp465 = t4 + tm65    psubsw mm3, [x7] ; t7 = x[0] - x[7]     psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65    movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16     psllw mm3, SHIFT_FRW_COL ; t7    movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16     pmulhw mm0, mm1 ; tp465*tg_1_16    movq [y0+8], mm7 ; 7 ; save y0     pmulhw mm6, mm4 ; tm465*tg_3_16    movq [y6+8], mm5 ; 5 ; save y6     movq mm7, mm3 ; 7 ; t7    movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16     psubsw mm7, mm2 ; tm765 = t7 - tp65    paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65     pmulhw mm5, mm7 ; tm765*tg_3_16    paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16     paddsw mm6, mm4 ; tm465*tg_3_16    pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16     ;//    por mm0, qword ptr one_corr ; correction y1 +0.5     paddsw mm5, mm7 ; tm765*tg_3_16    psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16     ;//    movq [y1+8], mm0 ; 0 ; save y1     paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465    movq [y3+8], mm7 ; 7 ; save y3     psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465    movq 
[y5+8], mm5 ; 5 ; save y5    movq [y7+8], mm3 ; 3 ; save y7  //   emms;  //  }   // end of forward_dct_col07()     //  done with dct_col transform  ////////////////////////////////////////////////////////////////////////  //  // fdct_mmx32_rows() --  // the following subroutine performs the row-transform operation,  //  //  The output is stored into blk[], destroying the original  //  source data.  //  v1.01 - output is range-clipped to {-2048, +2047}	mov INP, dword ptr [blk];		;// row 0	 mov edi, 0x08;	//x = 8	lea TABLE, dword ptr [tab_frw_01234567]; // row 0	 mov OUT, INP;	lea round_frw_row, dword ptr [r_frw_row];	// for ( x = 8; x > 0; --x )  // transform 1 row per iteration
text_fdct_mmx.c - 源码说明

本页面展示了「quicktime linux播放器v1」中的 text_fdct_mmx.c 源码文件，该文件采用 C 语言编写，共 1,793 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包，用于本地学习和开发。
虫虫下载站收录了大量与 QuickTime 相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?