⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 text_fdct_mmx.c

📁 mpeg4 encoder MPEG4编码库源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
    
    // Fixed-point trigonometric constants, held as signed 16-bit words.
    // The scale is 2^16 (1<<16) for every entry except _ocos_4_16, which uses
    // 2^15 (1<<15).  A scaled value of 32768 or more does not fit in a signed
    // short and is stored wrapped (value - 65536):
    //   tan(3*pi/16) * 65536 + 0.5 -> 43790 -> 43790 - 65536 = -21746
    //   cos(4*pi/16) * 65536 + 0.5 -> 46341 -> 46341 - 65536 = -19195
    // NOTE(review): the inline assembly visible below reads quadword operands
    // named tg_1_16, tg_2_16, tg_3_16, ocos_4_16 and one_corr (no leading
    // underscore); those are separate identifiers defined outside this view.
    // None of the scalars declared here is referenced in the visible code.
    static const short _tg_1_16   = 13036;  // tan(1*pi/16) * (1<<16) + 0.5
    static const short _tg_2_16   = 27146;  // tan(2*pi/16) * (1<<16) + 0.5
    static const short _tg_3_16   =-21746;  // tan(3*pi/16) * (1<<16) + 0.5, wrapped to 16 bits
    static const short _cos_4_16  =-19195;  // cos(4*pi/16) * (1<<16) + 0.5, wrapped to 16 bits
    static const short _ocos_4_16 = 23170;  // cos(4*pi/16) * (1<<15) + 0.5
    static const short _one_corr  =     1;  // rounding compensation

    // Scratch words named after the butterfly terms used in the asm comments.
    // NOTE(review): none of them appears as an instruction operand in the
    // visible part of the routine; being static, they are shared by every
    // call of the enclosing function.
    static short t0, t1, t2, t3, t4, t5, t6, t7;
    static short tp03, tm03, tp12, tm12, tp65, tm65;
    static short tp465, tm465, tp765, tm765;

    __asm {

    ////////////////////////////////////////////////////////////////////////
    //
    // The high-level pseudocode for the fdct_mm32() routine :
    //
    // fdct_mm32()
    // {
    //    forward_dct_col03(); // dct_column transform on cols 0-3
    //    forward_dct_col47(); // dct_column transform on cols 4-7
    //    for ( j = 0; j < 8; j=j+1 )
    //      forward_dct_row1(j); // dct_row transform on row #j
    // }

	// Column-pass setup.  INP, OUT and TABLEF are names defined outside this
	// view (presumably register aliases -- TODO confirm).
	mov INP, dword ptr [blk];		;// input data is row 0 of blk[]
    ;// transform the left half of the matrix (4 columns)

    // TABLEF receives the address of tg_all_16.  It is not named again in the
    // visible code; the tg_*/ocos_*/one_corr operands used below may be macros
    // built on it (their definitions are outside this view).
    lea TABLEF, dword ptr [tg_all_16];
    // OUT = INP: the column transform writes its results over the input block.
    mov OUT, INP;

//	lea round_frw_col, dword ptr [r_frw_col]
    // for ( i = 0; i < 2; i = i + 1)
    // The loop body would run exactly twice (columns 0-3, then columns 4-7),
    // so it is written out twice below instead of looping, to avoid branch
    // misprediction.
	// Column pass, first half: an 8-point forward-DCT butterfly applied to
	// four 16-bit lanes at once (each movq moves one 64-bit quadword and the
	// arithmetic uses packed-word instructions).
	// x0..x7 are read and y0..y7 are written.  x*/y*, INP, OUT and
	// SHIFT_FRW_COL are defined outside this view -- presumably x_i / y_i
	// address row i relative to INP / OUT (TODO confirm).
	// OUT was set equal to INP before this label, so if x_i and y_i name the
	// same row the stores overwrite the input.  Every store to y_i is placed
	// after the last load of x_i (y7 is stored just after the
	// mmx32_fdct_col47 label), so loads and stores must not be reordered.
	// The extra-indented instruction of each pair belongs to an independent
	// dependency chain -- presumably interleaved for instruction pairing.
	mmx32_fdct_col03: // begin processing columns 0-3
    movq mm0, [x1] ; 0 ; x1
     ;//

    movq mm1, [x6] ; 1 ; x6
     movq mm2, mm0 ; 2 ; x1

    movq mm3, [x2] ; 3 ; x2
     paddsw mm0, mm1 ; t1 = x[1] + x[6]

    movq mm4, [x5] ; 4 ; x5
     psllw mm0, SHIFT_FRW_COL ; t1

    movq mm5, [x0] ; 5 ; x0
     paddsw mm4, mm3 ; t2 = x[2] + x[5]

    paddsw mm5, [x7] ; t0 = x[0] + x[7]
     psllw mm4, SHIFT_FRW_COL ; t2

    movq mm6, mm0 ; 6 ; t1
     psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]

    movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16
     psubsw mm0, mm4 ; tm12 = t1 - t2

    movq mm7, [x3] ; 7 ; x3
     pmulhw mm1, mm0 ; tm12*tg_2_16

    paddsw mm7, [x4] ; t3 = x[3] + x[4]
     psllw mm5, SHIFT_FRW_COL ; t0

    paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
     psllw mm7, SHIFT_FRW_COL ; t3

    movq mm4, mm5 ; 4 ; t0
     psubsw mm5, mm7 ; tm03 = t0 - t3

    paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
     paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3

    // NOTE: por is a bitwise OR, not an addition.  Each "correction ... +0.5"
    // step in this pass sets whatever bits one_corr holds (presumably the low
    // bit of every word, cf. _one_corr = 1) instead of adding a rounding term.
    // t6 (next line) and t5 (further down) are shifted by SHIFT_FRW_COL+1, one
    // bit more than t0..t4 and t7.  Both feed the pmulhw by ocos_4_16, which
    // is presumably cos(pi/4) at 2^15 scale like _ocos_4_16; the extra bit
    // would compensate for that smaller scale (TODO confirm).
    por mm1, qword ptr one_corr ; correction y2 +0.5
     psllw mm2, SHIFT_FRW_COL+1 ; t6

    pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16
     movq mm7, mm4 ; 7 ; tp03

    psubsw mm3, [x5] ; t5 = x[2] - x[5]
     psubsw mm4, mm6 ; y4 = tp03 - tp12

    movq [y2], mm1 ; 1 ; save y2
     paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
     
    movq mm1, [x3] ; 1 ; x3
     psllw mm3, SHIFT_FRW_COL+1 ; t5

    psubsw mm1, [x4] ; t4 = x[3] - x[4]
     movq mm6, mm2 ; 6 ; t6
    
    movq [y4], mm4 ; 4 ; save y4
     paddsw mm2, mm3 ; t6 + t5

    pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
     psubsw mm6, mm3 ; 3 ; t6 - t5

    pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
     psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12

    por mm5, qword ptr one_corr ; correction y6 +0.5
     psllw mm1, SHIFT_FRW_COL ; t4

    por mm2, qword ptr one_corr ; correction tp65 +0.5
     movq mm4, mm1 ; 4 ; t4

    movq mm3, [x0] ; 3 ; x0
     paddsw mm1, mm6 ; tp465 = t4 + tm65

    psubsw mm3, [x7] ; t7 = x[0] - x[7]
     psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65

    movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16
     psllw mm3, SHIFT_FRW_COL ; t7

    movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16
     pmulhw mm0, mm1 ; tp465*tg_1_16

    movq [y0], mm7 ; 7 ; save y0
     pmulhw mm6, mm4 ; tm465*tg_3_16

    movq [y6], mm5 ; 5 ; save y6
     movq mm7, mm3 ; 7 ; t7

    movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16
     psubsw mm7, mm2 ; tm765 = t7 - tp65

    paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
     pmulhw mm5, mm7 ; tm765*tg_3_16

    // pmulhw keeps the high 16 bits of each signed 16x16 product.  The two
    // "paddsw <product>, <multiplicand>" steps (mm6 += mm4 here, mm5 += mm7
    // below) add the multiplicand back onto the product.  That is consistent
    // with tg_3_16 holding tan(3*pi/16) - 1 at 2^16 scale, as the wrapped
    // value _tg_3_16 = -21746 suggests (tg_3_16 itself is defined outside
    // this view -- TODO confirm).
    paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
     paddsw mm6, mm4 ; tm465*tg_3_16

    pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16
     ;//

    por mm0, qword ptr one_corr ; correction y1 +0.5
     paddsw mm5, mm7 ; tm765*tg_3_16

    // INP advances by 8 bytes (four 16-bit words), presumably so that the x*
    // loads of the next pass read columns 4-7.  The remaining stores of this
    // pass carry no +8 offset, so y* evidently does not follow INP.
    psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
     add INP, 0x08   ; // increment pointer

    movq [y1], mm0 ; 0 ; save y1
     paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465

    movq [y3], mm7 ; 7 ; save y3
     psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465

    movq [y5], mm5 ; 5 ; save y5
    // y7 of this pass is still in mm3; it is stored by the second instruction
    // after the mmx32_fdct_col47 label.


  // Column pass, second half.  The instruction sequence appears to be the
  // same as the columns 0-3 pass, executed after INP was advanced by 8 bytes.
  // Visible differences: every store of this pass targets [y_i+8], INP is not
  // advanced again, and the previous pass's y7 (still in mm3) is stored here
  // before mm3 is reloaded.
  // As in the first pass, each store to [y_i+8] comes after the last load of
  // x_i, so loads and stores must not be reordered.  x*/y*, INP, OUT and
  // SHIFT_FRW_COL are defined outside this view.
  mmx32_fdct_col47: // begin processing columns 4-7
    movq mm0, [x1] ; 0 ; x1
     ;//
    // Deferred store from the columns 0-3 pass: mm3 holds that pass's y7 until
    // it is reloaded with x2 below.  The trailing note's "0-4" should read
    // "columns 0-3".
    movq [y7], mm3 ; 3 ; save y7 (columns 0-4)
     ;//

    movq mm1, [x6] ; 1 ; x6
     movq mm2, mm0 ; 2 ; x1

    movq mm3, [x2] ; 3 ; x2
     paddsw mm0, mm1 ; t1 = x[1] + x[6]

    movq mm4, [x5] ; 4 ; x5
     psllw mm0, SHIFT_FRW_COL ; t1

    movq mm5, [x0] ; 5 ; x0
     paddsw mm4, mm3 ; t2 = x[2] + x[5]

    paddsw mm5, [x7] ; t0 = x[0] + x[7]
     psllw mm4, SHIFT_FRW_COL ; t2

    movq mm6, mm0 ; 6 ; t1
     psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]

    movq mm1, qword ptr [tg_2_16] ; 1 ; tg_2_16
     psubsw mm0, mm4 ; tm12 = t1 - t2

    movq mm7, [x3] ; 7 ; x3
     pmulhw mm1, mm0 ; tm12*tg_2_16

    paddsw mm7, [x4] ; t3 = x[3] + x[4]
     psllw mm5, SHIFT_FRW_COL ; t0

    paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
     psllw mm7, SHIFT_FRW_COL ; t3

    movq mm4, mm5 ; 4 ; t0
     psubsw mm5, mm7 ; tm03 = t0 - t3

    paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
     paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3

    // NOTE: por is a bitwise OR, not an addition; the "+0.5" corrections set
    // the bits held in one_corr (presumably the low bit of every word).
    // t6 and t5 are shifted by SHIFT_FRW_COL+1, one bit more than the other
    // terms, before the pmulhw by ocos_4_16 (presumably a 2^15-scale constant
    // -- TODO confirm).
    por mm1, qword ptr one_corr ; correction y2 +0.5
     psllw mm2, SHIFT_FRW_COL+1 ; t6

    pmulhw mm5, qword ptr [tg_2_16] ; tm03*tg_2_16
     movq mm7, mm4 ; 7 ; tp03

    psubsw mm3, [x5] ; t5 = x[2] - x[5]
     psubsw mm4, mm6 ; y4 = tp03 - tp12

    movq [y2+8], mm1 ; 1 ; save y2
     paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
     
    movq mm1, [x3] ; 1 ; x3
     psllw mm3, SHIFT_FRW_COL+1 ; t5

    psubsw mm1, [x4] ; t4 = x[3] - x[4]
     movq mm6, mm2 ; 6 ; t6
    
    movq [y4+8], mm4 ; 4 ; save y4
     paddsw mm2, mm3 ; t6 + t5

    pmulhw mm2, qword ptr [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
     psubsw mm6, mm3 ; 3 ; t6 - t5

    pmulhw mm6, qword ptr [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
     psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12

    por mm5, qword ptr one_corr ; correction y6 +0.5
     psllw mm1, SHIFT_FRW_COL ; t4

    por mm2, qword ptr one_corr ; correction tp65 +0.5
     movq mm4, mm1 ; 4 ; t4

    movq mm3, [x0] ; 3 ; x0
     paddsw mm1, mm6 ; tp465 = t4 + tm65

    psubsw mm3, [x7] ; t7 = x[0] - x[7]
     psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65

    movq mm0, qword ptr [tg_1_16] ; 0 ; tg_1_16
     psllw mm3, SHIFT_FRW_COL ; t7

    movq mm6, qword ptr [tg_3_16] ; 6 ; tg_3_16
     pmulhw mm0, mm1 ; tp465*tg_1_16

    movq [y0+8], mm7 ; 7 ; save y0
     pmulhw mm6, mm4 ; tm465*tg_3_16

    movq [y6+8], mm5 ; 5 ; save y6
     movq mm7, mm3 ; 7 ; t7

    movq mm5, qword ptr [tg_3_16] ; 5 ; tg_3_16
     psubsw mm7, mm2 ; tm765 = t7 - tp65

    paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
     pmulhw mm5, mm7 ; tm765*tg_3_16

    // The two "paddsw <product>, <multiplicand>" steps (mm6 += mm4 here,
    // mm5 += mm7 below) add the multiplicand back onto the pmulhw result,
    // consistent with tg_3_16 being stored as tan(3*pi/16) - 1 at 2^16 scale
    // (its definition is outside this view -- TODO confirm).
    paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
     paddsw mm6, mm4 ; tm465*tg_3_16

    pmulhw mm3, qword ptr [tg_1_16] ; tp765*tg_1_16
     ;//

    por mm0, qword ptr one_corr ; correction y1 +0.5
     paddsw mm5, mm7 ; tm765*tg_3_16

    // No pointer increment here: this is the last column pass.
    psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
     ;//

    movq [y1+8], mm0 ; 0 ; save y1
     paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465

    movq [y3+8], mm7 ; 7 ; save y3
     psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465

    movq [y5+8], mm5 ; 5 ; save y5

    movq [y7+8], mm3 ; 3 ; save y7

  //   emms;
  //  }   // end of forward_dct_col07() 
    //  done with dct_col transform


  ////////////////////////////////////////////////////////////////////////
  //
  // fdct_mmx32_rows() --
  // the following subroutine performs the row-transform operation,
  //
  //  The output is stored into blk[], destroying the original
  //  source data.

  //  v1.01 - output is range-clipped to {-2048, +2047}

	// Row-pass setup.  INP is reloaded from blk because the column pass
	// advanced it by 8 bytes.  TABLE, OUT and round_frw_row are names defined
	// outside this view (presumably register aliases -- TODO confirm).
	mov INP, dword ptr [blk];		;// row 0
	 mov edi, 0x08;	//x = 8 (presumably the row counter of the loop sketched below)

	lea TABLE, dword ptr [tab_frw_01234567]; // row 0
	 mov OUT, INP;

	// OUT = INP again, so the row pass also works in place, as the banner
	// above states ("destroying the original source data").
	lea round_frw_row, dword ptr [r_frw_row];
	// for ( x = 8; x > 0; --x )  // transform 1 row per iteration
	// (the loop body itself lies beyond the visible part of the listing)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -