vf_fspp.c
来自「君正早期ucos系统(只有早期的才不没有打包成库),MPLAYER,文件系统,图」· C语言 代码 · 共 2,126 行 · 第 1/5 页
C
2,126 行
"movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t" "psllw $2, %%mm7 \n\t" "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" "psubw %%mm6, %%mm2 \n\t" "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" "paddusw %%mm6, %%mm2 \n\t" "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t" // "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" "paddw %%mm6, %%mm2 \n\t" "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" "psubusw %%mm6, %%mm2 \n\t"//This func is totally compute-bound, operates at huge speed. So, DC shortcut// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. "paddw "MANGLE(MM_2)", %%mm5 \n\t" "movq %%mm2, %%mm6 \n\t" "paddw %%mm5, %%mm2 \n\t" "psubw %%mm6, %%mm5 \n\t" "movq %%mm1, %%mm6 \n\t" "paddw %%mm7, %%mm1 \n\t" //d2 "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" "psubw %%mm7, %%mm6 \n\t" //d6 "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t" "psraw $2, %%mm5 \n\t" "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" "psubw %%mm7, %%mm6 \n\t" // t7 d2 /t11 t4 t6 - d6 /t10 "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" "paddusw %%mm7, %%mm6 \n\t" "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" "paddw %%mm7, %%mm6 \n\t" "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" "psubusw %%mm7, %%mm6 \n\t" //movq [edi+"DCTSIZE_S"*2*2], mm1 //movq [edi+"DCTSIZE_S"*6*2], mm6 "movq %%mm1, %%mm7 \n\t" "psraw $2, %%mm2 \n\t" "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" "psubw %%mm6, %%mm1 \n\t" "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" "paddw %%mm7, %%mm6 \n\t" //'t13 "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- "movq %%mm2, %%mm7 \n\t" "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" "paddw %%mm6, %%mm2 \n\t" //'t0 "movq %%mm2, "MANGLE(temps)"+0*8 \n\t" //! "psubw %%mm6, %%mm7 \n\t" //'t3 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" "psubw %%mm6, %%mm1 \n\t" //'t12 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5 "movq %%mm5, %%mm6 \n\t" "movq %%mm7, "MANGLE(temps)"+3*8 \n\t" "paddw %%mm2, %%mm3 \n\t" //t10 "paddw %%mm4, %%mm2 \n\t" //t11 "paddw %%mm0, %%mm4 \n\t" //t12 "movq %%mm3, %%mm7 \n\t" "psubw %%mm4, %%mm3 \n\t" "psllw $2, %%mm3 \n\t" "psllw $2, %%mm7 \n\t" //opt for P6 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" "psllw $2, %%mm4 \n\t" "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t" "psllw $2, %%mm2 \n\t" "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" "paddw %%mm1, %%mm5 \n\t" //'t1 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t" "psubw %%mm1, %%mm6 \n\t" //'t2 // t7 't12 't11 t4 t6 - 't13 't10 --- "paddw %%mm3, %%mm7 \n\t" //z2 "movq %%mm5, "MANGLE(temps)"+1*8 \n\t" "paddw %%mm3, %%mm4 \n\t" //z4 "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm6, "MANGLE(temps)"+2*8 \n\t" "psubw %%mm2, %%mm1 \n\t" //z13 //=== "paddw %%mm2, %%mm0 \n\t" //z11 "movq %%mm1, %%mm5 \n\t" "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t" "psubw %%mm7, %%mm1 \n\t" //d3 "paddw %%mm7, %%mm5 \n\t" //d5 "psubw %%mm3, %%mm1 \n\t" "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t" "psubw %%mm2, %%mm5 \n\t" "movq %%mm0, %%mm6 \n\t" "paddw %%mm4, %%mm0 \n\t" //d1 "paddusw %%mm3, %%mm1 \n\t" "psubw %%mm4, %%mm6 \n\t" //d7 // d1 d3 - - - d5 d7 - "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t" "psubw %%mm7, %%mm0 \n\t" "psubw %%mm4, %%mm6 \n\t" "paddusw %%mm2, %%mm5 \n\t" "paddusw %%mm4, %%mm6 \n\t" "paddw %%mm3, %%mm1 \n\t" "paddw %%mm2, %%mm5 \n\t" "paddw %%mm4, %%mm6 \n\t" "psubusw %%mm3, %%mm1 \n\t" "psubusw %%mm2, %%mm5 \n\t" "psubusw %%mm4, %%mm6 \n\t" "movq %%mm1, %%mm4 \n\t" "por %%mm5, %%mm4 \n\t" "paddusw %%mm7, %%mm0 \n\t" "por %%mm6, %%mm4 \n\t" "paddw %%mm7, %%mm0 \n\t" "packssdw %%mm4, %%mm4 \n\t" "psubusw %%mm7, %%mm0 \n\t" "movd %%mm4, %%"REG_a" \n\t" "or %%"REG_a", %%"REG_a" \n\t" "jnz 3f \n\t" //movq [edi+"DCTSIZE_S"*3*2], mm1 //movq [edi+"DCTSIZE_S"*5*2], mm5 //movq [edi+"DCTSIZE_S"*1*2], mm0 //movq [edi+"DCTSIZE_S"*7*2], mm6 // t4 t5 - - - t6 t7 - //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile "movq "MANGLE(temps)"+0*8, %%mm4 \n\t" "movq %%mm0, %%mm1 \n\t" "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 "movq %%mm1, %%mm2 \n\t" "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t" "movq %%mm2, %%mm3 \n\t" "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 "paddw %%mm4, %%mm5 \n\t" "movq "MANGLE(temps)"+1*8, %%mm6 \n\t" //paddw mm3, MM_2 "psraw $2, %%mm3 \n\t" //tmp7 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 "psubw %%mm3, %%mm4 \n\t" "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t" "paddw %%mm3, %%mm5 \n\t" "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" "paddw %%mm6, %%mm7 \n\t" "movq "MANGLE(temps)"+2*8, %%mm3 \n\t" "psubw %%mm0, %%mm6 \n\t" "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t" "paddw %%mm0, %%mm7 \n\t" "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" "paddw %%mm3, %%mm4 \n\t" "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" "psubw %%mm1, %%mm3 \n\t" "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t" "paddw %%mm1, %%mm4 \n\t" "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t" "paddw %%mm3, %%mm5 \n\t" "movq "MANGLE(temps)"+3*8, %%mm0 \n\t" "add $24, %%"REG_S" \n\t" "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" "paddw %%mm0, %%mm6 \n\t" "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" "psubw %%mm2, %%mm0 \n\t" "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t" "paddw %%mm2, %%mm6 \n\t" "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" "paddw %%mm0, %%mm7 \n\t" "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" "add $24, %%"REG_D" \n\t" "sub $2, %%"REG_c" \n\t" "jnz 1b \n\t" "jmp 5f \n\t" "3: \n\t" //--- non DC2 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) //psraw mm5, 2 //psraw mm0, 2 //psraw mm6, 2 "movq %%mm5, %%mm3 \n\t" "psubw %%mm1, %%mm5 \n\t" "psllw $1, %%mm5 \n\t" //'z10 "paddw %%mm1, %%mm3 \n\t" //'z13 "movq %%mm0, %%mm2 \n\t" "psubw %%mm6, %%mm0 \n\t" "movq %%mm5, %%mm1 \n\t" "psllw $1, %%mm0 \n\t" //'z12 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- "paddw %%mm0, %%mm5 \n\t" "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 "paddw %%mm6, %%mm2 \n\t" //'z11 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" "movq %%mm2, %%mm7 \n\t" //--- "movq "MANGLE(temps)"+0*8, %%mm4 \n\t" "psubw %%mm3, %%mm2 \n\t" "psllw $1, %%mm2 \n\t" "paddw %%mm3, %%mm7 \n\t" //'t7 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 "movq %%mm4, %%mm6 \n\t" //paddw mm7, MM_2 "psraw $2, %%mm7 \n\t" "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t" "psubw %%mm7, %%mm6 \n\t" "movq "MANGLE(temps)"+1*8, %%mm3 \n\t" "paddw %%mm7, %%mm4 \n\t" "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" "paddw %%mm5, %%mm1 \n\t" //'t12 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" "psubw %%mm7, %%mm1 \n\t" //'t6 "movq "MANGLE(temps)"+2*8, %%mm7 \n\t" "psubw %%mm5, %%mm0 \n\t" //'t10 "movq "MANGLE(temps)"+3*8, %%mm6 \n\t" "movq %%mm3, %%mm5 \n\t" "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t" "psubw %%mm1, %%mm5 \n\t" "psubw %%mm1, %%mm2 \n\t" //'t5 "paddw %%mm1, %%mm3 \n\t" "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" "movq %%mm7, %%mm4 \n\t" "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t" "psubw %%mm2, %%mm4 \n\t" "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t" "paddw %%mm2, %%mm7 \n\t" "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" "paddw %%mm2, %%mm0 \n\t" //'t4 // 't4 't6 't5 - - - - 't7 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" "movq %%mm6, %%mm1 \n\t" "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t" "psubw %%mm0, %%mm1 \n\t" "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t" "paddw %%mm0, %%mm6 \n\t" "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" "add $24, %%"REG_S" \n\t" "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" "add $24, %%"REG_D" \n\t" "sub $2, %%"REG_c" \n\t" "jnz 1b \n\t" "5: \n\t" : "+S"(data), "+D"(output), "+c"(cnt)// input regs : "d"(thr_adr) : "%"REG_a );}#endif // HAVE_MMX#ifndef HAVE_MMXstatic void row_idct_c(DCTELEM* workspace, int16_t* output_adr, int output_stride, int cnt){ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; int_simd16_t z5, z10, z11, z12, z13; int16_t* outptr; DCTELEM* wsptr; cnt*=4; wsptr = workspace; outptr = output_adr; for (; cnt > 0; cnt--) { // Even part //Simd version reads 4x4 block and transposes it tmp10 = ( wsptr[2] + wsptr[3]); tmp11 = ( wsptr[2] - wsptr[3]); tmp13 = ( wsptr[0] + wsptr[1]); tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow tmp0 = tmp10 + tmp13; //->temps tmp3 = tmp10 - tmp13; //->temps tmp1 = tmp11 + tmp12; tmp2 = tmp11 - tmp12; // Odd part //Also transpose, with previous: // ---- ---- |||| // ---- ---- idct |||| // ---- ---- ---> |||| // ---- ---- |||| z13 = wsptr[4] + wsptr[5]; z10 = wsptr[4] - wsptr[5]; z11 = wsptr[6] + wsptr[7]; z12 = wsptr[6] - wsptr[7]; tmp7 = z11 + z13; tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562); z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_ tmp6 = (tmp12<<3) - tmp7; tmp5 = (tmp11<<3) - tmp6; tmp4 = (tmp10<<3) + tmp5; // Final output stage: descale and write column outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3); outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3); outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3); outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3); outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3); outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3); outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ? outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ? outptr++; wsptr += DCTSIZE; // advance pointer to next row }}#else /* HAVE_MMX */static void row_idct_mmx (DCTELEM* workspace, int16_t* output_adr, int output_stride, int cnt){ asm volatile( "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t" "1: \n\t" "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t" // "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t" "movq %%mm0, %%mm4 \n\t" "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" "punpcklwd %%mm1, %%mm0 \n\t" "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t" "punpckhwd %%mm1, %%mm4 \n\t" //transpose 4x4 "movq %%mm2, %%mm7 \n\t" "punpcklwd %%mm3, %%mm2 \n\t"
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?