📄 x86.c
字号:
{ int iDataCntr; int iDataCount; float fRes; pv2sf m64pSrc1 = (pv2sf) fpSrc1; pv2sf m64pSrc2 = (pv2sf) fpSrc2; iDataCount = (iDataLength >> 1); X86_ASM ( "pxor %%mm3, %%mm3\n\t" \ "pxor %%mm4, %%mm4\n\t" \ "pxor %%mm5, %%mm5\n\t" : : : "mm3", "mm4", "mm5"); for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++) { X86_ASM ( "movq %0, %%mm0\n\t" \ "movq %1, %%mm1\n\t" \ "movq %%mm1, %%mm2\n\t" \ "pfmul %%mm0, %%mm2\n\t" \ "pfacc %%mm2, %%mm5\n\t" \ "pfmul %%mm0, %%mm0\n\t" \ "pfacc %%mm0, %%mm3\n\t" \ "pfmul %%mm1, %%mm1\n\t" \ "pfacc %%mm1, %%mm4\n\t" : : "m" (m64pSrc1[iDataCntr]), "m" (m64pSrc2[iDataCntr]) : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory"); } if (iDataLength & 0x1) { X86_ASM ( "movd %0, %%mm0\n\t" \ "movd %1, %%mm1\n\t" \ "movq %%mm1, %%mm2\n\t" \ "pfmul %%mm0, %%mm2\n\t" \ "pfacc %%mm2, %%mm5\n\t" \ "pfmul %%mm0, %%mm0\n\t" \ "pfacc %%mm0, %%mm3\n\t" \ "pfmul %%mm1, %%mm1\n\t" \ "pfacc %%mm1, %%mm4\n\t" : : "m" (fpSrc1[iDataLength - 1]), "m" (fpSrc2[iDataLength - 1]) : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory"); } X86_ASM ( "pfacc %%mm3, %%mm3\n\t" \ "pfacc %%mm4, %%mm4\n\t" \ "pfacc %%mm5, %%mm5\n\t" \ \ "movd %1, %%mm6\n\t" \ "pswapd %%mm6, %%mm7\n\t" \ "paddd %%mm7, %%mm6\n\t" \ "pi2fd %%mm6, %%mm7\n\t" \ \ "pfrcp %%mm7, %%mm6\n\t" \ "pfrcpit1 %%mm6, %%mm7\n\t" \ "pfrcpit2 %%mm6, %%mm7\n\t" \ \ "pfmul %%mm3, %%mm4\n\t" \ \ "movq %%mm4, %%mm0\n\t" \ "pfrsqrt %%mm4, %%mm1\n\t" \ "movq %%mm1, %%mm2\n\t" \ "pfmul %%mm1, %%mm1\n\t" \ "pfrsqit1 %%mm4, %%mm1\n\t" \ "pfrcpit2 %%mm2, %%mm1\n\t" \ "pfmul %%mm1, %%mm4\n\t" \ \ "pfmul %%mm6, %%mm4\n\t" \ \ "pfrcp %%mm4, %%mm0\n\t" \ "pfrcpit1 %%mm0, %%mm4\n\t" \ "pfrcpit2 %%mm0, %%mm4\n\t" \ \ "pfmul %%mm6, %%mm5\n\t" \ "pfmul %%mm4, %%mm5\n\t" \ "movd %%mm5, %0\n\t" : "=m" (fRes) : "m" (iDataLength) : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory"); X86_ASM ("femms\n\t"); return fRes;}


/*
 * Normalized zero-lag cross-correlation of two float vectors (scalar SSE).
 *
 * Accumulates sum(x*y), sum(x*x) and sum(y*y) over iDataLength samples and
 * returns (sum(x*y) / N) / (sqrt(sum(x*x) * sum(y*y)) / N).
 *
 * fpSrc1, fpSrc2  input vectors, each read at indices 0 .. iDataLength - 1
 * iDataLength     number of samples (N)
 *
 * The sums are zero-initialized so that iDataLength <= 0 no longer reads
 * uninitialized locals (the asm only stores them inside the loop).  With no
 * samples, or with an all-zero input, the quotient is 0/0, i.e. NaN under
 * IEEE-754 arithmetic.
 *
 * NOTE(review): xmm0..xmm2 carry the running sums from one asm statement to
 * the next, but are only declared as clobbers.  This relies on the compiler
 * not touching those registers in between; the per-iteration stores keep the
 * C variables in sync with the registers.
 */
float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2, int iDataLength)
{
    int iDataCntr;
    float fScale;
    float fNormFact;
    float fProdSum = 0.0F;
    float fSqSum1 = 0.0F;
    float fSqSum2 = 0.0F;
    float fRes;

    /* xmm0 = sum(x*y), xmm1 = sum(x*x), xmm2 = sum(y*y) */
    X86_ASM (
        "xorps %%xmm0, %%xmm0\n\t"
        "xorps %%xmm1, %%xmm1\n\t"
        "xorps %%xmm2, %%xmm2\n\t"
        :
        :
        : "xmm0", "xmm1", "xmm2");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        X86_ASM (
            "movss %3, %%xmm3\n\t"
            "movss %4, %%xmm4\n\t"

            "movss %%xmm4, %%xmm5\n\t"
            "mulss %%xmm3, %%xmm5\n\t"
            "addss %%xmm5, %%xmm0\n\t"

            "movss %%xmm3, %%xmm5\n\t"
            "mulss %%xmm3, %%xmm5\n\t"
            "addss %%xmm5, %%xmm1\n\t"

            "movss %%xmm4, %%xmm5\n\t"
            "mulss %%xmm4, %%xmm5\n\t"
            "addss %%xmm5, %%xmm2\n\t"

            "movss %%xmm0, %0\n\t"
            "movss %%xmm1, %1\n\t"
            "movss %%xmm2, %2\n\t"
            : "=m" (fProdSum), "=m" (fSqSum1), "=m" (fSqSum2)
            : "m" (fpSrc1[iDataCntr]), "m" (fpSrc2[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
    }
    fScale = 1.0F / iDataLength;
    fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
    fRes = (fProdSum * fScale) / fNormFact;
    return fRes;
}


/*
 * Normalized zero-lag cross-correlation of two double vectors (scalar SSE2).
 *
 * Double-precision counterpart of dsp_x86_sse_crosscorrf: accumulates
 * sum(x*y), sum(x*x) and sum(y*y) and returns
 * (sum(x*y) / N) / (sqrt(sum(x*x) * sum(y*y)) / N).
 *
 * dpSrc1, dpSrc2  input vectors, each read at indices 0 .. iDataLength - 1
 * iDataLength     number of samples (N)
 *
 * The sums are zero-initialized so that iDataLength <= 0 no longer reads
 * uninitialized locals; the result for an empty or all-zero input is 0/0
 * (NaN under IEEE-754 arithmetic).
 *
 * NOTE(review): xmm0..xmm2 carry state across separate asm statements while
 * being declared only as clobbers; this depends on the compiler leaving
 * them alone between statements.
 */
double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2, int iDataLength)
{
    int iDataCntr;
    double dScale;
    double dNormFact;
    double dProdSum = 0.0;
    double dSqSum1 = 0.0;
    double dSqSum2 = 0.0;
    double dRes;

    /* xmm0 = sum(x*y), xmm1 = sum(x*x), xmm2 = sum(y*y) */
    X86_ASM (
        "xorpd %%xmm0, %%xmm0\n\t"
        "xorpd %%xmm1, %%xmm1\n\t"
        "xorpd %%xmm2, %%xmm2\n\t"
        :
        :
        : "xmm0", "xmm1", "xmm2");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        X86_ASM (
            "movsd %3, %%xmm3\n\t"
            "movsd %4, %%xmm4\n\t"

            "movsd %%xmm4, %%xmm5\n\t"
            "mulsd %%xmm3, %%xmm5\n\t"
            "addsd %%xmm5, %%xmm0\n\t"

            "movsd %%xmm3, %%xmm5\n\t"
            "mulsd %%xmm3, %%xmm5\n\t"
            "addsd %%xmm5, %%xmm1\n\t"

            "movsd %%xmm4, %%xmm5\n\t"
            "mulsd %%xmm4, %%xmm5\n\t"
            "addsd %%xmm5, %%xmm2\n\t"

            "movsd %%xmm0, %0\n\t"
            "movsd %%xmm1, %1\n\t"
            "movsd %%xmm2, %2\n\t"
            : "=m" (dProdSum), "=m" (dSqSum1), "=m" (dSqSum2)
            : "m" (dpSrc1[iDataCntr]), "m" (dpSrc2[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
    }
    dScale = 1.0 / iDataLength;
    dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
    dRes = (dProdSum * dScale) / dNormFact;
    return dRes;
}


/*
 * Convert 16-bit integer samples to floats scaled by 1 / iIntMax (3DNow!).
 *
 * fpDest       output, written at indices 0 .. iDataLength - 1
 * ipSrc        input samples, read at indices 0 .. iDataLength - 1
 * iDataLength  number of samples
 * iIntMax      integer full-scale value; each output is src * (1 / iIntMax)
 *
 * Samples are converted two at a time: each pass loads 4 bytes of source and
 * stores 8 bytes of destination with a non-temporal store.  The paired loop
 * therefore only runs while a complete pair remains; the original bound
 * (iDataCntr < iDataLength) let the final pass of an odd-length call read
 * one short and write one float beyond the end of the buffers.  The odd
 * trailing sample is converted in plain C after FEMMS.
 *
 * NOTE(review): mm1 (the reciprocal, duplicated into both halves) is set up
 * in the first asm statement and consumed by the loop's asm statements; it
 * is declared only as a clobber, so this relies on the compiler not using
 * MMX registers in between.
 */
void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength, int iIntMax)
{
    int iDataCntr;
    float fScale;

    /* mm1 = { 1/iIntMax, 1/iIntMax }; the low half is also stored to fScale */
    X86_ASM (
        "movd %1, %%mm1\n\t"
        "pswapd %%mm1, %%mm2\n\t"
        "paddd %%mm2, %%mm1\n\t"
        "pi2fd %%mm1, %%mm1\n\t"
        "pfrcp %%mm1, %%mm2\n\t"
        "pfrcpit1 %%mm2, %%mm1\n\t"
        "pfrcpit2 %%mm2, %%mm1\n\t"
        "movd %%mm1, %0\n\t"
        : "=m" (fScale)
        : "m" (iIntMax)
        : "mm1", "mm2", "memory");
    /* Full pairs only; the last element of an odd-length input is done below. */
    for (iDataCntr = 0; iDataCntr + 1 < iDataLength; iDataCntr += 2)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "punpcklwd %%mm0, %%mm0\n\t"
            "pi2fw %%mm0, %%mm0\n\t"
            "pfmul %%mm1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (ipSrc[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    /* Leave MMX state before the C float math; fence the non-temporal stores. */
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
    if ((iDataLength % 2) != 0)
    {
        fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
    }
}


/*
 * Convert 32-bit integer samples to floats scaled by 1 / iIntMax (3DNow!).
 *
 * fpDest       output, written at indices 0 .. iDataLength - 1
 * ipSrc        input samples, read at indices 0 .. iDataLength - 1
 * iDataLength  number of samples
 * iIntMax      integer full-scale value; each output is src * (1 / iIntMax)
 *
 * Samples are converted two at a time (8-byte load, 8-byte non-temporal
 * store).  The paired loop only runs while a complete pair remains; the
 * original bound (iDataCntr < iDataLength) made the final pass of an
 * odd-length call access one element beyond the end of both buffers.  The
 * odd trailing sample is converted in plain C after FEMMS.
 *
 * NOTE(review): mm1 carries the reciprocal from the setup asm statement into
 * the loop's asm statements while being declared only as a clobber.
 */
void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength, int iIntMax)
{
    int iDataCntr;
    float fScale;

    /* mm1 = { 1/iIntMax, 1/iIntMax }; the low half is also stored to fScale */
    X86_ASM (
        "movd %1, %%mm1\n\t"
        "pswapd %%mm1, %%mm2\n\t"
        "paddd %%mm2, %%mm1\n\t"
        "pi2fd %%mm1, %%mm1\n\t"
        "pfrcp %%mm1, %%mm2\n\t"
        "pfrcpit1 %%mm2, %%mm1\n\t"
        "pfrcpit2 %%mm2, %%mm1\n\t"
        "movd %%mm1, %0\n\t"
        : "=m" (fScale)
        : "m" (iIntMax)
        : "mm1", "mm2", "memory");
    /* Full pairs only; the last element of an odd-length input is done below. */
    for (iDataCntr = 0; iDataCntr + 1 < iDataLength; iDataCntr += 2)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t"
            "pi2fd %%mm0, %%mm0\n\t"
            "pfmul %%mm1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (ipSrc[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    /* Leave MMX state before the C float math; fence the non-temporal stores. */
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
    if ((iDataLength % 2) != 0)
    {
        fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
    }
}


/*
 * Single-precision FIR filter, two outputs per pass (3DNow!).
 *
 * For each output index j in 0 .. iDataLength - 1:
 *     fpDest[j] = sum over c of fpCoeff[c] * fpSrc[iCoeffLength + j - c]
 * so fpSrc is read at indices 1 .. iDataLength + iCoeffLength - 1.
 *
 * fpDest        output, iDataLength floats
 * fpSrc         input, read as described above
 * iDataLength   number of output samples
 * fpCoeff       filter coefficients, iCoeffLength floats
 * iCoeffLength  number of coefficients
 *
 * Two fixes relative to the original:
 *  - The paired loop now stops while a complete pair remains.  With the old
 *    bound (iSrcCntr < iSrcCount) the last pass of an odd-length call read
 *    fpSrc[iSrcCount] and stored a float at fpDest[iDataLength].
 *  - The odd tail now reads fpSrc[iSrcCount - 1 - c].  The old index
 *    (iDataLength - 1 - c) omitted the iCoeffLength offset used by the main
 *    loop and by dsp_x86_sse_firf, producing a wrong last sample and a
 *    negative index whenever c exceeded iDataLength - 1.
 *
 * NOTE(review): mm0 accumulates across separate asm statements while being
 * declared only as a clobber.  m64pDest is presumably a pointer to a
 * two-float (8-byte) vector type - confirm against the pv2sf definition.
 */
void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength, const float *fpCoeff, int iCoeffLength)
{
    int iSrcCntr;
    int iDestCntr;
    int iCoeffCntr;
    int iSrcCount;
    pv2sf m64pDest = (pv2sf) fpDest;

    iDestCntr = 0;
    iSrcCount = iDataLength + iCoeffLength;
    /* Full output pairs only; an odd trailing sample is handled below. */
    for (iSrcCntr = iCoeffLength; iSrcCntr + 1 < iSrcCount; iSrcCntr += 2)
    {
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t"
            :
            :
            : "mm0");
        for (iCoeffCntr = 0; iCoeffCntr < iCoeffLength; iCoeffCntr++)
        {
            /* mm1 = two adjacent source samples, mm2 = coefficient in both halves */
            X86_ASM (
                "movq %0, %%mm1\n\t"
                "movd %1, %%mm2\n\t"
                "pswapd %%mm2, %%mm3\n\t"
                "pfadd %%mm3, %%mm2\n\t"
                "pfmul %%mm2, %%mm1\n\t"
                "pfadd %%mm1, %%mm0\n\t"
                :
                : "m" (fpSrc[iSrcCntr - iCoeffCntr]), "m" (fpCoeff[iCoeffCntr])
                : "mm0", "mm1", "mm2", "mm3", "memory");
        }
        X86_ASM (
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDestCntr++])
            :
            : "mm0", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t"
            :
            :
            : "mm0");
        for (iCoeffCntr = 0; iCoeffCntr < iCoeffLength; iCoeffCntr++)
        {
            /* Last output sample: same source offset as the paired loop. */
            X86_ASM (
                "movd %0, %%mm1\n\t"
                "movd %1, %%mm2\n\t"
                "pfmul %%mm2, %%mm1\n\t"
                "pfadd %%mm1, %%mm0\n\t"
                :
                : "m" (fpSrc[iSrcCount - 1 - iCoeffCntr]), "m" (fpCoeff[iCoeffCntr])
                : "mm0", "mm1", "mm2", "memory");
        }
        X86_ASM (
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            :
            : "mm0", "memory");
    }
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


/*
 * Single-precision FIR filter, one output per pass (scalar SSE).
 *
 * For each output index j in 0 .. iDataLength - 1:
 *     fpDest[j] = sum over c of fpCoeff[c] * fpSrc[iCoeffLength + j - c]
 * so fpSrc is read at indices 1 .. iDataLength + iCoeffLength - 1.
 *
 * fpDest        output, iDataLength floats
 * fpSrc         input, read as described above
 * iDataLength   number of output samples
 * fpCoeff       filter coefficients, iCoeffLength floats
 * iCoeffLength  number of coefficients
 *
 * NOTE(review): xmm0 accumulates across separate asm statements while being
 * declared only as a clobber.
 */
void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength, const float *fpCoeff, int iCoeffLength)
{
    int iDestCntr;
    int iSrcCntr;
    int iCoeffCntr;
    int iSrcCount;

    iDestCntr = 0;
    iSrcCount = iDataLength + iCoeffLength;
    for (iSrcCntr = iCoeffLength; iSrcCntr < iSrcCount; iSrcCntr++)
    {
        /* xmm0 = accumulator for the current output sample */
        X86_ASM (
            "xorps %%xmm0, %%xmm0\n\t"
            :
            :
            : "xmm0");
        for (iCoeffCntr = 0; iCoeffCntr < iCoeffLength; iCoeffCntr++)
        {
            X86_ASM (
                "movss %0, %%xmm1\n\t"
                "mulss %1, %%xmm1\n\t"
                "addss %%xmm1, %%xmm0\n\t"
                :
                : "m" (fpSrc[iSrcCntr - iCoeffCntr]), "m" (fpCoeff[iCoeffCntr])
                : "xmm0", "xmm1", "memory");
        }
        X86_ASM (
            "movss %%xmm0, %0\n\t"
            : "=m" (fpDest[iDestCntr++])
            :
            : "xmm0", "memory");
    }
}


/*
 * Double-precision FIR filter, one output per pass (scalar SSE2).
 *
 * For each output index j in 0 .. iDataLength - 1:
 *     dpDest[j] = sum over c of dpCoeff[c] * dpSrc[iCoeffLength + j - c]
 * so dpSrc is read at indices 1 .. iDataLength + iCoeffLength - 1.
 *
 * dpDest        output, iDataLength doubles
 * dpSrc         input, read as described above
 * iDataLength   number of output samples
 * dpCoeff       filter coefficients, iCoeffLength doubles
 * iCoeffLength  number of coefficients
 *
 * NOTE(review): xmm0 accumulates across separate asm statements while being
 * declared only as a clobber.
 */
void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength, const double *dpCoeff, int iCoeffLength)
{
    int iDestCntr;
    int iSrcCntr;
    int iCoeffCntr;
    int iSrcCount;

    iDestCntr = 0;
    iSrcCount = iDataLength + iCoeffLength;
    for (iSrcCntr = iCoeffLength; iSrcCntr < iSrcCount; iSrcCntr++)
    {
        /* xmm0 = accumulator for the current output sample */
        X86_ASM (
            "xorpd %%xmm0, %%xmm0\n\t"
            :
            :
            : "xmm0");
        for (iCoeffCntr = 0; iCoeffCntr < iCoeffLength; iCoeffCntr++)
        {
            X86_ASM (
                "movsd %0, %%xmm1\n\t"
                "mulsd %1, %%xmm1\n\t"
                "addsd %%xmm1, %%xmm0\n\t"
                :
                : "m" (dpSrc[iSrcCntr - iCoeffCntr]), "m" (dpCoeff[iCoeffCntr])
                : "xmm0", "xmm1", "memory");
        }
        X86_ASM (
            "movsd %%xmm0, %0\n\t"
            : "=m" (dpDest[iDestCntr++])
            :
            : "xmm0", "memory");
    }
}


void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff, float *fpX, float *fpY){ int iDataCntr; pv2sf m64pCoeff = (pv2sf) &fpCoeff[1]; pv2sf m64pCoeff2 = (pv2sf) &fpCoeff[3]; pv2sf m64pX = (pv2sf) fpX; pv2sf m64pY = (pv2sf) fpY; X86_ASM ( "movq %0, %%mm0\n\t" \ "pswapd %%mm0, %%mm2\n\t" \ "movd %1, %%mm3\n\t" \ "movq %2, %%mm0\n\t" \ "pswapd %%mm0, %%mm4\n\t" \ "movq %3, %%mm5\n\t" \ "movq %4, %%mm7\n\t" \ : : "m" (*m64pCoeff), "m" (fpCoeff[0]), "m" (*m64pCoeff2), "m" (*m64pX), "m" (*m64pY) : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory"); for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++) { X86_ASM ( "pxor %%mm0, %%mm0\n\t" \ "movd %1, %%mm6\n\t" \ "movq %%mm5, %%mm1\n\t" \ "pfmul %%mm2, %%mm1\n\t" \ "pfacc %%mm1, %%mm0\n\t" \ "movq %%mm6, %%mm1\n\t" \ "pfmul %%mm3, %%mm1\n\t" \ "pfacc %%mm1, %%mm0\n\t" \ "movq %%mm7, %%mm1\n\t" \ "pfmul %%mm4, %%mm1\n\t" \ "pfacc %%mm1, %%mm0\n\t" \ "pfacc %%mm0, %%mm0\n\t" \ \ "pswapd %%mm7, %%mm1\n\t" \ "movq %%mm1, %%mm7\n\t" \ "punpckldq %%mm0, %%mm7\n\t" \ \ "pswapd %%mm5, %%mm1\n\t" \ "movq %%mm1, %%mm5\n\t" \ "movq %%mm6, %%mm1\n\t" \ "punpckldq %%mm1, %%mm5\n\t" \ \ "movd %%mm0, %0\n\t" : "=m" (fpVect[iDataCntr]) : "m0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -