📄 x86.c
字号:
/*

    x86 specific optimized assembler dsp routines
    Copyright (C) 2001-2005 Jussi Laako

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/


#ifdef DSP_X86

#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <float.h>

#include "dsp/X86.h"


#ifndef DSP_X86_64
/* Buffer for the CPUID vendor string: 12 characters plus terminator. */
static char cpCPUid[13];
#endif


#ifdef __cplusplus
extern "C"
{
#endif


#ifndef DSP_X86_64

/*
 * Return the 12-character vendor string reported by CPUID function 0.
 *
 * The string is assembled from EBX, EDX and ECX (in that order) into a
 * file-static buffer that is overwritten on every call.
 *
 * All CPUID results are taken through register operands: no memory operand
 * is touched while %ebx is saved on the stack or holds CPUID output, so the
 * result does not depend on how the compiler addresses the destination.
 */
const char *dsp_x86_cpuid ()
{
    unsigned int uiEax = 0;
    unsigned int uiEbx;
    unsigned int uiEcx;
    unsigned int uiEdx;

    X86_ASM (
        "pushl %%ebx\n\t"
        "cpuid\n\t"
        "movl %%ebx, %1\n\t"
        "popl %%ebx\n\t"
        : "+a" (uiEax),
          "=D" (uiEbx),
          "=c" (uiEcx),
          "=d" (uiEdx));
    /* memcpy avoids aliasing the char buffer through an unsigned int pointer */
    memcpy(cpCPUid, &uiEbx, sizeof(uiEbx));
    memcpy(cpCPUid + 4, &uiEdx, sizeof(uiEdx));
    memcpy(cpCPUid + 8, &uiEcx, sizeof(uiEcx));
    cpCPUid[12] = '\0';
    return cpCPUid;
}


/*
 * Return the EDX feature flags of CPUID function 1.
 *
 * %ebx is preserved around CPUID with push/pop; only register operands are
 * used so that the temporary stack adjustment cannot affect operand
 * addressing.
 */
unsigned int dsp_x86_features ()
{
    unsigned int uiEax = 1;
    unsigned int uiFeatures = 0;

    X86_ASM (
        "pushl %%ebx\n\t"
        "cpuid\n\t"
        "popl %%ebx\n\t"
        : "+a" (uiEax),
          "=d" (uiFeatures)
        :
        : "ecx");
    return uiFeatures;
}


/*
 * Return the EDX feature flags of extended CPUID function 0x80000001.
 *
 * The caller is expected to have verified that the extended function is
 * supported (see dsp_x86_have_e3dnow()).
 */
unsigned int dsp_x86_amd_features ()
{
    unsigned int uiEax = 0x80000001u;
    unsigned int uiFeatures = 0;

    X86_ASM (
        "pushl %%ebx\n\t"
        "cpuid\n\t"
        "popl %%ebx\n\t"
        : "+a" (uiEax),
          "=d" (uiFeatures)
        :
        : "ecx");
    return uiFeatures;
}

#endif


/*
 * Return 1 when both 3DNow! (extended feature bit 31) and extended 3DNow!
 * (extended feature bit 30) are reported, otherwise 0.
 *
 * On DSP_X86_64 builds no detection is performed and 1 is returned.
 */
extern int dsp_x86_have_e3dnow ()
{
    #ifndef DSP_X86_64
    unsigned int uiMaxExt = 0x80000000u;
    unsigned int uiFeatures;

    /* Query the highest supported extended CPUID function (returned in EAX) */
    X86_ASM (
        "pushl %%ebx\n\t"
        "cpuid\n\t"
        "popl %%ebx\n\t"
        : "+a" (uiMaxExt)
        :
        : "ecx", "edx");
    /* The comparison must be unsigned: values >= 0x80000000 are negative
       when treated as signed, which would accept CPUs without extended
       functions. */
    if (uiMaxExt >= 0x80000001u)
    {
        uiFeatures = dsp_x86_amd_features();
        if ((uiFeatures & (1u << 31)) && (uiFeatures & (1u << 30)))
            return 1;
    }
    return 0;
    #else
    return 1;
    #endif
}


/*
 * Return 1 when both SSE (feature bit 25) and SSE2 (feature bit 26) are
 * reported by CPUID function 1, otherwise 0.
 *
 * On DSP_X86_64 builds no detection is performed and 1 is returned.
 */
extern int dsp_x86_have_sse2 ()
{
    #ifndef DSP_X86_64
    unsigned int uiFeatures;

    uiFeatures = dsp_x86_features();
    if ((uiFeatures & (1u << 25)) && (uiFeatures & (1u << 26)))
        return 1;
    return 0;
    #else
    return 1;
    #endif
}


// --- inline code snippets

/*
 * Prefetch helpers.
 *
 * The *_init variants issue four prefetches at element offsets 0, 8, 16 and
 * 24 of the source viewed through stpm64; the *_next variants issue a single
 * prefetch at element offset 32.  "nt" variants use prefetchnta, "t" variants
 * use prefetcht0; the "f" suffix marks the float overloads.
 * NOTE(review): the byte distance of these offsets depends on the stpm64
 * element size declared in dsp/X86.h - confirm there.
 */

inline void dsp_x86_prefetchntf_init (const float *fpSrc)
{
    stpm64 m64pSrc = (stpm64) fpSrc;

    X86_ASM (
        "prefetchnta %0\n\t"
        "prefetchnta %1\n\t"
        "prefetchnta %2\n\t"
        "prefetchnta %3\n\t"
        :
        : "m" (m64pSrc[0]),
          "m" (m64pSrc[8]),
          "m" (m64pSrc[16]),
          "m" (m64pSrc[24]));
}


inline void dsp_x86_prefetchnt_init (const double *dpSrc)
{
    stpm64 m64pSrc = (stpm64) dpSrc;

    X86_ASM (
        "prefetchnta %0\n\t"
        "prefetchnta %1\n\t"
        "prefetchnta %2\n\t"
        "prefetchnta %3\n\t"
        :
        : "m" (m64pSrc[0]),
          "m" (m64pSrc[8]),
          "m" (m64pSrc[16]),
          "m" (m64pSrc[24]));
}


inline void dsp_x86_prefetchtf_init (const float *fpSrc)
{
    stpm64 m64pSrc = (stpm64) fpSrc;

    X86_ASM (
        "prefetcht0 %0\n\t"
        "prefetcht0 %1\n\t"
        "prefetcht0 %2\n\t"
        "prefetcht0 %3\n\t"
        :
        : "m" (m64pSrc[0]),
          "m" (m64pSrc[8]),
          "m" (m64pSrc[16]),
          "m" (m64pSrc[24]));
}


inline void dsp_x86_prefetcht_init (const double *dpSrc)
{
    stpm64 m64pSrc = (stpm64) dpSrc;

    X86_ASM (
        "prefetcht0 %0\n\t"
        "prefetcht0 %1\n\t"
        "prefetcht0 %2\n\t"
        "prefetcht0 %3\n\t"
        :
        : "m" (m64pSrc[0]),
          "m" (m64pSrc[8]),
          "m" (m64pSrc[16]),
          "m" (m64pSrc[24]));
}


inline void dsp_x86_prefetchntf_next (const float *fpSrc)
{
    stpm64 m64pSrc = (stpm64) fpSrc;

    X86_ASM (
        "prefetchnta %0\n\t"
        :
        : "m" (m64pSrc[32]));
}


inline void dsp_x86_prefetchnt_next (const double *dpSrc)
{
    stpm64 m64pSrc = (stpm64) dpSrc;

    X86_ASM (
        "prefetchnta %0\n\t"
        :
        : "m" (m64pSrc[32]));
}


inline void dsp_x86_prefetchtf_next (const float *fpSrc)
{
    stpm64 m64pSrc = (stpm64) fpSrc;

    X86_ASM (
        "prefetcht0 %0\n\t"
        :
        : "m" (m64pSrc[32]));
}


inline void dsp_x86_prefetcht_next (const double *dpSrc)
{
    stpm64 m64pSrc = (stpm64) dpSrc;

    X86_ASM (
        "prefetcht0 %0\n\t"
        :
        : "m" (m64pSrc[32]));
}

// ---


/*
 * Copy iDataLength floats from fpSrc to fpDest using MMX registers and
 * non-temporal (movntq) stores.
 *
 * The data is processed in three stages:
 *   1. blocks of eight 64-bit moves (16 floats) with a prefetch 32 elements
 *      ahead of the current position,
 *   2. remaining 64-bit moves (float pairs) one at a time,
 *   3. a final single float (movd) when iDataLength is odd.
 * femms and sfence are issued once at the end.
 * NOTE(review): the index arithmetic assumes pv2sf addresses two floats per
 * element - confirm against dsp/X86.h.
 */
void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iStartIdx;
    int iDataCntr;
    int iDataCount;
    pv2sf m64pDest = (pv2sf) fpDest;
    pv2sf m64pSrc = (pv2sf) fpSrc;

    iStartIdx = 0;
    X86_ASM (
        "prefetchnta %0\n\t"
        "prefetchnta %1\n\t"
        "prefetchnta %2\n\t"
        "prefetchnta %3\n\t"
        :
        : "m" (m64pSrc[0]),
          "m" (m64pSrc[8]),
          "m" (m64pSrc[16]),
          "m" (m64pSrc[24]));
    /* Number of 64-bit elements covered by whole 16-float blocks */
    iDataCount = ((iDataLength & 0xfffffff0) >> 1);
    for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
    {
        X86_ASM (
            "prefetchnta %16\n\t"
            "movq %8, %%mm0\n\t"
            "movq %9, %%mm1\n\t"
            "movq %10, %%mm2\n\t"
            "movq %11, %%mm3\n\t"
            "movq %12, %%mm4\n\t"
            "movq %13, %%mm5\n\t"
            "movq %14, %%mm6\n\t"
            "movq %15, %%mm7\n\t"
            "movntq %%mm0, %0\n\t"
            "movntq %%mm1, %1\n\t"
            "movntq %%mm2, %2\n\t"
            "movntq %%mm3, %3\n\t"
            "movntq %%mm4, %4\n\t"
            "movntq %%mm5, %5\n\t"
            "movntq %%mm6, %6\n\t"
            "movntq %%mm7, %7\n\t"
            : "=m" (m64pDest[iDataCntr]),
              "=m" (m64pDest[iDataCntr + 1]),
              "=m" (m64pDest[iDataCntr + 2]),
              "=m" (m64pDest[iDataCntr + 3]),
              "=m" (m64pDest[iDataCntr + 4]),
              "=m" (m64pDest[iDataCntr + 5]),
              "=m" (m64pDest[iDataCntr + 6]),
              "=m" (m64pDest[iDataCntr + 7])
            : "m" (m64pSrc[iDataCntr]),
              "m" (m64pSrc[iDataCntr + 1]),
              "m" (m64pSrc[iDataCntr + 2]),
              "m" (m64pSrc[iDataCntr + 3]),
              "m" (m64pSrc[iDataCntr + 4]),
              "m" (m64pSrc[iDataCntr + 5]),
              "m" (m64pSrc[iDataCntr + 6]),
              "m" (m64pSrc[iDataCntr + 7]),
              "m" (m64pSrc[iDataCntr + 32])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    }
    /* Remaining complete float pairs */
    iStartIdx = iDataCount;
    iDataCount = ((iDataLength & 0xfffffffe) >> 1);
    for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "prefetchnta %2\n\t"
            "movq %1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc[iDataCntr]),
              "m" (m64pSrc[iDataCntr + 32])
            : "mm0", "memory");
    }
    /* Odd trailing float */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc[iDataLength - 1])
            : "mm0", "memory");
    }
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


/*
 * Copy iDataLength doubles from dpSrc to dpDest using MMX registers and
 * non-temporal (movntq) stores.
 *
 * Whole blocks of eight doubles are copied first, with a prefetch 32
 * elements ahead; the remaining doubles are copied one at a time.  femms and
 * sfence are issued once at the end.
 */
void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
{
    int iStartIdx;
    int iDataCntr;
    int iDataCount;

    iStartIdx = 0;
    X86_ASM (
        "prefetchnta %0\n\t"
        "prefetchnta %1\n\t"
        "prefetchnta %2\n\t"
        "prefetchnta %3\n\t"
        :
        : "m" (dpSrc[0]),
          "m" (dpSrc[8]),
          "m" (dpSrc[16]),
          "m" (dpSrc[24]));
    /* Number of doubles covered by whole 8-element blocks */
    iDataCount = (iDataLength & 0xfffffff8);
    for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
    {
        X86_ASM (
            "prefetchnta %16\n\t"
            "movq %8, %%mm0\n\t"
            "movq %9, %%mm1\n\t"
            "movq %10, %%mm2\n\t"
            "movq %11, %%mm3\n\t"
            "movq %12, %%mm4\n\t"
            "movq %13, %%mm5\n\t"
            "movq %14, %%mm6\n\t"
            "movq %15, %%mm7\n\t"
            "movntq %%mm0, %0\n\t"
            "movntq %%mm1, %1\n\t"
            "movntq %%mm2, %2\n\t"
            "movntq %%mm3, %3\n\t"
            "movntq %%mm4, %4\n\t"
            "movntq %%mm5, %5\n\t"
            "movntq %%mm6, %6\n\t"
            "movntq %%mm7, %7\n\t"
            : "=m" (dpDest[iDataCntr]),
              "=m" (dpDest[iDataCntr + 1]),
              "=m" (dpDest[iDataCntr + 2]),
              "=m" (dpDest[iDataCntr + 3]),
              "=m" (dpDest[iDataCntr + 4]),
              "=m" (dpDest[iDataCntr + 5]),
              "=m" (dpDest[iDataCntr + 6]),
              "=m" (dpDest[iDataCntr + 7])
            : "m" (dpSrc[iDataCntr]),
              "m" (dpSrc[iDataCntr + 1]),
              "m" (dpSrc[iDataCntr + 2]),
              "m" (dpSrc[iDataCntr + 3]),
              "m" (dpSrc[iDataCntr + 4]),
              "m" (dpSrc[iDataCntr + 5]),
              "m" (dpSrc[iDataCntr + 6]),
              "m" (dpSrc[iDataCntr + 7]),
              "m" (dpSrc[iDataCntr + 32])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    }
    /* Remaining doubles, one per iteration */
    iStartIdx = iDataCount;
    iDataCount = iDataLength;
    for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "prefetchnta %2\n\t"
            "movq %1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (dpDest[iDataCntr])
            : "m" (dpSrc[iDataCntr]),
              "m" (dpSrc[iDataCntr + 32])
            : "mm0", "memory");
    }
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


/*
 * Add the scalar fSrc to every element of fpVect in place (3DNow! pfadd).
 *
 * fSrc is duplicated into both halves of %mm1 once; the vector is then
 * processed two floats at a time, followed by a single-float step when
 * iDataLength is odd.  femms and sfence are issued once at the end.
 * NOTE(review): %mm1 is loaded in a separate asm statement and is expected
 * to survive until the following statements use it.
 */
void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    pv2sf m64pVect = (pv2sf) fpVect;
    stm64 m64Src;

    m64Src.f[0] = m64Src.f[1] = fSrc;
    iDataCount = (iDataLength >> 1);
    X86_ASM (
        "movq %0, %%mm1\n\t"
        :
        : "m" (m64Src)
        : "mm1", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t"
            "pfadd %%mm1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iDataCntr])
            : "m0" (m64pVect[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    /* Odd trailing float */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "pfadd %%mm1, %%mm0\n\t"
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "m0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


/* NOTE(review): the definition below runs past the end of the reviewed
   section; the visible part is reproduced unchanged. */
void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
{
    int iDataCntr;

    X86_ASM (
        "movss %0, %%xmm1\n\t"
        :
        : "m" (fSrc)
        : "xmm1", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        X86_ASM (
            "movss %1, %%xmm0\n\t"
            "addss %%xmm1, %%xmm0\n\t"
            "movss %%xmm0, %0\n\t"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -