⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mmx_gcc.cpp

📁 Audacity是一款用於錄音和編輯聲音的、免費的開放源碼軟體。它可以執行於Mac OS X、Microsoft Windows、GNU/Linux和其它作業系統
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/***************************************************************************** * * gcc version of the MMX optimized routines. All MMX optimized functions * have been gathered into this single source code file, regardless to their  * class or original source code file, in order to ease porting the library * to other compiler and processor platforms. * * This file is to be compiled on any platform with the GNU C compiler. * Compiler. Please see 'mmx_win.cpp' for the x86 Windows version of this * file. * * Author          : Copyright (c) Olli Parviainen * Author e-mail   : oparviai @ iki.fi * File created    : 13-Jan-2002 * * Last changed  : $Date: 2004/10/26 19:09:36 $ * File revision : $Revision: 1.2 $ * * $Id: mmx_gcc.cpp,v 1.2 2004/10/26 19:09:36 vjohnson Exp $ * * Acknowledgements: * Adopted for gcc : Stuart Lamble <sjl @ debian.lib.monash.edu.au> * Adopted for gcc3: Shachar Raindel <shacharr @ users.sourceforge.net> * * License : *  *  SoundTouch sound processing library *  Copyright (c) Olli Parviainen * *  This library is free software; you can redistribute it and/or *  modify it under the terms of the GNU Lesser General Public *  License as published by the Free Software Foundation; either *  version 2.1 of the License, or (at your option) any later version. * *  This library is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU *  Lesser General Public License for more details. * *  You should have received a copy of the GNU Lesser General Public *  License along with this library; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * *****************************************************************************/#include <stdexcept>#include <string>#include "cpu_detect.h"#ifndef __GNUC__#error "wrong platform - this source code file is for the GNU C compiler."#endifusing namespace std;#ifdef ALLOW_MMX// MMX routines available only with integer sample type    ////////////////////////////////////////////////////////////////////////////////// implementation of MMX optimized functions of class 'TDStretch'////////////////////////////////////////////////////////////////////////////////#include "TDStretch.h"#include <limits.h>// these are declared in 'TDStretch.cpp'extern int scanOffsets[4][24];// Calculates cross correlation of two buffersinline int TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const{#ifdef __i386__    int corr;    uint local_overlapLength = overlapLength;    uint local_overlapDividerBits = overlapDividerBits;    asm volatile(        // Calculate cross-correlation between the tempOffset and tmpbid_buffer.        // Process 4 parallel batches of 2 * stereo samples each during one        // round to improve CPU-level parallellization.        // load address of sloped pV2 buffer to eax        // load address of mixing point of the sample data buffer to ebx        // load counter to ecx = overlapLength / 8 - 1        // empty the mm0        // prepare to the first round by loading         // load mm1 = eax[0]        // load mm2 = eax[1];        "\n\tmovl        %1, %%eax"        "\n\tmovl        %2, %%ebx"        "\n\tmovq        (%%eax), %%mm1"        "\n\tmovl        %3, %%ecx"        "\n\tmovq        8(%%eax), %%mm2"        "\n\tshr         $3, %%ecx"        "\n\tpxor        %%mm0, %%mm0"        "\n\tsub         $1, %%ecx"        "\n\tmovd        %4, %%mm5"        "\n1:"        // multiply-add mm1 = mm1 * ebx[0]        // multiply-add mm2 = mm2 * ebx[1]        //        // add mm2 += mm1        // mm2 >>= mm5 (=overlapDividerBits)        // add mm0 += mm2        //        // load mm3 = eax[2]        // multiply-add mm3 = mm3 * ebx[2]        //        // load mm4 = eax[3]        // multiply-add mm4 = mm4 * ebx[3]        //        // add mm3 += mm4        // mm3 >>= mm5 (=overlapDividerBits)        // add mm0 += mm3        //        // add eax += 4        // add ebx += 4        // load mm1 = eax[0] (~eax[4])        // load mm2 = eax[1] (~eax[5])        //        // loop        "\n\tpmaddwd     (%%ebx), %%mm1"   // qword ptr [ebx]        "\n\tmovq        16(%%eax), %%mm3" // qword ptr [eax+16]        "\n\tpmaddwd     8(%%ebx), %%mm2"  // qword ptr [ebx+8]        "\n\tmovq        24(%%eax), %%mm4" // qword ptr [eax+24]        "\n\tpmaddwd     16(%%ebx), %%mm3" // qword ptr [ebx+16]        "\n\tpaddd       %%mm1, %%mm2"        "\n\tpmaddwd     24(%%ebx), %%mm4" // qword ptr [ebx+24]        "\n\tmovq        32(%%eax), %%mm1" // qword ptr [eax+32]        "\n\tpsrad       %%mm5, %%mm2"        "\n\tadd         $32, %%eax"        "\n\tpaddd       %%mm4, %%mm3"        "\n\tpaddd       %%mm2, %%mm0"        "\n\tmovq        8(%%eax), %%mm2"  // qword ptr [eax+8]        "\n\tpsrad       %%mm5, %%mm3"        "\n\tadd         $32, %%ebx"        "\n\tpaddd       %%mm3, %%mm0"        "\n\tdec         %%ecx"        "\n\tjnz         1b"        // Finalize the last partial loop:        "\n\tmovq        16(%%eax), %%mm3" // qword ptr [eax+16]        "\n\tpmaddwd     (%%ebx), %%mm1"   // qword ptr [ebx]        "\n\tmovq        24(%%eax), %%mm4" // qword ptr [eax+24]        "\n\tpmaddwd     8(%%ebx), %%mm2"  // qword ptr [ebx+8]        "\n\tpmaddwd     16(%%ebx), %%mm3" // qword ptr [ebx+16]        "\n\tpaddd       %%mm1, %%mm2"        "\n\tpmaddwd     24(%%ebx), %%mm4" // qword ptr [ebx+24]        "\n\tpsrad       %%mm5, %%mm2"        "\n\tpaddd       %%mm4, %%mm3"        "\n\tpaddd       %%mm2, %%mm0"        "\n\tpsrad       %%mm5, %%mm3"        "\n\tpaddd       %%mm3, %%mm0"        // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1        // and finally store the result into the variable "corr"        "\n\tmovq        %%mm0, %%mm1"        "\n\tpsrlq       $32, %%mm1"        "\n\tpaddd       %%mm1, %%mm0"        "\n\tmovd        %%mm0, %0"      : "=rm" (corr)      : "rim" (pV1), "rim" (pV2), "rim" (local_overlapLength),        "rim" (local_overlapDividerBits)      : "%ecx", "%ebx", "%eax"    );    return corr;        // Note: Warning about the missing EMMS instruction is harmless    // as it'll be called elsewhere.#else    throw runtime_error("MMX not supported");#endif}void TDStretchMMX::clearCrossCorrState(){#ifdef __i386__    asm volatile("EMMS");#endif}// MMX-optimized version of the function overlapStereovoid TDStretchMMX::overlapStereo(short *output, const short *input) const{#ifdef __i386__    short *local_midBuffer = pMidBuffer;    uint local_overlapLength = overlapLength;    uint local_overlapDividerBits = overlapDividerBits;    unsigned long shadow_esi;    asm volatile(        "\n\t"        // load sliding mixing value counter to mm6 and mm7        // load counter value to ecx = overlapLength / 4        // load divider-shifter value to esi        // load mixing value adder to mm5        // load address of midBuffer to eax        // load address of inputBuffer added with ovlOffset to ebx        // load address of end of the outputBuffer to edx        //        // We need to preserve esi, since gcc uses it for the        // stack frame.        "movl        %%esi, %0\n\t"        "movl        %1, %%eax\n\t"               // ecx = 0x0000 OVL_        "movl        $0x0002fffe, %%edi\n\t"      // ecx = 0x0002 fffe        "movl        %2, %%edx\n\t"        "movd        %%eax, %%mm6\n\t"            // mm6 = 0x0000 0000 0000 OVL_        "movl        %%eax, %%ecx\n\t"        "sub         $1, %%eax\n\t"        "punpckldq   %%mm6, %%mm6\n\t"            // mm6 = 0x0000 OVL_ 0000 OVL_        "or          $0x00010000, %%eax\n\t"      // eax = 0x0001 overlapLength-1        "movl        %4, %%ebx\n\t"        "movd        %%edi, %%mm5\n\t"            // mm5 = 0x0000 0000 0002 fffe        "movd        %%eax, %%mm7\n\t"            // mm7 = 0x0000 0000 0001 01ff        "movl        %5, %%eax\n\t"               // dword ptr local_midBuffer        "punpckldq   %%mm5, %%mm5\n\t"            // mm5 = 0x0002 fffe 0002 fffe        "shr         $2, %%ecx\n\t"               // ecx = overlapLength / 2        "punpckldq   %%mm7, %%mm7\n\t"            // mm7 = 0x0001 01ff 0001 01ff        "push        %%edx\n\t"        "pop         %%esi\n"        "movl        %3, %%edx\n\t"        "2:\n\t"        // Process two parallel batches of 2+2 stereo samples during each round         // to improve CPU-level parallellization.        //        // Load [eax] into mm0 and mm1        // Load [ebx] into mm3        // unpack words of mm0, mm1 and mm3 into mm0 and mm1

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -