📄 mmx_gcc.cpp
字号:
/*****************************************************************************
 *
 * gcc version of the MMX optimized routines. All MMX optimized functions
 * have been gathered into this single source code file, regardless of their
 * class or original source code file, in order to ease porting the library
 * to other compilers and processor platforms.
 *
 * This file is to be compiled on any platform with the GNU C compiler.
 * Please see 'mmx_win.cpp' for the x86 Windows version of this file.
 *
 * Author        : Copyright (c) Olli Parviainen
 * Author e-mail : oparviai @ iki.fi
 * File created  : 13-Jan-2002
 *
 * Last changed  : $Date: 2004/10/26 19:09:36 $
 * File revision : $Revision: 1.2 $
 *
 * $Id: mmx_gcc.cpp,v 1.2 2004/10/26 19:09:36 vjohnson Exp $
 *
 * Acknowledgements:
 * Adapted for gcc : Stuart Lamble <sjl @ debian.lib.monash.edu.au>
 * Adapted for gcc3: Shachar Raindel <shacharr @ users.sourceforge.net>
 *
 * License :
 *
 *  SoundTouch sound processing library
 *  Copyright (c) Olli Parviainen
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *****************************************************************************/#include <stdexcept>#include <string>#include "cpu_detect.h"#ifndef __GNUC__#error "wrong platform - this source code file is for the GNU C compiler."#endifusing namespace std;#ifdef ALLOW_MMX// MMX routines available only with integer sample type ////////////////////////////////////////////////////////////////////////////////// implementation of MMX optimized functions of class 'TDStretch'////////////////////////////////////////////////////////////////////////////////#include "TDStretch.h"#include <limits.h>// these are declared in 'TDStretch.cpp'extern int scanOffsets[4][24];// Calculates cross correlation of two buffersinline int TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const{#ifdef __i386__ int corr; uint local_overlapLength = overlapLength; uint local_overlapDividerBits = overlapDividerBits; asm volatile( // Calculate cross-correlation between the tempOffset and tmpbid_buffer. // Process 4 parallel batches of 2 * stereo samples each during one // round to improve CPU-level parallellization. 
// load address of sloped pV2 buffer to eax // load address of mixing point of the sample data buffer to ebx // load counter to ecx = overlapLength / 8 - 1 // empty the mm0 // prepare to the first round by loading // load mm1 = eax[0] // load mm2 = eax[1]; "\n\tmovl %1, %%eax" "\n\tmovl %2, %%ebx" "\n\tmovq (%%eax), %%mm1" "\n\tmovl %3, %%ecx" "\n\tmovq 8(%%eax), %%mm2" "\n\tshr $3, %%ecx" "\n\tpxor %%mm0, %%mm0" "\n\tsub $1, %%ecx" "\n\tmovd %4, %%mm5" "\n1:" // multiply-add mm1 = mm1 * ebx[0] // multiply-add mm2 = mm2 * ebx[1] // // add mm2 += mm1 // mm2 >>= mm5 (=overlapDividerBits) // add mm0 += mm2 // // load mm3 = eax[2] // multiply-add mm3 = mm3 * ebx[2] // // load mm4 = eax[3] // multiply-add mm4 = mm4 * ebx[3] // // add mm3 += mm4 // mm3 >>= mm5 (=overlapDividerBits) // add mm0 += mm3 // // add eax += 4 // add ebx += 4 // load mm1 = eax[0] (~eax[4]) // load mm2 = eax[1] (~eax[5]) // // loop "\n\tpmaddwd (%%ebx), %%mm1" // qword ptr [ebx] "\n\tmovq 16(%%eax), %%mm3" // qword ptr [eax+16] "\n\tpmaddwd 8(%%ebx), %%mm2" // qword ptr [ebx+8] "\n\tmovq 24(%%eax), %%mm4" // qword ptr [eax+24] "\n\tpmaddwd 16(%%ebx), %%mm3" // qword ptr [ebx+16] "\n\tpaddd %%mm1, %%mm2" "\n\tpmaddwd 24(%%ebx), %%mm4" // qword ptr [ebx+24] "\n\tmovq 32(%%eax), %%mm1" // qword ptr [eax+32] "\n\tpsrad %%mm5, %%mm2" "\n\tadd $32, %%eax" "\n\tpaddd %%mm4, %%mm3" "\n\tpaddd %%mm2, %%mm0" "\n\tmovq 8(%%eax), %%mm2" // qword ptr [eax+8] "\n\tpsrad %%mm5, %%mm3" "\n\tadd $32, %%ebx" "\n\tpaddd %%mm3, %%mm0" "\n\tdec %%ecx" "\n\tjnz 1b" // Finalize the last partial loop: "\n\tmovq 16(%%eax), %%mm3" // qword ptr [eax+16] "\n\tpmaddwd (%%ebx), %%mm1" // qword ptr [ebx] "\n\tmovq 24(%%eax), %%mm4" // qword ptr [eax+24] "\n\tpmaddwd 8(%%ebx), %%mm2" // qword ptr [ebx+8] "\n\tpmaddwd 16(%%ebx), %%mm3" // qword ptr [ebx+16] "\n\tpaddd %%mm1, %%mm2" "\n\tpmaddwd 24(%%ebx), %%mm4" // qword ptr [ebx+24] "\n\tpsrad %%mm5, %%mm2" "\n\tpaddd %%mm4, %%mm3" "\n\tpaddd %%mm2, %%mm0" "\n\tpsrad %%mm5, 
%%mm3" "\n\tpaddd %%mm3, %%mm0" // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 // and finally store the result into the variable "corr" "\n\tmovq %%mm0, %%mm1" "\n\tpsrlq $32, %%mm1" "\n\tpaddd %%mm1, %%mm0" "\n\tmovd %%mm0, %0" : "=rm" (corr) : "rim" (pV1), "rim" (pV2), "rim" (local_overlapLength), "rim" (local_overlapDividerBits) : "%ecx", "%ebx", "%eax" ); return corr; // Note: Warning about the missing EMMS instruction is harmless // as it'll be called elsewhere.#else throw runtime_error("MMX not supported");#endif}void TDStretchMMX::clearCrossCorrState(){#ifdef __i386__ asm volatile("EMMS");#endif}// MMX-optimized version of the function overlapStereovoid TDStretchMMX::overlapStereo(short *output, const short *input) const{#ifdef __i386__ short *local_midBuffer = pMidBuffer; uint local_overlapLength = overlapLength; uint local_overlapDividerBits = overlapDividerBits; unsigned long shadow_esi; asm volatile( "\n\t" // load sliding mixing value counter to mm6 and mm7 // load counter value to ecx = overlapLength / 4 // load divider-shifter value to esi // load mixing value adder to mm5 // load address of midBuffer to eax // load address of inputBuffer added with ovlOffset to ebx // load address of end of the outputBuffer to edx // // We need to preserve esi, since gcc uses it for the // stack frame. 
"movl %%esi, %0\n\t" "movl %1, %%eax\n\t" // ecx = 0x0000 OVL_ "movl $0x0002fffe, %%edi\n\t" // ecx = 0x0002 fffe "movl %2, %%edx\n\t" "movd %%eax, %%mm6\n\t" // mm6 = 0x0000 0000 0000 OVL_ "movl %%eax, %%ecx\n\t" "sub $1, %%eax\n\t" "punpckldq %%mm6, %%mm6\n\t" // mm6 = 0x0000 OVL_ 0000 OVL_ "or $0x00010000, %%eax\n\t" // eax = 0x0001 overlapLength-1 "movl %4, %%ebx\n\t" "movd %%edi, %%mm5\n\t" // mm5 = 0x0000 0000 0002 fffe "movd %%eax, %%mm7\n\t" // mm7 = 0x0000 0000 0001 01ff "movl %5, %%eax\n\t" // dword ptr local_midBuffer "punpckldq %%mm5, %%mm5\n\t" // mm5 = 0x0002 fffe 0002 fffe "shr $2, %%ecx\n\t" // ecx = overlapLength / 2 "punpckldq %%mm7, %%mm7\n\t" // mm7 = 0x0001 01ff 0001 01ff "push %%edx\n\t" "pop %%esi\n" "movl %3, %%edx\n\t" "2:\n\t" // Process two parallel batches of 2+2 stereo samples during each round // to improve CPU-level parallellization. // // Load [eax] into mm0 and mm1 // Load [ebx] into mm3 // unpack words of mm0, mm1 and mm3 into mm0 and mm1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -