📄 mmx_win.cpp
字号:
/*****************************************************************************
 *
 * Win32 version of the MMX optimized routines. All MMX optimized functions
 * have been gathered into this single source code file, regardless to their
 * class or original source code file, in order to ease porting the library
 * to other compiler and processor platforms.
 *
 * This file is to be compiled in Windows platform with Microsoft Visual C++
 * Compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all
 * GNU platforms.
 *
 * Author        : Copyright (c) Olli Parviainen
 * Author e-mail : oparviai @ iki.fi
 * File created  : 13-Jan-2002
 *
 * Last changed  : $Date: 2004/10/26 19:09:37 $
 * File revision : $Revision: 1.2 $
 *
 * $Id: mmx_win.cpp,v 1.2 2004/10/26 19:09:37 vjohnson Exp $
 *
 * License :
 *
 *  SoundTouch sound processing library
 *  Copyright (c) Olli Parviainen
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 *****************************************************************************/

#include "STTypes.h"

#ifndef WIN32
#error "wrong platform - this source code file is exclusively for Win32 platform"
#endif

using namespace soundtouch;

#ifdef ALLOW_MMX
// MMX routines available only with integer sample type

//////////////////////////////////////////////////////////////////////////////
//
// implementation of MMX optimized functions of class 'TDStretchMMX'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <limits.h>

// these are declared in 'TDStretch.cpp'
extern int scanOffsets[4][24];


// Calculates cross correlation of two buffers.
//
// Multiplies the 16-bit words of 'pV1' pairwise with the 16-bit words of
// 'pV2' (pmaddwd), arithmetically shifts every partial sum right by
// 'overlapDividerBits' bits and accumulates the results. The two 32-bit
// accumulator lanes are added together at the end and returned.
//
// Parameters:
//   pV1, pV2 - buffers to correlate. The routine runs (overlapLength / 8)
//              rounds and each round reads 32 bytes (16 shorts) from both
//              buffers, i.e. 2 * overlapLength shorts per buffer when
//              'overlapLength' is a multiple of 8. Samples past the last
//              full group of 8 are ignored because of the 'shr ecx, 3'.
//
// Returns: the accumulated, down-shifted correlation value.
//
// Precondition: 'overlapLength' must be at least 16. The loop counter is
// (overlapLength / 8 - 1) and the loop is of the 'dec / jnz' do-while kind,
// so a counter of zero (or less) wraps around instead of skipping the loop.
//
// The function leaves the FPU/MMX state dirty: no EMMS is executed here.
// 'clearCrossCorrState()' executes EMMS.
long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
{
    long corr;
    // Member values are copied into locals so that the inline assembler can
    // refer to them by plain name.
    uint local_overlapLength = overlapLength;
    uint local_overlapDividerBits = overlapDividerBits;

    _asm
    {
        ; Calculate cross-correlation between the buffers pV1 and pV2.
        ;
        ; Process 4 parallel batches of 2 * stereo samples each during one
        ; round to improve CPU-level parallelization.
        ;
        ; Indices in the comments below are in 8-byte (qword) units.
        ;
        ; load address of the pV1 buffer to eax
        ; load address of the pV2 buffer to ebx
        ; load counter to ecx = overlapLength / 8 - 1
        ; empty the mm0 (accumulator)
        ; load the shift count (overlapDividerBits) to mm5
        ;
        ; prepare to the first round by loading
        ; load mm1 = eax[0]
        ; load mm2 = eax[1]

        mov     eax, dword ptr pV1
        mov     ebx, dword ptr pV2

        movq    mm1, qword ptr [eax]
        mov     ecx, local_overlapLength

        movq    mm2, qword ptr [eax+8]
        shr     ecx, 3

        pxor    mm0, mm0
        sub     ecx, 1

        movd    mm5, local_overlapDividerBits

    loop1:
        ; multiply-add mm1 = mm1 * ebx[0]
        ; multiply-add mm2 = mm2 * ebx[1]
        ;
        ; add mm2 += mm1
        ; mm2 >>= mm5 (=overlapDividerBits)
        ; add mm0 += mm2
        ;
        ; load mm3 = eax[2]
        ; multiply-add mm3 = mm3 * ebx[2]
        ;
        ; load mm4 = eax[3]
        ; multiply-add mm4 = mm4 * ebx[3]
        ;
        ; add mm3 += mm4
        ; mm3 >>= mm5 (=overlapDividerBits)
        ; add mm0 += mm3
        ;
        ; add eax += 4 (qwords, i.e. 32 bytes)
        ; add ebx += 4 (qwords, i.e. 32 bytes)
        ; load mm1 = eax[0] (~eax[4])
        ; load mm2 = eax[1] (~eax[5])
        ;
        ; loop

        pmaddwd mm1, qword ptr [ebx]
        movq    mm3, qword ptr [eax+16]

        pmaddwd mm2, qword ptr [ebx+8]
        movq    mm4, qword ptr [eax+24]

        pmaddwd mm3, qword ptr [ebx+16]
        paddd   mm2, mm1

        pmaddwd mm4, qword ptr [ebx+24]
        movq    mm1, qword ptr [eax+32]

        psrad   mm2, mm5
        add     eax, 32

        paddd   mm3, mm4
        paddd   mm0, mm2

        movq    mm2, qword ptr [eax+8]
        psrad   mm3, mm5

        add     ebx, 32
        paddd   mm0, mm3

        dec     ecx
        jnz     loop1

        ; Finalize the last partial loop: mm1 and mm2 were already loaded
        ; at the end of the previous round, so only mm3 and mm4 are loaded
        ; here and no further pre-loading takes place.

        movq    mm3, qword ptr [eax+16]
        pmaddwd mm1, qword ptr [ebx]

        movq    mm4, qword ptr [eax+24]
        pmaddwd mm2, qword ptr [ebx+8]

        pmaddwd mm3, qword ptr [ebx+16]
        paddd   mm2, mm1

        pmaddwd mm4, qword ptr [ebx+24]
        psrad   mm2, mm5

        paddd   mm3, mm4
        paddd   mm0, mm2

        psrad   mm3, mm5
        paddd   mm0, mm3

        ; copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1
        ; and finally store the result into the variable "corr"

        movq    mm1, mm0
        psrlq   mm1, 32
        paddd   mm0, mm1
        movd    corr, mm0
    }

    return corr;
    // Note: Warning about the missing EMMS instruction is harmless
    // as it'll be called elsewhere.
}


// Executes the EMMS instruction, which empties the MMX state so that the
// FPU register stack is usable for floating point code again.
//
// Keep the closing brace on its own line: everything that follows the ';'
// on the '_asm' line is treated as an assembler comment.
void TDStretchMMX::clearCrossCorrState()
{
    _asm EMMS;
}


// MMX-optimized version of the function overlapStereo
// (the parameter list and the body continue on the following line)
void TDStretchMMX::overlapStereo(short
*output, const short *input) const{ short *local_midBuffer = pMidBuffer; uint local_overlapLength = overlapLength; uint local_overlapDividerBits = overlapDividerBits; _asm { ; load sliding mixing value counter to mm6 and mm7 ; load counter value to ecx = overlapLength / 4 ; load divider-shifter value to esi ; load mixing value adder to mm5 ; load address of midBuffer to eax ; load address of inputBuffer added with ovlOffset to ebx ; load address of end of the outputBuffer to edx mov eax, local_overlapLength ; ecx = 0x0000 OVL_ mov edi, 0x0002fffe ; ecx = 0x0002 fffe mov esi, local_overlapDividerBits movd mm6, eax ; mm6 = 0x0000 0000 0000 OVL_ mov ecx, eax; sub eax, 1 punpckldq mm6, mm6 ; mm6 = 0x0000 OVL_ 0000 OVL_ mov edx, output or eax, 0x00010000 ; eax = 0x0001 overlapLength-1 mov ebx, dword ptr input movd mm5, edi ; mm5 = 0x0000 0000 0002 fffe movd mm7, eax ; mm7 = 0x0000 0000 0001 01ff mov eax, dword ptr local_midBuffer punpckldq mm5, mm5 ; mm5 = 0x0002 fffe 0002 fffe shr ecx, 2 ; ecx = overlapLength / 2 punpckldq mm7, mm7 ; mm7 = 0x0001 01ff 0001 01ff loop1: ; Process two parallel batches of 2+2 stereo samples during each round
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -