⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lpc_asm.nasm

📁 这是著名的TCPMP播放器在WINDOWS和WINCE下编译通过的源程序。笔者对其中的LIBMAD库做了针对ARM MPU的优化，并增加了字幕功能。
💻 NASM
📖 第 1 页 / 共 3 页
字号:
;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2001,2002,2003,2004,2005  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "nasm.h"

	; data_section, code_section and cglobal are macros that are not defined
	; in this file; presumably they come from nasm.h (included above) and
	; select the output section / declare a symbol as global -- confirm there.
	data_section

; Entry points declared by this file, one cglobal per routine.
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx

	; Everything below is emitted into the code section.
	code_section

; **********************************************************************
;
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
; {
;	FLAC__real d;
;	unsigned sample, coeff;
;	const unsigned limit = data_len - lag;
;
;	FLAC__ASSERT(lag > 0);
;	FLAC__ASSERT(lag <= data_len);
;
;	for(coeff = 0; coeff < lag; coeff++)
;		autoc[coeff] = 0.0;
;	for(sample = 0; sample <= limit; sample++) {
;		d = data[sample];
;		for(coeff = 0; coeff < lag; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
;	for(; sample < data_len; sample++) {
;		d = data[sample];
;		for(coeff = 0; coeff < data_len - sample; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
; }
;
	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32
	; x87 implementation of the C reference shown in the comment block above:
	; autoc[0..lag-1] is zeroed, then for every sample d = data[sample] the
	; products d * data[sample+coeff] are accumulated into autoc[coeff].
	;
	; Arguments are read from the stack.  The offsets listed here are the
	; ones valid AFTER the three pushes in .begin (3 * 4 bytes of saved
	; registers + 4 bytes of return address):
	;[esp + 28] == autoc[]
	;[esp + 24] == lag
	;[esp + 20] == data_len
	;[esp + 16] == data[]
	;
	; Register roles inside the routine:
	;   esi = &data[sample]          edi = autoc
	;   ecx = outer-loop iterations still to run
	;   edx = entry address inside the unrolled inner loop
	;   eax = lag (first loop) / lag - 1 (second loop)
	;   ebx = run-time address of .get_eipN (obtained with call/pop)
	; esi, edi and ebx are saved and restored; eax, ecx, edx and the flags are
	; overwritten.  Every fld is matched by an fstp, so the x87 stack depth on
	; return equals the depth on entry.  The routine ends with a plain `ret`,
	; i.e. it does not remove its arguments from the stack.
	;
	; The inner "for coeff" loop is fully unrolled (coeff = 32 down to 0 in the
	; first loop, 31 down to 0 in the second).  Instead of counting, the code
	; jumps into the middle of the unrolled sequence so that exactly the
	; required number of entries run before falling through to .jumperN_0.
	; This depends on the exact encoded size of every entry -- do not change
	; the instructions or their displacement sizes without redoing the math.

	;ASSERT(lag > 0)
	;ASSERT(lag <= 33)
	;ASSERT(lag <= data_len)

.begin:
	push	esi
	push	edi
	push	ebx

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	; NOTE(review): rep stosd walks upward only if the direction flag is
	; clear; this code assumes it is on entry -- confirm against the ABI.
	mov	edi, [esp + 28]			; edi == autoc
	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
	xor	eax, eax
	rep	stosd

	;	const unsigned limit = data_len - lag;
	mov	eax, [esp + 24]			; eax == lag
	mov	ecx, [esp + 20]
	sub	ecx, eax			; ecx == limit

	mov	edi, [esp + 28]			; edi == autoc
	mov	esi, [esp + 16]			; esi == data
	inc	ecx				; we are looping <= limit so we add one to the counter

	;	for(sample = 0; sample <= limit; sample++) {
	;		d = data[sample];
	;		for(coeff = 0; coeff < lag; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	fld	dword [esi]			; ST = d <- data[sample]
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	; edx = (.jumper1_0 - .get_eip1) - 11*lag; adding the run-time address of
	; .get_eip1 (popped into ebx) turns it into an absolute jump target
	; without needing a relocation.
	lea	edx, [eax + eax*2]
	neg	edx
	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
	call	.get_eip1
.get_eip1:
	pop	ebx
	add	edx, ebx
	; the coeff == 0 entry has no displacement byte on three of its
	; instructions, so it is 3 bytes shorter than the others: add 3.
	inc	edx				; compensate for the shorter opcode on the last iteration
	inc	edx				; compensate for the shorter opcode on the last iteration
	inc	edx				; compensate for the shorter opcode on the last iteration
	; only lag == 33 enters at the coeff == 32 entry, whose displacement
	; (32*4 = 128) does not fit in a signed byte on three instructions.
	cmp	eax, 33
	jne	.loop1_start
	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
.loop1_start:
	jmp	edx

	fld	st0				; ST = d d
	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
	fld	st0				; ST = d d
	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+5] d
	fadd	dword [edi + ( 5*4)]		; ST = autoc[5]+d*data[sample+5] d
	fstp	dword [edi + ( 5*4)]		; autoc[5]+=d*data[sample+5]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
.jumper1_0:

	; end of one outer iteration: edx is left unchanged, so every sample in
	; this first loop runs the same `lag` unrolled entries.
	fstp	st0				; pop d, ST = empty
	add	esi, byte 4			; sample++
	dec	ecx
	jz	.loop1_end
	fld	dword [esi]			; ST = d <- data[sample]
	jmp	edx
.loop1_end:

	;	for(; sample < data_len; sample++) {
	;		d = data[sample];
	;		for(coeff = 0; coeff < data_len - sample; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	; tail of the signal: lag - 1 samples remain, and each one has one fewer
	; valid coefficient than the previous one.
	mov	ecx, [esp + 24]			; ecx <- lag
	dec	ecx				; ecx <- lag - 1
	jz	near .end			; skip loop if 0 (i.e. lag == 1)

	fld	dword [esi]			; ST = d <- data[sample]
	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	; same position-independent target computation as in the first loop,
	; this time relative to .jumper2_0 / .get_eip2.
	lea	edx, [eax + eax*2]
	neg	edx
	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
	call	.get_eip2
.get_eip2:
	pop	ebx
	add	edx, ebx
	inc	edx				; compensate for the shorter opcode on the last iteration
	inc	edx				; compensate for the shorter opcode on the last iteration
	inc	edx				; compensate for the shorter opcode on the last iteration
	; no lag == 33 correction here: this unrolled copy starts at coeff == 31,
	; so every displacement it uses fits in one byte.
	jmp	edx

	fld	st0				; ST = d d
	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+5] d
	fadd	dword [edi + ( 5*4)]		; ST = autoc[5]+d*data[sample+5] d
	fstp	dword [edi + ( 5*4)]		; autoc[5]+=d*data[sample+5]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
	fld	st0				; ST = d d
	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
.jumper2_0:

	; end of one outer iteration: moving the entry point 11 bytes further in
	; skips one more unrolled entry, i.e. one fewer coefficient next time.
	fstp	st0				; pop d, ST = empty
	add	esi, byte 4			; sample++
	dec	ecx
	jz	.loop2_end
	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
	fld	dword [esi]			; ST = d <- data[sample]
	jmp	edx
.loop2_end:

.end:
	; restore the callee-saved registers in reverse push order
	pop	ebx
	pop	edi
	pop	esi
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
	; SSE autocorrelation for at most four lags.
	;
	; Stack arguments (nothing is pushed, so offsets are relative to the
	; return address at [esp]):
	;   [esp +  4] == data[]
	;   [esp +  8] == data_len
	;   [esp + 12] == lag		(never read by this routine)
	;   [esp + 16] == autoc[]
	;
	; Preconditions, taken over from the caller's contract:
	;   ASSERT(lag > 0)
	;   ASSERT(lag <= 4)
	;   ASSERT(lag <= data_len)
	;
	; Working registers:
	;   eax  = address of the next sample to read
	;   edx  = samples still to process (reused as the autoc pointer at the end)
	;   xmm0 = current sample broadcast to all four lanes
	;   xmm2 = history window: data[sample-3],data[sample-2],data[sample-1],data[sample]
	;   xmm5 = the four running sums; lane k accumulates data[sample]*data[sample-k]
	;
	; Four floats are always written to autoc[] with an unaligned store,
	; whatever the value of lag.

	mov	eax, [esp + 4]			; eax = &data[0]
	mov	edx, [esp + 8]			; edx = data_len
	xorps	xmm5, xmm5			; autoc accumulators = 0,0,0,0

	; first sample: the history window holds only data[0], upper lanes are 0
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm2, xmm0			; window = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; broadcast data[0] to every lane
	mulps	xmm0, xmm2			; per-lane product with the window
	addps	xmm5, xmm0			; accumulate
	dec	edx
	jz	.store_autoc			; data_len == 1: nothing more to read
	ALIGN 16
.next_sample:
	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => rotate window left by one float
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	shufps	xmm0, xmm0, 0			; broadcast data[sample] to every lane
	movss	xmm2, xmm0			; newest sample replaces the low lane of the window
	mulps	xmm0, xmm2			; per-lane product with the window
	addps	xmm5, xmm0			; accumulate
	add	eax, 4				; advance to the following sample
	dec	edx
	jnz	.next_sample
.store_autoc:
	mov	edx, [esp + 16]			; edx = autoc
	movups	[edx], xmm5			; autoc[0..3] = accumulated sums
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 8)
	;ASSERT(lag <= data_len)

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	xorps	xmm5, xmm5
	xorps	xmm6, xmm6

	mov	edx, [esp + 8]			; edx == data_len
	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -