
lpc_asm.nasm

This is the source of the well-known TCPMP player, which builds under Windows and WinCE. The author optimized its LIBMAD library for ARM MPUs and added subtitle support.
Language: NASM (page 1 of 3)

	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps	xmm0, xmm2
	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps	xmm5, xmm0
	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec	edx
	jz	.loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
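	; (net effect per iteration: advance the data pointer, splat data[sample]
	;  across xmm0/xmm1, rotate the 8-float history window xmm3:xmm2 while
	;  carrying xmm2's oldest lane into xmm3 and dropping the new sample into
	;  xmm2's low lane, then multiply-accumulate all 8 lags into xmm6:xmm5)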
	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add	eax, 4				; (0)
	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss	xmm3, xmm2			; (5)
	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss	xmm2, xmm0			; (6)
	mulps	xmm1, xmm3			; (8)
	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps	xmm6, xmm1			; (10)
	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec	edx
	jnz	.loop_start
.loop_end:
	; store autoc
	mov	edx, [esp + 16]			; edx == autoc
	movups	[edx], xmm5
	movups	[edx + 16], xmm6

.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
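	;
	;	(illustrative C sketch of the whole computation; the portable
	;	reference is FLAC__lpc_compute_autocorrelation() in lpc.c)
	;	for(sample = 0; sample < data_len; sample++)
	;		for(coeff = 0; coeff < lag; coeff++)
	;			autoc[coeff] += data[sample] *
	;				(sample >= coeff ? data[sample-coeff] : 0.0f);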
	xorps	xmm5, xmm5
	xorps	xmm6, xmm6
	xorps	xmm7, xmm7

	mov	edx, [esp + 8]			; edx == data_len
	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
.warmup:					; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
	movaps	xmm1, xmm0
	mulps	xmm1, xmm2
	addps	xmm5, xmm1
	movaps	xmm1, xmm0
	mulps	xmm1, xmm3
	addps	xmm6, xmm1
	mulps	xmm0, xmm4
	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec	edx
	jz	.loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	add	eax, 4
	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2 left by one float
	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss	xmm4, xmm3
	movss	xmm3, xmm2
	movss	xmm2, xmm0
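	; (xmm4:xmm3:xmm2 again hold the 12 most recent samples: each movss
	;  carries a register's wrapped-around lane down into the next, older
	;  register, and the last one drops data[sample] into xmm2's low lane)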

	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps	xmm1, xmm0
	mulps	xmm1, xmm2
	addps	xmm5, xmm1
	movaps	xmm1, xmm0
	mulps	xmm1, xmm3
	addps	xmm6, xmm1
	mulps	xmm0, xmm4
	addps	xmm7, xmm0

	dec	edx
	jnz	.loop_start
.loop_end:
	; store autoc
	mov	edx, [esp + 16]			; edx == autoc
	movups	[edx], xmm5
	movups	[edx + 16], xmm6
	movups	[edx + 32], xmm7

.end:
	ret

	align 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
	;[ebp + 32] autoc
	;[ebp + 28] lag
	;[ebp + 24] data_len
	;[ebp + 20] data
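	; Overview: round lag up to an even count, reserve that many 8-byte-aligned
	; float accumulators on the stack, accumulate products in a 3DNow! main pass
	; (4 samples per iteration) plus a scalar tail, then copy the accumulators
	; into autoc[] and leave MMX state with femms.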

	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp, esp

	mov	esi, [ebp + 20]			; esi = data[]
	mov	edi, [ebp + 24]			; edi = data_len
	mov	edx, [ebp + 28]			; edx = lag
	inc	edx
	and	edx, byte -2			; round lag up to an even count
	mov	eax, edx
	neg	eax
	and	esp, byte -8			; 8-byte-align the stack...
	lea	esp, [esp + 4 * eax]		; ...and reserve 4*edx bytes of float accumulators
	mov	ecx, edx
	xor	eax, eax
.loop0:						; zero the accumulators
	dec	ecx
	mov	[esp + 4 * ecx], eax
	jnz	short .loop0

	mov	eax, edi
	sub	eax, edx
	mov	ebx, edx
	and	ebx, byte 1
	sub	eax, ebx
	lea	ecx, [esi + 4 * eax - 12]
	cmp	esi, ecx
	mov	eax, esi
	ja	short .loop2_pre
	align	16		;4 nops
.loop1_i:					; main pass: 4 samples per outer iteration
	movd	mm0, [eax]
	movd	mm2, [eax + 4]
	movd	mm4, [eax + 8]
	movd	mm6, [eax + 12]
	mov	ebx, edx
	punpckldq	mm0, mm0
	punpckldq	mm2, mm2
	punpckldq	mm4, mm4
	punpckldq	mm6, mm6
	align	16		;3 nops
.loop1_j:
	sub	ebx, byte 2
	movd	mm1, [eax + 4 * ebx]
	movd	mm3, [eax + 4 * ebx + 4]
	movd	mm5, [eax + 4 * ebx + 8]
	movd	mm7, [eax + 4 * ebx + 12]
	punpckldq	mm1, mm3
	punpckldq	mm3, mm5
	pfmul	mm1, mm0
	punpckldq	mm5, mm7
	pfmul	mm3, mm2
	punpckldq	mm7, [eax + 4 * ebx + 16]
	pfmul	mm5, mm4
	pfmul	mm7, mm6
	pfadd	mm1, mm3
	movq	mm3, [esp + 4 * ebx]
	pfadd	mm5, mm7
	pfadd	mm1, mm5
	pfadd	mm3, mm1
	movq	[esp + 4 * ebx], mm3
	jg	short .loop1_j

	add	eax, byte 16
	cmp	eax, ecx
	jb	short .loop1_i

.loop2_pre:					; scalar tail for the remaining samples
	mov	ebx, eax
	sub	eax, esi
	shr	eax, 2				; eax = index of the current sample
	lea	ecx, [esi + 4 * edi]		; ecx = &data[data_len]
	mov	esi, ebx
.loop2_i:
	movd	mm0, [esi]
	mov	ebx, edi
	sub	ebx, eax
	cmp	ebx, edx
	jbe	short .loop2_j
	mov	ebx, edx
.loop2_j:
	dec	ebx
	movd	mm1, [esi + 4 * ebx]
	pfmul	mm1, mm0
	movd	mm2, [esp + 4 * ebx]
	pfadd	mm1, mm2
	movd	[esp + 4 * ebx], mm1

	jnz	short .loop2_j

	add	esi, byte 4
	inc	eax
	cmp	esi, ecx
	jnz	short .loop2_i

	mov	edi, [ebp + 32]			; edi = autoc[]
	mov	edx, [ebp + 28]			; edx = lag
.loop3:						; copy the accumulated sums into autoc[]
	dec	edx
	mov	eax, [esp + 4 * edx]
	mov	[edi + 4 * edx], eax
	jnz	short .loop3

	femms					; leave MMX/3DNow! state

	mov	esp, ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		residual[i] = data[i] - (sum >> lp_quantization);
;	}
;
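;	Worked example with illustrative numbers: order = 2, qlp_coeff[] = {3, -1},
;	lp_quantization = 1, data[i-2..i] = {4, 10, 14}:
;		sum         = 3*10 + (-1)*4 = 26
;		residual[i] = 14 - (26 >> 1) = 14 - 13 = 1
;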
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
.begin:
	cmp	eax, byte 1
	jg	short .i_1more

	mov	ecx, [esp + 28]
	mov	edx, [ecx]			; edx = qlp_coeff[0]
	mov	eax, [esi - 4]			; eax = data[-1]
	mov	cl, [esp + 36]			; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl				; eax = sum >> lp_quantization
	neg	eax
	add	eax, [esi]			; eax = data[i] - (sum >> lp_quantization)
	mov	[edi], eax			; residual[i]
	mov	eax, [esi]			; data[i] becomes data[i-1] for the next iteration
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.i_1_loop_i

	jmp	.end

.i_1more:
	cmp	eax, byte 32			; for order <= 32 there is a faster routine
	jbe	short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor	ebp, ebp
	mov	ecx, [esp + 32]
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]
	neg	ecx
	ALIGN	16
.i_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]
	imul	eax, [esi + 4 * ecx]
	add	ebp, eax
	inc	ecx
	jnz	short .i_32more_loop_j

	mov	cl, [esp + 36]
	sar	ebp, cl
	neg	ebp
	add	ebp, [esi]
	mov	[edi], ebp
	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.i_32more_loop_i

	jmp	.end

.i_32:
	sub	edi, esi
	neg	eax
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = .jumper_0 - .get_eip0 - 9*order
	call	.get_eip0
.get_eip0:
	pop	eax				; eax = address of .get_eip0 (position-independent EIP read)
	add	edx, eax			; edx = .jumper_0 - 9*order
	inc	edx				; +1: the last group below is one byte shorter than the rest
	mov	eax, [esp + 28]			; eax = qlp_coeff[]
	xor	ebp, ebp
	jmp	edx
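	; Each (mov / imul / add) group below assembles to 9 bytes, except the
	; last, whose [eax] operand needs no displacement byte (8 bytes); jumping
	; to .jumper_0 - 9*order + 1 therefore executes exactly `order`
	; multiply-accumulate groups, highest coefficient index first.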

	mov	ecx, [eax + 124]
	imul	ecx, [esi - 128]
	add	ebp, ecx
	mov	ecx, [eax + 120]
	imul	ecx, [esi - 124]
	add	ebp, ecx
	mov	ecx, [eax + 116]
	imul	ecx, [esi - 120]
	add	ebp, ecx
	mov	ecx, [eax + 112]
	imul	ecx, [esi - 116]
	add	ebp, ecx
	mov	ecx, [eax + 108]
	imul	ecx, [esi - 112]
	add	ebp, ecx
	mov	ecx, [eax + 104]
	imul	ecx, [esi - 108]
	add	ebp, ecx
	mov	ecx, [eax + 100]
	imul	ecx, [esi - 104]
	add	ebp, ecx
	mov	ecx, [eax + 96]
	imul	ecx, [esi - 100]
	add	ebp, ecx
	mov	ecx, [eax + 92]
	imul	ecx, [esi - 96]
	add	ebp, ecx
	mov	ecx, [eax + 88]
	imul	ecx, [esi - 92]
	add	ebp, ecx
	mov	ecx, [eax + 84]
	imul	ecx, [esi - 88]
	add	ebp, ecx
	mov	ecx, [eax + 80]
	imul	ecx, [esi - 84]
	add	ebp, ecx
	mov	ecx, [eax + 76]
	imul	ecx, [esi - 80]
	add	ebp, ecx
	mov	ecx, [eax + 72]
	imul	ecx, [esi - 76]
	add	ebp, ecx
	mov	ecx, [eax + 68]
	imul	ecx, [esi - 72]
	add	ebp, ecx
	mov	ecx, [eax + 64]
	imul	ecx, [esi - 68]
	add	ebp, ecx
	mov	ecx, [eax + 60]
	imul	ecx, [esi - 64]
	add	ebp, ecx
	mov	ecx, [eax + 56]
	imul	ecx, [esi - 60]
	add	ebp, ecx
	mov	ecx, [eax + 52]
	imul	ecx, [esi - 56]
	add	ebp, ecx
	mov	ecx, [eax + 48]
	imul	ecx, [esi - 52]
	add	ebp, ecx
	mov	ecx, [eax + 44]
	imul	ecx, [esi - 48]
	add	ebp, ecx
	mov	ecx, [eax + 40]
	imul	ecx, [esi - 44]
	add	ebp, ecx
	mov	ecx, [eax + 36]
	imul	ecx, [esi - 40]
	add	ebp, ecx
	mov	ecx, [eax + 32]
	imul	ecx, [esi - 36]
	add	ebp, ecx
	mov	ecx, [eax + 28]
	imul	ecx, [esi - 32]
	add	ebp, ecx
	mov	ecx, [eax + 24]
	imul	ecx, [esi - 28]
	add	ebp, ecx
	mov	ecx, [eax + 20]
	imul	ecx, [esi - 24]
	add	ebp, ecx
	mov	ecx, [eax + 16]
	imul	ecx, [esi - 20]
	add	ebp, ecx
	mov	ecx, [eax + 12]
	imul	ecx, [esi - 16]
	add	ebp, ecx
	mov	ecx, [eax + 8]
	imul	ecx, [esi - 12]
	add	ebp, ecx
	mov	ecx, [eax + 4]
	imul	ecx, [esi - 8]
	add	ebp, ecx
	mov	ecx, [eax]			; no displacement byte here, so this last group is 8 bytes, not 9
	imul	ecx, [esi - 4]
	add	ebp, ecx
.jumper_0:

	mov	cl, [esp + 36]			; cl = lp_quantization
	sar	ebp, cl				; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi + esi], ebp		; residual[i]  (edi holds residual - data)
	add	esi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16.  Especially note that this routine cannot be used
; for side-channel coded 16bps channels, since the side channel is the
; difference of two 16-bit channels and therefore has an effective bps of 17.
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	dec	ebx
	test	ebx, ebx
	jz	near .last_one

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
	mov	ebp, esp			; save esp so it can be restored before returning

	and	esp, 0xfffffff8			; 8-byte-align the stack for the coefficient copy

	xor	ecx, ecx
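	; build a 16-bit copy of qlp_coeff[] on the aligned stack: each push
	; stores the low word of one 32-bit coefficient, and pushing index 0
	; first leaves the copy reversed in memory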
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	and	ecx, 0x3			; ecx = order mod 4
	test	ecx, ecx
	je	short .za_end			; skip the zero-padding if order is already a multiple of 4
