; lpc_asm.nasm — FLAC ia32 LPC assembly routines (NASM syntax)
; (code-viewer page banner and font-size control removed; filename preserved above)
	; --- tail of the preceding SSE autocorrelation routine (header is above this
	; --- view); instruction stream preserved verbatim, line structure restored.
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps xmm1, xmm0		; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
.warmup:				; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps xmm0, xmm2
	mulps xmm1, xmm3		; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm5, xmm0
	addps xmm6, xmm1		; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
	shufps xmm2, xmm2, 93h		; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add eax, 4			; (0)
	shufps xmm3, xmm3, 93h		; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm0, xmm0, 0		; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss xmm3, xmm2		; (5)
	movaps xmm1, xmm0		; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss xmm2, xmm0		; (6)
	mulps xmm1, xmm3		; (8)
	mulps xmm0, xmm2		; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm6, xmm1		; (10)
	addps xmm5, xmm0		; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
.end:
	ret

	;-----------------------------------------------------------------------
	; Autocorrelation, SSE, lag <= 12: accumulates 12 lags in xmm5/xmm6/xmm7
	; while xmm4:xmm3:xmm2 hold a sliding window of the last 12 samples.
	;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]
	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	xorps xmm5, xmm5
	xorps xmm6, xmm6
	xorps xmm7, xmm7

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
	xorps xmm4, xmm4		; xmm4 = 0,0,0,0
.warmup:				; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0		; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]
	; shift xmm4:xmm3:xmm2 left by one float
	shufps xmm2, xmm2, 93h		; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps xmm3, xmm3, 93h		; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm4, xmm4, 93h		; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss xmm4, xmm3
	movss xmm3, xmm2
	movss xmm2, xmm0
	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
	movups [edx + 32], xmm7
.end:
	ret

	;-----------------------------------------------------------------------
	; Autocorrelation, 3DNow! (pfmul/pfadd, femms): accumulates lag results
	; in a scratch array on the (8-byte-aligned) stack, two floats at a time,
	; then copies them to autoc[].  Uses ebp as a frame save of esp.
	;-----------------------------------------------------------------------
	align 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
	;[ebp + 32] autoc
	;[ebp + 28] lag
	;[ebp + 24] data_len
	;[ebp + 20] data

	push ebp
	push ebx
	push esi
	push edi

	mov ebp, esp

	mov esi, [ebp + 20]
	mov edi, [ebp + 24]
	mov edx, [ebp + 28]
	inc edx				; round lag up to an even count of scratch slots
	and edx, byte -2
	mov eax, edx
	neg eax
	and esp, byte -8		; 8-byte-align esp for movq stores
	lea esp, [esp + 4 * eax]	; reserve edx floats of scratch on the stack
	mov ecx, edx
	xor eax, eax
.loop0:					; zero the scratch accumulators
	dec ecx
	mov [esp + 4 * ecx], eax
	jnz short .loop0

	mov eax, edi
	sub eax, edx
	mov ebx, edx
	and ebx, byte 1
	sub eax, ebx
	lea ecx, [esi + 4 * eax - 12]	; ecx = end pointer for the unrolled main loop
	cmp esi, ecx
	mov eax, esi
	ja short .loop2_pre

	align 16			;4 nops
.loop1_i:				; main loop: 4 samples per iteration
	movd mm0, [eax]
	movd mm2, [eax + 4]
	movd mm4, [eax + 8]
	movd mm6, [eax + 12]
	mov ebx, edx
	punpckldq mm0, mm0
	punpckldq mm2, mm2
	punpckldq mm4, mm4
	punpckldq mm6, mm6
	align 16			;3 nops
.loop1_j:				; inner loop: 2 lags per iteration
	sub ebx, byte 2
	movd mm1, [eax + 4 * ebx]
	movd mm3, [eax + 4 * ebx + 4]
	movd mm5, [eax + 4 * ebx + 8]
	movd mm7, [eax + 4 * ebx + 12]
	punpckldq mm1, mm3
	punpckldq mm3, mm5
	pfmul mm1, mm0
	punpckldq mm5, mm7
	pfmul mm3, mm2
	punpckldq mm7, [eax + 4 * ebx + 16]
	pfmul mm5, mm4
	pfmul mm7, mm6
	pfadd mm1, mm3
	movq mm3, [esp + 4 * ebx]
	pfadd mm5, mm7
	pfadd mm1, mm5
	pfadd mm3, mm1
	movq [esp + 4 * ebx], mm3
	jg short .loop1_j

	add eax, byte 16
	cmp eax, ecx
	jb short .loop1_i

.loop2_pre:				; tail loop: remaining samples, one at a time
	mov ebx, eax
	sub eax, esi
	shr eax, 2			; eax = index of first unprocessed sample
	lea ecx, [esi + 4 * edi]	; ecx = &data[data_len]
	mov esi, ebx
.loop2_i:
	movd mm0, [esi]
	mov ebx, edi
	sub ebx, eax
	cmp ebx, edx
	jbe short .loop2_j		; clamp lag count near the end of the buffer
	mov ebx, edx
.loop2_j:
	dec ebx
	movd mm1, [esi + 4 * ebx]
	pfmul mm1, mm0
	movd mm2, [esp + 4 * ebx]
	pfadd mm1, mm2
	movd [esp + 4 * ebx], mm1
	jnz short .loop2_j

	add esi, byte 4
	inc eax
	cmp esi, ecx
	jnz short .loop2_i

	mov edi, [ebp + 32]
	mov edx, [ebp + 28]
.loop3:					; copy scratch accumulators out to autoc[]
	dec edx
	mov eax, [esp + 4 * edx]
	mov [edi + 4 * edx], eax
	jnz short .loop3

	femms				; exit MMX/3DNow! state before returning to FP code

	mov esp, ebp
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
;
; for(i = 0; i < data_len; i++) {
;	sum = 0;
;	for(j = 0; j < order; j++)
;		sum += qlp_coeff[j] * data[i-j-1];
;	residual[i] = data[i] - (sum >> lp_quantization);
; }
;
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]
	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]		; esi = data[]
	mov edi, [esp + 40]		; edi = residual[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len

	test ebx, ebx
	jz near .end			; do nothing if data_len == 0
.begin:
	cmp eax, byte 1
	jg short .i_1more

	; ---- special case: order == 1 ----
	mov ecx, [esp + 28]
	mov edx, [ecx]			; edx = qlp_coeff[0]
	mov eax, [esi - 4]		; eax = data[-1]
	mov cl, [esp + 36]		; cl = lp_quantization
	ALIGN 16
.i_1_loop_i:
	imul eax, edx
	sar eax, cl
	neg eax
	add eax, [esi]
	mov [edi], eax
	mov eax, [esi]
	add edi, byte 4
	add esi, byte 4
	dec ebx
	jnz .i_1_loop_i

	jmp .end

.i_1more:
	cmp eax, byte 32		; for order <= 32 there is a faster routine
	jbe short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor ebp, ebp
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]
	neg ecx
	ALIGN 16
.i_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [esi + 4 * ecx]
	add ebp, eax
	inc ecx
	jnz short .i_32more_loop_j

	mov cl, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]
	mov [edi], ebp
	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .i_32more_loop_i

	jmp .end

.i_32:
	; ---- order 2..32: computed jump into a fully unrolled dot product ----
	; each unrolled step below is 9 bytes, so the entry point is
	; .jumper_0 - 9*order (the 'inc edx' compensates for the last, 8-byte step)
	sub edi, esi
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call .get_eip0
.get_eip0:
	pop eax				; eax = EIP of .get_eip0 (position-independent)
	add edx, eax
	inc edx				; compensate for the one-byte-shorter last step
	mov eax, [esp + 28]		; eax = qlp_coeff[]
	xor ebp, ebp
	jmp edx

	mov ecx, [eax + 124]
	imul ecx, [esi - 128]
	add ebp, ecx
	mov ecx, [eax + 120]
	imul ecx, [esi - 124]
	add ebp, ecx
	mov ecx, [eax + 116]
	imul ecx, [esi - 120]
	add ebp, ecx
	mov ecx, [eax + 112]
	imul ecx, [esi - 116]
	add ebp, ecx
	mov ecx, [eax + 108]
	imul ecx, [esi - 112]
	add ebp, ecx
	mov ecx, [eax + 104]
	imul ecx, [esi - 108]
	add ebp, ecx
	mov ecx, [eax + 100]
	imul ecx, [esi - 104]
	add ebp, ecx
	mov ecx, [eax + 96]
	imul ecx, [esi - 100]
	add ebp, ecx
	mov ecx, [eax + 92]
	imul ecx, [esi - 96]
	add ebp, ecx
	mov ecx, [eax + 88]
	imul ecx, [esi - 92]
	add ebp, ecx
	mov ecx, [eax + 84]
	imul ecx, [esi - 88]
	add ebp, ecx
	mov ecx, [eax + 80]
	imul ecx, [esi - 84]
	add ebp, ecx
	mov ecx, [eax + 76]
	imul ecx, [esi - 80]
	add ebp, ecx
	mov ecx, [eax + 72]
	imul ecx, [esi - 76]
	add ebp, ecx
	mov ecx, [eax + 68]
	imul ecx, [esi - 72]
	add ebp, ecx
	mov ecx, [eax + 64]
	imul ecx, [esi - 68]
	add ebp, ecx
	mov ecx, [eax + 60]
	imul ecx, [esi - 64]
	add ebp, ecx
	mov ecx, [eax + 56]
	imul ecx, [esi - 60]
	add ebp, ecx
	mov ecx, [eax + 52]
	imul ecx, [esi - 56]
	add ebp, ecx
	mov ecx, [eax + 48]
	imul ecx, [esi - 52]
	add ebp, ecx
	mov ecx, [eax + 44]
	imul ecx, [esi - 48]
	add ebp, ecx
	mov ecx, [eax + 40]
	imul ecx, [esi - 44]
	add ebp, ecx
	mov ecx, [eax + 36]
	imul ecx, [esi - 40]
	add ebp, ecx
	mov ecx, [eax + 32]
	imul ecx, [esi - 36]
	add ebp, ecx
	mov ecx, [eax + 28]
	imul ecx, [esi - 32]
	add ebp, ecx
	mov ecx, [eax + 24]
	imul ecx, [esi - 28]
	add ebp, ecx
	mov ecx, [eax + 20]
	imul ecx, [esi - 24]
	add ebp, ecx
	mov ecx, [eax + 16]
	imul ecx, [esi - 20]
	add ebp, ecx
	mov ecx, [eax + 12]
	imul ecx, [esi - 16]
	add ebp, ecx
	mov ecx, [eax + 8]
	imul ecx, [esi - 12]
	add ebp, ecx
	mov ecx, [eax + 4]
	imul ecx, [esi - 8]
	add ebp, ecx
	mov ecx, [eax]			; there is one byte missing
	imul ecx, [esi - 4]
	add ebp, ecx
.jumper_0:

	mov cl, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]
	mov [edi + esi], ebp		; edi holds residual-data, so this is residual[i]
	add esi, byte 4
	dec ebx
	jz short .end
	xor ebp, ebp
	jmp edx
.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16. Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
	; --- head of the MMX residual routine; the remainder is beyond this view,
	; --- so the visible instruction stream is preserved verbatim.
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]
	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]		; esi = data[]
	mov edi, [esp + 40]		; edi = residual[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len

	test ebx, ebx
	jz near .end			; do nothing if data_len == 0
	dec ebx
	test ebx, ebx
	jz near .last_one

	mov edx, [esp + 28]		; edx = qlp_coeff[]
	movd mm6, [esp + 36]		; mm6 = 0:lp_quantization
	mov ebp, esp

	and esp, 0xfffffff8		; 8-byte-align the stack for the packed copies

	xor ecx, ecx
.copy_qlp_loop:
	push word [edx + 4 * ecx]	; pack coefficients down to 16 bits on the stack
	inc ecx
	cmp ecx, eax
	jnz short .copy_qlp_loop

	and ecx, 0x3
	test ecx, ecx
	je short .za_end
; --- The following was code-viewer UI chrome (keyboard-shortcut help), not part
; --- of the original source file; translated to English and kept as comments:
; Keyboard shortcuts: Copy code Ctrl+C · Search code Ctrl+F · Fullscreen F11 ·
; Toggle theme Ctrl+Shift+D · Show shortcuts ? ·
; Increase font size Ctrl+= · Decrease font size Ctrl+-