; lpc_asm.nasm — FLAC ia32 LPC assembly routines (NASM syntax)
; (code-viewer page banner and font-size control removed; filename preserved above)
	; --- tail of the preceding SSE autocorrelation routine (header is above this
	; --- view); instruction stream preserved verbatim, line structure restored.
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps xmm1, xmm0		; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
.warmup:				; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps xmm0, xmm2
	mulps xmm1, xmm3		; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm5, xmm0
	addps xmm6, xmm1		; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
	shufps xmm2, xmm2, 93h		; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add eax, 4			; (0)
	shufps xmm3, xmm3, 93h		; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm0, xmm0, 0		; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss xmm3, xmm2		; (5)
	movaps xmm1, xmm0		; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss xmm2, xmm0		; (6)
	mulps xmm1, xmm3		; (8)
	mulps xmm0, xmm2		; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm6, xmm1		; (10)
	addps xmm5, xmm0		; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
.end:
	ret

	;-----------------------------------------------------------------------
	; Autocorrelation, SSE, lag <= 12: accumulates 12 lags in xmm5/xmm6/xmm7
	; while xmm4:xmm3:xmm2 hold a sliding window of the last 12 samples.
	;-----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]
	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	; for(coeff = 0; coeff < lag; coeff++)
	;	autoc[coeff] = 0.0;
	xorps xmm5, xmm5
	xorps xmm6, xmm6
	xorps xmm7, xmm7

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
	xorps xmm4, xmm4		; xmm4 = 0,0,0,0
.warmup:				; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0		; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]
	; shift xmm4:xmm3:xmm2 left by one float
	shufps xmm2, xmm2, 93h		; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps xmm3, xmm3, 93h		; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm4, xmm4, 93h		; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss xmm4, xmm3
	movss xmm3, xmm2
	movss xmm2, xmm0
	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
	movups [edx + 32], xmm7
.end:
	ret

	;-----------------------------------------------------------------------
	; Autocorrelation, 3DNow! (pfmul/pfadd, femms): accumulates lag results
	; in a scratch array on the (8-byte-aligned) stack, two floats at a time,
	; then copies them to autoc[].  Uses ebp as a frame save of esp.
	;-----------------------------------------------------------------------
	align 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
	;[ebp + 32] autoc
	;[ebp + 28] lag
	;[ebp + 24] data_len
	;[ebp + 20] data

	push ebp
	push ebx
	push esi
	push edi

	mov ebp, esp

	mov esi, [ebp + 20]
	mov edi, [ebp + 24]
	mov edx, [ebp + 28]
	inc edx				; round lag up to an even count of scratch slots
	and edx, byte -2
	mov eax, edx
	neg eax
	and esp, byte -8		; 8-byte-align esp for movq stores
	lea esp, [esp + 4 * eax]	; reserve edx floats of scratch on the stack
	mov ecx, edx
	xor eax, eax
.loop0:					; zero the scratch accumulators
	dec ecx
	mov [esp + 4 * ecx], eax
	jnz short .loop0

	mov eax, edi
	sub eax, edx
	mov ebx, edx
	and ebx, byte 1
	sub eax, ebx
	lea ecx, [esi + 4 * eax - 12]	; ecx = end pointer for the unrolled main loop
	cmp esi, ecx
	mov eax, esi
	ja short .loop2_pre

	align 16			;4 nops
.loop1_i:				; main loop: 4 samples per iteration
	movd mm0, [eax]
	movd mm2, [eax + 4]
	movd mm4, [eax + 8]
	movd mm6, [eax + 12]
	mov ebx, edx
	punpckldq mm0, mm0
	punpckldq mm2, mm2
	punpckldq mm4, mm4
	punpckldq mm6, mm6
	align 16			;3 nops
.loop1_j:				; inner loop: 2 lags per iteration
	sub ebx, byte 2
	movd mm1, [eax + 4 * ebx]
	movd mm3, [eax + 4 * ebx + 4]
	movd mm5, [eax + 4 * ebx + 8]
	movd mm7, [eax + 4 * ebx + 12]
	punpckldq mm1, mm3
	punpckldq mm3, mm5
	pfmul mm1, mm0
	punpckldq mm5, mm7
	pfmul mm3, mm2
	punpckldq mm7, [eax + 4 * ebx + 16]
	pfmul mm5, mm4
	pfmul mm7, mm6
	pfadd mm1, mm3
	movq mm3, [esp + 4 * ebx]
	pfadd mm5, mm7
	pfadd mm1, mm5
	pfadd mm3, mm1
	movq [esp + 4 * ebx], mm3
	jg short .loop1_j

	add eax, byte 16
	cmp eax, ecx
	jb short .loop1_i

.loop2_pre:				; tail loop: remaining samples, one at a time
	mov ebx, eax
	sub eax, esi
	shr eax, 2			; eax = index of first unprocessed sample
	lea ecx, [esi + 4 * edi]	; ecx = &data[data_len]
	mov esi, ebx
.loop2_i:
	movd mm0, [esi]
	mov ebx, edi
	sub ebx, eax
	cmp ebx, edx
	jbe short .loop2_j		; clamp lag count near the end of the buffer
	mov ebx, edx
.loop2_j:
	dec ebx
	movd mm1, [esi + 4 * ebx]
	pfmul mm1, mm0
	movd mm2, [esp + 4 * ebx]
	pfadd mm1, mm2
	movd [esp + 4 * ebx], mm1
	jnz short .loop2_j

	add esi, byte 4
	inc eax
	cmp esi, ecx
	jnz short .loop2_i

	mov edi, [ebp + 32]
	mov edx, [ebp + 28]
.loop3:					; copy scratch accumulators out to autoc[]
	dec edx
	mov eax, [esp + 4 * edx]
	mov [edi + 4 * edx], eax
	jnz short .loop3

	femms				; exit MMX/3DNow! state before returning to FP code

	mov esp, ebp
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
;
; for(i = 0; i < data_len; i++) {
;	sum = 0;
;	for(j = 0; j < order; j++)
;		sum += qlp_coeff[j] * data[i-j-1];
;	residual[i] = data[i] - (sum >> lp_quantization);
; }
;
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]
	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]		; esi = data[]
	mov edi, [esp + 40]		; edi = residual[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len

	test ebx, ebx
	jz near .end			; do nothing if data_len == 0
.begin:
	cmp eax, byte 1
	jg short .i_1more

	; ---- special case: order == 1 ----
	mov ecx, [esp + 28]
	mov edx, [ecx]			; edx = qlp_coeff[0]
	mov eax, [esi - 4]		; eax = data[-1]
	mov cl, [esp + 36]		; cl = lp_quantization
	ALIGN 16
.i_1_loop_i:
	imul eax, edx
	sar eax, cl
	neg eax
	add eax, [esi]
	mov [edi], eax
	mov eax, [esi]
	add edi, byte 4
	add esi, byte 4
	dec ebx
	jnz .i_1_loop_i

	jmp .end

.i_1more:
	cmp eax, byte 32		; for order <= 32 there is a faster routine
	jbe short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor ebp, ebp
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]
	neg ecx
	ALIGN 16
.i_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [esi + 4 * ecx]
	add ebp, eax
	inc ecx
	jnz short .i_32more_loop_j

	mov cl, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]
	mov [edi], ebp
	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .i_32more_loop_i

	jmp .end

.i_32:
	; ---- order 2..32: computed jump into a fully unrolled dot product ----
	; each unrolled step below is 9 bytes, so the entry point is
	; .jumper_0 - 9*order (the 'inc edx' compensates for the last, 8-byte step)
	sub edi, esi
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call .get_eip0
.get_eip0:
	pop eax				; eax = EIP of .get_eip0 (position-independent)
	add edx, eax
	inc edx				; compensate for the one-byte-shorter last step
	mov eax, [esp + 28]		; eax = qlp_coeff[]
	xor ebp, ebp
	jmp edx

	mov ecx, [eax + 124]
	imul ecx, [esi - 128]
	add ebp, ecx
	mov ecx, [eax + 120]
	imul ecx, [esi - 124]
	add ebp, ecx
	mov ecx, [eax + 116]
	imul ecx, [esi - 120]
	add ebp, ecx
	mov ecx, [eax + 112]
	imul ecx, [esi - 116]
	add ebp, ecx
	mov ecx, [eax + 108]
	imul ecx, [esi - 112]
	add ebp, ecx
	mov ecx, [eax + 104]
	imul ecx, [esi - 108]
	add ebp, ecx
	mov ecx, [eax + 100]
	imul ecx, [esi - 104]
	add ebp, ecx
	mov ecx, [eax + 96]
	imul ecx, [esi - 100]
	add ebp, ecx
	mov ecx, [eax + 92]
	imul ecx, [esi - 96]
	add ebp, ecx
	mov ecx, [eax + 88]
	imul ecx, [esi - 92]
	add ebp, ecx
	mov ecx, [eax + 84]
	imul ecx, [esi - 88]
	add ebp, ecx
	mov ecx, [eax + 80]
	imul ecx, [esi - 84]
	add ebp, ecx
	mov ecx, [eax + 76]
	imul ecx, [esi - 80]
	add ebp, ecx
	mov ecx, [eax + 72]
	imul ecx, [esi - 76]
	add ebp, ecx
	mov ecx, [eax + 68]
	imul ecx, [esi - 72]
	add ebp, ecx
	mov ecx, [eax + 64]
	imul ecx, [esi - 68]
	add ebp, ecx
	mov ecx, [eax + 60]
	imul ecx, [esi - 64]
	add ebp, ecx
	mov ecx, [eax + 56]
	imul ecx, [esi - 60]
	add ebp, ecx
	mov ecx, [eax + 52]
	imul ecx, [esi - 56]
	add ebp, ecx
	mov ecx, [eax + 48]
	imul ecx, [esi - 52]
	add ebp, ecx
	mov ecx, [eax + 44]
	imul ecx, [esi - 48]
	add ebp, ecx
	mov ecx, [eax + 40]
	imul ecx, [esi - 44]
	add ebp, ecx
	mov ecx, [eax + 36]
	imul ecx, [esi - 40]
	add ebp, ecx
	mov ecx, [eax + 32]
	imul ecx, [esi - 36]
	add ebp, ecx
	mov ecx, [eax + 28]
	imul ecx, [esi - 32]
	add ebp, ecx
	mov ecx, [eax + 24]
	imul ecx, [esi - 28]
	add ebp, ecx
	mov ecx, [eax + 20]
	imul ecx, [esi - 24]
	add ebp, ecx
	mov ecx, [eax + 16]
	imul ecx, [esi - 20]
	add ebp, ecx
	mov ecx, [eax + 12]
	imul ecx, [esi - 16]
	add ebp, ecx
	mov ecx, [eax + 8]
	imul ecx, [esi - 12]
	add ebp, ecx
	mov ecx, [eax + 4]
	imul ecx, [esi - 8]
	add ebp, ecx
	mov ecx, [eax]			; there is one byte missing
	imul ecx, [esi - 4]
	add ebp, ecx
.jumper_0:

	mov cl, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]
	mov [edi + esi], ebp		; edi holds residual-data, so this is residual[i]
	add esi, byte 4
	dec ebx
	jz short .end
	xor ebp, ebp
	jmp edx
.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16. Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
	; --- head of the MMX residual routine; the remainder is beyond this view,
	; --- so the visible instruction stream is preserved verbatim.
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]
	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]		; esi = data[]
	mov edi, [esp + 40]		; edi = residual[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len

	test ebx, ebx
	jz near .end			; do nothing if data_len == 0
	dec ebx
	test ebx, ebx
	jz near .last_one

	mov edx, [esp + 28]		; edx = qlp_coeff[]
	movd mm6, [esp + 36]		; mm6 = 0:lp_quantization
	mov ebp, esp

	and esp, 0xfffffff8		; 8-byte-align the stack for the packed copies

	xor ecx, ecx
.copy_qlp_loop:
	push word [edx + 4 * ecx]	; pack coefficients down to 16 bits on the stack
	inc ecx
	cmp ecx, eax
	jnz short .copy_qlp_loop

	and ecx, 0x3
	test ecx, ecx
	je short .za_end
; --- The following was code-viewer UI chrome (keyboard-shortcut help), not part
; --- of the original source file; translated to English and kept as comments:
; Keyboard shortcuts: Copy code Ctrl+C · Search code Ctrl+F · Fullscreen F11 ·
; Toggle theme Ctrl+Shift+D · Show shortcuts ? ·
; Increase font size Ctrl+= · Decrease font size Ctrl+-