; lpc_asm.nasm — FLAC LPC routines, IA-32 NASM syntax
; (stray page-viewer text removed from this extraction)
movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
add eax, 4
movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
mulps xmm0, xmm2
mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
addps xmm5, xmm0
addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
dec edx
jz .loop_end
ALIGN 16
.loop_start:
; start by reading the next sample
movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
; here we reorder the instructions; see the (#) indexes for a logical order
shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
add eax, 4 ; (0)
shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
movss xmm3, xmm2 ; (5)
movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
movss xmm2, xmm0 ; (6)
mulps xmm1, xmm3 ; (8)
mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
addps xmm6, xmm1 ; (10)
addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
dec edx
jnz .loop_start
.loop_end:
; store autoc
mov edx, [esp + 16] ; edx == autoc
movups [edx], xmm5
movups [edx + 16], xmm6
.end:
ret
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
; Autocorrelation, SSE version specialized for 8 < lag <= 12
; (three xmm accumulators, one per group of 4 lags).
; cdecl; no registers are saved, so the arguments sit directly above
; the return address:
;[esp + 16] == autoc[]
;[esp + 12] == lag
;[esp + 8] == data_len
;[esp + 4] == data[]
;ASSERT(lag > 0)
;ASSERT(lag <= 12)
;ASSERT(lag <= data_len)
; Register roles:
;   eax            = &data[sample], advanced 4 bytes per iteration
;   edx            = samples remaining; reused as &autoc[0] at the end
;   xmm0, xmm1     = current sample broadcast into all 4 lanes
;   xmm4:xmm3:xmm2 = sliding window of the 12 most recent samples
;   xmm7:xmm6:xmm5 = 12 running sums: autoc[0..3], [4..7], [8..11]
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] = 0.0;
xorps xmm5, xmm5 ; sums for autoc[0..3] = 0
xorps xmm6, xmm6 ; sums for autoc[4..7] = 0
xorps xmm7, xmm7 ; sums for autoc[8..11] = 0
mov edx, [esp + 8] ; edx == data_len
mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
add eax, 4
movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 (window slots for lags 4..7 start empty)
xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 (window slots for lags 8..11 start empty)
.warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
movaps xmm1, xmm0
mulps xmm1, xmm2
addps xmm5, xmm1 ; autoc[0..3] += data[sample] * window[0..3]
movaps xmm1, xmm0
mulps xmm1, xmm3
addps xmm6, xmm1 ; autoc[4..7] += data[sample] * window[4..7]
mulps xmm0, xmm4
addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
dec edx
jz .loop_end
ALIGN 16
.loop_start:
; start by reading the next sample
movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
add eax, 4
shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
; shift xmm4:xmm3:xmm2 left by one float
shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
movss xmm4, xmm3 ; low lane of xmm4 <- float rotated out of xmm3
movss xmm3, xmm2 ; low lane of xmm3 <- float rotated out of xmm2
movss xmm2, xmm0 ; low lane of xmm2 <- the new sample
; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
movaps xmm1, xmm0
mulps xmm1, xmm2
addps xmm5, xmm1 ; autoc[0..3]
movaps xmm1, xmm0
mulps xmm1, xmm3
addps xmm6, xmm1 ; autoc[4..7]
mulps xmm0, xmm4
addps xmm7, xmm0 ; autoc[8..11]
dec edx
jnz .loop_start
.loop_end:
; store autoc
mov edx, [esp + 16] ; edx == autoc
movups [edx], xmm5 ; unaligned stores: autoc[] alignment is not guaranteed
movups [edx + 16], xmm6
movups [edx + 32], xmm7
.end:
ret
align 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
; Autocorrelation, AMD 3DNow! version for arbitrary lag.
; Strategy: round lag up to even (two floats per MMX register), zero an
; edx-float scratch sum array on the 8-byte-aligned stack, accumulate
; into it (4 samples x 2 lags per inner step in the main loop, then a
; scalar tail for the last samples), and finally copy `lag` sums out to
; autoc[].
; After the four pushes below, ebp == esp, so the cdecl args are at:
;[ebp + 32] autoc
;[ebp + 28] lag
;[ebp + 24] data_len
;[ebp + 20] data
push ebp
push ebx
push esi
push edi
mov ebp, esp ; ebp = frame pointer (esp is about to be realigned)
mov esi, [ebp + 20] ; esi = data
mov edi, [ebp + 24] ; edi = data_len
mov edx, [ebp + 28] ; edx = lag
inc edx
and edx, byte -2 ; edx = lag rounded up to an even count
mov eax, edx
neg eax
and esp, byte -8 ; 8-byte align the stack for movq accesses
lea esp, [esp + 4 * eax] ; reserve edx floats of scratch sum space
mov ecx, edx
xor eax, eax
.loop0: ; zero the scratch sums
dec ecx
mov [esp + 4 * ecx], eax
jnz short .loop0
mov eax, edi
sub eax, edx ; eax = data_len - rounded lag
mov ebx, edx
and ebx, byte 1 ; ebx = rounding adjustment (lag & 1)
sub eax, ebx
lea ecx, [esi + 4 * eax - 12] ; ecx = end bound for the unrolled main loop
cmp esi, ecx
mov eax, esi ; eax = current sample pointer
ja short .loop2_pre ; too little data: go straight to the scalar tail
align 16 ;4 nops
.loop1_i: ; main loop: 4 samples per iteration
movd mm0, [eax]
movd mm2, [eax + 4]
movd mm4, [eax + 8]
movd mm6, [eax + 12]
mov ebx, edx ; ebx = lag counter, stepped down by 2
punpckldq mm0, mm0 ; broadcast each sample into both dwords
punpckldq mm2, mm2
punpckldq mm4, mm4
punpckldq mm6, mm6
align 16 ;3 nops
.loop1_j: ; inner loop over lag pairs
sub ebx, byte 2
movd mm1, [eax + 4 * ebx]
movd mm3, [eax + 4 * ebx + 4]
movd mm5, [eax + 4 * ebx + 8]
movd mm7, [eax + 4 * ebx + 12]
punpckldq mm1, mm3 ; pair consecutive lagged samples per register
punpckldq mm3, mm5
pfmul mm1, mm0
punpckldq mm5, mm7
pfmul mm3, mm2
punpckldq mm7, [eax + 4 * ebx + 16]
pfmul mm5, mm4
pfmul mm7, mm6
pfadd mm1, mm3
movq mm3, [esp + 4 * ebx] ; running sums for this lag pair
pfadd mm5, mm7
pfadd mm1, mm5
pfadd mm3, mm1
movq [esp + 4 * ebx], mm3
jg short .loop1_j ; flags still valid: MMX/3DNow! ops don't touch EFLAGS
add eax, byte 16
cmp eax, ecx
jb short .loop1_i
.loop2_pre: ; scalar tail: one sample x one lag at a time
mov ebx, eax
sub eax, esi
shr eax, 2 ; eax = index of first unprocessed sample
lea ecx, [esi + 4 * edi] ; ecx = &data[data_len] (end pointer)
mov esi, ebx
.loop2_i:
movd mm0, [esi]
mov ebx, edi
sub ebx, eax ; ebx = samples remaining from here
cmp ebx, edx
jbe short .loop2_j
mov ebx, edx ; ebx = min(rounded lag, samples remaining)
.loop2_j:
dec ebx
movd mm1, [esi + 4 * ebx]
pfmul mm1, mm0
movd mm2, [esp + 4 * ebx]
pfadd mm1, mm2
movd [esp + 4 * ebx], mm1
jnz short .loop2_j
add esi, byte 4
inc eax
cmp esi, ecx
jnz short .loop2_i
mov edi, [ebp + 32] ; edi = autoc
mov edx, [ebp + 28] ; edx = lag (original, not rounded)
.loop3: ; copy the lag sums out of scratch into autoc[]
dec edx
mov eax, [esp + 4 * edx]
mov [edi + 4 * edx], eax
jnz short .loop3
femms ; leave MMX state so the x87 FPU is usable again
mov esp, ebp ; release the scratch area / undo realignment
pop edi
pop esi
pop ebx
pop ebp
ret
;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * data[i-j-1];
; residual[i] = data[i] - (sum >> lp_quantization);
; }
;
ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
; Three code paths by order:
;   order == 1       : everything kept in registers
;   1 < order <= 32  : computed jump into a fully unrolled inner sum
;   order > 32       : generic nested loop (completeness only)
; cdecl; after the four pushes the args are at:
;[esp + 40] residual[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] data[]
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20] ; esi = data[]
mov edi, [esp + 40] ; edi = residual[]
mov eax, [esp + 32] ; eax = order
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
cmp eax, byte 1
jg short .i_1more
; --- order == 1 ---
mov ecx, [esp + 28]
mov edx, [ecx] ; edx = qlp_coeff[0]
mov eax, [esi - 4] ; eax = data[-1]
mov cl, [esp + 36] ; cl = lp_quantization
ALIGN 16
.i_1_loop_i:
imul eax, edx ; sum = qlp_coeff[0] * data[i-1]
sar eax, cl ; sum >>= lp_quantization
neg eax
add eax, [esi] ; eax = data[i] - (sum >> lp_quantization)
mov [edi], eax ; residual[i]
mov eax, [esi] ; data[i] becomes next iteration's data[i-1]
add edi, byte 4
add esi, byte 4
dec ebx
jnz .i_1_loop_i
jmp .end
.i_1more:
cmp eax, byte 32 ; for order <= 32 there is a faster routine
jbe short .i_32
; --- order > 32: generic nested loop ---
; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
ALIGN 16
.i_32more_loop_i:
xor ebp, ebp ; ebp = sum = 0
mov ecx, [esp + 32]
mov edx, ecx
shl edx, 2
add edx, [esp + 28] ; edx = &qlp_coeff[order]
neg ecx ; ecx = -order, counts up to 0
ALIGN 16
.i_32more_loop_j:
sub edx, byte 4 ; walk qlp_coeff[] backwards
mov eax, [edx]
imul eax, [esi + 4 * ecx] ; qlp_coeff[j] * data[i-j-1]
add ebp, eax
inc ecx
jnz short .i_32more_loop_j
mov cl, [esp + 36]
sar ebp, cl ; sum >>= lp_quantization
neg ebp
add ebp, [esi] ; residual = data[i] - sum
mov [edi], ebp
add esi, byte 4
add edi, byte 4
dec ebx
jnz .i_32more_loop_i
jmp .end
.i_32:
; --- 1 < order <= 32: computed jump into an unrolled sum ---
; Each multiply-accumulate triplet below assembles to 9 bytes, except
; the final one, whose [eax] operand needs no displacement byte
; (8 bytes). Jumping to .jumper_0 - 9*order + 1 therefore executes
; exactly `order` triplets before falling into .jumper_0.
sub edi, esi ; edi = residual - data, so [edi + esi] == &residual[i]
neg eax
lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = -9*order + (.jumper_0 - .get_eip0)
call .get_eip0 ; pushes eip: ia32 has no direct eip read
.get_eip0:
pop eax ; eax = address of .get_eip0
add edx, eax ; edx = .jumper_0 - 9*order
inc edx ; +1: the last triplet is only 8 bytes
mov eax, [esp + 28] ; eax = qlp_coeff[]
xor ebp, ebp ; ebp = sum = 0
jmp edx
; unrolled sum: qlp_coeff[31]*data[i-32] down to qlp_coeff[0]*data[i-1]
mov ecx, [eax + 124]
imul ecx, [esi - 128]
add ebp, ecx
mov ecx, [eax + 120]
imul ecx, [esi - 124]
add ebp, ecx
mov ecx, [eax + 116]
imul ecx, [esi - 120]
add ebp, ecx
mov ecx, [eax + 112]
imul ecx, [esi - 116]
add ebp, ecx
mov ecx, [eax + 108]
imul ecx, [esi - 112]
add ebp, ecx
mov ecx, [eax + 104]
imul ecx, [esi - 108]
add ebp, ecx
mov ecx, [eax + 100]
imul ecx, [esi - 104]
add ebp, ecx
mov ecx, [eax + 96]
imul ecx, [esi - 100]
add ebp, ecx
mov ecx, [eax + 92]
imul ecx, [esi - 96]
add ebp, ecx
mov ecx, [eax + 88]
imul ecx, [esi - 92]
add ebp, ecx
mov ecx, [eax + 84]
imul ecx, [esi - 88]
add ebp, ecx
mov ecx, [eax + 80]
imul ecx, [esi - 84]
add ebp, ecx
mov ecx, [eax + 76]
imul ecx, [esi - 80]
add ebp, ecx
mov ecx, [eax + 72]
imul ecx, [esi - 76]
add ebp, ecx
mov ecx, [eax + 68]
imul ecx, [esi - 72]
add ebp, ecx
mov ecx, [eax + 64]
imul ecx, [esi - 68]
add ebp, ecx
mov ecx, [eax + 60]
imul ecx, [esi - 64]
add ebp, ecx
mov ecx, [eax + 56]
imul ecx, [esi - 60]
add ebp, ecx
mov ecx, [eax + 52]
imul ecx, [esi - 56]
add ebp, ecx
mov ecx, [eax + 48]
imul ecx, [esi - 52]
add ebp, ecx
mov ecx, [eax + 44]
imul ecx, [esi - 48]
add ebp, ecx
mov ecx, [eax + 40]
imul ecx, [esi - 44]
add ebp, ecx
mov ecx, [eax + 36]
imul ecx, [esi - 40]
add ebp, ecx
mov ecx, [eax + 32]
imul ecx, [esi - 36]
add ebp, ecx
mov ecx, [eax + 28]
imul ecx, [esi - 32]
add ebp, ecx
mov ecx, [eax + 24]
imul ecx, [esi - 28]
add ebp, ecx
mov ecx, [eax + 20]
imul ecx, [esi - 24]
add ebp, ecx
mov ecx, [eax + 16]
imul ecx, [esi - 20]
add ebp, ecx
mov ecx, [eax + 12]
imul ecx, [esi - 16]
add ebp, ecx
mov ecx, [eax + 8]
imul ecx, [esi - 12]
add ebp, ecx
mov ecx, [eax + 4]
imul ecx, [esi - 8]
add ebp, ecx
mov ecx, [eax] ; no displacement byte here — the "missing byte" the inc edx above compensates for
imul ecx, [esi - 4]
add ebp, ecx
.jumper_0:
mov cl, [esp + 36]
sar ebp, cl ; sum >>= lp_quantization
neg ebp
add ebp, [esi] ; ebp = data[i] - sum
mov [edi + esi], ebp ; residual[i] (edi still holds residual - data)
add esi, byte 4
dec ebx
jz short .end
xor ebp, ebp ; reset sum for the next sample
jmp edx ; re-enter the unrolled triplets
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16. Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
;[esp + 40] residual[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] data[]
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20] ; esi = data[]
mov edi, [esp + 40] ; edi = residual[]
mov eax, [esp + 32] ; eax = order
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
dec ebx
test ebx, ebx
jz near .last_one
mov edx, [esp + 28] ; edx = qlp_coeff[]
movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
mov ebp, esp
and esp, 0xfffffff8
xor ecx, ecx
.copy_qlp_loop:
push word [edx + 4 * ecx]
inc ecx
cmp ecx, eax
jnz short .copy_qlp_loop
and ecx, 0x3
test ecx, ecx
je short .za_end
; NOTE: source chunk truncated here — the remainder of
; FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx is not
; visible in this extraction (page-viewer text removed).