📄 lpc_asm.nasm
字号:
sub ecx, byte 4
.za_loop:
push word 0
inc eax
inc ecx
jnz short .za_loop
.za_end:
movq mm5, [esp + 2 * eax - 8]
movd mm4, [esi - 16]
punpckldq mm4, [esi - 12]
movd mm0, [esi - 8]
punpckldq mm0, [esi - 4]
packssdw mm4, mm0
cmp eax, byte 4
jnbe short .mmx_4more
align 16
.mmx_4_loop_i:
movd mm1, [esi]
movq mm3, mm4
punpckldq mm1, [esi + 4]
psrlq mm4, 16
movq mm0, mm1
psllq mm0, 48
por mm4, mm0
movq mm2, mm4
psrlq mm4, 16
pxor mm0, mm0
punpckhdq mm0, mm1
pmaddwd mm3, mm5
pmaddwd mm2, mm5
psllq mm0, 16
por mm4, mm0
movq mm0, mm3
punpckldq mm3, mm2
punpckhdq mm0, mm2
paddd mm3, mm0
psrad mm3, mm6
psubd mm1, mm3
movd [edi], mm1
punpckhdq mm1, mm1
movd [edi + 4], mm1
add edi, byte 8
add esi, byte 8
sub ebx, 2
jg .mmx_4_loop_i
jmp .mmx_end
.mmx_4more:
shl eax, 2
neg eax
add eax, byte 16
align 16
.mmx_4more_loop_i:
movd mm1, [esi]
punpckldq mm1, [esi + 4]
movq mm3, mm4
psrlq mm4, 16
movq mm0, mm1
psllq mm0, 48
por mm4, mm0
movq mm2, mm4
psrlq mm4, 16
pxor mm0, mm0
punpckhdq mm0, mm1
pmaddwd mm3, mm5
pmaddwd mm2, mm5
psllq mm0, 16
por mm4, mm0
mov ecx, esi
add ecx, eax
mov edx, esp
align 16
.mmx_4more_loop_j:
movd mm0, [ecx - 16]
movd mm7, [ecx - 8]
punpckldq mm0, [ecx - 12]
punpckldq mm7, [ecx - 4]
packssdw mm0, mm7
pmaddwd mm0, [edx]
punpckhdq mm7, mm7
paddd mm3, mm0
movd mm0, [ecx - 12]
punpckldq mm0, [ecx - 8]
punpckldq mm7, [ecx]
packssdw mm0, mm7
pmaddwd mm0, [edx]
paddd mm2, mm0
add edx, byte 8
add ecx, byte 16
cmp ecx, esi
jnz .mmx_4more_loop_j
movq mm0, mm3
punpckldq mm3, mm2
punpckhdq mm0, mm2
paddd mm3, mm0
psrad mm3, mm6
psubd mm1, mm3
movd [edi], mm1
punpckhdq mm1, mm1
movd [edi + 4], mm1
add edi, byte 8
add esi, byte 8
sub ebx, 2
jg near .mmx_4more_loop_i
.mmx_end:
emms
mov esp, ebp
.last_one:
mov eax, [esp + 32]
inc ebx
jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; unsigned i, j;
; FLAC__int32 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * data[i-j-1];
; data[i] = residual[i] + (sum >> lp_quantization);
; }
; }
ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
; Integer-only (imul/sar) implementation of FLAC__lpc_restore_signal:
;     data[i] = residual[i] + (sum_j(qlp_coeff[j] * data[i-j-1]) >> lp_quantization)
;
; Argument offsets below are valid AFTER the four pushes in the prologue
; (presumably cdecl: 4 saved registers + return address = 20 bytes).
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;
; Register roles once .begin is reached:
;     esi = residual cursor, edi = data cursor,
;     eax = order, ebx = samples still to produce.
; Three code paths are selected on order: order <= 1, order > 32, and
; 2 <= order <= 32 (computed jump into an unrolled multiply-accumulate chain).
; NOTE: the "x87" label prefix is historical; no FPU instructions are used here.
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20] ; esi = residual[]
mov edi, [esp + 40] ; edi = data[]
mov eax, [esp + 32] ; eax = order
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
; .begin is also entered from FLAC__lpc_restore_signal_asm_ia32_mmx when
; order < 4; that caller arrives with esi/edi/eax/ebx loaded as above and
; the same four registers pushed, so the [esp + N] offsets still apply.
.begin:
cmp eax, byte 1
jg short .x87_1more ; order > 1 (signed compare): use a multi-tap path
; ---- order == 1: single-tap predictor ----
mov ecx, [esp + 28] ; ecx = qlp_coeff[]
mov edx, [ecx] ; edx = qlp_coeff[0]
mov eax, [edi - 4] ; eax = data[-1], the warm-up sample
mov cl, [esp + 36] ; cl = lp_quantization (ecx pointer no longer needed)
ALIGN 16
.x87_1_loop_i:
imul eax, edx ; eax = qlp_coeff[0] * data[i-1]
sar eax, cl ; eax >>= lp_quantization (arithmetic shift)
add eax, [esi] ; eax += residual[i]
mov [edi], eax ; data[i] = eax; eax is reused as data[i-1] next iteration
add esi, byte 4
add edi, byte 4
dec ebx
jnz .x87_1_loop_i
jmp .end
.x87_1more:
cmp eax, byte 32 ; for order <= 32 there is a faster routine
jbe short .x87_32
; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
; ---- order > 32: generic two-level loop ----
ALIGN 16
.x87_32more_loop_i:
xor ebp, ebp ; ebp = sum = 0
mov ecx, [esp + 32] ; ecx = order (reloaded: cl is clobbered below)
mov edx, ecx
shl edx, 2
add edx, [esp + 28] ; edx = &qlp_coeff[order]
neg ecx ; ecx = -order, counts up to 0
ALIGN 16
.x87_32more_loop_j:
sub edx, byte 4 ; step to the next lower coefficient
mov eax, [edx] ; eax = qlp_coeff[j], j running from order-1 down to 0
imul eax, [edi + 4 * ecx] ; eax *= data[i-j-1]
add ebp, eax ; sum += product
inc ecx
jnz short .x87_32more_loop_j
mov cl, [esp + 36] ; cl = lp_quantization
sar ebp, cl ; ebp = (sum >> lp_quantization)
add ebp, [esi] ; ebp += residual[i]
mov [edi], ebp ; data[i] = ebp
add edi, byte 4
add esi, byte 4
dec ebx
jnz .x87_32more_loop_i
jmp .end
; ---- 2 <= order <= 32: computed jump into an unrolled chain ----
; The chain below holds 32 multiply-accumulate terms, for qlp_coeff[31] first
; down to qlp_coeff[0] last. Execution enters at the term for
; qlp_coeff[order-1] and falls through to .jumper_0.
; The entry address is computed as  .jumper_0 - 9*order + 1 : the "* 9" scale
; relies on every term assembling to exactly 9 bytes, except the final term,
; which is one byte shorter (no displacement on [eax]); the "inc edx" accounts
; for that. Do not change any instruction in the chain without re-checking
; these sizes.
.x87_32:
sub esi, edi ; esi = residual - data, so [esi + edi] addresses residual[i]
neg eax ; eax = -order
lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = -9*order + distance from .get_eip0 to .jumper_0
call .get_eip0 ; pushes the address of .get_eip0 (position-independent EIP fetch)
.get_eip0:
pop eax ; eax = run-time address of .get_eip0
add edx, eax ; edx = .jumper_0 - 9*order
inc edx ; compensate for the shorter opcode on the last iteration
mov eax, [esp + 28] ; eax = qlp_coeff[]
xor ebp, ebp ; ebp = sum = 0
jmp edx ; enter the chain at the term for qlp_coeff[order-1]
mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
; End of the unrolled chain; the jump target arithmetic above is anchored here.
.jumper_0:
mov cl, [esp + 36] ; cl = lp_quantization (ecx was scratch inside the chain)
sar ebp, cl ; ebp = (sum >> lp_quantization)
add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
add edi, byte 4 ; only edi advances; esi holds the fixed residual-data offset
dec ebx
jz short .end ; all samples produced
xor ebp, ebp ; sum = 0 for the next sample
jmp edx ; edx still holds the chain entry point for this order
; Common exit: restore the registers saved in the prologue (reverse order).
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; ----------------------------------------------------------------------
; FLAC__lpc_restore_signal_asm_ia32_mmx
;
; MMX variant of FLAC__lpc_restore_signal. For each i in [0, data_len):
;     data[i] = residual[i] + (sum_j(qlp_coeff[j] * data[i-j-1]) >> lp_quantization)
;
; WATCHOUT: all arithmetic is done on 16-bit lanes (coefficients are pushed
; as words, samples are narrowed with packssdw), so the bits-per-sample of
; the channel must be <= 16. In particular this routine cannot be used for
; side-channel coded 16bps channels, because their effective bps is 17.
;
; WATCHOUT: each data array needs a buffer of up to 3 zeroes in front (at
; negative indices) for alignment purposes, i.e. for each channel n,
; data[n][-1] through data[n][-3] should be accessible and zero. (The
; coefficient list is zero-padded to a multiple of four taps, and the tap
; loop reads one sample per padded tap.)
;
; Orders below 4 are delegated to FLAC__lpc_restore_signal_asm_ia32.begin.
;
; Register roles in the MMX paths:
;     esi = residual cursor          edi = data cursor
;     ebx = samples still to produce ebp = caller's esp (stack is realigned)
;     mm4 = the four most recent samples as words (oldest in the low word)
;     mm5 = qlp_coeff[3..0] as words (qlp_coeff[3] in the low word)
;     mm6 = lp_quantization (shift count)
; ----------------------------------------------------------------------
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]			; ebx = data_len
	mov	eax, [esp + 32]			; eax = order
	mov	edi, [esp + 40]			; edi = data[]
	mov	esi, [esp + 20]			; esi = residual[]

	test	ebx, ebx
	jz	near .done			; nothing to restore when data_len == 0

	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: integer routine handles it

	movd	mm6, [esp + 36]			; mm6 = lp_quantization
	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	mov	ebp, esp			; remember the real stack pointer
	and	esp, 0xfffffff8			; 8-byte align the scratch area

	; Push the low word of every coefficient; qlp_coeff[0] ends up at the
	; highest address, qlp_coeff[order-1] at the lowest.
	xor	ecx, ecx
.push_coeffs:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jne	short .push_coeffs

	; Pad with zero coefficients until the tap count is a multiple of four.
	and	ecx, 0x3
	test	ecx, ecx
	jz	short .coeffs_ready
	sub	ecx, byte 4			; ecx = -(number of pad words)
.pad_zero:
	push	word 0
	inc	eax				; eax tracks the padded order
	inc	ecx
	jnz	short .pad_zero

.coeffs_ready:
	; mm4 = data[-4], data[-3], data[-2], data[-1] saturated to words
	movd	mm4, [edi - 16]
	punpckldq mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq mm0, [edi - 4]
	packssdw mm4, mm0
	movq	mm5, [esp + 2 * eax - 8]	; mm5 = the first four coefficients pushed

	cmp	eax, byte 4
	ja	short .order_gt4

	; ---- exactly four taps: everything stays in registers ----
	align 16
.order4_sample:
	movd	mm1, [esi]			; mm1 = residual[i]
	movq	mm7, mm4
	pmaddwd	mm7, mm5			; two partial sums of the 4-tap product
	movq	mm0, mm7
	punpckhdq mm7, mm7
	paddd	mm7, mm0			; low dword = full sum
	psrad	mm7, mm6			; sum >> lp_quantization
	paddd	mm7, mm1
	movd	[edi], mm7			; data[i] = residual[i] + shifted sum
	psrlq	mm4, 16				; drop the oldest sample from the history
	psllq	mm7, 48				; new sample (low word) moves to the top lane
	por	mm4, mm7
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.order4_sample
	jmp	.restore_stack

	; ---- more than four taps ----
.order_gt4:
	shl	eax, 2
	neg	eax
	add	eax, byte 16			; eax = 16 - 4 * (padded order)

	align 16
.gt4_sample:
	mov	edx, esp			; edx = highest-index coefficients first
	lea	ecx, [edi + eax]		; ecx = data cursor for the older taps
	movq	mm7, mm4
	pmaddwd	mm7, mm5			; taps 0..3 come from the register history

	align 16
.gt4_taps:
	movd	mm0, [ecx - 16]
	punpckldq mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq mm1, [ecx - 4]
	packssdw mm0, mm1			; four older samples as words
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0			; accumulate both partial sums
	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, edi			; stop once the window reaches data[i]
	jnz	.gt4_taps

	movq	mm0, mm7
	punpckhdq mm7, mm7
	paddd	mm7, mm0			; low dword = full sum
	psrad	mm7, mm6			; sum >> lp_quantization
	movd	mm1, [esi]			; mm1 = residual[i]
	paddd	mm7, mm1
	movd	[edi], mm7			; data[i] = residual[i] + shifted sum
	psrlq	mm4, 16				; slide the 4-sample history
	psllq	mm7, 48
	por	mm4, mm7
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	short .gt4_sample

.restore_stack:
	emms					; leave MMX state
	mov	esp, ebp			; discard the coefficient scratch area
.done:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -