; lpc_asm.nasm
	sub ecx, byte 4
.za_loop:
	push word 0
	inc eax
	inc ecx
	jnz short .za_loop
.za_end:

	movq mm5, [esp + 2 * eax - 8]
	movd mm4, [esi - 16]
	punpckldq mm4, [esi - 12]
	movd mm0, [esi - 8]
	punpckldq mm0, [esi - 4]
	packssdw mm4, mm0

	cmp eax, byte 4
	jnbe short .mmx_4more

	align 16
.mmx_4_loop_i:
	movd mm1, [esi]
	movq mm3, mm4
	punpckldq mm1, [esi + 4]
	psrlq mm4, 16
	movq mm0, mm1
	psllq mm0, 48
	por mm4, mm0
	movq mm2, mm4
	psrlq mm4, 16
	pxor mm0, mm0
	punpckhdq mm0, mm1
	pmaddwd mm3, mm5
	pmaddwd mm2, mm5
	psllq mm0, 16
	por mm4, mm0
	movq mm0, mm3
	punpckldq mm3, mm2
	punpckhdq mm0, mm2
	paddd mm3, mm0
	psrad mm3, mm6
	psubd mm1, mm3
	movd [edi], mm1
	punpckhdq mm1, mm1
	movd [edi + 4], mm1

	add edi, byte 8
	add esi, byte 8
	sub ebx, 2
	jg .mmx_4_loop_i

	jmp .mmx_end

.mmx_4more:
	shl eax, 2
	neg eax
	add eax, byte 16

	align 16
.mmx_4more_loop_i:
	movd mm1, [esi]
	punpckldq mm1, [esi + 4]
	movq mm3, mm4
	psrlq mm4, 16
	movq mm0, mm1
	psllq mm0, 48
	por mm4, mm0
	movq mm2, mm4
	psrlq mm4, 16
	pxor mm0, mm0
	punpckhdq mm0, mm1
	pmaddwd mm3, mm5
	pmaddwd mm2, mm5
	psllq mm0, 16
	por mm4, mm0

	mov ecx, esi
	add ecx, eax
	mov edx, esp

	align 16
.mmx_4more_loop_j:
	movd mm0, [ecx - 16]
	movd mm7, [ecx - 8]
	punpckldq mm0, [ecx - 12]
	punpckldq mm7, [ecx - 4]
	packssdw mm0, mm7
	pmaddwd mm0, [edx]
	punpckhdq mm7, mm7
	paddd mm3, mm0
	movd mm0, [ecx - 12]
	punpckldq mm0, [ecx - 8]
	punpckldq mm7, [ecx]
	packssdw mm0, mm7
	pmaddwd mm0, [edx]
	paddd mm2, mm0

	add edx, byte 8
	add ecx, byte 16
	cmp ecx, esi
	jnz .mmx_4more_loop_j

	movq mm0, mm3
	punpckldq mm3, mm2
	punpckhdq mm0, mm2
	paddd mm3, mm0
	psrad mm3, mm6
	psubd mm1, mm3
	movd [edi], mm1
	punpckhdq mm1, mm1
	movd [edi + 4], mm1

	add edi, byte 8
	add esi, byte 8
	sub ebx, 2
	jg near .mmx_4more_loop_i

.mmx_end:
	emms
	mov esp, ebp
.last_one:
	mov eax, [esp + 32]
	inc ebx
	jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; 	unsigned i, j;
; 	FLAC__int32 sum;
;
; 	FLAC__ASSERT(order > 0);
;
; 	for(i = 0; i < data_len; i++) {
; 		sum = 0;
; 		for(j = 0; j < order; j++)
; 			sum += qlp_coeff[j] * data[i-j-1];
; 		data[i] = residual[i] + (sum >> lp_quantization);
; 	}
; }
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]	; esi = residual[]
	mov edi, [esp + 40]	; edi = data[]
	mov eax, [esp + 32]	; eax = order
	mov ebx, [esp + 24]	; ebx = data_len

	test ebx, ebx
	jz near .end		; do nothing if data_len == 0

.begin:
	cmp eax, byte 1
	jg short .x87_1more

	mov ecx, [esp + 28]
	mov edx, [ecx]
	mov eax, [edi - 4]
	mov cl, [esp + 36]
	ALIGN 16
.x87_1_loop_i:
	imul eax, edx
	sar eax, cl
	add eax, [esi]
	mov [edi], eax
	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .x87_1_loop_i

	jmp .end

.x87_1more:
	cmp eax, byte 32	; for order <= 32 there is a faster routine
	jbe short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	xor ebp, ebp
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]
	neg ecx
	ALIGN 16
.x87_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [edi + 4 * ecx]
	add ebp, eax
	inc ecx
	jnz short .x87_32more_loop_j

	mov cl, [esp + 36]
	sar ebp, cl
	add ebp, [esi]
	mov [edi], ebp
	add edi, byte 4
	add esi, byte 4

	dec ebx
	jnz .x87_32more_loop_i

	jmp .end
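; The .x87_32 path below uses a computed jump instead of an inner loop: each
; coefficient term is encoded as a fixed-size 9-byte mov/imul/add group, so
; entering at .jumper_0 minus 9*order bytes executes exactly `order` terms per
; sample (the final group is one byte shorter, hence the `inc edx` fixup).
; In C, the same enter-in-the-middle idea looks roughly like a Duff's-device
; switch with fallthrough -- a minimal sketch for order <= 4, not the actual
; 32-term routine:
;
;	FLAC__int32 sum = 0;
;	switch(order) {	/* fall through: only the last `order` cases run */
;		case 4: sum += qlp_coeff[3] * data[i-4];	/* fallthrough */
;		case 3: sum += qlp_coeff[2] * data[i-3];	/* fallthrough */
;		case 2: sum += qlp_coeff[1] * data[i-2];	/* fallthrough */
;		case 1: sum += qlp_coeff[0] * data[i-1];
;	}
;	data[i] = residual[i] + (sum >> lp_quantization);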
.x87_32:
	sub esi, edi
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call .get_eip0
.get_eip0:
	pop eax
	add edx, eax
	inc edx			; compensate for the shorter opcode on the last iteration
	mov eax, [esp + 28]	; eax = qlp_coeff[]
	xor ebp, ebp
	jmp edx

	mov ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul ecx, [edi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov ecx, [eax + 120]	; ecx = qlp_coeff[30]
	imul ecx, [edi - 124]	; ecx = qlp_coeff[30] * data[i-31]
	add ebp, ecx		; sum += qlp_coeff[30] * data[i-31]
	mov ecx, [eax + 116]	; ecx = qlp_coeff[29]
	imul ecx, [edi - 120]	; ecx = qlp_coeff[29] * data[i-30]
	add ebp, ecx		; sum += qlp_coeff[29] * data[i-30]
	mov ecx, [eax + 112]	; ecx = qlp_coeff[28]
	imul ecx, [edi - 116]	; ecx = qlp_coeff[28] * data[i-29]
	add ebp, ecx		; sum += qlp_coeff[28] * data[i-29]
	mov ecx, [eax + 108]	; ecx = qlp_coeff[27]
	imul ecx, [edi - 112]	; ecx = qlp_coeff[27] * data[i-28]
	add ebp, ecx		; sum += qlp_coeff[27] * data[i-28]
	mov ecx, [eax + 104]	; ecx = qlp_coeff[26]
	imul ecx, [edi - 108]	; ecx = qlp_coeff[26] * data[i-27]
	add ebp, ecx		; sum += qlp_coeff[26] * data[i-27]
	mov ecx, [eax + 100]	; ecx = qlp_coeff[25]
	imul ecx, [edi - 104]	; ecx = qlp_coeff[25] * data[i-26]
	add ebp, ecx		; sum += qlp_coeff[25] * data[i-26]
	mov ecx, [eax + 96]	; ecx = qlp_coeff[24]
	imul ecx, [edi - 100]	; ecx = qlp_coeff[24] * data[i-25]
	add ebp, ecx		; sum += qlp_coeff[24] * data[i-25]
	mov ecx, [eax + 92]	; ecx = qlp_coeff[23]
	imul ecx, [edi - 96]	; ecx = qlp_coeff[23] * data[i-24]
	add ebp, ecx		; sum += qlp_coeff[23] * data[i-24]
	mov ecx, [eax + 88]	; ecx = qlp_coeff[22]
	imul ecx, [edi - 92]	; ecx = qlp_coeff[22] * data[i-23]
	add ebp, ecx		; sum += qlp_coeff[22] * data[i-23]
	mov ecx, [eax + 84]	; ecx = qlp_coeff[21]
	imul ecx, [edi - 88]	; ecx = qlp_coeff[21] * data[i-22]
	add ebp, ecx		; sum += qlp_coeff[21] * data[i-22]
	mov ecx, [eax + 80]	; ecx = qlp_coeff[20]
	imul ecx, [edi - 84]	; ecx = qlp_coeff[20] * data[i-21]
	add ebp, ecx		; sum += qlp_coeff[20] * data[i-21]
	mov ecx, [eax + 76]	; ecx = qlp_coeff[19]
	imul ecx, [edi - 80]	; ecx = qlp_coeff[19] * data[i-20]
	add ebp, ecx		; sum += qlp_coeff[19] * data[i-20]
	mov ecx, [eax + 72]	; ecx = qlp_coeff[18]
	imul ecx, [edi - 76]	; ecx = qlp_coeff[18] * data[i-19]
	add ebp, ecx		; sum += qlp_coeff[18] * data[i-19]
	mov ecx, [eax + 68]	; ecx = qlp_coeff[17]
	imul ecx, [edi - 72]	; ecx = qlp_coeff[17] * data[i-18]
	add ebp, ecx		; sum += qlp_coeff[17] * data[i-18]
	mov ecx, [eax + 64]	; ecx = qlp_coeff[16]
	imul ecx, [edi - 68]	; ecx = qlp_coeff[16] * data[i-17]
	add ebp, ecx		; sum += qlp_coeff[16] * data[i-17]
	mov ecx, [eax + 60]	; ecx = qlp_coeff[15]
	imul ecx, [edi - 64]	; ecx = qlp_coeff[15] * data[i-16]
	add ebp, ecx		; sum += qlp_coeff[15] * data[i-16]
	mov ecx, [eax + 56]	; ecx = qlp_coeff[14]
	imul ecx, [edi - 60]	; ecx = qlp_coeff[14] * data[i-15]
	add ebp, ecx		; sum += qlp_coeff[14] * data[i-15]
	mov ecx, [eax + 52]	; ecx = qlp_coeff[13]
	imul ecx, [edi - 56]	; ecx = qlp_coeff[13] * data[i-14]
	add ebp, ecx		; sum += qlp_coeff[13] * data[i-14]
	mov ecx, [eax + 48]	; ecx = qlp_coeff[12]
	imul ecx, [edi - 52]	; ecx = qlp_coeff[12] * data[i-13]
	add ebp, ecx		; sum += qlp_coeff[12] * data[i-13]
	mov ecx, [eax + 44]	; ecx = qlp_coeff[11]
	imul ecx, [edi - 48]	; ecx = qlp_coeff[11] * data[i-12]
	add ebp, ecx		; sum += qlp_coeff[11] * data[i-12]
	mov ecx, [eax + 40]	; ecx = qlp_coeff[10]
	imul ecx, [edi - 44]	; ecx = qlp_coeff[10] * data[i-11]
	add ebp, ecx		; sum += qlp_coeff[10] * data[i-11]
	mov ecx, [eax + 36]	; ecx = qlp_coeff[ 9]
	imul ecx, [edi - 40]	; ecx = qlp_coeff[ 9] * data[i-10]
	add ebp, ecx		; sum += qlp_coeff[ 9] * data[i-10]
	mov ecx, [eax + 32]	; ecx = qlp_coeff[ 8]
	imul ecx, [edi - 36]	; ecx = qlp_coeff[ 8] * data[i- 9]
	add ebp, ecx		; sum += qlp_coeff[ 8] * data[i- 9]
	mov ecx, [eax + 28]	; ecx = qlp_coeff[ 7]
	imul ecx, [edi - 32]	; ecx = qlp_coeff[ 7] * data[i- 8]
	add ebp, ecx		; sum += qlp_coeff[ 7] * data[i- 8]
	mov ecx, [eax + 24]	; ecx = qlp_coeff[ 6]
	imul ecx, [edi - 28]	; ecx = qlp_coeff[ 6] * data[i- 7]
	add ebp, ecx		; sum += qlp_coeff[ 6] * data[i- 7]
	mov ecx, [eax + 20]	; ecx = qlp_coeff[ 5]
	imul ecx, [edi - 24]	; ecx = qlp_coeff[ 5] * data[i- 6]
	add ebp, ecx		; sum += qlp_coeff[ 5] * data[i- 6]
	mov ecx, [eax + 16]	; ecx = qlp_coeff[ 4]
	imul ecx, [edi - 20]	; ecx = qlp_coeff[ 4] * data[i- 5]
	add ebp, ecx		; sum += qlp_coeff[ 4] * data[i- 5]
	mov ecx, [eax + 12]	; ecx = qlp_coeff[ 3]
	imul ecx, [edi - 16]	; ecx = qlp_coeff[ 3] * data[i- 4]
	add ebp, ecx		; sum += qlp_coeff[ 3] * data[i- 4]
	mov ecx, [eax + 8]	; ecx = qlp_coeff[ 2]
	imul ecx, [edi - 12]	; ecx = qlp_coeff[ 2] * data[i- 3]
	add ebp, ecx		; sum += qlp_coeff[ 2] * data[i- 3]
	mov ecx, [eax + 4]	; ecx = qlp_coeff[ 1]
	imul ecx, [edi - 8]	; ecx = qlp_coeff[ 1] * data[i- 2]
	add ebp, ecx		; sum += qlp_coeff[ 1] * data[i- 2]
	mov ecx, [eax]		; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul ecx, [edi - 4]	; ecx = qlp_coeff[ 0] * data[i- 1]
	add ebp, ecx		; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov cl, [esp + 36]
	sar ebp, cl		; ebp = (sum >> lp_quantization)
	add ebp, [esi + edi]	; ebp = residual[i] + (sum >> lp_quantization)
	mov [edi], ebp		; data[i] = residual[i] + (sum >> lp_quantization)
	add edi, byte 4

	dec ebx
	jz short .end
	xor ebp, ebp
	jmp edx

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16.  Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
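; A minimal caller-side sketch of the padding requirement above -- one way a
; channel buffer could be laid out so that data[-1]..data[-3] are readable and
; zero (the names buffer/total_len are illustrative, not part of the FLAC API):
;
;	FLAC__int32 *buffer = (FLAC__int32*)calloc(3 + total_len, sizeof(FLAC__int32));
;	FLAC__int32 *data   = buffer + 3;	/* data[-1..-3] now valid and zero */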
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]
	mov edi, [esp + 40]
	mov eax, [esp + 32]
	mov ebx, [esp + 24]

	test ebx, ebx
	jz near .end		; do nothing if data_len == 0
	cmp eax, byte 4
	jb near FLAC__lpc_restore_signal_asm_ia32.begin

	mov edx, [esp + 28]
	movd mm6, [esp + 36]
	mov ebp, esp
	and esp, 0xfffffff8

	xor ecx, ecx
.copy_qlp_loop:
	push word [edx + 4 * ecx]
	inc ecx
	cmp ecx, eax
	jnz short .copy_qlp_loop

	and ecx, 0x3
	test ecx, ecx
	je short .za_end
	sub ecx, byte 4
.za_loop:
	push word 0
	inc eax
	inc ecx
	jnz short .za_loop
.za_end:

	movq mm5, [esp + 2 * eax - 8]
	movd mm4, [edi - 16]
	punpckldq mm4, [edi - 12]
	movd mm0, [edi - 8]
	punpckldq mm0, [edi - 4]
	packssdw mm4, mm0

	cmp eax, byte 4
	jnbe short .mmx_4more

	align 16
.mmx_4_loop_i:
	movq mm7, mm4
	pmaddwd mm7, mm5
	movq mm0, mm7
	punpckhdq mm7, mm7
	paddd mm7, mm0
	psrad mm7, mm6
	movd mm1, [esi]
	paddd mm7, mm1
	movd [edi], mm7
	psllq mm7, 48
	psrlq mm4, 16
	por mm4, mm7

	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .mmx_4_loop_i

	jmp .mmx_end

.mmx_4more:
	shl eax, 2
	neg eax
	add eax, byte 16

	align 16
.mmx_4more_loop_i:
	mov ecx, edi
	add ecx, eax
	mov edx, esp

	movq mm7, mm4
	pmaddwd mm7, mm5

	align 16
.mmx_4more_loop_j:
	movd mm0, [ecx - 16]
	punpckldq mm0, [ecx - 12]
	movd mm1, [ecx - 8]
	punpckldq mm1, [ecx - 4]
	packssdw mm0, mm1
	pmaddwd mm0, [edx]
	paddd mm7, mm0

	add edx, byte 8
	add ecx, byte 16
	cmp ecx, edi
	jnz .mmx_4more_loop_j

	movq mm0, mm7
	punpckhdq mm7, mm7
	paddd mm7, mm0
	psrad mm7, mm6
	movd mm1, [esi]
	paddd mm7, mm1
	movd [edi], mm7
	psllq mm7, 48
	psrlq mm4, 16
	por mm4, mm7

	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz short .mmx_4more_loop_i

.mmx_end:
	emms
	mov esp, ebp
.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

end
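; For reference, the pmaddwd core of the MMX routine above -- coefficients
; packed to signed 16-bit words, four multiply-adds per instruction -- can be
; sketched with MMX intrinsics. A rough equivalent of one .mmx_4_loop_i step,
; assuming order == 4 and samples that fit in 16 bits; this is an illustrative
; sketch, not the actual FLAC implementation:
;
;	#include <mmintrin.h>
;	/* q0..q3 = qlp_coeff[0..3], d = data, all truncated to 16-bit words */
;	__m64 c = _mm_set_pi16((short)q3, (short)q2, (short)q1, (short)q0);
;	__m64 h = _mm_set_pi16((short)d[i-4], (short)d[i-3],
;	                       (short)d[i-2], (short)d[i-1]);
;	__m64 p = _mm_madd_pi16(c, h);	/* {q0*d[i-1]+q1*d[i-2], q2*d[i-3]+q3*d[i-4]} */
;	FLAC__int32 sum = _mm_cvtsi64_si32(p)
;	                + _mm_cvtsi64_si32(_mm_srli_si64(p, 32));
;	d[i] = residual[i] + (sum >> lp_quantization);
;	_mm_empty();			/* like the emms above */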