📄 quant_mmx.asm
字号:
;/**************************************************************************; *; * XVID MPEG-4 VIDEO CODEC; * mmx quantization/dequantization; *; * This program is an implementation of a part of one or more MPEG-4; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending; * to use this software module in hardware or software products are; * advised that its use may infringe existing patents or copyrights, and; * any such use would be at such party's own risk. The original; * developer of this software module and his/her company, and subsequent; * editors and their companies, will have no liability for use of this; * software or modifications or derivatives thereof.; *; * This program is free software; you can redistribute it and/or modify; * it under the terms of the GNU General Public License as published by; * the Free Software Foundation; either version 2 of the License, or; * (at your option) any later version.; *; * This program is distributed in the hope that it will be useful,; * but WITHOUT ANY WARRANTY; without even the implied warranty of; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the; * GNU General Public License for more details.; *; * You should have received a copy of the GNU General Public License; * along with this program; if not, write to the Free Software; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.; *; *************************************************************************/;/**************************************************************************; *; * History:; *; * 19.11.2001 quant_inter_mmx now returns sum of abs. coefficient values; * 04.11.2001 nasm version; (c)2001 peter ross <pross@cs.rmit.edu.au>; *; *************************************************************************/bits 32section .dataalign 16%macro cglobal 1 %ifdef PREFIX global _%1 %define %1 _%1 %else global %1 %endif%endmacroplus_one times 4 dw 1;===========================================================================;; subtract by Q/2 table;;===========================================================================%macro MMX_SUB 1times 4 dw %1 / 2%endmacrommx_sub MMX_SUB 0 MMX_SUB 1 MMX_SUB 2 MMX_SUB 3 MMX_SUB 4 MMX_SUB 5 MMX_SUB 6 MMX_SUB 7 MMX_SUB 8 MMX_SUB 9 MMX_SUB 10 MMX_SUB 11 MMX_SUB 12 MMX_SUB 13 MMX_SUB 14 MMX_SUB 15 MMX_SUB 16 MMX_SUB 17 MMX_SUB 18 MMX_SUB 19 MMX_SUB 20 MMX_SUB 21 MMX_SUB 22 MMX_SUB 23 MMX_SUB 24 MMX_SUB 25 MMX_SUB 26 MMX_SUB 27 MMX_SUB 28 MMX_SUB 29 MMX_SUB 30 MMX_SUB 31;===========================================================================;; divide by 2Q table ;; use a shift of 16 to take full advantage of _pmulhw_; for q=1, _pmulhw_ will overflow so it is treated seperately; (3dnow2 provides _pmulhuw_ which wont cause overflow);;===========================================================================%macro MMX_DIV 1times 4 dw (1 << 16) / (%1 * 2) + 1%endmacrommx_div MMX_DIV 1 MMX_DIV 1 MMX_DIV 2 MMX_DIV 3 MMX_DIV 4 MMX_DIV 5 MMX_DIV 6 MMX_DIV 7 MMX_DIV 8 MMX_DIV 9 MMX_DIV 10 MMX_DIV 11 MMX_DIV 12 MMX_DIV 13 MMX_DIV 14 MMX_DIV 15 MMX_DIV 16 MMX_DIV 17 MMX_DIV 18 MMX_DIV 19 MMX_DIV 20 MMX_DIV 21 MMX_DIV 22 MMX_DIV 23 MMX_DIV 24 MMX_DIV 25 MMX_DIV 26 MMX_DIV 27 MMX_DIV 28 MMX_DIV 29 MMX_DIV 30 MMX_DIV 31;===========================================================================;; add by (odd(Q) ? Q : Q - 1) table;;===========================================================================%macro MMX_ADD 1%if %1 % 1 != 0times 4 dw %1%elsetimes 4 dw %1 - 1%endif%endmacrommx_add MMX_ADD 0 MMX_ADD 1 MMX_ADD 2 MMX_ADD 3 MMX_ADD 4 MMX_ADD 5 MMX_ADD 6 MMX_ADD 7 MMX_ADD 8 MMX_ADD 9 MMX_ADD 10 MMX_ADD 11 MMX_ADD 12 MMX_ADD 13 MMX_ADD 14 MMX_ADD 15 MMX_ADD 16 MMX_ADD 17 MMX_ADD 18 MMX_ADD 19 MMX_ADD 20 MMX_ADD 21 MMX_ADD 22 MMX_ADD 23 MMX_ADD 24 MMX_ADD 25 MMX_ADD 26 MMX_ADD 27 MMX_ADD 28 MMX_ADD 29 MMX_ADD 30 MMX_ADD 31;===========================================================================;; multiple by 2Q table;;===========================================================================%macro MMX_MUL 1times 4 dw %1 * 2%endmacrommx_mul MMX_MUL 0 MMX_MUL 1 MMX_MUL 2 MMX_MUL 3 MMX_MUL 4 MMX_MUL 5 MMX_MUL 6 MMX_MUL 7 MMX_MUL 8 MMX_MUL 9 MMX_MUL 10 MMX_MUL 11 MMX_MUL 12 MMX_MUL 13 MMX_MUL 14 MMX_MUL 15 MMX_MUL 16 MMX_MUL 17 MMX_MUL 18 MMX_MUL 19 MMX_MUL 20 MMX_MUL 21 MMX_MUL 22 MMX_MUL 23 MMX_MUL 24 MMX_MUL 25 MMX_MUL 26 MMX_MUL 27 MMX_MUL 28 MMX_MUL 29 MMX_MUL 30 MMX_MUL 31section .text;===========================================================================;; void quant_intra_mmx(int16_t * coeff, ; const int16_t const * data,; const uint32_t quant,; const uint32_t dcscalar);;;===========================================================================align 16cglobal enc_quant_intra_mmxenc_quant_intra_mmx push esi push edi mov edi, [esp + 8 + 4] ; coeff mov esi, [esp + 8 + 8] ; data mov eax, [esp + 8 + 12] ; quant cmp al, 1 mov ecx, 8 jz .q1loop movq mm7, [mmx_div + eax * 8].loop movq mm0, [esi] ; mm0 = [1st] movq mm3, [esi + 8] ; pxor mm1, mm1 ; mm1 = 0 pxor mm4, mm4 ; pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) pcmpgtw mm4, mm3 ; pxor mm0, mm1 ; mm0 = |mm0| pxor mm3, mm4 ; psubw mm0, mm1 ; displace psubw mm3, mm4 ; pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 pmulhw mm3, mm7 ; pxor mm0, mm1 ; mm0 *= sign(mm0) pxor mm3, mm4 ; psubw mm0, mm1 ; undisplace psubw mm3, mm4 ; movq [edi], mm0 movq [edi + 8], mm3 add esi, 16 add edi, 16 dec ecx jnz .loop .done mov ecx, [esp + 8 + 16] ; dcscalar mov eax, ecx movsx edx, word [esi - 128] ; data[0] shr eax, 1 ; eax = data[0] + dcscalar>>1 add eax, edx ; cdq ; expand eax -> edx:eax idiv ecx ; eax = edx:eax / dcscalar mov [edi - 128], ax ; data[0] = ax pop edi pop esi ret .q1loop movq mm0, [esi] ; mm0 = [1st] movq mm3, [esi + 8] ; pxor mm1, mm1 ; mm1 = 0 pxor mm4, mm4 ; pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) pcmpgtw mm4, mm3 ; pxor mm0, mm1 ; mm0 = |mm0| pxor mm3, mm4 ; psubw mm0, mm1 ; displace psubw mm3, mm4 ; psrlw mm0, 1 ; mm0 >>= 1 (/2) psrlw mm3, 1 ; pxor mm0, mm1 ; mm0 *= sign(mm0) pxor mm3, mm4 ; psubw mm0, mm1 ; undisplace psubw mm3, mm4 movq [edi], mm0 movq [edi + 8], mm3 add esi, 16 add edi, 16 dec ecx jnz .q1loop jmp short .done;===========================================================================;; uint32_t quant_inter_mmx(int16_t * coeff,; const int16_t const * data,; const uint32_t quant);;;===========================================================================align 16cglobal enc_quant_inter_mmxenc_quant_inter_mmx push esi push edi mov edi, [esp + 8 + 4] ; coeff mov esi, [esp + 8 + 8] ; data mov eax, [esp + 8 + 12] ; quant pxor mm5, mm5 ; present movq mm6, [mmx_sub + eax * 8] ; sub cmp al, 1 mov ecx, 8 jz .q1loop movq mm7, [mmx_div + eax * 8] ; divider.loop movq mm0, [esi] ; mm0 = [1st] movq mm3, [esi + 8] ; pxor mm1, mm1 ; mm1 = 0 pxor mm4, mm4 ; pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) pcmpgtw mm4, mm3 ; pxor mm0, mm1 ; mm0 = |mm0| pxor mm3, mm4 ; psubw mm0, mm1 ; displace psubw mm3, mm4 ; psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0) psubusw mm3, mm6 ; pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 pmulhw mm3, mm7 ; movq mm2, mm0 ; mm0 -> mm2 pxor mm0, mm1 ; mm0 *= sign(mm0) psubw mm0, mm1 ; undisplace movq mm1, mm3 ; mm3 -> mm1 pxor mm3, mm4 psubw mm3, mm4 movq [edi], mm0 movq [edi + 8], mm3 paddw mm2, mm1 ; mm1 + mm2 -> mm2 pmaddwd mm2, [plus_one] ; movq mm0, mm2 ; psrlq mm0, 32 ; paddd mm0, mm2 ; paddd mm5, mm0 ; sum += abs(coeff0) + abs(coeff1) + ... + abs(coeff7) add esi, 16 add edi, 16 dec ecx jnz .loop .done movd eax, mm5 ; return sum pop edi pop esi ret.q1loop movq mm0, [esi] ; mm0 = [1st] movq mm3, [esi + 8] ; pxor mm1, mm1 ; mm1 = 0 pxor mm4, mm4 ; pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) pcmpgtw mm4, mm3 ; pxor mm0, mm1 ; mm0 = |mm0| pxor mm3, mm4 ; psubw mm0, mm1 ; displace psubw mm3, mm4 ; psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0) psubusw mm3, mm6 ; psrlw mm0, 1 ; mm0 >>= 1 (/2) psrlw mm3, 1 ; pxor mm0, mm1 ; mm0 *= sign(mm0) pxor mm3, mm4 ; psubw mm0, mm1 ; undisplace psubw mm3, mm4 movq [edi], mm0 movq [edi + 8], mm3 paddw mm0, mm3 ; mm0 + mm3 -> mm0 pmaddwd mm0, [plus_one] ; movq mm2, mm0 ; psrlq mm2, 32 ; paddd mm0, mm2 ; paddd mm5, mm0 ; sum += abs(coeff0) + abs(coeff1) + ... + abs(coeff7) add esi, 16 add edi, 16 dec ecx jnz .q1loop jmp .done;===========================================================================;; void dequant_intra_mmx(int16_t *data,; const int16_t const *coeff,; const uint32_t quant,; const uint32_t dcscalar);;;===========================================================================align 16cglobal enc_dequant_intra_mmxenc_dequant_intra_mmx push esi push edi mov edi, [esp + 8 + 4] ; data mov esi, [esp + 8 + 8] ; coeff mov eax, [esp + 8 + 12] ; quant movq mm6, [mmx_add + eax * 8] mov ecx, 8 movq mm7, [mmx_mul + eax * 8] .loop movq mm0, [esi] ; mm0 = [1st] movq mm3, [esi + 8] ; pxor mm1, mm1 ; mm1 = 0 pxor mm4, mm4 ; pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) pcmpgtw mm4, mm3 ; pxor mm2, mm2 ; mm2 = 0 pxor mm5, mm5 ; pcmpeqw mm2, mm0 ; mm2 = (0 == mm0) pcmpeqw mm5, mm3 ; pandn mm2, mm6 ; mm2 = (iszero ? 0 : add) pandn mm5, mm6 ; pxor mm0, mm1 ; mm0 = |mm0| pxor mm3, mm4 ; psubw mm0, mm1 ; displace psubw mm3, mm4 ; pmullw mm0, mm7 ; mm0 *= 2Q pmullw mm3, mm7 ; paddw mm0, mm2 ; mm0 += mm2 (add) paddw mm3, mm5 ; pxor mm0, mm1 ; mm0 *= sign(mm0) pxor mm3, mm4 ; psubw mm0, mm1 ; undisplace psubw mm3, mm4 movq [edi], mm0 movq [edi + 8], mm3 add esi, 16 add edi, 16 dec ecx jnz .loop mov ax, [esi - 128] ; ax = data[0] imul ax, [esp + 8 + 16] ; eax = data[0] * dcscalar mov [edi - 128], ax ; data[0] = ax pop edi pop esi ret;===========================================================================;; void dequant_inter_mmx(int16_t * data,; const int16_t * const coeff,; const uint32_t quant);;;===========================================================================align 16cglobal enc_dequant_inter_mmxenc_dequant_inter_mmx push esi push edi mov edi, [esp + 8 + 4] ; data mov esi, [esp + 8 + 8] ; coeff mov eax, [esp + 8 + 12] ; quant movq mm6, [mmx_add + eax * 8] movq mm7, [mmx_mul + eax * 8] mov ecx, 8 .loop movq mm0, [esi] ; mm0 = [1st] movq mm3, [esi + 8] ; pxor mm1, mm1 ; mm1 = 0 pxor mm4, mm4 ; pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) pcmpgtw mm4, mm3 ; pxor mm2, mm2 ; mm2 = 0 pxor mm5, mm5 ; pcmpeqw mm2, mm0 ; mm2 = (0 == mm0) pcmpeqw mm5, mm3 ; pandn mm2, mm6 ; mm2 = (iszero ? 0 : add) pandn mm5, mm6 ; pxor mm0, mm1 ; mm0 = |mm0| pxor mm3, mm4 ; psubw mm0, mm1 ; displace psubw mm3, mm4 ; pmullw mm0, mm7 ; mm0 *= 2Q pmullw mm3, mm7 ; paddw mm0, mm2 ; mm0 += mm2 (add) paddw mm3, mm5 ; pxor mm0, mm1 ; mm0 *= sign(mm0) pxor mm3, mm4 ; psubw mm0, mm1 ; undisplace psubw mm3, mm4 movq [edi], mm0 movq [edi + 8], mm3 add esi, 16 add edi, 16 dec ecx jnz .loop pop edi pop esi ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -