📄 quantize_mmx.asm
字号:
;/**************************************************************************
; *
; *	XVID MPEG-4 VIDEO CODEC
; *	mmx quantization/dequantization
; *
; *	This program is an implementation of a part of one or more MPEG-4
; *	Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
; *	to use this software module in hardware or software products are
; *	advised that its use may infringe existing patents or copyrights, and
; *	any such use would be at such party's own risk.  The original
; *	developer of this software module and his/her company, and subsequent
; *	editors and their companies, will have no liability for use of this
; *	software or modifications or derivatives thereof.
; *
; *	This program is free software; you can redistribute it and/or modify
; *	it under the terms of the GNU General Public License as published by
; *	the Free Software Foundation; either version 2 of the License, or
; *	(at your option) any later version.
; *
; *	This program is distributed in the hope that it will be useful,
; *	but WITHOUT ANY WARRANTY; without even the implied warranty of
; *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *	GNU General Public License for more details.
; *
; *	You should have received a copy of the GNU General Public License
; *	along with this program; if not, write to the Free Software
; *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/

;/**************************************************************************
; *
; *	History:
; *
; * 26.12.2001	minor bug fixes, dequant saturate, further optimization
; * 19.11.2001	quant_inter_mmx now returns sum of abs. coefficient values
; * 04.11.2001	nasm version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; *************************************************************************/

;---------------------------------------------------------------------------
; Syntax : NASM (Intel operand order), 32-bit code.
; ABI    : arguments are read from the stack above the return address
;          ([esp + pushed_bytes + 4], ...); esi/edi are saved and restored
;          by every routine; results are returned in eax.
; Layout : every routine processes one block of 64 int16 coefficients
;          (BLOCK_QWORDS * 4 words), two quadwords per loop iteration.
; Tables : indexed as [table + quant*8 - 8], i.e. entry 0 belongs to
;          quant = 1 and the last entry to quant = 31.  The quant argument
;          is not range-checked.
; NOTE(review): none of the routines executes emms; presumably the caller
;          resets the MMX/FPU state -- confirm.
;---------------------------------------------------------------------------

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

; data/text alignment
%define ALIGN 8

; number of 8-byte quadwords in one 64-coefficient block (loop bound)
%define BLOCK_QWORDS 16

; dequantized values are clamped to this range when SATURATE is defined
%define SAT_MIN (-2048)
%define SAT_MAX 2047

bits 32

section .data

; cglobal <name>: export <name>; when PREFIX is defined the exported symbol
; gets a leading underscore and <name> is aliased to it.
%macro cglobal 1
        %ifdef PREFIX
                global _%1
                %define %1 _%1
        %else
                global %1
        %endif
%endmacro

; four words of 1: pmaddwd against this folds word lanes into dword sums
plus_one:
        times 4 dw 1

;===========================================================================
;
; subtract by Q/2 table
;
;===========================================================================

%macro MMX_SUB 1
        times 4 dw %1 / 2
%endmacro

align ALIGN
mmx_sub:
%assign QUANT_IDX 1
%rep 31
        MMX_SUB QUANT_IDX
%assign QUANT_IDX QUANT_IDX + 1
%endrep

;===========================================================================
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;===========================================================================

%macro MMX_DIV 1
        times 4 dw (1 << 16) / (%1 * 2) + 1
%endmacro

align ALIGN
mmx_div:
%assign QUANT_IDX 1
%rep 31
        MMX_DIV QUANT_IDX
%assign QUANT_IDX QUANT_IDX + 1
%endrep

;===========================================================================
;
; add by (odd(Q) ? Q : Q - 1) table
;
;===========================================================================

%macro MMX_ADD 1
%if %1 % 2 != 0
        times 4 dw %1
%else
        times 4 dw %1 - 1
%endif
%endmacro

align ALIGN
mmx_add:
%assign QUANT_IDX 1
%rep 31
        MMX_ADD QUANT_IDX
%assign QUANT_IDX QUANT_IDX + 1
%endrep

;===========================================================================
;
; multiply by 2Q table
;
;===========================================================================

%macro MMX_MUL 1
        times 4 dw %1 * 2
%endmacro

align ALIGN
mmx_mul:
%assign QUANT_IDX 1
%rep 31
        MMX_MUL QUANT_IDX
%assign QUANT_IDX QUANT_IDX + 1
%endrep

%undef QUANT_IDX

;===========================================================================
;
; saturation limits
;
; adding then subtracting the first constant with signed saturation clamps
; a word to <= 2047; subtracting then adding the second clamps it to >= -2048
;
;===========================================================================

align ALIGN
mmx_32768_minus_2048:
        times 4 dw (32768 - 2048)
mmx_32767_minus_2047:
        times 4 dw (32767 - 2047)

section .text

;===========================================================================
;
; void quant_intra_mmx(int16_t * coeff,
;                      const int16_t * const data,
;                      const uint32_t quant,
;                      const uint32_t dcscalar);
;
; For every coefficient: coeff[i] = sign(data[i]) * (|data[i]| / (2*quant)).
; quant == 1 uses a right shift by one instead of the mmx_div multiply.
; coeff[0] is then overwritten with data[0] divided by dcscalar, rounded by
; adding (data[0] > 0) or subtracting (otherwise) dcscalar/2 before the
; signed division.
;
; Registers: edi = coeff, esi = data, ecx = quadword index,
;            mm7 = reciprocal from mmx_div.
; Writes eax, edx, mm0, mm1, mm3, mm4, mm7 and flags; ecx/esi/edi restored.
;
;===========================================================================

align ALIGN
cglobal quant_intra_mmx
quant_intra_mmx:
        push    ecx
        push    esi
        push    edi

        mov     edi, [esp + 12 + 4]             ; coeff
        mov     esi, [esp + 12 + 8]             ; data
        mov     eax, [esp + 12 + 12]            ; quant

        xor     ecx, ecx                        ; quadword index = 0
        cmp     al, 1
        jz      .q1loop                         ; q=1 would overflow pmulhw

        movq    mm7, [mmx_div + eax * 8 - 8]    ; mm7 = 65536/(2Q) + 1

align ALIGN
.loop:
        movq    mm0, [esi + 8*ecx]              ; mm0 = data[4 words]
        movq    mm3, [esi + 8*ecx + 8]          ; mm3 = next 4 words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; mm1 = (0 > mm0) sign mask
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                        ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmulhw  mm0, mm7                        ; (|x| * div) >> 16 ~= |x| / 2Q
        pmulhw  mm3, mm7
        pxor    mm0, mm1                        ; re-apply the sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [edi + 8*ecx], mm0
        movq    [edi + 8*ecx + 8], mm3

        add     ecx, 2
        cmp     ecx, BLOCK_QWORDS
        jnz     .loop

.done:
        ; calculate coeff[0] = data[0] / (int32_t)dcscalar, rounded
        mov     ecx, [esp + 12 + 16]            ; dcscalar
        mov     edx, ecx
        movsx   eax, word [esi]                 ; eax = data[0]
        shr     edx, 1                          ; edx = dcscalar / 2
        test    eax, eax
        jg      .gtzero

        sub     eax, edx                        ; data[0] <= 0: round downwards
        jmp     short .mul
.gtzero:
        add     eax, edx                        ; data[0] > 0: round upwards
.mul:
        cdq                                     ; sign-extend eax -> edx:eax
        idiv    ecx                             ; eax = edx:eax / dcscalar
        mov     [edi], ax                       ; coeff[0] = ax

        pop     edi
        pop     esi
        pop     ecx

        ret

align ALIGN
.q1loop:                                        ; quant == 1: divide by 2 via shift
        movq    mm0, [esi + 8*ecx]              ; mm0 = data[4 words]
        movq    mm3, [esi + 8*ecx + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; sign masks
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                        ; |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psrlw   mm0, 1                          ; |x| >>= 1  (/2)
        psrlw   mm3, 1
        pxor    mm0, mm1                        ; re-apply the sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [edi + 8*ecx], mm0
        movq    [edi + 8*ecx + 8], mm3

        add     ecx, 2
        cmp     ecx, BLOCK_QWORDS
        jnz     .q1loop
        jmp     short .done

;===========================================================================
;
; uint32_t quant_inter_mmx(int16_t * coeff,
;                          const int16_t * const data,
;                          const uint32_t quant);
;
; For every coefficient:
;     level    = max(|data[i]| - quant/2, 0) / (2*quant)
;     coeff[i] = sign(data[i]) * level
; Returns the sum of all levels (sum of absolute quantized values) in eax.
; quant == 1 uses a right shift by one instead of the mmx_div multiply.
;
; Registers: edi = coeff, esi = data, ecx = quadword index,
;            mm5 = per-lane word sums, mm6 = quant/2, mm7 = reciprocal.
; Writes eax, mm0, mm1, mm3-mm7 and flags; ecx/esi/edi restored.
;
;===========================================================================

align ALIGN
cglobal quant_inter_mmx
quant_inter_mmx:
        push    ecx
        push    esi
        push    edi

        mov     edi, [esp + 12 + 4]             ; coeff
        mov     esi, [esp + 12 + 8]             ; data
        mov     eax, [esp + 12 + 12]            ; quant

        xor     ecx, ecx                        ; quadword index = 0

        pxor    mm5, mm5                        ; sum = 0
        movq    mm6, [mmx_sub + eax * 8 - 8]    ; mm6 = Q/2

        cmp     al, 1
        jz      .q1loop                         ; q=1 would overflow pmulhw

        movq    mm7, [mmx_div + eax * 8 - 8]    ; mm7 = 65536/(2Q) + 1 (divisor)

align ALIGN
.loop:
        movq    mm0, [esi + 8*ecx]              ; mm0 = data[4 words]
        movq    mm3, [esi + 8*ecx + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; mm1 = (0 > mm0) sign mask
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                        ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                        ; |x| -= Q/2 (unsigned, floors at 0)
        psubusw mm3, mm6
        pmulhw  mm0, mm7                        ; (|x| * div) >> 16 ~= |x| / 2Q
        pmulhw  mm3, mm7
        paddw   mm5, mm0                        ; sum += level (before sign)
        pxor    mm0, mm1                        ; re-apply the sign
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [edi + 8*ecx], mm0
        movq    [edi + 8*ecx + 8], mm3

        add     ecx, 2
        cmp     ecx, BLOCK_QWORDS
        jnz     .loop

.done:
        pmaddwd mm5, [plus_one]                 ; 4 word sums -> 2 dword sums
        movq    mm0, mm5
        psrlq   mm5, 32
        paddd   mm0, mm5                        ; low dword = total
        movd    eax, mm0                        ; return sum

        pop     edi
        pop     esi
        pop     ecx

        ret

align ALIGN
.q1loop:                                        ; quant == 1: divide by 2 via shift
        movq    mm0, [esi + 8*ecx]              ; mm0 = data[4 words]
        movq    mm3, [esi + 8*ecx + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; sign masks
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                        ; |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                        ; |x| -= Q/2 (unsigned, floors at 0)
        psubusw mm3, mm6
        psrlw   mm0, 1                          ; |x| >>= 1  (/2)
        psrlw   mm3, 1
        paddw   mm5, mm0                        ; sum += level (before sign)
        pxor    mm0, mm1                        ; re-apply the sign
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [edi + 8*ecx], mm0
        movq    [edi + 8*ecx + 8], mm3

        add     ecx, 2
        cmp     ecx, BLOCK_QWORDS
        jnz     .q1loop

        jmp     .done

;===========================================================================
;
; void dequant_intra_mmx(int16_t * data,
;                        const int16_t * const coeff,
;                        const uint32_t quant,
;                        const uint32_t dcscalar);
;
; For every coefficient:
;     data[i] = 0                                     if coeff[i] == 0
;     data[i] = sign(c) * (|c| * 2*quant + add)       otherwise,
; where add = quant if quant is odd, else quant - 1.  With SATURATE defined
; the result is clamped to [-2048, 2047].
; data[0] is then overwritten with coeff[0] * dcscalar.  The product is
; formed in 32 bits so that the SATURATE clamp sees the true value rather
; than one already wrapped to 16 bits.
;
; Registers: edi = data, esi = coeff, eax = quadword index,
;            mm6 = add term, mm7 = 2*quant.
; Writes eax, mm0-mm7 and flags; esi/edi restored.
;
;===========================================================================

align ALIGN
cglobal dequant_intra_mmx
dequant_intra_mmx:
        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]              ; data
        mov     esi, [esp + 8 + 8]              ; coeff
        mov     eax, [esp + 8 + 12]             ; quant

        movq    mm6, [mmx_add + eax * 8 - 8]    ; mm6 = odd(Q) ? Q : Q - 1
        movq    mm7, [mmx_mul + eax * 8 - 8]    ; mm7 = 2Q
        xor     eax, eax                        ; quadword index = 0

align ALIGN
.loop:
        movq    mm0, [esi + 8*eax]              ; mm0 = coeff[4 words]
        movq    mm3, [esi + 8*eax + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; mm1 = (0 > mm0) sign mask
        pcmpgtw mm4, mm3
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                        ; mm2 = (0 == mm0) zero mask
        pcmpeqw mm5, mm3
        pandn   mm2, mm6                        ; mm2 = (iszero ? 0 : add)
        pandn   mm5, mm6
        pxor    mm0, mm1                        ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                        ; |x| *= 2Q
        pmullw  mm3, mm7
        paddw   mm0, mm2                        ; += add (0 for zero coefficients)
        paddw   mm3, mm5
        pxor    mm0, mm1                        ; re-apply the sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4

%ifdef SATURATE
        ; mm2/mm4 are scratch at this point, so the limits are reloaded
        ; on every iteration
        movq    mm2, [mmx_32767_minus_2047]
        movq    mm4, [mmx_32768_minus_2048]
        paddsw  mm0, mm2                        ; clamp to <= 2047
        paddsw  mm3, mm2
        psubsw  mm0, mm2
        psubsw  mm3, mm2
        psubsw  mm0, mm4                        ; clamp to >= -2048
        psubsw  mm3, mm4
        paddsw  mm0, mm4
        paddsw  mm3, mm4
%endif

        movq    [edi + 8*eax], mm0              ; data[] = mm0
        movq    [edi + 8*eax + 8], mm3

        add     eax, 2
        cmp     eax, BLOCK_QWORDS
        jnz     near .loop

        ; DC term: 32-bit multiply, then clamp, then store the low word
        movsx   eax, word [esi]                 ; eax = coeff[0]
        imul    eax, dword [esp + 8 + 16]       ; eax = coeff[0] * dcscalar

%ifdef SATURATE
        cmp     eax, SAT_MIN
        jl      .set_n2048
        cmp     eax, SAT_MAX
        jg      .set_2047
%endif
        mov     [edi], ax

        pop     edi
        pop     esi
        ret

%ifdef SATURATE
align ALIGN
.set_n2048:
        mov     word [edi], SAT_MIN
        pop     edi
        pop     esi
        ret

align ALIGN
.set_2047:
        mov     word [edi], SAT_MAX
        pop     edi
        pop     esi
        ret
%endif

;===========================================================================
;
; void dequant_inter_mmx(int16_t * data,
;                        const int16_t * const coeff,
;                        const uint32_t quant);
;
; For every coefficient:
;     data[i] = 0                                     if coeff[i] == 0
;     data[i] = sign(c) * (|c| * 2*quant + add)       otherwise,
; where add = quant if quant is odd, else quant - 1.  With SATURATE defined
; the result is clamped to [-2048, 2047].
;
; Registers: edi = data, esi = coeff, eax = quadword index,
;            mm6 = add term, mm7 = 2*quant.
; Writes eax, mm0-mm7 and flags; esi/edi restored.
;
;===========================================================================

align ALIGN
cglobal dequant_inter_mmx
dequant_inter_mmx:
        push    esi
        push    edi

        mov     edi, [esp + 8 + 4]              ; data
        mov     esi, [esp + 8 + 8]              ; coeff
        mov     eax, [esp + 8 + 12]             ; quant
        movq    mm6, [mmx_add + eax * 8 - 8]    ; mm6 = odd(Q) ? Q : Q - 1
        movq    mm7, [mmx_mul + eax * 8 - 8]    ; mm7 = 2Q

        xor     eax, eax                        ; quadword index = 0

align ALIGN
.loop:
        movq    mm0, [esi + 8*eax]              ; mm0 = coeff[4 words]
        movq    mm3, [esi + 8*eax + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; mm1 = (0 > mm0) sign mask
        pcmpgtw mm4, mm3
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                        ; mm2 = (0 == mm0) zero mask
        pcmpeqw mm5, mm3
        pandn   mm2, mm6                        ; mm2 = (iszero ? 0 : add)
        pandn   mm5, mm6
        pxor    mm0, mm1                        ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                        ; |x| *= 2Q
        pmullw  mm3, mm7
        paddw   mm0, mm2                        ; += add (0 for zero coefficients)
        paddw   mm3, mm5
        pxor    mm0, mm1                        ; re-apply the sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4

%ifdef SATURATE
        ; mm2/mm4 are scratch at this point, so the limits are reloaded
        ; on every iteration
        movq    mm2, [mmx_32767_minus_2047]
        movq    mm4, [mmx_32768_minus_2048]
        paddsw  mm0, mm2                        ; clamp to <= 2047
        paddsw  mm3, mm2
        psubsw  mm0, mm2
        psubsw  mm3, mm2
        psubsw  mm0, mm4                        ; clamp to >= -2048
        psubsw  mm3, mm4
        paddsw  mm0, mm4
        paddsw  mm3, mm4
%endif

        movq    [edi + 8*eax], mm0
        movq    [edi + 8*eax + 8], mm3

        add     eax, 2
        cmp     eax, BLOCK_QWORDS
        jnz     near .loop

        pop     edi
        pop     esi

        ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -