📄 quantize_h263_mmx.asm
字号:
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MPEG4 Quantization H263 implementation / MMX optimized -
; *
; *  Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
; *               2002-2003 Pascal Massimino <skal@planet-d.net>
; *               2004      Jean-Marc Bastide <jmtest@voila.fr>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_h263_mmx.asm,v 1.7 2004/08/29 10:02:38 edgomez Exp $
; *
; ****************************************************************************/

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     : the exported symbol gets a leading underscore and the bare
;                name is %define'd so later uses expand to the prefixed form.
;   MARK_FUNCS : the symbol is exported with a ":function" type and a size
;                computed from the routine's local ".endfunc" label.
; NOTE(review): when both PREFIX and MARK_FUNCS are defined, the %define below
; also carries the ":function ..." suffix into every later use of the name --
; confirm that this combination is actually built anywhere.
;-----------------------------------------------------------------------------

%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Read only Local data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

; eight words of 1; used with pmaddwd to add adjacent 16-bit lanes together
ALIGN 16
plus_one:
	times 8 dw 1

;-----------------------------------------------------------------------------
;
; quant table
;
; 32 rows (quant = 0..31), each row is the value replicated into 4 words.
; Not referenced by the routines visible in this part of the file.
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_quant:
%assign quant 0
%rep 32
	times 4 dw quant
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; subtract by Q/2 table
;
; 31 rows (quant = 1..31), 4 words (8 bytes) per row, so the row for a given
; quant lives at [mmx_sub + quant*8 - 8].  The row for quant = 1 is 0.
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_sub:
%assign quant 1
%rep 31
	times 4 dw quant / 2
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
; 31 rows (quant = 1..31), 4 words (8 bytes) per row, so the row for a given
; quant lives at [mmx_div + quant*8 - 8].
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
%assign quant 1
%rep 31
	times 4 dw (1<<16) / (quant*2) + 1
	%assign quant quant+1
%endrep

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal quant_h263_intra_mmx
cglobal quant_h263_intra_sse2
cglobal quant_h263_inter_mmx
cglobal quant_h263_inter_sse2

cglobal dequant_h263_intra_mmx
cglobal dequant_h263_intra_xmm
cglobal dequant_h263_intra_sse2

cglobal dequant_h263_inter_mmx
cglobal dequant_h263_inter_xmm
cglobal dequant_h263_inter_sse2

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t *mpeg_matrices);
;
; Arguments are read from the stack; mpeg_matrices is never read here.
; Returns 0 in eax.
;
; Writes 64 words to coeff (4 iterations x 32 bytes).  coeff[0] is then
; overwritten with the DC term, computed with a real signed division:
;     (data[0] +/- dcscalar/2) / dcscalar
;
; Registers: esi = data pointer (saved/restored), edx = coeff pointer,
;            ecx = dcscalar/2, then quant, then loop counter,
;            eax = DC quotient, mm0-mm5 and mm7 are clobbered.
; No emms is executed in this routine.
; NOTE(review): presumably the caller is responsible for emms -- confirm.
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_mmx:

  push esi

  ; --- DC coefficient ---------------------------------------------------
  mov esi, [esp + 4 + 8]            ; data
  mov ecx, [esp + 4 + 16]           ; dcscalar
  movsx eax, word [esi]             ; data[0]
  sar ecx, 1                        ; dcscalar /2
  mov edx, eax
  sar edx, 31                       ; sgn(data[0]): 0 or -1
  xor ecx, edx                      ; *sgn(data[0]) (one's complement if negative)
  sub eax, edx                      ; +1 if negative, completing the negation
  add eax, ecx                      ; + (dcscalar/2)*sgn(data[0])
  mov ecx, [esp + 4 + 12]           ; quant
  cdq                               ; sign-extend eax into edx:eax for idiv
  idiv dword [esp + 4 + 16]         ; dcscalar  -> eax = DC result (kept until .end)
  cmp ecx, 1
  mov edx, [esp + 4 + 4]            ; coeff (mov leaves the flags from cmp intact)
  je .low                           ; quant == 1 uses a shift instead of pmulhw

  movq mm7, [mmx_div+ecx * 8 - 8]   ; mm7 = 4 x (65536/(2*quant) + 1)
  mov ecx, 4                        ; 4 iterations of 16 coefficients

.loop:
  movq mm0, [esi]                   ; data
  pxor mm4, mm4
  movq mm1, [esi + 8]
  pcmpgtw mm4, mm0                  ; (data<0) -> 0xFFFF in negative lanes
  pxor mm5, mm5
  pmulhw mm0, mm7                   ; /(2*quant): high word of data*div
  pcmpgtw mm5, mm1
  movq mm2, [esi+16]
  psubw mm0, mm4                    ; +(data<0)
  pmulhw mm1, mm7
  pxor mm4, mm4
  movq mm3, [esi+24]
  pcmpgtw mm4, mm2
  psubw mm1, mm5
  pmulhw mm2, mm7
  pxor mm5, mm5
  pcmpgtw mm5, mm3
  pmulhw mm3, mm7
  psubw mm2, mm4
  psubw mm3, mm5
  movq [edx], mm0
  lea esi, [esi+32]
  movq [edx + 8], mm1
  movq [edx + 16], mm2
  movq [edx + 24], mm3

  dec ecx
  lea edx, [edx+32]                 ; lea does not disturb the flags set by dec
  jne .loop
  jmp .end

  ; --- quant == 1: divide by 2 with an arithmetic shift -----------------
.low:
  movd mm7, ecx                     ; shift count = quant = 1
  mov ecx, 4                        ; 4 iterations of 16 coefficients
.loop_low:
  movq mm0, [esi]
  pxor mm4, mm4
  movq mm1, [esi + 8]
  pcmpgtw mm4, mm0                  ; (data<0)
  pxor mm5, mm5
  psubw mm0, mm4                    ; data + (data<0), applied before the shift
  pcmpgtw mm5, mm1
  psraw mm0, mm7                    ; >> 1
  psubw mm1, mm5
  movq mm2, [esi+16]
  pxor mm4, mm4
  psraw mm1, mm7
  pcmpgtw mm4, mm2
  pxor mm5, mm5
  psubw mm2, mm4
  movq mm3, [esi+24]
  pcmpgtw mm5, mm3
  psraw mm2, mm7
  psubw mm3, mm5
  movq [edx], mm0
  psraw mm3, mm7
  movq [edx + 8], mm1
  movq [edx+16], mm2
  lea esi, [esi+32]
  movq [edx+24], mm3

  dec ecx
  lea edx, [edx+32]
  jne .loop_low

.end:
  mov edx, [esp + 4 + 4]            ; coeff
  mov [edx], ax                     ; coeff[0] = DC result from the idiv above
  xor eax, eax                      ; return 0
  pop esi
  ret
.endfunc:

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_sse2(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint32_t dcscalar,
;                                const uint16_t *mpeg_matrices);
;
; Same structure as quant_h263_intra_mmx, on 128-bit registers:
; 2 iterations x 64 bytes = 64 words.  Returns 0 in eax; mpeg_matrices is
; never read here.
;
; data and coeff are accessed with movdqa, so both must be 16-byte aligned.
;
; Registers: esi = data pointer (saved/restored), edx = coeff pointer,
;            ecx = dcscalar/2, then quant, then loop counter,
;            eax = DC quotient, xmm0-xmm5 and xmm7 are clobbered.
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_sse2:

  push esi

  ; --- DC coefficient ---------------------------------------------------
  mov esi, [esp + 4 + 8]            ; data
  movsx eax, word [esi]             ; data[0]
  mov ecx, [esp + 4 + 16]           ; dcscalar
  mov edx, eax
  sar ecx, 1                        ; dcscalar/2
  add eax, ecx                      ; data[0] + dcscalar/2
  sub edx, ecx                      ; data[0] - dcscalar/2 (sets flags for cmovl)
  cmovl eax, edx                    ; +/- dcscalar/2: take the "-" form when data[0] < dcscalar/2
  mov ecx, [esp + 4 + 12]           ; quant
  cdq                               ; sign-extend eax into edx:eax for idiv
  idiv dword [esp + 4 + 16]         ; dcscalar  -> eax = DC result (kept until .end)
  cmp ecx, 1
  mov edx, [esp + 4 + 4]            ; coeff
  movq xmm7, [mmx_div+ecx * 8 - 8]  ; low 8 bytes = divider row (loads keep the cmp flags)
  je .low                           ; quant == 1 uses a shift instead of pmulhw

  mov ecx, 2                        ; 2 iterations of 32 coefficients
  movlhps xmm7, xmm7                ; replicate the divider row into the high 8 bytes

.loop:
  movdqa xmm0, [esi]
  pxor xmm4, xmm4
  movdqa xmm1, [esi + 16]
  pcmpgtw xmm4, xmm0                ; (data<0)
  pxor xmm5, xmm5
  pmulhw xmm0, xmm7                 ; /(2*quant): high word of data*div
  pcmpgtw xmm5, xmm1
  movdqa xmm2, [esi+32]
  psubw xmm0, xmm4                  ; +(data<0)
  pmulhw xmm1, xmm7
  pxor xmm4, xmm4
  movdqa xmm3, [esi+48]
  pcmpgtw xmm4, xmm2
  psubw xmm1, xmm5
  pmulhw xmm2, xmm7
  pxor xmm5, xmm5
  pcmpgtw xmm5, xmm3
  pmulhw xmm3, xmm7
  psubw xmm2, xmm4
  psubw xmm3, xmm5
  movdqa [edx], xmm0
  lea esi, [esi+64]
  movdqa [edx + 16], xmm1
  movdqa [edx + 32], xmm2
  movdqa [edx + 48], xmm3

  dec ecx
  lea edx, [edx+64]                 ; lea does not disturb the flags set by dec
  jne .loop
  jmp .end

  ; --- quant == 1: divide by 2 with an arithmetic shift -----------------
.low:
  movd xmm7, ecx                    ; shift count = quant = 1
  mov ecx, 2                        ; 2 iterations of 32 coefficients
.loop_low:
  movdqa xmm0, [esi]
  pxor xmm4, xmm4
  movdqa xmm1, [esi + 16]
  pcmpgtw xmm4, xmm0                ; (data<0)
  pxor xmm5, xmm5
  psubw xmm0, xmm4                  ; data + (data<0), applied before the shift
  pcmpgtw xmm5, xmm1
  psraw xmm0, xmm7                  ; >> 1
  psubw xmm1, xmm5
  movdqa xmm2, [esi+32]
  pxor xmm4, xmm4
  psraw xmm1, xmm7
  pcmpgtw xmm4, xmm2
  pxor xmm5, xmm5
  psubw xmm2, xmm4
  movdqa xmm3, [esi+48]
  pcmpgtw xmm5, xmm3
  psraw xmm2, xmm7
  psubw xmm3, xmm5
  movdqa [edx], xmm0
  psraw xmm3, xmm7
  movdqa [edx+16], xmm1
  movdqa [edx+32], xmm2
  lea esi, [esi+64]
  movdqa [edx+48], xmm3

  dec ecx
  lea edx, [edx+64]
  jne .loop_low

.end:
  mov edx, [esp + 4 + 4]            ; coeff
  mov [edx], ax                     ; coeff[0] = DC result from the idiv above
  xor eax, eax                      ; return 0
  pop esi
  ret
.endfunc:

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint16_t *mpeg_matrices);
;
; Arguments are read from the stack; mpeg_matrices is never read here.
; Per coefficient: take |x|, subtract quant/2 with unsigned saturation,
; divide by 2*quant (pmulhw, or a 1-bit shift when quant == 1), then restore
; the sign.  Processes 64 words (8 iterations x 16 bytes).
;
; Returns in eax the sum of the quantized magnitudes (accumulated before the
; sign is restored).
;
; Registers: edi = coeff, esi = data, ecx = index in 8-byte units
;            (all three saved/restored), eax = quant then the sum,
;            mm0, mm1, mm3-mm7 are clobbered.
; No emms is executed in this routine.
; NOTE(review): presumably the caller is responsible for emms -- confirm.
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_inter_mmx:

  push ecx
  push esi
  push edi

  mov edi, [esp + 12 + 4]           ; coeff
  mov esi, [esp + 12 + 8]           ; data
  mov eax, [esp + 12 + 12]          ; quant

  xor ecx, ecx

  pxor mm5, mm5                     ; sum
  movq mm6, [mmx_sub + eax * 8 - 8] ; sub = 4 x (quant/2)

  cmp al, 1                         ; only the low byte of quant is compared
  jz .q1loop

  movq mm7, [mmx_div + eax * 8 - 8] ; divider

ALIGN 8
.loop:
  movq mm0, [esi + 8*ecx]           ; mm0 = [1st]
  movq mm3, [esi + 8*ecx + 8]       ; mm3 = [2nd]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4
  psubusw mm0, mm6                  ; mm0 -= sub (unsigned, don't go < 0)
  psubusw mm3, mm6
  pmulhw mm0, mm7                   ; mm0 = (mm0 * div) >> 16, i.e. mm0 / 2Q
  pmulhw mm3, mm7
  paddw mm5, mm0                    ; sum += mm0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  paddw mm5, mm3
  pxor mm3, mm4
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [edi + 8*ecx], mm0
  movq [edi + 8*ecx + 8], mm3

  add ecx, 2
  cmp ecx, 16
  jnz .loop

.done:
  pmaddwd mm5, [plus_one]           ; add adjacent word lanes -> 2 dwords
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5                    ; low dword = total of all four lanes

  movd eax, mm0                     ; return sum
  pop edi
  pop esi
  pop ecx

  ret

  ; --- quant == 1: divide by 2 with a logical shift ---------------------
ALIGN 8
.q1loop:
  movq mm0, [esi + 8*ecx]           ; mm0 = [1st]
  movq mm3, [esi + 8*ecx+ 8]        ; mm3 = [2nd]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4
  psubusw mm0, mm6                  ; mm0 -= sub (unsigned, don't go < 0)
  psubusw mm3, mm6
  psrlw mm0, 1                      ; mm0 >>= 1 (/2)
  psrlw mm3, 1
  paddw mm5, mm0                    ; sum += mm0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  paddw mm5, mm3
  pxor mm3, mm4
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [edi + 8*ecx], mm0
  movq [edi + 8*ecx + 8], mm3

  add ecx, 2
  cmp ecx, 16
  jnz .q1loop

  jmp .done
.endfunc:

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_sse2(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint16_t *mpeg_matrices);
;
; 128-bit variant of the inter quantizer: 4 iterations x 32 bytes.
; data and coeff are accessed with movdqa, so both must be 16-byte aligned.
; The table rows are fetched through mm0 (movq + movq2dq), so this routine
; touches MMX state as well as xmm0, xmm1, xmm3-xmm7.
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_inter_sse2:

  push esi
  push edi

  mov edi, [esp + 8 + 4]            ; coeff
  mov esi, [esp + 8 + 8]            ; data
  mov eax, [esp + 8 + 12]           ; quant

  xor ecx, ecx

  pxor xmm5, xmm5                   ; sum

  movq mm0, [mmx_sub + eax*8 - 8]   ; sub
  movq2dq xmm6, mm0                 ; load into low 8 bytes
  movlhps xmm6, xmm6                ; duplicate into high 8 bytes

  cmp al, 1                         ; only the low byte of quant is compared
  jz near .qes2_q1loop

.qes2_not1:
  movq mm0, [mmx_div + eax*8 - 8]   ; divider
  movq2dq xmm7, mm0
  movlhps xmm7, xmm7

ALIGN 16
.qes2_loop:
  movdqa xmm0, [esi + ecx*8]        ; xmm0 = [1st]
  movdqa xmm3, [esi + ecx*8 + 16]   ; xmm3 = [2nd]
  pxor xmm1, xmm1
  pxor xmm4, xmm4
  pcmpgtw xmm1, xmm0                ; sign masks
  pcmpgtw xmm4, xmm3
  pxor xmm0, xmm1                   ; |x| (one's complement step)
  pxor xmm3, xmm4
  psubw xmm0, xmm1                  ; |x| (+1 in negative lanes)
  psubw xmm3, xmm4
  psubusw xmm0, xmm6                ; -= quant/2, saturating at 0
  psubusw xmm3, xmm6
  pmulhw xmm0, xmm7                 ; (x * div) >> 16
  pmulhw xmm3, xmm7
  paddw xmm5, xmm0                  ; sum += magnitudes
  pxor xmm0, xmm1                   ; restore the sign
  paddw xmm5, xmm3
  pxor xmm3, xmm4
  psubw xmm0, xmm1
  psubw xmm3, xmm4
  movdqa [edi + ecx*8], xmm0
  movdqa [edi + ecx*8 + 16], xmm3

  add ecx, 4
  cmp ecx, 16
  jnz .qes2_loop

  ; NOTE(review): the visible listing stops at the label below; the rest of
  ; this routine (including .qes2_q1loop, referenced above) is not shown here.
.qes2_done:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -