; File: quantize_h263_3dne.asm — XviD 3DNE (Athlon-optimised) H.263 quantisation routines
;-----------------------------------------------------------------------------
; Tail of quant_h263_intra_3dne (the function entry lies above this chunk, so
; only the epilogue is visible here): last quantise step, rounding/store of
; the DC coefficient, restore of pushed registers.  Returns 0 in eax.
;-----------------------------------------------------------------------------
  mov ebp, [byte esp]
  quant_intra1 3
  psubw mm5, mm4                        ;C8
  mov esi, [dword esp + 12]             ; pop back the register value
  mov edi, [esp + 4]                    ; pop back the register value
  sar eax, 16                           ; keep integer part of the fixed-point result
  lea ebx, [byte eax + 1]               ; workaround for eax < 0
  cmovs eax, ebx                        ; conditionally move the corrected value
  mov [edx], ax                         ; coeff[0] = ax
  mov ebx, [esp + 8]                    ; pop back the register value
  add esp, byte 16                      ; "quant_intra 0" pushed ebp, but we don't restore
                                        ; that one, just correct the stack offset by 16
  psubw mm7, mm6                        ;D8
  movq [edx + 3 * 32 + 16], mm5         ;C9
  movq [edx + 3 * 32 + 24], mm7         ;D9
  xor eax, eax                          ; return 0
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_3dne(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; This is Athlon-optimized code (ca 90 clk per call)
; Optimized by Jaan, 30 Nov 2002

; quantinter %1: one scheduling group of the quant > 1 inter loop, covering
; three quadwords (12 coefficients).  Three software pipelines are
; hand-interleaved for the Athlon; comment tags A*/B*/C* name the pipeline,
; the number is the step within it.  Register roles (set up by the caller):
;   eax -> mmzero            ebx -> mmx_sub entry for this quant
;   ecx -> data              edx -> coeff
;   mm7  = mmx_div entry (multiplier implementing the /2Q division)
;   mm5  = running sum of quantised magnitudes (folded into eax on return)
%macro quantinter 1
  movq mm1, [eax]                       ;A2
  psraw mm3, 15                         ;B6
%if (%1)
  psubw mm2, mm6                        ;C10
%endif
  psubw mm1, mm0                        ;A3
  pmulhw mm4, mm7                       ;B7
  movq mm6, [ecx + %1*24+16]            ;C1
  pmaxsw mm1, mm0                       ;A4
  paddw mm5, mm4                        ;B8
%if (%1)
  movq [edx + %1*24+16-24], mm2         ;C11
%endif
  psubusw mm1, [ebx]                    ;A5 mm0 -= sub (unsigned, don't go < 0)
  pxor mm4, mm3                         ;B9
  movq mm2, [eax]                       ;C2
  psraw mm0, 15                         ;A6
  psubw mm4, mm3                        ;B10
  psubw mm2, mm6                        ;C3
  pmulhw mm1, mm7                       ;A7 mm0 = (mm0 / 2Q) >> 24
  movq mm3, [ecx + %1*24+8]             ;B1
  pmaxsw mm2, mm6                       ;C4
  paddw mm5, mm1                        ;A8 sum += mm0
%if (%1)
  movq [edx + %1*24+8-24], mm4          ;B11
%else
  movq [edx + 120], mm4                 ;B11
%endif
  psubusw mm2, [ebx]                    ;C5
  pxor mm1, mm0                         ;A9 mm0 *= sign(mm0)
  movq mm4, [eax]                       ;B2
  psraw mm6, 15                         ;C6
  psubw mm1, mm0                        ;A10 undisplace
  psubw mm4, mm3                        ;B3
  pmulhw mm2, mm7                       ;C7
  movq mm0, [ecx + %1*24+24]            ;A1 mm0 = [1st]
  pmaxsw mm4, mm3                       ;B4
  paddw mm5, mm2                        ;C8
  movq [byte edx + %1*24], mm1          ;A11
  psubusw mm4, [ebx]                    ;B5
  pxor mm2, mm6                         ;C9
%endmacro

; quantinter1 %1: quantise two quadwords (8 coefficients) for the special
; case quant == 1 — the divide degenerates to a 1-bit shift (psrlw), and the
; subtractor is loaded once into mm6 by the caller instead of read per group.
%macro quantinter1 1
  movq mm0, [byte ecx + %1*16]          ; mm0 = [1st]
  movq mm3, [ecx + %1*16+8]             ;
  movq mm1, [eax]
  movq mm4, [eax]
  psubw mm1, mm0
  psubw mm4, mm3
  pmaxsw mm1, mm0
  pmaxsw mm4, mm3
  psubusw mm1, mm6                      ; mm0 -= sub (unsigned, don't go < 0)
  psubusw mm4, mm6                      ;
  psraw mm0, 15
  psraw mm3, 15
  psrlw mm1, 1                          ; mm0 = (mm0 / 2Q) >> 16
  psrlw mm4, 1                          ;
  paddw mm5, mm1                        ; sum += mm0
  pxor mm1, mm0                         ; mm0 *= sign(mm0)
  paddw mm5, mm4
  pxor mm4, mm3                         ;
  psubw mm1, mm0                        ; undisplace
  psubw mm4, mm3
  cmp esp, esp                          ; architectural no-op — presumably scheduling filler
  movq [byte edx + %1*16], mm1
  movq [edx + %1*16+8], mm4
%endmacro

ALIGN 16
cglobal quant_h263_inter_3dne
; uint32_t quant_h263_inter_3dne(coeff, data, quant, mpeg_matrices)
; Quantises one inter block; returns the sum of quantised magnitudes in eax.
quant_h263_inter_3dne:
  mov edx, [esp + 4]                    ; coeff
  mov ecx, [esp + 8]                    ; data
  mov eax, [esp + 12]                   ; quant
  push ebx
  pxor mm5, mm5                         ; sum
  nop
  lea ebx, [mmx_sub + eax * 8 - 8]      ; sub
  movq mm7, [mmx_div + eax * 8 - 8]     ; divider
  cmp al, 1
  lea eax, [mmzero]                     ; lea preserves the flags from the cmp above
  jz near .q1loop                       ; quant == 1 takes the cheaper shift-based loop
  cmp esp, esp
ALIGN 8
  movq mm3, [ecx + 120]                 ;B1
  pxor mm4, mm4                         ;B2
  psubw mm4, mm3                        ;B3
  movq mm0, [ecx]                       ;A1 mm0 = [1st]
  pmaxsw mm4, mm3                       ;B4
  psubusw mm4, [ebx]                    ;B5

  quantinter 0
  quantinter 1
  quantinter 2
  quantinter 3
  quantinter 4

  psraw mm3, 15                         ;B6
  psubw mm2, mm6                        ;C10
  pmulhw mm4, mm7                       ;B7
  paddw mm5, mm4                        ;B8
  pxor mm4, mm3                         ;B9
  psubw mm4, mm3                        ;B10
  movq [edx + 4*24+16], mm2             ;C11
  pop ebx
  movq [edx + 4*24+8], mm4              ;B11
  pmaddwd mm5, [plus_one]               ; horizontal add of the four word sums
  movq mm0, mm5
  punpckhdq mm5, mm5
  paddd mm0, mm5
  movd eax, mm0                         ; return sum
  ret

ALIGN 16
.q1loop
  movq mm6, [byte ebx]                  ; subtractor, loaded once for the whole block

  quantinter1 0
  quantinter1 1
  quantinter1 2
  quantinter1 3
  quantinter1 4
  quantinter1 5
  quantinter1 6
  quantinter1 7

  pmaddwd mm5, [plus_one]               ; horizontal add of the four word sums
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5
  movd eax, mm0                         ; return sum
  pop ebx
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_3dne(int16_t *data,
;                                  const int16_t const *coeff,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

; this is the same as dequant_inter_3dne, except that we're
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
; This is Athlon-optimized code (ca 106 clk per call)
; dequant %1: one scheduling group of the H.263 dequant loop, shared by the
; intra and inter functions below.  Three hand-interleaved pipelines
; (comment tags A*/B*/C*; the trailing number is the step within the
; pipeline).  Register roles, set up by the callers:
;   ecx -> coeff             edx -> data
;   edi -> mmx_mul entry (2*quant)
;   eax -> mmx_add entry (quant or quant-1)
;   ebx -> mmx_2047 (saturation constant)
;   esi -> mmzero
; "mov esp, esp" / "mov ebp, ebp" / "lea ebp, [byte ebp]" are architectural
; no-ops — presumably decode-alignment filler for the Athlon front end.
%macro dequant 1
  movq mm1, [ecx+%1*24]                 ; c = coeff[i] ;A2
  psubw mm0, mm1                        ; -c ;A3 (1st dep)
%if (%1)
  paddw mm4, mm6                        ;C11 mm6 free (4th+)
%endif
  pmaxsw mm0, mm1                       ; |c| ;A4 (2nd)
%if (%1)
  mov ebp, ebp
  pminsw mm4, [ebx]                     ;C12 saturates to +2047 (5th+) later
%endif
  movq mm6, [esi]                       ; 0 ;A5 mm6 in use
  pandn mm7, [eax]                      ;B9 offset = isZero ? 0 : quant_add (2nd)
%if (%1)
  pxor mm5, mm4                         ;C13 (6th+) 1later
%endif
  movq mm4, [esi]                       ;C1 ;0
  mov esp, esp
  pcmpeqw mm6, [ecx+%1*24]              ;A6 (c == 0) ? -1 : 0 (1st)
ALIGN 4
  psraw mm1, 15                         ; sign(c) ;A7 (2nd)
%if (%1)
  movq [edx+%1*24+16-24], mm5           ;C14 (7th) 2later
%endif
  paddw mm7, mm3                        ;B10 offset + negate back (3rd)
  pmullw mm0, [edi]                     ; *= 2Q ;A8 (3rd+)
  paddw mm2, mm7                        ;B11 mm7 free (4th+)
  lea ebp, [byte ebp]
  movq mm5, [ecx+%1*24+16]              ;C2 c = coeff[i]
  psubw mm4, mm5                        ; -c ;C3 (1st dep)
  pandn mm6, [eax]                      ;A9 offset = isZero ? 0 : quant_add (2nd)
  pminsw mm2, [ebx]                     ;B12 saturates to +2047 (5th+)
  pxor mm3, mm2                         ;B13 (6th+)
  movq mm2, [byte esi]                  ;B1 ;0
%if (%1)
  movq [edx+%1*24+8-24], mm3            ;B14 (7th)
%else
  movq [edx+120], mm3
%endif
  pmaxsw mm4, mm5                       ; |c| ;C4 (2nd)
  paddw mm6, mm1                        ;A10 offset + negate back (3rd)
  movq mm3, [ecx+%1*24 + 8]             ;B2 c = coeff[i]
  psubw mm2, mm3                        ; -c ;B3 (1st dep)
  paddw mm0, mm6                        ;A11 mm6 free (4th+)
  movq mm6, [byte esi]                  ; 0 ;C5 mm6 in use
  pcmpeqw mm6, [ecx+%1*24+16]           ;C6 (c == 0) ? -1 : 0 (1st)
  pminsw mm0, [ebx]                     ;A12 saturates to +2047 (5th+)
  pmaxsw mm2, mm3                       ; |c| ;B4 (2nd)
  pxor mm1, mm0                         ;A13 (6th+)
  pmullw mm4, [edi]                     ; *= 2Q ;C8 (3rd+)
  psraw mm5, 15                         ; sign(c) ;C7 (2nd)
  movq mm7, [byte esi]                  ; 0 ;B5 mm7 in use
  pcmpeqw mm7, [ecx+%1*24 + 8]          ;B6 (c == 0) ? -1 : 0 (1st)
%if (%1 < 4)
  movq mm0, [byte esi]                  ;A1 ;0
%endif
  pandn mm6, [byte eax]                 ;C9 offset = isZero ? 0 : quant_add (2nd)
  psraw mm3, 15                         ; sign(c) ;B7 (2nd)
  movq [byte edx+%1*24], mm1            ;A14 (7th)
  paddw mm6, mm5                        ;C10 offset + negate back (3rd)
  pmullw mm2, [edi]                     ; *= 2Q ;B8 (3rd+)
  mov esp, esp
%endmacro

ALIGN 16
cglobal dequant_h263_intra_3dne
; uint32_t dequant_h263_intra_3dne(data, coeff, quant, dcscalar, mpeg_matrices)
; Dequantises an intra block.  The DC coefficient (coeff[0]) is rescaled with
; dcscalar, clamped to [-2048, 2047] and stored into data[0].  Returns 0.
dequant_h263_intra_3dne:
  mov ecx, [esp+ 8]                     ; coeff
  mov eax, [esp+12]                     ; quant
  pxor mm0, mm0
  pxor mm2, mm2
  push edi
  push ebx
  lea edi, [mmx_mul + eax*8 - 8]        ; 2*quant
  push ebp
  mov ebx, mmx_2047
  movsx ebp, word [ecx]                 ; ebp = coeff[0] (DC), sign-extended
  lea eax, [mmx_add + eax*8 - 8]        ; quant or quant-1
  push esi
  mov esi, mmzero
  pxor mm7, mm7
  movq mm3, [ecx+120]                   ;B2 c = coeff[i]
  pcmpeqw mm7, [ecx+120]                ;B6 (c == 0) ? -1 : 0 (1st)
  imul ebp, [esp+16+16]                 ; dcscalar
  psubw mm2, mm3                        ; -c ;B3 (1st dep)
  pmaxsw mm2, mm3                       ; |c| ;B4 (2nd)
  pmullw mm2, [edi]                     ; *= 2Q ;B8 (3rd+)
  psraw mm3, 15                         ; sign(c) ;B7 (2nd)
  mov edx, [esp+ 4+16]                  ; data

ALIGN 8
  ; The scalar DC clamp is interleaved between the MMX groups below.
  dequant 0
  cmp ebp, -2048
  mov esp, esp
  dequant 1
  cmovl ebp, [int_2048]                 ; lower clamp (int_2048 presumably holds -2048 — confirm)
  nop
  dequant 2
  cmp ebp, 2047
  mov esp, esp
  dequant 3
  cmovg ebp, [int2047]                  ; upper clamp (int2047 presumably holds 2047 — confirm)
  nop
  dequant 4

  paddw mm4, mm6                        ;C11 mm6 free (4th+)
  pminsw mm4, [ebx]                     ;C12 saturates to +2047 (5th+)
  pandn mm7, [eax]                      ;B9 offset = isZero ? 0 : quant_add (2nd)
  mov eax, ebp                          ; eax = clamped DC value
  mov esi, [esp]                        ; restore callee-saved registers
  mov ebp, [esp+4]
  pxor mm5, mm4                         ;C13 (6th+)
  paddw mm7, mm3                        ;B10 offset + negate back (3rd)
  movq [edx+4*24+16], mm5               ;C14 (7th)
  paddw mm2, mm7                        ;B11 mm7 free (4th+)
  pminsw mm2, [ebx]                     ;B12 saturates to +2047 (5th+)
  mov ebx, [esp+8]
  mov edi, [esp+12]
  add esp, byte 16
  pxor mm3, mm2                         ;B13 (6th+)
  movq [edx+4*24+8], mm3                ;B14 (7th)
  mov [edx], ax                         ; data[0] = clamped DC
  xor eax, eax                          ; return 0
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_3dne(int16_t * data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

; this is the same as dequant_inter_3dne,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
; This is Athlon-optimized code (ca 100 clk per call)

ALIGN 16
cglobal dequant_h263_inter_3dne
; uint32_t dequant_h263_inter_3dne(data, coeff, quant, mpeg_matrices)
; Dequantises an inter block (no special DC handling).  Returns 0.
dequant_h263_inter_3dne:
  mov ecx, [esp+ 8]                     ; coeff
  mov eax, [esp+12]                     ; quant
  pxor mm0, mm0
  pxor mm2, mm2
  push edi
  push ebx
  push esi
  lea edi, [mmx_mul + eax*8 - 8]        ; 2*quant
  mov ebx, mmx_2047
  pxor mm7, mm7
  movq mm3, [ecx+120]                   ;B2 c = coeff[i]
  pcmpeqw mm7, [ecx+120]                ;B6 (c == 0) ? -1 : 0 (1st)
  lea eax, [mmx_add + eax*8 - 8]        ; quant or quant-1
  psubw mm2, mm3                        ; -c ;B3 (1st dep)
  mov esi, mmzero
  pmaxsw mm2, mm3                       ; |c| ;B4 (2nd)
  pmullw mm2, [edi]                     ; *= 2Q ;B8 (3rd+)
  psraw mm3, 15                         ; sign(c) ;B7 (2nd)
  mov edx, [dword esp+ 4+12]            ; data

ALIGN 8
  dequant 0
  dequant 1
  dequant 2
  dequant 3
  dequant 4

  paddw mm4, mm6                        ;C11 mm6 free (4th+)
  pminsw mm4, [ebx]                     ;C12 saturates to +2047 (5th+)
  pandn mm7, [eax]                      ;B9 offset = isZero ? 0 : quant_add (2nd)
  mov esi, [esp]                        ; restore callee-saved registers
  pxor mm5, mm4                         ;C13 (6th+)
  paddw mm7, mm3                        ;B10 offset + negate back (3rd)
  movq [edx+4*24+16], mm5               ;C14 (7th)
  paddw mm2, mm7                        ;B11 mm7 free (4th+)
  pminsw mm2, [ebx]                     ;B12 saturates to +2047 (5th+)
  mov ebx, [esp+4]
  mov edi, [esp+8]
  add esp, byte 12
  pxor mm3, mm2                         ;B13 (6th+)
  movq [edx+4*24+8], mm3                ;B14 (7th)
  xor eax, eax                          ; return 0
  ret
.endfunc
; (removed non-source residue: code-viewer keyboard-shortcut help text —
;  copy/search/fullscreen/theme/font-size hotkeys — not part of this file)