; quantize_h263_3dne.asm
quant_intra1 3
psubw mm5, mm4 ;C8
mov esi, [dword esp + 12] ; pop back the register value
mov edi, [esp + 4] ; pop back the register value
sar eax, 16
lea ebx, [byte eax + 1] ; workaround for eax < 0
cmovs eax, ebx ; conditionally move the corrected value
mov [edx], ax ; coeff[0] = ax
mov ebx, [esp + 8] ; pop back the register value
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
psubw mm7, mm6 ;D8
movq [edx + 3 * 32 + 16], mm5 ;C9
movq [edx + 3 * 32 + 24], mm7 ;D9
xor eax, eax
ret
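; Hedged note (not from the original source): 'sar eax, 16' rounds toward
; minus infinity, so for a negative quotient the lea/cmovs pair above adds 1
; back, which is what the "workaround for eax < 0" comment refers to.  In
; rough C, assuming eax arrives holding a 16.16 fixed-point quotient for the
; intra DC coefficient (that multiply sits above this excerpt):
;
;   int32_t dc = product >> 16;     /* sar eax, 16: floors negative values  */
;   if (dc < 0) dc += 1;            /* lea ebx, [eax + 1] + cmovs eax, ebx  */
;   coeff[0] = (int16_t)dc;         /* mov [edx], ax                        */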
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_3dne(int16_t * coeff,
; const int16_t * const data,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
;This is Athlon-optimized code (ca 90 clk per call)
;Optimized by Jaan, 30 Nov 2002
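; Hedged C reference (not part of the original source) of what the pipelined
; loop below computes per coefficient: 'sub' stands for the per-quant entry of
; mmx_sub, and the pmulhw against the mmx_div reciprocal plays the role of the
; division by 2*quant.  A rough sketch, assuming the plain H.263 inter
; quantiser (the mpeg_matrices argument is not read by this routine and is
; omitted):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static uint32_t quant_h263_inter_ref(int16_t *coeff, const int16_t *data,
;                                        uint32_t quant, uint16_t sub)
;   {
;       uint32_t sum = 0;
;       for (int i = 0; i < 64; i++) {
;           int32_t level = abs(data[i]) - sub;         /* psubw/pmaxsw + psubusw */
;           if (level < 0) level = 0;                   /* unsigned saturation    */
;           level /= (int32_t)(2 * quant);              /* pmulhw by reciprocal   */
;           sum += (uint32_t)level;                     /* paddw into mm5         */
;           coeff[i] = (data[i] < 0) ? -level : level;  /* pxor/psubw sign fixup  */
;       }
;       return sum;                                     /* pmaddwd + movd eax     */
;   }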
%macro quantinter 1
movq mm1, [eax] ;A2
psraw mm3, 15 ;B6
%if (%1)
psubw mm2, mm6 ;C10
%endif
psubw mm1, mm0 ;A3
pmulhw mm4, mm7 ;B7
movq mm6, [ecx + %1*24+16] ;C1
pmaxsw mm1, mm0 ;A4
paddw mm5, mm4 ;B8
%if (%1)
movq [edx + %1*24+16-24], mm2 ;C11
%endif
psubusw mm1, [ebx] ;A5 mm1 -= sub (unsigned, don't go below 0)
pxor mm4, mm3 ;B9
movq mm2, [eax] ;C2
psraw mm0, 15 ;A6
psubw mm4, mm3 ;B10
psubw mm2, mm6 ;C3
pmulhw mm1, mm7 ;A7 mm1 /= 2*quant (high word of multiply by reciprocal)
movq mm3, [ecx + %1*24+8] ;B1
pmaxsw mm2, mm6 ;C4
paddw mm5, mm1 ;A8 sum += mm0
%if (%1)
movq [edx + %1*24+8-24], mm4 ;B11
%else
movq [edx + 120], mm4 ;B11
%endif
psubusw mm2, [ebx] ;C5
pxor mm1, mm0 ;A9 mm0 *= sign(mm0)
movq mm4, [eax] ;B2
psraw mm6, 15 ;C6
psubw mm1, mm0 ;A10 undisplace
psubw mm4, mm3 ;B3
pmulhw mm2, mm7 ;C7
movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st]
pmaxsw mm4, mm3 ;B4
paddw mm5, mm2 ;C8
movq [byte edx + %1*24], mm1 ;A11
psubusw mm4, [ebx] ;B5
pxor mm2, mm6 ;C9
%endmacro
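; Hedged note (not from the original source): the A/B/C suffixes in the
; comments mark three software-pipelined dependency chains, one per 8-byte
; group loaded from [ecx + %1*24], +8 and +16; interleaving them lets
; independent instructions hide latency on the Athlon.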
%macro quantinter1 1
movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
movq mm3, [ecx + %1*16+8] ;
movq mm1, [eax]
movq mm4, [eax]
psubw mm1, mm0
psubw mm4, mm3
pmaxsw mm1, mm0
pmaxsw mm4, mm3
psubusw mm1, mm6 ; mm1 -= sub (unsigned, don't go below 0)
psubusw mm4, mm6 ;
psraw mm0, 15
psraw mm3, 15
psrlw mm1, 1 ; mm1 /= 2 (i.e. 2*quant, since quant == 1 here)
psrlw mm4, 1 ;
paddw mm5, mm1 ; sum += mm0
pxor mm1, mm0 ; mm0 *= sign(mm0)
paddw mm5, mm4
pxor mm4, mm3 ;
psubw mm1, mm0 ; undisplace
psubw mm4, mm3
cmp esp, esp
movq [byte edx + %1*16], mm1
movq [edx + %1*16+8], mm4
%endmacro
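; Hedged note (not from the original source): quantinter1 is the quant == 1
; fast path reached through .q1loop below; with 2*quant == 2 the reciprocal
; multiply of the main loop collapses into the single 'psrlw ..., 1', i.e.
; roughly
;
;   level = (abs(data[i]) - sub) >> 1;
;
; per coefficient, with the same sign restore and summation as above.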
ALIGN 16
cglobal quant_h263_inter_3dne
quant_h263_inter_3dne:
mov edx, [esp + 4] ; coeff
mov ecx, [esp + 8] ; data
mov eax, [esp + 12] ; quant
push ebx
pxor mm5, mm5 ; sum
nop
lea ebx, [mmx_sub + eax * 8 - 8] ; sub
movq mm7, [mmx_div + eax * 8 - 8] ; divider
cmp al, 1
lea eax, [mmzero]
jz near .q1loop
cmp esp, esp
ALIGN 8
movq mm3, [ecx + 120] ;B1
pxor mm4, mm4 ;B2
psubw mm4, mm3 ;B3
movq mm0, [ecx] ;A1 mm0 = [1st]
pmaxsw mm4, mm3 ;B4
psubusw mm4, [ebx] ;B5
quantinter 0
quantinter 1
quantinter 2
quantinter 3
quantinter 4
psraw mm3, 15 ;B6
psubw mm2, mm6 ;C10
pmulhw mm4, mm7 ;B7
paddw mm5, mm4 ;B8
pxor mm4, mm3 ;B9
psubw mm4, mm3 ;B10
movq [edx + 4*24+16], mm2 ;C11
pop ebx
movq [edx + 4*24+8], mm4 ;B11
pmaddwd mm5, [plus_one]
movq mm0, mm5
punpckhdq mm5, mm5
paddd mm0, mm5
movd eax, mm0 ; return sum
ret
ALIGN 16
.q1loop:
movq mm6, [byte ebx]
quantinter1 0
quantinter1 1
quantinter1 2
quantinter1 3
quantinter1 4
quantinter1 5
quantinter1 6
quantinter1 7
pmaddwd mm5, [plus_one]
movq mm0, mm5
psrlq mm5, 32
paddd mm0, mm5
movd eax, mm0 ; return sum
pop ebx
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_3dne(int16_t *data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; this is the same as dequant_inter_3dne, except that we're
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
;This is Athlon-optimized code (ca 106 clk per call)
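; Hedged C reference (not part of the original source) of the reconstruction
; the 'dequant' macro below performs, plus the scalar DC handling done at the
; end of the intra routine; 'quant_add' stands for the per-quant entry of
; mmx_add and the unread mpeg_matrices argument is omitted.  A rough sketch:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static uint32_t dequant_h263_intra_ref(int16_t *data, const int16_t *coeff,
;                                          uint32_t quant, uint32_t dcscalar,
;                                          uint16_t quant_add)
;   {
;       for (int i = 0; i < 64; i++) {
;           int32_t c = coeff[i];
;           int32_t v = (c == 0) ? 0 : (int32_t)(2 * quant) * abs(c) + quant_add;
;           if (v > 2047) v = 2047;                /* pminsw with mmx_2047 */
;           data[i] = (int16_t)((c < 0) ? -v : v); /* sign restore         */
;       }
;       /* the DC term is recomputed from coeff[0] and clamped (cmovl/cmovg): */
;       int32_t dc = (int32_t)coeff[0] * (int32_t)dcscalar;
;       if (dc < -2048) dc = -2048;
;       if (dc >  2047) dc =  2047;
;       data[0] = (int16_t)dc;
;       return 0;                                  /* xor eax, eax          */
;   }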
%macro dequant 1
movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
psubw mm0, mm1 ;-c ;A3 (1st dep)
%if (%1)
paddw mm4, mm6 ;C11 mm6 free (4th+)
%endif
pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
%if (%1)
mov ebp, ebp
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
%endif
movq mm6, [esi] ;0 ;A5 mm6 in use
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
%if (%1)
pxor mm5, mm4 ;C13 (6th+) 1later
%endif
movq mm4, [esi] ;C1 ;0
mov esp, esp
pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
ALIGN 4
psraw mm1, 15 ; sign(c) ;A7 (2nd)
%if (%1)
movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
%endif
paddw mm7, mm3 ;B10 offset +negate back (3rd)
pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
paddw mm2, mm7 ;B11 mm7 free (4th+)
lea ebp, [byte ebp]
movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
psubw mm4, mm5 ;-c ;C3 (1st dep)
pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
pxor mm3, mm2 ;B13 (6th+)
movq mm2, [byte esi] ;B1 ;0
%if (%1)
movq [edx+%1*24+8-24], mm3 ;B14 (7th)
%else
movq [edx+120], mm3
%endif
pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
paddw mm6, mm1 ;A10 offset +negate back (3rd)
movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
psubw mm2, mm3 ;-c ;B3 (1st dep)
paddw mm0, mm6 ;A11 mm6 free (4th+)
movq mm6, [byte esi] ;0 ;C5 mm6 in use
pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
pxor mm1, mm0 ;A13 (6th+)
pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
psraw mm5, 15 ; sign(c) ;C7 (2nd)
movq mm7, [byte esi] ;0 ;B5 mm7 in use
pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
%if (%1 < 4)
movq mm0, [byte esi] ;A1 ;0
%endif
pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
psraw mm3, 15 ;sign(c) ;B7 (2nd)
movq [byte edx+%1*24], mm1 ;A14 (7th)
paddw mm6, mm5 ;C10 offset +negate back (3rd)
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
mov esp, esp
%endmacro
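; Hedged note (not from the original source): the same A/B/C interleaving as
; in the quantisation macros is used here, and the apparent no-ops
; ('mov esp, esp', 'mov ebp, ebp', 'lea ebp, [byte ebp]', the mid-macro
; ALIGN 4) look like decode/alignment padding for the Athlon front end rather
; than algorithmic steps.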
ALIGN 16
cglobal dequant_h263_intra_3dne
dequant_h263_intra_3dne:
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
pxor mm0, mm0
pxor mm2, mm2
push edi
push ebx
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
push ebp
mov ebx, mmx_2047
movsx ebp, word [ecx]
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
push esi
mov esi, mmzero
pxor mm7, mm7
movq mm3, [ecx+120] ;B2 ; c = coeff[i]
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
imul ebp, [esp+16+16] ; dcscalar
psubw mm2, mm3 ;-c ;B3 (1st dep)
pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
psraw mm3, 15 ; sign(c) ;B7 (2nd)
mov edx, [esp+ 4+16] ; data
ALIGN 8
dequant 0
cmp ebp, -2048
mov esp, esp
dequant 1
cmovl ebp, [int_2048]
nop
dequant 2
cmp ebp, 2047
mov esp, esp
dequant 3
cmovg ebp, [int2047]
nop
dequant 4
paddw mm4, mm6 ;C11 mm6 free (4th+)
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
mov eax, ebp
mov esi, [esp]
mov ebp, [esp+4]
pxor mm5, mm4 ;C13 (6th+)
paddw mm7, mm3 ;B10 offset +negate back (3rd)
movq [edx+4*24+16], mm5 ;C14 (7th)
paddw mm2, mm7 ;B11 mm7 free (4th+)
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
mov ebx, [esp+8]
mov edi, [esp+12]
add esp, byte 16
pxor mm3, mm2 ;B13 (6th+)
movq [edx+4*24+8], mm3 ;B14 (7th)
mov [edx], ax
xor eax, eax
ret
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_3dne(int16_t * data,
; const int16_t * const coeff,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; this is the same as dequant_inter_3dne,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
; This is Athlon-optimized code (ca 100 clk per call)
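; Hedged note (not from the original source): per coefficient this is the same
; 2*quant*|c| + quant_add reconstruction, saturation and sign restore sketched
; above for the intra routine; only the dcscalar argument and the scalar DC
; fix-up are absent, so the C sketch is not repeated here.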
ALIGN 16
cglobal dequant_h263_inter_3dne
dequant_h263_inter_3dne:
mov ecx, [esp+ 8] ; coeff
mov eax, [esp+12] ; quant
pxor mm0, mm0
pxor mm2, mm2
push edi
push ebx
push esi
lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
mov ebx, mmx_2047
pxor mm7, mm7
movq mm3, [ecx+120] ;B2 ; c = coeff[i]
pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
psubw mm2, mm3 ;-c ;B3 (1st dep)
mov esi, mmzero
pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
psraw mm3, 15 ; sign(c) ;B7 (2nd)
mov edx, [dword esp+ 4+12] ; data
ALIGN 8
dequant 0
dequant 1
dequant 2
dequant 3
dequant 4
paddw mm4, mm6 ;C11 mm6 free (4th+)
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
mov esi, [esp]
pxor mm5, mm4 ;C13 (6th+)
paddw mm7, mm3 ;B10 offset +negate back (3rd)
movq [edx+4*24+16], mm5 ;C14 (7th)
paddw mm2, mm7 ;B11 mm7 free (4th+)
pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
mov ebx, [esp+4]
mov edi, [esp+8]
add esp, byte 12
pxor mm3, mm2 ;B13 (6th+)
movq [edx+4*24+8], mm3 ;B14 (7th)
xor eax, eax
ret