⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quantize_mmx.asm

📁 MPEG4的VC代码
💻 ASM
📖 第 1 页 / 共 2 页
字号:
		movd	eax, mm0			; return sum
		pop	edi
		pop	esi
		pop	ecx
		ret

;---------------------------------------------------------------------------
; Tail of the preceding MMX quantizer routine (its entry label and the
; .done label it jumps back to are defined earlier in the file).
; Presumably the quant == 1 path, by analogy with .qes2_q1loop in
; quant_inter_sse2: the magnitude is halved with a shift instead of being
; multiplied by a reciprocal.
; Per iteration: 8 coefficients (two quadwords); ecx advances by 2 until 16.
;   esi = source, edi = destination, mm6 = sub, mm5 = running sum
;---------------------------------------------------------------------------
align ALIGN
.q1loop:
		movq	mm0, [esi + 8*ecx]		; mm0 = [1st]
		movq	mm3, [esi + 8*ecx + 8]		; mm3 = [2nd]
		pxor	mm1, mm1			; mm1 = 0
		pxor	mm4, mm4
		pcmpgtw	mm1, mm0			; mm1 = (0 > mm0) -> sign mask
		pcmpgtw	mm4, mm3
		pxor	mm0, mm1			; (x ^ mask) - mask = |x|
		pxor	mm3, mm4
		psubw	mm0, mm1			; displace
		psubw	mm3, mm4
		psubusw	mm0, mm6			; mm0 -= sub (unsigned, never below 0)
		psubusw	mm3, mm6
		psrlw	mm0, 1				; mm0 >>= 1   (/2)
		psrlw	mm3, 1
		paddw	mm5, mm0			; sum += magnitude
		pxor	mm0, mm1			; re-apply the sign: (x ^ mask) - mask
		paddw	mm5, mm3
		pxor	mm3, mm4
		psubw	mm0, mm1			; undisplace
		psubw	mm3, mm4
		movq	[edi + 8*ecx], mm0
		movq	[edi + 8*ecx + 8], mm3

		add	ecx, 2
		cmp	ecx, 16
		jnz	.q1loop
		jmp	.done


;===========================================================================
;
; uint32_t quant_inter_sse2(int16_t * coeff,
;					const int16_t * const data,
;					const uint32_t quant);
;
; Stack arguments (32-bit, read relative to esp after two pushes):
;   [esp+8+4] coeff (destination), [esp+8+8] data (source), [esp+8+12] quant
; Returns in eax the sum accumulated from the quantized magnitudes.
; Processes 64 int16 values, 16 per iteration.
; Both buffers are accessed with movdqa, which faults on addresses that are
; not 16-byte aligned.
; Preserves esi/edi; clobbers eax, ecx, flags, mm0, mm5, xmm0-xmm7.
; MMX registers are used and no emms is executed here.
; mmx_sub, mmx_div and plus_one are tables defined elsewhere in the file.
;
;===========================================================================

align 16
cglobal quant_inter_sse2
quant_inter_sse2:

		push	esi
		push	edi

		mov	edi, [esp + 8 + 4]		; coeff
		mov	esi, [esp + 8 + 8]		; data
		mov	eax, [esp + 8 + 12]		; quant

		xor	ecx, ecx			; quadword index, 0..16

		pxor	xmm5, xmm5			; sum

		movq	mm0, [mmx_sub + eax*8 - 8]	; sub (table entry for this quant)
		movq2dq	xmm6, mm0			; load into low 8 bytes
		movlhps	xmm6, xmm6			; duplicate into high 8 bytes

		cmp	al, 1				; quant == 1 takes the shift-only path
		jz	near .qes2_q1loop

.qes2_not1:
		movq	mm0, [mmx_div + eax*8 - 8]	; divider
		movq2dq	xmm7, mm0
		movlhps	xmm7, xmm7			; broadcast to all 8 words

align 16
.qes2_loop:
		movdqa	xmm0, [esi + ecx*8]		; xmm0 = [1st]
		movdqa	xmm3, [esi + ecx*8 + 16]	; xmm3 = [2nd]
		pxor	xmm1, xmm1
		pxor	xmm4, xmm4
		pcmpgtw	xmm1, xmm0			; sign masks
		pcmpgtw	xmm4, xmm3
		pxor	xmm0, xmm1			; (x ^ mask) - mask = |x|
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		psubusw	xmm0, xmm6			; |x| - sub, saturating at 0
		psubusw	xmm3, xmm6
		pmulhw	xmm0, xmm7			; high word of (value * divider)
		pmulhw	xmm3, xmm7
		paddw	xmm5, xmm0			; sum += magnitude (16-bit lanes)
		pxor	xmm0, xmm1			; re-apply the sign
		paddw	xmm5, xmm3
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		movdqa	[edi + ecx*8], xmm0
		movdqa	[edi + ecx*8 + 16], xmm3

		add	ecx, 4
		cmp	ecx, 16
		jnz	.qes2_loop

.qes2_done:
		; Horizontal reduction of the eight 16-bit partial sums in xmm5.
		movdqu	xmm6, [plus_one]		; presumably eight words of 1 - defined elsewhere
		pmaddwd	xmm5, xmm6			; word pairs -> four dwords
		movhlps	xmm6, xmm5			; high two dwords -> low half of xmm6
		paddd	xmm5, xmm6			; low two dwords now hold pairwise sums
		movdq2q	mm0, xmm5

		movq	mm5, mm0
		psrlq	mm5, 32
		paddd	mm0, mm5			; low dword = total
		movd	eax, mm0			; return sum

		pop	edi
		pop	esi

		ret

align 16
.qes2_q1loop:
		; quant == 1: same as .qes2_loop, but the magnitude is shifted right
		; by one instead of being multiplied by the divider.
		movdqa	xmm0, [esi + ecx*8]		; xmm0 = [1st]
		movdqa	xmm3, [esi + ecx*8 + 16]	; xmm3 = [2nd]
		pxor	xmm1, xmm1
		pxor	xmm4, xmm4
		pcmpgtw	xmm1, xmm0
		pcmpgtw	xmm4, xmm3
		pxor	xmm0, xmm1
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		psubusw	xmm0, xmm6
		psubusw	xmm3, xmm6
		psrlw	xmm0, 1
		psrlw	xmm3, 1
		paddw	xmm5, xmm0
		pxor	xmm0, xmm1
		paddw	xmm5, xmm3
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		movdqa	[edi + ecx*8], xmm0
		movdqa	[edi + ecx*8 + 16], xmm3

		add	ecx, 4
		cmp	ecx, 16
		jnz	.qes2_q1loop
		jmp	.qes2_done


;===========================================================================
;
; void dequant_intra_mmx(int16_t *data,
;					const int16_t * const coeff,
;					const uint32_t quant,
;					const uint32_t dcscalar);
;
; Stack arguments: [esp+4] data (destination), [esp+8] coeff (source),
;                  [esp+12] quant, [esp+16] dcscalar
; Dequantizes 64 int16 values, 8 per iteration, then recomputes element 0
; (DC) as coeff[0] * dcscalar and overwrites data[0] with it.
; Clobbers eax, ecx, edx, flags, mm0-mm7; no emms is executed here.
; mmx_add, mmx_mul, mmx_32767_minus_2047 and mmx_32768_minus_2048 are
; tables/constants defined elsewhere in the file.
;
;===========================================================================

		; note: we only saturate to +2047 *before* restoring the sign.
		; Hence, final clamp really is [-2048,2047]

align ALIGN
cglobal dequant_intra_mmx
dequant_intra_mmx:

		mov	edx, [esp + 4]			; data
		mov	ecx, [esp + 8]			; coeff
		mov	eax, [esp + 12]			; quant
		movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
		movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
		mov	eax, -16			; index runs -16..0 (quadwords from buffer end)

align ALIGN
.loop:
		movq	mm0, [ecx + 8*eax + 8*16]	; c  = coeff[i]
		movq	mm3, [ecx + 8*eax + 8*16 + 8]	; c' = coeff[i+1]
		pxor	mm1, mm1
		pxor	mm4, mm4
		pcmpgtw	mm1, mm0			; sign(c)
		pcmpgtw	mm4, mm3			; sign(c')
		pxor	mm2, mm2
		pxor	mm5, mm5
		pcmpeqw	mm2, mm0			; c is zero
		pcmpeqw	mm5, mm3			; c' is zero
		pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
		pandn	mm5, mm6
		pxor	mm0, mm1			; negate if negative
		pxor	mm3, mm4			; negate if negative
		psubw	mm0, mm1
		psubw	mm3, mm4
		pmullw	mm0, mm7			; *= 2Q
		pmullw	mm3, mm7			; *= 2Q
		paddw	mm0, mm2			; + offset
		paddw	mm3, mm5			; + offset
		paddw	mm0, mm1			; negate back (first half: add the mask)
		paddw	mm3, mm4			; negate back

		; saturates to +2047
		movq	mm2, [mmx_32767_minus_2047]
		add	eax, 2				; sets ZF for the jnz below; MMX ops leave flags alone
		paddsw	mm0, mm2			; saturating add then subtract clamps the top
		paddsw	mm3, mm2
		psubsw	mm0, mm2
		psubsw	mm3, mm2

		pxor	mm0, mm1			; negate back (second half: xor the mask)
		pxor	mm3, mm4
		movq	[edx + 8*eax + 8*16 - 2*8], mm0	; -2*8 compensates the early add eax, 2
		movq	[edx + 8*eax + 8*16 + 8 - 2*8], mm3
		jnz	near .loop

		; deal with DC
		movd	mm0, [ecx]			; low word = coeff[0]
		pmullw	mm0, [esp + 16]			; dcscalar (8-byte read; only the low word result is stored)
		movq	mm2, [mmx_32767_minus_2047]
		paddsw	mm0, mm2			; clamp the positive side
		psubsw	mm0, mm2
		movq	mm3, [mmx_32768_minus_2048]
		psubsw	mm0, mm3			; clamp the negative side
		paddsw	mm0, mm3
		movd	eax, mm0
		mov	[edx], ax			; data[0] = DC

		ret


;===========================================================================
;
; void dequant_intra_xmm(int16_t *data,
;					const int16_t * const coeff,
;					const uint32_t quant,
;					const uint32_t dcscalar);
;
; Stack arguments: [esp+4] data (destination), [esp+8] coeff (source),
;                  [esp+12] quant, [esp+16] dcscalar
; Clobbers eax, ecx, edx, flags, mm0-mm7; no emms is executed here.
; mmx_add, mmx_mul, mmx_2047, mmx_32767_minus_2047 and
; mmx_32768_minus_2048 are defined elsewhere in the file.
;
;===========================================================================

		; this is the same as dequant_intra_mmx, except that we're
		; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)

align ALIGN
cglobal dequant_intra_xmm
dequant_intra_xmm:

		mov	edx, [esp + 4]			; data
		mov	ecx, [esp + 8]			; coeff
		mov	eax, [esp + 12]			; quant
		movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
		movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
		mov	eax, -16			; index runs -16..0 (quadwords from buffer end)

align ALIGN
.loop:
		movq	mm0, [ecx + 8*eax + 8*16]	; c  = coeff[i]
		movq	mm3, [ecx + 8*eax + 8*16 + 8]	; c' = coeff[i+1]
		pxor	mm1, mm1
		pxor	mm4, mm4
		pcmpgtw	mm1, mm0			; sign(c)
		pcmpgtw	mm4, mm3			; sign(c')
		pxor	mm2, mm2
		pxor	mm5, mm5
		pcmpeqw	mm2, mm0			; c is zero
		pcmpeqw	mm5, mm3			; c' is zero
		pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
		pandn	mm5, mm6
		pxor	mm0, mm1			; negate if negative
		pxor	mm3, mm4			; negate if negative
		psubw	mm0, mm1
		psubw	mm3, mm4
		pmullw	mm0, mm7			; *= 2Q
		pmullw	mm3, mm7			; *= 2Q
		paddw	mm0, mm2			; + offset
		paddw	mm3, mm5			; + offset
		paddw	mm0, mm1			; negate back (first half: add the mask)
		paddw	mm3, mm4			; negate back

		; saturates to +2047
		movq	mm2, [mmx_2047]
		pminsw	mm0, mm2
		add	eax, 2				; sets ZF for the jnz below; MMX ops leave flags alone
		pminsw	mm3, mm2

		pxor	mm0, mm1			; negate back (second half: xor the mask)
		pxor	mm3, mm4
		movq	[edx + 8*eax + 8*16 - 2*8], mm0	; -2*8 compensates the early add eax, 2
		movq	[edx + 8*eax + 8*16 + 8 - 2*8], mm3
		jnz	near .loop

		; deal with DC
		movd	mm0, [ecx]			; low word = coeff[0]
		pmullw	mm0, [esp + 16]			; dcscalar (8-byte read; only the low word result is stored)
		movq	mm2, [mmx_32767_minus_2047]
		paddsw	mm0, mm2			; clamp the positive side
		psubsw	mm0, mm2
		movq	mm2, [mmx_32768_minus_2048]
		psubsw	mm0, mm2			; clamp the negative side
		paddsw	mm0, mm2
		movd	eax, mm0
		mov	[edx], ax			; data[0] = DC

		ret


;===========================================================================
;
; void dequant_intra_sse2(int16_t *data,
;					const int16_t * const coeff,
;					const uint32_t quant,
;					const uint32_t dcscalar);
;
; Stack arguments: [esp+4] data (destination), [esp+8] coeff (source),
;                  [esp+12] quant, [esp+16] dcscalar
; Same computation as the MMX intra routines, 16 values per iteration.
; Both buffers are accessed with movdqa, which faults on addresses that are
; not 16-byte aligned.
; Clobbers eax, ecx, edx, flags, mm0, mm2, mm6, mm7, xmm0-xmm7;
; no emms is executed here.
; mmx_add, mmx_mul, sse2_2047, mmx_32767_minus_2047 and
; mmx_32768_minus_2048 are defined elsewhere in the file.
;
;===========================================================================

align ALIGN
cglobal dequant_intra_sse2
dequant_intra_sse2:

		mov	edx, [esp + 4]			; data
		mov	ecx, [esp + 8]			; coeff
		mov	eax, [esp + 12]			; quant
		movq	mm6, [mmx_add + eax * 8 - 8]	; rounding offset for this quant
		movq	mm7, [mmx_mul + eax * 8 - 8]	; multiplier for this quant
		movq2dq	xmm6, mm6
		movq2dq	xmm7, mm7
		movlhps	xmm6, xmm6			; broadcast both to all 8 words
		movlhps	xmm7, xmm7
		mov	eax, -16			; index runs -16..0 (quadwords from buffer end)

align ALIGN
.loop:
		movdqa	xmm0, [ecx + 8*16 + 8*eax]	; c  = coeff[i]
		movdqa	xmm3, [ecx + 8*16 + 8*eax + 16]
		pxor	xmm1, xmm1
		pxor	xmm4, xmm4
		pcmpgtw	xmm1, xmm0			; sign(c)
		pcmpgtw	xmm4, xmm3
		pxor	xmm2, xmm2
		pxor	xmm5, xmm5
		pcmpeqw	xmm2, xmm0			; c is zero
		pcmpeqw	xmm5, xmm3
		pandn	xmm2, xmm6			; offset = isZero ? 0 : quant_add
		pandn	xmm5, xmm6
		pxor	xmm0, xmm1			; negate if negative
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		pmullw	xmm0, xmm7			; *= 2Q
		pmullw	xmm3, xmm7
		paddw	xmm0, xmm2			; + offset
		paddw	xmm3, xmm5
		paddw	xmm0, xmm1			; negate back (first half: add the mask)
		paddw	xmm3, xmm4

		; saturates to +2047
		movdqa	xmm2, [sse2_2047]
		pminsw	xmm0, xmm2
		add	eax, 4				; sets ZF for the jnz below; SSE ops leave flags alone
		pminsw	xmm3, xmm2

		pxor	xmm0, xmm1			; negate back (second half: xor the mask)
		pxor	xmm3, xmm4
		movdqa	[edx + 8*16 - 8*4 + 8*eax], xmm0	; -8*4 compensates the early add eax, 4
		movdqa	[edx + 8*16 - 8*4 + 8*eax + 16], xmm3
		jnz	near .loop

		; deal with DC
		movd	mm0, [ecx]			; low word = coeff[0]
		pmullw	mm0, [esp + 16]			; dcscalar (8-byte read; only the low word result is stored)
		movq	mm2, [mmx_32767_minus_2047]
		paddsw	mm0, mm2			; clamp the positive side
		psubsw	mm0, mm2
		movq	mm2, [mmx_32768_minus_2048]
		psubsw	mm0, mm2			; clamp the negative side
		paddsw	mm0, mm2
		movd	eax, mm0
		mov	[edx], ax			; data[0] = DC

		ret


;===========================================================================
;
; void dequant_inter_mmx(int16_t * data,
;					const int16_t * const coeff,
;					const uint32_t quant);
;
; Stack arguments: [esp+4] data (destination), [esp+8] coeff (source),
;                  [esp+12] quant
; Dequantizes 64 int16 values, 8 per iteration; no special DC handling.
; Clobbers eax, ecx, edx, flags, mm0-mm7; no emms is executed here.
; mmx_add, mmx_mul and mmx_32767_minus_2047 are defined elsewhere in the
; file.
;
;===========================================================================

align ALIGN
cglobal dequant_inter_mmx
dequant_inter_mmx:

		mov	edx, [esp + 4]			; data
		mov	ecx, [esp + 8]			; coeff
		mov	eax, [esp + 12]			; quant
		movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
		movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
		mov	eax, -16			; index runs -16..0 (quadwords from buffer end)

align ALIGN
.loop:
		movq	mm0, [ecx + 8*eax + 8*16]	; c  = coeff[i]
		movq	mm3, [ecx + 8*eax + 8*16 + 8]	; c' = coeff[i+1]
		pxor	mm1, mm1
		pxor	mm4, mm4
		pcmpgtw	mm1, mm0			; sign(c)
		pcmpgtw	mm4, mm3			; sign(c')
		pxor	mm2, mm2
		pxor	mm5, mm5
		pcmpeqw	mm2, mm0			; c is zero
		pcmpeqw	mm5, mm3			; c' is zero
		pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
		pandn	mm5, mm6
		pxor	mm0, mm1			; negate if negative
		pxor	mm3, mm4			; negate if negative
		psubw	mm0, mm1
		psubw	mm3, mm4
		pmullw	mm0, mm7			; *= 2Q
		pmullw	mm3, mm7			; *= 2Q
		paddw	mm0, mm2			; + offset
		paddw	mm3, mm5			; + offset
		paddw	mm0, mm1			; negate back (first half: add the mask)
		paddw	mm3, mm4			; negate back

		; saturates to +2047
		movq	mm2, [mmx_32767_minus_2047]
		add	eax, 2				; sets ZF for the jnz below; MMX ops leave flags alone
		paddsw	mm0, mm2			; saturating add then subtract clamps the top
		paddsw	mm3, mm2
		psubsw	mm0, mm2
		psubsw	mm3, mm2

		pxor	mm0, mm1			; negate back (second half: xor the mask)
		pxor	mm3, mm4
		movq	[edx + 8*eax + 8*16 - 2*8], mm0	; -2*8 compensates the early add eax, 2
		movq	[edx + 8*eax + 8*16 + 8 - 2*8], mm3
		jnz	near .loop

		ret


;===========================================================================
;
; void dequant_inter_xmm(int16_t * data,
;					const int16_t * const coeff,
;					const uint32_t quant);
;
; Stack arguments: [esp+4] data (destination), [esp+8] coeff (source),
;                  [esp+12] quant
; Clobbers eax, ecx, edx, flags, mm0-mm7; no emms is executed here.
; mmx_add, mmx_mul and mmx_2047 are defined elsewhere in the file.
;
;===========================================================================

		; this is the same as dequant_inter_mmx,
		; except that we're saturating using 'pminsw' (saves 2 cycles/loop)

align ALIGN
cglobal dequant_inter_xmm
dequant_inter_xmm:

		mov	edx, [esp + 4]			; data
		mov	ecx, [esp + 8]			; coeff
		mov	eax, [esp + 12]			; quant
		movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
		movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
		mov	eax, -16			; index runs -16..0 (quadwords from buffer end)

align ALIGN
.loop:
		movq	mm0, [ecx + 8*eax + 8*16]	; c  = coeff[i]
		movq	mm3, [ecx + 8*eax + 8*16 + 8]	; c' = coeff[i+1]
		pxor	mm1, mm1
		pxor	mm4, mm4
		pcmpgtw	mm1, mm0			; sign(c)
		pcmpgtw	mm4, mm3			; sign(c')
		pxor	mm2, mm2
		pxor	mm5, mm5
		pcmpeqw	mm2, mm0			; c is zero
		pcmpeqw	mm5, mm3			; c' is zero
		pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
		pandn	mm5, mm6
		pxor	mm0, mm1			; negate if negative
		pxor	mm3, mm4			; negate if negative
		psubw	mm0, mm1
		psubw	mm3, mm4
		pmullw	mm0, mm7			; *= 2Q
		pmullw	mm3, mm7			; *= 2Q
		paddw	mm0, mm2			; + offset
		paddw	mm3, mm5			; + offset
		paddw	mm0, mm1			; start restoring sign
		paddw	mm3, mm4			; start restoring sign

		; saturates to +2047
		movq	mm2, [mmx_2047]
		pminsw	mm0, mm2
		add	eax, 2				; sets ZF for the jnz below; MMX ops leave flags alone
		pminsw	mm3, mm2

		pxor	mm0, mm1			; finish restoring sign
		pxor	mm3, mm4			; finish restoring sign
		movq	[edx + 8*eax + 8*16 - 2*8], mm0	; -2*8 compensates the early add eax, 2
		movq	[edx + 8*eax + 8*16 + 8 - 2*8], mm3
		jnz	near .loop

		ret


;===========================================================================
;
; void dequant_inter_sse2(int16_t * data,
;					const int16_t * const coeff,
;					const uint32_t quant);
;
; Stack arguments: [esp+4] data (destination), [esp+8] coeff (source),
;                  [esp+12] quant
; Same computation as dequant_inter_xmm, 16 values per iteration.
; Both buffers are accessed with movdqa, which faults on addresses that are
; not 16-byte aligned.
; Clobbers eax, ecx, edx, flags, mm6, mm7, xmm0-xmm7; no emms is executed
; here.
; mmx_add, mmx_mul and sse2_2047 are defined elsewhere in the file.
;
;===========================================================================

align ALIGN
cglobal dequant_inter_sse2
dequant_inter_sse2:

		mov	edx, [esp + 4]			; data
		mov	ecx, [esp + 8]			; coeff
		mov	eax, [esp + 12]			; quant
		movq	mm6, [mmx_add + eax * 8 - 8]	; rounding offset for this quant
		movq	mm7, [mmx_mul + eax * 8 - 8]	; multiplier for this quant
		movq2dq	xmm6, mm6
		movq2dq	xmm7, mm7
		movlhps	xmm6, xmm6			; broadcast both to all 8 words
		movlhps	xmm7, xmm7
		mov	eax, -16			; index runs -16..0 (quadwords from buffer end)

align ALIGN
.loop:
		movdqa	xmm0, [ecx + 8*16 + 8*eax]	; c  = coeff[i]
		movdqa	xmm3, [ecx + 8*16 + 8*eax + 16]
		pxor	xmm1, xmm1
		pxor	xmm4, xmm4
		pcmpgtw	xmm1, xmm0			; sign(c)
		pcmpgtw	xmm4, xmm3
		pxor	xmm2, xmm2
		pxor	xmm5, xmm5
		pcmpeqw	xmm2, xmm0			; c is zero
		pcmpeqw	xmm5, xmm3
		pandn	xmm2, xmm6			; offset = isZero ? 0 : quant_add
		pandn	xmm5, xmm6
		pxor	xmm0, xmm1			; negate if negative
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		pmullw	xmm0, xmm7			; *= 2Q
		pmullw	xmm3, xmm7
		paddw	xmm0, xmm2			; + offset
		paddw	xmm3, xmm5
		paddw	xmm0, xmm1			; start restoring sign
		paddw	xmm3, xmm4

		; saturates to +2047
		movdqa	xmm2, [sse2_2047]
		pminsw	xmm0, xmm2
		add	eax, 4				; sets ZF for the jnz below; SSE ops leave flags alone
		pminsw	xmm3, xmm2

		pxor	xmm0, xmm1			; finish restoring sign
		pxor	xmm3, xmm4
		movdqa	[edx + 8*16 - 8*4 + 8*eax], xmm0	; -8*4 compensates the early add eax, 4
		movdqa	[edx + 8*16 - 8*4 + 8*eax + 16], xmm3
		jnz	near .loop

		ret

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -