⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quantize4_mmx.asm

📁 MPEG-4编解码的实现(包括MPEG4视音频编解码)
💻 ASM
📖 第 1 页 / 共 2 页
字号:
		pmulhw  mm0, mm2		; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]

		movq    mm2, [intra_matrix + 8*ecx + 8]
		psrlw   mm2, 1
		paddw   mm3, mm2

		movq    mm2, [intra_matrix_fix + ecx*8 + 8]
		pmulhw  mm3, mm2

        paddw   mm0, mm5
		paddw   mm3, mm5

		psrlw	mm0, 2			; mm0 >>= 1   (/4)
		psrlw	mm3, 2			;
		
		pxor	mm0, mm1		; mm0 *= sign(mm0)
		pxor	mm3, mm4        ;
		psubw	mm0, mm1		; undisplace
		psubw	mm3, mm4		;
		
		movq	[edi + 8*ecx], mm0
		movq	[edi + 8*ecx + 8], mm3

		add ecx,2
		cmp ecx,16
		jnz	near .q2loop
		jmp	near .done


;===========================================================================
;
; uint32_t quant4_inter_mmx(int16_t * coeff,
;					const int16_t * const data,
;					const uint32_t quant);
;
; Quantizes 64 signed 16-bit values read from `data` (8 iterations, two
; quadwords = 8 words per iteration) and stores the signed results to
; `coeff`.  Per value the visible arithmetic is:
;
;     t = ((|d| << 4) + (inter_matrix[i] >> 1)) * inter_matrix_fix[i] >> 16
;     q = t scaled by the quantizer (see the three loops below)
;     coeff[i] = d < 0 ? -q : q
;
; The return value (eax) is the sum of all q magnitudes accumulated in mm5.
;
; Arguments are read from the stack: coeff, data, quant (in that order).
; Only the low byte of quant (al) is examined to select the special-case
; loops for quant == 1 and quant == 2.
;
; Register roles:
;     edi = coeff (destination)      esi = data (source)
;     ecx = quadword index, 0..14 step 2
;     mm0/mm3 = the two quadwords being processed
;     mm1/mm4 = sign masks for mm0/mm3 (0xFFFF where the word is negative)
;     mm2     = scratch for matrix loads
;     mm5     = running per-lane sum of magnitudes
;     mm7     = entry from mmx_div selected by quant (general loop only)
;
; ecx, esi and edi are saved and restored; eax and mm0-mm5, mm7 are
; clobbered.  No emms is issued in this routine.
; NOTE(review): callers presumably execute emms themselves -- confirm.
; NOTE(review): inter_matrix_fix is presumably a table of 16-bit
; fixed-point reciprocals of inter_matrix, and mmx_div a per-quant
; reciprocal table; both are defined outside this view -- confirm.
;
;===========================================================================

align ALIGN
cglobal quant4_inter_mmx
		quant4_inter_mmx

		push	ecx
		push	esi
		push	edi

		; 12 bytes of saved registers + 4 bytes of return address
		; precede the first argument.
		mov	edi, [esp + 12 + 4]		; coeff
		mov	esi, [esp + 12 + 8]		; data
		mov	eax, [esp + 12 + 12]	; quant

		xor ecx, ecx					; quadword index = 0

		pxor mm5, mm5					; sum = 0

		cmp	al, 1						; quant == 1: shift-only path
		jz  near .q1loop

		cmp	al, 2						; quant == 2: shift-only path
		jz  near .q2loop

		movq	mm7, [mmx_div + eax * 8 - 8]	; divider entry for (quant - 1)

; General path: quant is neither 1 nor 2; scale by mm7 then shift by 1.
align ALIGN
.loop
		movq	mm0, [esi + 8*ecx]		; mm0 = data words 4*ecx .. 4*ecx+3
		movq	mm3, [esi + 8*ecx + 8]	; mm3 = the following four words
		pxor	mm1, mm1		; mm1 = 0
		pxor	mm4, mm4		; mm4 = 0
		pcmpgtw	mm1, mm0		; mm1 = (0 > mm0) ? 0xFFFF : 0  (sign mask)
		pcmpgtw	mm4, mm3		; mm4 = sign mask of mm3
		pxor	mm0, mm1		; xor + sub with the mask negates the
		pxor	mm3, mm4		; negative lanes ...
		psubw	mm0, mm1		; ... so mm0 = |mm0|
		psubw	mm3, mm4		; and mm3 = |mm3|

		psllw   mm0, 4			; level << 4
		psllw   mm3, 4
		
		movq    mm2, [inter_matrix + 8*ecx]
		psrlw   mm2, 1			; inter_matrix[i] >> 1 (rounding term)
		paddw   mm0, mm2
		
		movq    mm2, [inter_matrix_fix + ecx*8]
		pmulhw  mm0, mm2		; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

		movq    mm2, [inter_matrix + 8*ecx + 8]
		psrlw   mm2, 1			; same rounding term for the second quadword
		paddw   mm3, mm2

		movq    mm2, [inter_matrix_fix + ecx*8 + 8]
		pmulhw  mm3, mm2		; high 16 bits of the signed product

		pmulhw	mm0, mm7		; mm0 = (mm0 / 2Q) >> 16
		pmulhw	mm3, mm7		; same for mm3
		psrlw   mm0, 1			; additional shift by 1 => 16 + 1 = 17
		psrlw   mm3, 1
		
		paddw	mm5, mm0		; sum += magnitude (before the sign is restored)
		pxor	mm0, mm1		; restore the original sign: xor with mask ...
		paddw	mm5, mm3		; sum += magnitude of second quadword
		pxor	mm3, mm4		;
		psubw	mm0, mm1		; ... then subtract the mask
		psubw	mm3, mm4
		movq	[edi + 8*ecx], mm0		; store four quantized words
		movq	[edi + 8*ecx + 8], mm3	; store the next four

		add ecx, 2	
		cmp ecx, 16				; 16 quadwords = 64 words
		jnz near .loop

; Common exit: reduce the four 16-bit lane sums in mm5 to one 32-bit value.
.done
		; pmaddwd multiplies each word by the matching word of mmx_one and
		; adds adjacent products into two dwords.
		; NOTE(review): mmx_one presumably holds four words of 1 -- confirm.
		pmaddwd mm5, [mmx_one]
		movq    mm0, mm5
		psrlq   mm5, 32			; bring the high dword down
		paddd   mm0, mm5		; low dword = total of both dwords
		movd	eax, mm0		; return sum

		pop	edi
		pop	esi
		pop ecx

		ret

; quant == 1 path: the scaling by mm7 is replaced by a single right shift.
align ALIGN
.q1loop
		movq	mm0, [esi + 8*ecx]		; mm0 = data words 4*ecx .. 4*ecx+3
		movq	mm3, [esi + 8*ecx+ 8]
				; mm3 = the following four words
		pxor	mm1, mm1		; mm1 = 0
		pxor	mm4, mm4		; mm4 = 0

		pcmpgtw	mm1, mm0		; mm1 = (0 > mm0) ? 0xFFFF : 0  (sign mask)
		pcmpgtw	mm4, mm3		; mm4 = sign mask of mm3

		pxor	mm0, mm1		; mm0 = |mm0| (together with the psubw below)
		pxor	mm3, mm4		; mm3 = |mm3|
		psubw	mm0, mm1		; displace
		psubw	mm3, mm4		;
		
		psllw   mm0, 4			; level << 4
		psllw   mm3, 4
		
		movq    mm2, [inter_matrix + 8*ecx]
		psrlw   mm2, 1			; inter_matrix[i] >> 1 (rounding term)
		paddw   mm0, mm2
		
		movq    mm2, [inter_matrix_fix + ecx*8]
		pmulhw  mm0, mm2		; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

		movq    mm2, [inter_matrix + 8*ecx + 8]
		psrlw   mm2, 1
		paddw   mm3, mm2

		movq    mm2, [inter_matrix_fix + ecx*8 + 8]
		pmulhw  mm3, mm2
 
		psrlw	mm0, 1			; mm0 >>= 1   (/2)
		psrlw	mm3, 1			; mm3 >>= 1   (/2)

		paddw	mm5, mm0		; sum += magnitude (before the sign is restored)
		pxor	mm0, mm1		; restore the original sign: xor with mask ...
		paddw	mm5, mm3		; sum += magnitude of second quadword
		pxor	mm3, mm4		;
		psubw	mm0, mm1		; ... then subtract the mask
		psubw	mm3, mm4

		movq	[edi + 8*ecx], mm0		; store four quantized words
		movq	[edi + 8*ecx + 8], mm3	; store the next four
		
		add ecx,2
		cmp ecx,16				; 16 quadwords = 64 words
		jnz	near .q1loop

		jmp	.done


; quant == 2 path: the scaling by mm7 is replaced by a right shift of 2.
align ALIGN
.q2loop
		movq	mm0, [esi + 8*ecx]		; mm0 = data words 4*ecx .. 4*ecx+3
		movq	mm3, [esi + 8*ecx+ 8]
				; mm3 = the following four words
		pxor	mm1, mm1		; mm1 = 0
		pxor	mm4, mm4		; mm4 = 0

		pcmpgtw	mm1, mm0		; mm1 = (0 > mm0) ? 0xFFFF : 0  (sign mask)
		pcmpgtw	mm4, mm3		; mm4 = sign mask of mm3

		pxor	mm0, mm1		; mm0 = |mm0| (together with the psubw below)
		pxor	mm3, mm4		; mm3 = |mm3|
		psubw	mm0, mm1		; displace
		psubw	mm3, mm4		;
		
		psllw   mm0, 4			; level << 4
		psllw   mm3, 4
		
		movq    mm2, [inter_matrix + 8*ecx]
		psrlw   mm2, 1			; inter_matrix[i] >> 1 (rounding term)
		paddw   mm0, mm2
		
		movq    mm2, [inter_matrix_fix + ecx*8]
		pmulhw  mm0, mm2		; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

		movq    mm2, [inter_matrix + 8*ecx + 8]
		psrlw   mm2, 1
		paddw   mm3, mm2

		movq    mm2, [inter_matrix_fix + ecx*8 + 8]
		pmulhw  mm3, mm2
 
		psrlw	mm0, 2			; mm0 >>= 2   (/4)
		psrlw	mm3, 2			; mm3 >>= 2   (/4)

		paddw	mm5, mm0		; sum += magnitude (before the sign is restored)
		pxor	mm0, mm1		; restore the original sign: xor with mask ...
		paddw	mm5, mm3		; sum += magnitude of second quadword
		pxor	mm3, mm4		;
		psubw	mm0, mm1		; ... then subtract the mask
		psubw	mm3, mm4

		movq	[edi + 8*ecx], mm0		; store four quantized words
		movq	[edi + 8*ecx + 8], mm3	; store the next four
		
		add ecx,2
		cmp ecx,16				; 16 quadwords = 64 words
		jnz	near .q2loop

		jmp	.done


;===========================================================================
;
; void dequant4_intra_mmx(int16_t *data,
;                    const int16_t * const coeff,
;                    const uint32_t quant,
;                    const uint32_t dcscalar);
;
; Dequantizes 64 signed 16-bit values from `coeff` into `data`
; (16 iterations, one quadword = 4 words each).  Per value:
;
;     m = (|c| * mmx_mul_quant[quant-1]) truncated to 16 bits
;     v = (m * intra_matrix[i]) >> 3        (32-bit product, logical shift)
;     data[i] = c == 0 ? 0 : (c < 0 ? -v : v)
;
; After the loop data[0] is overwritten with the low 16 bits of
; coeff[0] * dcscalar.  When SATURATE is defined, data[0] is clamped to
; [-2048, 2047] and the other values go through the saturating add/sub
; sequence below.
;
; Register roles:
;     edi = data (destination)       esi = coeff (source)
;     eax = quant on entry, then the quadword index 0..15
;     mm7 = table entry selected by quant
; esi and edi are saved and restored; eax and MMX registers are clobbered.
; No emms is issued in this routine.
; NOTE(review): mmx_mul_quant[quant-1] presumably holds four copies of
; quant; the table is defined outside this view -- confirm.
;
;===========================================================================

align 16
cglobal dequant4_intra_mmx
dequant4_intra_mmx

        push    esi
        push    edi

        ; 8 bytes of saved registers + 4 bytes of return address precede
        ; the first argument.
        mov    edi, [esp + 8 + 4]        ; data
        mov    esi, [esp + 8 + 8]        ; coeff
        mov    eax, [esp + 8 + 12]        ; quant
        
        movq mm7, [mmx_mul_quant  + eax*8 - 8]    ; entry for (quant - 1)
    
        xor eax, eax                    ; eax is reused as the quadword index


align 16        
.loop
        movq    mm0, [esi + 8*eax]        ; mm0 = [coeff]
        
        pxor    mm1, mm1                ; mm1 = 0
        pcmpeqw    mm1, mm0                ; mm1 = (0 == mm0)  (zero mask)

        pxor    mm2, mm2                ; mm2 = 0
        pcmpgtw    mm2, mm0                ; mm2 = (0 > mm0)  (sign mask)
        pxor    mm0, mm2                ; mm0 = |mm0| (with the psubw below)
        psubw    mm0, mm2                ; displace

        pmullw    mm0, mm7                ; mm0 *= quant (low 16 bits kept)
        
        movq    mm3, [intra_matrix + 8*eax]

        ; Build the full 32-bit products of mm0 and intra_matrix from their
        ; low (pmullw) and high (pmulhw) halves.
        ; NOTE(review): pmulhw is a signed multiply; if |coeff| * quant
        ; reaches 0x8000 the high half is computed as for a negative value.
        ; Verify the input range guarantees this cannot happen.
        movq  mm4, mm0                    ; mm4 = copy of |coeff| * quant
        pmullw    mm0, mm3                ; mm0 = low(mm0 * mm3)
        pmulhw    mm3, mm4                ; mm3 = high(mm0 * mm3)

        movq    mm4, mm0                ; mm0,mm4 = unpack(mm3, mm0)
        punpcklwd mm0, mm3                ; lanes 0,1 as dwords
        punpckhwd mm4, mm3                ; lanes 2,3 as dwords
        psrld mm0, 3                    ; mm0,mm4 /= 8
        psrld mm4, 3                    ;
        packssdw mm0, mm4                ; mm0 = pack(mm4, mm0), signed saturation

        pxor    mm0, mm2                ; restore the original sign
        psubw    mm0, mm2                ; undisplace
        pandn    mm1, mm0                ; mm1 = ~(iszero) & mm0

%ifdef SATURATE
        ; Saturating add then subtract of the same constant caps the upper
        ; bound; the subtract/add pair with the second constant caps the
        ; lower bound.
        ; NOTE(review): assuming the constants hold the values their names
        ; suggest, this clamps each word to [-2048, 2047] -- confirm.
        movq mm2, [mmx_32767_minus_2047] 
        movq mm6, [mmx_32768_minus_2048] 
        paddsw    mm1, mm2
        psubsw    mm1, mm2
        psubsw    mm1, mm6
        paddsw    mm1, mm6
%endif

        movq    [edi + 8*eax], mm1        ; [data] = mm1

        add eax, 1
        cmp eax, 16                     ; 16 quadwords = 64 words
        jnz    near .loop

        ; DC term: overwrite data[0] computed by the loop above.
        mov    ax, [esi]                    ; ax = coeff[0]
        imul     ax, [esp + 8 + 16]        ; ax = coeff[0] * low word of dcscalar (16-bit result)
        mov    [edi], ax                    ; data[0] = ax

%ifdef SATURATE
        cmp ax, -2048                   ; signed compare against the lower bound
        jl .set_n2048
        cmp ax, 2047                    ; signed compare against the upper bound
        jg .set_2047
%endif

        pop    edi
        pop    esi
        ret

%ifdef SATURATE
; data[0] was below -2048: store the lower bound and return.
.set_n2048
        mov    word [edi], -2048
        pop    edi
        pop    esi
        ret
    
; data[0] was above 2047: store the upper bound and return.
.set_2047
        mov    word [edi], 2047
        pop    edi
        pop    esi

		ret
%endif



;===========================================================================
;
; void dequant4_inter_mmx(int16_t * data,
;                    const int16_t * const coeff,
;                    const uint32_t quant);
;
; Dequantizes 64 signed 16-bit values from `coeff` into `data`
; (16 iterations, one quadword = 4 words each).  Per value:
;
;     m = ((2*|c| + 1) * mmx_mul_quant[quant-1]) truncated to 16 bits
;     v = (m * inter_matrix[i]) >> 4        (32-bit product)
;     data[i] = c == 0 ? 0 : (c < 0 ? -v : v), then saturated
;
; The saturation sequence is always assembled here: its %ifdef SATURATE
; guard is commented out.
;
; Mismatch control: all stored words are XORed together; if bit 0 of the
; combined value is clear, bit 0 of data[63] is toggled.
;
; Register roles:
;     edi = data (destination)       esi = coeff (source)
;     eax = quant on entry, then the quadword index 0..15
;     mm5 = running XOR of all output quadwords
;     mm6 = [mmx_one], mm7 = table entry selected by quant
; esi and edi are saved and restored; eax and MMX registers are clobbered.
; No emms is issued in this routine.
; NOTE(review): mmx_one presumably holds four words of 1 and
; mmx_mul_quant[quant-1] four copies of quant; both tables are defined
; outside this view -- confirm.
;
;===========================================================================

align 16
cglobal dequant4_inter_mmx
dequant4_inter_mmx

        push    esi
        push    edi
		
        ; 8 bytes of saved registers + 4 bytes of return address precede
        ; the first argument.
        mov    edi, [esp + 8 + 4]        ; data
        mov    esi, [esp + 8 + 8]        ; coeff
        mov    eax, [esp + 8 + 12]        ; quant
        movq mm7, [mmx_mul_quant  + eax*8 - 8]    ; entry for (quant - 1)
        movq mm6, [mmx_one]
        xor eax, eax         ; eax is reused as the quadword index
        pxor mm5, mm5        ; mismatch sum


align 16        
.loop
        movq    mm0, [esi + 8*eax]                        ; mm0 = [coeff]

        pxor    mm1, mm1                ; mm1 = 0
        pcmpeqw    mm1, mm0                ; mm1 = (0 == mm0)  (zero mask)

        pxor    mm2, mm2                ; mm2 = 0
        pcmpgtw    mm2, mm0                ; mm2 = (0 > mm0)  (sign mask)
        pxor    mm0, mm2                ; mm0 = |mm0| (with the psubw below)
        psubw    mm0, mm2                ; displace

        psllw    mm0, 1                ; mm0 = 2*|coeff|
        paddsw    mm0, mm6            ; mm0 = 2*mm0 + 1
        pmullw    mm0, mm7            ; mm0 *= quant (low 16 bits kept)

        movq    mm3, [inter_matrix + 8*eax]

        ; Build the full 32-bit products from their low (pmullw) and high
        ; (pmulhw, signed) halves.
        movq  mm4, mm0
        pmullw    mm0, mm3            ; mm0 = low(mm0 * mm3)
        pmulhw    mm3, mm4            ; mm3 = high(mm0 * mm3)

        movq    mm4, mm0            ; mm0,mm4 = unpack(mm3, mm0)
        punpcklwd mm0, mm3            ; lanes 0,1 as dwords
        punpckhwd mm4, mm3            ; lanes 2,3 as dwords

        psrad mm0, 4                ; mm0,mm4 /= 16 (arithmetic shift)
        psrad mm4, 4                ;
        packssdw mm0, mm4            ; mm0 = pack(mm4, mm0), signed saturation

        pxor    mm0, mm2            ; restore the original sign
        psubw    mm0, mm2            ; undisplace
        pandn    mm1, mm0            ; mm1 = ~(iszero) & mm0 (zero input -> zero output)


; The guard below is commented out, so this saturation always runs.
; NOTE(review): assuming the constants hold the values their names suggest,
; this clamps each word to [-2048, 2047] -- confirm.
;%ifdef SATURATE
        movq mm2, [mmx_32767_minus_2047] 
        movq mm4, [mmx_32768_minus_2048]
        paddsw    mm1, mm2
        psubsw    mm1, mm2
        psubsw    mm1, mm4
        paddsw    mm1, mm4
;%endif

        pxor mm5, mm1        ; mismatch: fold this quadword into the running XOR
    
        movq    [edi + 8*eax], mm1        ; [data] = mm1

        add eax, 1
        cmp eax, 16                 ; 16 quadwords = 64 words
        jnz    near .loop

        ; mismatch control
        ; XOR the four words of mm5 together into its low word; bit 0 of
        ; that word is the XOR of bit 0 of every stored value.

        movq mm0, mm5
        movq mm1, mm5
        movq mm2, mm5
        psrlq mm0, 48               ; word 3 -> low word
        psrlq mm1, 32               ; word 2 -> low word
        psrlq mm2, 16               ; word 1 -> low word
        pxor mm5, mm0
        pxor mm5, mm1
        pxor mm5, mm2

        movd eax, mm5
        test eax, 0x1               ; combined bit 0 set: nothing to do
        jnz .done

        xor word [edi + 2*63], 1    ; otherwise toggle bit 0 of data[63]

.done    
        pop    edi
        pop    esi

        ret

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -