; File: interpolate8x8_mmx.asm
; (code-viewer page residue, not part of the source: "Font size:" control)
paddb mm0, mm4
paddb mm1, mm3
paddb mm2, mm1
paddb mm2, [mmx_two]
pand mm2, [mmx_mask2]
psrlq mm2, 2
paddb mm0, mm2
lea esi, [esi+edx]
lea edi, [edi+edx]
movq [ecx], mm0 ; (src1 + src2 + src3 + src4 + 2) / 4 -> dst
%endmacro
; AVG4_MMX_RND1: average one 8-byte row taken from four sources and store it
; at [ecx]. This is the variant that adds mmx_one before the final divide.
;
; In:       eax, ebx, esi, edi = pointers to the current row of src1..src4
;           ecx = destination row, edx = byte offset to the next row
; Out:      eax, ebx, esi, edi are each advanced by edx; ecx is NOT advanced
; Clobbers: mm0-mm5
;
; The bytes are never unpacked to words. Each byte is split into its upper
; part (pand mmx_mask2, then shifted right by 2) and its low part
; (pand mmx_three); the two parts are summed separately and recombined.
; NOTE(review): mmx_three, mmx_mask2 and mmx_one are defined outside this
; view; presumably 0x03, 0xFC and 0x01 replicated in every byte -- confirm.
; Under that assumption the stored value is (src1+src2+src3+src4+1) / 4.
%macro AVG4_MMX_RND1 0
movq mm0, [eax] ; src1 -> mm0
movq mm1, [ebx] ; src2 -> mm1
movq mm2, mm0 ; copies that will keep only the low bits
movq mm3, mm1
pand mm2, [mmx_three] ; mm2 = low part of src1
pand mm3, [mmx_three] ; mm3 = low part of src2
pand mm0, [mmx_mask2] ; mask before the 64-bit shift so that no bits
pand mm1, [mmx_mask2] ; leak into the neighbouring byte lane
psrlq mm0, 2 ; mm0 = src1 / 4 (per byte)
psrlq mm1, 2 ; mm1 = src2 / 4 (per byte)
lea eax,[eax+edx] ; src1 -> next row
lea ebx,[ebx+edx] ; src2 -> next row
paddb mm0, mm1 ; mm0 = src1/4 + src2/4
paddb mm2, mm3 ; mm2 = low(src1) + low(src2)
movq mm4, [esi] ; src3 -> mm4
movq mm5, [edi] ; src4 -> mm5
movq mm1, mm4 ; mm1/mm3 are recycled for the low parts of src3/src4
movq mm3, mm5
pand mm1, [mmx_three] ; mm1 = low part of src3
pand mm3, [mmx_three] ; mm3 = low part of src4
pand mm4, [mmx_mask2]
pand mm5, [mmx_mask2]
psrlq mm4, 2 ; mm4 = src3 / 4 (per byte)
psrlq mm5, 2 ; mm5 = src4 / 4 (per byte)
paddb mm4, mm5
paddb mm0, mm4 ; mm0 = sum of the four upper parts
paddb mm1, mm3
paddb mm2, mm1 ; mm2 = sum of the four low parts
paddb mm2, [mmx_one] ; rounding term of this variant
pand mm2, [mmx_mask2]
psrlq mm2, 2 ; mm2 = (low-part sum + rounding term) / 4
paddb mm0, mm2 ; combine upper-part sum and low-part carry
lea esi,[esi+edx] ; src3 -> next row
lea edi,[edi+edx] ; src4 -> next row
movq [ecx], mm0 ; (src1 + src2 + src3 + src4 + 1) / 4 -> dst (assuming mmx_one is 1 per byte)
%endmacro
;-----------------------------------------------------------------------------
; interpolate8x8_avg4_mmx
;
; Writes eight 8-byte rows to dst. Every row is produced from the matching
; rows of four sources by AVG4_MMX_RND0 (rounding == 0) or AVG4_MMX_RND1
; (rounding != 0).
;
; Stack arguments on entry (before the pushes below):
;   [esp+ 4] dst     [esp+ 8] src1    [esp+12] src2    [esp+16] src3
;   [esp+20] src4    [esp+24] stride  [esp+28] rounding
;
; Register roles while the row macros run:
;   ecx = dst, eax = src1, ebx = src2, esi = src3, edi = src4, edx = stride
;
; ebx, edi and esi are saved and restored. eax, ecx, edx and the MMX
; registers touched here are left modified; no emms is issued.
;-----------------------------------------------------------------------------
ALIGN 16
interpolate8x8_avg4_mmx:
    push    ebx
    push    edi
    push    esi

    ; Three pushes moved esp down by 12 bytes, hence "12 +" in every offset.
    mov     eax, [esp + 12 + 28]        ; rounding
    test    eax, eax                    ; ZF is consumed by the jnz below; the
                                        ; mov/movq in between do not touch flags

    mov     ecx, [esp + 12 + 4]         ; ecx = dst
    mov     eax, [esp + 12 + 8]         ; eax = src1
    mov     ebx, [esp + 12 + 12]        ; ebx = src2
    mov     esi, [esp + 12 + 16]        ; esi = src3
    mov     edi, [esp + 12 + 20]        ; edi = src4
    mov     edx, [esp + 12 + 24]        ; edx = stride

    movq    mm7, [mmx_one]

    jnz     near .rounding1

    ; rounding == 0: rows 0..6, each followed by a dst advance, then row 7.
    ; (The row macro advances the four source pointers itself.)
%rep 7
    AVG4_MMX_RND0
    lea     ecx, [ecx+edx]
%endrep
    AVG4_MMX_RND0

    pop     esi
    pop     edi
    pop     ebx
    ret

.rounding1:
    ; rounding != 0: same row schedule with the RND1 row macro.
%rep 7
    AVG4_MMX_RND1
    lea     ecx, [ecx+edx]
%endrep
    AVG4_MMX_RND1

    pop     esi
    pop     edi
    pop     ebx
    ret
.endfunc
;-----------------------------------------------------------------------------
;
; void interpolate8x8_6tap_lowpass_h_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;-----------------------------------------------------------------------------
; LOWPASS_6TAP_H_MMX: filter one row of 8 pixels horizontally, store at [ecx].
;
; With s = the source row at eax, for every x in 0..7 the word lanes hold
;   t = 5 * (4 * (s[x] + s[x+1]) - (s[x-1] + s[x+2])) + (s[x-2] + s[x+3]) + rnd
;   dst[x] = t >> 5 (arithmetic), packed to a byte with unsigned saturation
; where rnd is whatever the caller placed in the word lanes of mm6.
;
; In:       eax = source row, ecx = destination row, edx = stride
;           mm6 = rounder words, mm7 = 0 (used as the zero half for unpacking)
; Out:      eax advanced by edx; ecx is NOT advanced
; Clobbers: mm0-mm5
; Memory:   reads source bytes eax-2 .. eax+10, writes 8 bytes at ecx
; NOTE(review): mmx_five is defined outside this view; presumably four words
; of 5 -- confirm.
%macro LOWPASS_6TAP_H_MMX 0
movq mm0, [eax] ; s[x]
movq mm2, [eax+1] ; s[x+1]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm7 ; widen to words: mm0/mm2 = pixels 0..3
punpcklbw mm2, mm7
punpckhbw mm1, mm7 ; mm1/mm3 = pixels 4..7
punpckhbw mm3, mm7
paddw mm0, mm2 ; s[x] + s[x+1]
paddw mm1, mm3
psllw mm0, 2 ; 4 * (s[x] + s[x+1])
psllw mm1, 2
movq mm2, [eax-1] ; s[x-1]
movq mm4, [eax+2] ; s[x+2]
movq mm3, mm2
movq mm5, mm4
punpcklbw mm2, mm7
punpcklbw mm4, mm7
punpckhbw mm3, mm7
punpckhbw mm5, mm7
paddw mm2, mm4 ; s[x-1] + s[x+2]
paddw mm3, mm5
psubsw mm0, mm2 ; 4*(inner pair) - (middle pair)
psubsw mm1, mm3
pmullw mm0, [mmx_five] ; scale the difference by the mmx_five words
pmullw mm1, [mmx_five]
movq mm2, [eax-2] ; s[x-2]
movq mm4, [eax+3] ; s[x+3]
movq mm3, mm2
movq mm5, mm4
punpcklbw mm2, mm7
punpcklbw mm4, mm7
punpckhbw mm3, mm7
punpckhbw mm5, mm7
paddw mm2, mm4 ; s[x-2] + s[x+3]
paddw mm3, mm5
paddsw mm0, mm2 ; add the outer pair
paddsw mm1, mm3
paddsw mm0, mm6 ; add the rounder
paddsw mm1, mm6
psraw mm0, 5 ; divide by 32, keeping the sign
psraw mm1, 5
lea eax, [eax+edx] ; source -> next row
packuswb mm0, mm1 ; clamp to 0..255 and pack the 8 results
movq [ecx], mm0
%endmacro
; Applies LOWPASS_6TAP_H_MMX to eight consecutive rows.
; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride, [esp+16] rounding.
; rounding selects an 8-byte entry of rounding_lowpass_mmx and is not
; range-checked. Leaf routine: nothing is pushed, eax/ecx/edx and
; mm0-mm7 are modified, no emms is issued.
ALIGN 16
interpolate8x8_6tap_lowpass_h_mmx:
    mov     eax, [esp + 16]             ; rounding, used only as a table index
    movq    mm6, [rounding_lowpass_mmx + eax * 8]   ; mm6 = rounder for this call

    mov     ecx, [esp + 4]              ; ecx = dst
    mov     eax, [esp + 8]              ; eax = src
    mov     edx, [esp + 12]             ; edx = stride

    pxor    mm7, mm7                    ; mm7 = 0, zero half for byte->word unpack

    ; Rows 0..6: filter, then step dst (the macro steps src itself); row 7 last.
%rep 7
    LOWPASS_6TAP_H_MMX
    lea     ecx, [ecx+edx]
%endrep
    LOWPASS_6TAP_H_MMX

    ret
.endfunc
;-----------------------------------------------------------------------------
;
; void interpolate8x8_6tap_lowpass_v_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;-----------------------------------------------------------------------------
; LOWPASS_6TAP_V_MMX: filter one row of 8 pixels vertically, store at [ecx].
;
; With r(k) = the 8 source bytes at (current row + k * stride), every word
; lane computes
;   t = 5 * (4 * (r(0) + r(1)) - (r(-1) + r(2))) + (r(-2) + r(3)) + rnd
;   dst = t >> 5 (arithmetic), packed to a byte with unsigned saturation
; where rnd is whatever the caller placed in the word lanes of mm6.
;
; In:       eax = current source row, ecx = destination row
;           edx = stride, ebx = 3 * stride (set up by the caller)
;           mm6 = rounder words, mm7 = 0 (zero half for unpacking)
; Out:      eax = next source row (net +1 * stride); ecx is NOT advanced
; Clobbers: mm0-mm5
; Memory:   reads source rows -2 .. +3 relative to the current row
; NOTE(review): mmx_five is defined outside this view; presumably four words
; of 5 -- confirm.
%macro LOWPASS_6TAP_V_MMX 0
movq mm0, [eax] ; r(0)
movq mm2, [eax+edx] ; r(1)
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm7 ; widen to words: mm0/mm2 = pixels 0..3
punpcklbw mm2, mm7
punpckhbw mm1, mm7 ; mm1/mm3 = pixels 4..7
punpckhbw mm3, mm7
paddw mm0, mm2 ; r(0) + r(1)
paddw mm1, mm3
psllw mm0, 2 ; 4 * (r(0) + r(1))
psllw mm1, 2
movq mm4, [eax+2*edx] ; r(2), fetched before eax is rewound
sub eax, ebx ; eax = current row - 3 * stride
movq mm2, [eax+2*edx] ; r(-1)  (-3 + 2)
movq mm3, mm2
movq mm5, mm4
punpcklbw mm2, mm7
punpcklbw mm4, mm7
punpckhbw mm3, mm7
punpckhbw mm5, mm7
paddw mm2, mm4 ; r(-1) + r(2)
paddw mm3, mm5
psubsw mm0, mm2 ; 4*(inner pair) - (middle pair)
psubsw mm1, mm3
pmullw mm0, [mmx_five] ; scale the difference by the mmx_five words
pmullw mm1, [mmx_five]
movq mm2, [eax+edx] ; r(-2)  (-3 + 1)
movq mm4, [eax+2*ebx] ; r(3)   (-3 + 6)
movq mm3, mm2
movq mm5, mm4
punpcklbw mm2, mm7
punpcklbw mm4, mm7
punpckhbw mm3, mm7
punpckhbw mm5, mm7
paddw mm2, mm4 ; r(-2) + r(3)
paddw mm3, mm5
paddsw mm0, mm2 ; add the outer pair
paddsw mm1, mm3
paddsw mm0, mm6 ; add the rounder
paddsw mm1, mm6
psraw mm0, 5 ; divide by 32, keeping the sign
psraw mm1, 5
lea eax, [eax+4*edx] ; -3 + 4 = +1: eax now points at the next source row
packuswb mm0, mm1 ; clamp to 0..255 and pack the 8 results
movq [ecx], mm0
%endmacro
; Applies LOWPASS_6TAP_V_MMX to eight consecutive rows.
; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride, [esp+16] rounding.
; rounding selects an 8-byte entry of rounding_lowpass_mmx and is not
; range-checked. ebx is saved and restored; eax/ecx/edx and mm0-mm7 are
; modified; no emms is issued.
ALIGN 16
interpolate8x8_6tap_lowpass_v_mmx:
    push    ebx                         ; callee-saved; shifts the argument offsets by 4

    mov     eax, [esp + 4 + 16]         ; rounding, used only as a table index
    movq    mm6, [rounding_lowpass_mmx + eax * 8]   ; mm6 = rounder for this call

    mov     ecx, [esp + 4 + 4]          ; ecx = dst
    mov     eax, [esp + 4 + 8]          ; eax = src
    mov     edx, [esp + 4 + 12]         ; edx = stride

    lea     ebx, [edx + edx*2]          ; ebx = 3 * stride, as the row macro expects

    pxor    mm7, mm7                    ; mm7 = 0, zero half for byte->word unpack

    ; Rows 0..6: filter, then step dst (the macro steps src itself); row 7 last.
%rep 7
    LOWPASS_6TAP_V_MMX
    lea     ecx, [ecx+edx]
%endrep
    LOWPASS_6TAP_V_MMX

    pop     ebx
    ret
.endfunc
;===========================================================================
;
; The next functions combine both source halfpel interpolation step and the
; averaging (with rounding) step to avoid wasting memory bandwidth computing
; intermediate halfpel images and then averaging them.
;
;===========================================================================
; PROLOG0: load the first three stack arguments into their working registers.
; Offsets are relative to esp at function entry, so this has to run before
; anything is pushed.
%macro PROLOG0 0
    mov     ecx, [esp + 4]              ; ecx = Dst
    mov     eax, [esp + 8]              ; eax = Src
    mov     edx, [esp + 12]             ; edx = BpS (the stride argument)
%endmacro
; PROLOG <rounder>, <load_dst_rounder>
;   arg 1: label of the 8-byte rounder that is loaded into mm7
;   arg 2: when non-zero, [rounding1_mmx] is additionally loaded into mm5
; Also clears mm6 (zero half for unpacking) and runs PROLOG0, which leaves
; ecx = Dst, eax = Src, edx = BpS.
;
; TODO (carried over from the original): dangerous! (eax isn't checked)
; NOTE(review): nothing here reads the rounding stack argument -- the rounder
; is always the first 8 bytes at the given label, whatever the caller passed.
; Confirm that callers never rely on rounding != 0.
%macro PROLOG 2
    pxor    mm6, mm6
    movq    mm7, [%1]
%if %2
    movq    mm5, [rounding1_mmx]
%endif
    PROLOG0
%endmacro
; MIX: widen two 8-byte rows to words and add them.
;   In:  mm0 and mm1 both hold row A, mm2 and mm3 both hold row B, mm6 = 0
;   Out: mm0 = A[0..3] + B[0..3], mm1 = A[4..7] + B[4..7]   (four words each)
;        mm2 / mm3 are left holding the widened low / high half of row B
%macro MIX 0
    ; pixels 0..3
    punpcklbw mm0, mm6
    punpcklbw mm2, mm6
    paddusw   mm0, mm2
    ; pixels 4..7
    punpckhbw mm1, mm6
    punpckhbw mm3, mm6
    paddusw   mm1, mm3
%endmacro
; MIX_DST: finish a two-tap source average and blend it with the destination.
;   In:  mm0 / mm1 = word sums of the two source taps (pixels 0..3 / 4..7)
;        mm2 = 8 destination bytes, mm5 = destination rounder words
;        mm6 = 0, mm7 = source rounder words
;   Out: mm0 = packed bytes of (((sum + mm7) >> 1) + dst + mm5) >> 1
;   Clobbers mm1, mm2, mm3
%macro MIX_DST 0
    ; source part: (tapA + tapB + rounder) >> 1
    paddusw   mm0, mm7
    paddusw   mm1, mm7
    psrlw     mm0, 1
    psrlw     mm1, 1

    ; widen the destination row to words (mm2 = low half, mm3 = high half)
    movq      mm3, mm2
    punpcklbw mm2, mm6
    punpckhbw mm3, mm6

    ; blend: (source + destination + rounder) >> 1
    paddusw   mm0, mm2
    paddusw   mm1, mm3
    paddusw   mm0, mm5
    paddusw   mm1, mm5
    psrlw     mm0, 1
    psrlw     mm1, 1

    packuswb  mm0, mm1
%endmacro
; MIX2: rounded average of two 8-byte rows, result packed back to bytes.
;   In:  mm0 and mm1 both hold row A, mm2 and mm3 both hold row B
;        mm6 = 0, mm7 = rounder words
;   Out: mm0 = packed bytes of (A + B + mm7) >> 1
;   Clobbers mm1, mm2, mm3
%macro MIX2 0
    ; widen both rows to words
    punpcklbw mm0, mm6
    punpckhbw mm1, mm6
    punpcklbw mm2, mm6
    punpckhbw mm3, mm6

    ; sum the taps and the rounder
    paddusw   mm0, mm2
    paddusw   mm1, mm7
    paddusw   mm0, mm7
    paddusw   mm1, mm3

    ; halve and repack
    psrlw     mm0, 1
    psrlw     mm1, 1
    packuswb  mm0, mm1
%endmacro
;===========================================================================
;
; void interpolate8x8_halfpel_add_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;
;===========================================================================
; ADD_FF_MMX <step>: average one 8-byte source row into the destination row.
;   dst = (src + dst + mm5) >> 1, computed per pixel on words
;   In:  eax = source row, ecx = destination row, edx = stride
;        mm5 = rounder words, mm6 = 0
;   Out: when <step> is non-zero, eax and ecx are both advanced by
;        <step> * edx; with a step of 0 neither pointer moves
;   Clobbers mm0-mm3
%macro ADD_FF_MMX 1
    movq      mm0, [eax]                ; source row
    movq      mm1, mm0
    movq      mm2, [ecx]                ; destination row
    movq      mm3, mm2

    MIX                                 ; mm0 / mm1 = src + dst as words

    paddusw   mm0, mm5                  ; rounder
    paddusw   mm1, mm5
    psrlw     mm0, 1
    psrlw     mm1, 1
    packuswb  mm0, mm1
    movq      [ecx], mm0

%if (%1!=0)
    lea       eax, [eax+%1*edx]
    lea       ecx, [ecx+%1*edx]
%endif
%endmacro
; Averages an 8x8 source block into dst, one ADD_FF_MMX row at a time.
; PROLOG reads dst / src / stride from [esp+4] / [esp+8] / [esp+12]; the
; rounding argument at [esp+16] is not read by this routine.
; Leaf routine: eax/ecx/edx and MMX registers are modified, no emms is issued.
ALIGN 16
interpolate8x8_halfpel_add_mmx:
    PROLOG rounding1_mmx, 1

    ; Rows 0..6 step both pointers by one stride; the last row leaves them.
%rep 7
    ADD_FF_MMX 1
%endrep
    ADD_FF_MMX 0

    ret
.endfunc
;===========================================================================
;
; void interpolate8x8_halfpel_h_add_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;
;===========================================================================
; ADD_FH_MMX: horizontal half-pel average of one source row, blended into dst.
;   dst[x] = (((src[x] + src[x+1] + mm7) >> 1) + dst[x] + mm5) >> 1
;   In:  eax = source row, ecx = destination row, edx = stride
;        mm5 / mm7 = rounder words, mm6 = 0
;   Out: eax advanced by edx; ecx is NOT advanced
;   Clobbers mm0-mm3
%macro ADD_FH_MMX 0
    movq    mm0, [eax]                  ; src[x]
    movq    mm1, mm0
    movq    mm2, [eax+1]                ; src[x+1]
    movq    mm3, mm2

    MIX                                 ; mm0 / mm1 = src[x] + src[x+1]

    movq    mm2, [ecx]                  ; destination row for MIX_DST
    MIX_DST
    movq    [ecx], mm0

    lea     eax, [eax+edx]              ; source -> next row
%endmacro
; Blends the horizontal half-pel interpolation of an 8x8 source block into
; dst, one ADD_FH_MMX row at a time.
; PROLOG reads dst / src / stride from [esp+4] / [esp+8] / [esp+12]; the
; rounding argument at [esp+16] is not read by this routine.
; Leaf routine: eax/ecx/edx and MMX registers are modified, no emms is issued.
ALIGN 16
interpolate8x8_halfpel_h_add_mmx:
    PROLOG rounding1_mmx, 1

    ; Rows 0..6: process, then step dst (the macro steps src itself); row 7 last.
%rep 7
    ADD_FH_MMX
    lea     ecx, [ecx+edx]
%endrep
    ADD_FH_MMX

    ret
.endfunc
;===========================================================================
;
; void interpolate8x8_halfpel_v_add_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;
;===========================================================================
; ADD_HF_MMX: vertical half-pel average of two source rows, blended into dst.
;   dst[x] = (((src[x] + src[x + stride] + mm7) >> 1) + dst[x] + mm5) >> 1
;   In:  eax = upper source row, ecx = destination row, edx = stride
;        mm5 / mm7 = rounder words, mm6 = 0
;   Out: eax advanced by edx; ecx is NOT advanced
;   Clobbers mm0-mm3
%macro ADD_HF_MMX 0
    movq    mm0, [eax]                  ; upper row
    movq    mm1, mm0
    movq    mm2, [eax+edx]              ; row below
    movq    mm3, mm2

    MIX                                 ; mm0 / mm1 = upper + lower

    movq    mm2, [ecx]                  ; destination row for MIX_DST
    MIX_DST
    movq    [ecx], mm0

    lea     eax, [eax+edx]              ; source -> next row
%endmacro
; Blends the vertical half-pel interpolation of an 8x8 source block into
; dst, one ADD_HF_MMX row at a time (nine source rows are read in total).
; PROLOG reads dst / src / stride from [esp+4] / [esp+8] / [esp+12]; the
; rounding argument at [esp+16] is not read by this routine.
; Leaf routine: eax/ecx/edx and MMX registers are modified, no emms is issued.
ALIGN 16
interpolate8x8_halfpel_v_add_mmx:
    PROLOG rounding1_mmx, 1

    ; Rows 0..6: process, then step dst (the macro steps src itself); row 7 last.
%rep 7
    ADD_HF_MMX
    lea     ecx, [ecx+edx]
%endrep
    ADD_HF_MMX

    ret
.endfunc
; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
;===========================================================================
;
; void interpolate8x8_halfpel_hv_add_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;
;===========================================================================
; ADD_HH_MMX: one output row of the 2x2 (horizontal + vertical) half-pel
; average, blended into the destination row.
;
;   cur[x]  = src[x] + src[x+1] on the row that eax is advanced to
;   dst[x]  = (((prev[x] + cur[x] + mm7) >> 2) + dst[x] + [rounding1_mmx]) >> 1
;
; In:       eax = PREVIOUS source row (advanced by edx as the first step)
;           ecx = destination row, edx = stride
;           mm2 / mm3 = prev[] word sums for pixels 0..3 / 4..7
;           mm6 = 0, mm7 = rounder words for the 4-tap sum
; Out:      eax = current source row; mm2 / mm3 = cur[] for the next call
;           ecx is NOT advanced
; Clobbers: mm0, mm1, mm4, mm5 (mm5 is why the caller's PROLOG does not load
;           the destination rounder; it is read from memory instead)
%macro ADD_HH_MMX 0
lea eax,[eax+edx] ; step to the row that completes the 2x2 neighbourhood
; transfer the previous line's sums to mm0/mm1
movq mm0, mm2
movq mm1, mm3
; load new line in mm2/mm3
movq mm2, [eax] ; src[x]
movq mm4, [eax+1] ; src[x+1]
movq mm3, mm2
movq mm5, mm4
punpcklbw mm2, mm6
punpcklbw mm4, mm6
paddusw mm2, mm4 ; cur[0..3]
punpckhbw mm3, mm6
punpckhbw mm5, mm6
paddusw mm3, mm5 ; cur[4..7]
; mix current line (mm2/mm3) with previous (mm0,mm1);
; we'll preserve mm2/mm3 for next line...
paddusw mm0, mm2
paddusw mm1, mm3
movq mm4, [ecx] ; prepare mix with Dst[0]
movq mm5, mm4
paddusw mm0, mm7 ; finish mixing current line
paddusw mm1, mm7
punpcklbw mm4, mm6 ; widen dst to words: mm4 = pixels 0..3
punpckhbw mm5, mm6 ; mm5 = pixels 4..7
psrlw mm0, 2 ; 4-tap sum / 4
psrlw mm1, 2
paddusw mm0, mm4 ; mix Src(mm0/mm1) with Dst(mm4/mm5)
paddusw mm1, mm5
paddusw mm0, [rounding1_mmx] ; destination rounder, taken from memory
paddusw mm1, [rounding1_mmx]
psrlw mm0, 1
psrlw mm1, 1
packuswb mm0, mm1
movq [ecx], mm0
%endmacro
; Blends the 2x2 half-pel interpolation of an 8x8 source block into dst.
; The horizontal pair sums of source row 0 are computed once up front; each
; ADD_HH_MMX then consumes the previous row's sums and produces its own, so
; nine source rows are read in total.
; PROLOG reads dst / src / stride from [esp+4] / [esp+8] / [esp+12]; the
; rounding argument at [esp+16] is not read by this routine.
; Leaf routine: eax/ecx/edx and MMX registers are modified, no emms is issued.
ALIGN 16
interpolate8x8_halfpel_hv_add_mmx:
    PROLOG rounding2_mmx, 0             ; mm5 is busy in the row macro, so the
                                        ; destination rounder is not preloaded

    ; Row 0: mm2 / mm3 = Src[x] + Src[x+1] as words (pixels 0..3 / 4..7).
    movq      mm0, [eax]
    movq      mm1, mm0
    movq      mm2, [eax+1]
    movq      mm3, mm2
    punpcklbw mm0, mm6
    punpckhbw mm1, mm6
    punpcklbw mm2, mm6
    punpckhbw mm3, mm6
    paddusw   mm2, mm0
    paddusw   mm3, mm1

    ; Rows 0..6: process, then step dst (the macro steps src itself); row 7 last.
%rep 7
    ADD_HH_MMX
    lea       ecx, [ecx+edx]
%endrep
    ADD_HH_MMX

    ret
.endfunc
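; --- code-viewer page residue (not part of the source) ----------------------
; Keyboard shortcuts
;   Copy code            Ctrl + C
;   Search code          Ctrl + F
;   Full-screen mode     F11
;   Toggle theme         Ctrl + Shift + D
;   Show shortcuts       ?
;   Increase font size   Ctrl + =
;   Decrease font size   Ctrl + -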