; interpolate8x8_xmm.asm
  movq mm2, [eax] ; loop invariant
  add eax, edx
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  ret
.endfunc

;===========================================================================
;
; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;
;===========================================================================

; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

ALIGN 16
interpolate8x4_halfpel_hv_xmm:
  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride

  movq mm7, [mmx_one]

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6 ; mm2/mm3 ready

  jnz near .rounding1

  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  ret

.rounding1
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  ret
.endfunc

;===========================================================================
;
; The next functions combine both the source halfpel interpolation step and
; the averaging (with rounding) step, to avoid wasting memory bandwidth
; computing intermediate halfpel images and then averaging them.
;
;===========================================================================

%macro PROLOG0 0
  mov ecx, [esp+ 4] ; Dst
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; BpS
%endmacro

%macro PROLOG1 0
  PROLOG0
  test dword [esp+16], 1 ; Rounding?
%endmacro

%macro EPILOG 0
  ret
%endmacro

;===========================================================================
;
; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
;
;===========================================================================

%macro ADD_FF 2
  movq mm0, [eax+%1]
  movq mm1, [eax+%2]
;;---
;; movq mm2, mm0
;; movq mm3, mm1
;;---
  pavgb mm0, [ecx+%1]
  pavgb mm1, [ecx+%2]
;;--
;; por mm2, [ecx+%1]
;; por mm3, [ecx+%2]
;; pand mm2, [mmx_one]
;; pand mm3, [mmx_one]
;; psubsb mm0, mm2
;; psubsb mm1, mm3
;;--
  movq [ecx+%1], mm0
  movq [ecx+%2], mm1
%endmacro

ALIGN 16
interpolate8x8_halfpel_add_xmm: ; 23c
  PROLOG1
  ADD_FF 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FF 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FF 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FF 0, edx
  EPILOG
.endfunc
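; A minimal scalar sketch (illustrative C, not part of this file; the name
; halfpel_add_ref is hypothetical) of what the function above computes per
; byte, assuming pavgb's (a+b+1)>>1 semantics. Note that PROLOG1 tests the
; rounding flag, but with the ;;-commented correction in ADD_FF disabled the
; blend always rounds upward:
;
;   static void halfpel_add_ref(uint8_t *dst, const uint8_t *src, uint32_t stride)
;   {
;     unsigned i, j;
;     for (j = 0; j < 8; j++, src += stride, dst += stride)
;       for (i = 0; i < 8; i++)
;         dst[i] = (uint8_t)((dst[i] + src[i] + 1) >> 1);  /* pavgb(dst, src) */
;   }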
;===========================================================================
;
; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
;                                       const uint8_t * const src,
;                                       const uint32_t stride,
;                                       const uint32_t rounding);
;
;
;===========================================================================

%macro ADD_FH_RND0 2
  movq mm0, [eax+%1]
  movq mm1, [eax+%2]
  pavgb mm0, [eax+%1+1]
  pavgb mm1, [eax+%2+1]
  pavgb mm0, [ecx+%1]
  pavgb mm1, [ecx+%2]
  movq [ecx+%1], mm0
  movq [ecx+%2], mm1
%endmacro

%macro ADD_FH_RND1 2
  movq mm0, [eax+%1]
  movq mm1, [eax+%2]
  movq mm4, mm0
  movq mm5, mm1
  movq mm2, [eax+%1+1]
  movq mm3, [eax+%2+1]
  pavgb mm0, mm2 ; lea ??
  pxor mm2, mm4
  pavgb mm1, mm3
  pxor mm3, mm5
  pand mm2, [mmx_one]
  pand mm3, [mmx_one]
  psubb mm0, mm2
  psubb mm1, mm3
  pavgb mm0, [ecx+%1]
  pavgb mm1, [ecx+%2]
  movq [ecx+%1], mm0
  movq [ecx+%2], mm1
%endmacro

ALIGN 16
interpolate8x8_halfpel_h_add_xmm: ; 32c
  PROLOG1
  jnz near .Loop1
  ADD_FH_RND0 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND0 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND0 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND0 0, edx
  EPILOG

.Loop1
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  ; movq mm7, [mmx_one]
  ADD_FH_RND1 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND1 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND1 0, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  ADD_FH_RND1 0, edx
  EPILOG
.endfunc

;===========================================================================
;
; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
;                                       const uint8_t * const src,
;                                       const uint32_t stride,
;                                       const uint32_t rounding);
;
;
;===========================================================================

%macro ADD_8_HF_RND0 0
  movq mm0, [eax]
  movq mm1, [eax+edx]
  pavgb mm0, mm1
  pavgb mm1, [eax+2*edx]
  lea eax, [eax+2*edx]
  pavgb mm0, [ecx]
  pavgb mm1, [ecx+edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

%macro ADD_8_HF_RND1 0
  movq mm1, [eax+edx]
  movq mm2, [eax+2*edx]
  lea eax, [eax+2*edx]
  movq mm4, mm0
  movq mm5, mm1
  pavgb mm0, mm1
  pxor mm4, mm1
  pavgb mm1, mm2
  pxor mm5, mm2
  pand mm4, mm7 ; lsb's of (i^j)...
  pand mm5, mm7 ; lsb's of (i^j)...
  psubb mm0, mm4 ; ...are subtracted from the result of pavgb
  pavgb mm0, [ecx]
  movq [ecx], mm0
  psubb mm1, mm5 ; ...are subtracted from the result of pavgb
  pavgb mm1, [ecx+edx]
  movq [ecx+edx], mm1
%endmacro

ALIGN 16
interpolate8x8_halfpel_v_add_xmm:
  PROLOG1
  jnz near .Loop1
  pxor mm7, mm7 ; this is a NOP
  ADD_8_HF_RND0
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND0
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND0
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND0
  EPILOG

.Loop1
  movq mm0, [eax] ; loop invariant
  movq mm7, [mmx_one]
  ADD_8_HF_RND1
  movq mm0, mm2
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND1
  movq mm0, mm2
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND1
  movq mm0, mm2
  lea ecx, [ecx+2*edx]
  ADD_8_HF_RND1
  EPILOG
.endfunc
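; The ADD_FH_RND1 and ADD_8_HF_RND1 macros above rely on the identity noted at
; .Loop1: (i+j)/2 = (i+j+1)/2 - ((i^j)&1), i.e. pavgb's round-up average is
; turned into a round-down one by subtracting the lsb of i^j. A minimal scalar
; sketch in C (illustrative only, not part of this file):
;
;   uint8_t avg_round_up   = (uint8_t)((i + j + 1) >> 1);              /* what pavgb gives */
;   uint8_t avg_round_down = (uint8_t)(avg_round_up - ((i ^ j) & 1));  /* (i+j)/2          */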
; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

;===========================================================================
;
; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
;                                        const uint8_t * const src,
;                                        const uint32_t stride,
;                                        const uint32_t rounding);
;
;
;===========================================================================

%macro ADD_HH_RND0 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6  ; mm1=(j^k).     preserved for next step

  por mm3, mm1   ; ij |= jk
  movq mm6, mm2
  pxor mm6, mm0  ; mm6 = s^t
  pand mm3, mm6  ; (ij|jk) &= st
  pavgb mm2, mm0 ; mm2 = (s+t+1)/2
  pand mm3, mm7  ; mask lsb
  psubb mm2, mm3 ; apply.

  pavgb mm2, [ecx]
  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3 ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6  ; preserved for next iteration

  por mm1, mm3
  movq mm6, mm0
  pxor mm6, mm2
  pand mm1, mm6
  pavgb mm0, mm2
  pand mm1, mm7
  psubb mm0, mm1

  pavgb mm0, [ecx]
  movq [ecx], mm0
%endmacro

%macro ADD_HH_RND1 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6  ; mm1=(j^k).     preserved for next step

  pand mm3, mm1
  movq mm6, mm2
  pxor mm6, mm0
  por mm3, mm6
  pavgb mm2, mm0
  pand mm3, mm7
  psubb mm2, mm3

  pavgb mm2, [ecx]
  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3 ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6  ; preserved for next iteration

  pand mm1, mm3
  movq mm6, mm0
  pxor mm6, mm2
  por mm1, mm6
  pavgb mm0, mm2
  pand mm1, mm7
  psubb mm0, mm1

  pavgb mm0, [ecx]
  movq [ecx], mm0
%endmacro

ALIGN 16
interpolate8x8_halfpel_hv_add_xmm:
  PROLOG1

  movq mm7, [mmx_one]

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6 ; mm2/mm3 ready

  jnz near .Loop1

  ADD_HH_RND0
  add ecx, edx
  ADD_HH_RND0
  add ecx, edx
  ADD_HH_RND0
  add ecx, edx
  ADD_HH_RND0
  EPILOG

.Loop1
  ADD_HH_RND1
  add ecx, edx
  ADD_HH_RND1
  add ecx, edx
  ADD_HH_RND1
  add ecx, edx
  ADD_HH_RND1
  EPILOG

.endfunc
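; A scalar sketch in C of the rounding==0 relation used by ADD_HH_RND0 above
; (illustrative only, not part of this file; the helper name avg4_rnd0 is
; hypothetical):
;
;   static uint8_t avg4_rnd0(uint8_t i, uint8_t j, uint8_t k, uint8_t l)
;   {
;     unsigned s   = (i + j + 1) >> 1;                    /* pavgb(i,j)           */
;     unsigned t   = (k + l + 1) >> 1;                    /* pavgb(k,l)           */
;     unsigned fix = ((i ^ j) | (k ^ l)) & (s ^ t) & 1;   /* (ij|kl)&st, lsb only */
;     return (uint8_t)(((s + t + 1) >> 1) - fix);         /* == (i+j+k+l+2)/4     */
;   }
;
; Swapping the '|' and '&' in 'fix' (still keeping only the lsb) gives
; (i+j+k+l+1)/4, the rounding==1 case implemented by ADD_HH_RND1.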