📄 interpolate8x8_xmm.asm

📁 MPEG-4 VC code
💻 ASM
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  xmm 8x8 block-based halfpel interpolation
; *
; *  Copyright(C) 2002 Michael Militzer <michael@xvid.org>
; *  Copyright(C) 2002 -Skal-
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id: interpolate8x8_xmm.asm,v 1.3 2002/11/17 00:20:30 edgomez Exp $
; *
; ****************************************************************************/

bits 32

%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

section .data

align 16
mmx_one
times 8 db 1

section .text

cglobal interpolate8x8_halfpel_h_xmm
cglobal interpolate8x8_halfpel_v_xmm
cglobal interpolate8x8_halfpel_hv_xmm

;===========================================================================
;
; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================

%macro COPY_H_SSE_RND0 0
  movq mm0,  [eax]
  pavgb mm0, [eax+1]
  movq mm1,  [eax+edx]
  pavgb mm1, [eax+edx+1]
  lea eax, [eax+2*edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

%macro COPY_H_SSE_RND1 0
  movq mm0, [eax]
  movq mm1, [eax+edx]
  movq mm4, mm0
  movq mm5, mm1
  movq mm2, [eax+1]
  movq mm3, [eax+edx+1]
  pavgb mm0, mm2
  pxor mm2, mm4
  pavgb mm1, mm3
  lea eax, [eax+2*edx]
  pxor mm3, mm5
  pand mm2, mm7
  pand mm3, mm7
  psubb mm0, mm2
  movq [ecx], mm0
  psubb mm1, mm3
  movq [ecx+edx], mm1
%endmacro

align 16
interpolate8x8_halfpel_h_xmm:
  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride
  jnz near .rounding1

  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  ret

.rounding1
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  ret

;===========================================================================
;
; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================

%macro COPY_V_SSE_RND0 0
  movq mm0,  [eax]
  movq mm1,  [eax+edx]
  pavgb mm0, mm1
  pavgb mm1, [eax+2*edx]
  lea eax, [eax+2*edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

%macro COPY_V_SSE_RND1 0
  movq mm0, mm2
  movq mm1, [eax]
  movq mm2, [eax+edx]
  lea eax, [eax+2*edx]
  movq mm4, mm0
  movq mm5, mm1
  pavgb mm0, mm1
  pxor mm4, mm1
  pavgb mm1, mm2
  pxor mm5, mm2
  pand mm4, mm7   ; lsb's of (i^j)...
  pand mm5, mm7   ; lsb's of (i^j)...
  psubb mm0, mm4  ; ...are subtracted from the result of pavgb
  movq [ecx], mm0
  psubb mm1, mm5  ; ...are subtracted from the result of pavgb
  movq [ecx+edx], mm1
%endmacro

align 16
interpolate8x8_halfpel_v_xmm:
  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride

  ; we process 2 lines at a time
  jnz near .rounding1

  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  ret

.rounding1
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  movq mm2, [eax]   ; loop invariant
  add eax, edx
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  ret

;===========================================================================
;
; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;===========================================================================

; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with  s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
;
; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

%macro COPY_HV_SSE_RND0 0
    lea eax, [eax+edx]

    movq mm0, [eax]
    movq mm1, [eax+1]

    movq mm6, mm0
    pavgb mm0, mm1  ; mm0=(j+k+1)/2. preserved for next step
    lea eax, [eax+edx]
    pxor mm1, mm6   ; mm1=(j^k).     preserved for next step

    por mm3, mm1    ; ij |= jk
    movq mm6, mm2
    pxor mm6, mm0   ; mm6 = s^t
    pand mm3, mm6   ; (ij|jk) &= st
    pavgb mm2, mm0  ; mm2 = (s+t+1)/2
    pand mm3, mm7   ; mask lsb
    psubb mm2, mm3  ; apply.

    movq [ecx], mm2

    movq mm2, [eax]
    movq mm3, [eax+1]
    movq mm6, mm2
    pavgb mm2, mm3  ; preserved for next iteration
    lea ecx, [ecx+edx]
    pxor mm3, mm6   ; preserved for next iteration

    por mm1, mm3
    movq mm6, mm0
    pxor mm6, mm2
    pand mm1, mm6
    pavgb mm0, mm2
    pand mm1, mm7
    psubb mm0, mm1

    movq [ecx], mm0
%endmacro

%macro COPY_HV_SSE_RND1 0
    lea eax, [eax+edx]

    movq mm0, [eax]
    movq mm1, [eax+1]

    movq mm6, mm0
    pavgb mm0, mm1  ; mm0=(j+k+1)/2. preserved for next step
    lea eax, [eax+edx]
    pxor mm1, mm6   ; mm1=(j^k).     preserved for next step

    pand mm3, mm1
    movq mm6, mm2
    pxor mm6, mm0
    por mm3, mm6
    pavgb mm2, mm0
    pand mm3, mm7
    psubb mm2, mm3

    movq [ecx], mm2

    movq mm2, [eax]
    movq mm3, [eax+1]
    movq mm6, mm2
    pavgb mm2, mm3  ; preserved for next iteration
    lea ecx, [ecx+edx]
    pxor mm3, mm6   ; preserved for next iteration

    pand mm1, mm3
    movq mm6, mm0
    pxor mm6, mm2
    por mm1, mm6
    pavgb mm0, mm2
    pand mm1, mm7
    psubb mm0, mm1

    movq [ecx], mm0
%endmacro

align 16
interpolate8x8_halfpel_hv_xmm:
  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride
  movq mm7, [mmx_one]

  ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6   ; mm2/mm3 ready

  jnz near .rounding1

  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  ret

.rounding1
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  ret
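For orientation, the following plain-C sketch shows what the horizontal routine above computes: each destination byte is the average of a source byte and its right-hand neighbour, rounded up when rounding==0 (exactly what pavgb produces) and rounded down when rounding==1. The function name interpolate8x8_halfpel_h_ref and this exact formulation are illustrative assumptions, not XviD's own C reference code; the vertical routine is the same with src[x+1] replaced by src[x+stride].

#include <stdint.h>

/* Illustrative scalar equivalent of interpolate8x8_halfpel_h_xmm.
 * rounding==0 -> (a+b+1)>>1 (pavgb), rounding==1 -> (a+b)>>1. */
static void interpolate8x8_halfpel_h_ref(uint8_t *dst, const uint8_t *src,
                                         uint32_t stride, uint32_t rounding)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (uint8_t)((src[x] + src[x + 1] + 1 - rounding) >> 1);
        dst += stride;
        src += stride;
    }
}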
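The .rounding1 paths of the H and V routines rely on the identity quoted in their comments, (i+j)/2 = (i+j+1)/2 - (i^j)&1: pavgb always rounds up, and subtracting the low bit of i^j converts that into the round-down average. The small self-test below (a sketch, assuming 8-bit unsigned inputs, since the asm works on bytes) checks the identity exhaustively; it is what the pxor / pand mm7 / psubb sequence in COPY_H_SSE_RND1 and COPY_V_SSE_RND1 implements, with mm7 holding eight bytes of 1 (mmx_one).

#include <assert.h>

/* Exhaustive check over all byte pairs of:
 *   (i+j)/2 == (i+j+1)/2 - ((i^j) & 1)                                  */
int main(void)
{
    for (unsigned i = 0; i < 256; i++)
        for (unsigned j = 0; j < 256; j++) {
            unsigned pavgb = (i + j + 1) >> 1;          /* what pavgb returns */
            assert(pavgb - ((i ^ j) & 1) == ((i + j) >> 1));
        }
    return 0;
}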
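The HV routine's comment block lists four relations for the quad average (i+j+k+l+c)/4 in terms of the two intermediate pavgb results s=(i+j+1)/2 and t=(k+l+1)/2 and the xors ij, kl, st; only the low bit of each correction term matters, which the asm extracts with "pand mm3, mm7". COPY_HV_SSE_RND0 uses the (i+j+k+l+2)/4 form, COPY_HV_SSE_RND1 the (i+j+k+l+1)/4 form. The brute-force check below is a sketch verifying those relations over all byte quadruples (about 4·10^9 cases, so build with optimisation and give it a moment).

#include <assert.h>

/* Verify the four boolean relations from the comment block above. */
int main(void)
{
    for (unsigned i = 0; i < 256; i++)
    for (unsigned j = 0; j < 256; j++)
    for (unsigned k = 0; k < 256; k++)
    for (unsigned l = 0; l < 256; l++) {
        unsigned s = (i + j + 1) >> 1, t = (k + l + 1) >> 1;
        unsigned ij = i ^ j, kl = k ^ l, st = s ^ t;
        unsigned avg = (s + t + 1) >> 1;                /* pavgb(s, t) */
        assert(((i + j + k + l + 3) >> 2) == avg - (((ij & kl) & st) & 1));
        assert(((i + j + k + l + 2) >> 2) == avg - (((ij | kl) & st) & 1)); /* RND0 */
        assert(((i + j + k + l + 1) >> 2) == avg - (((ij & kl) | st) & 1)); /* RND1 */
        assert(((i + j + k + l + 0) >> 2) == avg - (((ij | kl) | st) & 1));
    }
    return 0;
}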
