;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - Reduced-Resolution utilities -
; *
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; *  $Id: reduced_mmx.asm,v 1.6 2004/08/29 10:02:38 edgomez Exp $
; *
; *************************************************************************/

BITS 32

%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro

;===========================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

align 16
Up31    dw  3, 1, 3, 1
Up13    dw  1, 3, 1, 3
Up93    dw  9, 3, 9, 3
Up39    dw  3, 9, 3, 9
Cst0    dw  0, 0, 0, 0
Cst2    dw  2, 2, 2, 2
Cst3    dw  3, 3, 3, 3
Cst32   dw 32,32,32,32
Cst2000 dw  2, 0, 0, 0
Cst0002 dw  0, 0, 0, 2

Mask_ff dw 0xff,0xff,0xff,0xff

;===========================================================================

SECTION .text

cglobal xvid_Copy_Upsampled_8x8_16To8_mmx
cglobal xvid_Add_Upsampled_8x8_16To8_mmx
cglobal xvid_Copy_Upsampled_8x8_16To8_xmm
cglobal xvid_Add_Upsampled_8x8_16To8_xmm

cglobal xvid_HFilter_31_mmx
cglobal xvid_VFilter_31_x86
cglobal xvid_HFilter_31_x86

cglobal xvid_Filter_18x18_To_8x8_mmx
cglobal xvid_Filter_Diff_18x18_To_8x8_mmx

;//////////////////////////////////////////////////////////////////////
;// 8x8 -> 16x16 upsampling (16b)
;//////////////////////////////////////////////////////////////////////

%macro MUL_PACK 4   ; %1/%2: regs   %3/%4: Up13/Up31
  pmullw %1, %3       ; [Up13]
  pmullw mm4, %4      ; [Up31]
  pmullw %2, %3       ; [Up13]
  pmullw mm5, %4      ; [Up31]
  paddsw %1, [Cst2]
  paddsw %2, [Cst2]
  paddsw %1, mm4
  paddsw %2, mm5
%endmacro

  ; MMX-way of reordering columns...

%macro COL03 3   ; %1/%2: regs, %3: row   -output: mm4/mm5
  movq %1, [edx+%3*16+0*2]   ; %1  = 0|1|2|3
  movq %2, [edx+%3*16+1*2]   ; %2  = 1|2|3|4
  movq mm5, %1               ; mm5 = 0|1|2|3
  movq mm4, %1               ; mm4 = 0|1|2|3
  punpckhwd mm5, %2          ; mm5 = 2|3|3|4
  punpcklwd mm4, %2          ; mm4 = 0|1|1|2
  punpcklwd %1, %1           ; %1  = 0|0|1|1
  punpcklwd %2, mm5          ; %2  = 1|2|2|3
  punpcklwd %1, mm4          ; %1  = 0|0|0|1
%endmacro

%macro COL47 3   ; %1-%2: regs, %3: row   -output: mm4/mm5
  movq mm5, [edx+%3*16+4*2]  ; mm5 = 4|5|6|7
  movq %1, [edx+%3*16+3*2]   ; %1  = 3|4|5|6
  movq %2, mm5               ; %2  = 4|5|6|7
  movq mm4, mm5              ; mm4 = 4|5|6|7
  punpckhwd %2, %2           ; %2  = 6|6|7|7
  punpckhwd mm5, %2          ; mm5 = 6|7|7|7
  movq %2, %1                ; %2  = 3|4|5|6
  punpcklwd %1, mm4          ; %1  = 3|4|4|5
  punpckhwd %2, mm4          ; %2  = 5|6|6|7
  punpcklwd mm4, %2          ; mm4 = 4|5|5|6
%endmacro
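  ; Taken together, these column macros, MUL_PACK and the MIX_ROWS macro
  ; below implement the 2-D bilinear weighting of the reduced-resolution
  ; upsampler. Worked out from the Up13/Up31, Cst2 and Cst3 constants:
  ;
  ;   border row/column pixel :  out = (3*a + 1*b + 2) >> 2
  ;   interior pixel          :  out = (9*a + 3*b + 3*c + 1*d + 8) >> 4
  ;
  ; where 'a' is the nearest source pixel and b/c/d are its horizontal,
  ; vertical and diagonal neighbours (at the four corners this degenerates
  ; to a plain copy of 'a'). The '+2' rounder added by MUL_PACK grows to
  ; 3*2 + 2 = 8 after the 3:1 row blend of MIX_ROWS, which is why STORE_2
  ; shifts by 4 while STORE_1 only needs to shift by 2.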
%macro MIX_ROWS 4   ; %1/%2: prev   %3/%4: cur (preserved)   mm4/mm5: output
  ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
  movq mm4, [Cst3]
  movq mm5, [Cst3]
  pmullw mm4, %3
  pmullw mm5, %4
  paddsw mm4, %1
  paddsw mm5, %2
  pmullw %1, [Cst3]
  pmullw %2, [Cst3]
  paddsw %1, %3
  paddsw %2, %4
%endmacro

;===========================================================================
;
; void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                        const int16_t *Src, const int BpS);
;
;===========================================================================

  ; Note: we can use ">>2" instead of "/4" here, since we
  ; are (supposed to be) averaging positive values

%macro STORE_1 2
  psraw %1, 2
  psraw %2, 2
  packuswb %1, %2
  movq [ecx], %1
%endmacro

%macro STORE_2 2   ; pack and store (%1,%2) + (mm4,mm5)
  psraw %1, 4
  psraw %2, 4
  psraw mm4, 4
  psraw mm5, 4
  packuswb %1, %2
  packuswb mm4, mm5
  movq [ecx], %1
  movq [ecx+eax], mm4
  lea ecx, [ecx+2*eax]
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
xvid_Copy_Upsampled_8x8_16To8_mmx:  ; 344c

  mov ecx, [esp+4]   ; Dst
  mov edx, [esp+8]   ; Src
  mov eax, [esp+12]  ; BpS

  movq mm6, [Up13]
  movq mm7, [Up31]

  COL03    mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1  mm4, mm5
  add ecx, eax

  COL03    mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  COL03    mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2  mm2, mm3

  COL03    mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  COL03    mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2  mm2, mm3

  COL03    mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  COL03    mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2  mm2, mm3

  COL03    mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  STORE_1  mm2, mm3

  mov ecx, [esp+4]
  add ecx, 8

  COL47    mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1  mm4, mm5
  add ecx, eax

  COL47    mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  COL47    mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2  mm2, mm3

  COL47    mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  COL47    mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2  mm2, mm3

  COL47    mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  COL47    mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2  mm2, mm3

  COL47    mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2  mm0, mm1

  STORE_1  mm2, mm3

  ret
.endfunc

;===========================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS);
;
;===========================================================================

  ; Note: grrr... the 'pcmpgtw' stuff are the "/4" and "/16" operators
  ; implemented with ">>2" and ">>4" using:
  ;   x/4  = ( (x-(x<0))>>2 ) + (x<0)
  ;   x/16 = ( (x-(x<0))>>4 ) + (x<0)
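  ; Worked example of the identity above, for x = -5:
  ;   plain arithmetic shift:  -5 >> 2             = -2   (rounds towards -infinity)
  ;   truncating division   :  ((-5 - 1) >> 2) + 1 = -1   (= -5/4 in C)
  ; In the macros below the '(x<0)' term comes from 'pcmpgtw' against zero,
  ; which sets every negative 16-bit lane to 0xFFFF (i.e. -1); the surrounding
  ; paddsw/psraw/psubsw sequence then applies the formula word by word.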
%macro STORE_ADD_1 2
  ; We subtract the rounder '2' for corner pixels,
  ; since when 'x' is negative, (x*4 + 2)/4 is *not*
  ; equal to 'x'. In fact, the correct relation is:
  ;   (x*4 + 2)/4 = x - (x<0)
  ; So, better revert to (x*4)/4 = x.
  psubsw %1, [Cst2000]
  psubsw %2, [Cst0002]
  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, %1
  pcmpgtw mm7, %2
  paddsw %1, mm6
  paddsw %2, mm7
  psraw %1, 2
  psraw %2, 2
  psubsw %1, mm6
  psubsw %2, mm7

  ; mix with destination [ecx]
  movq mm6, [ecx]
  movq mm7, [ecx]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7
  packuswb %1, %2
  movq [ecx], %1
%endmacro

%macro STORE_ADD_2 2
  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, %1
  pcmpgtw mm7, %2
  paddsw %1, mm6
  paddsw %2, mm7
  psraw %1, 4
  psraw %2, 4
  psubsw %1, mm6
  psubsw %2, mm7

  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, mm4
  pcmpgtw mm7, mm5
  paddsw mm4, mm6
  paddsw mm5, mm7
  psraw mm4, 4
  psraw mm5, 4
  psubsw mm4, mm6
  psubsw mm5, mm7

  ; mix with destination
  movq mm6, [ecx]
  movq mm7, [ecx]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7
  movq mm6, [ecx+eax]
  movq mm7, [ecx+eax]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw mm4, mm6
  paddsw mm5, mm7
  packuswb %1, %2
  packuswb mm4, mm5
  movq [ecx], %1
  movq [ecx+eax], mm4
  lea ecx, [ecx+2*eax]
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
xvid_Add_Upsampled_8x8_16To8_mmx:  ; 579c

  mov ecx, [esp+4]   ; Dst
  mov edx, [esp+8]   ; Src
  mov eax, [esp+12]  ; BpS

  COL03    mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL03    mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03    mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03    mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03    mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03    mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03    mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03    mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3

  mov ecx, [esp+4]
  add ecx, 8

  COL47    mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL47    mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47    mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47    mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47    mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47    mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47    mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47    mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3

  ret
.endfunc

;===========================================================================
;
; void xvid_Copy_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                        const int16_t *Src, const int BpS);
;
;===========================================================================

  ; xmm version can take (little) advantage of 'pshufw'
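  ; Reminder on the 'pshufw' immediate used below: it is written as
  ; (s0 + s1*4 + s2*16 + s3*64), where s_i is the index of the source word
  ; copied into destination word i (word 0 being the lowest). For example,
  ; (0+1*4+1*16+2*64) = 0x94 picks source words 0,1,1,2 and thus produces
  ; the 0|1|1|2 pattern noted in the comments.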
%macro COL03_SSE 3   ; %1/%2: regs, %3: row   -trashes mm4/mm5
  movq %2, [edx+%3*16+0*2]                        ; <- 0|1|2|3
  pshufw %1, %2, (0+0*4+0*16+1*64)                ; %1  = 0|0|0|1
  pshufw mm4, %2, (0+1*4+1*16+2*64)               ; mm4 = 0|1|1|2
  pshufw %2, %2, (1+2*4+2*16+3*64)                ; %2  = 1|2|2|3
  pshufw mm5, [edx+%3*16+2*2], (0+1*4+1*16+2*64)  ; mm5 = 2|3|3|4
%endmacro

%macro COL47_SSE 3   ; %1-%2: regs, %3: row   -trashes mm4/mm5
  pshufw %1, [edx+%3*16+2*2], (1+2*4+2*16+3*64)   ; 3|4|4|5
  movq mm5, [edx+%3*16+2*4]                       ; <- 4|5|6|7
  pshufw mm4, mm5, (0+1*4+1*16+2*64)              ; 4|5|5|6
  pshufw %2, mm5, (1+2*4+2*16+3*64)               ; 5|6|6|7
  pshufw mm5, mm5, (2+3*4+3*16+3*64)              ; 6|7|7|7
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
xvid_Copy_Upsampled_8x8_16To8_xmm:  ; 315c

  mov ecx, [esp+4]   ; Dst
  mov edx, [esp+8]   ; Src