⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fdct_mmx_ffmpeg.asm

📁 1.Xvid Mpeg4 1.13 version 2.Building 0 error(s), 0 warning(s) By Jesse Stone Taiwan
💻 ASM
字号:
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MMX and XMM forward discrete cosine transform -
; *
; *  Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: fdct_mmx_ffmpeg.asm,v 1.5 2004/08/29 10:02:38 edgomez Exp $
; *
; ***************************************************************************/

;/****************************************************************************
; *
; *  Initial, but incomplete version provided by Intel at AppNote AP-922
; *    http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; *  Copyright (C) 1999 Intel Corporation
; *
; *  Completed and corrected in fdctmm32.c/fdctmm32.doc
; *    http://members.tripod.com/~liaor/
; *  Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
; *
; *  Minimizing coefficients reordering changing the tables constants order
; *    http://ffmpeg.sourceforge.net/
; *  Copyright (C) 2001 Fabrice Bellard.
; *
; *  The version coded here is just a port to NASM syntax from the FFMPEG's
; *  version. So all credits go to the previous authors for all their
; *  respective work in order to have a nice/fast mmx fDCT.
; ***************************************************************************/

; Syntax: NASM (Intel operand order). Target: 32-bit x86, MMX registers.
; The exported functions read their single argument from [esp + 4].

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

	;; cglobal <name>
	;;  Exports <name> as a global symbol.
	;;   - PREFIX defined     : the exported symbol gets a leading underscore
	;;                          and <name> is redefined to the prefixed form.
	;;   - MARK_FUNCS defined : the symbol is declared with the ":function"
	;;                          type and a size of <name>.endfunc - <name>,
	;;                          so the function body must end with a local
	;;                          ".endfunc" label.
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			;; NOTE(review): this redefines <name> to the whole
			;; "_name:function ..." text, not just "_name" as in the
			;; branch below -- confirm PREFIX+MARK_FUNCS is really used.
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;;; Define this if you want an unrolled version of the code
%define UNROLLED_LOOP

;;; Fixed-point scaling used by the two passes:
;;;  - the column pass shifts its inputs left by SHIFT_FRW_COL bits,
;;;  - the row pass adds RND_FRW_ROW and shifts right by SHIFT_FRW_ROW bits.
;;; RND_FRW_COL is defined for symmetry; nothing in this file references it.
%define BITS_FRW_ACC   3
%define SHIFT_FRW_COL  BITS_FRW_ACC
%define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17)
%define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
%define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))

;=============================================================================
; Local Data (Read Only)
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

	;; Row pass coefficients: one group of 32 words (64 bytes) per row.
	;; Row 'i' uses the group at tab_frw_01234567 + 2*32*i, so every group
	;; must contain exactly 32 words.
ALIGN 8
tab_frw_01234567:
	;; row 0
  dw  16384,   16384,   -8867,  -21407
  dw  16384,   16384,   21407,    8867
  dw  16384,  -16384,   21407,   -8867
  dw -16384,   16384,    8867,  -21407
  dw  22725,   19266,  -22725,  -12873
  dw  12873,    4520,   19266,   -4520
  dw  12873,  -22725,   19266,  -22725
  dw   4520,   19266,    4520,  -12873

	;; row 1
  dw  22725,   22725,  -12299,  -29692
  dw  22725,   22725,   29692,   12299
  dw  22725,  -22725,   29692,  -12299
  dw -22725,   22725,   12299,  -29692
  dw  31521,   26722,  -31521,  -17855
  dw  17855,    6270,   26722,   -6270
  dw  17855,  -31521,   26722,  -31521
  dw   6270,   26722,    6270,  -17855

	;; row 2
  dw  21407,   21407,  -11585,  -27969
  dw  21407,   21407,   27969,   11585
  dw  21407,  -21407,   27969,  -11585
  dw -21407,   21407,   11585,  -27969
  dw  29692,   25172,  -29692,  -16819
  dw  16819,    5906,   25172,   -5906
  dw  16819,  -29692,   25172,  -29692
  dw   5906,   25172,    5906,  -16819

	;; row 3
  dw  19266,   19266,  -10426,  -25172
  dw  19266,   19266,   25172,   10426
  dw  19266,  -19266,   25172,  -10426
  dw -19266,   19266,   10426,  -25172
  dw  26722,   22654,  -26722,  -15137
  dw  15137,    5315,   22654,   -5315
  dw  15137,  -26722,   22654,  -26722
  dw   5315,   22654,    5315,  -15137

	;; row 4
  dw  16384,   16384,   -8867,  -21407
  dw  16384,   16384,   21407,    8867
  dw  16384,  -16384,   21407,   -8867
  dw -16384,   16384,    8867,  -21407
  dw  22725,   19266,  -22725,  -12873
  dw  12873,    4520,   19266,   -4520
  dw  12873,  -22725,   19266,  -22725
  dw   4520,   19266,    4520,  -12873

	;; row 5
  dw  19266,   19266,  -10426,  -25172
  dw  19266,   19266,   25172,   10426
  dw  19266,  -19266,   25172,  -10426
  dw -19266,   19266,   10426,  -25172
  dw  26722,   22654,  -26722,  -15137
  dw  15137,    5315,   22654,   -5315
  dw  15137,  -26722,   22654,  -26722
  dw   5315,   22654,    5315,  -15137

	;; row 6
  dw  21407,   21407,  -11585,  -27969
  dw  21407,   21407,   27969,   11585
  dw  21407,  -21407,   27969,  -11585
  dw -21407,   21407,   11585,  -27969
  dw  29692,   25172,  -29692,  -16819
  dw  16819,    5906,   25172,   -5906
  dw  16819,  -29692,   25172,  -29692
  dw   5906,   25172,    5906,  -16819

	;; row 7
  dw  22725,   22725,  -12299,  -29692
  dw  22725,   22725,   29692,   12299
  dw  22725,  -22725,   29692,  -12299
  dw -22725,   22725,   12299,  -29692
  dw  31521,   26722,  -31521,  -17855
  dw  17855,    6270,   26722,   -6270
  dw  17855,  -31521,   26722,  -31521
  dw   6270,   26722,    6270,  -17855

	;; Word-wise 1, OR-ed into some column pass results (sets their low bit)
ALIGN 8
fdct_one_corr:
  dw 1, 1, 1, 1

	;; Three 4-word multiplier vectors used with pmulhw in the column pass,
	;; addressed as +0*2, +4*2 and +8*2 from this label
ALIGN 8
fdct_tg_all_16:
  dw  13036,	13036,	13036,	13036
  dw  27146,	27146,	27146,	27146
  dw -21746, -21746, -21746, -21746

	;; Not referenced by the code in this file
ALIGN 8
cos_4_16:
  dw -19195, -19195, -19195, -19195

	;; pmulhw multiplier applied to two column pass intermediates
ALIGN 8
ocos_4_16:
  dw 23170, 23170, 23170, 23170

	;; Rounding term added to each dword sum before the row pass shift
ALIGN 8
fdct_r_row:
  dd RND_FRW_ROW, RND_FRW_ROW

;=============================================================================
; Factorized parts of the code turned into macros for better understanding
;=============================================================================

	;; Macro for column DCT
	;; FDCT_COLUMN_COMMON(int16_t *out, const int16_t *in, int offset);
	;;  - out, register name holding the out address
	;;  - in, register name holding the in address
	;;  - column number to process
	;;
	;; Each movq handles 4 adjacent 16-bit columns starting at column %3;
	;; the 8 rows are read/written 16 bytes apart ([base + %3*2 + row*16]).
	;; Clobbers mm0-mm7. The instruction order interleaves loads and stores
	;; so that 'out' may be the same address as 'in' (that is how
	;; MAKE_FDCT_FUNC invokes it); do not reorder without re-checking that.
%macro FDCT_COLUMN_COMMON 3
  movq mm0, [%2 + %3*2 + 1*16]
  movq mm1, [%2 + %3*2 + 6*16]
  movq mm2, mm0
  movq mm3, [%2 + %3*2 + 2*16]
  paddsw mm0, mm1
  movq mm4, [%2 + %3*2 + 5*16]
  psllw mm0, SHIFT_FRW_COL
  movq mm5, [%2 + %3*2 + 0*16]
  paddsw mm4, mm3
  paddsw mm5, [%2 + %3*2 + 7*16]
  psllw mm4, SHIFT_FRW_COL
  movq mm6, mm0
  psubsw mm2, mm1
  movq mm1, [fdct_tg_all_16 + 4*2]
  psubsw mm0, mm4
  movq mm7, [%2 + %3*2 + 3*16]
  pmulhw mm1, mm0
  paddsw mm7, [%2 + %3*2 + 4*16]
  psllw mm5, SHIFT_FRW_COL
  paddsw mm6, mm4
  psllw mm7, SHIFT_FRW_COL
  movq mm4, mm5
  psubsw mm5, mm7
  paddsw mm1, mm5
  paddsw mm4, mm7
  por mm1, [fdct_one_corr]
  psllw mm2, SHIFT_FRW_COL + 1
  pmulhw mm5, [fdct_tg_all_16 + 4*2]
  movq mm7, mm4
  psubsw mm3, [%2 + %3*2 + 5*16]
  psubsw mm4, mm6
  movq [%1 + %3*2 + 2*16], mm1
  paddsw mm7, mm6
  movq mm1, [%2 + %3*2 + 3*16]
  psllw mm3, SHIFT_FRW_COL + 1
  psubsw mm1, [%2 + %3*2 + 4*16]
  movq mm6, mm2
  movq [%1 + %3*2 + 4*16], mm4
  paddsw mm2, mm3
  pmulhw mm2, [ocos_4_16]
  psubsw mm6, mm3
  pmulhw mm6, [ocos_4_16]
  psubsw mm5, mm0
  por mm5, [fdct_one_corr]
  psllw mm1, SHIFT_FRW_COL
  por mm2, [fdct_one_corr]
  movq mm4, mm1
  movq mm3, [%2 + %3*2 + 0*16]
  paddsw mm1, mm6
  psubsw mm3, [%2 + %3*2 + 7*16]
  psubsw mm4, mm6
  movq mm0, [fdct_tg_all_16 + 0*2]
  psllw mm3, SHIFT_FRW_COL
  movq mm6, [fdct_tg_all_16 + 8*2]
  pmulhw mm0, mm1
  movq [%1 + %3*2 + 0*16], mm7
  pmulhw mm6, mm4
  movq [%1 + %3*2 + 6*16], mm5
  movq mm7, mm3
  movq mm5, [fdct_tg_all_16 + 8*2]
  psubsw mm7, mm2
  paddsw mm3, mm2
  pmulhw mm5, mm7
  paddsw mm0, mm3
  paddsw mm6, mm4
  pmulhw mm3, [fdct_tg_all_16 + 0*2]
  por mm0, [fdct_one_corr]
  paddsw mm5, mm7
  psubsw mm7, mm6
  movq [%1 + %3*2 + 1*16], mm0
  paddsw mm5, mm4
  movq [%1 + %3*2 + 3*16], mm7
  psubsw mm3, mm1
  movq [%1 + %3*2 + 5*16], mm5
  movq [%1 + %3*2 + 7*16], mm3
%endmacro

	;; Macro for row DCT using MMX punpcklwd instructions
	;; FDCT_ROW_MMX(int16_t *out, const int16_t *in, const int16_t *table);
	;;  - out, register name holding the out address
	;;  - in, register name holding the in address
	;;  - table coefficients address (register or absolute)
	;;
	;; Transforms one row of 8 words: the whole row is loaded before the
	;; two final stores, so 'out' may equal 'in'. Clobbers mm0-mm7.
%macro FDCT_ROW_MMX 3
	;; Build the word-reversed upper half of the row in mm1 using only
	;; plain MMX unpack/shift instructions
  movd mm1, [%2 + 6*2]
  punpcklwd mm1, [%2 + 4*2]
  movq mm2, mm1
  psrlq mm1, 0x20
  movq mm0, [%2 + 0*2]
  punpcklwd mm1, mm2
  movq mm5, mm0
  paddsw mm0, mm1
  psubsw mm5, mm1
  movq mm1, mm0
  movq mm6, mm5
	;; Swap the dword halves of the sum/difference vectors (mm1, mm6)
  punpckldq mm3, mm5
  punpckhdq mm6, mm3
  movq mm3, [%3 + 0*2]
  movq mm4, [%3 + 4*2]
  punpckldq mm2, mm0
  pmaddwd mm3, mm0
  punpckhdq mm1, mm2
  movq mm2, [%3 + 16*2]
  pmaddwd mm4, mm1
  pmaddwd mm0, [%3 + 8*2]
  movq mm7, [%3 + 20*2]
  pmaddwd mm2, mm5
  paddd mm3, [fdct_r_row]
  pmaddwd mm7, mm6
  pmaddwd mm1, [%3 + 12*2]
  paddd mm3, mm4
  pmaddwd mm5, [%3 + 24*2]
  pmaddwd mm6, [%3 + 28*2]
  paddd mm2, mm7
  paddd mm0, [fdct_r_row]
  psrad mm3, SHIFT_FRW_ROW
  paddd mm2, [fdct_r_row]
  paddd mm0, mm1
  paddd mm5, [fdct_r_row]
  psrad mm2, SHIFT_FRW_ROW
  paddd mm5, mm6
  psrad mm0, SHIFT_FRW_ROW
  psrad mm5, SHIFT_FRW_ROW
	;; Saturate back to words and interleave the two result vectors
  packssdw mm3, mm0
  packssdw mm2, mm5
  movq mm6, mm3
  punpcklwd mm3, mm2
  punpckhwd mm6, mm2
  movq [%1 + 0*2], mm3
  movq [%1 + 4*2], mm6
%endmacro

	;; Macro for row DCT using XMM instruction pshufw
	;; FDCT_ROW_XMM(int16_t *out, const int16_t *in, const int16_t *table);
	;;  - out, register name holding the out address
	;;  - in, register name holding the in address
	;;  - table coefficient address
	;;
	;; Same table layout, rounding and shift as FDCT_ROW_MMX; pshufw replaces
	;; the unpack/shift sequences used there to reorder words.
	;; The whole row is loaded before the two final stores, so 'out' may
	;; equal 'in'. Clobbers mm0-mm7.
%macro FDCT_ROW_XMM 3
	;; fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
  pshufw mm5, [%2 + 4*2], 0x1B	; upper half of the row, words reversed
  movq mm0, [%2 + 0*2]
  movq mm1, mm0
  paddsw mm0, mm5
  psubsw mm1, mm5
  pshufw mm2, mm0, 0x4E		; swap the dword halves
  pshufw mm3, mm1, 0x4E
  movq mm4, [%3 +  0*2]
  movq mm6, [%3 +  4*2]
  movq mm5, [%3 + 16*2]
  movq mm7, [%3 + 20*2]
  pmaddwd mm4, mm0
  pmaddwd mm5, mm1
  pmaddwd mm6, mm2
  pmaddwd mm7, mm3
  pmaddwd mm0, [%3 +  8*2]
  pmaddwd mm2, [%3 + 12*2]
  pmaddwd mm1, [%3 + 24*2]
  pmaddwd mm3, [%3 + 28*2]
  paddd mm4, mm6
  paddd mm5, mm7
  paddd mm0, mm2
  paddd mm1, mm3
  movq mm7, [fdct_r_row]
  paddd mm4, mm7
  paddd mm5, mm7
  paddd mm0, mm7
  paddd mm1, mm7
  psrad mm4, SHIFT_FRW_ROW
  psrad mm5, SHIFT_FRW_ROW
  psrad mm0, SHIFT_FRW_ROW
  psrad mm1, SHIFT_FRW_ROW
	;; Saturate back to words and interleave the two result vectors
  packssdw mm4, mm0
  packssdw mm5, mm1
  movq mm2, mm4
  punpcklwd mm4, mm5
  punpckhwd mm2, mm5
  movq [%1 + 0*2], mm4
  movq [%1 + 4*2], mm2
%endmacro

	;; MAKE_FDCT_FUNC <function name>, <row macro>
	;;  Emits an exported function that transforms, in place, the block
	;;  whose address is the first stack argument ([esp + 4]):
	;;   1. column pass over columns 0..3 then 4..7 (FDCT_COLUMN_COMMON)
	;;   2. row pass over the 8 rows with the given row macro
	;;  Clobbers eax and mm0-mm7; the non-unrolled variant also uses ecx
	;;  and edx. No EMMS is executed here, so the MMX state is left as is
	;;  on return (presumably the caller is expected to issue it).
%macro MAKE_FDCT_FUNC 2
ALIGN 16
cglobal %1
%1:
	;; Move the destination/source address to the eax register
  mov eax, [esp + 4]

	;; Process the columns (4 at a time)
  FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
  FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7

%ifdef UNROLLED_LOOP
	; Unrolled loop version
%assign i 0
%rep 8
	;; Process the 'i'th row
  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
	%assign i i+1
%endrep
%else
	;; Looping version: ecx = rows left, edx = coefficients of current row
  mov ecx, 8
  mov edx, tab_frw_01234567
ALIGN 8
.loop:
  %2 eax, eax, edx
  add eax, 2*8			; next row of the block (8 words)
  add edx, 2*32			; next group of row coefficients (32 words)
  dec ecx
  jne .loop
%endif

  ret
.endfunc:			; end marker used by cglobal for the symbol size
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
; void fdct_mmx_ffmpeg(int16_t block[64]);
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX

;-----------------------------------------------------------------------------
; void fdct_xmm_ffmpeg(int16_t block[64]);
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -