; fdct_mmx_ffmpeg.asm
;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - MMX and XMM forward discrete cosine transform -
; *
; * Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
; *
; * This program is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: fdct_mmx_ffmpeg.asm,v 1.1 2005/07/21 09:08:25 klschoef Exp $
; *
; ***************************************************************************/
;/****************************************************************************
; *
; * Initial, but incomplete version provided by Intel at AppNote AP-922
; * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; * Copyright (C) 1999 Intel Corporation
; *
; * Completed and corrected in fdctmm32.c/fdctmm32.doc
; * http://members.tripod.com/~liaor/
; * Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
; *
; * Minimizing coefficients reordering changing the tables constants order
; * http://ffmpeg.sourceforge.net/
; * Copyright (C) 2001 Fabrice Bellard.
; *
; * The version coded here is just a port to NASM syntax from the FFMPEG's
; * version. So all credits go to the previous authors for all their
; * respective work in order to have a nice/fast mmx fDCT.
; ***************************************************************************/
;; Assemble for 32-bit mode; the entry points generated below fetch their
;; single argument from the stack at [esp + 4].
BITS 32
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
;; cglobal <symbol>
;; Export <symbol> as a global label. When PREFIX is defined the exported
;; name carries a leading underscore, and <symbol> is aliased to that
;; underscored name for the remainder of the file.
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro
;;; Keep this defined to get the fully unrolled row pass; remove it to get
;;; the looped version instead
%define UNROLLED_LOOP

;;; Extra fractional bits carried between the column and the row pass
%define BITS_FRW_ACC    3

;;; Column pass: left shift applied to the inputs, and the matching half unit
;;; (RND_FRW_COL is not referenced by the code visible in this file)
%define SHIFT_FRW_COL   (BITS_FRW_ACC)
%define RND_FRW_COL     (1 << ((SHIFT_FRW_COL) - 1))

;;; Row pass: right shift applied to the 32-bit sums, and the rounding bias
;;; that is added before that shift
%define SHIFT_FRW_ROW   (17 + BITS_FRW_ACC)
%define RND_FRW_ROW     (1 << ((SHIFT_FRW_ROW) - 1))
;=============================================================================
; Local Data (Read Only)
;=============================================================================
;; Select the read-only data section. When FORMAT_COFF is defined the section
;; is declared without an alignment attribute; otherwise 16-byte section
;; alignment is requested as well.
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif
;; Row-pass coefficient table.
;; It holds eight sub-tables of 32 words (64 bytes) each, one per block row:
;; MAKE_FDCT_FUNC hands sub-table i (byte offset 2*32*i) to the row macro that
;; processes row i, and the row macros read it four words at a time at word
;; offsets 0, 4, 8, 12, 16, 20, 24 and 28.
;; The sub-tables for rows 0/4, 1/7, 2/6 and 3/5 hold identical values.
ALIGN 8
tab_frw_01234567:
	;; row 0
	dw 16384, 16384, -8867, -21407
	dw 16384, 16384, 21407, 8867
	dw 16384, -16384, 21407, -8867
	dw -16384, 16384, 8867, -21407
	dw 22725, 19266, -22725, -12873
	dw 12873, 4520, 19266, -4520
	dw 12873, -22725, 19266, -22725
	dw 4520, 19266, 4520, -12873
	;; row 1
	dw 22725, 22725, -12299, -29692
	dw 22725, 22725, 29692, 12299
	dw 22725, -22725, 29692, -12299
	dw -22725, 22725, 12299, -29692
	dw 31521, 26722, -31521, -17855
	dw 17855, 6270, 26722, -6270
	dw 17855, -31521, 26722, -31521
	dw 6270, 26722, 6270, -17855
	;; row 2
	dw 21407, 21407, -11585, -27969
	dw 21407, 21407, 27969, 11585
	dw 21407, -21407, 27969, -11585
	dw -21407, 21407, 11585, -27969
	dw 29692, 25172, -29692, -16819
	dw 16819, 5906, 25172, -5906
	dw 16819, -29692, 25172, -29692
	dw 5906, 25172, 5906, -16819
	;; row 3
	dw 19266, 19266, -10426, -25172
	dw 19266, 19266, 25172, 10426
	dw 19266, -19266, 25172, -10426
	dw -19266, 19266, 10426, -25172
	dw 26722, 22654, -26722, -15137
	dw 15137, 5315, 22654, -5315
	dw 15137, -26722, 22654, -26722
	dw 5315, 22654, 5315, -15137
	;; row 4 (same values as row 0)
	dw 16384, 16384, -8867, -21407
	dw 16384, 16384, 21407, 8867
	dw 16384, -16384, 21407, -8867
	dw -16384, 16384, 8867, -21407
	dw 22725, 19266, -22725, -12873
	dw 12873, 4520, 19266, -4520
	dw 12873, -22725, 19266, -22725
	dw 4520, 19266, 4520, -12873
	;; row 5 (same values as row 3)
	dw 19266, 19266, -10426, -25172
	dw 19266, 19266, 25172, 10426
	dw 19266, -19266, 25172, -10426
	dw -19266, 19266, 10426, -25172
	dw 26722, 22654, -26722, -15137
	dw 15137, 5315, 22654, -5315
	dw 15137, -26722, 22654, -26722
	dw 5315, 22654, 5315, -15137
	;; row 6 (same values as row 2)
	dw 21407, 21407, -11585, -27969
	dw 21407, 21407, 27969, 11585
	dw 21407, -21407, 27969, -11585
	dw -21407, 21407, 11585, -27969
	dw 29692, 25172, -29692, -16819
	dw 16819, 5906, 25172, -5906
	dw 16819, -29692, 25172, -29692
	dw 5906, 25172, 5906, -16819
	;; row 7 (same values as row 1)
	dw 22725, 22725, -12299, -29692
	dw 22725, 22725, 29692, 12299
	dw 22725, -22725, 29692, -12299
	dw -22725, 22725, 12299, -29692
	dw 31521, 26722, -31521, -17855
	dw 17855, 6270, 26722, -6270
	dw 17855, -31521, 26722, -31521
	dw 6270, 26722, 6270, -17855
;; Small constant vectors. Each word constant is replicated four times so
;; that it fills one 64-bit MMX operand.

;; OR-ed into several column-pass results to force bit 0 of every word to 1
ALIGN 8
fdct_one_corr:
	times 4 dw 1

;; Three multipliers used with pmulhw in the column pass, at byte offsets
;; 0, 8 and 16 from this label
ALIGN 8
fdct_tg_all_16:
	times 4 dw 13036
	times 4 dw 27146
	times 4 dw -21746

;; Not referenced by the code visible in this file
ALIGN 8
cos_4_16:
	times 4 dw -19195

;; Multiplier used with pmulhw in the column pass
ALIGN 8
ocos_4_16:
	times 4 dw 23170

;; Rounding bias added to both 32-bit lanes before the row-pass right shift
ALIGN 8
fdct_r_row:
	times 2 dd RND_FRW_ROW
;=============================================================================
; Factorized parts of the code turned into macros for better understanding
;=============================================================================
;; Macro for the column (vertical) DCT pass
;; FDCT_COLUMN_COMMON out, in, offset
;; - out, register name holding the out address
;; - in, register name holding the in address
;; - offset, index of the first column to process; one expansion handles the
;;   four adjacent 16-bit columns that fit in one MMX register
;;
;; Block row k is addressed as [base + offset*2 + k*16] (16 bytes per row).
;; 'in' and 'out' may be the same register: every input row is read for the
;; last time before the output row with the same index is stored.
;; Uses mm0-mm7; only MMX registers and the 'out' rows are written.
;;
;; Notation used in the comments below:
;;   xk          = input row k (four columns)
;;   S           = SHIFT_FRW_COL
;;   a0..a3      = (x0+x7)<<S, (x1+x6)<<S, (x2+x5)<<S, (x3+x4)<<S
;;   d16, d25    = (x1-x6)<<(S+1), (x2-x5)<<(S+1)
;;   d34, d07    = (x3-x4)<<S, (x0-x7)<<S
;;   TG1/TG2/TG3 = first/second/third vector of fdct_tg_all_16
;;   OCOS4       = ocos_4_16
;;   mulhi(a, b) = pmulhw, the high 16 bits of the signed 16x16 product
;; All word additions and subtractions are the saturating forms (paddsw/psubsw).
%macro FDCT_COLUMN_COMMON 3
movq mm0, [%2 + %3*2 + 1*16]            ; mm0 = x1
movq mm1, [%2 + %3*2 + 6*16]            ; mm1 = x6
movq mm2, mm0                           ; mm2 = x1
movq mm3, [%2 + %3*2 + 2*16]            ; mm3 = x2
paddsw mm0, mm1                         ; mm0 = x1 + x6
movq mm4, [%2 + %3*2 + 5*16]            ; mm4 = x5
psllw mm0, SHIFT_FRW_COL                ; mm0 = a1
movq mm5, [%2 + %3*2 + 0*16]            ; mm5 = x0
paddsw mm4, mm3                         ; mm4 = x2 + x5
paddsw mm5, [%2 + %3*2 + 7*16]          ; mm5 = x0 + x7
psllw mm4, SHIFT_FRW_COL                ; mm4 = a2
movq mm6, mm0                           ; mm6 = a1
psubsw mm2, mm1                         ; mm2 = x1 - x6
movq mm1, [fdct_tg_all_16 + 4*2]        ; mm1 = TG2
psubsw mm0, mm4                         ; mm0 = a1 - a2
movq mm7, [%2 + %3*2 + 3*16]            ; mm7 = x3
pmulhw mm1, mm0                         ; mm1 = mulhi(TG2, a1 - a2)
paddsw mm7, [%2 + %3*2 + 4*16]          ; mm7 = x3 + x4
psllw mm5, SHIFT_FRW_COL                ; mm5 = a0
paddsw mm6, mm4                         ; mm6 = a1 + a2
psllw mm7, SHIFT_FRW_COL                ; mm7 = a3
movq mm4, mm5                           ; mm4 = a0
psubsw mm5, mm7                         ; mm5 = a0 - a3
paddsw mm1, mm5                         ; mm1 = (a0 - a3) + mulhi(TG2, a1 - a2)
paddsw mm4, mm7                         ; mm4 = a0 + a3
por mm1, [fdct_one_corr]                ; force bit 0 of every word to 1
psllw mm2, SHIFT_FRW_COL + 1            ; mm2 = d16
pmulhw mm5, [fdct_tg_all_16 + 4*2]      ; mm5 = mulhi(TG2, a0 - a3)
movq mm7, mm4                           ; mm7 = a0 + a3
psubsw mm3, [%2 + %3*2 + 5*16]          ; mm3 = x2 - x5
psubsw mm4, mm6                         ; mm4 = (a0 + a3) - (a1 + a2)
movq [%1 + %3*2 + 2*16], mm1            ; store output row 2
paddsw mm7, mm6                         ; mm7 = (a0 + a3) + (a1 + a2)
movq mm1, [%2 + %3*2 + 3*16]            ; mm1 = x3
psllw mm3, SHIFT_FRW_COL + 1            ; mm3 = d25
psubsw mm1, [%2 + %3*2 + 4*16]          ; mm1 = x3 - x4
movq mm6, mm2                           ; mm6 = d16
movq [%1 + %3*2 + 4*16], mm4            ; store output row 4
paddsw mm2, mm3                         ; mm2 = d16 + d25
pmulhw mm2, [ocos_4_16]                 ; mm2 = p = mulhi(OCOS4, d16 + d25)
psubsw mm6, mm3                         ; mm6 = d16 - d25
pmulhw mm6, [ocos_4_16]                 ; mm6 = q = mulhi(OCOS4, d16 - d25)
psubsw mm5, mm0                         ; mm5 = mulhi(TG2, a0 - a3) - (a1 - a2)
por mm5, [fdct_one_corr]                ; force bit 0 of every word to 1
psllw mm1, SHIFT_FRW_COL                ; mm1 = d34
por mm2, [fdct_one_corr]                ; p: force bit 0 of every word to 1
movq mm4, mm1                           ; mm4 = d34
movq mm3, [%2 + %3*2 + 0*16]            ; mm3 = x0
paddsw mm1, mm6                         ; mm1 = u = d34 + q
psubsw mm3, [%2 + %3*2 + 7*16]          ; mm3 = x0 - x7
psubsw mm4, mm6                         ; mm4 = v = d34 - q
movq mm0, [fdct_tg_all_16 + 0*2]        ; mm0 = TG1
psllw mm3, SHIFT_FRW_COL                ; mm3 = d07
movq mm6, [fdct_tg_all_16 + 8*2]        ; mm6 = TG3
pmulhw mm0, mm1                         ; mm0 = mulhi(TG1, u)
movq [%1 + %3*2 + 0*16], mm7            ; store output row 0
pmulhw mm6, mm4                         ; mm6 = mulhi(TG3, v)
movq [%1 + %3*2 + 6*16], mm5            ; store output row 6
movq mm7, mm3                           ; mm7 = d07
movq mm5, [fdct_tg_all_16 + 8*2]        ; mm5 = TG3
psubsw mm7, mm2                         ; mm7 = w = d07 - p
paddsw mm3, mm2                         ; mm3 = z = d07 + p
pmulhw mm5, mm7                         ; mm5 = mulhi(TG3, w)
paddsw mm0, mm3                         ; mm0 = z + mulhi(TG1, u)
paddsw mm6, mm4                         ; mm6 = v + mulhi(TG3, v)
pmulhw mm3, [fdct_tg_all_16 + 0*2]      ; mm3 = mulhi(TG1, z)
por mm0, [fdct_one_corr]                ; force bit 0 of every word to 1
paddsw mm5, mm7                         ; mm5 = w + mulhi(TG3, w)
psubsw mm7, mm6                         ; mm7 = w - (v + mulhi(TG3, v))
movq [%1 + %3*2 + 1*16], mm0            ; store output row 1
paddsw mm5, mm4                         ; mm5 = v + w + mulhi(TG3, w)
movq [%1 + %3*2 + 3*16], mm7            ; store output row 3
psubsw mm3, mm1                         ; mm3 = mulhi(TG1, z) - u
movq [%1 + %3*2 + 5*16], mm5            ; store output row 5
movq [%1 + %3*2 + 7*16], mm3            ; store output row 7
%endmacro
;; Macro for the row DCT pass using only plain MMX punpck* instructions
;; FDCT_ROW_MMX out, in, table
;; - out, register name holding the out address
;; - in, register name holding the in address
;; - table coefficients address (register or absolute)
;;
;; Transforms one row of eight 16-bit values (x0..x7). All loads from 'in'
;; happen before the two stores to 'out', so both may name the same row.
;; Uses mm0-mm7. In the comments, word tuples are listed from the lowest to
;; the highest word, "." denotes pmaddwd (pairwise multiply and add into two
;; 32-bit lanes), and s / d are the sum and difference vectors built below.
%macro FDCT_ROW_MMX 3
movd mm1, [%2 + 6*2]                    ; mm1 = (x6, x7, 0, 0)
punpcklwd mm1, [%2 + 4*2]               ; mm1 = (x6, x4, x7, x5)
movq mm2, mm1                           ; mm2 = (x6, x4, x7, x5)
psrlq mm1, 0x20                         ; mm1 = (x7, x5, 0, 0)
movq mm0, [%2 + 0*2]                    ; mm0 = (x0, x1, x2, x3)
punpcklwd mm1, mm2                      ; mm1 = (x7, x6, x5, x4)
movq mm5, mm0                           ; mm5 = (x0, x1, x2, x3)
paddsw mm0, mm1                         ; mm0 = s = (x0+x7, x1+x6, x2+x5, x3+x4)
psubsw mm5, mm1                         ; mm5 = d = (x0-x7, x1-x6, x2-x5, x3-x4)
movq mm1, mm0                           ; mm1 = s
movq mm6, mm5                           ; mm6 = d
punpckldq mm3, mm5                      ; high dword of mm3 = (d0, d1); the low dword is stale and never used
punpckhdq mm6, mm3                      ; mm6 = (d2, d3, d0, d1)
movq mm3, [%3 + 0*2]                    ; mm3 = table words 0-3
movq mm4, [%3 + 4*2]                    ; mm4 = table words 4-7
punpckldq mm2, mm0                      ; high dword of mm2 = (s0, s1)
pmaddwd mm3, mm0                        ; mm3 = words 0-3 . s
punpckhdq mm1, mm2                      ; mm1 = (s2, s3, s0, s1)
movq mm2, [%3 + 16*2]                   ; mm2 = table words 16-19
pmaddwd mm4, mm1                        ; mm4 = words 4-7 . (s2, s3, s0, s1)
pmaddwd mm0, [%3 + 8*2]                 ; mm0 = s . words 8-11
movq mm7, [%3 + 20*2]                   ; mm7 = table words 20-23
pmaddwd mm2, mm5                        ; mm2 = words 16-19 . d
paddd mm3, [fdct_r_row]                 ; add the rounding bias
pmaddwd mm7, mm6                        ; mm7 = words 20-23 . (d2, d3, d0, d1)
pmaddwd mm1, [%3 + 12*2]                ; mm1 = (s2, s3, s0, s1) . words 12-15
paddd mm3, mm4                          ; first pair of sum-derived results (32-bit)
pmaddwd mm5, [%3 + 24*2]                ; mm5 = d . words 24-27
pmaddwd mm6, [%3 + 28*2]                ; mm6 = (d2, d3, d0, d1) . words 28-31
paddd mm2, mm7                          ; first pair of difference-derived results (32-bit)
paddd mm0, [fdct_r_row]                 ; add the rounding bias
psrad mm3, SHIFT_FRW_ROW                ; scale back down
paddd mm2, [fdct_r_row]                 ; add the rounding bias
paddd mm0, mm1                          ; second pair of sum-derived results (32-bit)
paddd mm5, [fdct_r_row]                 ; add the rounding bias
psrad mm2, SHIFT_FRW_ROW                ; scale back down
paddd mm5, mm6                          ; second pair of difference-derived results (32-bit)
psrad mm0, SHIFT_FRW_ROW                ; scale back down
psrad mm5, SHIFT_FRW_ROW                ; scale back down
packssdw mm3, mm0                       ; mm3 = the four sum-derived results, saturated to int16
packssdw mm2, mm5                       ; mm2 = the four difference-derived results, saturated to int16
movq mm6, mm3                           ; keep a copy for the high interleave
punpcklwd mm3, mm2                      ; interleave: sum-derived results land in even output words,
punpckhwd mm6, mm2                      ;   difference-derived results in odd output words
movq [%1 + 0*2], mm3                    ; store output words 0-3
movq [%1 + 4*2], mm6                    ; store output words 4-7
%endmacro
;; Macro for the row DCT pass using the pshufw instruction (an MMX-register
;; shuffle that is not part of the base MMX instruction set)
;; FDCT_ROW_XMM out, in, table
;; - out, register name holding the out address
;; - in, register name holding the in address
;; - table coefficient address
;;
;; Transforms one row of eight 16-bit values (x0..x7). All loads from 'in'
;; happen before the two stores to 'out', so both may name the same row.
;; Uses mm0-mm7. In the comments, word tuples are listed from the lowest to
;; the highest word, "." denotes pmaddwd (pairwise multiply and add into two
;; 32-bit lanes), and s / d are the sum and difference vectors built below.
%macro FDCT_ROW_XMM 3
;; Same arithmetic as FDCT_ROW_MMX, with the word reordering done by pshufw
pshufw mm5, [%2 + 4*2], 0x1B            ; mm5 = (x7, x6, x5, x4): 0x1B reverses the four words
movq mm0, [%2 + 0*2]                    ; mm0 = (x0, x1, x2, x3)
movq mm1, mm0                           ; mm1 = (x0, x1, x2, x3)
paddsw mm0, mm5                         ; mm0 = s = (x0+x7, x1+x6, x2+x5, x3+x4)
psubsw mm1, mm5                         ; mm1 = d = (x0-x7, x1-x6, x2-x5, x3-x4)
pshufw mm2, mm0, 0x4E                   ; mm2 = (s2, s3, s0, s1): 0x4E swaps the two dwords
pshufw mm3, mm1, 0x4E                   ; mm3 = (d2, d3, d0, d1)
movq mm4, [%3 + 0*2]                    ; mm4 = table words 0-3
movq mm6, [%3 + 4*2]                    ; mm6 = table words 4-7
movq mm5, [%3 + 16*2]                   ; mm5 = table words 16-19
movq mm7, [%3 + 20*2]                   ; mm7 = table words 20-23
pmaddwd mm4, mm0                        ; mm4 = words 0-3 . s
pmaddwd mm5, mm1                        ; mm5 = words 16-19 . d
pmaddwd mm6, mm2                        ; mm6 = words 4-7 . (s2, s3, s0, s1)
pmaddwd mm7, mm3                        ; mm7 = words 20-23 . (d2, d3, d0, d1)
pmaddwd mm0, [%3 + 8*2]                 ; mm0 = s . words 8-11
pmaddwd mm2, [%3 + 12*2]                ; mm2 = (s2, s3, s0, s1) . words 12-15
pmaddwd mm1, [%3 + 24*2]                ; mm1 = d . words 24-27
pmaddwd mm3, [%3 + 28*2]                ; mm3 = (d2, d3, d0, d1) . words 28-31
paddd mm4, mm6                          ; first pair of sum-derived results (32-bit)
paddd mm5, mm7                          ; first pair of difference-derived results (32-bit)
paddd mm0, mm2                          ; second pair of sum-derived results (32-bit)
paddd mm1, mm3                          ; second pair of difference-derived results (32-bit)
movq mm7, [fdct_r_row]                  ; mm7 = rounding bias for both 32-bit lanes
paddd mm4, mm7                          ; add the rounding bias to all four accumulators
paddd mm5, mm7
paddd mm0, mm7
paddd mm1, mm7
psrad mm4, SHIFT_FRW_ROW                ; scale all four accumulators back down
psrad mm5, SHIFT_FRW_ROW
psrad mm0, SHIFT_FRW_ROW
psrad mm1, SHIFT_FRW_ROW
packssdw mm4, mm0                       ; mm4 = the four sum-derived results, saturated to int16
packssdw mm5, mm1                       ; mm5 = the four difference-derived results, saturated to int16
movq mm2, mm4                           ; keep a copy for the high interleave
punpcklwd mm4, mm5                      ; interleave: sum-derived results land in even output words,
punpckhwd mm2, mm5                      ;   difference-derived results in odd output words
movq [%1 + 0*2], mm4                    ; store output words 0-3
movq [%1 + 4*2], mm2                    ; store output words 4-7
%endmacro
;; MAKE_FDCT_FUNC name, row_macro
;; - name, symbol of the function to generate (exported through cglobal)
;; - row_macro, macro used for the row pass; it is invoked as
;;   "row_macro out, in, table" (FDCT_ROW_MMX or FDCT_ROW_XMM)
;;
;; Generated function:
;;   In:    [esp + 4] = address of the 8x8 block of 16-bit coefficients
;;   Out:   the block is transformed in place (the same address is passed as
;;          both 'in' and 'out' to every pass)
;;   Clobb: eax, mm0-mm7; the looped variant also uses ecx, edx and the flags
;;   NOTE(review): no EMMS is executed before RET; presumably callers are
;;   expected to clear the MMX state themselves -- confirm.
%macro MAKE_FDCT_FUNC 2
ALIGN 16
cglobal %1
%1:
	;; Move the destination/source address to the eax register
	mov eax, [esp + 4]

	;; Process the columns (4 at a time)
	FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
	FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7

%ifdef UNROLLED_LOOP
	; Unrolled loop version: row i lives at byte offset 2*i*8 and uses the
	; coefficient sub-table at byte offset 2*32*i
%assign i 0
%rep 8
	;; Process the 'i'th row
	%2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
%assign i i+1
%endrep
%else
	; Looped version: ecx counts the remaining rows, eax walks the block
	; and edx walks the coefficient table
	mov ecx, 8
	mov edx, tab_frw_01234567
ALIGN 8
.loop:
	%2 eax, eax, edx
	add eax, 2*8			; next row (8 words)
	add edx, 2*32			; next coefficient sub-table (32 words)
	dec ecx
	jne .loop
%endif

	ret
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text
;-----------------------------------------------------------------------------
; void fdct_mmx_ffmpeg(int16_t block[64]);
;
; Generated by MAKE_FDCT_FUNC: reads the block address from [esp + 4] and
; transforms the block in place; the row pass uses FDCT_ROW_MMX.
;-----------------------------------------------------------------------------
MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX
;-----------------------------------------------------------------------------
; void fdct_xmm_ffmpeg(int16_t block[64]);
;
; Same as fdct_mmx_ffmpeg, except that the row pass uses FDCT_ROW_XMM, which
; relies on the pshufw instruction.
;-----------------------------------------------------------------------------
MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM
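; (Code-viewer residue, kept only as a comment; it is not part of the source.)
; Keyboard shortcuts: copy code = Ctrl + C; search code = Ctrl + F;
; full screen = F11; toggle theme = Ctrl + Shift + D; show shortcuts = ?;
; increase font size = Ctrl + =; decrease font size = Ctrl + -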