📄 fdct_mmx_ffmpeg.asm

📁 wince下的xvidcore开发库,可用于MP4等视频播放开发
💻 ASM
字号:
 ;/****************************************************************************
 ; *
 ; *  XVID MPEG-4 VIDEO CODEC
 ; *  - MMX and XMM forward discrete cosine transform -
 ; *
 ; *  Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
 ; *
 ; *  This program is free software; you can redistribute it and/or modify it
 ; *  under the terms of the GNU General Public License as published by
 ; *  the Free Software Foundation; either version 2 of the License, or
 ; *  (at your option) any later version.
 ; *
 ; *  This program is distributed in the hope that it will be useful,
 ; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 ; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ; *  GNU General Public License for more details.
 ; *
 ; *  You should have received a copy of the GNU General Public License
 ; *  along with this program; if not, write to the Free Software
 ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 ; *
 ; * $Id: fdct_mmx_ffmpeg.asm,v 1.1 2005/07/21 09:08:25 klschoef Exp $
 ; *
 ; ***************************************************************************/
 
 ;/****************************************************************************
 ; *
 ; *  Initial, but incomplete version provided by Intel at AppNote AP-922
 ; *    http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
 ; *  Copyright (C) 1999 Intel Corporation
 ; *
 ; *  Completed and corrected in fdctmm32.c/fdctmm32.doc
 ; *    http://members.tripod.com/~liaor/
 ; *  Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
 ; *
 ; *  Minimizing coefficients reordering changing the tables constants order
 ; *    http://ffmpeg.sourceforge.net/
 ; *  Copyright (C) 2001 Fabrice Bellard.
 ; *
 ; *  The version coded here is just a port to NASM syntax from the FFMPEG's
 ; *  version. So all credits go to the previous authors for all their
 ; *  respective work in order to have a nice/fast mmx fDCT.
 ; ***************************************************************************/
 
 BITS 32
 
 ;=============================================================================
 ; Macros and other preprocessor constants
 ;=============================================================================
 
 %macro cglobal 1
         %ifdef PREFIX
                 global _%1
                 %define %1 _%1
         %else
                 global %1
         %endif
 %endmacro
 
 ;;; Define this if you want an unrolled version of the code
 %define UNROLLED_LOOP
 
 %define BITS_FRW_ACC   3
 %define SHIFT_FRW_COL  BITS_FRW_ACC
 %define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17)
 %define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
 %define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))
 
 ;=============================================================================
 ; Local Data (Read Only)
 ;=============================================================================
 
 %ifdef FORMAT_COFF
 SECTION .rodata data
 %else
 SECTION .rodata data align=16
 %endif
 
 ALIGN 8
 tab_frw_01234567:
   dw  16384,   16384,   -8867,  -21407
   dw  16384,   16384,   21407,    8867
   dw  16384,  -16384,   21407,   -8867
   dw -16384,   16384,    8867,  -21407
   dw  22725,   19266,  -22725,  -12873
   dw  12873,    4520,   19266,   -4520
   dw  12873,  -22725,   19266,  -22725
   dw   4520,   19266,    4520,  -12873
 
   dw  22725,   22725,  -12299,  -29692
   dw  22725,   22725,   29692,   12299
   dw  22725,  -22725,   29692,  -12299
   dw -22725,   22725,   12299,  -29692
   dw  31521,   26722,  -31521,  -17855
   dw  17855,    6270,   26722,   -6270
   dw  17855,  -31521,   26722,  -31521
   dw   6270,   26722,    6270,  -17855
 
   dw  21407,   21407,  -11585,  -27969
   dw  21407,   21407,   27969,   11585
   dw  21407,  -21407,   27969,  -11585
   dw -21407,   21407,   11585,  -27969
   dw  29692,   25172,  -29692,  -16819
   dw  16819,    5906,   25172,   -5906
   dw  16819,  -29692,   25172,  -29692
   dw   5906,   25172,    5906,  -16819
 
   dw  19266,   19266,  -10426,  -25172
   dw  19266,   19266,   25172,   10426
   dw  19266,  -19266,   25172,  -10426
   dw -19266,   19266,   10426,  -25172
   dw  26722,   22654,  -26722,  -15137
   dw  15137,    5315,   22654,   -5315
   dw  15137,  -26722,   22654,  -26722
   dw   5315,   22654,    5315,  -15137
 
   dw  16384,   16384,   -8867,  -21407
   dw  16384,   16384,   21407,    8867
   dw  16384,  -16384,   21407,   -8867
   dw -16384,   16384,    8867,  -21407
   dw  22725,   19266,  -22725,  -12873
   dw  12873,    4520,   19266,   -4520
   dw  12873,  -22725,   19266,  -22725
   dw   4520,   19266,    4520,  -12873
 
   dw  19266,   19266,  -10426,  -25172
   dw  19266,   19266,   25172,   10426
   dw  19266,  -19266,   25172,  -10426
   dw -19266,   19266,   10426,  -25172
   dw  26722,   22654,  -26722,  -15137
   dw  15137,    5315,   22654,   -5315
   dw  15137,  -26722,   22654,  -26722
   dw   5315,   22654,    5315,  -15137
 
   dw  21407,   21407,  -11585,  -27969
   dw  21407,   21407,   27969,   11585
   dw  21407,  -21407,   27969,  -11585
   dw -21407,   21407,   11585,  -27969
   dw  29692,   25172,  -29692,  -16819
   dw  16819,    5906,   25172,   -5906
   dw  16819,  -29692,   25172,  -29692
   dw   5906,   25172,    5906,  -16819,
 
   dw  22725,   22725,  -12299,  -29692
   dw  22725,   22725,   29692,   12299
   dw  22725,  -22725,   29692,  -12299
   dw -22725,   22725,   12299,  -29692
   dw  31521,   26722,  -31521,  -17855
   dw  17855,    6270,   26722,   -6270
   dw  17855,  -31521,   26722,  -31521
   dw   6270,   26722,    6270,  -17855
 
 ALIGN 8
 fdct_one_corr:
   dw 1, 1, 1, 1
 
 ALIGN 8
 fdct_tg_all_16:
   dw  13036,    13036,  13036,  13036
   dw  27146,    27146,  27146,  27146
   dw -21746, -21746, -21746, -21746
 
 ALIGN 8
 cos_4_16:
   dw -19195, -19195, -19195, -19195
 
 ALIGN 8
 ocos_4_16:
   dw 23170, 23170, 23170, 23170
 
 ALIGN 8
 fdct_r_row:
   dd RND_FRW_ROW, RND_FRW_ROW
 
 ;=============================================================================
 ; Factorized parts of the code turned into macros for better understanding
 ;=============================================================================
 
         ;; Macro for column DCT
         ;; FDCT_COLUMN_MMX(int16_t *out, const int16_t *in, int offset);
         ;;  - out, register name holding the out address
         ;;  - in, register name holding the in address
         ;;  - column number to process
 %macro FDCT_COLUMN_COMMON 3
   movq mm0, [%2 + %3*2 + 1*16]
   movq mm1, [%2 + %3*2 + 6*16]
   movq mm2, mm0
   movq mm3, [%2 + %3*2 + 2*16]
   paddsw mm0, mm1
   movq mm4, [%2 + %3*2 + 5*16]
   psllw mm0, SHIFT_FRW_COL
   movq mm5, [%2 + %3*2 + 0*16]
   paddsw mm4, mm3
   paddsw mm5, [%2 + %3*2 + 7*16]
   psllw mm4, SHIFT_FRW_COL
   movq mm6, mm0
   psubsw mm2, mm1
   movq mm1, [fdct_tg_all_16 + 4*2]
   psubsw mm0, mm4
   movq mm7, [%2 + %3*2 + 3*16]
   pmulhw mm1, mm0
   paddsw mm7, [%2 + %3*2 + 4*16]
   psllw mm5, SHIFT_FRW_COL
   paddsw mm6, mm4
   psllw mm7, SHIFT_FRW_COL
   movq mm4, mm5
   psubsw mm5, mm7
   paddsw mm1, mm5
   paddsw mm4, mm7
   por mm1, [fdct_one_corr]
   psllw mm2, SHIFT_FRW_COL + 1
   pmulhw mm5, [fdct_tg_all_16 + 4*2]
   movq mm7, mm4
   psubsw mm3, [%2 + %3*2 + 5*16]
   psubsw mm4, mm6
   movq [%1 + %3*2 + 2*16], mm1
   paddsw mm7, mm6
   movq mm1, [%2 + %3*2 + 3*16]
   psllw mm3, SHIFT_FRW_COL + 1
   psubsw mm1, [%2 + %3*2 + 4*16]
   movq mm6, mm2
   movq [%1 + %3*2 + 4*16], mm4
   paddsw mm2, mm3
   pmulhw mm2, [ocos_4_16]
   psubsw mm6, mm3
   pmulhw mm6, [ocos_4_16]
   psubsw mm5, mm0
   por mm5, [fdct_one_corr]
   psllw mm1, SHIFT_FRW_COL
   por mm2, [fdct_one_corr]
   movq mm4, mm1
   movq mm3, [%2 + %3*2 + 0*16]
   paddsw mm1, mm6
   psubsw mm3, [%2 + %3*2 + 7*16]
   psubsw mm4, mm6
   movq mm0, [fdct_tg_all_16 + 0*2]
   psllw mm3, SHIFT_FRW_COL
   movq mm6, [fdct_tg_all_16 + 8*2]
   pmulhw mm0, mm1
   movq [%1 + %3*2 + 0*16], mm7
   pmulhw mm6, mm4
   movq [%1 + %3*2 + 6*16], mm5
   movq mm7, mm3
   movq mm5, [fdct_tg_all_16 + 8*2]
   psubsw mm7, mm2
   paddsw mm3, mm2
   pmulhw mm5, mm7
   paddsw mm0, mm3
   paddsw mm6, mm4
   pmulhw mm3, [fdct_tg_all_16 + 0*2]
   por mm0, [fdct_one_corr]
   paddsw mm5, mm7
   psubsw mm7, mm6
   movq [%1 + %3*2 + 1*16], mm0
   paddsw mm5, mm4
   movq [%1 + %3*2 + 3*16], mm7
   psubsw mm3, mm1
   movq [%1 + %3*2 + 5*16], mm5
   movq [%1 + %3*2 + 7*16], mm3
 %endmacro
 
         ;; Macro for row DCT using MMX punpcklw instructions
         ;; FDCT_ROW_MMX(int16_t *out, const int16_t *in, const int16_t *table);
         ;;  - out, register name holding the out address
         ;;  - in, register name holding the in address
         ;;  - table coefficients address (register or absolute)
 %macro FDCT_ROW_MMX 3
   movd mm1, [%2 + 6*2]
   punpcklwd mm1, [%2 + 4*2]
   movq mm2, mm1
   psrlq mm1, 0x20
   movq mm0, [%2 + 0*2]
   punpcklwd mm1, mm2
   movq mm5, mm0
   paddsw mm0, mm1
   psubsw mm5, mm1
   movq mm1, mm0
   movq mm6, mm5
   punpckldq mm3, mm5
   punpckhdq mm6, mm3
   movq mm3, [%3 + 0*2]
   movq mm4, [%3 + 4*2]
   punpckldq mm2, mm0
   pmaddwd mm3, mm0
   punpckhdq mm1, mm2
   movq mm2, [%3 + 16*2]
   pmaddwd mm4, mm1
   pmaddwd mm0, [%3 + 8*2]
   movq mm7, [%3 + 20*2]
   pmaddwd mm2, mm5
   paddd mm3, [fdct_r_row]
   pmaddwd mm7, mm6
   pmaddwd mm1, [%3 + 12*2]
   paddd mm3, mm4
   pmaddwd mm5, [%3 + 24*2]
   pmaddwd mm6, [%3 + 28*2]
   paddd mm2, mm7
   paddd mm0, [fdct_r_row]
   psrad mm3, SHIFT_FRW_ROW
   paddd mm2, [fdct_r_row]
   paddd mm0, mm1
   paddd mm5, [fdct_r_row]
   psrad mm2, SHIFT_FRW_ROW
   paddd mm5, mm6
   psrad mm0, SHIFT_FRW_ROW
   psrad mm5, SHIFT_FRW_ROW
   packssdw mm3, mm0
   packssdw mm2, mm5
   movq mm6, mm3
   punpcklwd mm3, mm2
   punpckhwd mm6, mm2
   movq [%1 + 0*2], mm3
   movq [%1 + 4*2], mm6
 %endmacro
 
         ;; Macro for column DCT using XMM instuction pshufw
         ;; FDCT_ROW_XMM(int16_t *out, const int16_t *in, const int16_t *table);
         ;;  - out, register name holding the out address
         ;;  - in, register name holding the in address
         ;;  - table coefficient address
 %macro FDCT_ROW_XMM 3
         ;; fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
   pshufw mm5, [%2 + 4*2], 0x1B
   movq mm0, [%2 + 0*2]
   movq mm1, mm0
   paddsw mm0, mm5
   psubsw mm1, mm5
   pshufw mm2, mm0, 0x4E
   pshufw mm3, mm1, 0x4E
   movq mm4, [%3 +  0*2]
   movq mm6, [%3 +  4*2]
   movq mm5, [%3 + 16*2]
   movq mm7, [%3 + 20*2]
   pmaddwd mm4, mm0
   pmaddwd mm5, mm1
   pmaddwd mm6, mm2
   pmaddwd mm7, mm3
   pmaddwd mm0, [%3 +  8*2]
   pmaddwd mm2, [%3 + 12*2]
   pmaddwd mm1, [%3 + 24*2]
   pmaddwd mm3, [%3 + 28*2]
   paddd mm4, mm6
   paddd mm5, mm7
   paddd mm0, mm2
   paddd mm1, mm3
   movq mm7, [fdct_r_row]
   paddd mm4, mm7
   paddd mm5, mm7
   paddd mm0, mm7
   paddd mm1, mm7
   psrad mm4, SHIFT_FRW_ROW
   psrad mm5, SHIFT_FRW_ROW
   psrad mm0, SHIFT_FRW_ROW
   psrad mm1, SHIFT_FRW_ROW
   packssdw mm4, mm0
   packssdw mm5, mm1
   movq mm2, mm4
   punpcklwd mm4, mm5
   punpckhwd mm2, mm5
   movq [%1 + 0*2], mm4
   movq [%1 + 4*2], mm2
 %endmacro
 
 %macro MAKE_FDCT_FUNC 2
 ALIGN 16
 cglobal %1
 %1:
         ;; Move the destination/source address to the eax register
   mov eax, [esp + 4]
 
         ;; Process the columns (4 at a time)
   FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
   FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
 
 %ifdef UNROLLED_LOOP
         ; Unrolled loop version
 %assign i 0
 %rep 8
         ;; Process the 'i'th row
   %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
         %assign i i+1
 %endrep
 %else
   mov ecx, 8
   mov edx, tab_frw_01234567
 ALIGN 8
 .loop
   %2 eax, eax, edx
   add eax, 2*8
   add edx, 2*32
   dec ecx
   jne .loop
 %endif
 
   ret
 %endmacro
 
 ;=============================================================================
 ; Code
 ;=============================================================================
 
 SECTION .text
 
 ;-----------------------------------------------------------------------------
 ; void fdct_mmx_ffmpeg(int16_t block[64]);
 ;-----------------------------------------------------------------------------
 
 MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX
 
 ;-----------------------------------------------------------------------------
 ; void fdct_xmm_ffmpeg(int16_t block[64]);
 ;-----------------------------------------------------------------------------
 
 MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -