⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 skl_fdct_mmx.asm

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 ASM
📖 第 1 页 / 共 2 页
字号:
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MMX and XMM forward discrete cosine transform -
; *
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
; *
; ***************************************************************************/

%include "skl_nasm.h"

;;; Define this if you want an unrolled version of the code
%define UNROLLED_LOOP

;=============================================================================
;
; Vertical pass is an implementation of the scheme:
;  Loeffler C., Ligtenberg A., and Moschytz G.S.:
;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
;  Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication,
; (see also Intel's Application Note 922:
;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
;  Copyright (C) 1999 Intel Corporation)
;
; Notes:
;  * tan(3pi/16) is greater than 0.5, and would use the
;    sign bit when turned into 16b fixed-point precision. So,
;    we use the trick: x*tan3 = x*(tan3-1)+x
;
;  * There's only one SSE-specific instruction (pshufw).
;    Porting to SSE2 also seems straightforward.
;
;  * There are still 1 or 2 ticks to save in fLLM_PASS, but
;    I prefer having readable code instead of tightly
;    scheduled code...
;
;  * Quantization stage (as well as pre-transposition for the
;    idct way back) can be included in the fTab* constants
;    (with induced loss of precision, somehow)
;
;  * Some more details at: http://skal.planet-d.net/coding/dct.html
;
;=============================================================================
;
;   idct-like IEEE errors:
;
;  =========================
;  Peak error:   1.0000
;  Peak MSE:     0.0365
;  Overall MSE:  0.0201
;  Peak ME:      0.0265
;  Overall ME:   0.0006
;
;  == Mean square errors ==
;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000    [0.001]
;   0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035    [0.032]
;   0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025    [0.027]
;   0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031    [0.030]
;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001    [0.001]
;   0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023    [0.023]
;   0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027    [0.027]
;   0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019    [0.020]
;
;  == Abs Mean errors ==
;   0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000    [0.000]
;   0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003    [0.002]
;   0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000    [0.000]
;   0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000    [0.003]
;   0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001    [-0.000]
;   0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000    [-0.000]
;   0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000    [-0.000]
;   0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001    [-0.000]
;
;=============================================================================

;=============================================================================
; Read only data
;=============================================================================

; Select the section that receives every constant table defined below.
; The only difference between the two branches is the align=16 qualifier,
; which is dropped when FORMAT_COFF is defined.
; NOTE(review): presumably dropped because NASM's plain coff output format
; does not accept align= on SECTION -- confirm against the NASM manual.
; NOTE(review): FORMAT_COFF is not defined in the visible part of this file;
; it presumably comes from the build command line or skl_nasm.h.
; NOTE(review): the heading above says "Read only data", but the section is
; named .data, so the tables may end up in writable memory -- verify whether
; a read-only section was intended.
%ifdef FORMAT_COFF
SECTION .data data
%else
SECTION .data data align=16
%endif

; Trigonometric multipliers, each replicated across four 16-bit words (one
; 8-byte quadword per label). The stored word is the constant scaled by 2^16,
; e.g. 0x32ec / 65536 ~= 0.1989 = tan(pi/16).
; Only tan1 sits on the 16-byte boundary; the other labels follow at 8-byte
; steps.
ALIGN 16
tan1:
	times 4 dw 0x32ec                  ; tan( pi/16)
tan2:
	times 4 dw 0x6a0a                  ; tan(2pi/16)  (= sqrt(2)-1)
tan3:
	times 4 dw 0xab0e                  ; tan(3pi/16)-1 when read as a signed word
	                                   ; (see the x*tan3 = x*(tan3-1)+x note above)
sqrt2:
	times 4 dw 0x5a82                  ; 0.5/sqrt(2)

; The four distinct coefficient blocks used by fdct_table. Each block is
; 8 lines of 4 words (32 words, 64 bytes). They are written once here as
; zero-argument macros so that the table can repeat them without duplicating
; the literals. The macros emit nothing until invoked.

%macro FDCT_FTAB1 0
  dw 0x4000, 0x4000, 0x58c5, 0x4b42
  dw 0x4000, 0x4000, 0x3249, 0x11a8
  dw 0x539f, 0x22a3, 0x4b42, 0xee58
  dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7
  dw 0x4000, 0xc000, 0x3249, 0xa73b
  dw 0xc000, 0x4000, 0x11a8, 0x4b42
  dw 0x22a3, 0xac61, 0x11a8, 0xcdb7
  dw 0x539f, 0xdd5d, 0x4b42, 0xa73b
%endmacro

%macro FDCT_FTAB2 0
  dw 0x58c5, 0x58c5, 0x7b21, 0x6862
  dw 0x58c5, 0x58c5, 0x45bf, 0x187e
  dw 0x73fc, 0x300b, 0x6862, 0xe782
  dw 0xcff5, 0x8c04, 0x84df, 0xba41
  dw 0x58c5, 0xa73b, 0x45bf, 0x84df
  dw 0xa73b, 0x58c5, 0x187e, 0x6862
  dw 0x300b, 0x8c04, 0x187e, 0xba41
  dw 0x73fc, 0xcff5, 0x6862, 0x84df
%endmacro

%macro FDCT_FTAB3 0
  dw 0x539f, 0x539f, 0x73fc, 0x6254
  dw 0x539f, 0x539f, 0x41b3, 0x1712
  dw 0x6d41, 0x2d41, 0x6254, 0xe8ee
  dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d
  dw 0x539f, 0xac61, 0x41b3, 0x8c04
  dw 0xac61, 0x539f, 0x1712, 0x6254
  dw 0x2d41, 0x92bf, 0x1712, 0xbe4d
  dw 0x6d41, 0xd2bf, 0x6254, 0x8c04
%endmacro

%macro FDCT_FTAB4 0
  dw 0x4b42, 0x4b42, 0x6862, 0x587e
  dw 0x4b42, 0x4b42, 0x3b21, 0x14c3
  dw 0x6254, 0x28ba, 0x587e, 0xeb3d
  dw 0xd746, 0x9dac, 0x979e, 0xc4df
  dw 0x4b42, 0xb4be, 0x3b21, 0x979e
  dw 0xb4be, 0x4b42, 0x14c3, 0x587e
  dw 0x28ba, 0x9dac, 0x14c3, 0xc4df
  dw 0x6254, 0xd746, 0x587e, 0x979e
%endmacro

; fdct_table: eight 64-byte blocks (512 bytes in total), laid out in the
; order fTab1, fTab2, fTab3, fTab4, fTab1, fTab4, fTab3, fTab2.
; Byte offsets from fdct_table:
;   +0x000 fTab1   +0x040 fTab2   +0x080 fTab3   +0x0c0 fTab4
;   +0x100 fTab1   +0x140 fTab4   +0x180 fTab3   +0x1c0 fTab2
; NOTE(review): presumably one block per row of the horizontal pass; the
; code that indexes this table is not visible in this chunk -- confirm.
ALIGN 16
fdct_table:
  FDCT_FTAB1
  FDCT_FTAB2
  FDCT_FTAB3
  FDCT_FTAB4
  FDCT_FTAB1
  FDCT_FTAB4
  FDCT_FTAB3
  FDCT_FTAB2

; fdct_rounding_1: 32 words (64 bytes), viewed as eight groups of four words.
; Every word is 8 except the first word of group 0 and group 4 (both 6) and
; the first word of group 1 (10).
; NOTE(review): how these rounders are applied is not visible in this chunk.
ALIGN 16
fdct_rounding_1:
  dw 6, 8, 8, 8          ; group 0
  dw 10, 8, 8, 8         ; group 1 (differs from fdct_rounding_2, which has 8 here)
  times 8 dw 8           ; groups 2-3
  dw 6, 8, 8, 8          ; group 4
  times 12 dw 8          ; groups 5-7

; fdct_rounding_2: 32 words (64 bytes), viewed as eight groups of four words.
; Every word is 8 except the first word of group 0 and group 4, which is 6.
; NOTE(review): how these rounders are applied is not visible in this chunk.
ALIGN 16
fdct_rounding_2:
  dw 6, 8, 8, 8          ; group 0
  times 12 dw 8          ; groups 1-3
  dw 6, 8, 8, 8          ; group 4
  times 12 dw 8          ; groups 5-7

; MMX_One: the value 1 in each of four 16-bit words (one 8-byte quadword).
ALIGN 16
MMX_One:
  times 4 dw 1

;=============================================================================
; Helper Macros for real code
;=============================================================================

;-----------------------------------------------------------------------------
; FDCT LLM vertical pass (~39c)
; %1=dst, %2=src, %3:Shift
;-----------------------------------------------------------------------------

%macro fLLM_PASS 3
  movq mm0, [%2+0*16]   ; In0
  movq mm2, [%2+2*16]   ; In2
  movq mm3, mm0
  movq mm4, mm2
  movq mm7, [%2+7*16]   ; In7
  movq mm5, [%2+5*16]   ; In5

  psubsw mm0, mm7       ; t7 = In0-In7
  paddsw mm7, mm3       ; t0 = In0+In7
  psubsw mm2, mm5       ; t5 = In2-In5
  paddsw mm5, mm4       ; t2 = In2+In5

  movq mm3, [%2+3*16]   ; In3
  movq mm4, [%2+4*16]   ; In4
  movq mm1, mm3
  psubsw mm3, mm4       ; t4 = In3-In4
  paddsw mm4, mm1       ; t3 = In3+In4
  movq mm6, [%2+6*16]   ; In6
  movq mm1, [%2+1*16]   ; In1
  psubsw mm1, mm6       ; t6 = In1-In6
  paddsw mm6, [%2+1*16] ; t1 = In1+In6

  psubsw mm7, mm4       ; tm03 = t0-t3
  psubsw mm6, mm5       ; tm12 = t1-t2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -