⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fdct_mmx_skal.asm

📁 mpeg4视频解码源码.rar
💻 ASM
字号:
;/****************************************************************************; *; *  XVID MPEG-4 VIDEO CODEC; *  - MMX and XMM forward discrete cosine transform -; *; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>; *		  2004 Andre Werthmann <wertmann@aei.mpg.de>; *; *  This program is free software; you can redistribute it and/or modify it; *  under the terms of the GNU General Public License as published by; *  the Free Software Foundation; either version 2 of the License, or; *  (at your option) any later version.; *; *  This program is distributed in the hope that it will be useful,; *  but WITHOUT ANY WARRANTY; without even the implied warranty of; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the; *  GNU General Public License for more details.; *; *  You should have received a copy of the GNU General Public License; *  along with this program; if not, write to the Free Software; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA; *; * $Id$; *; ***************************************************************************/BITS 64%macro cglobal 1	%ifdef PREFIX		%ifdef MARK_FUNCS			global _%1:function %1.endfunc-%1			%define %1 _%1:function %1.endfunc-%1		%else			global _%1			%define %1 _%1		%endif	%else		%ifdef MARK_FUNCS			global %1:function %1.endfunc-%1		%else			global %1		%endif	%endif%endmacro;;; Define this if you want an unrolled version of the code%define UNROLLED_LOOP;=============================================================================;; Vertical pass is an implementation of the scheme:;  Loeffler C., Ligtenberg A., and Moschytz C.S.:;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,;  Proc. ICASSP 1989, 988-991.;; Horizontal pass is a double 4x4 vector/matrix multiplication,; (see also Intel's Application Note 922:;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm;  Copyright (C) 1999 Intel Corporation);; Notes:;  * tan(3pi/16) is greater than 0.5, and would use the;    sign bit when turned into 16b fixed-point precision. So,;    we use the trick: x*tan3 = x*(tan3-1)+x;;  * There's only one SSE-specific instruction (pshufw).;    Porting to SSE2 also seems straightforward.;;  * There's still 1 or 2 ticks to save in fLLM_PASS, but;    I prefer having a readable code, instead of a tightly;    scheduled one...;;  * Quantization stage (as well as pre-transposition for the;    idct way back) can be included in the fTab* constants;    (with induced loss of precision, somehow);;  * Some more details at: http://skal.planet-d.net/coding/dct.html;;=============================================================================;;   idct-like IEEE errors:;;  =========================;  Peak error:   1.0000;  Peak MSE:     0.0365;  Overall MSE:  0.0201;  Peak ME:      0.0265;  Overall ME:   0.0006;;  == Mean square errors ==;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000    [0.001];   0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035    [0.032];   0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025    [0.027];   0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031    [0.030];   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001    [0.001];   0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023    [0.023];   0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027    [0.027];   0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019    [0.020];;  == Abs Mean errors ==;   0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000    [0.000];   0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003    [0.002];   0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000    [0.000];   0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000    [0.003];   0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001    [-0.000];   0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000    [-0.000];   0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000    [-0.000];   0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001    [-0.000];;=============================================================================;=============================================================================; Read only data;=============================================================================%ifdef FORMAT_COFFSECTION .rodata%elseSECTION .rodata align=16%endifALIGN 16tan1:	dw  0x32ec,0x32ec,0x32ec,0x32ec    ; tan( pi/16)tan2:	dw  0x6a0a,0x6a0a,0x6a0a,0x6a0a    ; tan(2pi/16)  (=sqrt(2)-1)tan3:	dw  0xab0e,0xab0e,0xab0e,0xab0e    ; tan(3pi/16)-1sqrt2:	dw  0x5a82,0x5a82,0x5a82,0x5a82    ; 0.5/sqrt(2)ALIGN 16fdct_table:;fTab1:  dw 0x4000, 0x4000, 0x58c5, 0x4b42  dw 0x4000, 0x4000, 0x3249, 0x11a8  dw 0x539f, 0x22a3, 0x4b42, 0xee58  dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7  dw 0x4000, 0xc000, 0x3249, 0xa73b  dw 0xc000, 0x4000, 0x11a8, 0x4b42  dw 0x22a3, 0xac61, 0x11a8, 0xcdb7  dw 0x539f, 0xdd5d, 0x4b42, 0xa73b;fTab2:  dw 0x58c5, 0x58c5, 0x7b21, 0x6862  dw 0x58c5, 0x58c5, 0x45bf, 0x187e  dw 0x73fc, 0x300b, 0x6862, 0xe782  dw 0xcff5, 0x8c04, 0x84df, 0xba41  dw 0x58c5, 0xa73b, 0x45bf, 0x84df  dw 0xa73b, 0x58c5, 0x187e, 0x6862  dw 0x300b, 0x8c04, 0x187e, 0xba41  dw 0x73fc, 0xcff5, 0x6862, 0x84df;fTab3:  dw 0x539f, 0x539f, 0x73fc, 0x6254  dw 0x539f, 0x539f, 0x41b3, 0x1712  dw 0x6d41, 0x2d41, 0x6254, 0xe8ee  dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d  dw 0x539f, 0xac61, 0x41b3, 0x8c04  dw 0xac61, 0x539f, 0x1712, 0x6254  dw 0x2d41, 0x92bf, 0x1712, 0xbe4d  dw 0x6d41, 0xd2bf, 0x6254, 0x8c04;fTab4:  dw 0x4b42, 0x4b42, 0x6862, 0x587e  dw 0x4b42, 0x4b42, 0x3b21, 0x14c3  dw 0x6254, 0x28ba, 0x587e, 0xeb3d  dw 0xd746, 0x9dac, 0x979e, 0xc4df  dw 0x4b42, 0xb4be, 0x3b21, 0x979e  dw 0xb4be, 0x4b42, 0x14c3, 0x587e  dw 0x28ba, 0x9dac, 0x14c3, 0xc4df  dw 0x6254, 0xd746, 0x587e, 0x979e;fTab1:  dw 0x4000, 0x4000, 0x58c5, 0x4b42  dw 0x4000, 0x4000, 0x3249, 0x11a8  dw 0x539f, 0x22a3, 0x4b42, 0xee58  dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7  dw 0x4000, 0xc000, 0x3249, 0xa73b  dw 0xc000, 0x4000, 0x11a8, 0x4b42  dw 0x22a3, 0xac61, 0x11a8, 0xcdb7  dw 0x539f, 0xdd5d, 0x4b42, 0xa73b;fTab4:  dw 0x4b42, 0x4b42, 0x6862, 0x587e  dw 0x4b42, 0x4b42, 0x3b21, 0x14c3  dw 0x6254, 0x28ba, 0x587e, 0xeb3d  dw 0xd746, 0x9dac, 0x979e, 0xc4df  dw 0x4b42, 0xb4be, 0x3b21, 0x979e  dw 0xb4be, 0x4b42, 0x14c3, 0x587e  dw 0x28ba, 0x9dac, 0x14c3, 0xc4df  dw 0x6254, 0xd746, 0x587e, 0x979e;fTab3:  dw 0x539f, 0x539f, 0x73fc, 0x6254  dw 0x539f, 0x539f, 0x41b3, 0x1712  dw 0x6d41, 0x2d41, 0x6254, 0xe8ee  dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d  dw 0x539f, 0xac61, 0x41b3, 0x8c04  dw 0xac61, 0x539f, 0x1712, 0x6254  dw 0x2d41, 0x92bf, 0x1712, 0xbe4d  dw 0x6d41, 0xd2bf, 0x6254, 0x8c04;fTab2:  dw 0x58c5, 0x58c5, 0x7b21, 0x6862  dw 0x58c5, 0x58c5, 0x45bf, 0x187e  dw 0x73fc, 0x300b, 0x6862, 0xe782  dw 0xcff5, 0x8c04, 0x84df, 0xba41  dw 0x58c5, 0xa73b, 0x45bf, 0x84df  dw 0xa73b, 0x58c5, 0x187e, 0x6862  dw 0x300b, 0x8c04, 0x187e, 0xba41  dw 0x73fc, 0xcff5, 0x6862, 0x84dfALIGN 16fdct_rounding_1:  dw 6, 8, 8, 8  dw 10, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8  dw 6, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8ALIGN 16fdct_rounding_2:  dw 6, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8  dw 6, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8  dw 8, 8, 8, 8ALIGN 16MMX_One:  dw 1, 1, 1, 1;=============================================================================; Helper Macros for real code;=============================================================================;-----------------------------------------------------------------------------; FDCT LLM vertical pass (~39c); %1=dst, %2=src, %3:Shift;-----------------------------------------------------------------------------%macro fLLM_PASS 3  movq mm0, [%2+0*16]   ; In0  movq mm2, [%2+2*16]   ; In2  movq mm3, mm0  movq mm4, mm2  movq mm7, [%2+7*16]   ; In7  movq mm5, [%2+5*16]   ; In5  psubsw mm0, mm7       ; t7 = In0-In7  paddsw mm7, mm3       ; t0 = In0+In7  psubsw mm2, mm5       ; t5 = In2-In5  paddsw mm5, mm4       ; t2 = In2+In5  movq mm3, [%2+3*16]   ; In3  movq mm4, [%2+4*16]   ; In4  movq mm1, mm3  psubsw mm3, mm4       ; t4 = In3-In4  paddsw mm4, mm1       ; t3 = In3+In4  movq mm6, [%2+6*16]   ; In6  movq mm1, [%2+1*16]   ; In1  psubsw mm1, mm6       ; t6 = In1-In6  paddsw mm6, [%2+1*16] ; t1 = In1+In6  psubsw mm7, mm4       ; tm03 = t0-t3  psubsw mm6, mm5       ; tm12 = t1-t2  paddsw mm4, mm4       ; 2.t3  paddsw mm5, mm5       ; 2.t2  paddsw mm4, mm7       ; tp03 = t0+t3  paddsw mm5, mm6       ; tp12 = t1+t2  psllw mm2, %3+1       ; shift t5 (shift +1 to..  psllw mm1, %3+1       ; shift t6  ..compensate cos4/2)  psllw mm4, %3         ; shift t3  psllw mm5, %3         ; shift t2  psllw mm7, %3         ; shift t0  psllw mm6, %3         ; shift t1  psllw mm3, %3         ; shift t4  psllw mm0, %3         ; shift t7  psubsw mm4, mm5       ; out4 = tp03-tp12  psubsw mm1, mm2       ; mm1: t6-t5  paddsw mm5, mm5  paddsw mm2, mm2  paddsw mm5, mm4       ; out0 = tp03+tp12  movq [%1+4*16], mm4   ; => out4  paddsw mm2, mm1       ; mm2: t6+t5  movq [%1+0*16], mm5   ; => out0  movq mm4, [tan2 wrt rip]      ; mm4 <= tan2  pmulhw mm4, mm7       ; tm03*tan2  movq mm5, [tan2 wrt rip]      ; mm5 <= tan2  psubsw mm4, mm6       ; out6 = tm03*tan2 - tm12  pmulhw mm5, mm6       ; tm12*tan2  paddsw mm5, mm7       ; out2 = tm12*tan2 + tm03  movq mm6, [sqrt2 wrt rip]  movq mm7, [MMX_One wrt rip]  pmulhw mm2, mm6       ; mm2: tp65 = (t6 + t5)*cos4  por mm5, mm7          ; correct out2  por mm4, mm7          ; correct out6  pmulhw mm1, mm6       ; mm1: tm65 = (t6 - t5)*cos4  por mm2, mm7          ; correct tp65  movq [%1+2*16], mm5   ; => out2  movq mm5, mm3         ; save t4  movq [%1+6*16], mm4   ; => out6  movq mm4, mm0         ; save t7  psubsw mm3, mm1       ; mm3: tm465 = t4 - tm65  psubsw mm0, mm2       ; mm0: tm765 = t7 - tp65  paddsw mm2, mm4       ; mm2: tp765 = t7 + tp65  paddsw mm1, mm5       ; mm1: tp465 = t4 + tm65  movq mm4, [tan3 wrt rip]      ; tan3 - 1  movq mm5, [tan1 wrt rip]      ; tan1  movq mm7, mm3         ; save tm465  pmulhw mm3, mm4       ; tm465*(tan3-1)  movq mm6, mm1         ; save tp465  pmulhw mm1, mm5       ; tp465*tan1  paddsw mm3, mm7       ; tm465*tan3  pmulhw mm4, mm0       ; tm765*(tan3-1)  paddsw mm4, mm0       ; tm765*tan3  pmulhw mm5, mm2       ; tp765*tan1  paddsw mm1, mm2       ; out1 = tp765 + tp465*tan1  psubsw mm0, mm3       ; out3 = tm765 - tm465*tan3  paddsw mm7, mm4       ; out5 = tm465 + tm765*tan3  psubsw mm5, mm6       ; out7 =-tp465 + tp765*tan1  movq [%1+1*16], mm1   ; => out1  movq [%1+3*16], mm0   ; => out3  movq [%1+5*16], mm7   ; => out5  movq [%1+7*16], mm5   ; => out7%endmacro;-----------------------------------------------------------------------------; fMTX_MULT_XMM (~20c); %1=dst, %2=src, %3 = Coeffs, %4/%5=rounders;-----------------------------------------------------------------------------%macro fMTX_MULT_XMM 5  movq mm0, [%2 + 0]  ; mm0 = [0123]		; the 'pshufw' below is the only SSE instruction.		; For MMX-only version, it should be emulated with		; some 'punpck' soup...  pshufw mm1, [%2 + 8], 00011011b ; mm1 = [7654]  movq mm7, mm0  paddsw mm0, mm1              ; mm0 = [a0 a1 a2 a3]  psubsw mm7, mm1              ; mm7 = [b0 b1 b2 b3]  movq mm1, mm0  punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]  punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]  movq mm2, qword [%3 + 0]     ;  [   M00    M01      M16    M17]  movq mm3, qword [%3 + 8]     ;  [   M02    M03      M18    M19]  pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]  movq mm4, qword [%3 + 16]    ;  [   M04    M05      M20    M21]  pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]  movq mm5, qword [%3 + 24]    ;  [   M06    M07      M22    M23]  pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]  movq mm6, qword [%3 + 32]    ;  [   M08    M09      M24    M25]  pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]  movq mm7, qword [%3 + 40]    ;  [   M10    M11      M26    M27]  pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]  paddd mm2, mm3               ;  [ out0 | out1 ]  pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]  psrad mm2, 16  pmaddwd mm0, qword [%3 + 48] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]  paddd mm4, mm5               ;  [ out2 | out3 ]  pmaddwd mm1, qword [%3 + 56] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]  psrad mm4, 16  paddd mm6, mm7               ;  [ out4 | out5 ]  psrad mm6, 16  paddd mm0, mm1               ;  [ out6 | out7 ]  psrad mm0, 16  packssdw mm2, mm4            ;  [ out0|out1|out2|out3 ]  paddsw mm2, [%4]             ;  Round  packssdw mm6, mm0            ;  [ out4|out5|out6|out7 ]  paddsw mm6, [%5]             ;  Round  psraw mm2, 4                 ; => [-2048, 2047]  psraw mm6, 4  movq [%1 + 0], mm2  movq [%1 + 8], mm6%endmacro;-----------------------------------------------------------------------------; MAKE_FDCT_FUNC; %1 funcname, %2 macro for row dct;-----------------------------------------------------------------------------%macro MAKE_FDCT_FUNC 2ALIGN 16cglobal %1%1:%ifdef UNROLLED_LOOP  mov rcx, rdi		; fist arg%else  push rbx  mov rcx, rdi		; first arg too%endif  fLLM_PASS rcx+0, rcx+0, 3  fLLM_PASS rcx+8, rcx+8, 3%ifdef UNROLLED_LOOP  lea rsi, [fdct_table wrt rip]  lea rdi, [fdct_rounding_1 wrt rip]  lea rax, [fdct_rounding_2 wrt rip]%assign i 0%rep 8  %2 rcx+i*16, rcx+i*16, rsi+i*64, rdi+i*8, rax+i*8  %assign i i+1%endrep%else  mov rax, 8  lea rdx, [fdct_table wrt rip]	; load offset, pic code  lea rbx, [fdct_rounding_1 wrt rip]	; same  lea rdi, [fdct_rounding_2 wrt rip]	; same too.loop  %2 rcx, rcx, rdx, rbx, rdi  add rcx, 2*8  add rdx, 2*32  add rbx, 2*4  add rdi, 2*4  dec rax  jne .loop  pop rbx%endif  ret.endfunc%endmacro;=============================================================================; Code;=============================================================================SECTION .text align=16;-----------------------------------------------------------------------------; void fdct_xmm_skal(int16_t block[64]];;-----------------------------------------------------------------------------MAKE_FDCT_FUNC fdct_skal_x86_64, fMTX_MULT_XMM

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -