fdct_mmx_skal.asm

来自「wince下的xvidcore开发库,可用于MP4等视频播放开发」· 汇编代码 · 共 517 行 · 第 1/2 页
ASM
517 行
 ;/****************************************************************************
 ; *
 ; *  XVID MPEG-4 VIDEO CODEC
 ; *  - MMX and XMM forward discrete cosine transform -
 ; *
 ; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
 ; *
 ; *  This program is free software; you can redistribute it and/or modify it
 ; *  under the terms of the GNU General Public License as published by
 ; *  the Free Software Foundation; either version 2 of the License, or
 ; *  (at your option) any later version.
 ; *
 ; *  This program is distributed in the hope that it will be useful,
 ; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 ; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ; *  GNU General Public License for more details.
 ; *
 ; *  You should have received a copy of the GNU General Public License
 ; *  along with this program; if not, write to the Free Software
 ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 ; *
 ; * $Id: fdct_mmx_skal.asm,v 1.1 2005/07/21 09:08:25 klschoef Exp $
 ; *
 ; ***************************************************************************/
 
 ; Assemble everything that follows for 32-bit mode.
 BITS 32
 
 ;-----------------------------------------------------------------------------
 ; cglobal <symbol>
 ;
 ; Exports <symbol> with the `global` directive.
 ;  * PREFIX not defined: the symbol is exported under its plain name.
 ;  * PREFIX defined:     the exported name gets a leading underscore, and the
 ;                        plain name is %define'd to the underscored one, so
 ;                        later uses of the plain name in this file resolve to
 ;                        the exported symbol.
 ;-----------------------------------------------------------------------------
 %macro cglobal 1
         %ifndef PREFIX
                 global %1
         %else
                 global _%1
                 %define %1 _%1
         %endif
 %endmacro
 
 ;;; Define this if you want an unrolled version of the code
 ;;; (build-time switch; comment the line out to leave it undefined).
 ;;; NOTE(review): the code that tests UNROLLED_LOOP is outside this excerpt.
 %define UNROLLED_LOOP
 
 ;=============================================================================
 ;
 ; Vertical pass is an implementation of the scheme:
 ;  Loeffler C., Ligtenberg A., and Moschytz G.S.:
 ;  Practical Fast 1-D DCT Algorithms with 11 Multiplications,
 ;  Proc. ICASSP 1989, 988-991.
 ;
 ; Horizontal pass is a double 4x4 vector/matrix multiplication,
 ; (see also Intel's Application Note 922:
 ;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
 ;  Copyright (C) 1999 Intel Corporation)
 ;
 ; Notes:
 ;  * tan(3pi/16) is greater than 0.5, and would use the
 ;    sign bit when turned into 16b fixed-point precision. So,
 ;    we use the trick: x*tan3 = x*(tan3-1)+x
 ;
 ;  * There's only one SSE-specific instruction (pshufw).
 ;    Porting to SSE2 also seems straightforward.
 ;
 ;  * There are still 1 or 2 ticks to save in fLLM_PASS, but
 ;    I prefer readable code over tightly scheduled
 ;    code...
 ;
 ;  * The quantization stage (as well as the pre-transposition for
 ;    the way back through the idct) can be folded into the fTab*
 ;    constants (at the cost of some precision)
 ;
 ;  * Some more details at: http://skal.planet-d.net/coding/dct.html
 ;
 ;=============================================================================
 ;
 ;   idct-like IEEE errors:
 ;
 ;  =========================
 ;  Peak error:   1.0000
 ;  Peak MSE:     0.0365
 ;  Overall MSE:  0.0201
 ;  Peak ME:      0.0265
 ;  Overall ME:   0.0006
 ;
 ;  == Mean square errors ==
 ;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000    [0.001]
 ;   0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035    [0.032]
 ;   0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025    [0.027]
 ;   0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031    [0.030]
 ;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001    [0.001]
 ;   0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023    [0.023]
 ;   0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027    [0.027]
 ;   0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019    [0.020]
 ;
 ;  == Abs Mean errors ==
 ;   0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000    [0.000]
 ;   0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003    [0.002]
 ;   0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000    [0.000]
 ;   0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000    [0.003]
 ;   0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001    [-0.000]
 ;   0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000    [-0.000]
 ;   0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000    [-0.000]
 ;   0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001    [-0.000]
 ;
 ;=============================================================================
 
 ;=============================================================================
 ; Read only data
 ;=============================================================================
 
 ; Select the read-only data section. The COFF variant omits the explicit
 ; align=16 section attribute; every other object format requests it.
 %ifdef FORMAT_COFF
 SECTION .rodata data
 %else
 SECTION .rodata data align=16
 %endif

 ; Trigonometric constants, each stored as four identical 16-bit words
 ; (8 bytes per constant). The values are the quantity named in the trailing
 ; comment scaled by 2^16, e.g. 0x32ec/65536 ~= 0.1989 = tan(pi/16).
 ; tan3 holds tan(3pi/16)-1 rather than tan(3pi/16): see the header note on
 ; the x*tan3 = x*(tan3-1)+x trick.
 ; NOTE(review): the instructions consuming these constants are outside this
 ; excerpt.
 ALIGN 16
 tan1:
         times 4 dw 0x32ec                  ; tan( pi/16)
 tan2:
         times 4 dw 0x6a0a                  ; tan(2pi/16)  (=sqrt(2)-1)
 tan3:
         times 4 dw 0xab0e                  ; tan(3pi/16)-1
 sqrt2:
         times 4 dw 0x5a82                  ; 0.5/sqrt(2)
 
 ;-----------------------------------------------------------------------------
 ; fdct_table: eight consecutive coefficient sub-tables, each made of 8 rows of
 ; 4 words (64 bytes per sub-table, 512 bytes in total).
 ;
 ; Only four distinct sub-tables exist (the fTab* constants mentioned in the
 ; header notes); they are laid out in the order
 ;     fTab1, fTab2, fTab3, fTab4, fTab1, fTab4, fTab3, fTab2
 ; Each one is written once as a macro below and emitted wherever it occurs,
 ; so the repeated copies cannot drift apart.
 ;
 ; NOTE(review): presumably the horizontal pass picks one 64-byte sub-table
 ; per row; the code indexing fdct_table is outside this excerpt - confirm.
 ;-----------------------------------------------------------------------------

 %macro FDCT_FTAB1 0
   dw 0x4000, 0x4000, 0x58c5, 0x4b42
   dw 0x4000, 0x4000, 0x3249, 0x11a8
   dw 0x539f, 0x22a3, 0x4b42, 0xee58
   dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7
   dw 0x4000, 0xc000, 0x3249, 0xa73b
   dw 0xc000, 0x4000, 0x11a8, 0x4b42
   dw 0x22a3, 0xac61, 0x11a8, 0xcdb7
   dw 0x539f, 0xdd5d, 0x4b42, 0xa73b
 %endmacro

 %macro FDCT_FTAB2 0
   dw 0x58c5, 0x58c5, 0x7b21, 0x6862
   dw 0x58c5, 0x58c5, 0x45bf, 0x187e
   dw 0x73fc, 0x300b, 0x6862, 0xe782
   dw 0xcff5, 0x8c04, 0x84df, 0xba41
   dw 0x58c5, 0xa73b, 0x45bf, 0x84df
   dw 0xa73b, 0x58c5, 0x187e, 0x6862
   dw 0x300b, 0x8c04, 0x187e, 0xba41
   dw 0x73fc, 0xcff5, 0x6862, 0x84df
 %endmacro

 %macro FDCT_FTAB3 0
   dw 0x539f, 0x539f, 0x73fc, 0x6254
   dw 0x539f, 0x539f, 0x41b3, 0x1712
   dw 0x6d41, 0x2d41, 0x6254, 0xe8ee
   dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d
   dw 0x539f, 0xac61, 0x41b3, 0x8c04
   dw 0xac61, 0x539f, 0x1712, 0x6254
   dw 0x2d41, 0x92bf, 0x1712, 0xbe4d
   dw 0x6d41, 0xd2bf, 0x6254, 0x8c04
 %endmacro

 %macro FDCT_FTAB4 0
   dw 0x4b42, 0x4b42, 0x6862, 0x587e
   dw 0x4b42, 0x4b42, 0x3b21, 0x14c3
   dw 0x6254, 0x28ba, 0x587e, 0xeb3d
   dw 0xd746, 0x9dac, 0x979e, 0xc4df
   dw 0x4b42, 0xb4be, 0x3b21, 0x979e
   dw 0xb4be, 0x4b42, 0x14c3, 0x587e
   dw 0x28ba, 0x9dac, 0x14c3, 0xc4df
   dw 0x6254, 0xd746, 0x587e, 0x979e
 %endmacro

 ALIGN 16
 fdct_table:
   FDCT_FTAB1            ; sub-table 0: fTab1
   FDCT_FTAB2            ; sub-table 1: fTab2
   FDCT_FTAB3            ; sub-table 2: fTab3
   FDCT_FTAB4            ; sub-table 3: fTab4
   FDCT_FTAB1            ; sub-table 4: fTab1
   FDCT_FTAB4            ; sub-table 5: fTab4
   FDCT_FTAB3            ; sub-table 6: fTab3
   FDCT_FTAB2            ; sub-table 7: fTab2
 
 ;-----------------------------------------------------------------------------
 ; Rounding tables: 8 rows of 4 words each (64 bytes per table).
 ; Every word is 8 except the first word of a few rows:
 ;   fdct_rounding_1: row 0 -> 6, row 1 -> 10, row 4 -> 6
 ;   fdct_rounding_2: row 0 -> 6,              row 4 -> 6
 ; NOTE(review): presumably one row of rounders per row of the transform; the
 ; code that reads these tables is outside this excerpt - confirm.
 ;-----------------------------------------------------------------------------

 ALIGN 16
 fdct_rounding_1:
   dw 6, 8, 8, 8         ; row 0
   dw 10, 8, 8, 8        ; row 1
   times 2*4 dw 8        ; rows 2-3
   dw 6, 8, 8, 8         ; row 4
   times 3*4 dw 8        ; rows 5-7

 ALIGN 16
 fdct_rounding_2:
   dw 6, 8, 8, 8         ; row 0
   times 3*4 dw 8        ; rows 1-3
   dw 6, 8, 8, 8         ; row 4
   times 3*4 dw 8        ; rows 5-7

 ; Four 16-bit words, each equal to 1.
 ALIGN 16
 MMX_One:
   times 4 dw 1
 
 ;=============================================================================
 ; Helper Macros for real code
 ;=============================================================================
 
 ;-----------------------------------------------------------------------------
 ; FDCT LLM vertical pass (~39c)
 ; %1=dst, %2=src, %3:Shift
 ;-----------------------------------------------------------------------------
 
 %macro fLLM_PASS 3
   movq mm0, [%2+0*16]   ; In0
   movq mm2, [%2+2*16]   ; In2
   movq mm3, mm0
   movq mm4, mm2
   movq mm7, [%2+7*16]   ; In7
   movq mm5, [%2+5*16]   ; In5
 
   psubsw mm0, mm7       ; t7 = In0-In7
   paddsw mm7, mm3       ; t0 = In0+In7
   psubsw mm2, mm5       ; t5 = In2-In5
   paddsw mm5, mm4       ; t2 = In2+In5
 
   movq mm3, [%2+3*16]   ; In3
   movq mm4, [%2+4*16]   ; In4
   movq mm1, mm3
   psubsw mm3, mm4       ; t4 = In3-In4
   paddsw mm4, mm1       ; t3 = In3+In4
   movq mm6, [%2+6*16]   ; In6
   movq mm1, [%2+1*16]   ; In1
fdct_mmx_skal.asm - 源码说明

本页面展示了「wince下的xvidcore开发库,可用于MP4等视频播放开发」中的 fdct_mmx_skal.asm 源码文件，采用汇编编程语言编写，共 517 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与WinCE相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?