⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 simple_idct_mmx.asm

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 ASM
📖 第 1 页 / 共 3 页
字号:
;/*
; * Simple IDCT MMX
; *
; * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
; *
; * This library is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2 of the License, or (at your option) any later version.
; *
; * This library is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with this library; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
; *
; * Ported to nasm by Peter Ross <pross@xvid.org>
; */

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

;-----------------------------------------------------------------------------
; cglobal name
;
; Exports "name" as a global symbol.
;
;   PREFIX      if defined, the exported symbol is "_name" and the plain
;               name is aliased to "_name", so that the label definition
;               and every later reference resolve to the exported symbol.
;   MARK_FUNCS  if defined, the symbol is declared with the ":function"
;               type and a size of "<symbol>.endfunc - <symbol>"; the
;               function body is therefore expected to finish with a
;               local ".endfunc" label.
;-----------------------------------------------------------------------------
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			; The ":function size" decoration is only valid on the GLOBAL
			; directive, so the alias must be the bare prefixed name, and
			; the size expression must use the prefixed labels, which are
			; the ones that actually get defined once the alias is active.
			global _%1:function _%1.endfunc-_%1
			%define %1 _%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

; Fixed-point parameters.
; ROW_SHIFT / COL_SHIFT are shift amounts; ROW_SHIFT is used to build the
; 1<<(ROW_SHIFT-1) entries of the coeffs table.
; NOTE(review): presumably these are the shifts of the row and column
; passes - confirm against the call sites of the IDCT macros.
;
; Cn = cos(n*pi/16) * sqrt(2) * 2^14 as a 16-bit integer. The number after
; '=' in each comment is that product before rounding. All values are rounded
; to nearest (+0.5) except C4, which is rounded down (-0.5) and is therefore
; 16383 rather than the exact 16384.
%define ROW_SHIFT 11
%define COL_SHIFT 20
%define C0 23170	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 23170.475006
%define C1 22725	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 22725.260826
%define C2 21407	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 21406.727617
%define C3 19266	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 19265.545870
%define C4 16383	;cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 = 16384.000000 (note the -0.5: rounded down)
%define C5 12873	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 12872.826198
%define C6 8867		;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905
%define C7 4520		;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430

;===========================================================================
; Data (Read Only)
;===========================================================================

;%ifdef FORMAT_COFF
;SECTION .rodata
;%else
;SECTION .rodata align=16
;%endif

section .text

;-----------------------------------------------------------------------------
; Trigonometric Tables
;-----------------------------------------------------------------------------

; Word mask: words 1 and 3 of the quadword are 0xffff, words 0 and 2 are 0.
; DC_COND_IDCT ANDs this with the quadword loaded from src0, so that words 0
; and 2 of src0 are left out of its "is everything else zero?" test.
ALIGN 16
wm1010:
	dw 0, 0xffff, 0, 0xffff

; Quadword whose low dword is 0x40000 and whose high dword is 0.
; DC_COND_IDCT adds it (paddd) in its shortcut path, between the left shift
; by 16 and the arithmetic right shift by 13.
; NOTE(review): only the low dword receives the constant - confirm that this
; asymmetry between the two dwords is intended.
ALIGN 16
d40000:
	dd 0x40000, 0

; Coefficient table, addressed by the IDCT macros as [coeffs + byte offset].
;
; Every row is exactly four 16-bit words (8 bytes, one MMX quadword); the
; number in each trailing comment is the byte offset of that row from the
; label. Words are listed lowest address first.
;
; Rows must not end in a comma: a trailing comma leaves an empty final
; operand, and the macros rely on each row being exactly four words because
; they use hard-coded offsets into this table.
;
; DC_COND_IDCT and Z_COND_IDCT read the rows at offsets 16..104 directly.
; NOTE(review): rows 0 and 8 are presumably the rounding constants handed to
; those macros through their rounder_arg parameter - confirm at the call sites.
ALIGN 16
coeffs:
  dw 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0		; 0
  dw 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0		; 8

  dw  C4,  C4,  C4,  C4		; 16
  dw  C4, -C4,  C4, -C4		; 24

  dw  C2,  C6,  C2,  C6		; 32
  dw  C6, -C2,  C6, -C2		; 40

  dw  C1,  C3,  C1,  C3		; 48
  dw  C5,  C7,  C5,  C7		; 56

  dw  C3, -C7,  C3, -C7		; 64
  dw -C1, -C5, -C1, -C5		; 72

  dw  C5, -C1,  C5, -C1		; 80
  dw  C7,  C3,  C7,  C3		; 88

  dw  C7, -C5,  C7, -C5		; 96
  dw  C3, -C1,  C3, -C1		; 104


;===========================================================================
; Helper macros
;===========================================================================

;---------------------------------------------------------------------------
; DC_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; One 1-D IDCT pass over the four quadwords at [src0], [src4], [src1] and
; [src5], storing four quadwords to [dst], [dst+8], [dst+16] and [dst+24].
;
;   src0..src5   address expressions (used inside [ ]) of the input quadwords
;   dst          address expression of the 32-byte output area
;   rounder_op   instruction issued as "rounder_op mmN, rounder_arg" on the
;                two C4 products of src0 before they are combined
;   rounder_arg  second operand for rounder_op
;   shift        psrad count applied to every result of the full transform
;
; Shortcut: when words 1 and 3 of [src0] and all of [src4], [src1], [src5]
; are zero, the full transform is skipped. Each dword of the src0 quadword
; is then shifted left by 16, d40000 is added, the dwords are shifted right
; arithmetically by 13 and packed, and that one quadword is stored to all
; four output slots. This path does not use rounder_op, rounder_arg or shift.
;
; Clobbers: mm0-mm7, eax, flags.
;
; The two internal branch targets are macro-local labels, so the macro can be
; expanded any number of times under one function label and does not touch
; the caller's local-label namespace.
;---------------------------------------------------------------------------

%macro	DC_COND_IDCT	8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
  movq      mm0, [src0]             ; R4    R0  r4  r0
  movq      mm1, [src4]             ; R6    R2  r6  r2
  movq      mm2, [src1]             ; R3    R1  r3  r1
  movq      mm3, [src5]             ; R7    R5  r7  r5

  ; Zero test over everything except words 0 and 2 of src0.
  movq      mm4, [wm1010]
  pand      mm4, mm0
  por       mm4, mm1
  por       mm4, mm2
  por       mm4, mm3
  packssdw  mm4, mm4                ; saturating pack: a non-zero dword stays non-zero
  movd      eax, mm4
  test      eax, eax
  jz near   %%dc_only

  ; Full transform.
  movq      mm4, [coeffs+16]        ; C4    C4  C4  C4
  pmaddwd   mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
  movq      mm5, [coeffs+24]        ; -C4   C4  -C4 C4
  pmaddwd   mm0, mm5                ; -C4R4+C4R0    -C4r4+C4r0
  movq      mm5, [coeffs+32]        ; C6    C2  C6  C2
  pmaddwd   mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
  movq      mm6, [coeffs+40]        ; -C2   C6  -C2 C6
  pmaddwd   mm1, mm6                ; -C2R6+C6R2    -C2r6+C6r2
  movq      mm7, [coeffs+48]        ; C3    C1  C3  C1
  pmaddwd   mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
  rounder_op mm4, rounder_arg
  movq      mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
  paddd     mm4, mm5                ; A0        a0
  psubd     mm6, mm5                ; A3        a3
  movq      mm5, [coeffs+56]        ; C7    C5  C7  C5
  pmaddwd   mm5, mm3                ; C7R7+C5R5 C7r7+C5r5
  rounder_op mm0, rounder_arg
  paddd     mm1, mm0                ; A1        a1
  paddd     mm0, mm0
  psubd     mm0, mm1                ; A2        a2
  pmaddwd   mm2, [coeffs+64]        ; -C7R3+C3R1    -C7r3+C3r1
  paddd     mm7, mm5                ; B0        b0
  movq      mm5, [coeffs+72]        ; -C5   -C1 -C5 -C1
  pmaddwd   mm5, mm3                ; -C5R7-C1R5    -C5r7-C1r5
  paddd     mm7, mm4                ; A0+B0     a0+b0
  paddd     mm4, mm4                ; 2A0       2a0
  psubd     mm4, mm7                ; A0-B0     a0-b0
  paddd     mm5, mm2                ; B1        b1
  psrad     mm7, shift
  psrad     mm4, shift
  movq      mm2, mm1                ; A1        a1
  paddd     mm1, mm5                ; A1+B1     a1+b1
  psubd     mm2, mm5                ; A1-B1     a1-b1
  psrad     mm1, shift
  psrad     mm2, shift
  packssdw  mm7, mm1                ; A1+B1 a1+b1   A0+B0   a0+b0
  packssdw  mm2, mm4                ; A0-B0 a0-b0   A1-B1   a1-b1
  movq      [dst], mm7
  movq      mm1, [src1]             ; R3    R1  r3  r1
  movq      mm4, [coeffs+80]        ; -C1   C5  -C1 C5
  movq      [dst + 24], mm2
  pmaddwd   mm4, mm1                ; -C1R3+C5R1    -C1r3+C5r1
  movq      mm7, [coeffs+88]        ; C3    C7  C3  C7
  pmaddwd   mm1, [coeffs+96]        ; -C5R3+C7R1    -C5r3+C7r1
  pmaddwd   mm7, mm3                ; C3R7+C7R5 C3r7+C7r5
  movq      mm2, mm0                ; A2        a2
  pmaddwd   mm3, [coeffs+104]       ; -C1R7+C3R5    -C1r7+C3r5
  paddd     mm4, mm7                ; B2        b2
  paddd     mm2, mm4                ; A2+B2     a2+b2
  psubd     mm0, mm4                ; A2-B2     a2-b2
  psrad     mm2, shift
  psrad     mm0, shift
  movq      mm4, mm6                ; A3        a3
  paddd     mm3, mm1                ; B3        b3
  paddd     mm6, mm3                ; A3+B3     a3+b3
  psubd     mm4, mm3                ; A3-B3     a3-b3
  psrad     mm6, shift
  packssdw  mm2, mm6                ; A3+B3 a3+b3   A2+B2   a2+b2
  movq      [dst + 8], mm2
  psrad     mm4, shift
  packssdw  mm4, mm0                ; A2-B2 a2-b2   A3-B3   a3-b3
  movq      [dst + 16], mm4
  jmp short %%done

  ; Shortcut: only words 0 and 2 of src0 are non-zero (mm0 still holds src0).
%%dc_only:
  pslld     mm0, 16
  paddd     mm0, [d40000]
  psrad     mm0, 13
  packssdw  mm0, mm0
  movq      [dst], mm0
  movq      [dst + 8], mm0
  movq      [dst + 16], mm0
  movq      [dst + 24], mm0
%%done:
%undef  src0
%undef  src4
%undef  src1
%undef  src5
%undef  dst
%undef  rounder_op
%undef  rounder_arg
%undef  shift
%endmacro

;---------------------------------------------------------------------------
; Z_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift, bt
;
; One 1-D IDCT pass over the four quadwords at [src0], [src4], [src1] and
; [src5], storing four quadwords to [dst], [dst+8], [dst+16] and [dst+24].
;
;   src0..src5   address expressions (used inside [ ]) of the input quadwords
;   dst          address expression of the 32-byte output area
;   rounder_op   instruction issued as "rounder_op mmN, rounder_arg" on the
;                two C4 products of src0 before they are combined
;   rounder_arg  second operand for rounder_op
;   shift        psrad count applied to every result
;   bt           branch target taken (jz near) when all four input quadwords
;                are zero; nothing has been stored to dst at that point
;
; Falls through to the code following the expansion when the transform has
; been carried out.
;
; Clobbers: mm0-mm7, eax, flags (on the early-out path: mm0-mm4, eax, flags).
;---------------------------------------------------------------------------

%macro	Z_COND_IDCT	9
%define src0        %1
%define src4        %2
%define src1        %3
%define src5        %4
%define dst         %5
%define rounder_op  %6
%define rounder_arg %7
%define shift       %8
%define bt          %9
  ; ---- load the four input quadwords --------------------------------------
  movq      mm0, [src0]             ; R4    R0  r4  r0
  movq      mm1, [src4]             ; R6    R2  r6  r2
  movq      mm2, [src1]             ; R3    R1  r3  r1
  movq      mm3, [src5]             ; R7    R5  r7  r5

  ; ---- leave through bt if every input word is zero -----------------------
  movq      mm4, mm0
  por       mm4, mm1
  por       mm4, mm2
  por       mm4, mm3
  packssdw  mm4, mm4                ; saturating pack: a non-zero dword stays non-zero
  movd      eax, mm4
  test      eax, eax
  jz near   bt

  ; ---- even part (A0..A3), start of odd part ------------------------------
  movq      mm4, [coeffs+16]        ; C4    C4  C4  C4
  pmaddwd   mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
  movq      mm5, [coeffs+24]        ; -C4   C4  -C4 C4
  pmaddwd   mm0, mm5                ; -C4R4+C4R0    -C4r4+C4r0
  movq      mm5, [coeffs+32]        ; C6    C2  C6  C2
  pmaddwd   mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
  movq      mm6, [coeffs+40]        ; -C2   C6  -C2 C6
  pmaddwd   mm1, mm6                ; -C2R6+C6R2    -C2r6+C6r2
  movq      mm7, [coeffs+48]        ; C3    C1  C3  C1
  pmaddwd   mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
  rounder_op mm4, rounder_arg
  movq      mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
  paddd     mm4, mm5                ; A0        a0
  psubd     mm6, mm5                ; A3        a3
  movq      mm5, [coeffs+56]        ; C7    C5  C7  C5
  pmaddwd   mm5, mm3                ; C7R7+C5R5 C7r7+C5r5
  rounder_op mm0, rounder_arg
  paddd     mm1, mm0                ; A1        a1
  paddd     mm0, mm0
  psubd     mm0, mm1                ; A2        a2

  ; ---- B0, B1 and output quadwords 0 and 3 --------------------------------
  pmaddwd   mm2, [coeffs+64]        ; -C7R3+C3R1    -C7r3+C3r1
  paddd     mm7, mm5                ; B0        b0
  movq      mm5, [coeffs+72]        ; -C5   -C1 -C5 -C1
  pmaddwd   mm5, mm3                ; -C5R7-C1R5    -C5r7-C1r5
  paddd     mm7, mm4                ; A0+B0     a0+b0
  paddd     mm4, mm4                ; 2A0       2a0
  psubd     mm4, mm7                ; A0-B0     a0-b0
  paddd     mm5, mm2                ; B1        b1
  psrad     mm7, shift
  psrad     mm4, shift
  movq      mm2, mm1                ; A1        a1
  paddd     mm1, mm5                ; A1+B1     a1+b1
  psubd     mm2, mm5                ; A1-B1     a1-b1
  psrad     mm1, shift
  psrad     mm2, shift
  packssdw  mm7, mm1                ; A1+B1 a1+b1   A0+B0   a0+b0
  packssdw  mm2, mm4                ; A0-B0 a0-b0   A1-B1   a1-b1
  movq      [dst], mm7

  ; ---- B2, B3 and output quadwords 1 and 2 (src1 is reloaded) -------------
  movq      mm1, [src1]             ; R3    R1  r3  r1
  movq      mm4, [coeffs+80]        ; -C1   C5  -C1 C5
  movq      [dst + 24], mm2
  pmaddwd   mm4, mm1                ; -C1R3+C5R1    -C1r3+C5r1
  movq      mm7, [coeffs+88]        ; C3    C7  C3  C7
  pmaddwd   mm1, [coeffs+96]        ; -C5R3+C7R1    -C5r3+C7r1
  pmaddwd   mm7, mm3                ; C3R7+C7R5 C3r7+C7r5
  movq      mm2, mm0                ; A2        a2
  pmaddwd   mm3, [coeffs+104]       ; -C1R7+C3R5    -C1r7+C3r5
  paddd     mm4, mm7                ; B2        b2
  paddd     mm2, mm4                ; A2+B2     a2+b2
  psubd     mm0, mm4                ; A2-B2     a2-b2
  psrad     mm2, shift
  psrad     mm0, shift
  movq      mm4, mm6                ; A3        a3
  paddd     mm3, mm1                ; B3        b3
  paddd     mm6, mm3                ; A3+B3     a3+b3
  psubd     mm4, mm3                ; A3-B3     a3-b3
  psrad     mm6, shift
  packssdw  mm2, mm6                ; A3+B3 a3+b3   A2+B2   a2+b2
  movq      [dst + 8], mm2
  psrad     mm4, shift
  packssdw  mm4, mm0                ; A2-B2 a2-b2   A3-B3   a3-b3
  movq      [dst + 16], mm4
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%undef	bt
%endmacro

;---------------------------------------------------------------------------
; IDCT0
;---------------------------------------------------------------------------

%macro	IDCT0		8
%define	src0		%1
%define src4        %2
%define src1        %3
%define src5        %4
%define dst         %5
%define rounder_op  %6
%define rounder_arg %7
%define shift       %8
  movq mm0,[src0]               ; R4    R0  r4  r0
  movq mm1,[src4]               ; R6    R2  r6  r2
  movq mm2,[src1]               ; R3    R1  r3  r1
  movq mm3,[src5]               ; R7    R5  r7  r5
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
  ; rounder_op mm4, rounder_arg
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
  ; rounder_op mm0, rounder_arg
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
  paddd mm4,mm5                 ; A0        a0
  psubd mm6,mm5                 ; A3        a3
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -