⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 simple_idct_mmx.asm

📁 这是一个压缩解压包,用C语言进行编程的,里面有详细的源代码.
💻 ASM
📖 第 1 页 / 共 3 页
字号:
;/*
; * Simple IDCT MMX
; *
; * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
; *
; * This library is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2 of the License, or (at your option) any later version.
; *
; * This library is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with this library; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
; *
; * Ported to nasm by Peter Ross <pross@xvid.org>
; */

; Syntax: NASM (Intel operand order).  Target: 32-bit x86 with MMX.
BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

;-----------------------------------------------------------------------------
; cglobal name
;   Exports `name` as a global symbol.
;     PREFIX     defined -> the exported symbol gets a leading underscore and
;                           `name` is redefined to the decorated form.
;     MARK_FUNCS defined -> the symbol is declared with the `:function`
;                           attribute and a size of `name.endfunc - name`,
;                           so the function body must define `.endfunc`.
;   NOTE(review): in the PREFIX+MARK_FUNCS branch `name` is %defined to the
;   whole `_name:function ...` string, not just `_name`; that looks
;   unintended but is left as-is because the call sites are not visible
;   here -- confirm before changing.
;-----------------------------------------------------------------------------
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

%define ROW_SHIFT 11
%define COL_SHIFT 20

; Fixed-point cosine table, Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), rounded.
%define C0 23170	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 23170.475006
%define C1 22725	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 22725.260826
%define C2 21407	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 21406.727617
%define C3 19266	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 19265.545870
; NOTE(review): C4 is 16383 although the formula evaluates to 16384; the
; "- 0.5" in the comment suggests this is deliberate -- confirm upstream.
%define C4 16383	;cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 = 16384.000000
%define C5 12873	;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 12872.826198
%define C6 8867		;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905
%define C7 4520		;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430

;===========================================================================
; Data (Read Only)
;===========================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

;-----------------------------------------------------------------------------
; Trigonometric Tables
;-----------------------------------------------------------------------------

; Word mask that keeps words 1 and 3 of a quadword and clears words 0 and 2.
ALIGN 16
wm1010:
    dw      0, 0xffff, 0, 0xffff

; Constant added to the low dword only in the DC-only path of DC_COND_IDCT.
ALIGN 16
d40000:
    dd      0x40000, 0

; Each row is 8 bytes; the trailing comment is the row's byte offset from
; `coeffs`, which is how the macros below address it ([coeffs+N]).
; The first two rows originally ended in a trailing comma (an empty operand);
; the commas are dropped so the layout cannot depend on how the assembler
; treats them.  The byte offsets are unchanged.
; NOTE(review): word 1 of the row at offset 8 is 1, not 0 -- presumably an
; intentional rounding bias; confirm against upstream before "fixing".
ALIGN 16
coeffs:
    dw      1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0    ; 0
    dw      1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0    ; 8
    dw       C4,  C4,  C4,  C4                          ; 16
    dw       C4, -C4,  C4, -C4                          ; 24
    dw       C2,  C6,  C2,  C6                          ; 32
    dw       C6, -C2,  C6, -C2                          ; 40
    dw       C1,  C3,  C1,  C3                          ; 48
    dw       C5,  C7,  C5,  C7                          ; 56
    dw       C3, -C7,  C3, -C7                          ; 64
    dw      -C1, -C5, -C1, -C5                          ; 72
    dw       C5, -C1,  C5, -C1                          ; 80
    dw       C7,  C3,  C7,  C3                          ; 88
    dw       C7, -C5,  C7, -C5                          ; 96
    dw       C3, -C1,  C3, -C1                          ; 104

;===========================================================================
; Helper macros
;===========================================================================
;
; Lane notation used in the comments below: register contents are listed from
; the most significant lane to the least significant one.  Upper-case names
; (R0, A0, B0, ...) are the lanes in the high dword, lower-case names
; (r0, a0, b0, ...) the lanes in the low dword.
;
; Common macro parameters:
;   %1 src0         address of the quadword holding coefficients 0 and 4
;   %2 src4         address of the quadword holding coefficients 2 and 6
;   %3 src1         address of the quadword holding coefficients 1 and 3
;   %4 src5         address of the quadword holding coefficients 5 and 7
;   %5 dst          destination address expression
;   %6 rounder_op   mnemonic applied as `rounder_op mmN, rounder_arg`
;   %7 rounder_arg  second operand for rounder_op
;   %8 shift        immediate count for the final arithmetic right shifts

;---------------------------------------------------------------------------
; DC_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; If every input word except coefficient 0 is zero in both lanes, takes a
; short path that derives one packed value from coefficient 0 alone and
; stores it to dst+0/+8/+16/+24.  Otherwise runs the full 8-point pass and
; stores four packed quadwords to dst+0/+8/+16/+24.
;
; Clobbers: mm0-mm7, eax, flags.
; The internal branch targets are macro-local (%%) labels, so the macro may
; be expanded any number of times under the same non-local label.
;---------------------------------------------------------------------------

%macro	DC_COND_IDCT	8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
    movq        mm0, [src0]                 ; R4    R0  r4  r0
    movq        mm1, [src4]                 ; R6    R2  r6  r2
    movq        mm2, [src1]                 ; R3    R1  r3  r1
    movq        mm3, [src5]                 ; R7    R5  r7  r5

    ; OR together everything except coefficient 0 (masked out by wm1010).
    movq        mm4, [wm1010]
    pand        mm4, mm0
    por         mm4, mm1
    por         mm4, mm2
    por         mm4, mm3
    packssdw    mm4, mm4                    ; saturating pack keeps non-zero non-zero
    movd        eax, mm4
    test        eax, eax
    jz near     %%skip1                     ; only coefficient 0 present

    movq        mm4, [coeffs+16]            ; C4    C4  C4  C4
    pmaddwd     mm4, mm0                    ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs+24]            ; -C4   C4  -C4 C4
    pmaddwd     mm0, mm5                    ; -C4R4+C4R0    -C4r4+C4r0
    movq        mm5, [coeffs+32]            ; C6    C2  C6  C2
    pmaddwd     mm5, mm1                    ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs+40]            ; -C2   C6  -C2 C6
    pmaddwd     mm1, mm6                    ; -C2R6+C6R2    -C2r6+C6r2
    movq        mm7, [coeffs+48]            ; C3    C1  C3  C1
    pmaddwd     mm7, mm2                    ; C3R3+C1R1 C3r3+C1r1
    rounder_op  mm4, rounder_arg
    movq        mm6, mm4                    ; C4R4+C4R0 C4r4+C4r0
    paddd       mm4, mm5                    ; A0        a0
    psubd       mm6, mm5                    ; A3        a3
    movq        mm5, [coeffs+56]            ; C7    C5  C7  C5
    pmaddwd     mm5, mm3                    ; C7R7+C5R5 C7r7+C5r5
    rounder_op  mm0, rounder_arg
    paddd       mm1, mm0                    ; A1        a1
    paddd       mm0, mm0
    psubd       mm0, mm1                    ; A2        a2
    pmaddwd     mm2, [coeffs+64]            ; -C7R3+C3R1    -C7r3+C3r1
    paddd       mm7, mm5                    ; B0        b0
    movq        mm5, [coeffs+72]            ; -C5   -C1 -C5 -C1
    pmaddwd     mm5, mm3                    ; -C5R7-C1R5    -C5r7-C1r5
    paddd       mm7, mm4                    ; A0+B0     a0+b0
    paddd       mm4, mm4                    ; 2A0       2a0
    psubd       mm4, mm7                    ; A0-B0     a0-b0
    paddd       mm5, mm2                    ; B1        b1
    psrad       mm7, shift
    psrad       mm4, shift
    movq        mm2, mm1                    ; A1        a1
    paddd       mm1, mm5                    ; A1+B1     a1+b1
    psubd       mm2, mm5                    ; A1-B1     a1-b1
    psrad       mm1, shift
    psrad       mm2, shift
    packssdw    mm7, mm1                    ; A1+B1 a1+b1   A0+B0   a0+b0
    packssdw    mm2, mm4                    ; A0-B0 a0-b0   A1-B1   a1-b1
    movq        [dst], mm7
    movq        mm1, [src1]                 ; R3    R1  r3  r1
    movq        mm4, [coeffs+80]            ; -C1   C5  -C1 C5
    movq        [dst + 24], mm2
    pmaddwd     mm4, mm1                    ; -C1R3+C5R1    -C1r3+C5r1
    movq        mm7, [coeffs+88]            ; C3    C7  C3  C7
    pmaddwd     mm1, [coeffs+96]            ; -C5R3+C7R1    -C5r3+C7r1
    pmaddwd     mm7, mm3                    ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm0                    ; A2        a2
    pmaddwd     mm3, [coeffs+104]           ; -C1R7+C3R5    -C1r7+C3r5
    paddd       mm4, mm7                    ; B2        b2
    paddd       mm2, mm4                    ; A2+B2     a2+b2
    psubd       mm0, mm4                    ; A2-B2     a2-b2
    psrad       mm2, shift
    psrad       mm0, shift
    movq        mm4, mm6                    ; A3        a3
    paddd       mm3, mm1                    ; B3        b3
    paddd       mm6, mm3                    ; A3+B3     a3+b3
    psubd       mm4, mm3                    ; A3-B3     a3-b3
    psrad       mm6, shift
    packssdw    mm2, mm6                    ; A3+B3 a3+b3   A2+B2   a2+b2
    movq        [dst + 8], mm2
    psrad       mm4, shift
    packssdw    mm4, mm0                    ; A2-B2 a2-b2   A3-B3   a3-b3
    movq        [dst + 16], mm4
    jmp short   %%skip2

%%skip1:
    ; DC-only path: mm0 still holds [src0]; move coefficient 0 of each lane
    ; into the top half of its dword, add d40000, shift down by 13, and
    ; replicate the packed pair into all four destination quadwords.
    pslld       mm0, 16
    paddd       mm0, [d40000]
    psrad       mm0, 13
    packssdw    mm0, mm0
    movq        [dst], mm0
    movq        [dst + 8], mm0
    movq        [dst + 16], mm0
    movq        [dst + 24], mm0
%%skip2:
%undef  src0
%undef  src4
%undef  src1
%undef  src5
%undef  dst
%undef  rounder_op
%undef  rounder_arg
%undef  shift
%endmacro

;---------------------------------------------------------------------------
; Z_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift, bt
;
; Jumps (near) to the caller-supplied label `bt` when all sixteen input words
; are zero, without writing anything to dst.  Otherwise runs the same
; 8-point pass as DC_COND_IDCT and stores four packed quadwords to
; dst+0/+8/+16/+24.
;
; Clobbers: mm0-mm7, eax, flags.
;---------------------------------------------------------------------------

%macro	Z_COND_IDCT	9
%define src0        %1
%define src4        %2
%define src1        %3
%define src5        %4
%define dst         %5
%define rounder_op  %6
%define rounder_arg %7
%define shift       %8
%define bt          %9
    movq        mm0, [src0]                 ; R4    R0  r4  r0
    movq        mm1, [src4]                 ; R6    R2  r6  r2
    movq        mm2, [src1]                 ; R3    R1  r3  r1
    movq        mm3, [src5]                 ; R7    R5  r7  r5

    ; OR together all inputs; an all-zero result means nothing to compute.
    movq        mm4, mm0
    por         mm4, mm1
    por         mm4, mm2
    por         mm4, mm3
    packssdw    mm4, mm4                    ; saturating pack keeps non-zero non-zero
    movd        eax, mm4
    test        eax, eax
    jz near     bt

    movq        mm4, [coeffs+16]            ; C4    C4  C4  C4
    pmaddwd     mm4, mm0                    ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs+24]            ; -C4   C4  -C4 C4
    pmaddwd     mm0, mm5                    ; -C4R4+C4R0    -C4r4+C4r0
    movq        mm5, [coeffs+32]            ; C6    C2  C6  C2
    pmaddwd     mm5, mm1                    ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs+40]            ; -C2   C6  -C2 C6
    pmaddwd     mm1, mm6                    ; -C2R6+C6R2    -C2r6+C6r2
    movq        mm7, [coeffs+48]            ; C3    C1  C3  C1
    pmaddwd     mm7, mm2                    ; C3R3+C1R1 C3r3+C1r1
    rounder_op  mm4, rounder_arg
    movq        mm6, mm4                    ; C4R4+C4R0 C4r4+C4r0
    paddd       mm4, mm5                    ; A0        a0
    psubd       mm6, mm5                    ; A3        a3
    movq        mm5, [coeffs+56]            ; C7    C5  C7  C5
    pmaddwd     mm5, mm3                    ; C7R7+C5R5 C7r7+C5r5
    rounder_op  mm0, rounder_arg
    paddd       mm1, mm0                    ; A1        a1
    paddd       mm0, mm0
    psubd       mm0, mm1                    ; A2        a2
    pmaddwd     mm2, [coeffs+64]            ; -C7R3+C3R1    -C7r3+C3r1
    paddd       mm7, mm5                    ; B0        b0
    movq        mm5, [coeffs+72]            ; -C5   -C1 -C5 -C1
    pmaddwd     mm5, mm3                    ; -C5R7-C1R5    -C5r7-C1r5
    paddd       mm7, mm4                    ; A0+B0     a0+b0
    paddd       mm4, mm4                    ; 2A0       2a0
    psubd       mm4, mm7                    ; A0-B0     a0-b0
    paddd       mm5, mm2                    ; B1        b1
    psrad       mm7, shift
    psrad       mm4, shift
    movq        mm2, mm1                    ; A1        a1
    paddd       mm1, mm5                    ; A1+B1     a1+b1
    psubd       mm2, mm5                    ; A1-B1     a1-b1
    psrad       mm1, shift
    psrad       mm2, shift
    packssdw    mm7, mm1                    ; A1+B1 a1+b1   A0+B0   a0+b0
    packssdw    mm2, mm4                    ; A0-B0 a0-b0   A1-B1   a1-b1
    movq        [dst], mm7
    movq        mm1, [src1]                 ; R3    R1  r3  r1
    movq        mm4, [coeffs+80]            ; -C1   C5  -C1 C5
    movq        [dst + 24], mm2
    pmaddwd     mm4, mm1                    ; -C1R3+C5R1    -C1r3+C5r1
    movq        mm7, [coeffs+88]            ; C3    C7  C3  C7
    pmaddwd     mm1, [coeffs+96]            ; -C5R3+C7R1    -C5r3+C7r1
    pmaddwd     mm7, mm3                    ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm0                    ; A2        a2
    pmaddwd     mm3, [coeffs+104]           ; -C1R7+C3R5    -C1r7+C3r5
    paddd       mm4, mm7                    ; B2        b2
    paddd       mm2, mm4                    ; A2+B2     a2+b2
    psubd       mm0, mm4                    ; A2-B2     a2-b2
    psrad       mm2, shift
    psrad       mm0, shift
    movq        mm4, mm6                    ; A3        a3
    paddd       mm3, mm1                    ; B3        b3
    paddd       mm6, mm3                    ; A3+B3     a3+b3
    psubd       mm4, mm3                    ; A3-B3     a3-b3
    psrad       mm6, shift
    packssdw    mm2, mm6                    ; A3+B3 a3+b3   A2+B2   a2+b2
    movq        [dst + 8], mm2
    psrad       mm4, shift
    packssdw    mm4, mm0                    ; A2-B2 a2-b2   A3-B3   a3-b3
    movq        [dst + 16], mm4
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%undef	bt
%endmacro

;---------------------------------------------------------------------------
; IDCT0 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Unconditional 8-point pass using all four input quadwords.  Each result is
; packed to a pair of words and written with movd to dst+0, +16, +32, ...,
; +112 (eight 4-byte stores, 16 bytes apart).
; rounder_op / rounder_arg are accepted for signature compatibility with the
; other macros but are not applied: both rounder lines are commented out.
;
; Clobbers: mm0-mm7.
;---------------------------------------------------------------------------

%macro	IDCT0		8
%define	src0		%1
%define src4        %2
%define src1        %3
%define src5        %4
%define dst         %5
%define rounder_op  %6
%define rounder_arg %7
%define shift       %8
    movq        mm0, [src0]                 ; R4    R0  r4  r0
    movq        mm1, [src4]                 ; R6    R2  r6  r2
    movq        mm2, [src1]                 ; R3    R1  r3  r1
    movq        mm3, [src5]                 ; R7    R5  r7  r5
    movq        mm4, [coeffs+16]            ; C4    C4  C4  C4
    pmaddwd     mm4, mm0                    ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs+24]            ; -C4   C4  -C4 C4
    pmaddwd     mm0, mm5                    ; -C4R4+C4R0    -C4r4+C4r0
    movq        mm5, [coeffs+32]            ; C6    C2  C6  C2
    pmaddwd     mm5, mm1                    ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs+40]            ; -C2   C6  -C2 C6
    pmaddwd     mm1, mm6                    ; -C2R6+C6R2    -C2r6+C6r2
    ; rounder_op mm4, rounder_arg
    movq        mm6, mm4                    ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs+48]            ; C3    C1  C3  C1
    ; rounder_op mm0, rounder_arg
    pmaddwd     mm7, mm2                    ; C3R3+C1R1 C3r3+C1r1
    paddd       mm4, mm5                    ; A0        a0
    psubd       mm6, mm5                    ; A3        a3
    movq        mm5, mm0                    ; -C4R4+C4R0    -C4r4+C4r0
    paddd       mm0, mm1                    ; A1        a1
    psubd       mm5, mm1                    ; A2        a2
    movq        mm1, [coeffs+56]            ; C7    C5  C7  C5
    pmaddwd     mm1, mm3                    ; C7R7+C5R5 C7r7+C5r5
    pmaddwd     mm2, [coeffs+64]            ; -C7R3+C3R1    -C7r3+C3r1
    paddd       mm7, mm1                    ; B0        b0
    movq        mm1, [coeffs+72]            ; -C5   -C1 -C5 -C1
    pmaddwd     mm1, mm3                    ; -C5R7-C1R5    -C5r7-C1r5
    paddd       mm7, mm4                    ; A0+B0     a0+b0
    paddd       mm4, mm4                    ; 2A0       2a0
    psubd       mm4, mm7                    ; A0-B0     a0-b0
    paddd       mm1, mm2                    ; B1        b1
    psrad       mm7, shift
    psrad       mm4, shift
    movq        mm2, mm0                    ; A1        a1
    paddd       mm0, mm1                    ; A1+B1     a1+b1
    psubd       mm2, mm1                    ; A1-B1     a1-b1
    psrad       mm0, shift
    psrad       mm2, shift
    packssdw    mm7, mm7                    ; A0+B0 a0+b0
    movd        [dst], mm7
    packssdw    mm0, mm0                    ; A1+B1 a1+b1
    movd        [dst + 16], mm0
    packssdw    mm2, mm2                    ; A1-B1 a1-b1
    movd        [dst + 96], mm2
    packssdw    mm4, mm4                    ; A0-B0 a0-b0
    movd        [dst + 112], mm4
    movq        mm0, [src1]                 ; R3    R1  r3  r1
    movq        mm4, [coeffs+80]            ; -C1   C5  -C1 C5
    pmaddwd     mm4, mm0                    ; -C1R3+C5R1    -C1r3+C5r1
    movq        mm7, [coeffs+88]            ; C3    C7  C3  C7
    pmaddwd     mm0, [coeffs+96]            ; -C5R3+C7R1    -C5r3+C7r1
    pmaddwd     mm7, mm3                    ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm5                    ; A2        a2
    pmaddwd     mm3, [coeffs+104]           ; -C1R7+C3R5    -C1r7+C3r5
    paddd       mm4, mm7                    ; B2        b2
    paddd       mm2, mm4                    ; A2+B2     a2+b2
    psubd       mm5, mm4                    ; A2-B2     a2-b2
    psrad       mm2, shift
    psrad       mm5, shift
    movq        mm4, mm6                    ; A3        a3
    paddd       mm3, mm0                    ; B3        b3
    paddd       mm6, mm3                    ; A3+B3     a3+b3
    psubd       mm4, mm3                    ; A3-B3     a3-b3
    psrad       mm6, shift
    psrad       mm4, shift
    packssdw    mm2, mm2                    ; A2+B2 a2+b2
    packssdw    mm6, mm6                    ; A3+B3 a3+b3
    movd        [dst + 32], mm2
    packssdw    mm4, mm4                    ; A3-B3 a3-b3
    packssdw    mm5, mm5                    ; A2-B2 a2-b2
    movd        [dst + 48], mm6
    movd        [dst + 64], mm4
    movd        [dst + 80], mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro

;---------------------------------------------------------------------------
; IDCT4
;---------------------------------------------------------------------------

%macro	IDCT4		8
%define	src0		%1
%define	src4		%2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -