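; Listing captured from the "Chongchong" download site
; (site navigation: resource downloads | resource albums | about us).
;
; File:        simple_idct_mmx.asm
; Description: H.264 decoder converted from FFMPEG; builds under Visual C++.
; Language:    ASM
; Listing:     page 1 of 3 (this page starts and ends in the middle of a macro)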
  paddd mm0,mm1                 ; A1        a1
  psubd mm5,mm1                 ; A2        a2
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
  paddd mm7,mm1                 ; B0        b0
  movq mm1,[coeffs+72]          ; -C5   -C1 -C5 -C1
  pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
  paddd mm7,mm4                 ; A0+B0     a0+b0
  paddd mm4,mm4                 ; 2A0       2a0
  psubd mm4,mm7                 ; A0-B0     a0-b0
  paddd mm1,mm2                 ; B1        b1
  psrad mm7,shift
  psrad mm4,shift
  movq mm2,mm0                  ; A1        a1
  paddd mm0,mm1                 ; A1+B1     a1+b1
  psubd mm2,mm1                 ; A1-B1     a1-b1
  psrad mm0,shift
  psrad mm2,shift
  packssdw mm7,mm7              ; A0+B0 a0+b0
  movd [ dst ],mm7
  packssdw mm0,mm0              ; A1+B1 a1+b1
  movd [ dst + 16],mm0
  packssdw mm2,mm2              ; A1-B1 a1-b1
  movd [ dst + 96 ],mm2
  packssdw mm4,mm4              ; A0-B0 a0-b0
  movd [ dst + 112],mm4
  movq mm0,[src1]               ; R3    R1  r3  r1
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
  pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
  pmaddwd mm0,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
  movq mm2,mm5                  ; A2        a2
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
  paddd mm4,mm7                 ; B2        b2
  paddd mm2,mm4                 ; A2+B2     a2+b2
  psubd mm5,mm4                 ; a2-B2     a2-b2
  psrad mm2,shift
  psrad mm5,shift
  movq mm4,mm6                  ; A3        a3
  paddd mm3,mm0                 ; B3        b3
  paddd mm6,mm3                 ; A3+B3     a3+b3
  psubd mm4,mm3                 ; a3-B3     a3-b3
  psrad mm6,shift
  psrad mm4,shift
  packssdw mm2,mm2              ; A2+B2 a2+b2
  packssdw mm6,mm6              ; A3+B3 a3+b3
  movd [ dst + 32 ],mm2
  packssdw mm4,mm4              ; A3-B3 a3-b3
  packssdw mm5,mm5              ; A2-B2 a2-b2
  movd [ dst + 48 ],mm6
  movd [ dst + 64 ],mm4
  movd [ dst + 80 ],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro

;---------------------------------------------------------------------------
; IDCT4
;---------------------------------------------------------------------------

; IDCT4 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Straight-line MMX IDCT butterfly variant that loads only three of its four
; input operands: [src0], [src4] and [src5]. src1 is accepted but never read
; (presumably this variant is chosen when that input is all zero -- TODO
; confirm against the code that selects the variant).
;
; Parameters (each is substituted textually into the instructions below):
;   src0, src4, src5   address expressions of the 64-bit inputs; the lane
;                      layout of each is given in the comment on its load
;   src1               unused in this variant
;   dst                address expression of the output area
;   rounder_op,
;   rounder_arg        unused: both lines that would apply them are commented out
;   shift              shift-count operand handed to every psrad
;
; Every register holds two 32-bit lanes (upper-case / lower-case names in the
; comments). With A0..A3 built from src0/src4 and B0..B3 from src5, the macro
; stores eight results, each a movd of two saturated 16-bit values:
;   dst+0   (A0+B0)>>shift        dst+64  (A3-B3)>>shift
;   dst+16  (A1+B1)>>shift        dst+80  (A2-B2)>>shift
;   dst+32  (A2+B2)>>shift        dst+96  (A1-B1)>>shift
;   dst+48  (A3+B3)>>shift        dst+112 (A0-B0)>>shift
;
; Reads the coeffs table at byte offsets 16, 24, 32, 40, 56, 72, 88 and 104.
; Overwrites mm0-mm7; no emms is issued inside the macro.
%macro	IDCT4		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
  ; --- even part: A0..A3 from [src0] and [src4] ---
  movq mm0,[src0]               ; R4    R0  r4  r0
  movq mm1,[src4]               ; R6    R2  r6  r2
  movq mm3,[src5]               ; R7    R5  r7  r5
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
  ; rounder_op mm4, rounder_arg  (left commented out: no rounder is applied)
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
  ; rounder_op mm0, rounder_arg  (left commented out: no rounder is applied)
  paddd mm4,mm5                 ; A0        a0
  psubd mm6,mm5                 ; A3        a3
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
  paddd mm0,mm1                 ; A1        a1
  psubd mm5,mm1                 ; A2        a2
  ; --- odd part, first half: B0 (mm1) and B1 (mm7) from [src5] only ---
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
  movq mm7,[coeffs+72]          ; -C5   -C1 -C5 -C1
  pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
  ; A0-B0 is formed as 2*A0 - (A0+B0), so no extra copy of A0 is needed.
  paddd mm1,mm4                 ; A0+B0     a0+b0
  paddd mm4,mm4                 ; 2A0       2a0
  psubd mm4,mm1                 ; A0-B0     a0-b0
  psrad mm1,shift
  psrad mm4,shift
  movq mm2,mm0                  ; A1        a1
  paddd mm0,mm7                 ; A1+B1     a1+b1
  psubd mm2,mm7                 ; A1-B1     a1-b1
  psrad mm0,shift
  psrad mm2,shift
  ; packssdw reg,reg leaves both saturated words in the low dword for movd.
  packssdw mm1,mm1              ; A0+B0 a0+b0
  movd [ dst ],mm1
  packssdw mm0,mm0              ; A1+B1 a1+b1
  movd [ dst + 16 ],mm0
  packssdw mm2,mm2              ; A1-B1 a1-b1
  movd [ dst + 96 ],mm2
  packssdw mm4,mm4              ; A0-B0 a0-b0
  movd [ dst + 112 ],mm4
  ; --- odd part, second half: B2 (mm1) and B3 (mm3); mm3 is consumed here ---
  movq mm1,[coeffs+88]          ; C3    C7  C3  C7
  pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
  movq mm2,mm5                  ; A2        a2
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
  paddd mm2,mm1                 ; A2+B2     a2+b2
  psubd mm5,mm1                 ; A2-B2     a2-b2
  psrad mm2,shift
  psrad mm5,shift
  movq mm1,mm6                  ; A3        a3
  paddd mm6,mm3                 ; A3+B3     a3+b3
  psubd mm1,mm3                 ; A3-B3     a3-b3
  psrad mm6,shift
  psrad mm1,shift
  packssdw mm2,mm2              ; A2+B2 a2+b2
  packssdw mm6,mm6              ; A3+B3 a3+b3
  movd [dst + 32],mm2
  packssdw mm1,mm1              ; A3-B3 a3-b3
  packssdw mm5,mm5              ; A2-B2 a2-b2
  movd [dst + 48],mm6
  movd [dst + 64],mm1
  movd [dst + 80],mm5
  ; Drop the single-line aliases so they do not leak past this macro.
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro

;---------------------------------------------------------------------------
; IDCT6
;---------------------------------------------------------------------------

; IDCT6 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Straight-line MMX IDCT butterfly variant that loads only [src0] and [src5].
; src4 and src1 are accepted but never read (presumably this variant is chosen
; when those inputs are all zero -- TODO confirm against the code that selects
; the variant). Because nothing from src4 is added, A3 equals A0 and A2 equals
; A1 here.
;
; Parameters (each is substituted textually into the instructions below):
;   src0, src5         address expressions of the 64-bit inputs; the lane
;                      layout of each is given in the comment on its load
;   src4, src1         unused in this variant
;   dst                address expression of the output area
;   rounder_op,
;   rounder_arg        unused: both lines that would apply them are commented out
;   shift              shift-count operand handed to every psrad
;
; Every register holds two 32-bit lanes (upper-case / lower-case names in the
; comments). The macro stores eight results, each a movd of two saturated
; 16-bit values:
;   dst+0   (A0+B0)>>shift        dst+64  (A3-B3)>>shift
;   dst+16  (A1+B1)>>shift        dst+80  (A2-B2)>>shift
;   dst+32  (A2+B2)>>shift        dst+96  (A1-B1)>>shift
;   dst+48  (A3+B3)>>shift        dst+112 (A0-B0)>>shift
;
; Reads the coeffs table at byte offsets 16, 24, 56, 72, 88 and 104.
; Overwrites mm0-mm7; no emms is issued inside the macro.
%macro	IDCT6		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
  ; --- even part: only [src0] contributes ---
  movq mm0,[src0]               ; R4    R0  r4  r0
  movq mm3,[src5]               ; R7    R5  r7  r5
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
  ; rounder_op mm4, rounder_arg  (left commented out: no rounder is applied)
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0   (kept as A3 = A0)
  ; rounder_op mm0, rounder_arg  (left commented out: no rounder is applied)
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0   (kept as A2 = A1)
  ; --- odd part, first half: B0 (mm1) and B1 (mm7) from [src5] only ---
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
  movq mm7,[coeffs+72]          ; -C5   -C1 -C5 -C1
  pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
  ; A0-B0 is formed as 2*A0 - (A0+B0), so no extra copy of A0 is needed.
  paddd mm1,mm4                 ; A0+B0     a0+b0
  paddd mm4,mm4                 ; 2A0       2a0
  psubd mm4,mm1                 ; A0-B0     a0-b0
  psrad mm1,shift
  psrad mm4,shift
  movq mm2,mm0                  ; A1        a1
  paddd mm0,mm7                 ; A1+B1     a1+b1
  psubd mm2,mm7                 ; A1-B1     a1-b1
  psrad mm0,shift
  psrad mm2,shift
  ; packssdw reg,reg leaves both saturated words in the low dword for movd.
  packssdw mm1,mm1              ; A0+B0 a0+b0
  movd [ dst ],mm1
  packssdw mm0,mm0              ; A1+B1 a1+b1
  movd [ dst + 16 ],mm0
  packssdw mm2,mm2              ; A1-B1 a1-b1
  movd [ dst + 96 ],mm2
  packssdw mm4,mm4              ; A0-B0 a0-b0
  movd [ dst + 112 ],mm4
  ; --- odd part, second half: B2 (mm1) and B3 (mm3); mm3 is consumed here ---
  movq mm1,[coeffs+88]          ; C3    C7  C3  C7
  pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
  movq mm2,mm5                  ; A2        a2
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
  paddd mm2,mm1                 ; A2+B2     a2+b2
  psubd mm5,mm1                 ; A2-B2     a2-b2
  psrad mm2,shift
  psrad mm5,shift
  movq mm1,mm6                  ; A3        a3
  paddd mm6,mm3                 ; A3+B3     a3+b3
  psubd mm1,mm3                 ; A3-B3     a3-b3
  psrad mm6,shift
  psrad mm1,shift
  packssdw mm2,mm2              ; A2+B2 a2+b2
  packssdw mm6,mm6              ; A3+B3 a3+b3
  movd [dst + 32],mm2
  packssdw mm1,mm1              ; A3-B3 a3-b3
  packssdw mm5,mm5              ; A2-B2 a2-b2
  movd [dst + 48],mm6
  movd [dst + 64],mm1
  movd [dst + 80],mm5
  ; Drop the single-line aliases so they do not leak past this macro.
%undef  src0
%undef  src4
%undef  src1
%undef  src5
%undef  dst
%undef  rounder_op
%undef  rounder_arg
%undef	shift
%endmacro

;---------------------------------------------------------------------------
; IDCT2
;---------------------------------------------------------------------------

; IDCT2 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Straight-line MMX IDCT butterfly variant that loads [src0], [src1] and
; [src5]. src4 is accepted but never read (presumably this variant is chosen
; when that input is all zero -- TODO confirm against the code that selects
; the variant). Because nothing from src4 is added, A3 equals A0 and A2 equals
; A1 here; each B term is the sum of one product from src1 and one from src5.
;
; Parameters (each is substituted textually into the instructions below):
;   src0, src1, src5   address expressions of the 64-bit inputs; the lane
;                      layout of each is given in the comment on its load
;   src4               unused in this variant
;   dst                address expression of the output area
;   rounder_op,
;   rounder_arg        unused: both lines that would apply them are commented out
;   shift              shift-count operand handed to every psrad
;
; Every register holds two 32-bit lanes (upper-case / lower-case names in the
; comments). The macro stores eight results, each a movd of two saturated
; 16-bit values:
;   dst+0   (A0+B0)>>shift        dst+64  (A3-B3)>>shift
;   dst+16  (A1+B1)>>shift        dst+80  (A2-B2)>>shift
;   dst+32  (A2+B2)>>shift        dst+96  (A1-B1)>>shift
;   dst+48  (A3+B3)>>shift        dst+112 (A0-B0)>>shift
;
; Reads the coeffs table at byte offsets 16, 24, 48, 56, 64, 72, 80, 88, 96
; and 104. Overwrites mm0-mm7; no emms is issued inside the macro.
%macro	IDCT2		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define shift       %8
  ; --- even part: only [src0] contributes ---
  movq mm0,[src0]               ; R4    R0  r4  r0
  movq mm2,[src1]               ; R3    R1  r3  r1
  movq mm3,[src5]               ; R7    R5  r7  r5
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
  ; rounder_op mm4, rounder_arg  (left commented out: no rounder is applied)
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0   (kept as A3 = A0)
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
  ; rounder_op mm0, rounder_arg  (left commented out: no rounder is applied)
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0   (kept as A2 = A1)
  ; --- odd part, first half: B0 (mm7) and B1 (mm1) ---
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
  paddd mm7,mm1                 ; B0        b0
  movq mm1,[coeffs+72]          ; -C5   -C1 -C5 -C1
  pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
  ; A0-B0 is formed as 2*A0 - (A0+B0), so no extra copy of A0 is needed.
  paddd mm7,mm4                 ; A0+B0     a0+b0
  paddd mm4,mm4                 ; 2A0       2a0
  psubd mm4,mm7                 ; A0-B0     a0-b0
  paddd mm1,mm2                 ; B1        b1
  psrad mm7,shift
  psrad mm4,shift
  movq mm2,mm0                  ; A1        a1
  paddd mm0,mm1                 ; A1+B1     a1+b1
  psubd mm2,mm1                 ; A1-B1     a1-b1
  psrad mm0,shift
  psrad mm2,shift
  ; packssdw reg,reg leaves both saturated words in the low dword for movd.
  packssdw mm7,mm7              ; A0+B0 a0+b0
  movd [dst],mm7
  packssdw mm0,mm0              ; A1+B1 a1+b1
  movd [dst + 16],mm0
  packssdw mm2,mm2              ; A1-B1 a1-b1
  movd [dst + 96],mm2
  packssdw mm4,mm4              ; A0-B0 a0-b0
  movd [dst + 112],mm4
  ; --- odd part, second half: B2 (mm4) and B3 (mm3) ---
  ; [src1] is loaded a second time: the copy in mm2 was overwritten above.
  movq mm0,[src1]               ; R3    R1  r3  r1
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
  pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
  pmaddwd mm0,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
  movq mm2,mm5                  ; A2        a2
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
  paddd mm4,mm7                 ; B2        b2
  paddd mm2,mm4                 ; A2+B2     a2+b2
  psubd mm5,mm4                 ; A2-B2     a2-b2
  psrad mm2,shift
  psrad mm5,shift
  movq mm4,mm6                  ; A3        a3
  paddd mm3,mm0                 ; B3        b3
  paddd mm6,mm3                 ; A3+B3     a3+b3
  psubd mm4,mm3                 ; A3-B3     a3-b3
  psrad mm6,shift
  psrad mm4,shift
  packssdw mm2,mm2              ; A2+B2 a2+b2
  packssdw mm6,mm6              ; A3+B3 a3+b3
  movd [dst + 32],mm2
  packssdw mm4,mm4              ; A3-B3 a3-b3
  packssdw mm5,mm5              ; A2-B2 a2-b2
  movd [dst + 48],mm6
  movd [dst + 64],mm4
  movd [dst + 80],mm5
  ; Drop the single-line aliases so they do not leak past this macro.
%undef  src0
%undef  src4
%undef  src1
%undef  src5
%undef  dst
%undef  rounder_op
%undef  rounder_arg
%undef  shift
%endmacro

;---------------------------------------------------------------------------
; IDCT3
;---------------------------------------------------------------------------

%macro  IDCT3       8
%define src0        %1
%define src4        %2
%define src1        %3
%define src5        %4
%define dst         %5
%define rounder_op  %6
%define rounder_arg %7
%define shift       %8
  movq mm0,[src0]               ; R4    R0  r4  r0
  movq mm2,[src1]               ; R3    R1  r3  r1
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
  ; rounder_op mm4, rounder_arg
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
  ; rounder_op mm0, rounder_arg
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
  movq mm3,[coeffs+64]
  pmaddwd mm3,mm2               ; -C7R3+C3R1    -C7r3+C3r1
  paddd mm7,mm4                 ; A0+B0     a0+b0
  paddd mm4,mm4                 ; 2A0       2a0
  psubd mm4,mm7                 ; A0-B0     a0-b0
  psrad mm7,shift
  psrad mm4,shift
  movq mm1,mm0                  ; A1        a1
  paddd mm0,mm3                 ; A1+B1     a1+b1
  psubd mm1,mm3                 ; A1-B1     a1-b1
  psrad mm0,shift
  psrad mm1,shift

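; Keyboard shortcuts of the online code viewer (not part of the source file):
;
;   Copy code        Ctrl + C
;   Search code      Ctrl + F
;   Full screen      F11
;   Switch theme     Ctrl + Shift + D
;   Show shortcuts   ?
;   Larger font      Ctrl + =
;   Smaller font     Ctrl + -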