📄 dct-a.asm
字号:
;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct-a.asm,v 1.1 2006/02/23 14:51:09 kevin-fu Exp $
;*
;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

;*****************************************************************************
;*                                                                           *
;*  Revision history:                                                        *
;*                                                                           *
;*  2004.04.28  portab all 4x4 function to nasm (CM)                         *
;*                                                                           *
;*****************************************************************************

; Syntax: NASM (Intel operand order), 64-bit mode.
; cglobal, parm1q..parm5d, GLOBAL, firstpush, pushreg and endprolog are not
; defined in this file; presumably they come from amd64inc.asm and hide the
; differences between the two x86-64 calling conventions -- confirm there.

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

;-----------------------------------------------------------------------------
; MMX_ZERO reg
;   Clears reg (xor with itself).
;-----------------------------------------------------------------------------
%macro MMX_ZERO 1
    pxor        %1, %1
%endmacro

;-----------------------------------------------------------------------------
; MMX_LOAD_DIFF_4P dst, tmp, zero, mem_a, mem_b
;   dst = widen(mem_a) - widen(mem_b), as packed 16-bit words.
;   Both operands are loaded with movd and their low bytes are interleaved
;   with the 'zero' register, so 'zero' must hold 0 for this to be a
;   zero-extension (the visible caller passes mm7 right after MMX_ZERO).
;   Clobbers tmp.
;-----------------------------------------------------------------------------
%macro MMX_LOAD_DIFF_4P 5
    movd        %1, %4
    punpcklbw   %1, %3
    movd        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; MMX_LOAD_DIFF_8P dst, tmp, zero, mem_a, mem_b
;   Same sequence as MMX_LOAD_DIFF_4P, but the loads use movq.
;   Clobbers tmp.
;-----------------------------------------------------------------------------
%macro MMX_LOAD_DIFF_8P 5
    movq        %1, %4
    punpcklbw   %1, %3
    movq        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; MMX_SUMSUB_BA a, b
;   In-place word butterfly using no temporary:
;     a' = a + b
;     b' = 2*b - (a + b) = b - a
;-----------------------------------------------------------------------------
%macro MMX_SUMSUB_BA 2
    paddw       %1, %2
    paddw       %2, %2
    psubw       %2, %1
%endmacro

;-----------------------------------------------------------------------------
; MMX_SUMSUB_BADC a, b, c, d
;   Two MMX_SUMSUB_BA butterflies, on (a, b) and on (c, d), with their
;   instructions interleaved:
;     a' = a + b    b' = b - a
;     c' = c + d    d' = d - c
;-----------------------------------------------------------------------------
%macro MMX_SUMSUB_BADC 4
    paddw       %1, %2
    paddw       %3, %4
    paddw       %2, %2
    paddw       %4, %4
    psubw       %2, %1
    psubw       %4, %3
%endmacro

;-----------------------------------------------------------------------------
; MMX_SUMSUB2_AB a, b, out
;     out = a - 2*b
;     a'  = 2*a + b
;   b is left unchanged.
;-----------------------------------------------------------------------------
%macro MMX_SUMSUB2_AB 3
    movq        %3, %1
    paddw       %1, %1
    paddw       %1, %2
    psubw       %3, %2
    psubw       %3, %2
%endmacro

;-----------------------------------------------------------------------------
; MMX_SUMSUBD2_AB a, b, tmp, out
;     a'  = a + (b >> 1)
;     out = (a >> 1) - b
;   Shifts are arithmetic (psraw).  On exit b holds b >> 1 and tmp holds the
;   original b.
;-----------------------------------------------------------------------------
%macro MMX_SUMSUBD2_AB 4
    movq        %4, %1
    movq        %3, %2
    psraw       %2, 1
    psraw       %4, 1
    paddw       %1, %2
    psubw       %4, %3
%endmacro

;-----------------------------------------------------------------------------
; SBUTTERFLY movsuffix, unpacksuffix, a, b, out
;   Interleaves a with b at the element size chosen by unpacksuffix:
;     out = punpckh<unpacksuffix>(a, b)
;     a'  = punpckl<unpacksuffix>(a, b)
;   movsuffix selects the register copy instruction (mov<movsuffix>).
;   b is left unchanged.
;-----------------------------------------------------------------------------
%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

;-----------------------------------------------------------------------------
; input ABCD output ADTC
;   MMX_TRANSPOSE a, b, c, d, t
;   4x4 word transpose of rows held in a..d; t is an extra register.  The
;   transposed rows 0..3 end up in a, d, t, c respectively; b is scratch.
;-----------------------------------------------------------------------------
%macro MMX_TRANSPOSE 5
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro

;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;   SSE2_TRANSPOSE8x8 a, b, c, d, e, f, g, h, t
;   8x8 word transpose built from three rounds of SBUTTERFLY (wd, dq, qdq);
;   t is an extra register.  Register copies use movdqa.
;-----------------------------------------------------------------------------
%macro SSE2_TRANSPOSE8x8 9
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro

;-----------------------------------------------------------------------------
; MMX_STORE_DIFF_4P res, tmp, round, zero, mem
;   mem = pack_unsigned_sat( ((res + round) >> 6) + widen(mem) )
;   'round' is added before the arithmetic shift by 6 (presumably pw_32 --
;   no caller is visible here).  'zero' must hold 0 so that the byte
;   interleave widens the pixels loaded from mem.  Uses movd for the load
;   and the store.  Clobbers res and tmp.
;-----------------------------------------------------------------------------
%macro MMX_STORE_DIFF_4P 5
    paddw       %1, %3
    psraw       %1, 6
    movd        %2, %5
    punpcklbw   %2, %4
    paddsw      %1, %2
    packuswb    %1, %1
    movd        %5, %1
%endmacro

;-----------------------------------------------------------------------------
; MMX_STORE_DIFF_8P res, tmp, zero, mem
;   mem = pack_unsigned_sat( (res >> 6) + widen(mem) )
;   Unlike MMX_STORE_DIFF_4P, no rounding constant is added here.  Uses movq
;   for the load and the store.  Clobbers res and tmp.
;-----------------------------------------------------------------------------
%macro MMX_STORE_DIFF_8P 4
    psraw       %1, 6
    movq        %2, %4
    punpcklbw   %2, %3
    paddsw      %1, %2
    packuswb    %1, %1
    movq        %4, %1
%endmacro

;=============================================================================
; Constants
;=============================================================================

SECTION .rodata align=16

pw_1:   times 8 dw 1                    ; eight words of 1
pw_32:  times 8 dw 32                   ; eight words of 32

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_dct4x4dc_mmxext

ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_dct4x4dc_mmxext( int16_t d[4][4] )
;
;   In:     parm1q = d; the four 8-byte rows at offsets 0, 8, 16 and 24 are
;           read and then overwritten in place.
;   Does:   two passes of (butterfly, butterfly, 4x4 transpose), then every
;           word becomes (x + 1) >> 1 (arithmetic shift).
;   Clobb:  mm0-mm4, mm6.  No emms is issued here.
;-----------------------------------------------------------------------------
x264_dct4x4dc_mmxext:
    movq        mm0, [parm1q+ 0]
    movq        mm1, [parm1q+ 8]
    movq        mm2, [parm1q+16]
    movq        mm3, [parm1q+24]

    MMX_SUMSUB_BADC mm1, mm0, mm3, mm2      ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
    MMX_SUMSUB_BADC mm3, mm1, mm2, mm0      ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23

    MMX_TRANSPOSE   mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0

    MMX_SUMSUB_BADC mm2, mm3, mm0, mm4      ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
    MMX_SUMSUB_BADC mm0, mm2, mm4, mm3      ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23

    MMX_TRANSPOSE   mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3

    ; Round and halve each row, storing rows back as they become ready.
    movq        mm6, [pw_1 GLOBAL]
    paddw       mm0, mm6
    paddw       mm4, mm6
    psraw       mm0, 1
    movq        [parm1q+ 0],mm0
    psraw       mm4, 1
    movq        [parm1q+ 8],mm4
    paddw       mm1, mm6
    paddw       mm3, mm6
    psraw       mm1, 1
    movq        [parm1q+16],mm1
    psraw       mm3, 1
    movq        [parm1q+24],mm3
    ret

cglobal x264_idct4x4dc_mmxext

ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_idct4x4dc_mmxext( int16_t d[4][4] )
;
;   In:     parm1q = d; the four 8-byte rows at offsets 0, 8, 16 and 24 are
;           read and then overwritten in place.
;   Does:   the same two (butterfly, butterfly, 4x4 transpose) passes as
;           x264_dct4x4dc_mmxext, without the final rounding and shift.
;   Clobb:  mm0-mm4.  No emms is issued here.
;-----------------------------------------------------------------------------
x264_idct4x4dc_mmxext:
    movq        mm0, [parm1q+ 0]
    movq        mm1, [parm1q+ 8]
    movq        mm2, [parm1q+16]
    movq        mm3, [parm1q+24]

    MMX_SUMSUB_BADC mm1, mm0, mm3, mm2      ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
    MMX_SUMSUB_BADC mm3, mm1, mm2, mm0      ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23

    MMX_TRANSPOSE   mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0

    MMX_SUMSUB_BADC mm2, mm3, mm0, mm4      ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
    MMX_SUMSUB_BADC mm0, mm2, mm4, mm3      ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23

    MMX_TRANSPOSE   mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3

    movq        [parm1q+ 0], mm0
    movq        [parm1q+ 8], mm4
    movq        [parm1q+16], mm1
    movq        [parm1q+24], mm3
    ret

cglobal x264_sub4x4_dct_mmxext

ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;
;   Register roles after the argument shuffle below:
;     r10 = dct      rax = pix1     rbx = i_pix1 (sign-extended)
;     rcx = pix2     rdx = i_pix2 (sign-extended)
;     mm7 = 0        mm6 = scratch for MMX_LOAD_DIFF_4P
;   rbx is pushed in the prologue.
;
;   NOTE(review): this listing is truncated part-way through the function.
;   Only the instructions that are visible are reproduced; everything after
;   the first butterfly (including whatever restores rbx and returns) is
;   missing and must be restored from the upstream source before this file
;   is assembled.
;-----------------------------------------------------------------------------
x264_sub4x4_dct_mmxext:
    firstpush   rbx
    pushreg     rbx
    endprolog

    mov         r10, parm1q                 ; dct
    mov         rax, parm2q                 ; pix1
%ifdef WIN64
    mov         rcx, parm4q                 ; pix2
    ; 5th argument is read from the stack; the extra +8 presumably accounts
    ; for the rbx push above -- confirm against firstpush in amd64inc.asm.
    movsxd      rdx, dword [rsp+40+8]       ; i_pix2
    movsxd      rbx, parm3d                 ; i_pix1
%else
    ; pix2 is used through rcx without being loaded on this path; presumably
    ; parm4q is already rcx here -- confirm in amd64inc.asm.
    movsxd      rbx, parm3d                 ; i_pix1
    movsxd      rdx, parm5d                 ; i_pix2
%endif

    MMX_ZERO    mm7

    ; Load 4 lines
    MMX_LOAD_DIFF_4P mm0, mm6, mm7, [rax        ], [rcx]
    MMX_LOAD_DIFF_4P mm1, mm6, mm7, [rax+rbx    ], [rcx+rdx]
    MMX_LOAD_DIFF_4P mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
    add         rax, rbx                    ; advance one line so base+stride*2 reaches line 3
    add         rcx, rdx
    MMX_LOAD_DIFF_4P mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]

    MMX_SUMSUB_BADC mm3, mm0, mm2, mm1      ; mm3=s03  mm0=d03  mm2=s12  mm1=d12

    ; NOTE(review): source listing ends here; remainder of the function is missing.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -