📄 dct-a.asm
字号:
;*****************************************************************************
;* dct-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

; Syntax: NASM/YASM, Intel operand order.  Register names r0.., m0.. and the
; cglobal / RET / REP_RET / GLOBAL / INIT_MMX / INIT_XMM helpers come from the
; two include files below.

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_32:             times 8 dw 32        ; rounding term added before ">> 6"
pw_8000:           times 8 dw 0x8000    ; sign-bias used by SUMSUB_17BIT
hsub_mul:          times 8 db 1, -1
pb_sub4frame:      db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea:    db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb:    db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack:  db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1:              times 16 db 1

SECTION .text

;-----------------------------------------------------------------------------
; WALSH4_1D a, b, c, d, tmp
;   All arguments are register indices (m%N).  Expands to two SUMSUB_BADC
;   passes over m%1..m%4 followed by a SWAP that renames the registers.
;   %5 is handed to SUMSUB_BADC as its last operand (presumably a scratch
;   register; SUMSUB_BADC is defined in x86util.asm).
;-----------------------------------------------------------------------------
%macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
    SWAP %1, %4, %3
%endmacro

;-----------------------------------------------------------------------------
; SUMSUB_17BIT a, b, tmp, bias
;   a, b, tmp, bias are register indices; m%4 must hold 0x8000 in every word.
;   pavgw is an unsigned rounding average, so the signed inputs are biased by
;   0x8000 before the averages and un-biased afterwards.  The two averages
;   computed are (a + b + 1) >> 1 and (a - b + 1) >> 1; the trailing SWAP
;   renames the registers holding them.
;   Clobbers: m%3.
;-----------------------------------------------------------------------------
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
    movq    m%3, m%4
    pxor    m%1, m%4            ; a  -> biased
    psubw   m%3, m%2            ; tmp = 0x8000 - b  (biased -b)
    pxor    m%2, m%4            ; b  -> biased
    pavgw   m%3, m%1            ; biased average of a and -b
    pavgw   m%2, m%1            ; biased average of a and  b
    pxor    m%3, m%4            ; remove bias
    pxor    m%2, m%4            ; remove bias
    SWAP    %1, %2, %3
%endmacro

INIT_MMX

;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;   In:  r0 = d (32 bytes, transformed in place)
;   Uses m0-m5 and m7.
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1
    movq    m3, [r0+24]
    movq    m2, [r0+16]
    movq    m1, [r0+ 8]
    movq    m0, [r0+ 0]
    movq    m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
    WALSH4_1D     0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC   m1, m0, m3, m2, m4
    SWAP          0, 1
    SWAP          2, 3
    SUMSUB_17BIT  0,2,4,7
    SUMSUB_17BIT  1,3,5,7
    movq    [r0+0],  m0
    movq    [r0+8],  m2
    movq    [r0+16], m3
    movq    [r0+24], m1
    RET

;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;   In:  r0 = d (32 bytes, transformed in place)
;   Two WALSH4_1D passes with a 4x4 word transpose in between.
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
    movq    m3, [r0+24]
    movq    m2, [r0+16]
    movq    m1, [r0+ 8]
    movq    m0, [r0+ 0]
    WALSH4_1D     0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D     0,1,2,3,4
    movq    [r0+ 0], m0
    movq    [r0+ 8], m1
    movq    [r0+16], m2
    movq    [r0+24], m3
    RET

;-----------------------------------------------------------------------------
; SUB_DCT4 suffix
;   Emits x264_sub4x4_dct_<suffix>.  The mmx variant loads the four row
;   differences with LOAD_DIFF and exposes a .skip_prologue entry point; any
;   other suffix uses the hsub_mul constant with LOAD_DIFF8x4_SSSE3.
;-----------------------------------------------------------------------------
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;   In:  r0 = dct (output), r1 = pix1 (stride FENC_STRIDE),
;        r2 = pix2 (stride FDEC_STRIDE)
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_%1, 3,3
%ifidn %1, mmx
.skip_prologue:
    LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
%else
    mova    m5, [hsub_mul GLOBAL]
    LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
%endif
    DCT4_1D       0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    DCT4_1D       0,1,2,3,4
    movq    [r0+ 0], m0
    movq    [r0+ 8], m1
    movq    [r0+16], m2
    movq    [r0+24], m3
    RET
%endmacro

SUB_DCT4 mmx
SUB_DCT4 ssse3

;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;   In:  r0 = p_dst (stride FDEC_STRIDE), r1 = dct
;   m7 is zeroed by the prologue; callers entering at .skip_prologue must
;   provide m7 themselves (it is passed to STORE_DIFF below).
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2
    pxor    m7, m7
.skip_prologue:
    movq    m1, [r1+ 8]
    movq    m3, [r1+24]
    movq    m2, [r1+16]
    movq    m0, [r1+ 0]
    IDCT4_1D      0,1,2,3,4,5
    TRANSPOSE4x4W 0,1,2,3,4
    paddw   m0, [pw_32 GLOBAL]
    IDCT4_1D      0,1,2,3,4,5
    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
    RET

;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
; SUB_NxN_DCT name, subfunc, dct_step, %4, %5, %6
;   Emits function <name>, which calls <subfunc> four times.
;     %3         : bytes added to r0 after each of the first three calls
;     %4, %5, %6 : terms of the r1/r2 pointer adjustments between calls
;                  (see the expressions below)
;   With mmsize == 8, m7 is zeroed in the prologue; otherwise r2 is advanced
;   by 4*FDEC_STRIDE and m7 is loaded with hsub_mul.
;   WIN64: 8 bytes of padding are kept on the stack across all four calls
;   (presumably so the nested calls run with a 16-byte aligned stack; the
;   exact prologue layout is defined by cglobal in x86inc.asm).
%macro SUB_NxN_DCT 6
cglobal %1, 3,3,11
%if mmsize == 8
    pxor    m7, m7
%else
    add     r2, 4*FDEC_STRIDE
    mova    m7, [hsub_mul GLOBAL]
%endif
.skip_prologue:
%ifdef WIN64
    sub     rsp, 8
%endif
    call    %2
    add     r0, %3
    add     r1, %4-%5-%6*FENC_STRIDE
    add     r2, %4-%5-%6*FDEC_STRIDE
    call    %2
    add     r0, %3
    add     r1, (%4-%6)*FENC_STRIDE-%5-%4
    add     r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call    %2
    add     r0, %3
    add     r1, %4-%5-%6*FENC_STRIDE
    add     r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
    ; Make the fourth call with the padding still in place so that it sees
    ; the same stack alignment as the first three; release it afterwards.
    call    %2
    add     rsp, 8
    RET
%else
    jmp     %2                  ; tail call: the sub-function returns for us
%endif
%endmacro

;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
; ADD_NxN_IDCT name, subfunc, dct_step, %4, %5, %6 [, %7]
;   Emits function <name>, which calls <subfunc> four times.
;     %3         : bytes added to r1 after each of the first three calls
;     %4, %5, %6 : terms of the r0 pointer adjustments between calls
;     %7         : accepted but not referenced in this macro body
;   m7 is zeroed in the prologue; with mmsize == 16, r0 is additionally
;   advanced by 4*FDEC_STRIDE.
;   WIN64 stack handling mirrors SUB_NxN_DCT.
%macro ADD_NxN_IDCT 6-7
cglobal %1, 2,2,11
    pxor    m7, m7
%if mmsize==16
    add     r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
%ifdef WIN64
    sub     rsp, 8
%endif
    call    %2
    add     r0, %4-%5-%6*FDEC_STRIDE
    add     r1, %3
    call    %2
    add     r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add     r1, %3
    call    %2
    add     r0, %4-%5-%6*FDEC_STRIDE
    add     r1, %3
%ifdef WIN64
    ; Same ordering as SUB_NxN_DCT: call first, then drop the padding.
    call    %2
    add     rsp, 8
    RET
%else
    jmp     %2                  ; tail call: the sub-function returns for us
%endif
%endmacro

%ifndef ARCH_X86_64
SUB_NxN_DCT  x264_sub8x8_dct_mmx,     x264_sub4x4_dct_mmx.skip_prologue,  32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx,    x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT  x264_sub16x16_dct_mmx,   x264_sub8x8_dct_mmx.skip_prologue,  32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx,  x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4

cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx.skip_prologue,  128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%endif

INIT_XMM

cextern x264_sub8x8_dct_sse2.skip_prologue
cextern x264_sub8x8_dct_ssse3.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2.skip_prologue,  128, 8, 0, 0
SUB_NxN_DCT  x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
cextern x264_add8x8_idct_sse2.skip_prologue
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0

cextern x264_sub8x8_dct8_sse2.skip_prologue
cextern x264_add8x8_idct8_sse2.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2.skip_prologue,  128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0

cextern x264_sub8x8_dct8_ssse3.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0

;-----------------------------------------------------------------------------
; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
;-----------------------------------------------------------------------------
; ADD_DC add_reg, sub_reg, base
;   For four rows of 8 bytes at <base> (row stride FDEC_STRIDE):
;       row = saturating_sub( saturating_add( row, add_reg ), sub_reg )
;   using unsigned byte saturation (paddusb / psubusb).
;   Clobbers: mm4, mm5, mm6 and add_reg (%1 is reused for the fourth row).
%macro ADD_DC 3
    movq    mm4, [%3+FDEC_STRIDE*0]
    movq    mm5, [%3+FDEC_STRIDE*1]
    movq    mm6, [%3+FDEC_STRIDE*2]
    paddusb mm4, %1
    paddusb mm5, %1
    paddusb mm6, %1
    paddusb %1,  [%3+FDEC_STRIDE*3]
    psubusb mm4, %2
    psubusb mm5, %2
    psubusb mm6, %2
    psubusb %1,  %2
    movq    [%3+FDEC_STRIDE*0], mm4
    movq    [%3+FDEC_STRIDE*1], mm5
    movq    [%3+FDEC_STRIDE*2], mm6
    movq    [%3+FDEC_STRIDE*3], %1
%endmacro

;-----------------------------------------------------------------------------
; x264_add8x8_idct_dc_mmx
;   In:  r0 = destination pixels (stride FDEC_STRIDE), r1 = four 16-bit DCs
;   Each DC is rounded as (dc + 32) >> 6.  mm0/mm2 receive the value clamped
;   to [0,255] and mm1/mm3 the clamped negation, each replicated over four
;   bytes, so the signed add is done as a saturating add followed by a
;   saturating subtract.
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_dc_mmx, 2,2
    movq      mm0, [r1]
    pxor      mm1, mm1
    add       r0,  FDEC_STRIDE*4
    paddw     mm0, [pw_32 GLOBAL]
    psraw     mm0, 6
    psubw     mm1, mm0              ; mm1 = -dc
    packuswb  mm0, mm0              ; clamp  dc to unsigned bytes
    packuswb  mm1, mm1              ; clamp -dc to unsigned bytes
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    pshufw    mm2, mm0, 0xFA        ; words 2,2,3,3 -> DCs 2 and 3
    pshufw    mm3, mm1, 0xFA
    punpcklbw mm0, mm0              ; DCs 0 and 1, four bytes each
    punpcklbw mm1, mm1
    ADD_DC    mm0, mm1, r0-FDEC_STRIDE*4
    ADD_DC    mm2, mm3, r0
    RET

;-----------------------------------------------------------------------------
; x264_add8x8_idct_dc_ssse3
;   In:  r0 = destination pixels (stride FDEC_STRIDE), r1 = four 16-bit DCs
;   Same arithmetic as the mmx version; the byte replication is done with
;   pshufb and pb_idctdc_unpack, and each xmm register carries row k in its
;   low half and row k+4 in its high half.
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_dc_ssse3, 2,2
    movq      xmm0, [r1]
    pxor      xmm1, xmm1
    add       r0,   FDEC_STRIDE*4
    paddw     xmm0, [pw_32 GLOBAL]
    psraw     xmm0, 6
    psubw     xmm1, xmm0
    movdqa    xmm5, [pb_idctdc_unpack GLOBAL]
    packuswb  xmm0, xmm0
    packuswb  xmm1, xmm1
    pshufb    xmm0, xmm5
    pshufb    xmm1, xmm5
    movq      xmm2, [r0+FDEC_STRIDE*-4]
    movq      xmm3, [r0+FDEC_STRIDE*-3]
    movq      xmm4, [r0+FDEC_STRIDE*-2]
    movq      xmm5, [r0+FDEC_STRIDE*-1]
    movhps    xmm2, [r0+FDEC_STRIDE* 0]
    movhps    xmm3, [r0+FDEC_STRIDE* 1]
    movhps    xmm4, [r0+FDEC_STRIDE* 2]
    movhps    xmm5, [r0+FDEC_STRIDE* 3]
    paddusb   xmm2, xmm0
    paddusb   xmm3, xmm0
    paddusb   xmm4, xmm0
    paddusb   xmm5, xmm0
    psubusb   xmm2, xmm1
    psubusb   xmm3, xmm1
    psubusb   xmm4, xmm1
    psubusb   xmm5, xmm1
    movq      [r0+FDEC_STRIDE*-4], xmm2
    movq      [r0+FDEC_STRIDE*-3], xmm3
    movq      [r0+FDEC_STRIDE*-2], xmm4
    movq      [r0+FDEC_STRIDE*-1], xmm5
    movhps    [r0+FDEC_STRIDE* 0], xmm2
    movhps    [r0+FDEC_STRIDE* 1], xmm3
    movhps    [r0+FDEC_STRIDE* 2], xmm4
    movhps    [r0+FDEC_STRIDE* 3], xmm5
    RET

;-----------------------------------------------------------------------------
; x264_add16x16_idct_dc_mmx
;   In:  r0 = destination pixels (stride FDEC_STRIDE), r1 = 16-bit DCs
;   Loops four times (r2 counts down from 4); each iteration consumes four
;   DCs (8 bytes of r1) and updates four rows of 16 pixels via two ADD_DC
;   expansions (columns 0-7 and 8-15).
;-----------------------------------------------------------------------------
cglobal x264_add16x16_idct_dc_mmx, 2,3
    mov       r2, 4
.loop:
    movq      mm0, [r1]
    pxor      mm1, mm1
    paddw     mm0, [pw_32 GLOBAL]
    psraw     mm0, 6
    psubw     mm1, mm0
    packuswb  mm0, mm0
    packuswb  mm1, mm1
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    pshufw    mm2, mm0, 0xFA
    pshufw    mm3, mm1, 0xFA
    punpcklbw mm0, mm0
    punpcklbw mm1, mm1
    ADD_DC    mm0, mm1, r0
    ADD_DC    mm2, mm3, r0+8
    add       r1, 8
    add       r0, FDEC_STRIDE*4
    dec       r2
    jg        .loop
    REP_RET

;-----------------------------------------------------------------------------
; IDCT_DC_STORE offset, add_reg, sub_reg
;   For four rows of 16 bytes at r0+<offset> (row stride FDEC_STRIDE):
;       row = saturating_sub( saturating_add( row, add_reg ), sub_reg )
;   Uses movdqa, so the addressed rows must be 16-byte aligned.
;   Clobbers: xmm4, xmm5, xmm6, xmm7.
;-----------------------------------------------------------------------------
%macro IDCT_DC_STORE 3
    movdqa  xmm4, [r0+%1+FDEC_STRIDE*0]
    movdqa  xmm5, [r0+%1+FDEC_STRIDE*1]
    movdqa  xmm6, [r0+%1+FDEC_STRIDE*2]
    movdqa  xmm7, [r0+%1+FDEC_STRIDE*3]
    paddusb xmm4, %2
    paddusb xmm5, %2
    paddusb xmm6, %2
    paddusb xmm7, %2
    psubusb xmm4, %3
    psubusb xmm5, %3
    psubusb xmm6, %3
    psubusb xmm7, %3
    movdqa  [r0+%1+FDEC_STRIDE*0], xmm4
    movdqa  [r0+%1+FDEC_STRIDE*1], xmm5
    movdqa  [r0+%1+FDEC_STRIDE*2], xmm6
    movdqa  [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro

;-----------------------------------------------------------------------------
; x264_add16x16_idct_dc_sse2
;   In:  r0 = destination pixels (stride FDEC_STRIDE), r1 = 16-bit DCs
;   .loop is a local subroutine that consumes 16 bytes of r1 and updates
;   eight rows.  It runs twice: once via "call .loop" and once more either
;   via a second call (WIN64, so that RET can run the cglobal epilogue) or
;   by falling through into it, in which case its "ret" returns to the
;   caller of this function.
;-----------------------------------------------------------------------------
cglobal x264_add16x16_idct_dc_sse2, 2,2,8
    call      .loop
    add       r0, FDEC_STRIDE*4
%ifdef WIN64
    call      .loop
    RET
%endif
.loop:
    add       r0, FDEC_STRIDE*4
    movq      xmm0, [r1+0]
    movq      xmm2, [r1+8]
    add       r1, 16
    punpcklwd xmm0, xmm0
    punpcklwd xmm2, xmm2
    pxor      xmm1, xmm1
    pxor      xmm3, xmm3
    paddw     xmm0, [pw_32 GLOBAL]
    paddw     xmm2, [pw_32 GLOBAL]
    psraw     xmm0, 6
    psraw     xmm2, 6
    psubw     xmm1, xmm0
    psubw     xmm3, xmm2
    packuswb  xmm0, xmm1            ; low half: clamped dc, high half: clamped -dc
    packuswb  xmm2, xmm3
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    punpcklbw xmm0, xmm0
    punpcklbw xmm2, xmm2
    punpckhbw xmm1, xmm1
    punpckhbw xmm3, xmm3
    IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
    IDCT_DC_STORE 0, xmm2, xmm3
    ret

;-----------------------------------------------------------------------------
; x264_add16x16_idct_dc_ssse3
;   In:  r0 = destination pixels (stride FDEC_STRIDE), r1 = 16-bit DCs
;   Same call/fall-through structure as the sse2 version above.
;   NOTE(review): this listing is cut off after the last instruction below;
;   the remainder of the function is not visible in this excerpt.
;-----------------------------------------------------------------------------
cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
    call      .loop
    add       r0, FDEC_STRIDE*4
%ifdef WIN64
    call      .loop
    RET
%endif
.loop:
    add       r0, FDEC_STRIDE*4
    movdqa    xmm0, [r1]
    add       r1, 16
    pxor      xmm1, xmm1
    paddw     xmm0, [pw_32 GLOBAL]
    psraw     xmm0, 6
    psubw     xmm1, xmm0
    movdqa    xmm5, [ pb_idctdc_unpack GLOBAL]
    movdqa    xmm6, [pb_idctdc_unpack2 GLOBAL]
    packuswb  xmm0, xmm0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -