idct_mmx.asm
;/******************************************************************************
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is an implementation of a part of one or more MPEG-4 Video tools
; *  as specified in ISO/IEC 14496-2 standard. Those intending to use this
; *  software module in hardware or software products are advised that its
; *  use may infringe existing patents or copyrights, and any such use
; *  would be at such party's own risk. The original developer of this
; *  software module and his/her company, and subsequent editors and their
; *  companies, will have no liability for use of this software or
; *  modifications or derivatives thereof.
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  XviD is distributed in the hope that it will be useful, but
; *  WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; ******************************************************************************/

;/******************************************************************************
; *
; *  idct_mmx.asm, MMX optimized inverse DCT
; *
; *  Initial version provided by Intel at Application Note AP-922
; *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; *  Copyright (C) 1999 Intel Corporation,
; *
; *  corrected and further optimized in idct8x8_xmm.asm
; *  Copyright (C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
; *  Rounding trick Copyright (c) 2000 Michel Lespinasse <walken@zoy.org>
; *
; *  http://www.elecard.com/peter/idct.html
; *  http://www.linuxvideo.org/mpeg2dec/
; *
; *  ported to NASM and some minor changes
; *  Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org>,
; *
; *  For more information visit the XviD homepage: http://www.xvid.org
; *
; ******************************************************************************/

;/******************************************************************************
; *
; *  Revision history:
; *
; *  07.11.2001  bugfix, idct now meets IEEE-1180 (Isibaar)
; *  02.11.2001  initial version (Isibaar)
; *
; ******************************************************************************/

BITS 32

GLOBAL enc__idct_mmx
GLOBAL enc_idct_mmx

%define INP_8 eax + 8
%define OUT_8 eax + 8

%define INP_0 eax
%define INP_1 eax + 16
%define INP_2 eax + 32
%define INP_3 eax + 48
%define INP_4 eax + 64
%define INP_5 eax + 80
%define INP_6 eax + 96
%define INP_7 eax + 112

%define OUT_0 eax
%define OUT_1 eax + 16
%define OUT_2 eax + 32
%define OUT_3 eax + 48
%define OUT_4 eax + 64
%define OUT_5 eax + 80
%define OUT_6 eax + 96
%define OUT_7 eax + 112

%define TABLE_0 ecx
%define TABLE_1 ecx + 64
%define TABLE_2 ecx + 128
%define TABLE_3 ecx + 192
%define TABLE_4 ecx + 256
%define TABLE_5 ecx + 320
%define TABLE_6 ecx + 384
%define TABLE_7 ecx + 448

%define ROUNDER_0 ebx
%define ROUNDER_1 ebx + 8
%define ROUNDER_2 ebx + 16
%define ROUNDER_3 ebx + 24
%define ROUNDER_4 ebx + 32
%define ROUNDER_5 ebx + 40
%define ROUNDER_6 ebx + 48
%define ROUNDER_7 ebx + 56

SECTION .data

ALIGN 16
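; Precision notes (added): BITS_INV_ACC = 5 selects the high-accuracy variant
; of the AP-922 scheme; per the revision history above, this is the setting
; with which the idct meets IEEE-1180.  The row pass keeps BITS_INV_ACC extra
; fraction bits in its results (psrad by SHIFT_INV_ROW = 16 - 5 = 11 instead
; of 16), and the column pass shifts them back out (SHIFT_INV_COL = 1 + 5 = 6).
; The per-row rounder constants below implement the rounding trick credited
; to Michel Lespinasse in the header above.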
BITS_INV_ACC  equ 5
SHIFT_INV_ROW equ 16 - BITS_INV_ACC
SHIFT_INV_COL equ 1 + BITS_INV_ACC

rounder dd  65536, 65536                ; rounder_0
        dd   3597,  3597                ; rounder_1
        dd   2260,  2260                ; rounder_2
        dd   1203,  1203                ; rounder_3
        dd      0,     0                ; rounder_4
        dd    120,   120                ; rounder_5
        dd    512,   512                ; rounder_6
        dd    512,   512                ; rounder_7

tg_1_16   dw  13036,  13036,  13036,  13036
tg_2_16   dw  27146,  27146,  27146,  27146
tg_3_16   dw -21746, -21746, -21746, -21746
ocos_4_16 dw  23170,  23170,  23170,  23170

full_tab dw  16384,  16384,  16384, -16384  ; movq-> w06 w04 w02 w00
         dw  21407,   8867,   8867, -21407  ;        w07 w05 w03 w01
         dw  16384, -16384,  16384,  16384  ;        w14 w12 w10 w08
         dw  -8867,  21407, -21407,  -8867  ;        w15 w13 w11 w09
         dw  22725,  12873,  19266, -22725  ;        w22 w20 w18 w16
         dw  19266,   4520,  -4520, -12873  ;        w23 w21 w19 w17
         dw  12873,   4520,   4520,  19266  ;        w30 w28 w26 w24
         dw -22725,  19266, -12873, -22725  ;        w31 w29 w27 w25

         dw  22725,  22725,  22725, -22725  ; movq-> w06 w04 w02 w00
         dw  29692,  12299,  12299, -29692  ;        w07 w05 w03 w01
         dw  22725, -22725,  22725,  22725  ;        w14 w12 w10 w08
         dw -12299,  29692, -29692, -12299  ;        w15 w13 w11 w09
         dw  31521,  17855,  26722, -31521  ;        w22 w20 w18 w16
         dw  26722,   6270,  -6270, -17855  ;        w23 w21 w19 w17
         dw  17855,   6270,   6270,  26722  ;        w30 w28 w26 w24
         dw -31521,  26722, -17855, -31521  ;        w31 w29 w27 w25

         dw  21407,  21407,  21407, -21407  ; movq-> w06 w04 w02 w00
         dw  27969,  11585,  11585, -27969  ;        w07 w05 w03 w01
         dw  21407, -21407,  21407,  21407  ;        w14 w12 w10 w08
         dw -11585,  27969, -27969, -11585  ;        w15 w13 w11 w09
         dw  29692,  16819,  25172, -29692  ;        w22 w20 w18 w16
         dw  25172,   5906,  -5906, -16819  ;        w23 w21 w19 w17
         dw  16819,   5906,   5906,  25172  ;        w30 w28 w26 w24
         dw -29692,  25172, -16819, -29692  ;        w31 w29 w27 w25

         dw  19266,  19266,  19266, -19266  ; movq-> w06 w04 w02 w00
         dw  25172,  10426,  10426, -25172  ;        w07 w05 w03 w01
         dw  19266, -19266,  19266,  19266  ;        w14 w12 w10 w08
         dw -10426,  25172, -25172, -10426  ;        w15 w13 w11 w09
         dw  26722,  15137,  22654, -26722  ;        w22 w20 w18 w16
         dw  22654,   5315,  -5315, -15137  ;        w23 w21 w19 w17
         dw  15137,   5315,   5315,  22654  ;        w30 w28 w26 w24
         dw -26722,  22654, -15137, -26722  ;        w31 w29 w27 w25

         dw  16384,  16384,  16384, -16384  ; movq-> w06 w04 w02 w00
         dw  21407,   8867,   8867, -21407  ;        w07 w05 w03 w01
         dw  16384, -16384,  16384,  16384  ;        w14 w12 w10 w08
         dw  -8867,  21407, -21407,  -8867  ;        w15 w13 w11 w09
         dw  22725,  12873,  19266, -22725  ;        w22 w20 w18 w16
         dw  19266,   4520,  -4520, -12873  ;        w23 w21 w19 w17
         dw  12873,   4520,   4520,  19266  ;        w30 w28 w26 w24
         dw -22725,  19266, -12873, -22725  ;        w31 w29 w27 w25

         dw  19266,  19266,  19266, -19266  ; movq-> w06 w04 w02 w00
         dw  25172,  10426,  10426, -25172  ;        w07 w05 w03 w01
         dw  19266, -19266,  19266,  19266  ;        w14 w12 w10 w08
         dw -10426,  25172, -25172, -10426  ;        w15 w13 w11 w09
         dw  26722,  15137,  22654, -26722  ;        w22 w20 w18 w16
         dw  22654,   5315,  -5315, -15137  ;        w23 w21 w19 w17
         dw  15137,   5315,   5315,  22654  ;        w30 w28 w26 w24
         dw -26722,  22654, -15137, -26722  ;        w31 w29 w27 w25

         dw  21407,  21407,  21407, -21407  ; movq-> w06 w04 w02 w00
         dw  27969,  11585,  11585, -27969  ;        w07 w05 w03 w01
         dw  21407, -21407,  21407,  21407  ;        w14 w12 w10 w08
         dw -11585,  27969, -27969, -11585  ;        w15 w13 w11 w09
         dw  29692,  16819,  25172, -29692  ;        w22 w20 w18 w16
         dw  25172,   5906,  -5906, -16819  ;        w23 w21 w19 w17
         dw  16819,   5906,   5906,  25172  ;        w30 w28 w26 w24
         dw -29692,  25172, -16819, -29692  ;        w31 w29 w27 w25

         dw  22725,  22725,  22725, -22725  ; movq-> w06 w04 w02 w00
         dw  29692,  12299,  12299, -29692  ;        w07 w05 w03 w01
         dw  22725, -22725,  22725,  22725  ;        w14 w12 w10 w08
         dw -12299,  29692, -29692, -12299  ;        w15 w13 w11 w09
         dw  31521,  17855,  26722, -31521  ;        w22 w20 w18 w16
         dw  26722,   6270,  -6270, -17855  ;        w23 w21 w19 w17
         dw  17855,   6270,   6270,  26722  ;        w30 w28 w26 w24
         dw -31521,  26722, -17855, -31521  ;        w31 w29 w27 w25
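; Layout note (added): full_tab packs eight 64-byte tables of row-pass
; coefficients, one per input row, addressed via TABLE_n = ecx + n*64.
; The tables repeat pairwise (rows 0/4, 1/7, 2/6 and 3/5 share coefficients),
; reflecting the symmetry of the per-row cosine scale factors.  Each dw line
; holds four 16-bit weights in the order pmaddwd consumes them; the w..
; comments give the weight indices.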
SECTION .text

;; void enc_idct_mmx(short *block);

enc__idct_mmx:
enc_idct_mmx:
    push ebx
    push edi

    mov INP_0, dword [esp + 12]     ; block
    mov TABLE_0, full_tab
    mov ROUNDER_0, rounder

    movq mm0, [INP_0]               ; 0 ; x3 x2 x1 x0
    movq mm1, [INP_0+8]             ; 1 ; x7 x6 x5 x4
    movq mm2, mm0                   ; 2 ; x3 x2 x1 x0
    movq mm3, [TABLE_0]             ; 3 ; w06 w04 w02 w00
    punpcklwd mm0, mm1              ; x5 x1 x4 x0
    movq mm5, mm0                   ; 5 ; x5 x1 x4 x0
    punpckldq mm0, mm0              ; x4 x0 x4 x0
    movq mm4, [TABLE_0+8]           ; 4 ; w07 w05 w03 w01
    punpckhwd mm2, mm1              ; 1 ; x7 x3 x6 x2
    pmaddwd mm3, mm0                ; x4*w06+x0*w04 x4*w02+x0*w00
    movq mm6, mm2                   ; 6 ; x7 x3 x6 x2
    movq mm1, [TABLE_0+32]          ; 1 ; w22 w20 w18 w16
    punpckldq mm2, mm2              ; x6 x2 x6 x2
    pmaddwd mm4, mm2                ; x6*w07+x2*w05 x6*w03+x2*w01
    punpckhdq mm5, mm5              ; x5 x1 x5 x1
    pmaddwd mm0, [TABLE_0+16]       ; x4*w14+x0*w12 x4*w10+x0*w08
    punpckhdq mm6, mm6              ; x7 x3 x7 x3
    movq mm7, [TABLE_0+40]          ; 7 ; w23 w21 w19 w17
    pmaddwd mm1, mm5                ; x5*w22+x1*w20 x5*w18+x1*w16
    paddd mm3, [ROUNDER_0]          ; +rounder
    pmaddwd mm7, mm6                ; x7*w23+x3*w21 x7*w19+x3*w17
    pmaddwd mm2, [TABLE_0+24]       ; x6*w15+x2*w13 x6*w11+x2*w09
    paddd mm3, mm4                  ; 4 ; a1=sum(even1) a0=sum(even0)
    pmaddwd mm5, [TABLE_0+48]       ; x5*w30+x1*w28 x5*w26+x1*w24
    movq mm4, mm3                   ; 4 ; a1 a0
    pmaddwd mm6, [TABLE_0+56]       ; x7*w31+x3*w29 x7*w27+x3*w25
    paddd mm1, mm7                  ; 7 ; b1=sum(odd1) b0=sum(odd0)
    paddd mm0, [ROUNDER_0]          ; +rounder
    psubd mm3, mm1                  ; a1-b1 a0-b0
    psrad mm3, SHIFT_INV_ROW        ; y6=a1-b1 y7=a0-b0
    paddd mm1, mm4                  ; 4 ; a1+b1 a0+b0
    paddd mm0, mm2                  ; 2 ; a3=sum(even3) a2=sum(even2)
    psrad mm1, SHIFT_INV_ROW        ; y1=a1+b1 y0=a0+b0
    paddd mm5, mm6                  ; 6 ; b3=sum(odd3) b2=sum(odd2)
    movq mm4, mm0                   ; 4 ; a3 a2
    paddd mm0, mm5                  ; a3+b3 a2+b2
    psubd mm4, mm5                  ; 5 ; a3-b3 a2-b2
    psrad mm0, SHIFT_INV_ROW        ; y3=a3+b3 y2=a2+b2
    psrad mm4, SHIFT_INV_ROW        ; y4=a3-b3 y5=a2-b2
    packssdw mm1, mm0               ; 0 ; y3 y2 y1 y0
    packssdw mm4, mm3               ; 3 ; y6 y7 y4 y5
    movq mm7, mm4                   ; 7 ; y6 y7 y4 y5
    psrld mm4, 16                   ; 0 y6 0 y4
    pslld mm7, 16                   ; y7 0 y5 0
    movq [OUT_0], mm1               ; 1 ; save y3 y2 y1 y0
    por mm7, mm4                    ; 4 ; y7 y6 y5 y4
    movq [OUT_0+8], mm7             ; 7 ; save y7 y6 y5 y4
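; Note (added): each of the eight input rows is processed by an identical
; unrolled block; from row to row only INP_n/OUT_n, TABLE_n and ROUNDER_n
; change.  Row 1 follows below.  A hypothetical cdecl caller (identifier
; names assumed, not part of this file) would look like:
;
;     push dword block        ; short block[64], transformed in place
;     call enc_idct_mmx
;     add esp, 4
;     emms                    ; clear MMX state here if the callee does not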
    movq mm0, [INP_1]               ; 0 ; x3 x2 x1 x0
    movq mm1, [INP_1+8]             ; 1 ; x7 x6 x5 x4
    movq mm2, mm0                   ; 2 ; x3 x2 x1 x0
    movq mm3, [TABLE_1]             ; 3 ; w06 w04 w02 w00
    punpcklwd mm0, mm1              ; x5 x1 x4 x0
    movq mm5, mm0                   ; 5 ; x5 x1 x4 x0
    punpckldq mm0, mm0              ; x4 x0 x4 x0
    movq mm4, [TABLE_1+8]           ; 4 ; w07 w05 w03 w01
    punpckhwd mm2, mm1              ; 1 ; x7 x3 x6 x2
    pmaddwd mm3, mm0                ; x4*w06+x0*w04 x4*w02+x0*w00
    movq mm6, mm2                   ; 6 ; x7 x3 x6 x2
    movq mm1, [TABLE_1+32]          ; 1 ; w22 w20 w18 w16
    punpckldq mm2, mm2              ; x6 x2 x6 x2
    pmaddwd mm4, mm2                ; x6*w07+x2*w05 x6*w03+x2*w01
    punpckhdq mm5, mm5              ; x5 x1 x5 x1
    pmaddwd mm0, [TABLE_1+16]       ; x4*w14+x0*w12 x4*w10+x0*w08
    punpckhdq mm6, mm6              ; x7 x3 x7 x3
    movq mm7, [TABLE_1+40]          ; 7 ; w23 w21 w19 w17
    pmaddwd mm1, mm5                ; x5*w22+x1*w20 x5*w18+x1*w16
    paddd mm3, [ROUNDER_1]          ; +rounder
    pmaddwd mm7, mm6                ; x7*w23+x3*w21 x7*w19+x3*w17
    pmaddwd mm2, [TABLE_1+24]       ; x6*w15+x2*w13 x6*w11+x2*w09
    paddd mm3, mm4                  ; 4 ; a1=sum(even1) a0=sum(even0)
    pmaddwd mm5, [TABLE_1+48]       ; x5*w30+x1*w28 x5*w26+x1*w24
    movq mm4, mm3                   ; 4 ; a1 a0
    pmaddwd mm6, [TABLE_1+56]       ; x7*w31+x3*w29 x7*w27+x3*w25
    paddd mm1, mm7                  ; 7 ; b1=sum(odd1) b0=sum(odd0)
    paddd mm0, [ROUNDER_1]          ; +rounder
    psubd mm3, mm1                  ; a1-b1 a0-b0
    psrad mm3, SHIFT_INV_ROW        ; y6=a1-b1 y7=a0-b0
    paddd mm1, mm4                  ; 4 ; a1+b1 a0+b0
    paddd mm0, mm2                  ; 2 ; a3=sum(even3) a2=sum(even2)
    psrad mm1, SHIFT_INV_ROW        ; y1=a1+b1 y0=a0+b0
    paddd mm5, mm6                  ; 6 ; b3=sum(odd3) b2=sum(odd2)
    movq mm4, mm0                   ; 4 ; a3 a2
    paddd mm0, mm5                  ; a3+b3 a2+b2
    psubd mm4, mm5                  ; 5 ; a3-b3 a2-b2
    psrad mm0, SHIFT_INV_ROW        ; y3=a3+b3 y2=a2+b2
    psrad mm4, SHIFT_INV_ROW        ; y4=a3-b3 y5=a2-b2
    packssdw mm1, mm0               ; 0 ; y3 y2 y1 y0
    packssdw mm4, mm3               ; 3 ; y6 y7 y4 y5
    movq mm7, mm4                   ; 7 ; y6 y7 y4 y5
    psrld mm4, 16                   ; 0 y6 0 y4

; (listing truncated here: the remaining rows and the column pass, which
; would use TABLE_2..TABLE_7 and the tg_*/ocos_4_16 constants above, are
; not included in this excerpt)