⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quant-a.asm

📁 在dsp上实现h.264编解码
💻 ASM
字号:
;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 SHU264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

;*****************************************************************************
;*                                                                           *
;*  Revision history:                                                        *
;*                                                                           *
;*  2005.07.26  quant 4x4 & 8x8 MMX functions (AI)                           *
;*  2005.09.04  quant MMXEXT (added precision) and DC (CH)                   *
;*  2005.09.21  faster MMX and added MMXEXT16 (CH)                           *
;*                                                                           *
;*****************************************************************************

;-----------------------------------------------------------------------------
; File conventions
;   Syntax : NASM (Intel operand order), 32-bit code.
;   ABI    : __cdecl - all arguments are read from the stack at [esp+4..16].
;   Regs   : only eax, ecx, edx and mm0-mm7 are written.
;   Note   : no routine in this file executes EMMS.
;-----------------------------------------------------------------------------

BITS 32

;-----------------------------------------------------------------------------
; cglobal name
;   Export `name`.  When PREFIX is defined the symbol is exported with a
;   leading underscore and `name` is redefined to that decorated form.
;-----------------------------------------------------------------------------
%macro cglobal 1
    %ifdef PREFIX
        global _%1
        %define %1 _%1
    %else
        global %1
    %endif
%endmacro

SECTION .rodata

pw_1:  times 4 dw 1             ; four words of 1
pd_1:  times 2 dd 1             ; two dwords of 1

SECTION .text

cglobal SHU264_quant_2x2_dc_core15_mmx
cglobal SHU264_quant_4x4_dc_core15_mmx
cglobal SHU264_quant_4x4_core15_mmx
cglobal SHU264_quant_8x8_core15_mmx

cglobal SHU264_quant_2x2_dc_core16_mmxext
cglobal SHU264_quant_4x4_dc_core16_mmxext
cglobal SHU264_quant_4x4_core16_mmxext
cglobal SHU264_quant_8x8_core16_mmxext

cglobal SHU264_quant_2x2_dc_core32_mmxext
cglobal SHU264_quant_4x4_dc_core32_mmxext
cglobal SHU264_quant_4x4_core32_mmxext
cglobal SHU264_quant_8x8_core32_mmxext

cglobal SHU264_dequant_4x4_mmx
cglobal SHU264_dequant_8x8_mmx

;=============================================================================
; quant, 15-bit multiplier (plain MMX)
;=============================================================================

;-----------------------------------------------------------------------------
; MMX_QUANT_AC_START
;   Argument fetch shared by every per-coefficient (AC) quant routine.
;   Out: eax = dct pointer, ecx = quant_mf pointer,
;        mm6 = i_qbits (low dword), mm7 = f replicated into both dwords.
;-----------------------------------------------------------------------------
%macro MMX_QUANT_AC_START 0
    mov         eax, [esp+ 4]   ; &dct[0][0]
    mov         ecx, [esp+ 8]   ; &quant_mf[0][0]
    movd        mm6, [esp+12]   ; i_qbits
    movd        mm7, [esp+16]   ; f
    punpckldq   mm7, mm7        ; f in each dword
%endmacro

;-----------------------------------------------------------------------------
; MMX_QUANT15_DC_START
;   Argument fetch for the DC routines that use one multiplier for all coeffs.
;   Out: eax = dct pointer, mm5 = low word of i_qmf replicated into 4 words,
;        mm6 = i_qbits (low dword), mm7 = f replicated into both dwords.
;-----------------------------------------------------------------------------
%macro MMX_QUANT15_DC_START 0
    mov         eax, [esp+ 4]   ; &dct[0][0]
    movd        mm5, [esp+ 8]   ; i_qmf
    movd        mm6, [esp+12]   ; i_qbits
    movd        mm7, [esp+16]   ; f
    punpcklwd   mm5, mm5
    punpcklwd   mm5, mm5        ; i_qmf in each word
    punpckldq   mm7, mm7        ; f in each dword
%endmacro

;-----------------------------------------------------------------------------
; MMX_QUANT15_1x4 dct, mf, qbits, f
;   Quantises four int16 coefficients in place:
;       dct = sign(dct) * ( ( |dct| * mf + f ) >> qbits )
;   The 32-bit product is assembled from pmullw (low half) and pmulhw (signed
;   high half), so mf is interpreted as a signed 16-bit value.
;
;   %1      (m64)       dct[y][x]
;   %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;   %3      (mmx)       i_qbits in the low doubleword
;   %4      (mmx)       f as doublewords
;   trashes mm0-mm2,mm4
;-----------------------------------------------------------------------------
%macro MMX_QUANT15_1x4 4
    movq        mm0, %1     ; load dct coeffs
    pxor        mm4, mm4
    pcmpgtw     mm4, mm0    ; sign(coeff): 0xFFFF where coeff < 0
    pxor        mm0, mm4
    psubw       mm0, mm4    ; abs(coeff)
    movq        mm2, mm0
    pmullw      mm0, %2     ; low  16 bits of |coeff| * mf
    pmulhw      mm2, %2     ; high 16 bits of |coeff| * mf (signed)
    movq        mm1, mm0
    punpcklwd   mm0, mm2    ; 32-bit products of coeffs 0,1
    punpckhwd   mm1, mm2    ; 32-bit products of coeffs 2,3
    paddd       mm0, %4     ; round with f
    paddd       mm1, %4
    psrad       mm0, %3
    psrad       mm1, %3
    packssdw    mm0, mm1    ; pack (signed saturation)
    pxor        mm0, mm4    ; restore sign
    psubw       mm0, mm4
    movq        %1, mm0     ; store
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
;       int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_2x2_dc_core15_mmx:
    MMX_QUANT15_DC_START
    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7    ; all 4 coeffs in one pass
    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
;       int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_4x4_dc_core15_mmx:
    MMX_QUANT15_DC_START

%rep 4                                      ; 4 rows of 4 coeffs
    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
    add         eax, byte 8
%endrep

    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_4x4_core15_mmx( int16_t dct[4][4],
;       int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_4x4_core15_mmx:
    MMX_QUANT_AC_START

%rep 4                                      ; 4 rows of 4 coeffs
    movq        mm5, [ecx]
    packssdw    mm5, [ecx+8]                ; 4 x int32 quant_mf -> 4 x int16
    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
    add         ecx, byte 16
    add         eax, byte 8
%endrep

    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_8x8_core15_mmx( int16_t dct[8][8],
;       int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_8x8_core15_mmx:
    MMX_QUANT_AC_START

%rep 16                                     ; 16 groups of 4 coeffs
    movq        mm5, [ecx]
    packssdw    mm5, [ecx+8]                ; 4 x int32 quant_mf -> 4 x int16
    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
    add         ecx, byte 16
    add         eax, byte 8
%endrep

    ret

;=============================================================================
; quant, 16-bit multiplier (MMXEXT: pshufw, pmulhuw)
;=============================================================================

;-----------------------------------------------------------------------------
; MMXEXT_QUANT16_DC_START
;   Same outputs as MMX_QUANT15_DC_START; the word broadcast of i_qmf is done
;   with a single pshufw.
;-----------------------------------------------------------------------------
%macro MMXEXT_QUANT16_DC_START 0
    mov         eax, [esp+ 4]   ; &dct[0][0]
    movd        mm5, [esp+ 8]   ; i_qmf
    movd        mm6, [esp+12]   ; i_qbits
    movd        mm7, [esp+16]   ; f
    pshufw      mm5, mm5, 0     ; i_qmf in each word
    punpckldq   mm7, mm7        ; f in each dword
%endmacro

;-----------------------------------------------------------------------------
; MMXEXT_QUANT16_1x4 dct, mf, qbits, f
;   Same computation as MMX_QUANT15_1x4, except that the high half of the
;   product comes from pmulhuw, so mf is interpreted as unsigned 16-bit.
;
;   %1      (m64)       dct[y][x]
;   %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
;   %3      (mmx)       i_qbits in the low doubleword
;   %4      (mmx)       f as doublewords
;   trashes mm0-mm2,mm4
;-----------------------------------------------------------------------------
%macro MMXEXT_QUANT16_1x4 4
    movq        mm0, %1     ; load dct coeffs
    pxor        mm4, mm4
    pcmpgtw     mm4, mm0    ; sign(coeff): 0xFFFF where coeff < 0
    pxor        mm0, mm4
    psubw       mm0, mm4    ; abs(coeff)
    movq        mm2, mm0
    pmullw      mm0, %2     ; low  16 bits of |coeff| * mf
    pmulhuw     mm2, %2     ; high 16 bits of |coeff| * mf (unsigned)
    movq        mm1, mm0
    punpcklwd   mm0, mm2    ; 32-bit products of coeffs 0,1
    punpckhwd   mm1, mm2    ; 32-bit products of coeffs 2,3
    paddd       mm0, %4     ; round with f
    paddd       mm1, %4
    psrad       mm0, %3
    psrad       mm1, %3
    packssdw    mm0, mm1    ; pack (signed saturation)
    pxor        mm0, mm4    ; restore sign
    psubw       mm0, mm4
    movq        %1, mm0     ; store
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
;       int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_2x2_dc_core16_mmxext:
    MMXEXT_QUANT16_DC_START
    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7 ; all 4 coeffs in one pass
    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
;       int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_4x4_dc_core16_mmxext:
    MMXEXT_QUANT16_DC_START

%rep 4                                      ; 4 rows of 4 coeffs
    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
    add         eax, byte 8
%endrep

    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_4x4_core16_mmxext( int16_t dct[4][4],
;       int const quant_mf[4][4], int const i_qbits, int const f );
;
;   The three-instruction pshufw/paddw/pshufw sequence gathers the low words
;   of four consecutive int32 quant_mf entries into mm5 without the signed
;   saturation packssdw would apply.  With dwords a,b at [ecx] and c,d at
;   [ecx+8], and assuming every high word is zero (TODO confirm with caller):
;       pshufw 10110001b : (a,0,b,0) -> (0,a,0,b)
;       paddw  [ecx+8]   :           -> (c,a,d,b)
;       pshufw 10001101b :           -> (a,b,c,d)
;-----------------------------------------------------------------------------
SHU264_quant_4x4_core16_mmxext:
    MMX_QUANT_AC_START

%rep 4                                      ; 4 rows of 4 coeffs
    pshufw      mm5, [ecx], 10110001b
    paddw       mm5, [ecx+8]
    pshufw      mm5, mm5, 10001101b
    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
    add         ecx, byte 16
    add         eax, byte 8
%endrep

    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_8x8_core16_mmxext( int16_t dct[8][8],
;       int const quant_mf[8][8], int const i_qbits, int const f );
;
;   Uses the same pshufw/paddw/pshufw low-word gather as the 4x4 variant.
;-----------------------------------------------------------------------------
SHU264_quant_8x8_core16_mmxext:
    MMX_QUANT_AC_START

%rep 16                                     ; 16 groups of 4 coeffs
    pshufw      mm5, [ecx], 10110001b
    paddw       mm5, [ecx+8]
    pshufw      mm5, mm5, 10001101b
    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
    add         ecx, byte 16
    add         eax, byte 8
%endrep

    ret

;=============================================================================
; quant, 32-bit multiplier (MMXEXT: pmulhuw)
;=============================================================================

;-----------------------------------------------------------------------------
; MMX_QUANT32_DC_START
;   Argument fetch for the 32-bit DC routines.
;   Out: eax = dct pointer, mm5 = i_qmf replicated into both dwords,
;        mm6 = i_qbits (low dword), mm7 = f replicated into both dwords.
;-----------------------------------------------------------------------------
%macro MMX_QUANT32_DC_START 0
    mov         eax, [esp+ 4]   ; &dct[0][0]
    movd        mm5, [esp+ 8]   ; i_qmf
    movd        mm6, [esp+12]   ; i_qbits
    movd        mm7, [esp+16]   ; f
    punpckldq   mm5, mm5        ; i_qmf in each dword
    punpckldq   mm7, mm7        ; f in each dword
%endmacro

;-----------------------------------------------------------------------------
; MMXEXT_QUANT32_1x4 dct, mf_lo, mf_hi, qbits, f
;   Quantises four int16 coefficients in place with 32-bit multipliers:
;       dct = sign(dct) * ( ( |dct| * mf + f ) >> qbits )
;   Each |coeff| is duplicated into both words of a dword and multiplied
;   against the two 16-bit halves of the matching 32-bit mf; the partial
;   products are recombined into the low 32 bits of |coeff| * mf.
;
;   %1      (m64)       dct[y][x]
;   %2,%3   (m64/mmx)   two dword multipliers each: %2 for coeffs 0,1 and
;                       %3 for coeffs 2,3 (quant_mf[y][x] as 32-bit ints, or
;                       i_qmf replicated into both dwords)
;   %4      (mmx)       i_qbits in the low doubleword
;   %5      (mmx)       f as doublewords
;   trashes mm0-mm4
;-----------------------------------------------------------------------------
%macro MMXEXT_QUANT32_1x4 5
    movq        mm0, %1     ; load dct coeffs
    pxor        mm4, mm4
    pcmpgtw     mm4, mm0    ; sign(mm0)
    pxor        mm0, mm4
    psubw       mm0, mm4    ; abs(mm0)
    movq        mm1, mm0
    punpcklwd   mm0, mm0    ; duplicate the words for the upcoming
    punpckhwd   mm1, mm1    ; 32 bit multiplication
    movq        mm2, mm0    ; like in school ...
    movq        mm3, mm1
    pmulhuw     mm0, %2     ; ... multiply the parts ...
    pmulhuw     mm1, %3
    pmullw      mm2, %2
    pmullw      mm3, %3
    pslld       mm0, 16     ; ... shift ...
    pslld       mm1, 16
    paddd       mm0, mm2    ; ... and add them
    paddd       mm1, mm3
    paddd       mm0, %5     ; round with f
    paddd       mm1, %5
    psrad       mm0, %4
    psrad       mm1, %4
    packssdw    mm0, mm1    ; pack to int16_t
    pxor        mm0, mm4    ; restore sign
    psubw       mm0, mm4
    movq        %1, mm0     ; store
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
;       int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_2x2_dc_core32_mmxext:
    MMX_QUANT32_DC_START
    MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
;       int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_4x4_dc_core32_mmxext:
    MMX_QUANT32_DC_START

%rep 4                                      ; 4 rows of 4 coeffs
    MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
    add         eax, byte 8
%endrep

    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_4x4_core32_mmxext( int16_t dct[4][4],
;       int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_4x4_core32_mmxext:
    MMX_QUANT_AC_START

%rep 4                                      ; 4 rows of 4 coeffs
    MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
    add         eax, byte 8
    add         ecx, byte 16
%endrep

    ret

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl SHU264_quant_8x8_core32_mmxext( int16_t dct[8][8],
;       int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
SHU264_quant_8x8_core32_mmxext:
    MMX_QUANT_AC_START

%rep 16                                     ; 16 groups of 4 coeffs
    MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
    add         eax, byte 8
    add         ecx, byte 16
%endrep

    ret

;=============================================================================
; dequant
;=============================================================================

;-----------------------------------------------------------------------------
; DEQUANT16_L_1x4 dct, mf_lo, mf_hi
;   dct = ( dct * mf ) << shift, 16-bit arithmetic, four coeffs at a time.
;
;   %1      dct[y][x]
;   %2,%3   dequant_mf[i_mf][y][x] (two int32 each, packed to int16 here)
;   mm5     left-shift count (i_qbits after the caller's bias subtraction)
;   trashes mm0-mm2
;-----------------------------------------------------------------------------
%macro DEQUANT16_L_1x4 3
    movq     mm1, %2
    movq     mm2, %3
    movq     mm0, %1
    packssdw mm1, mm2           ; 4 x int32 mf -> 4 x int16
    pmullw   mm0, mm1
    psllw    mm0, mm5
    movq     %1,  mm0
%endmacro

;-----------------------------------------------------------------------------
; DEQUANT16_R_1x4 dct, mf_lo, mf_hi
;   dct = ( dct * mf + f ) >> shift, 16-bit arithmetic, four coeffs at a time.
;
;   %1      dct[y][x]
;   %2,%3   dequant_mf[i_mf][y][x] (two int32 each, packed to int16 here)
;   mm5     -i_qbits (right-shift count)
;   mm6     f as words
;   trashes mm0-mm2
;-----------------------------------------------------------------------------
%macro DEQUANT16_R_1x4 3
    movq     mm1, %2
    movq     mm2, %3
    movq     mm0, %1
    packssdw mm1, mm2           ; 4 x int32 mf -> 4 x int16
    pmullw   mm0, mm1
    paddw    mm0, mm6           ; round with f
    psraw    mm0, mm5
    movq     %1,  mm0
%endmacro

;-----------------------------------------------------------------------------
; DEQUANT32_R_1x4 dct, mf_lo, mf_hi
;   dct = ( dct * mf + f ) >> shift with a 32-bit intermediate product, four
;   coeffs at a time.  The product is built from signed pmulhw / pmullw
;   partial products, in the same way as MMXEXT_QUANT32_1x4.
;
;   %1      dct[y][x]
;   %2,%3   dequant_mf[i_mf][y][x] (two int32 each, used directly)
;   mm5     -i_qbits (right-shift count)
;   mm6     f as dwords
;   trashes mm0-mm3 (mm7 is not read by this macro)
;-----------------------------------------------------------------------------
%macro DEQUANT32_R_1x4 3
    movq      mm0, %1
    movq      mm1, mm0
    punpcklwd mm0, mm0          ; duplicate each coeff into both words
    punpckhwd mm1, mm1
    movq      mm2, mm0
    movq      mm3, mm1
    pmulhw    mm0, %2           ; high halves of the partial products
    pmulhw    mm1, %3
    pmullw    mm2, %2           ; low halves of the partial products
    pmullw    mm3, %3
    pslld     mm0, 16
    pslld     mm1, 16
    paddd     mm0, mm2          ; 32-bit dct * mf
    paddd     mm1, mm3
    paddd     mm0, mm6          ; round with f
    paddd     mm1, mm6
    psrad     mm0, mm5
    psrad     mm1, mm5
    packssdw  mm0, mm1
    movq      %1,  mm0
%endmacro

;-----------------------------------------------------------------------------
; DEQUANT_WxH name, groups, log2_coeffs
;   Emits one complete dequant function.  Prototype of the 4x4 instance:
;       void SHU264_dequant_4x4_mmx( int16_t dct[4][4],
;                                    int dequant_mf[6][4][4], int i_qp )
;   The 8x8 instance presumably takes dct[8][8] / dequant_mf[6][8][8]
;   (TODO confirm against the C declaration).
;
;   %1      exported function label
;   %2      number of 4-coefficient groups in the block (4 or 16)
;   %3      log2 of the coefficient count (4 or 6); used both as the table
;           stride ( i_mf << (%3+2) bytes ) and as the bias taken off i_qp/6
;
;   Path selection on  shift = i_qp/6 - %3 :
;       shift >= 0   -> .lshift    : dct = (dct * mf) << shift
;       shift == -1  -> .rshift16  : 16-bit product, rounded right shift
;       otherwise    -> .rshift32  : 32-bit product, rounded right shift
;
;   Every loop walks the block from the last group down to the first; two
;   groups are handled per iteration and `jge` tests the flags left by the
;   final `sub eax, 8` (the MMX instructions in between do not touch EFLAGS).
;-----------------------------------------------------------------------------
%macro DEQUANT_WxH 3
ALIGN 16
%1:
    mov  edx, [esp+12] ; i_qp
    imul eax, edx, 0x2b
    shr  eax, 8       ; i_qbits = i_qp / 6   (43/256 ~ 1/6, exact for 0..51)
    lea  ecx, [eax+eax*2]
    sub  edx, ecx
    sub  edx, ecx     ; i_mf = i_qp % 6
    shl  edx, %3+2    ; byte offset of dequant_mf[i_mf]
    add  edx, [esp+8] ; dequant_mf[i_mf]
    mov  ecx, [esp+4] ; dct
    sub  eax, %3
    jge  .lshift
    cmp  eax, byte -1
    je   .rshift16    ; negative qbits => rightshift
    jmp  .rshift32    ; dct * dequant overflows 16bit

.lshift:
    movd mm5, eax
    mov  eax, 8*(%2-1)          ; byte offset of the last group in dct
.loopl16:
%rep 2
    DEQUANT16_L_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
    sub  eax, byte 8
%endrep
    jge  .loopl16
    nop
    ret

.rshift16:
    neg   eax                   ; right-shift count
    movq  mm6, [pw_1]
    movd  mm5, eax
    pxor  mm7, mm7              ; NOTE: mm7 is not read by DEQUANT16_R_1x4
    psllw mm6, mm5
    psrlw mm6, 1                ; f = 1 << (count-1), in each word
    mov  eax, 8*(%2-1)          ; byte offset of the last group in dct
.loopr16:
%rep 2
    DEQUANT16_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
    sub  eax, byte 8
%endrep
    jge  .loopr16
    nop
    ret

.rshift32:
    neg   eax                   ; right-shift count
    movq  mm6, [pd_1]
    movd  mm5, eax
    pxor  mm7, mm7              ; NOTE: mm7 is not read by DEQUANT32_R_1x4
    pslld mm6, mm5
    psrld mm6, 1                ; f = 1 << (count-1), in each dword
    mov  eax, 8*(%2-1)          ; byte offset of the last group in dct
.loopr32:
%rep 2
    DEQUANT32_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
    sub  eax, byte 8
%endrep
    jge  .loopr32
    nop
    ret
%endmacro

DEQUANT_WxH SHU264_dequant_4x4_mmx, 4, 4
DEQUANT_WxH SHU264_dequant_8x8_mmx, 16, 6

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -