mc-a2.asm

来自「在dsp上实现h.264编解码」· 汇编代码 · 共 372 行
ASM
372 行
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

; Syntax: NASM (Intel operand order), 64-bit mode.
; ABI:    System V AMD64 by default, Microsoft x64 when WIN64 is defined.
; The names cglobal, parmNq/parmNd, GLOBAL, pushreg, setframe and endprolog
; are not defined in this file; presumably they come from amd64inc.asm.

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata

ALIGN 16

; NOTE: despite the "_one" suffix these two hold the rounding terms that are
; added before the ">> 5" (16) and ">> 10" (512) shifts below.
mmx_dw_one:
        times 4 dw 16
mmx_dd_one:
        times 2 dd 512

; Filter tap weights, one copy per 16-bit lane.
mmx_dw_20:
        times 4 dw 20
mmx_dw_5:
        times 4 dw -5

; Byte offset of the intermediate row buffer relative to rsp.
%assign tbuffer 0

;=============================================================================
; Macros
;=============================================================================

;-----------------------------------------------------------------------------
; LOAD_4 d0, d1, d2, d3, m0, m1, m2, m3, zero
;   Loads 4 bytes from each of m0..m3 into d0..d3 and interleaves them with
;   the bytes of `zero` (callers pass a cleared register, which widens each
;   byte to a 16-bit word).
;-----------------------------------------------------------------------------
%macro LOAD_4 9
        movd        %1, %5
        movd        %2, %6
        movd        %3, %7
        movd        %4, %8
        punpcklbw   %1, %9
        punpcklbw   %2, %9
        punpcklbw   %3, %9
        punpcklbw   %4, %9
%endmacro

;-----------------------------------------------------------------------------
; FILT_2 acc, x
;   acc = acc - 5*x          (x is left holding 4*x)
;-----------------------------------------------------------------------------
%macro FILT_2 2
        psubw       %1, %2
        psllw       %2, 2
        psubw       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; FILT_4 acc, x, y
;   acc = acc + 20*(x + y)   (x is left holding 16*(x + y))
;-----------------------------------------------------------------------------
%macro FILT_4 3
        paddw       %2, %3
        psllw       %2, 2
        paddw       %1, %2              ; + 4*(x + y)
        psllw       %2, 2
        paddw       %1, %2              ; + 16*(x + y)
%endmacro

;-----------------------------------------------------------------------------
; FILT_6 acc, x, y, rnd
;   acc = (acc - 5*x + y + rnd) >> 5   (arithmetic shift; x is left as 4*x)
;-----------------------------------------------------------------------------
%macro FILT_6 4
        psubw       %1, %2
        psllw       %2, 2
        psubw       %1, %2
        paddw       %1, %3
        paddw       %1, %4
        psraw       %1, 5
%endmacro

;-----------------------------------------------------------------------------
; FILT_ALL base
;   Six-row filter over 4 adjacent bytes per row:
;     mm1 = r0 - 5*r1 + 20*(r2 + r3) - 5*r4 + r5      (16-bit lanes)
;   where rK is the row at [base + K*stride]. No rounding or shift is applied.
;   Expects: mm0 = 0, rcx = stride, rbx = 3*stride, rdx = 5*stride.
;   Clobbers: mm1..mm6.
;-----------------------------------------------------------------------------
%macro FILT_ALL 1
        LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
        FILT_2      mm1, mm2
        movd        mm5, [%1 + 4 * rcx]
        movd        mm6, [%1 + rdx]
        FILT_4      mm1, mm3, mm4
        punpcklbw   mm5, mm0
        punpcklbw   mm6, mm0
        psubw       mm1, mm5
        psllw       mm5, 2
        psubw       mm1, mm5
        paddw       mm1, mm6
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext

;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
;                                 uint8_t *dst2, int i_dst2_stride,
;                                 uint8_t *src, int i_src_stride,
;                                 int i_width, int i_height );
;
; For every row:
;   pass 1 - runs FILT_ALL down the columns (starting two rows above src),
;            keeps the unrounded 16-bit sums in a stack buffer and writes
;            (sum + 16) >> 5, saturated to bytes, to dst1;
;   pass 2 - runs the same 1,-5,20,20,-5,1 taps across the buffered sums in
;            32-bit precision and writes (sum + 512) >> 10, saturated to
;            bytes, to dst2.
;
; Register roles inside the row loop:
;   r8  = dst1 row        r9  = dst1 stride
;   r10 = dst2 row        r11 = dst2 stride
;   r12 = src row - 2*stride             r13 = src stride
;   r14 = width           r15 = rows remaining
;   rcx / rbx / rdx = 1x / 3x / 5x src stride (consumed by FILT_ALL)
;   rsi = running source pointer, rdi = dst1 - 4, rax = column counter
;   mm0 = 0
;
; Both column loops step by 4 and exit on equality with width, so the code
; only terminates as intended when width is a multiple of 4 and at least 12
; (pass 1 starts its counter at 8 and increments before comparing).
; No emms is executed here; MMX state is left for the caller to clear.
;-----------------------------------------------------------------------------
ALIGN 16
x264_center_filter_mmxext:

        ; ---- prologue: save callee-saved registers, rbp = frame base ----
        push        r15
        pushreg     r15
%ifdef WIN64
        push        rdi
        pushreg     rdi
        push        rsi
        pushreg     rsi
%endif
        push        rbp
        pushreg     rbp
        push        rbx
        pushreg     rbx
        push        r12
        pushreg     r12
        push        r13
        pushreg     r13
        push        r14
        pushreg     r14
        lea         rbp, [rsp]
        setframe    rbp, 0
        endprolog

        ; ---- fetch src / src_stride (args 5 and 6) ----
%ifdef WIN64
        ; 64 bytes of pushes sit between rsp and the return address
        movsxd      r13, dword [rsp+64+48]      ; r13 = src_stride
        mov         r12, [rsp+64+40]            ; r12 = src
%else
        movsxd      r13, r9d                    ; r13 = src_stride
        mov         r12, r8                     ; r12 = src
%endif
        sub         r12, r13
        sub         r12, r13                    ; r12 = src - 2 * src_stride

        ; ---- carve the intermediate buffer out of the stack ----
        ; 24 bytes of slack (the i386 version of this file uses 18) so that
        ; rsp stays aligned.
        lea         rax, [r13 + r13 + 24 + tbuffer]
        sub         rsp, rax

        ; ---- remaining arguments ----
        mov         r10, parm3q                 ; r10 = dst2
        movsxd      r11, parm4d                 ; r11 = dst2_stride
        mov         r8,  parm1q                 ; r8  = dst1
        movsxd      r9,  parm2d                 ; r9  = dst1_stride
%ifdef WIN64
        movsxd      r14, dword [rbp + 64 + 56]  ; r14 = width
        movsxd      r15, dword [rbp + 64 + 64]  ; r15 = height
%else
        movsxd      r14, dword [rbp + 56]       ; r14 = width
        movsxd      r15, dword [rbp + 64]       ; r15 = height
%endif

        ; ---- stride multiples required by FILT_ALL ----
        mov         rcx, r13                    ; 1 * src_stride
        lea         rbx, [r13 + r13 * 2]        ; 3 * src_stride
        lea         rdx, [r13 + r13 * 4]        ; 5 * src_stride

        pxor        mm0, mm0                    ; mm0 = 0 for byte->word unpack
        ; NOTE(review): mm7 is cleared again inside .hpass before it is read,
        ; and the rounding term is added from memory there, so this load
        ; appears to have no effect.
        movq        mm7, [mmx_dd_one GLOBAL]

.row:
        xor         rax, rax
        mov         rsi, r12                    ; rsi = top row of the 6-row window

        ; ---- pass 1, first group of 4 pixels ----
        FILT_ALL    rsi
        pshufw      mm2, mm1, 0                 ; broadcast lane 0 -> left padding
        movq        [rsp + tbuffer], mm2        ; buffer words 0..3 = padding
        movq        [rsp + tbuffer + 8], mm1    ; buffer words 4..7 = pixels 0..3
        paddw       mm1, [mmx_dw_one GLOBAL]    ; + 16
        psraw       mm1, 5
        packuswb    mm1, mm1
        movd        [r8], mm1                   ; dst1[0..3]

        add         rax, 8                      ; from here on rax = pixel x + 4
        add         rsi, 4
        lea         rdi, [r8 - 4]               ; so that [rdi + rax] = dst1[x]

        ; ---- pass 1, middle groups ----
.vpass:
        FILT_ALL    rsi
        movq        [rsp + tbuffer + 2 * rax], mm1
        paddw       mm1, [mmx_dw_one GLOBAL]    ; + 16
        psraw       mm1, 5
        packuswb    mm1, mm1
        movd        [rdi + rax], mm1            ; dst1[rax - 4 ..]
        add         rsi, 4
        add         rax, 4
        cmp         rax, r14                    ; reached width?
        jnz         .vpass

        ; ---- pass 1, last group plus trailing padding ----
        FILT_ALL    rsi
        ; Selector 7 yields lanes (3, 1, 0, 0) of mm1, lowest lane first.
        ; NOTE(review): unlike the left edge this is not a plain broadcast
        ; of the last value - confirm that this is intended.
        pshufw      mm2, mm1, 7
        movq        [rsp + tbuffer + 2 * rax], mm1
        movq        [rsp + tbuffer + 2 * rax + 8], mm2
        paddw       mm1, [mmx_dw_one GLOBAL]    ; + 16
        psraw       mm1, 5
        packuswb    mm1, mm1
        movd        [rdi + rax], mm1            ; dst1[rax - 4 ..]

        add         r12, r13                    ; next source row
        add         r8,  r9                     ; next dst1 row

        ; ---- pass 2: filter across the buffered 16-bit sums ----
        xor         rax, rax
.hpass:
        ; Six overlapping 4-word windows t0..t5, starting 4 bytes into the
        ; buffer and advancing one word each.
        movq        mm2, [rsp + tbuffer + 2 * rax + 4 + 2]      ; t1
        movq        mm3, [rsp + tbuffer + 2 * rax + 4 + 4]      ; t2
        movq        mm4, [rsp + tbuffer + 2 * rax + 4 + 6]      ; t3
        movq        mm5, [rsp + tbuffer + 2 * rax + 4 + 8]      ; t4
        movq        mm1, [rsp + tbuffer + 2 * rax + 4]          ; t0
        movq        mm6, [rsp + tbuffer + 2 * rax + 4 + 10]     ; t5

        paddw       mm2, mm5                    ; mm2 = t1 + t4   (weight -5)
        paddw       mm3, mm4                    ; mm3 = t2 + t3   (weight 20)
        paddw       mm1, mm6                    ; mm1 = t0 + t5   (weight  1)

        movq        mm5, [mmx_dw_20 GLOBAL]
        movq        mm4, [mmx_dw_5 GLOBAL]
        movq        mm6, mm1
        pxor        mm7, mm7

        ; Pair every sum with its weight so pmaddwd can form 32-bit
        ; -5*(t1+t4) + 20*(t2+t3) per output pixel.
        punpckhwd   mm5, mm2                    ; (20, s14) pairs, lanes 2..3
        punpcklwd   mm4, mm3                    ; (-5, s23) pairs, lanes 0..1
        punpcklwd   mm2, [mmx_dw_20 GLOBAL]     ; (s14, 20) pairs, lanes 0..1
        punpckhwd   mm3, [mmx_dw_5 GLOBAL]      ; (s23, -5) pairs, lanes 2..3
        pcmpgtw     mm7, mm1                    ; mm7 = sign mask of (t0 + t5)
        pmaddwd     mm2, mm4                    ; lanes 0..1
        pmaddwd     mm3, mm5                    ; lanes 2..3

        punpcklwd   mm1, mm7                    ; sign-extend t0+t5, lanes 0..1
        punpckhwd   mm6, mm7                    ; sign-extend t0+t5, lanes 2..3
        paddd       mm2, mm1
        paddd       mm3, mm6

        paddd       mm2, [mmx_dd_one GLOBAL]    ; + 512
        paddd       mm3, [mmx_dd_one GLOBAL]    ; + 512
        psrad       mm2, 10
        psrad       mm3, 10

        packssdw    mm2, mm3                    ; 4 dwords -> 4 words
        packuswb    mm2, mm0                    ; 4 words  -> 4 bytes
        movd        [r10 + rax], mm2            ; dst2[rax ..]

        add         rax, 4
        cmp         rax, r14                    ; reached width?
        jnz         .hpass

        add         r10, r11                    ; next dst2 row

        dec         r15                         ; one row done
        test        r15, r15
        jnz         .row

        ; ---- epilogue: drop the buffer, restore registers ----
        lea         rsp, [rbp]
        pop         r14
        pop         r13
        pop         r12
        pop         rbx
        pop         rbp
%ifdef WIN64
        pop         rsi
        pop         rdi
%endif
        pop         r15
        ret

;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
;                                     uint8_t *src, int i_src_stride,
;                                     int i_width, int i_height );
;
; For each output byte x of each row:
;   dst[x] = sat_u8( ( s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1]
;                      - 5*s[x+2] + s[x+3] + 16 ) >> 5 )
; computed 8 pixels per iteration in 16-bit lanes.
;
; Register roles inside the loops:
;   rdx = src row - 2     r11 = src stride
;   r9  = dst row         r10 = dst stride
;   r8  = width           rcx = rows remaining
;   rax = column offset   mm0 = 0, mm7 = rounding term (16 per lane)
;
; The column loop steps by 8 and exits on equality with width, so width must
; be a positive multiple of 8 for the loop to terminate as intended.
; Leaf routine: only volatile registers are written, nothing is pushed.
; No emms is executed here; MMX state is left for the caller to clear.
;-----------------------------------------------------------------------------
ALIGN 16
x264_horizontal_filter_mmxext:

        movsxd      r10, parm2d                 ; r10 = dst_stride
        movsxd      r11, parm4d                 ; r11 = src_stride
%ifdef WIN64
        mov         rdx, r8                     ; rdx = src
        mov         r9,  rcx                    ; r9  = dst
        movsxd      rcx, parm6d                 ; rcx = height
%else
        movsxd      rcx, parm6d                 ; rcx = height
        mov         r9,  rdi                    ; r9  = dst
%endif
        movsxd      r8,  parm5d                 ; r8  = width

        pxor        mm0, mm0                    ; zero for byte->word unpack
        movq        mm7, [mmx_dw_one GLOBAL]    ; rounding term

        sub         rdx, 2                      ; leftmost tap is at x - 2

.next_row:
        dec         rcx
        xor         rax, rax

.next_block:
        prefetchnta [rdx + rax + 48]

        ; ---- output pixels 0..3 of this block -> mm1 ----
        LOAD_4      mm1, mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
        FILT_2      mm1, mm2                    ; tap0 - 5*tap1
        movd        mm5, [rdx + rax + 4]
        movd        mm6, [rdx + rax + 5]
        FILT_4      mm1, mm3, mm4               ; + 20*(tap2 + tap3)
        movd        mm2, [rdx + rax + 4]        ; first tap of the second half
        movd        mm3, [rdx + rax + 6]
        punpcklbw   mm5, mm0
        punpcklbw   mm6, mm0
        FILT_6      mm1, mm5, mm6, mm7          ; - 5*tap4 + tap5 + 16, >> 5

        ; ---- output pixels 4..7 of this block -> mm2 ----
        movd        mm4, [rdx + rax + 7]
        movd        mm5, [rdx + rax + 8]
        punpcklbw   mm2, mm0
        punpcklbw   mm3, mm0
        FILT_2      mm2, mm6                    ; mm6 still holds widened bytes at +5
        movd        mm6, [rdx + rax + 9]
        punpcklbw   mm4, mm0
        punpcklbw   mm5, mm0
        FILT_4      mm2, mm3, mm4               ; + 20*(tap2 + tap3)
        punpcklbw   mm6, mm0
        FILT_6      mm2, mm5, mm6, mm7          ; - 5*tap4 + tap5 + 16, >> 5

        packuswb    mm1, mm2                    ; 8 words -> 8 saturated bytes
        movq        [r9 + rax], mm1

        add         rax, 8
        cmp         rax, r8                     ; reached width?
        jnz         .next_block

        add         rdx, r11                    ; next source row
        add         r9,  r10                    ; next destination row

        test        rcx, rcx                    ; row counter was decremented at loop top
        jnz         .next_row

        ret
mc-a2.asm - 源码说明

本页面展示了「在dsp上实现h.264编解码」中的 mc-a2.asm 源码文件，采用汇编编程语言编写，共 372 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与dsp相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?