📄 mc-a.asm.svn-base

📁 H.264 source codes
💻 SVN-BASE
字号:
;*****************************************************************************
;* mc.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
;*
;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

;*****************************************************************************
;*                                                                           *
;*  Revision history:                                                        *
;*                                                                           *
;*  2004.05.17 portab mc_copy_w4/8/16 (CM)                                   *
;*                                                                           *
;*****************************************************************************

; Syntax: NASM (Intel operand order), 64-bit mode.
; Every routine below takes its integer arguments from rdi, rsi, rdx, rcx,
; r8, r9 and then from the stack (System V AMD64 argument order).
; The MMX routines do not execute EMMS before returning.

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

; cglobal <name>: export <name>; when PREFIX is defined the exported symbol
; (and every later use of <name>) gets a leading underscore.
%macro cglobal 1
    %ifdef PREFIX
        global _%1
        %define %1 _%1
    %else
        global %1
    %endif
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_pixel_avg_w4_mmxext
cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w16_sse2

cglobal x264_mc_copy_w4_mmxext
cglobal x264_mc_copy_w8_mmxext
cglobal x264_mc_copy_w16_mmxext
cglobal x264_mc_copy_w16_sse2

cglobal x264_mc_chroma_sse


ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst,  int i_dst_stride,
;                                uint8_t *src1, int i_src1_stride,
;                                uint8_t *src2, int i_src2_stride,
;                                int i_height );
;
; dst[x] = pavgb( src1[x], src2[x] ) for 4 bytes per row, two rows per
; iteration.
;
; Registers: r8 = dst, r9 = dst stride, r10 = src1, r11 = src1 stride,
;            r12 = src2, r13 = src2 stride, rax = rows remaining.
; The loop counts down by 2 and exits only when the counter reaches exactly
; zero, so i_height has to be a non-zero multiple of 2.
;-----------------------------------------------------------------------------
x264_pixel_avg_w4_mmxext:
    push        rbp
    mov         rbp, rsp
    push        r12                 ; callee-saved, used for src2
    push        r13                 ; callee-saved, used for src2 stride

    mov         r12, r8             ; src2
    movsxd      r13, r9d            ; i_src2_stride
    mov         r10, rdx            ; src1
    movsxd      r11, ecx            ; i_src1_stride
    mov         r8, rdi             ; dst
    movsxd      r9, esi             ; i_dst_stride
    movsxd      rax, dword [rbp+16] ; i_height (7th argument, on the stack)

ALIGN 4
.height_loop:
    movd        mm0, [r10]          ; src1, row 0
    movd        mm1, [r10+r11]      ; src1, row 1
    ; src2 is fetched with 4-byte loads: a memory operand on pavgb would read
    ; 8 bytes, i.e. 4 bytes beyond the 4-pixel row.
    movd        mm2, [r12]          ; src2, row 0
    movd        mm3, [r12+r13]      ; src2, row 1
    pavgb       mm0, mm2
    pavgb       mm1, mm3
    movd        [r8], mm0
    movd        [r8+r9], mm1

    lea         r10, [r10+r11*2]
    lea         r12, [r12+r13*2]
    lea         r8, [r8+r9*2]
    sub         rax, 2
    jne         .height_loop

    pop         r13
    pop         r12
    pop         rbp
    ret


ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w8_mmxext( uint8_t *dst,  int i_dst_stride,
;                                uint8_t *src1, int i_src1_stride,
;                                uint8_t *src2, int i_src2_stride,
;                                int i_height );
;
; dst[x] = pavgb( src1[x], src2[x] ) for 8 bytes per row, one row per
; iteration.
;
; Registers: r8 = dst, r9 = dst stride, r10 = src1, r11 = src1 stride,
;            r12 = src2, r13 = src2 stride, rax = rows remaining.
; The loop exits only when the counter reaches zero, so i_height must be
; non-zero.
;-----------------------------------------------------------------------------
x264_pixel_avg_w8_mmxext:
    push        rbp
    mov         rbp, rsp
    push        r12
    push        r13

    mov         r12, r8             ; src2
    movsxd      r13, r9d            ; i_src2_stride
    mov         r10, rdx            ; src1
    movsxd      r11, ecx            ; i_src1_stride
    mov         r8, rdi             ; dst
    movsxd      r9, esi             ; i_dst_stride
    movsxd      rax, dword [rbp+16] ; i_height (7th argument, on the stack)

ALIGN 4
.height_loop:
    movq        mm0, [r10]
    pavgb       mm0, [r12]
    movq        [r8], mm0

    lea         r10, [r10+r11]      ; lea leaves the flags alone
    lea         r12, [r12+r13]
    lea         r8, [r8+r9]
    dec         rax
    jne         .height_loop

    pop         r13
    pop         r12
    pop         rbp
    ret


ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_mmxext( uint8_t *dst,  int i_dst_stride,
;                                 uint8_t *src1, int i_src1_stride,
;                                 uint8_t *src2, int i_src2_stride,
;                                 int i_height );
;
; dst[x] = pavgb( src1[x], src2[x] ) for 16 bytes per row (two 8-byte MMX
; halves), one row per iteration.
;
; Registers: r8 = dst, r9 = dst stride, r10 = src1, r11 = src1 stride,
;            r12 = src2, r13 = src2 stride, rax = rows remaining.
; The loop exits only when the counter reaches zero, so i_height must be
; non-zero.
;-----------------------------------------------------------------------------
x264_pixel_avg_w16_mmxext:
    push        rbp
    mov         rbp, rsp
    push        r12
    push        r13

    mov         r12, r8             ; src2
    movsxd      r13, r9d            ; i_src2_stride
    mov         r10, rdx            ; src1
    movsxd      r11, ecx            ; i_src1_stride
    mov         r8, rdi             ; dst
    movsxd      r9, esi             ; i_dst_stride
    movsxd      rax, dword [rbp+16] ; i_height (7th argument, on the stack)

ALIGN 4
.height_loop:
    movq        mm0, [r10  ]
    movq        mm1, [r10+8]
    pavgb       mm0, [r12  ]
    pavgb       mm1, [r12+8]
    movq        [r8  ], mm0
    movq        [r8+8], mm1

    lea         r10, [r10+r11]
    lea         r12, [r12+r13]
    lea         r8, [r8+r9]
    dec         rax
    jne         .height_loop

    pop         r13
    pop         r12
    pop         rbp
    ret


ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_sse2( uint8_t *dst,  int i_dst_stride,
;                               uint8_t *src1, int i_src1_stride,
;                               uint8_t *src2, int i_src2_stride,
;                               int i_height );
;
; dst[x] = pavgb( src1[x], src2[x] ) for 16 bytes per row, one row per
; iteration. All three buffers are accessed with unaligned moves.
;
; Registers: r8 = dst, r9 = dst stride, r10 = src1, r11 = src1 stride,
;            r12 = src2, r13 = src2 stride, rax = rows remaining.
; The loop exits only when the counter reaches zero, so i_height must be
; non-zero.
;-----------------------------------------------------------------------------
x264_pixel_avg_w16_sse2:
    push        rbp
    mov         rbp, rsp
    push        r12
    push        r13

    mov         r12, r8             ; src2
    movsxd      r13, r9d            ; i_src2_stride
    mov         r10, rdx            ; src1
    movsxd      r11, ecx            ; i_src1_stride
    mov         r8, rdi             ; dst
    movsxd      r9, esi             ; i_dst_stride
    movsxd      rax, dword [rbp+16] ; i_height (7th argument, on the stack)

ALIGN 4
.height_loop:
    movdqu      xmm0, [r10]
    ; src2 goes through movdqu as well: pavgb with a 128-bit memory operand
    ; faults unless the address is 16-byte aligned.
    movdqu      xmm1, [r12]
    pavgb       xmm0, xmm1
    movdqu      [r8], xmm0

    lea         r10, [r10+r11]
    lea         r12, [r12+r13]
    lea         r8, [r8+r9]
    dec         rax
    jne         .height_loop

    pop         r13
    pop         r12
    pop         rbp
    ret


ALIGN 16
;-----------------------------------------------------------------------------
;  void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride,
;                               uint8_t *dst, int i_dst_stride, int i_height )
;
; Copies 4 bytes per row, two rows per iteration, using general-purpose
; registers only.
;
; Registers: r8 = src, r9 = src stride, r10 = dst, r11 = dst stride,
;            eax = rows remaining, ecx/edx = row data.
; The loop counts down by 2 and exits only on exactly zero, so i_height has
; to be a non-zero multiple of 2.
;-----------------------------------------------------------------------------
x264_mc_copy_w4_mmxext:
    mov     eax, r8d            ; i_height (read before r8 is reused)
    mov     r8, rdi             ; src
    movsxd  r9, esi             ; i_src_stride
    mov     r10, rdx            ; dst
    movsxd  r11, ecx            ; i_dst_stride

ALIGN 4
.height_loop:
    mov     ecx, [r8]
    mov     edx, [r8+r9]
    mov     [r10], ecx
    mov     [r10+r11], edx
    lea     r8, [r8+r9*2]
    lea     r10, [r10+r11*2]
    sub     eax, 2
    jne     .height_loop

    ret


ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride,
;                                uint8_t *dst, int i_dst_stride, int i_height )
;
; Copies 8 bytes per row, four rows per iteration.
;
; Registers: r8 = src, r9 = src stride, r10 = dst, r11 = dst stride,
;            rcx = 3 * src stride, rdx = 3 * dst stride,
;            eax = rows remaining.
; The loop counts down by 4 and exits only on exactly zero, so i_height has
; to be a non-zero multiple of 4.
;-----------------------------------------------------------------------------
x264_mc_copy_w8_mmxext:
    mov     eax, r8d            ; i_height (read before r8 is reused)
    mov     r8, rdi             ; src
    movsxd  r9, esi             ; i_src_stride
    mov     r10, rdx            ; dst
    movsxd  r11, ecx            ; i_dst_stride

    lea     rcx, [r9+r9*2]      ; 3 * i_src_stride
    lea     rdx, [r11+r11*2]    ; 3 * i_dst_stride

ALIGN 4
.height_loop:
    movq    mm0, [r8]
    movq    mm1, [r8+r9]
    movq    mm2, [r8+r9*2]
    movq    mm3, [r8+rcx]
    movq    [r10], mm0
    movq    [r10+r11], mm1
    movq    [r10+r11*2], mm2
    movq    [r10+rdx], mm3
    lea     r8, [r8+r9*4]
    lea     r10, [r10+r11*4]

    sub     eax, byte 4
    jnz     .height_loop

    ret


ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride,
;                                 uint8_t *dst, int i_dst_stride, int i_height )
;
; Copies 16 bytes per row (two 8-byte MMX halves), four rows per iteration.
;
; Registers: r8 = src, r9 = src stride, r10 = dst, r11 = dst stride,
;            rcx = 3 * src stride, rdx = 3 * dst stride,
;            eax = rows remaining.
; The loop counts down by 4 and exits only on exactly zero, so i_height has
; to be a non-zero multiple of 4.
;-----------------------------------------------------------------------------
x264_mc_copy_w16_mmxext:
    mov     eax, r8d            ; i_height (read before r8 is reused)
    mov     r8, rdi             ; src
    movsxd  r9, esi             ; i_src_stride
    mov     r10, rdx            ; dst
    movsxd  r11, ecx            ; i_dst_stride

    lea     rcx, [r9+r9*2]      ; 3 * i_src_stride
    lea     rdx, [r11+r11*2]    ; 3 * i_dst_stride

ALIGN 4
.height_loop:
    movq    mm0, [r8]
    movq    mm1, [r8+8]
    movq    mm2, [r8+r9]
    movq    mm3, [r8+r9+8]
    movq    mm4, [r8+r9*2]
    movq    mm5, [r8+r9*2+8]
    movq    mm6, [r8+rcx]
    movq    mm7, [r8+rcx+8]
    movq    [r10], mm0
    movq    [r10+8], mm1
    movq    [r10+r11], mm2
    movq    [r10+r11+8], mm3
    movq    [r10+r11*2], mm4
    movq    [r10+r11*2+8], mm5
    movq    [r10+rdx], mm6
    movq    [r10+rdx+8], mm7
    lea     r8, [r8+r9*4]
    lea     r10, [r10+r11*4]

    sub     eax, byte 4
    jnz     .height_loop

    ret


ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride,
;                               uint8_t *dst, int i_dst_stride, int i_height )
;
; Copies 16 bytes per row with unaligned SSE2 moves, two rows per iteration.
;
; Registers: r8 = src, r9 = src stride, r10 = dst, r11 = dst stride,
;            eax = rows remaining.
; The loop counts down by 2 and exits only on exactly zero, so i_height has
; to be a non-zero multiple of 2.
;-----------------------------------------------------------------------------
x264_mc_copy_w16_sse2:
    mov     eax, r8d            ; i_height (read before r8 is reused)
    mov     r8, rdi             ; src
    movsxd  r9, esi             ; i_src_stride
    mov     r10, rdx            ; dst
    movsxd  r11, ecx            ; i_dst_stride

ALIGN 4
.height_loop:
    movdqu  xmm0, [r8]
    movdqu  xmm1, [r8+r9]
    movdqu  [r10], xmm0
    movdqu  [r10+r11], xmm1
    lea     r8, [r8+r9*2]
    lea     r10, [r10+r11*2]
    sub     eax, 2
    jnz     .height_loop

    ret


;-----------------------------------------------------------------------------
; Constants for x264_mc_chroma_sse (four 16-bit lanes each)
;-----------------------------------------------------------------------------
SECTION .rodata

ALIGN 16
eights    times 4   dw 8        ; 8 in every lane: used to form 8-dx / 8-dy
thirty2s  times 4   dw 32       ; rounding term added before the >> 6


SECTION .text

ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_mc_chroma_sse( uint8_t *src, int i_src_stride,
;                               uint8_t *dst, int i_dst_stride,
;                               int dx, int dy,
;                               int i_height, int i_width )
;
; Bilinear interpolation over a 2x2 neighbourhood, 4 output pixels per row:
;
;   dst[x] = ( cA*src[x]          + cB*src[x+1]
;            + cC*src[x+stride]   + cD*src[x+stride+1] + 32 ) >> 6
;
;   cA = (8-dx)*(8-dy)   cB = dx*(8-dy)   cC = (8-dx)*dy   cD = dx*dy
;
; Only the low 16 bits of dx and dy are used. A 4-pixel-wide column is
; processed for i_height rows; when i_width == 8 a second column starting
; 4 bytes to the right is processed as well. Any other width is treated as 4.
;
; Registers: r8 = src base, r9 = src stride, r10 = dst base,
;            r11 = dst stride, r12 = i_height, r13 = i_width,
;            rax = current src, rdi = current dst, rcx = src stride,
;            rdx = rows remaining in the current column,
;            mm3 = 0, mm4..mm7 = cA, cB, cC, cD.
; The row loop exits only when the counter reaches zero, so i_height must be
; non-zero.
;-----------------------------------------------------------------------------
x264_mc_chroma_sse:
    push    r12
    push    r13

    pxor    mm3, mm3            ; zero, for byte -> word unpacking

    ; Broadcast dx / dy straight from the argument registers; no memory
    ; round trip is needed, which keeps the routine reentrant.
    movd    mm5, r8d
    movd    mm6, r9d
    pshufw  mm5, mm5, 0         ; mm5 - dx
    pshufw  mm6, mm6, 0         ; mm6 - dy

    movq    mm4, [rel eights]
    movq    mm0, mm4

    psubw   mm4, mm5            ; mm4 - 8-dx
    psubw   mm0, mm6            ; mm0 - 8-dy

    movq    mm7, mm5
    pmullw  mm5, mm0            ; mm5 = dx*(8-dy) =     cB
    pmullw  mm7, mm6            ; mm7 = dx*dy =         cD
    pmullw  mm6, mm4            ; mm6 = (8-dx)*dy =     cC
    pmullw  mm4, mm0            ; mm4 = (8-dx)*(8-dy) = cA

    mov     r8, rdi             ; src
    movsxd  r9, esi             ; i_src_stride
    mov     r10, rdx            ; dst
    movsxd  r11, ecx            ; i_dst_stride
    ; Two registers were pushed, so the stack arguments sit 16 bytes higher
    ; than at entry: return address at [rsp+16], then the 7th/8th arguments.
    movsxd  r12, dword [rsp+24] ; i_height
    movsxd  r13, dword [rsp+32] ; i_width

    mov     rax, r8
    mov     rdi, r10
    mov     rcx, r9
    mov     rdx, r12

ALIGN 4
.height_loop:
    movd    mm1, [rax+rcx]
    movd    mm0, [rax]
    punpcklbw mm1, mm3          ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw mm0, mm3
    pmullw  mm1, mm6            ; 2nd line * cC
    pmullw  mm0, mm4            ; 1st line * cA

    paddw   mm0, mm1            ; mm0 <- result

    movd    mm2, [rax+1]
    movd    mm1, [rax+rcx+1]
    punpcklbw mm2, mm3
    punpcklbw mm1, mm3

    paddw   mm0, [rel thirty2s]

    pmullw  mm2, mm5            ; line * cB
    pmullw  mm1, mm7            ; line * cD
    paddw   mm0, mm2
    paddw   mm0, mm1

    psrlw   mm0, 6
    packuswb mm0, mm3           ; 00 00 00 00 px1 px2 px3 px4
    movd    [rdi], mm0

    add     rax, rcx
    add     rdi, r11            ; i_dst_stride
    dec     rdx
    jnz     .height_loop

    mov     rax, r13            ; i_width
    sub     rax, 8
    jnz     .finish             ; width != 8 so assume 4

    ; width == 8: rax is 0 here; storing it as the width makes the check
    ; above fail after the second column, ending the routine.
    mov     r13, rax            ; i_width
    mov     rdi, r10            ; dst
    mov     rax, r8             ; src
    mov     rdx, r12            ; i_height
    add     rdi, 4
    add     rax, 4
    jmp     .height_loop

.finish:
    pop     r13
    pop     r12
    ret
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -