📄 mc-a.asm.svn-base
字号:
; NOTE(review): the instructions up to the first `rep ret` below are the tail
; of a routine whose label and loop head lie before this excerpt.  They are
; reproduced unchanged (only unfolded to one instruction per line).
    mov     r11d, [parm3q+parm4q]
    mov     [parm1q], r10d
    mov     [parm1q+parm2q], r11d
    lea     parm3q, [parm3q+parm4q*2]
    lea     parm1q, [parm1q+parm2q*2]
    dec     eax
    dec     eax
    jg      .height_loop
    rep ret

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride,
;                            uint8_t *src, int i_src_stride, int i_height );
;
;  Copies an 8-byte-wide block, four rows per loop iteration.
;  In:    parm1q = dst, parm2d = i_dst_stride,
;         parm3q = src, parm4d = i_src_stride, parm5d = i_height
;  Uses:  eax (rows remaining), r10 (3*src_stride), r11 (3*dst_stride),
;         mm0-mm3, flags; parm1q/parm3q are advanced.
;  Rows are always handled in groups of four; the loop repeats while the
;  remaining row count is still positive after subtracting 4.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w8_mmx
    ; The strides are C ints: the upper 32 bits of their registers are not
    ; guaranteed by the calling convention, yet they are used as 64-bit
    ; address components below.  Sign-extend them first.
    movsxd  parm2q, parm2d              ; i_dst_stride -> 64 bit
    movsxd  parm4q, parm4d              ; i_src_stride -> 64 bit
    mov     eax, parm5d                 ; i_height
    lea     r10, [parm4q+parm4q*2]      ; 3 * i_src_stride
    lea     r11, [parm2q+parm2q*2]      ; 3 * i_dst_stride
ALIGN 4
.height_loop:
    movq    mm0, [parm3q]               ; src rows 0..3
    movq    mm1, [parm3q+parm4q]
    movq    mm2, [parm3q+parm4q*2]
    movq    mm3, [parm3q+r10]
    movq    [parm1q], mm0               ; dst rows 0..3
    movq    [parm1q+parm2q], mm1
    movq    [parm1q+parm2q*2], mm2
    movq    [parm1q+r11], mm3
    lea     parm3q, [parm3q+parm4q*4]   ; src += 4 rows
    lea     parm1q, [parm1q+parm2q*4]   ; dst += 4 rows
    sub     eax, byte 4
    jg      .height_loop
    rep ret

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride,
;                             uint8_t *src, int i_src_stride, int i_height );
;
;  Copies a 16-byte-wide block, four rows per loop iteration, as two 8-byte
;  MMX moves per row.
;  In:    parm1q = dst, parm2d = i_dst_stride,
;         parm3q = src, parm4d = i_src_stride, parm5d = i_height
;  Uses:  eax (rows remaining), r10 (3*src_stride), r11 (3*dst_stride),
;         mm0-mm7, flags; parm1q/parm3q are advanced.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_mmx
    ; Sign-extend the int strides before using them in 64-bit addressing.
    movsxd  parm2q, parm2d              ; i_dst_stride -> 64 bit
    movsxd  parm4q, parm4d              ; i_src_stride -> 64 bit
    mov     eax, parm5d                 ; i_height
    lea     r10, [parm4q+parm4q*2]      ; 3 * i_src_stride
    lea     r11, [parm2q+parm2q*2]      ; 3 * i_dst_stride
ALIGN 4
.height_loop:
    movq    mm0, [parm3q]               ; row 0, low / high 8 bytes
    movq    mm1, [parm3q+8]
    movq    mm2, [parm3q+parm4q]        ; row 1
    movq    mm3, [parm3q+parm4q+8]
    movq    mm4, [parm3q+parm4q*2]      ; row 2
    movq    mm5, [parm3q+parm4q*2+8]
    movq    mm6, [parm3q+r10]           ; row 3
    movq    mm7, [parm3q+r10+8]
    movq    [parm1q], mm0
    movq    [parm1q+8], mm1
    movq    [parm1q+parm2q], mm2
    movq    [parm1q+parm2q+8], mm3
    movq    [parm1q+parm2q*2], mm4
    movq    [parm1q+parm2q*2+8], mm5
    movq    [parm1q+r11], mm6
    movq    [parm1q+r11+8], mm7
    lea     parm3q, [parm3q+parm4q*4]   ; src += 4 rows
    lea     parm1q, [parm1q+parm2q*4]   ; dst += 4 rows
    sub     eax, byte 4
    jg      .height_loop
    rep ret

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride,
;                              uint8_t *src, int i_src_stride, int i_height );
;
;  Copies a 16-byte-wide block, two rows per loop iteration, using unaligned
;  SSE2 loads and stores (movdqu).
;  In:    parm1q = dst, parm2d = i_dst_stride,
;         parm3q = src, parm4d = i_src_stride, parm5d = i_height
;  Uses:  eax (rows remaining), xmm0, xmm1, flags; parm1q/parm3q are advanced.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_sse2
    ; Sign-extend the int strides before using them in 64-bit addressing.
    movsxd  parm2q, parm2d              ; i_dst_stride -> 64 bit
    movsxd  parm4q, parm4d              ; i_src_stride -> 64 bit
    mov     eax, parm5d                 ; i_height
ALIGN 4
.height_loop:
    movdqu  xmm0, [parm3q]
    movdqu  xmm1, [parm3q+parm4q]
    movdqu  [parm1q], xmm0
    movdqu  [parm1q+parm2q], xmm1
    sub     eax, byte 2
    lea     parm3q, [parm3q+parm4q*2]   ; lea leaves the flags from `sub` intact
    lea     parm1q, [parm1q+parm2q*2]
    jg      .height_loop
    rep ret

;=============================================================================
; chroma MC
;=============================================================================

;-----------------------------------------------------------------------------
;  void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
;                              uint8_t *dst, int i_dst_stride,
;                              int dx, int dy,
;                              int i_width, int i_height );
;
;  Chroma interpolation with 1/8-pel offsets.
;    src is first advanced by (dx>>3) + (dy>>3)*i_src_stride, then dx and dy
;    are reduced to their low three bits.
;    * Both fractions non-zero: each output pixel is
;        (cA*p[0] + cB*p[1] + cC*p[stride] + cD*p[stride+1] + 32) >> 6
;      with cA=(8-dx)(8-dy), cB=dx(8-dy), cC=(8-dx)dy, cD=dx*dy.  Four pixels
;      are produced per row; for i_width == 8 the column loop is run a second
;      time at byte offset +4.
;    * Otherwise (.mc1d): a two-tap filter along the single direction that
;      has a fraction d:  ((8-d)*p[0] + d*p[pel_offset] + 4) >> 3.
;  Uses:  rax, r10, r11, pel_offset (r9, or rsi when WIN64 is defined),
;         mm0-mm7, flags; parm5d/parm6d/parm7d are modified in place.
;-----------------------------------------------------------------------------
cglobal x264_mc_chroma_mmxext
    ; Sign-extend the int strides: they are used as 64-bit address
    ; components ([rax+parm2q], add r10, parm4q, ...) further down.
    movsxd  parm2q, parm2d              ; i_src_stride -> 64 bit
    movsxd  parm4q, parm4d              ; i_dst_stride -> 64 bit
    mov     r10d, parm6d
    mov     r11d, parm5d
    sar     r10d, 3                     ; dy >> 3
    sar     r11d, 3                     ; dx >> 3
    imul    r10d, parm2d
    pxor    mm3, mm3                    ; mm3 = 0, used for byte->word unpacking
    add     r10d, r11d
    movsxd  r10, r10d
    mov     r11d, parm8d                ; r11d = i_height (row counter)
    add     parm1q, r10                 ; src += (dx>>3) + (dy>>3) * src_stride
    and     parm5d, 7                   ; dx &= 7
    je      .mc1d
    and     parm6d, 7                   ; dy &= 7
    je      .mc1d

    movd    mm0, parm5d
    movd    mm1, parm6d
    pshufw  mm5, mm0, 0                 ; mm5 = dx
    pshufw  mm6, mm1, 0                 ; mm6 = dy
    movq    mm4, [pw_8 GLOBAL]
    movq    mm0, mm4
    psubw   mm4, mm5                    ; mm4 = 8-dx
    psubw   mm0, mm6                    ; mm0 = 8-dy
    movq    mm7, mm5
    pmullw  mm5, mm0                    ; mm5 = dx*(8-dy) =     cB
    pmullw  mm7, mm6                    ; mm7 = dx*dy =         cD
    pmullw  mm6, mm4                    ; mm6 = (8-dx)*dy =     cC
    pmullw  mm4, mm0                    ; mm4 = (8-dx)*(8-dy) = cA

    mov     rax, parm1q                 ; rax = working src pointer
    mov     r10, parm3q                 ; r10 = working dst pointer

ALIGN 4
.height_loop:
    movd    mm1, [rax+parm2q]
    movd    mm0, [rax]
    punpcklbw mm1, mm3                  ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw mm0, mm3
    pmullw  mm1, mm6                    ; 2nd line * cC
    pmullw  mm0, mm4                    ; 1st line * cA
    paddw   mm0, mm1                    ; mm0 <- result

    movd    mm2, [rax+1]
    movd    mm1, [rax+parm2q+1]
    punpcklbw mm2, mm3
    punpcklbw mm1, mm3

    paddw   mm0, [pw_32 GLOBAL]         ; rounding term for the >> 6
    pmullw  mm2, mm5                    ; line * cB
    pmullw  mm1, mm7                    ; line * cD
    paddw   mm0, mm2
    paddw   mm0, mm1
    psrlw   mm0, 6

    packuswb mm0, mm3                   ; 00 00 00 00 px1 px2 px3 px4
    movd    [r10], mm0

    add     rax, parm2q
    add     r10, parm4q                 ; i_dst_stride
    dec     r11d
    jnz     .height_loop

    ; parm7d becomes 0 only when i_width was 8; on the second pass it is
    ; already 0, so the subtraction yields non-zero and the routine exits.
    sub     parm7d, 8
    jnz     .finish                     ; width != 8 so assume 4

    mov     r10, parm3q                 ; dst
    mov     rax, parm1q                 ; src
    mov     r11d, parm8d                ; i_height
    add     r10, 4
    add     rax, 4
    jmp     .height_loop

.finish:
    rep ret

ALIGN 4
.mc1d:
; NOTE(review): rsi is callee-saved in the Microsoft x64 convention; whether
; cglobal or the caller preserves it is not visible in this file -- confirm.
%ifdef WIN64
%define pel_offset rsi
%else
%define pel_offset r9
%endif
    ; parm6d may not have been masked yet when we arrive from the first
    ; `je .mc1d`, hence the explicit `and eax, 7` on the combined value.
    mov     eax, parm5d
    or      eax, parm6d
    and     eax, 7                      ; eax = the single fractional offset d
    cmp     parm5d, 0
    mov     pel_offset, 1               ; mov/cmov do not disturb the flags from cmp
    cmove   pel_offset, parm2q          ; pel_offset = dx ? 1 : src_stride
    movd    mm6, eax
    movq    mm5, [pw_8 GLOBAL]
    pshufw  mm6, mm6, 0                 ; mm6 = d
    movq    mm7, [pw_4 GLOBAL]          ; mm7 = rounding term for the >> 3
    psubw   mm5, mm6                    ; mm5 = 8-d

    cmp     parm7d, 8
    je      .height_loop1_w8

ALIGN 4
.height_loop1_w4:
    movd    mm0, [parm1q+pel_offset]
    movd    mm1, [parm1q]
    punpcklbw mm0, mm3
    punpcklbw mm1, mm3
    pmullw  mm0, mm6                    ; neighbour * d
    pmullw  mm1, mm5                    ; pixel * (8-d)
    paddw   mm0, mm7
    paddw   mm0, mm1
    psrlw   mm0, 3
    packuswb mm0, mm3
    movd    [parm3q], mm0
    add     parm1q, parm2q
    add     parm3q, parm4q
    dec     r11d
    jnz     .height_loop1_w4
    rep ret

ALIGN 4
.height_loop1_w8:
    movq    mm0, [parm1q+pel_offset]
    movq    mm1, [parm1q]
    movq    mm2, mm0
    movq    mm4, mm1
    punpcklbw mm0, mm3                  ; low four pixels
    punpcklbw mm1, mm3
    punpckhbw mm2, mm3                  ; high four pixels
    punpckhbw mm4, mm3
    pmullw  mm0, mm6
    pmullw  mm1, mm5
    pmullw  mm2, mm6
    pmullw  mm4, mm5
    paddw   mm0, mm7
    paddw   mm2, mm7
    paddw   mm0, mm1
    paddw   mm2, mm4
    psrlw   mm0, 3
    psrlw   mm2, 3
    packuswb mm0, mm2
    movq    [parm3q], mm0
    add     parm1q, parm2q
    add     parm3q, parm4q
    dec     r11d
    jnz     .height_loop1_w8
    rep ret

;-----------------------------------------------------------------------------
;  void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                  uint8_t *pix_uv, int stride_uv, int mb_x );
;
;  Issues prefetcht0 hints for four rows starting at
;      pix_y  + (mb_x & 3) * stride_y * 4 + 64
;  and for two rows starting at
;      pix_uv + (mb_x & 6) * stride_uv + 64.
;  Uses:  rax, flags; parm1q/parm3q are modified.  No memory is written.
;-----------------------------------------------------------------------------
cglobal x264_prefetch_fenc_mmxext
    mov     eax, parm5d
    and     eax, 3
    imul    eax, parm2d
    lea     parm1q, [parm1q+rax*4+64]
    prefetcht0 [parm1q]
    prefetcht0 [parm1q+parm2q]
    lea     parm1q, [parm1q+parm2q*2]
    prefetcht0 [parm1q]
    prefetcht0 [parm1q+parm2q]

    mov     eax, parm5d
    and     eax, 6
    imul    eax, parm4d
    lea     parm3q, [parm3q+rax+64]
    prefetcht0 [parm3q]
    prefetcht0 [parm3q+parm4q]
    ret

;-----------------------------------------------------------------------------
;  void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity );
;
;  Issues prefetcht0 hints for eight consecutive rows starting at
;      pix + ((parity - 1) & stride) * 8 + 64.
;  Uses:  rax (3*stride), flags; parm1q/parm3d are modified.  No memory is
;  written.
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext
    dec     parm3d
    and     parm3d, parm2d              ; (parity-1) & stride
    lea     parm1q, [parm1q+parm3q*8+64]
    lea     rax, [parm2q*3]
    prefetcht0 [parm1q]
    prefetcht0 [parm1q+parm2q]
    prefetcht0 [parm1q+parm2q*2]
    prefetcht0 [parm1q+rax]
    lea     parm1q, [parm1q+parm2q*4]
    prefetcht0 [parm1q]
    prefetcht0 [parm1q+parm2q]
    prefetcht0 [parm1q+parm2q*2]
    prefetcht0 [parm1q+rax]
    ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -