; mc-a.asm
x264_pixel_avg2_w20_cache_mmxext:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    AVG_CACHELINE_LOOP 16, movd
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg     .height_loop
    REP_RET

%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK  8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
%endif
AVG_CACHELINE_CHECK  8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

;=============================================================================
; pixel copy
;=============================================================================

%macro COPY4 4
    %2  m0, [r2]
    %2  m1, [r2+r3]
    %2  m2, [r2+r3*2]
    %2  m3, [r2+%4]
    %1  [r0],      m0
    %1  [r0+r1],   m1
    %1  [r0+r1*2], m2
    %1  [r0+%3],   m3
%endmacro

INIT_MMX
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                           uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx, 4,6
    cmp     dword r4m, 4
    lea     r5, [r3*3]
    lea     r4, [r1*3]
    je .end
    COPY4 movd, movd, r4, r5
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
.end:
    COPY4 movd, movd, r4, r5
    RET

cglobal x264_mc_copy_w8_mmx, 5,7
    lea     r6, [r3*3]
    lea     r5, [r1*3]
.height_loop:
    COPY4 movq, movq, r5, r6
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub     r4d, 4
    jg      .height_loop
    REP_RET

cglobal x264_mc_copy_w16_mmx, 5,7
    lea     r6, [r3*3]
    lea     r5, [r1*3]
.height_loop:
    movq    mm0, [r2]
    movq    mm1, [r2+8]
    movq    mm2, [r2+r3]
    movq    mm3, [r2+r3+8]
    movq    mm4, [r2+r3*2]
    movq    mm5, [r2+r3*2+8]
    movq    mm6, [r2+r6]
    movq    mm7, [r2+r6+8]
    movq    [r0], mm0
    movq    [r0+8], mm1
    movq    [r0+r1], mm2
    movq    [r0+r1+8], mm3
    movq    [r0+r1*2], mm4
    movq    [r0+r1*2+8], mm5
    movq    [r0+r5], mm6
    movq    [r0+r5+8], mm7
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub     r4d, 4
    jg      .height_loop
    REP_RET

INIT_XMM
%macro COPY_W16_SSE2 2
cglobal %1, 5,7
    lea     r6, [r3*3]
    lea     r5, [r1*3]
.height_loop:
    COPY4 movdqa, %2, r5, r6
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub     r4d, 4
    jg      .height_loop
    REP_RET
%endmacro

COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa

;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines

;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
cglobal x264_prefetch_fenc_mmxext, 5,5
    mov    eax, r4d
    and    eax, 3
    imul   eax, r1d
    lea    r0, [r0+rax*4+64]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    lea    r0, [r0+r1*2]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]

    and    r4d, 6
    imul   r4d, r3d
    lea    r2, [r2+r4+64]
    prefetcht0  [r2]
    prefetcht0  [r2+r3]
    RET

%else
cglobal x264_prefetch_fenc_mmxext
    mov    r2, [esp+20]
    mov    r1, [esp+8]
    mov    r0, [esp+4]
    and    r2, 3
    imul   r2, r1
    lea    r0, [r0+r2*4+64]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    lea    r0, [r0+r1*2]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]

    mov    r2, [esp+20]
    mov    r1, [esp+16]
    mov    r0, [esp+12]
    and    r2, 6
    imul   r2, r1
    lea    r0, [r0+r2+64]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    ret
%endif ; ARCH_X86_64
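;-----------------------------------------------------------------------------
; Rough C equivalent of the fenc prefetch above (an illustrative sketch, not
; part of the original file). It assumes the 64-byte cachelines noted in the
; FIXME, and uses gcc's __builtin_prefetch in place of prefetcht0:
;
;   static void prefetch_fenc( uint8_t *pix_y, int stride_y,
;                              uint8_t *pix_uv, int stride_uv, int mb_x )
;   {
;       /* 4 luma rows of the current macroblock, one cacheline ahead */
;       uint8_t *y = pix_y + (mb_x&3) * stride_y * 4 + 64;
;       for( int i = 0; i < 4; i++ )
;           __builtin_prefetch( y + i*stride_y );
;       /* 2 chroma rows, one cacheline ahead */
;       uint8_t *uv = pix_uv + (mb_x&6) * stride_uv + 64;
;       __builtin_prefetch( uv );
;       __builtin_prefetch( uv + stride_uv );
;   }
;-----------------------------------------------------------------------------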
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext, 3,3
    dec    r2d
    and    r2d, r1d
    lea    r0, [r0+r2*8+64]
    lea    r2, [r1*3]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    lea    r0, [r0+r1*4]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    RET

;=============================================================================
; chroma MC
;=============================================================================

%define t0 rax
%ifdef ARCH_X86_64
    %define t1 r10
%else
    %define t1 r1
%endif

%macro MC_CHROMA_START 0
    movifnidn r2,  r2mp
    movifnidn r3d, r3m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
    mov       t0d, r5d
    mov       t1d, r4d
    sar       t0d, 3
    sar       t1d, 3
    imul      t0d, r3d
    add       t0d, t1d
    movsxdifnidn t0, t0d
    add       r2,  t0            ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro

;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
;                             uint8_t *src, int src_stride,
;                             int dx, int dy,
;                             int width, int height )
;-----------------------------------------------------------------------------
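; For reference, a C sketch of the bilinear filter the macro below implements,
; pieced together from its own weight comments (cA..cD, +32, >>6). Names and
; layout here are illustrative only, not the project's C reference:
;
;   static void mc_chroma_c( uint8_t *dst, int dst_stride,
;                            uint8_t *src, int src_stride,
;                            int dx, int dy, int width, int height )
;   {
;       src += (dy>>3) * src_stride + (dx>>3);
;       int x = dx & 7, y = dy & 7;
;       int cA = (8-x)*(8-y), cB = x*(8-y), cC = (8-x)*y, cD = x*y;
;       for( int i = 0; i < height; i++, src += src_stride, dst += dst_stride )
;           for( int j = 0; j < width; j++ )
;               dst[j] = ( cA*src[j]            + cB*src[j+1]
;                        + cC*src[j+src_stride] + cD*src[j+src_stride+1]
;                        + 32 ) >> 6;
;   }
;
; When dx or dy is 0 the filter degenerates to 1-D (the .mc1dx/.mc1dy paths
; below), with weights (8-k) and k and rounding (sum+4)>>3.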
%macro MC_CHROMA 1-2 0
cglobal x264_mc_chroma_%1
%if mmsize == 16
    cmp dword r6m, 4
    jle x264_mc_chroma_mmxext
%endif
    PROLOGUE 0,6,%2
    MC_CHROMA_START
    pxor       m3, m3
    and        r4d, 7         ; dx &= 7
    jz .mc1dy
    and        r5d, 7         ; dy &= 7
    jz .mc1dx

    movd       m5, r4d
    movd       m6, r5d
    SPLATW     m5, m5         ; m5 = dx
    SPLATW     m6, m6         ; m6 = dy

    mova       m4, [pw_8 GLOBAL]
    mova       m0, m4
    psubw      m4, m5         ; m4 = 8-dx
    psubw      m0, m6         ; m0 = 8-dy

    mova       m7, m5
    pmullw     m5, m0         ; m5 = dx*(8-dy) =     cB
    pmullw     m7, m6         ; m7 = dx*dy =         cD
    pmullw     m6, m4         ; m6 = (8-dx)*dy =     cC
    pmullw     m4, m0         ; m4 = (8-dx)*(8-dy) = cA

    mov        r4d, r7m
%ifdef ARCH_X86_64
    mov        r10, r0
    mov        r11, r2
%else
    mov        r0, r0mp
    mov        r1, r1m
    mov        r5, r2
%endif

.loop2d:
    movh       m1, [r2+r3]
    movh       m0, [r2]
    punpcklbw  m1, m3         ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw  m0, m3
    pmullw     m1, m6         ; 2nd line * cC
    pmullw     m0, m4         ; 1st line * cA
    paddw      m0, m1         ; m0 <- result

    movh       m2, [r2+1]
    movh       m1, [r2+r3+1]
    punpcklbw  m2, m3
    punpcklbw  m1, m3

    paddw      m0, [pw_32 GLOBAL]

    pmullw     m2, m5         ; line * cB
    pmullw     m1, m7         ; line * cD
    paddw      m0, m2
    paddw      m0, m1
    psrlw      m0, 6

    packuswb   m0, m3         ; 00 00 00 00 px1 px2 px3 px4
    movh       [r0], m0

    add        r2, r3
    add        r0, r1         ; dst_stride
    dec        r4d
    jnz .loop2d

%if mmsize == 8
    sub        dword r6m, 8
    jnz .finish               ; width != 8 so assume 4
%ifdef ARCH_X86_64
    lea        r0, [r10+4]    ; dst
    lea        r2, [r11+4]    ; src
%else
    mov        r0, r0mp
    lea        r2, [r5+4]
    add        r0, 4
%endif
    mov        r4d, r7m       ; height
    jmp .loop2d
%else
    REP_RET
%endif ; mmsize

.mc1dy:
    and        r5d, 7
    movd       m6, r5d
    mov        r5, r3         ; pel_offset = dx ? 1 : src_stride
    jmp .mc1d
.mc1dx:
    movd       m6, r4d
    mov        r5d, 1
.mc1d:
    mova       m5, [pw_8 GLOBAL]
    SPLATW     m6, m6
    mova       m7, [pw_4 GLOBAL]
    psubw      m5, m6
    movifnidn  r0, r0mp
    movifnidn  r1d, r1m
    mov        r4d, r7m
%if mmsize == 8
    cmp dword  r6m, 8
    je .loop1d_w8
%endif

.loop1d_w4:
    movh       m0, [r2+r5]
    movh       m1, [r2]
    punpcklbw  m0, m3
    punpcklbw  m1, m3
    pmullw     m0, m6
    pmullw     m1, m5
    paddw      m0, m7
    paddw      m0, m1
    psrlw      m0, 3
    packuswb   m0, m3
    movh       [r0], m0
    add        r2, r3
    add        r0, r1
    dec        r4d
    jnz .loop1d_w4
.finish:
    REP_RET

%if mmsize == 8
.loop1d_w8:
    movu       m0, [r2+r5]
    mova       m1, [r2]
    mova       m2, m0
    mova       m4, m1
    punpcklbw  m0, m3
    punpcklbw  m1, m3
    punpckhbw  m2, m3
    punpckhbw  m4, m3
    pmullw     m0, m6
    pmullw     m1, m5
    pmullw     m2, m6
    pmullw     m4, m5
    paddw      m0, m7
    paddw      m2, m7
    paddw      m0, m1
    paddw      m2, m4
    psrlw      m0, 3
    psrlw      m2, 3
    packuswb   m0, m2
    mova       [r0], m0
    add        r2, r3
    add        r0, r1
    dec        r4d
    jnz .loop1d_w8
    REP_RET
%endif ; mmsize
%endmacro ; MC_CHROMA

INIT_MMX
MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2, 8

INIT_MMX
cglobal x264_mc_chroma_ssse3, 0,6,8
    MC_CHROMA_START
    and       r4d, 7
    and       r5d, 7
    mov       t0d, r4d
    shl       t0d, 8
    sub       t0d, r4d
    mov       r4d, 8
    add       t0d, 8
    sub       r4d, r5d
    imul      r5d, t0d         ; (x*255+8)*y
    imul      r4d, t0d         ; (x*255+8)*(8-y)
    cmp dword r6m, 4
    jg .width8
    mova       m5, [pw_32 GLOBAL]
    movd       m6, r5d
    movd       m7, r4d
    movifnidn  r0, r0mp
    movifnidn  r1d, r1m
    movifnidn  r4d, r7m
    SPLATW     m6, m6
    SPLATW     m7, m7
    movh       m0, [r2]
    punpcklbw  m0, [r2+1]
    add        r2, r3
.loop4:
    movh       m1, [r2]
    movh       m3, [r2+r3]
    punpcklbw  m1, [r2+1]
    punpcklbw  m3, [r2+r3+1]
    lea        r2, [r2+2*r3]
    mova       m2, m1
    mova       m4, m3
    pmaddubsw  m0, m7
    pmaddubsw  m1, m6
    pmaddubsw  m2, m7
    pmaddubsw  m3, m6
    paddw      m0, m5
    paddw      m2, m5
    paddw      m1, m0
    paddw      m3, m2
    mova       m0, m4
    psrlw      m1, 6
    psrlw      m3, 6
    packuswb   m1, m1
    packuswb   m3, m3
    movh       [r0], m1
    movh       [r0+r1], m3
    sub        r4d, 2
    lea        r0, [r0+2*r1]
    jg .loop4
    REP_RET

INIT_XMM
.width8:
    mova       m5, [pw_32 GLOBAL]
    movd       m6, r5d
    movd       m7, r4d
    movifnidn  r0, r0mp
    movifnidn  r1d, r1m
    movifnidn  r4d, r7m
    SPLATW     m6, m6
    SPLATW     m7, m7
    movh       m0, [r2]
    movh       m1, [r2+1]
    punpcklbw  m0, m1
    add        r2, r3
.loop8:
    movh       m1, [r2]
    movh       m2, [r2+1]
    movh       m3, [r2+r3]
    movh       m4, [r2+r3+1]
    punpcklbw  m1, m2
    punpcklbw  m3, m4
    lea        r2, [r2+2*r3]
    mova       m2, m1
    mova       m4, m3
    pmaddubsw  m0, m7
    pmaddubsw  m1, m6
    pmaddubsw  m2, m7
    pmaddubsw  m3, m6
    paddw      m0, m5
    paddw      m2, m5
    paddw      m1, m0
    paddw      m3, m2
    mova       m0, m4
    psrlw      m1, 6
    psrlw      m3, 6
    packuswb   m1, m3
    movh       [r0], m1
    movhps     [r0+r1], m1
    sub        r4d, 2
    lea        r0, [r0+2*r1]
    jg .loop8
    REP_RET
; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
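; A note on the packed ssse3 weights above (editorial sketch, not from the
; original file): t0d = x*255+8 equals (x<<8) + (8-x), i.e. both horizontal
; weights in one 16-bit word, so a single imul yields a byte pair ready for
; pmaddubsw:
;
;   int t  = x*255 + 8;   /* == (x<<8) | (8-x), since 8-x < 256            */
;   int w0 = t*(8-y);     /* low byte (8-x)*(8-y)=cA, high byte x*(8-y)=cB */
;   int w1 = t*y;         /* low byte (8-x)*y=cC,     high byte x*y=cD     */
;
; e.g. x=3, y=2: t = 773 = 0x0305; 773*6 = 0x121E -> cA=0x1E=30, cB=0x12=18;
; 773*2 = 0x060A -> cC=0x0A=10, cD=0x06=6. After SPLATW, pmaddubsw on pixels
; interleaved as [p, p+1] then computes cA*p[j] + cB*p[j+1] per output word
; in one instruction, in place of two pmullw and a paddw.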