; sad-a.asm
%endif
    RET
%endmacro

%macro SAD_X4_END_SSE2 0
    mov     r0, r6mp
    psllq   xmm1, 32
    psllq   xmm3, 32
    paddw   xmm0, xmm1
    paddw   xmm2, xmm3
    movhlps xmm1, xmm0
    movhlps xmm3, xmm2
    paddw   xmm0, xmm1
    paddw   xmm2, xmm3
    movq    [r0+0], xmm0
    movq    [r0+8], xmm2
    RET
%endmacro

%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
    movdqa  xmm2, [r0]
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r2]
    psadbw  xmm0, xmm2
    psadbw  xmm1, xmm2
    psadbw  xmm2, [r3]
%endmacro

%macro SAD_X3_1x16P_SSE2_MISALIGN 2
    movdqa  xmm3, [r0+%1]
    movdqu  xmm4, [r1+%2]
    movdqu  xmm5, [r2+%2]
    psadbw  xmm4, xmm3
    psadbw  xmm5, xmm3
    psadbw  xmm3, [r3+%2]
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm3
%endmacro

%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
    movdqa  xmm3, [r0]
    movdqu  xmm0, [r1]
    movdqu  xmm1, [r2]
    movdqu  xmm2, [r3]
    psadbw  xmm0, xmm3
    psadbw  xmm1, xmm3
    psadbw  xmm2, xmm3
    psadbw  xmm3, [r4]
%endmacro

%macro SAD_X4_1x16P_SSE2_MISALIGN 2
    movdqa  xmm7, [r0+%1]
    movdqu  xmm4, [r1+%2]
    movdqu  xmm5, [r2+%2]
    movdqu  xmm6, [r3+%2]
    psadbw  xmm4, xmm7
    psadbw  xmm5, xmm7
    psadbw  xmm6, xmm7
    psadbw  xmm7, [r4+%2]
    paddw   xmm0, xmm4
    paddw   xmm1, xmm5
    paddw   xmm2, xmm6
    paddw   xmm3, xmm7
%endmacro

%macro SAD_X3_2x16P_SSE2_MISALIGN 1
%if %1
    SAD_X3_START_1x16P_SSE2_MISALIGN
%else
    SAD_X3_1x16P_SSE2_MISALIGN 0, 0
%endif
    SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r4]
    lea     r2, [r2+2*r4]
    lea     r3, [r3+2*r4]
%endmacro

%macro SAD_X4_2x16P_SSE2_MISALIGN 1
%if %1
    SAD_X4_START_1x16P_SSE2_MISALIGN
%else
    SAD_X4_1x16P_SSE2_MISALIGN 0, 0
%endif
    SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
    add     r0, 2*FENC_STRIDE
    lea     r1, [r1+2*r5]
    lea     r2, [r2+2*r5]
    lea     r3, [r3+2*r5]
    lea     r4, [r4+2*r5]
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                    uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
%ifdef WIN64
    %assign i %1+1
    movsxd r %+ i, r %+ i %+ d
%endif
    SAD_X%1_2x%2P_SSE2 1
%rep %3/2-1
    SAD_X%1_2x%2P_SSE2 0
%endrep
    SAD_X%1_END_SSE2
%endmacro

%macro SAD_X_SSE2_MISALIGN 4
cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
%ifdef WIN64
    %assign i %1+1
    movsxd r %+ i, r %+ i %+ d
%endif
    SAD_X%1_2x%2P_SSE2_MISALIGN 1
%rep %3/2-1
    SAD_X%1_2x%2P_SSE2_MISALIGN 0
%endrep
    SAD_X%1_END_SSE2
%endmacro

SAD_X_SSE2 3, 16, 16, sse2
SAD_X_SSE2 3, 16,  8, sse2
SAD_X_SSE2 3,  8, 16, sse2
SAD_X_SSE2 3,  8,  8, sse2
SAD_X_SSE2 3,  8,  4, sse2
SAD_X_SSE2 4, 16, 16, sse2
SAD_X_SSE2 4, 16,  8, sse2
SAD_X_SSE2 4,  8, 16, sse2
SAD_X_SSE2 4,  8,  8, sse2
SAD_X_SSE2 4,  8,  4, sse2
SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
SAD_X_SSE2_MISALIGN 3, 16,  8, sse2
SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
SAD_X_SSE2_MISALIGN 4, 16,  8, sse2

%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3
SAD_X_SSE2 3, 16,  8, sse3
SAD_X_SSE2 4, 16, 16, sse3
SAD_X_SSE2 4, 16,  8, sse3
%undef movdqu

;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel CPUs prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. Like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method makes it often slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
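; (A rough worked example of the amortization above: the cacheline-split case
;  costs 234-49 = 185 extra cycles and hits 14/64 of all mvs, i.e. about
;  185*14/64 ≈ 40 extra cycles per SAD on average.)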

; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
ALIGN 16
sad_w16_align%1_sse2:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    movdqa  xmm3, [r2]
    movdqa  xmm4, [r2+r3]
    pslldq  xmm1, 16-%1
    pslldq  xmm2, 16-%1
    psrldq  xmm3, %1
    psrldq  xmm4, %1
    por     xmm1, xmm3
    por     xmm2, xmm4
    psadbw  xmm1, [r0]
    psadbw  xmm2, [r0+r1]
    paddw   xmm0, xmm1
    paddw   xmm0, xmm2
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    dec     r4
    jg sad_w16_align%1_sse2
    ret
%endmacro

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
ALIGN 16
sad_w16_align%1_ssse3:
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw  xmm1, [r0]
    psadbw  xmm2, [r0+r1]
    paddw   xmm0, xmm1
    paddw   xmm0, xmm2
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    dec     r4
    jg sad_w16_align%1_ssse3
    ret
%endmacro

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal x264_pixel_sad_16x%2_cache64_%1
    mov     eax, r2m
    and     eax, 0x37
    cmp     eax, 0x30
    jle x264_pixel_sad_16x%2_sse2
    PROLOGUE 4,6
    mov     r4d, r2d
    and     r4d, 15
%ifidn %1, ssse3
    shl     r4d, 6  ; code size = 64
%else
    lea     r4, [r4*5]
    shl     r4d, 4  ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
    lea     r5, [sad_w16_addr GLOBAL]
    add     r5, r4
%else
    lea     r5, [sad_w16_addr + r4 GLOBAL]
%endif
    and     r2, ~15
    mov     r4d, %2/2
    pxor    xmm0, xmm0
    call    r5
    movhlps xmm1, xmm0
    paddw   xmm0, xmm1
    movd    eax, xmm0
    RET
%endmacro

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    mov     eax, r2m
    and     eax, 0x17|%1|(%4>>1)
    cmp     eax, 0x10|%1|(%4>>1)
    jle x264_pixel_sad_%1x%2_mmxext
    and     eax, 7
    shl     eax, 3
    movd    mm6, [sw_64 GLOBAL]
    movd    mm7, eax
    psubw   mm6, mm7
    PROLOGUE 4,5
    and     r2, ~7
    mov     r4d, %3
    pxor    mm0, mm0
%endmacro

%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal x264_pixel_sad_16x%1_cache%2_mmxext
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
    movq    mm1, [r2]
    movq    mm2, [r2+8]
    movq    mm3, [r2+16]
    movq    mm4, mm2
    psrlq   mm1, mm7
    psllq   mm2, mm6
    psllq   mm3, mm6
    psrlq   mm4, mm7
    por     mm1, mm2
    por     mm3, mm4
    psadbw  mm1, [r0]
    psadbw  mm3, [r0+8]
    paddw   mm0, mm1
    paddw   mm0, mm3
    add     r2, r3
    add     r0, r1
    dec     r4
    jg .loop
    movd    eax, mm0
    RET
%endmacro

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal x264_pixel_sad_8x%1_cache%2_mmxext
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
    movq    mm1, [r2+8]
    movq    mm2, [r2+r3+8]
    movq    mm3, [r2]
    movq    mm4, [r2+r3]
    psllq   mm1, mm6
    psllq   mm2, mm6
    psrlq   mm3, mm7
    psrlq   mm4, mm7
    por     mm1, mm3
    por     mm2, mm4
    psadbw  mm1, [r0]
    psadbw  mm2, [r0+r1]
    paddw   mm0, mm1
    paddw   mm0, mm2
    lea     r2, [r2+2*r3]
    lea     r0, [r0+2*r1]
    dec     r4
    jg .loop
    movd    eax, mm0
    RET
%endmacro

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    mov     eax, %1
    and     eax, 0x17|%2|(%3>>1)
    cmp     eax, 0x10|%2|(%3>>1)
    jg .split
%endmacro
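; (For example, with width 16 and a 64-byte cacheline the test above reduces to
;  (addr & 0x37) > 0x30: true exactly when the load starts in the last 16 bytes
;  of a cacheline (bits 4-5 set) and is not 8-byte aligned (any of bits 0-2
;  set), so the cheap 8-byte-aligned halfway split is deliberately not flagged.)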

%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp x264_pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
    PROLOGUE 6,7
%ifdef WIN64
    movsxd  r4, r4d
    sub     rsp, 8
%endif
    push    r3
    push    r2
    mov     r2, r1
    mov     r1, FENC_STRIDE
    mov     r3, r4
    mov     r10, r0
    mov     r11, r5
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11], eax
%ifdef WIN64
    mov     r2, [rsp]
%else
    pop     r2
%endif
    mov     r0, r10
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11+4], eax
%ifdef WIN64
    mov     r2, [rsp+8]
%else
    pop     r2
%endif
    mov     r0, r10
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11+8], eax
%ifdef WIN64
    add     rsp, 24
%endif
    RET
%else
    push    edi
    mov     edi, [esp+28]
    push    dword [esp+24]
    push    dword [esp+16]
    push    dword 16
    push    dword [esp+20]
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     ecx, [esp+32]
    mov     [edi], eax
    mov     [esp+8], ecx
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     ecx, [esp+36]
    mov     [edi+4], eax
    mov     [esp+8], ecx
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [edi+8], eax
    add     esp, 16
    pop     edi
    ret
%endif
%endmacro

%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp x264_pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
    PROLOGUE 6,7
    mov     r11, r6mp
%ifdef WIN64
    movsxd  r5, r5d
%endif
    push    r4
    push    r3
    push    r2
    mov     r2, r1
    mov     r1, FENC_STRIDE
    mov     r3, r5
    mov     r10, r0
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11], eax
%ifdef WIN64
    mov     r2, [rsp]
%else
    pop     r2
%endif
    mov     r0, r10
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11+4], eax
%ifdef WIN64
    mov     r2, [rsp+8]
%else
    pop     r2
%endif
    mov     r0, r10
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11+8], eax
%ifdef WIN64
    mov     r2, [rsp+16]
%else
    pop     r2
%endif
    mov     r0, r10
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [r11+12], eax
%ifdef WIN64
    add     rsp, 24
%endif
    RET
%else
    push    edi
    mov     edi, [esp+32]
    push    dword [esp+28]
    push    dword [esp+16]
    push    dword 16
    push    dword [esp+20]
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     ecx, [esp+32]
    mov     [edi], eax
    mov     [esp+8], ecx
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     ecx, [esp+36]
    mov     [edi+4], eax
    mov     [esp+8], ecx
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     ecx, [esp+40]
    mov     [edi+8], eax
    mov     [esp+8], ecx
    call    x264_pixel_sad_%1x%2_cache%3_%5
    mov     [edi+12], eax
    add     esp, 16
    pop     edi
    ret
%endif
%endmacro

%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1
%endmacro

; instantiate the aligned sads
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64

%ifndef ARCH_X86_64
SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16,  8, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC  8, 16, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC  8,  8, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16,  8, 64, mmxext, mmxext, mmxext
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC  8,  8, 64, mmxext, mmxext, mmxext

%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmxext, sse2

SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3
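
; (These cache32/cache64 variants are presumably not called directly; x264's
;  C-side pixel init code is expected to install them into its SAD function
;  pointer tables at run time, based on the detected CPU and cacheline size.)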