📄 mc-a.asm.svn-base
字号:
; NOTE(review): the loop below is the tail of a routine whose entry point lies
; before this chunk; register setup is not visible here. Each iteration invokes
; BIWEIGHT_4P_MMX on two rows (at byte offsets 0 and +4 of each row), advances
; both pointers by two strides and consumes two rows of the counter in eax.
ALIGN 4
.height_loop
    BIWEIGHT_4P_MMX [edi      ], [edx      ]
    BIWEIGHT_4P_MMX [edi+4    ], [edx+4    ]
    BIWEIGHT_4P_MMX [edi+esi  ], [edx+ecx  ]
    BIWEIGHT_4P_MMX [edi+esi+4], [edx+ecx+4]
    lea     edi, [edi+esi*2]
    lea     edx, [edx+ecx*2]
    sub     eax, byte 2
    jg      .height_loop
    BIWEIGHT_END_MMX

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
;
;   Fully unrolled 4-row variant: BIWEIGHT_4P_MMX is applied once per row.
;   Register setup and teardown come from BIWEIGHT_START_MMX / BIWEIGHT_END_MMX,
;   which are defined outside this chunk.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_4x4_mmxext
    BIWEIGHT_START_MMX
    BIWEIGHT_4P_MMX [edi      ], [edx      ]    ; row 0
    BIWEIGHT_4P_MMX [edi+esi  ], [edx+ecx  ]    ; row 1
    BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2]    ; row 2
    add     edi, esi                            ; step one row so that base+stride*2 addresses row 3
    add     edx, ecx
    BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2]    ; row 3
    BIWEIGHT_END_MMX



;=============================================================================
; pixel copy
;=============================================================================

; Stack layout shared by the x264_mc_copy_* routines: three 4-byte pushes plus
; the return address place the first stack argument at [esp+16].

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                            uint8_t *src, int i_src_stride, int i_height )
;
;  Copies 4 bytes per row, two rows per iteration. The loop body always runs
;  at least once and consumes i_height two rows at a time.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx
    push    ebx
    push    esi
    push    edi

    mov     esi, [esp+24]       ; src
    mov     edi, [esp+16]       ; dst
    mov     ebx, [esp+28]       ; i_src_stride
    mov     edx, [esp+20]       ; i_dst_stride
    mov     ecx, [esp+32]       ; i_height
ALIGN 4
.height_loop
    mov     eax, [esi]
    mov     [edi], eax
    mov     eax, [esi+ebx]
    mov     [edi+edx], eax
    lea     esi, [esi+ebx*2]
    lea     edi, [edi+edx*2]
    dec     ecx
    dec     ecx
    jg      .height_loop

    pop     edi
    pop     esi
    pop     ebx
    ret

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride,
;                            uint8_t *src, int i_src_stride, int i_height )
;
;  Copies 8 bytes per row with movq, four rows per iteration.
;  No emms is executed in this routine.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w8_mmx
    push    ebx
    push    esi
    push    edi

    mov     esi, [esp+24]       ; src
    mov     edi, [esp+16]       ; dst
    mov     ebx, [esp+28]       ; i_src_stride
    mov     edx, [esp+20]       ; i_dst_stride
    mov     ecx, [esp+32]       ; i_height
ALIGN 4
.height_loop
    movq    mm0, [esi]
    movq    [edi], mm0
    movq    mm1, [esi+ebx]
    movq    [edi+edx], mm1
    movq    mm2, [esi+ebx*2]
    movq    [edi+edx*2], mm2
    lea     esi, [esi+ebx*2]    ; advance two rows; the fourth row is then base+stride
    lea     edi, [edi+edx*2]
    movq    mm3, [esi+ebx]
    movq    [edi+edx], mm3
    lea     esi, [esi+ebx*2]
    lea     edi, [edi+edx*2]

    sub     ecx, byte 4
    jg      .height_loop

    pop     edi
    pop     esi
    pop     ebx
    ret

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride,
;                             uint8_t *src, int i_src_stride, int i_height )
;
;  Copies 16 bytes per row as two movq halves, four rows per iteration.
;  No emms is executed in this routine.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_mmx
    push    ebx
    push    esi
    push    edi

    mov     esi, [esp+24]       ; src
    mov     edi, [esp+16]       ; dst
    mov     ebx, [esp+28]       ; i_src_stride
    mov     edx, [esp+20]       ; i_dst_stride
    mov     ecx, [esp+32]       ; i_height

ALIGN 4
.height_loop
    movq    mm0, [esi]
    movq    mm1, [esi+8]
    movq    [edi], mm0
    movq    [edi+8], mm1
    movq    mm2, [esi+ebx]
    movq    mm3, [esi+ebx+8]
    movq    [edi+edx], mm2
    movq    [edi+edx+8], mm3
    movq    mm4, [esi+ebx*2]
    movq    mm5, [esi+ebx*2+8]
    movq    [edi+edx*2], mm4
    movq    [edi+edx*2+8], mm5
    lea     esi, [esi+ebx*2]    ; advance two rows; the fourth row is then base+stride
    lea     edi, [edi+edx*2]
    movq    mm6, [esi+ebx]
    movq    mm7, [esi+ebx+8]
    movq    [edi+edx], mm6
    movq    [edi+edx+8], mm7
    lea     esi, [esi+ebx*2]
    lea     edi, [edi+edx*2]
    sub     ecx, byte 4
    jg      .height_loop

    pop     edi
    pop     esi
    pop     ebx
    ret


;-----------------------------------------------------------------------------
;  void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride,
;                              uint8_t *src, int i_src_stride, int i_height )
;
;  Copies 16 bytes per row with unaligned movdqu loads and stores,
;  two rows per iteration.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_sse2
    push    ebx
    push    esi
    push    edi

    mov     esi, [esp+24]       ; src
    mov     edi, [esp+16]       ; dst
    mov     ebx, [esp+28]       ; i_src_stride
    mov     edx, [esp+20]       ; i_dst_stride
    mov     ecx, [esp+32]       ; i_height

ALIGN 4
.height_loop
    movdqu  xmm0, [esi]
    movdqu  xmm1, [esi+ebx]
    movdqu  [edi], xmm0
    movdqu  [edi+edx], xmm1
    dec     ecx
    dec     ecx
    lea     esi, [esi+ebx*2]    ; lea leaves EFLAGS untouched, so jg still tests the second dec
    lea     edi, [edi+edx*2]
    jg      .height_loop

    pop     edi
    pop     esi
    pop     ebx
    ret



;=============================================================================
; chroma MC
;=============================================================================

;-----------------------------------------------------------------------------
;   void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
;                               uint8_t *dst, int i_dst_stride,
;                               int dx, int dy,
;                               int i_width, int i_height )
;
;   Bilinear interpolation, four pixels per row per pass:
;     dst[x] = ( cA*src[x]        + cB*src[x+1]
;              + cC*src[x+stride] + cD*src[x+stride+1] + 32 ) >> 6
;   with cA=(8-dx)(8-dy), cB=dx(8-dy), cC=(8-dx)dy, cD=dx*dy (dx, dy taken & 7).
;
;   The src and i_width stack arguments are modified in place. After the first
;   pass i_width has 8 subtracted; a zero result (width 8) triggers a second
;   pass over the columns 4 bytes to the right, any other result ends the call.
;
;   NOTE(review): picpush/picgetgot/picesp/picpop/GOT_ebx and the constants
;   pw_8/pw_32 are defined outside this chunk; the extra "+4" in every
;   [picesp+4+N] operand presumably accounts for the `push edi` -- confirm.
;-----------------------------------------------------------------------------
cglobal x264_mc_chroma_mmxext
    picpush ebx
    picgetgot ebx
    push    edi

    mov     ecx, [picesp+4+24]  ; dy
    mov     edx, [picesp+4+20]  ; dx
    mov     eax, ecx
    mov     edi, edx
    sar     ecx, 3
    sar     edx, 3
    imul    ecx, [picesp+4+8]
    add     ecx, edx
    add     [picesp+4+4], ecx   ; src += (dx>>3) + (dy>>3) * src_stride

    pxor    mm3, mm3            ; mm3 = 0, used for byte->word unpacking and packing

    and     edi, 7
    and     eax, 7
    movd    mm5, edi
    movd    mm6, eax
    pshufw  mm5, mm5, 0         ; mm5 = dx&7
    pshufw  mm6, mm6, 0         ; mm6 = dy&7

    movq    mm4, [pw_8 GOT_ebx]
    movq    mm0, mm4

    psubw   mm4, mm5            ; mm4 = 8-dx
    psubw   mm0, mm6            ; mm0 = 8-dy

    movq    mm7, mm5
    pmullw  mm5, mm0            ; mm5 = dx*(8-dy) =     cB
    pmullw  mm7, mm6            ; mm7 = dx*dy =         cD
    pmullw  mm6, mm4            ; mm6 = (8-dx)*dy =     cC
    pmullw  mm4, mm0            ; mm4 = (8-dx)*(8-dy) = cA

    mov     eax, [picesp+4+4]   ; src
    mov     edi, [picesp+4+12]  ; dst
    mov     ecx, [picesp+4+8]   ; i_src_stride
    mov     edx, [picesp+4+32]  ; i_height

ALIGN 4
.height_loop

    movd    mm1, [eax+ecx]
    movd    mm0, [eax]
    punpcklbw mm1, mm3          ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw mm0, mm3
    pmullw  mm1, mm6            ; 2nd line * cC
    pmullw  mm0, mm4            ; 1st line * cA

    paddw   mm0, mm1            ; mm0 <- result

    movd    mm2, [eax+1]
    movd    mm1, [eax+ecx+1]
    punpcklbw mm2, mm3
    punpcklbw mm1, mm3

    paddw   mm0, [pw_32 GOT_ebx]

    pmullw  mm2, mm5            ; line * cB
    pmullw  mm1, mm7            ; line * cD
    paddw   mm0, mm2
    paddw   mm0, mm1

    psrlw   mm0, 6
    packuswb mm0, mm3           ; 00 00 00 00 px1 px2 px3 px4
    movd    [edi], mm0

    add     eax, ecx
    add     edi, [picesp+4+16]  ; dst += i_dst_stride

    dec     edx
    jnz     .height_loop

    sub     [picesp+4+28], dword 8
    jnz     .finish             ; width != 8 so assume 4

    mov     edi, [picesp+4+12]  ; dst
    mov     eax, [picesp+4+4]   ; src
    mov     edx, [picesp+4+32]  ; i_height
    add     edi, 4
    add     eax, 4
    jmp     .height_loop

.finish
    pop     edi
    picpop  ebx
    ret



; prefetches tuned for 64 byte cachelines (K7/K8/Core2)
; TODO add 32 and 128 byte versions for P3/P4

;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;
; Issues prefetcht0 for four consecutive rows starting at
;   pix_y  + (mb_x&3)*stride_y*4 + 64
; and for two consecutive rows starting at
;   pix_uv + (mb_x&6)*stride_uv  + 64
; No registers are saved; only eax, ecx and edx are written.
;-----------------------------------------------------------------------------
cglobal x264_prefetch_fenc_mmxext
    mov     eax, [esp+20]       ; mb_x
    mov     ecx, [esp+8]        ; stride_y
    mov     edx, [esp+4]        ; pix_y
    and     eax, 3
    imul    eax, ecx
    lea     edx, [edx+eax*4+64]
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]
    lea     edx, [edx+ecx*2]
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]

    mov     eax, [esp+20]       ; mb_x
    mov     ecx, [esp+16]       ; stride_uv
    mov     edx, [esp+12]       ; pix_uv
    and     eax, 6
    imul    eax, ecx
    lea     edx, [edx+eax+64]
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]
    ret

;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;
; Issues prefetcht0 for eight consecutive rows starting at
;   pix + ((parity-1) & stride)*8 + 64
; so parity == 1 starts at row 0 and parity == 0 starts eight rows further down.
; No registers are saved; only eax, ecx and edx are written.
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext
    mov     eax, [esp+12]       ; parity
    mov     ecx, [esp+8]        ; stride
    mov     edx, [esp+4]        ; pix
    sub     eax, 1
    and     eax, ecx            ; (parity-1) & stride: 0 when parity==1, stride when parity==0
    lea     edx, [edx+eax*8+64]
    lea     eax, [ecx*3]        ; eax = 3*stride, reused for the 4th row of each group
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]
    prefetcht0 [edx+ecx*2]
    prefetcht0 [edx+eax]
    lea     edx, [edx+ecx*4]
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]
    prefetcht0 [edx+ecx*2]
    prefetcht0 [edx+eax]
    ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -