📄 memcopy.c
字号:
// memcopy.c : MMX/SSE inline-assembly helpers for filling and copying memory
// (memfill, memcopy) and for copying a frame with a 16-pixel replicated border.
//#ifdef USEMMX
void memfill(void *dst, int n32, unsigned long i)
{
__asm {
movd mm0, n32
PUNPCKLBW mm0,mm0
PUNPCKLWD mm0,mm0
PUNPCKLDQ mm0,mm0
mov edi, dst
mov eax,i
xor ecx,ecx
loopwrite1:
add ecx,32
cmp ecx,eax
jg loopdone
movntq 0[edi], mm0
movntq 8[edi], mm0
movntq 16[edi], mm0
movntq 24[edi], mm0
add edi, 32
jmp loopwrite1
add ecx,32
cmp ecx,eax
jg loopdone
movntq 0[edi], mm0
movntq 8[edi], mm0
movntq 16[edi], mm0
movntq 24[edi], mm0
add edi, 32
jmp loopwrite1
loopdone:
sub eax,ecx
add eax,32
jz done
mov ebx,n32
loopwrite2:
mov [edi],bl
add edi, 1
sub eax,1
jne loopwrite2
done:
emms
}
}
// Copy nbytes bytes from src to dst (the regions must not overlap).
//
//   dst    : destination buffer
//   src    : source buffer
//   nbytes : number of bytes to copy; values <= 0 copy nothing
//
// Whole 64-byte blocks are moved through the eight MMX registers with
// non-temporal stores (movntq), prefetching the following block with
// prefetchnta.  The remaining (nbytes % 64) bytes are copied with
// rep movsb, so sizes that are not a multiple of 64 (including sizes
// below 64) are handled.
// Relies on the direction flag being clear on entry, as the compiler's
// calling convention guarantees.
void memcopy(void *dst, void *src, int nbytes)
{
    if (nbytes <= 0)
        return;

    _asm {
        mov         esi, src
        mov         edi, dst
        mov         edx, nbytes         // edx keeps the full byte count for the tail
        mov         ecx, edx
        shr         ecx, 6              // ecx = number of whole 64-byte blocks
        jz          memcopy_tail        // fewer than 64 bytes: skip the block loop

    memcopy_block:
        prefetchnta [esi+64]            // prefetch the next block (prefetch never faults)
        prefetchnta [esi+96]
        movq        mm0, [esi]          // load 64 source bytes
        movq        mm1, [esi+8]
        movq        mm2, [esi+16]
        movq        mm3, [esi+24]
        movq        mm4, [esi+32]
        movq        mm5, [esi+40]
        movq        mm6, [esi+48]
        movq        mm7, [esi+56]
        movntq      [edi], mm0          // non-temporal stores
        movntq      [edi+8], mm1
        movntq      [edi+16], mm2
        movntq      [edi+24], mm3
        movntq      [edi+32], mm4
        movntq      [edi+40], mm5
        movntq      [edi+48], mm6
        movntq      [edi+56], mm7
        add         esi, 64
        add         edi, 64
        dec         ecx
        jnz         memcopy_block
        sfence                          // order the non-temporal stores before later stores

    memcopy_tail:
        mov         ecx, edx
        and         ecx, 63             // ecx = bytes left after the blocks
        rep movsb                       // no-op when ecx == 0

        emms                            // leave MMX state so x87 code works afterwards
    }
}
// Copy a tightly packed h x w byte frame into a padded destination frame,
// replicating the edge pixels into a 16-pixel border on every side.
//
//   dst : address of the top-left INTERIOR pixel of the destination.  The
//         destination pitch is w + 32 bytes, and the function writes rows
//         -16 .. h+15 and columns -16 .. w+15 relative to dst, so the
//         caller's buffer must extend that far around dst.
//   src : source frame, h rows of w bytes with no padding between rows
//   h   : number of rows; nothing is written when h <= 0
//   w   : pixels (bytes) per row; nothing is written when w <= 0
//
// Every destination row is produced as
//     16 x first pixel | the w source pixels | 16 x last pixel
// where the 16 top border rows use source row 0 and the 16 bottom border
// rows use source row h-1.  Border rows are written with non-temporal
// stores (movntq), the h body rows with ordinary stores (movq).
// Widths that are not a multiple of 8 are supported: the (w % 8) trailing
// pixels of each row are copied with rep movsb.
// Relies on the direction flag being clear on entry, as the compiler's
// calling convention guarantees.
void copyframewithextend16(void * dst,void *src,int h,int w)
{
    unsigned char *src_px;      // src as a byte pointer (start of source row 0)
    unsigned char *src_last;    // start of source row h-1
    unsigned char *dst_px;      // dst as a byte pointer (interior row 0, column 0)
    unsigned char *dst_top;     // column 0 of the first top border row
    unsigned char *dst_bot;     // column 0 of the first bottom border row
    int stride;                 // destination pitch in bytes
    int qwords;                 // whole 8-byte groups per row
    int tail_bytes;             // bytes per row left after the 8-byte groups

    if (h <= 0 || w <= 0)
        return;

    src_px     = (unsigned char *)src;
    dst_px     = (unsigned char *)dst;
    stride     = w + 32;
    qwords     = w >> 3;
    tail_bytes = w & 7;
    src_last   = src_px + (h - 1) * w;
    dst_top    = dst_px - 16 * stride;
    dst_bot    = dst_px + h * stride;

    _asm
    {
        mov         eax, stride             // eax = destination pitch for the whole block

        // ---------------- top border: 16 padded copies of source row 0 ----------------
        mov         esi, src_px
        movzx       ecx, byte ptr [esi]
        movd        mm1, ecx
        punpcklbw   mm1, mm1
        punpcklwd   mm1, mm1
        punpckldq   mm1, mm1                // mm1 = first pixel of row 0, replicated x8
        mov         edx, w
        movzx       ecx, byte ptr [esi+edx-1]
        movd        mm2, ecx
        punpcklbw   mm2, mm2
        punpcklwd   mm2, mm2
        punpckldq   mm2, mm2                // mm2 = last pixel of row 0, replicated x8

        mov         ebx, dst_top            // ebx = column 0 of the current destination row
        mov         edx, 16                 // edx = border rows left
    ext16_top_row:
        movntq      [ebx-16], mm1           // left 16 bytes (top-left corner)
        movntq      [ebx-8], mm1
        mov         esi, src_px
        mov         edi, ebx
        mov         ecx, qwords
        test        ecx, ecx
        jz          ext16_top_tail
    ext16_top_qword:
        movq        mm0, [esi]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
        dec         ecx
        jnz         ext16_top_qword
    ext16_top_tail:
        mov         ecx, tail_bytes
        rep movsb                           // no-op when w is a multiple of 8
        movntq      [edi], mm2              // edi = ebx + w: right 16 bytes (top-right corner)
        movntq      [edi+8], mm2
        add         ebx, eax
        dec         edx
        jnz         ext16_top_row

        // ---------------- body: h rows, each with left/right padding ----------------
        mov         esi, src_px             // esi walks the source; rows are contiguous
        mov         ebx, dst_px
        mov         edx, h                  // edx = body rows left
    ext16_body_row:
        movzx       ecx, byte ptr [esi]
        movd        mm1, ecx
        punpcklbw   mm1, mm1
        punpcklwd   mm1, mm1
        punpckldq   mm1, mm1                // mm1 = first pixel of this row, replicated x8
        movq        [ebx-16], mm1           // left 16 bytes
        movq        [ebx-8], mm1
        mov         edi, ebx
        mov         ecx, qwords
        test        ecx, ecx
        jz          ext16_body_tail
    ext16_body_qword:
        movq        mm0, [esi]
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
        dec         ecx
        jnz         ext16_body_qword
    ext16_body_tail:
        mov         ecx, tail_bytes
        rep movsb
        // Here esi = start of the next source row and edi = ebx + w.
        movzx       ecx, byte ptr [esi-1]
        movd        mm2, ecx
        punpcklbw   mm2, mm2
        punpcklwd   mm2, mm2
        punpckldq   mm2, mm2                // mm2 = last pixel of this row, replicated x8
        movq        [edi], mm2              // right 16 bytes
        movq        [edi+8], mm2
        add         ebx, eax
        dec         edx
        jnz         ext16_body_row

        // ---------------- bottom border: 16 padded copies of source row h-1 ----------------
        mov         esi, src_last
        movzx       ecx, byte ptr [esi]
        movd        mm1, ecx
        punpcklbw   mm1, mm1
        punpcklwd   mm1, mm1
        punpckldq   mm1, mm1                // mm1 = first pixel of the last row, replicated x8
        mov         edx, w
        movzx       ecx, byte ptr [esi+edx-1]
        movd        mm2, ecx
        punpcklbw   mm2, mm2
        punpcklwd   mm2, mm2
        punpckldq   mm2, mm2                // mm2 = last pixel of the last row, replicated x8

        mov         ebx, dst_bot
        mov         edx, 16
    ext16_bot_row:
        movntq      [ebx-16], mm1           // left 16 bytes (bottom-left corner)
        movntq      [ebx-8], mm1
        mov         esi, src_last
        mov         edi, ebx
        mov         ecx, qwords
        test        ecx, ecx
        jz          ext16_bot_tail
    ext16_bot_qword:
        movq        mm0, [esi]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
        dec         ecx
        jnz         ext16_bot_qword
    ext16_bot_tail:
        mov         ecx, tail_bytes
        rep movsb
        movntq      [edi], mm2              // right 16 bytes (bottom-right corner)
        movntq      [edi+8], mm2
        add         ebx, eax
        dec         edx
        jnz         ext16_bot_row

        sfence                              // order the non-temporal stores before later stores
        emms                                // leave MMX state so x87 code works afterwards
    }
}
/* void copyframewithextend64(void * dst,void *src,int h,int w)
{
_asm
{
mov esi, src
mov eax,w
add eax,128
pxor mm7,mm7
pxor mm6,mm6
//扩展左上角
mov edi, dst
sub edi,64
mov ebx,eax
shl ebx,6
sub edi,ebx
movd mm0,[esi]
movd2qd xmm0,mm0
PUNPCKLQDQ xmm0,xmm0
PUNPCKLQDQ xmm0,xmm0
mov ecx,16
loop_cpbytewithextend_1:
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
sub ecx,1
jne loop_cpbytewithextend_1
//扩展右上角
mov edi, dst
add edi,w
sub edi,ebx
add esi,w
sub esi,1
movd mm0,[esi]
movd2qd xmm0,mm0
PUNPCKLQDQ xmm0,xmm0
PUNPCKLQDQ xmm0,xmm0
mov ecx,16
loop_cpbytewithextend_2:
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
movdqu [edi],xmm0
movdqu [edi+16],xmm0
movdqu [edi+32],xmm0
movdqu [edi+48],xmm0
add edi,eax
sub ecx,1
jne loop_cpbytewithextend_2
//扩展左下角
mov edi, dst
mov esi, src
mov ebx,w
mov ecx,h
sub ecx,1
movd mm7,ebx
movd mm6,ecx
PMADDWD mm7,mm6
movd ebx,mm7
add esi,ebx
mov ebx,w
add ebx,32
mov ecx,h
movd mm7,ebx
movd mm6,ecx
PMADDWD mm7,mm6
movd ebx,mm7
add edi,ebx
sub edi,16
xor ecx,ecx
mov cl,[esi]
movd mm0,ecx
PUNPCKLBW mm0,mm0
PUNPCKLWD mm0,mm0
PUNPCKLDQ mm0,mm0
mov ecx,4
loop_cpbytewithextend_3:
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
sub ecx,1
jne loop_cpbytewithextend_3
//扩展右下角
mov edi, dst
mov esi, src
mov ebx,w
mov ecx,h
movd mm7,ebx
movd mm6,ecx
PMADDWD mm7,mm6
movd ebx,mm7
add esi,ebx
sub esi,1
mov ebx,w
add ebx,32
mov ecx,h
movd mm7,ebx
movd mm6,ecx
PMADDWD mm7,mm6
movd ebx,mm7
add edi,ebx
add edi,w
xor ecx,ecx
mov cl,[esi]
movd mm0,ecx
PUNPCKLBW mm0,mm0
PUNPCKLWD mm0,mm0
PUNPCKLDQ mm0,mm0
mov ecx,4
loop_cpbytewithextend_4:
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
movntq [edi],mm0
movntq [edi+8],mm0
add edi,eax
sub ecx,1
jne loop_cpbytewithextend_4
//扩展顶部
mov edi, dst
mov ebx,w
add ebx,32
shl ebx,4
sub edi,ebx
mov edx,16
loop_cpbytewithextend_6:
mov esi, src
mov ecx,w
shr ecx,3
loop_cpbytewithextend_5:
movq mm0,[esi]
add esi,8
movntq [edi],mm0
add edi,8
sub ecx,1
jne loop_cpbytewithextend_5
add edi,32
sub edx,1
jne loop_cpbytewithextend_6
//扩展底部
mov edi, dst
mov ebx,w
add ebx,32
mov ecx,h
movd mm7,ebx
movd mm6,ecx
PMADDWD mm7,mm6
movd ebx,mm7
add edi,ebx
mov esi, src
mov ebx,w
mov ecx,h
sub ecx,1
movd mm7,ebx
movd mm6,ecx
PMADDWD mm7,mm6
movd ebx,mm7
add esi,ebx
mov ebx,esi
mov edx,16
loop_cpbytewithextend_8:
mov esi,ebx
mov ecx,w
shr ecx,3
loop_cpbytewithextend_7:
movq mm0,[esi]
add esi,8
movntq [edi],mm0
add edi,8
sub ecx,1
jne loop_cpbytewithextend_7
add edi,32
sub edx,1
jne loop_cpbytewithextend_8
//扩展左边
mov edi,dst
mov esi,src
mov edx,h
mov ebx,w
mov ecx,ebx
add ecx,32
loop_cpbytewithextend_9:
xor eax,eax
mov al,[esi]
movd mm0,eax
PUNPCKLBW mm0,mm0
PUNPCKLWD mm0,mm0
PUNPCKLDQ mm0,mm0
movq mm1,mm0
pslld mm1,8
por mm0,mm1
pslld mm1,8
por mm0,mm1
pslld mm1,8
por mm0,mm1
movq [edi-8],mm0
movq [edi-16],mm0
add esi,ebx
add edi,ecx
sub edx,1
jne loop_cpbytewithextend_9
//扩展右边
mov edi,dst
mov esi,src
mov edx,h
mov ebx,w
mov ecx,ebx
add ecx,32
add edi,ebx
add esi,ebx
sub esi,1
loop_cpbytewithextend_10:
xor eax,eax
mov al,[esi]
movd mm0,eax
PUNPCKLBW mm0,mm0
PUNPCKLWD mm0,mm0
PUNPCKLDQ mm0,mm0
movq mm1,mm0
pslld mm1,8
por mm0,mm1
pslld mm1,8
por mm0,mm1
pslld mm1,8
por mm0,mm1
movq [edi],mm0
movq [edi+8],mm0
add esi,ebx
add edi,ecx
sub edx,1
jne loop_cpbytewithextend_10
//拷贝中间
mov edi,dst
mov esi,src
mov edx,h
mov eax,w
mov ebx,eax
add ebx,32
loop_cpbytewithextend_12:
mov ecx,eax
shr ecx,3
movd mm6,edi
movd mm7,esi
loop_cpbytewithextend_11:
movq mm0,[esi]
add esi,8
movq [edi],mm0
add edi,8
sub ecx,1
jne loop_cpbytewithextend_11
movd edi,mm6
movd esi,mm7
add edi,ebx
add esi,eax
sub edx,1
jne loop_cpbytewithextend_12
emms
}
}*/
//#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -