📄 winmain.c
字号:
/**********************************************************************/
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
// Size in bytes of each test buffer (4 MB). The expansion is parenthesized so
// the macro behaves as a single value inside larger expressions
// (e.g. x / BUF_SIZE or x % BUF_SIZE).
#define BUF_SIZE (1024*1024*4)
// Report text accumulated by WinMain and run_tests; shown in a message box
// at the end of WinMain.
char TxtBuf[5000];
// Number of characters written to TxtBuf so far (the next write offset).
DWORD TxtBufLen = 0;
// Returns TRUE when the MMX instructions can be executed on this processor.
BOOL CheckMMXTechnology(void);
// Copy routines under test. Each takes a destination, a source and a byte
// count; the warmed_* variants read the source into the cache one 4 KB block
// at a time before copying that block.
void byte_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void warmed_byte_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void word_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void warmed_word_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void dword_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void warmed_dword_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void mmx_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void warmed_mmx_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void fpu_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
void warmed_fpu_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes);
// Times every copy and fill routine on the given buffers and appends the
// results to TxtBuf.
void run_tests(BYTE *pDest, BYTE *pSrc, DWORD bytes);
// Zero-fill routines under test.
void fill_dword(BYTE *pDest, DWORD bytes);
void fill_mmx(BYTE *pDest, DWORD bytes);
// Result of CheckMMXTechnology(); set once in WinMain, read by run_tests.
BOOL MMXTechnology;
// -----------------------------------------------------------------------
// Program entry point.
//
// Allocates a source and a destination buffer, runs the whole test suite
// three times (both buffers 32-byte aligned, source misaligned by one byte,
// destination misaligned by one byte) and shows the accumulated report in a
// message box. Always returns 0.
//
// The parameters are the standard WinMain arguments; none of them is used.
// NOTE: pointers are converted through DWORD, which matches the 32-bit x86
// inline assembly used by the copy routines in this file.
int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
{
    BYTE *pSrc, *pDest;                 // raw allocations, kept for free()
    BYTE *pSrcAligned, *pDestAligned;   // first 32-byte boundary inside each allocation
    DWORD BufLen = BUF_SIZE;

    (void)hInstance;
    (void)hPrevInstance;
    (void)lpCmdLine;
    (void)nCmdShow;

    MMXTechnology = CheckMMXTechnology();

    // The extra 4096 bytes leave room for rounding the start address up to a
    // 32-byte boundary and for the one-byte misalignment offsets used below.
    pSrc = malloc(BufLen + 4096);
    pDest = malloc(BufLen + 4096);
    if (pSrc == NULL || pDest == NULL)
    {
        free(pDest);
        free(pSrc);
        MessageBox(NULL, "Unable to allocate the test buffers", "Memory Transfer Timing Results", MB_APPLMODAL | MB_OK);
        return FALSE;
    }

    pSrcAligned = (BYTE *)(((DWORD)pSrc + 31) & ~(DWORD)31);
    pDestAligned = (BYTE *)(((DWORD)pDest + 31) & ~(DWORD)31);

    // - source & destination aligned -
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "Copy & Fill %lu bytes\n", BufLen);
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "MB/s : cold cache / warm cache\n");
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "--------------------------------\n");
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "Source & Destination Aligned\n");
    run_tests(pDestAligned, pSrcAligned, BufLen);

    // - source unaligned, destination aligned -
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "--------------------------------\n");
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "Source Unaligned, Destination Aligned\n");
    run_tests(pDestAligned, pSrcAligned + 1, BufLen);

    // - source aligned, destination unaligned -
    // The offset is added after the alignment mask has been applied, so the
    // destination is exactly one byte past a 32-byte boundary.
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "--------------------------------\n");
    TxtBufLen += wsprintf(TxtBuf + TxtBufLen, "Source Aligned, Destination Unaligned\n");
    run_tests(pDestAligned + 1, pSrcAligned, BufLen);

    free(pDest);
    free(pSrc);
    MessageBox(NULL, TxtBuf, "Memory Transfer Timing Results", MB_APPLMODAL | MB_OK);
    return FALSE;
}
// ----------------------------------------------------------
// Copy `bytes` bytes from pSrc to pDest one byte at a time with a single
// "rep movsb" string instruction.
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - number of bytes to copy
void byte_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
_asm
{
mov ecx, bytes // repeat count = number of bytes
mov esi, pSrc
mov edi, pDest
rep movsb // copy ECX bytes from [ESI] to [EDI]
}
}
// ----------------------------------------------------------
// Copy from pSrc to pDest in 4 KB blocks, byte by byte, after first touching
// one byte in every 32-byte line of the source block so that the block is
// read into the cache before it is copied.
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer; only whole 4 KB blocks are copied, any
//           remainder (bytes % 4096) is ignored
void warmed_byte_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The block loop below counts down with dec/jnz, so a block count of zero
    // would wrap around and run far past the end of both buffers.
    if ((bytes >> 12) == 0)
        return;

    _asm
    {
        mov edx, bytes
        shr edx, 12             // # of 4k blocks
        mov esi, pSrc
        mov edi, pDest
    next4Kb:
        // warm the cache
        mov ebx, esi            // remember the start of this source block
        mov ecx, 4096/32        // # of cache lines per 4k block
    warmB:
        mov al, [esi]           // read first byte from each cache line to cause a fill
        add esi, 32
        dec ecx
        jnz warmB
        mov ecx, 4096           // copy 4k block, byte by byte
        mov esi, ebx            // rewind to the start of the block
        rep movsb
        dec edx
        jnz next4Kb
    }
}
// ----------------------------------------------------------
// Copy from pSrc to pDest two bytes at a time with a single "rep movsw".
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer in bytes; bytes/2 words are copied, so an
//           odd trailing byte is not copied
void word_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
_asm
{
mov ecx, bytes
shr ecx, 1 // doing 2 bytes at a time
mov esi, pSrc
mov edi, pDest
rep movsw // copy ECX words from [ESI] to [EDI]
}
}
// ----------------------------------------------------------
// Copy from pSrc to pDest in 4 KB blocks, word by word, after first touching
// one byte in every 32-byte line of the source block so that the block is
// read into the cache before it is copied.
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer; only whole 4 KB blocks are copied, any
//           remainder (bytes % 4096) is ignored
void warmed_word_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The block loop below counts down with dec/jnz, so a block count of zero
    // would wrap around and run far past the end of both buffers.
    if ((bytes >> 12) == 0)
        return;

    _asm
    {
        mov edx, bytes
        shr edx, 12             // # of 4k blocks
        mov esi, pSrc
        mov edi, pDest
    next4Kw:
        // warm the cache
        mov ebx, esi            // remember the start of this source block
        mov ecx, 4096/32        // # of cache lines per 4k block
    warmW:
        mov al, [esi]           // read first byte from each cache line to cause a fill
        add esi, 32
        dec ecx
        jnz warmW
        mov ecx, 4096/2         // copy 4k block, word by word
        mov esi, ebx            // rewind to the start of the block
        rep movsw
        dec edx
        jnz next4Kw
    }
}
// ----------------------------------------------------------
// Copy from pSrc to pDest four bytes at a time with a single "rep movsd".
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer in bytes; bytes/4 dwords are copied, so up
//           to three trailing bytes are not copied
void dword_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
_asm
{
mov ecx, bytes
shr ecx, 2 // doing 4 bytes at a time
mov esi, pSrc
mov edi, pDest
rep movsd // copy ECX dwords from [ESI] to [EDI]
}
}
// ----------------------------------------------------------
// Copy from pSrc to pDest in 4 KB blocks, dword by dword, after first
// touching one byte in every 32-byte line of the source block so that the
// block is read into the cache before it is copied.
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer; only whole 4 KB blocks are copied, any
//           remainder (bytes % 4096) is ignored
void warmed_dword_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The block loop below counts down with dec/jnz, so a block count of zero
    // would wrap around and run far past the end of both buffers.
    if ((bytes >> 12) == 0)
        return;

    _asm
    {
        mov edx, bytes
        shr edx, 12             // # of 4k blocks
        mov esi, pSrc
        mov edi, pDest
    next4Kd:
        // warm the cache
        mov ebx, esi            // remember the start of this source block
        mov ecx, 4096/32        // # of cache lines per 4k block
    warmD:
        mov al, [esi]           // read first byte from each cache line to cause a fill
        add esi, 32
        dec ecx
        jnz warmD
        mov ecx, 4096/4         // copy 4k block, dword by dword
        mov esi, ebx            // rewind to the start of the block
        rep movsd
        dec edx
        jnz next4Kd
    }
}
// ----------------------------------------------------------
// Copy from pSrc to pDest eight bytes at a time through the x87 register
// stack: each qword is loaded as a 64-bit integer (fild) and stored straight
// back out (fistp).
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer in bytes; bytes/8 qwords are copied, so up
//           to seven trailing bytes are not copied
void fpu_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The loop below counts down with dec/jnz, so a qword count of zero would
    // wrap around and run far past the end of both buffers.
    if ((bytes >> 3) == 0)
        return;

    _asm
    {
        mov ecx, bytes
        shr ecx, 3              // doing 8 bytes at a time
        mov esi, pSrc
        mov edi, pDest
    CopyFPU:
        fild qword ptr [esi]    // push the source qword onto the FP stack
        fistp qword ptr [edi]   // store it to the destination and pop
        add esi, 8
        add edi, 8
        dec ecx
        jnz CopyFPU
    }
}
// ----------------------------------------------------------
// Copy from pSrc to pDest in 4 KB blocks, qword by qword through the x87
// register stack (fild/fistp), after first touching one byte in every
// 32-byte line of the source block so that the block is read into the cache
// before it is copied.
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer; only whole 4 KB blocks are copied, any
//           remainder (bytes % 4096) is ignored
void warmed_fpu_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The block loop below counts down with dec/jnz, so a block count of zero
    // would wrap around and run far past the end of both buffers.
    if ((bytes >> 12) == 0)
        return;

    _asm
    {
        mov edx, bytes
        shr edx, 12             // # of 4k blocks
        mov esi, pSrc
        mov edi, pDest
    next4KFPU:
        // warm the cache
        mov ebx, esi            // remember the start of this source block
        mov ecx, 4096/32        // # of cache lines per 4k block
    warmFPU:
        mov al, [esi]           // read first byte from each cache line to cause a fill
        add esi, 32
        dec ecx
        jnz warmFPU
        mov ecx, 4096/8         // copy 4k block, qword by qword
        mov esi, ebx            // rewind to the start of the block
    copy4KloopFPU:
        fild qword ptr [esi]    // push the source qword onto the FP stack
        fistp qword ptr [edi]   // store it to the destination and pop
        add esi, 8
        add edi, 8
        dec ecx
        jnz copy4KloopFPU
        dec edx
        jnz next4KFPU
    }
}
// ----------------------------------------------------------
// Copy from pSrc to pDest eight bytes at a time through MMX register mm0.
// The caller must have verified that MMX instructions are available
// (run_tests checks the MMXTechnology flag before calling this).
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer in bytes; bytes/8 qwords are copied, so up
//           to seven trailing bytes are not copied
void mmx_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The loop below counts down with dec/jnz, so a qword count of zero would
    // wrap around and run far past the end of both buffers.
    if ((bytes >> 3) == 0)
        return;

    _asm
    {
        mov ecx, bytes
        shr ecx, 3              // doing 8 bytes at a time
        mov esi, pSrc
        mov edi, pDest
    CopyM:
        movq mm0, [esi]
        movq [edi], mm0
        add esi, 8
        add edi, 8
        dec ecx
        jnz CopyM
        emms                    // leave MMX state so the x87 FPU is usable again
    }
}
// ----------------------------------------------------------
// Copy from pSrc to pDest in 4 KB blocks, qword by qword through MMX
// register mm0, after first touching one byte in every 32-byte line of the
// source block so that the block is read into the cache before it is copied.
// The caller must have verified that MMX instructions are available
// (run_tests checks the MMXTechnology flag before calling this).
//   pDest - destination buffer
//   pSrc  - source buffer
//   bytes - size of the transfer; only whole 4 KB blocks are copied, any
//           remainder (bytes % 4096) is ignored
void warmed_mmx_copy(BYTE *pDest, BYTE *pSrc, DWORD bytes)
{
    // The block loop below counts down with dec/jnz, so a block count of zero
    // would wrap around and run far past the end of both buffers.
    if ((bytes >> 12) == 0)
        return;

    _asm
    {
        mov edx, bytes
        shr edx, 12             // # of 4k blocks
        mov esi, pSrc
        mov edi, pDest
    next4Km:
        // warm the cache
        mov ebx, esi            // remember the start of this source block
        mov ecx, 4096/32        // # of cache lines per 4k block
    warmm:
        mov al, [esi]           // read first byte from each cache line to cause a fill
        add esi, 32
        dec ecx
        jnz warmm
        mov ecx, 4096/8         // copy 4k block, qword by qword
        mov esi, ebx            // rewind to the start of the block
    copy4Kloopm:
        movq mm0, [esi]
        movq [edi], mm0
        add esi, 8
        add edi, 8
        dec ecx
        jnz copy4Kloopm
        dec edx
        jnz next4Km
        emms                    // leave MMX state so the x87 FPU is usable again
    }
}
// -----------------------------------------------------------------------
// Report whether MMX instructions can be executed on this processor.
//
// Returns TRUE only when all of the following hold:
//   - the CPUID instruction executes without raising an exception,
//   - CPUID function 1 reports feature bit 23 (MMX technology) in EDX,
//   - executing the MMX instruction "emms" does not raise an exception.
// Returns FALSE otherwise.
BOOL CheckMMXTechnology(void)
{
    BOOL retval = TRUE;
    DWORD RegEDX = 0;

    __try {
        _asm {
            // CPUID is emitted as raw bytes, so the compiler cannot see that
            // it overwrites EBX and ECX; preserve them by hand.
            push ebx
            push ecx
            mov eax, 1          // set up CPUID to return processor version and features
                                // 0 = vendor string, 1 = version info, 2 = cache info
            _emit 0fh           // CPUID
            _emit 0a2h
            pop ecx
            pop ebx
            mov RegEDX, edx     // features returned in edx
        }
    } __except(EXCEPTION_EXECUTE_HANDLER) { retval = FALSE; }

    if (retval == FALSE)
        return FALSE;           // processor does not support CPUID

    if ((RegEDX & 0x800000) == 0)   // bit 23 is set for MMX technology
        return FALSE;           // processor supports CPUID but does not support MMX technology

    __try { _asm emms }         // try executing the MMX instruction "emms"
    __except(EXCEPTION_EXECUTE_HANDLER) { retval = FALSE; }

    // if retval == 0 here, it means the processor has MMX technology but
    // floating-point emulation is on; so MMX technology is unavailable
    return retval;
}
// -----------------------------------------------------------------------
void run_tests(BYTE *pD, BYTE *pS, DWORD BufLen)
{
DWORD start, len;
start = timeGetTime();
byte_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "bytes (rep movsb)\t\t %4.1f / ",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
warmed_byte_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "%4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
word_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "words (rep movsw)\t\t %4.1f / ",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
warmed_word_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "%4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
dword_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "dwords (rep movsd)\t\t %4.1f / ",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
warmed_dword_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "%4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
_asm fninit;
start = timeGetTime();
fpu_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "qwords (FP registers)\t\t %4.1f / ",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
warmed_fpu_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "%4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
if (MMXTechnology)
{
start = timeGetTime();
mmx_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "qwords (MMX(tm) Technology)\t %4.1f / ",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
warmed_mmx_copy(pD, pS, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "%4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
}
start = timeGetTime();
fill_dword(pD, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "Fill dwords (rep stosd)\t %4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
start = timeGetTime();
fill_mmx(pD, BufLen);
len = timeGetTime() - start;
TxtBufLen += sprintf (TxtBuf+TxtBufLen, "Fill qwords (movq)\t\t %4.1f\n",(float)BUF_SIZE/(float)len/1000.0);
}
// Fill pDest with zeros four bytes at a time with a single "rep stosd".
//   pDest - buffer to fill
//   bytes - size of the fill in bytes; bytes/4 dwords are written, so up to
//           three trailing bytes are left untouched
void fill_dword(BYTE *pDest, DWORD bytes)
{
_asm
{
mov ecx, bytes
shr ecx, 2 // filling dwords
mov edi, pDest
xor eax, eax // fill value is zero
rep stosd // store EAX to [EDI], ECX times
}
}
// Fill pDest with zeros eight bytes at a time through MMX register mm0.
// The caller must have verified that MMX instructions are available.
//   pDest - buffer to fill
//   bytes - size of the fill in bytes; bytes/8 qwords are written, so up to
//           seven trailing bytes are left untouched
void fill_mmx(BYTE *pDest, DWORD bytes)
{
    // The loop below counts down with dec/jnz, so a qword count of zero would
    // wrap around and write far past the end of the buffer.
    if ((bytes >> 3) == 0)
        return;

    _asm
    {
        mov ecx, bytes
        shr ecx, 3              // filling qwords
        mov edi, pDest
        pxor mm0, mm0           // fill value is zero
    fill_mmx_loop:
        movq [edi], mm0
        add edi, 8
        dec ecx
        jnz fill_mmx_loop
        emms                    // leave MMX state so the x87 FPU is usable again
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -