📄 memcpy.optimize.c

📁 代码优化,有效使用内存,透视优化技术,对比优化方法,如果你在追求代码效率的最大化,该资源你不能不读.
💻 C
字号:
/*
	The example demonstrating technique of memory copying optimization
	by means of prefetching
*/

// CONFIGURATION
#define BLOCK_SIZE (1*M)					// size of the copied block
											// ATTENTION: actual size of
											// allocated memory will be equal to
											// 6*BLOCK_SIZE*A_NITER
											// so be careful ;-)
											// yes, I know that this is a lame
											// algorithm, but actually
											// this was done in order to
											// guarantee that the copied memory
											// blocks are actually read from main
											// memory rather than from cache

#define _PAGE_SIZE (4*K)					// page size

#include <DoCPU.h>
#include <DoCPU.cmd.h>


/* ---------------------------------------------------------------------------

							NON-OPTIMIZED VERSION

--------------------------------------------------------------------------- */
/*
	Baseline for the benchmark: copies len bytes from src to dst with the
	library memcpy and nothing else.

	dst, src	- destination and source buffers; they must not overlap
				  (memcpy requirement)
	len			- number of bytes to copy; zero or negative copies nothing

	Returns 0. The result carries no information and exists only because the
	function has always had an (implicit) int return type.
*/
int normal_memcpy(char *dst, char *src, int len)
{
	// a negative len would be converted to a huge unsigned size by memcpy
	if (len > 0)
	{
		memcpy(dst, src, len);	// copying memory using the built-in memcpy function
	}
	return 0;
}


/* ---------------------------------------------------------------------------

						OPTIMIZED VERSION
										(using prefetching)

--------------------------------------------------------------------------- */
/*
	Copies len bytes from src to dst one page (_PAGE_SIZE bytes) at a time.
	Before a page is copied, one byte of the FOLLOWING source page is read
	and a _prefetchnta is issued for every L1_CACHE_LINE_SIZE step of the
	current page; the page itself is then copied with memcpy.

	dst, src	- destination and source buffers; they must not overlap
				  (memcpy requirement)
	len			- number of bytes to copy; need not be a multiple of
				  _PAGE_SIZE, zero or negative copies nothing

	Returns the last byte read while touching a following page, or 0 when no
	such read happened. The value has no meaning for the caller.
*/
int prefetch_memcpy(char *dst, char *src, int len)
{
	int a, b, chunk;
	int temp = 0;

	// copying memory by pages to make page crossing rare
	for (a = 0; a < len; a += chunk)
	{
		// the last page may be shorter than _PAGE_SIZE
		chunk = len - a;
		if (chunk > _PAGE_SIZE)
		{
			chunk = _PAGE_SIZE;

			// touching the next page; this is done only while that page
			// still belongs to the source block, so nothing past src + len
			// is ever read. The read is volatile so that it is performed
			// on every iteration and not only on the last one.
			// NOTE(review): presumably this warms up the address
			// translation of the next page - confirm against DoCPU docs
			temp = *(volatile char *)(src + a + _PAGE_SIZE);
		}

		// prefetching the page that is about to be copied
		for (b = 0; b < chunk; b += L1_CACHE_LINE_SIZE)
		{
			_prefetchnta(src + a + b);
		}

		// copying prefetched memory block using the built-in memcpy function
		memcpy(dst + a, src + a, chunk);
	}
	return temp;
}


	/* -----------------------------------------------------------------------

						PESSIMIZED VERSION
							(using non-cached write instruction)

	----------------------------------------------------------------------- */
/*
	Copies len bytes from src to dst one page (_PAGE_SIZE bytes) at a time.
	For every page: one byte of the FOLLOWING source page is read, a
	__prefetchnta is issued for every L1_CACHE_LINE_SIZE step of the current
	page, and the page is then copied with __stream_cpy (the non-cached write
	primitive), eight calls per 16*8-byte group at 16-byte offsets.

	dst, src	- destination and source buffers
				  NOTE(review): __stream_cpy most likely requires both
				  addresses to be 16-byte aligned - confirm in DoCPU.cmd.h
	len			- number of bytes to copy; zero or negative copies nothing.
				  Bytes that do not fill a whole 16*8-byte group at the end
				  of the block are copied with ordinary byte stores.

	Returns the last byte read while touching a following page, or 0 when no
	such read happened. The value has no meaning for the caller.
*/
int _turbo_memcpy(char *dst, char *src, int len)
{
	int a, b, chunk, bulk;
	int temp = 0;

	for (a = 0; a < len; a += chunk)
	{
		// the last page may be shorter than _PAGE_SIZE
		chunk = len - a;
		if (chunk > _PAGE_SIZE)
		{
			chunk = _PAGE_SIZE;

			// touching the next page, only while it still belongs to the
			// source block; volatile keeps the read on every iteration
			temp = *(volatile char *)(src + a + _PAGE_SIZE);
		}

		// prefetching
		for (b = 0; b < chunk; b += L1_CACHE_LINE_SIZE)
		{
			__prefetchnta(src + a + b);
		}

		// part of the page that consists of whole 16*8-byte groups
		bulk = chunk - chunk % (16*8);

		// copying memory by non-cached write instruction
		for (b = a; b < a + bulk; b += 16*8)
		{
			__stream_cpy(dst + b + 16*0, src + b + 16*0);
			__stream_cpy(dst + b + 16*1, src + b + 16*1);

			__stream_cpy(dst + b + 16*2, src + b + 16*2);
			__stream_cpy(dst + b + 16*3, src + b + 16*3);

			__stream_cpy(dst + b + 16*4, src + b + 16*4);
			__stream_cpy(dst + b + 16*5, src + b + 16*5);

			__stream_cpy(dst + b + 16*6, src + b + 16*6);
			__stream_cpy(dst + b + 16*7, src + b + 16*7);
		}

		// leftover bytes (only possible on a partial final page)
		for (; b < a + chunk; b++)
		{
			dst[b] = src[b];
		}
	}
	return temp;
}


/*
	Benchmark driver: times normal_memcpy, prefetch_memcpy and _turbo_memcpy
	on BLOCK_SIZE-byte blocks inside the AL_BEGIN/L_BEGIN measurement macros
	and prints measurements 1 and 2 against measurement 0 through Ax_OUT.

	Returns 0 on success, 1 if either buffer could not be allocated.
*/
int main(void)
{

	char *src, *dst;

	// allocating memory
	src = (char *) _malloc32(BLOCK_SIZE*3*A_NITER);
	dst = (char *) _malloc32(BLOCK_SIZE*3*A_NITER);

	// NOTE(review): assumes _malloc32 reports failure with a null pointer,
	// like malloc does - confirm in DoCPU.h
	if (!src || !dst)
	{
		PRINT("-ERR: not enough memory\n");
		return 1;
	}

	PRINT("= = = memcpy optimization using prefetch = = =\n");
	PRINT_TITLE;

	AL_BEGIN;
		L_BEGIN(0)
			normal_memcpy(dst, src, BLOCK_SIZE);
			dst += BLOCK_SIZE;
			src += BLOCK_SIZE;
		L_END(0)
	AL_END;
	// ATTENTION: the dst and src pointers are NOT reset to the start of the
	// buffers between the measurements. This is done _on purpose_: every
	// measurement keeps walking forward through memory that has not been
	// copied yet, so the following functions are forced to load data from
	// memory rather than from cache

	AL_BEGIN;
		L_BEGIN(1)
			prefetch_memcpy(dst, src, BLOCK_SIZE);
			dst += BLOCK_SIZE;
			src += BLOCK_SIZE;
		L_END(1)
	AL_END;

	AL_BEGIN;
		L_BEGIN(2)
			_turbo_memcpy(dst, src, BLOCK_SIZE);
			dst += BLOCK_SIZE;
			src += BLOCK_SIZE;
		L_END(2)
	AL_END;


	// reporting measurements 1 and 2 relative to measurement 0
	Ax_OUT("prefetch ", 0, 1);
	Ax_OUT("strem cpy", 0, 2);

	// the buffers are not released: src and dst no longer point at the
	// addresses returned by _malloc32, and the process exits right here
	return 0;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -