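/*
 * File: prefetch.hierarchy.l2.c
 *
 * Code optimization, effective memory usage, an inside look at optimization
 * techniques and a comparison of optimization methods. If you are after the
 * maximum efficiency of your code, this resource is a must-read.
 *
 * Language: C
 */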
/*----------------------------------------------------------------------------
 *
 *						THE EXAMPLE DEMONSTRATING THE EFFICIENCY
 *			OF LOADING DATA INTO L1 CACHE BYPASSING L2
 *			==========================================================
 *
 * Build 0x002 10.08.2002
----------------------------------------------------------------------------*/
// Sizes are in bytes. K and M are not defined in this file; presumably
// DoCPU.h (included below) provides them - TODO confirm.
#define BLOCK1_SIZE	(128*K)				// block that fits entirely into the L2 cache
										// but exceeds the L1 cache limits

#define BLOCK2_SIZE	(4*M)				// block that significantly exceeds
										// the L2 cache size
// Byte stride of the BLOCK2 walk: every loop iteration of the test
// functions advances the BLOCK2 offset by this amount.
#define STEP_SIZE	32
#include <DoCPU.h>
// Shorthand used by the test functions for their local BLOCK2 offset.
// Caution: from here on every identifier spelled `a` expands to `_p2`.
#define a _p2

/*----------------------------------------------------------------------------
 *
 *							TEST WITHOUT PREFETCHING
 *												(non-optimized version)
 *
----------------------------------------------------------------------------*/
/*
 * Baseline test: walks BLOCK2 in STEP_SIZE-byte steps without any prefetch
 * hint, reading one int of BLOCK1 per step.
 *
 * p1 - start of BLOCK1; BLOCK1_SIZE bytes are read cyclically
 * p2 - start of BLOCK2; BLOCK2_SIZE bytes are read sequentially
 *
 * Returns the sum of all values read, converted to float. The sibling tests
 * return int; the float return type is kept here so that existing callers
 * and prototypes are unaffected.
 */
float no_prefetch(int *p1, int *p2)
{
	// Accumulate in unsigned arithmetic: the buffers may hold arbitrary
	// values, and overflowing a signed int would be undefined behavior.
	unsigned int sum = 0;
	int off1 = 0;	// byte offset of the next BLOCK1 element
	int off2;		// byte offset of the current BLOCK2 chunk

	for (off2 = 0; off2 < BLOCK2_SIZE; off2 += STEP_SIZE)
	{
		//				* * * processing BLOCK1 * * *
		// since it is processed in the loop (the A_BEGIN loop),
		// it is desirable that nothing would force it out of the L2 cache
		sum += (unsigned int)*(int *)((char *)p1 + off1);

		// Wrap as soon as the offset reaches BLOCK1_SIZE. A `>` comparison
		// here would allow one read at p1 + BLOCK1_SIZE, i.e. past the end
		// of the block.
		if ((off1 += 4) >= BLOCK1_SIZE) off1 = 0;

		//				* * * processing BLOCK2 * * *
		// this block significantly exceeds the L2 cache size, therefore,
		// it is not necessary to cache it - at the next loop iterations
		// the required data won't be present in the cache anyway...
		// Furthermore, L2 already contains BLOCK1, and discarding it is also
		// undesirable. However, the processor automatically places data
		// from BLOCK2 into the L2 cache, thus choosing the least optimal
		// caching strategy...
		sum += (unsigned int)*(int *)((char *)p2 + off2);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 4);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 8);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 12);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 16);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 20);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 24);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 28);
	}
	return (float)(int)sum;
}

/*----------------------------------------------------------------------------
 *
 *				TEST WITH prefetchnta (mem -> L1)
 *												(optimized version)
 *
----------------------------------------------------------------------------*/
/*
 * Test with _prefetchnta: same walk as the non-optimized version, but every
 * iteration issues a _prefetchnta hint for the BLOCK2 data that will be read
 * six iterations later.
 *
 * p1 - start of BLOCK1; BLOCK1_SIZE bytes are read cyclically
 * p2 - start of BLOCK2; BLOCK2_SIZE bytes are read sequentially
 *
 * Returns the sum of all values read.
 */
int have_prefetchnta(int *p1, int *p2)
{
	// Accumulate in unsigned arithmetic: the buffers may hold arbitrary
	// values, and overflowing a signed int would be undefined behavior.
	unsigned int sum = 0;
	int off1 = 0;	// byte offset of the next BLOCK1 element
	int off2;		// byte offset of the current BLOCK2 chunk

	for (off2 = 0; off2 < BLOCK2_SIZE; off2 += STEP_SIZE)
	{
		//				* * * processing BLOCK1 * * *
		// prefetching into L1 cache is not needed, because the performance
		// of the L2 cache (where this block resides) is sufficient and the
		// overhead for loading this data from L2 cache is negligible
		sum += (unsigned int)*(int *)((char *)p1 + off1);

		// Wrap as soon as the offset reaches BLOCK1_SIZE. A `>` comparison
		// here would allow one read at p1 + BLOCK1_SIZE, i.e. past the end
		// of the block.
		if ((off1 += 4) >= BLOCK1_SIZE) off1 = 0;

		// instruct the processor to load the data from BLOCK2 into L1 cache,
		// bypassing L2 cache. First, this eliminates the delay for loading
		// the data from the slow RAM, and, second, it prevents forcing
		// BLOCK1 out of L2 cache.
		//
		// Note: the hinted data will be accessed six iterations later
		// (STEP_SIZE*6 bytes ahead). The memory subsystem latency exceeds
		// the time of one loop iteration; by requesting the data needed by
		// later iterations, only the first six iterations pay the full
		// latency rather than every iteration
		// (for more details see "Planning prefetching distance").
		//
		// NOTE(review): during the last six iterations the hinted address
		// lies beyond BLOCK2. Presumably harmless because the prefetch is
		// only a hint - confirm against the _prefetchnta implementation.
		_prefetchnta((char *)((char *)p2 + off2 + STEP_SIZE*6));

		//				* * * processing BLOCK2 * * *
		// The data now is being loaded from L1 cache!
		sum += (unsigned int)*(int *)((char *)p2 + off2);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 4);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 8);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 12);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 16);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 20);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 24);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 28);
	}
	return (int)sum;
}

/*----------------------------------------------------------------------------
 *
 *				TEST WITH prefetcht0 (Mem -> L2 -> L1)
 *													(optimized version)
 *
----------------------------------------------------------------------------*/
/*
 * Test with _prefetcht0: same walk as the non-optimized version, but every
 * iteration issues a _prefetcht0 hint for the BLOCK2 data that will be read
 * six iterations later.
 *
 * p1 - start of BLOCK1; BLOCK1_SIZE bytes are read cyclically
 * p2 - start of BLOCK2; BLOCK2_SIZE bytes are read sequentially
 *
 * Returns the sum of all values read.
 */
int have_prefetcht0(int *p1, int *p2)
{
	// Accumulate in unsigned arithmetic: the buffers may hold arbitrary
	// values, and overflowing a signed int would be undefined behavior.
	unsigned int sum = 0;
	int off1 = 0;	// byte offset of the next BLOCK1 element
	int off2;		// byte offset of the current BLOCK2 chunk

	for (off2 = 0; off2 < BLOCK2_SIZE; off2 += STEP_SIZE)
	{
		//				* * * processing BLOCK1 * * *
		// prefetching into L1 cache is not needed, because the performance
		// of the L2 cache (where this block resides) is sufficient and the
		// overhead for loading this data from L2 cache is negligible
		sum += (unsigned int)*(int *)((char *)p1 + off1);

		// Wrap as soon as the offset reaches BLOCK1_SIZE. A `>` comparison
		// here would allow one read at p1 + BLOCK1_SIZE, i.e. past the end
		// of the block.
		if ((off1 += 4) >= BLOCK1_SIZE) off1 = 0;

		// as an experiment, prefetch the data into both L1 and L2 cache.
		// This also eliminates the delays for loading the data from slow
		// RAM, but it litters the L2 cache, which is filled with the
		// intensely used BLOCK1; consequently, prefetching efficiency
		// slightly decreases. This decrease, however, is not severe,
		// because only one cache line per iteration is discarded.
		//
		// NOTE(review): during the last six iterations the hinted address
		// lies beyond BLOCK2. Presumably harmless because the prefetch is
		// only a hint - confirm against the _prefetcht0 implementation.
		_prefetcht0((char *)((char *)p2 + off2 + STEP_SIZE*6));

		//				* * * processing BLOCK2 * * *
		// The data now is being loaded from L1 cache!
		sum += (unsigned int)*(int *)((char *)p2 + off2);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 4);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 8);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 12);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 16);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 20);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 24);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 28);
	}
	return (int)sum;
}


/*----------------------------------------------------------------------------
 *
 *				TEST WITH prefetcht1 (Mem -> L2)
 *													(optimized version)
 *
----------------------------------------------------------------------------*/
/*
 * Test with _prefetcht1: same walk as the non-optimized version, but every
 * iteration issues a _prefetcht1 hint for the BLOCK2 data that will be read
 * six iterations later.
 *
 * p1 - start of BLOCK1; BLOCK1_SIZE bytes are read cyclically
 * p2 - start of BLOCK2; BLOCK2_SIZE bytes are read sequentially
 *
 * Returns the sum of all values read.
 */
int have_prefetcht1(int *p1, int *p2)
{
	// Accumulate in unsigned arithmetic: the buffers may hold arbitrary
	// values, and overflowing a signed int would be undefined behavior.
	unsigned int sum = 0;
	int off1 = 0;	// byte offset of the next BLOCK1 element
	int off2;		// byte offset of the current BLOCK2 chunk

	for (off2 = 0; off2 < BLOCK2_SIZE; off2 += STEP_SIZE)
	{
		//				* * * processing BLOCK1 * * *
		// prefetching into L1 cache is not needed, because the performance
		// of the L2 cache (where this block resides) is sufficient and the
		// overhead for loading this data from L2 cache is negligible
		sum += (unsigned int)*(int *)((char *)p1 + off1);

		// Wrap as soon as the offset reaches BLOCK1_SIZE. A `>` comparison
		// here would allow one read at p1 + BLOCK1_SIZE, i.e. past the end
		// of the block.
		if ((off1 += 4) >= BLOCK1_SIZE) off1 = 0;

		// as an experiment, prefetch only into the L2 cache; this is the
		// most inefficient prefetching strategy here! The processor has to
		// wait until the data is loaded from L2 cache into L1 cache, and
		// useful data are forced out of the L2 cache.
		// Naturally, this does not mean that prefetcht1 is an "inefficient"
		// instruction; in this particular case this strategy simply proved
		// to be the least efficient one.
		//
		// NOTE(review): during the last six iterations the hinted address
		// lies beyond BLOCK2. Presumably harmless because the prefetch is
		// only a hint - confirm against the _prefetcht1 implementation.
		_prefetcht1((char *)((char *)p2 + off2 + STEP_SIZE*6));

		//				* * * processing BLOCK2 * * *
		// the data are in L2 cache and the CPU has to wait several clocks
		// for the data to be loaded from there
		sum += (unsigned int)*(int *)((char *)p2 + off2);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 4);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 8);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 12);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 16);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 20);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 24);
		sum += (unsigned int)*(int *)((char *)p2 + off2 + 28);
	}
	return (int)sum;
}

/*
 * Benchmark driver: allocates BLOCK1 and BLOCK2, runs the four test
 * functions inside the A_BEGIN/A_END measurement brackets (slots 1..4) and
 * reports each prefetch variant against the non-optimized run (slot 1).
 *
 * Returns 0 on success, 1 if either buffer could not be allocated.
 *
 * NOTE(review): the buffers are read by the tests without being written
 * first, and they are not released before returning.
 */
int main(void)
{
	int *p1, *p2;

	// TITLE
	PRINT("= = = memory optimization using prefetch (prefetch type) = = =\n");
	PRINT_TITLE;

	// allocating memory
	p1 = (int *) _malloc32(BLOCK1_SIZE);
	p2 = malloc(BLOCK2_SIZE);

	// the tests dereference both buffers unconditionally, so stop here
	// if either allocation failed
	if (!p1 || !p2)
	{
		PRINT("-ERR: not enough memory\n");
		return 1;
	}

	// slot 1: non-optimized reference run
	A_BEGIN(1);
		no_prefetch(p1,p2);
	A_END(1);

	// slot 2: prefetchnta
	A_BEGIN(2)
		have_prefetchnta(p1, p2);
	A_END(2);
	
	// slot 3: prefetcht0
	A_BEGIN(3)
		have_prefetcht0(p1, p2);
	A_END(3);

	// slot 4: prefetcht1
	A_BEGIN(4)
			have_prefetcht1(p1, p2);
	A_END(4);

	// each line reports one variant together with the reference slot
	Lx_OUT("PreFetchNTA",Ax_GET(1),Ax_GET(2));
	Lx_OUT("PreFetchT0 ",Ax_GET(1),Ax_GET(3));
	Lx_OUT("PreFetchT1 ",Ax_GET(1),Ax_GET(4));

	return 0;
}
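/*
 * Keyboard shortcuts
 *
 * Copy code            Ctrl + C
 * Search code          Ctrl + F
 * Full-screen mode     F11
 * Switch theme         Ctrl + Shift + D
 * Show shortcuts       ?
 * Increase font size   Ctrl + =
 * Decrease font size   Ctrl + -
 */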