📄 l1.overassoc.c

📁 代码优化,有效使用内存,透视优化技术,对比优化方法,如果你在追求代码效率的最大化,该资源你不能不读.

💻 C

字号:

/*----------------------------------------------------------------------------
 *
 *				THE EXAMPLE DEMONSTRATING PERFORMANCE DROP
 *			that occurs when processed data hit the same cache line
 *			==============================================================
 *
 * Build 0x001 06.08.2002
----------------------------------------------------------------------------*/
// CONFIGURATION
#define N_ITER			466			// number of iterations
									// theoretically, LINE_SIZE*N_ITER bytes of cache memory 
									// will be required, since in this case
									// 466*64 = ~30 KB

#define CACHE_BANK_SIZE	(4*K)		// cache bank size

#define LINE_SIZE		64			// maximum possible size of cache lines

#define BLOCK_SIZE		((CACHE_BANK_SIZE+LINE_SIZE)*N_ITER) // block size

#include <DoCPU.h>					// include the profiler


//----------------------------------------------------------------------------
//
//					VERSION DEMONSTRATING CACHE LINE CONFLICT
//
//----------------------------------------------------------------------------
int over_assoc(int *p)
{
	int a;
	volatile int x=0;

	// attention: top-level loop is skipped since the 
	// profiler will anyway execute it 10 times
	for(a=0; a < N_ITER; a++)
		// read memory with 4 KB step, as a result,
		// cache saturation is quickly reached both on
		// P-II/P-III/P-4  and on AMD Athlon.
		// Cache is overloaded, since more that
		// 12 cells are processed, and read buffering
		// implemented on Athlon doesn't save
		// the situation
		//x+=*(int *)((int)p + a*CACHE_BANK_SIZE);
		*(int *)((char *)p + a*CACHE_BANK_SIZE)=x;

	return x;
}

//----------------------------------------------------------------------------
//
//					VERSION THAT EXECUTES WITHOUT CONFLICTS
//
//----------------------------------------------------------------------------
int optimize(int  *p)
{
	int a=0;
	volatile int x=0;

	// attention: top-level loop is skipped since the 
	// profiler will anyway execute it 10 times
	for(a=0; a < N_ITER; a++)
	{
		// Read memory with the step of  CACHE_BANK_SIZE+LINE_SIZE
		// since in this case 4096+64=4160 bytes;
		// since base addrsses of all cells are different
		// we have no conflicts and cache memory is used
		// with 100% efficiency
		//x+=*(int *)((int)p + a*(CACHE_BANK_SIZE+LINE_SIZE));
		*(int *)((char *)p + a*(CACHE_BANK_SIZE+LINE_SIZE))=x;
	}
	return x;
}


main()
{
	int *p;
	int a = 0;

	PRINT("= = = Demonstration of the cache line conflict = = =\n");
	PRINT_TITLE;

	// allocating memory
	p = (int *)_malloc32(BLOCK_SIZE);

	// pessimized version
	A_BEGIN(0)
		a+=over_assoc(p);
	A_END(0)

	// optimized version
	A_BEGIN(1)
		a+=optimize(p);
	A_END(1)

	// console output of the results
	Lx_OUT("",Ax_GET(1),Ax_GET(0));

}

⌨️ 快捷键说明

复制代码 Ctrl + C

搜索代码 Ctrl + F

全屏模式 F11

切换主题 Ctrl + Shift + D

显示快捷键 ?

增大字号 Ctrl + =

减小字号 Ctrl + -