📄 parallel.test.c

📁 代码优化,有效使用内存,透视优化技术,对比优化方法,如果你在追求代码效率的最大化,该资源你不能不读.
💻 C
字号:
/* ---------------------------------------------------------------------------
 * @
 *
 *		UTILITY FOR DEMONSTRATING THE EFFICIENCY OF PARALLEL MEMORY READ
 *		=================================================================
 *
 * Build 0x003	12.06.2002
 * Build 0x002	20.05.2002
--------------------------------------------------------------------------- */

// CONFIGURATION
// Both values are byte counts consumed by main().
// NOTE(review): M and L1_CACHE_SIZE are not defined in this file; presumably
// they come from DoCPU.h (included below) -- confirm.
#define _BLOCK_SIZE			(16*M)					// total size of the buffer that is read
#define _STEP_SIZE			(L1_CACHE_SIZE)			// size of each sub-block handled by the two-pass read


#include <stdio.h>
#include <stdlib.h>

#include <DoCPU.h>
/*
 * Benchmark entry point.
 *
 * Allocates a BLOCK_SIZE-byte buffer and reads it twice:
 *   1. sequentially, eight ints per loop iteration;
 *   2. sub-block by sub-block (STEP_SIZE bytes each), where a first pass
 *      touches one int every 32 bytes and a second pass reads the
 *      remaining ints of every 32-byte group.
 * After each measurement a throughput line is printed.
 *
 * NOTE(review): A_BEGIN/A_END/Ax_GET/cpu2time, CLEAR_L2_CACHE, VVV, PRINT,
 * PRINT_TITLE and _TEXT are not defined in this file; presumably they are
 * the timing/output helpers of DoCPU.h -- confirm their exact semantics.
 *
 * NOTE(review): the buffer is read without ever being written; the values
 * summed into x are therefore indeterminate and only serve to keep the
 * loads alive.
 *
 * Returns the accumulated sum x (so the reads have an observable result),
 * or EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
	int a, b;
	int x = 0;
	int *p;
	int BLOCK_SIZE = _BLOCK_SIZE;
	int STEP_SIZE  = _STEP_SIZE;

	// Allocating the memory; bail out instead of dereferencing NULL below
	p = malloc(BLOCK_SIZE);
	if (p == NULL)
	{
		fprintf(stderr, "-ERR: failed to allocate %d bytes\n", BLOCK_SIZE);
		return EXIT_FAILURE;
	}

	// TITLE
	PRINT("=== demonstrating the efficiency of parallel read ===\n");
	PRINT_TITLE;
	PRINT("------------------ Testing ------------------\n");

	/* -----------------------------------------------------------------------
	 *
	 *	measuring the throughput when sequentially reading the data
	 *
	----------------------------------------------------------------------- */
	CLEAR_L2_CACHE();VVV;
	A_BEGIN(1)
		// a is a byte offset into the buffer; each iteration consumes
		// eight consecutive ints
		for (a=0; a<BLOCK_SIZE; a += 8 * sizeof(int))
		{
			// loading the first cell:
			// since it is missing from the cache, the processor
			// sends the chipset a request to read it
			x += *(int *)((char *)p + a + 0 * sizeof(int));

			// loading the next cell:
			// there is no data dependence, so the processor could
			// execute this load without waiting for the previous one;
			// however, this cell is returned by the request that is
			// already in flight, so no new request is generated and
			// the processor waits for the current one to complete
			x += *(int *)((char *)p + a + 1 * sizeof(int));

			// similarly, the remaining loads wait for the first
			// request to complete
			x += *(int *)((char *)p + a + 2 * sizeof(int));
			x += *(int *)((char *)p + a + 3 * sizeof(int));
			x += *(int *)((char *)p + a + 4 * sizeof(int));
			x += *(int *)((char *)p + a + 5 * sizeof(int));
			x += *(int *)((char *)p + a + 6 * sizeof(int));
			x += *(int *)((char *)p + a + 7 * sizeof(int));
		}
	A_END(1)

	printf("%s:\t %4.2f (Mbytes/s)\n",	/* displaying the results */
		_TEXT("Sequential read"),((BLOCK_SIZE)/cpu2time(Ax_GET(1))));


	/* -----------------------------------------------------------------------
	 *
	 *	Measuring the throughput at parallel data read
	 *
	----------------------------------------------------------------------- */
	CLEAR_L2_CACHE();VVV;
	A_BEGIN(2)
		// The buffer is processed in sub-blocks of STEP_SIZE bytes; the
		// sub-block must not exceed the size of the L1 cache, otherwise
		// the second pass would no longer find its data cached.
		for (b = 0; b < BLOCK_SIZE; b += STEP_SIZE)
		{
			// first pass over the sub-block: the parallel data loading
			// NOTE(review): the byte offsets 32/64/96 here and 4..28 in
			// the second pass are hard-coded, i.e. they assume 4-byte
			// ints grouped in 32-byte units.
			for (a=b; a<(b+STEP_SIZE); a+= 128)
			{
				// loading the first cell:
				// since it is missing from the cache, the processor
				// sends the chipset a request to read it
				x += *(int *)((char *)p + a + 0);

				// loading the next cell:
				// there is no data dependence, so the processor can
				// execute this load without waiting for the previous one;
				// since this cell will not be returned with the block
				// just requested, the processor sends another request
				// to the chipset without waiting for the previous one
				// to complete
				x += *(int *)((char *)p + a + 32);

				// proceeding the same way -- now there are three requests on the bus!
				x += *(int *)((char *)p + a + 64);

				// the fourth request is sent to the bus while the
				// first one might not be completed yet
				x += *(int *)((char *)p + a + 96);
			}

			// second pass over the same sub-block: picks up the ints
			// that the first pass skipped
			for (a=b; a<(b+STEP_SIZE); a += 32)
			{
				// the cell at offset 0 is not read again: it was
				// already consumed by the first pass

				// these cells are expected to be in the cache already,
				// so they can be loaded very fast
				x += *(int *)((char *)p + a + 4);
				x += *(int *)((char *)p + a + 8);
				x += *(int *)((char *)p + a + 12);
				x += *(int *)((char *)p + a + 16);
				x += *(int *)((char *)p + a + 20);
				x += *(int *)((char *)p + a + 24);
				x += *(int *)((char *)p + a + 28);
			}
		}
	A_END(2)

	printf("%s:\t %4.2f (Mbytes/s)\n",	/* output of the results */
		_TEXT("Read with 32-byte step"),(BLOCK_SIZE)/cpu2time(Ax_GET(2)));
	PRINT(_TEXT("--------------------------------------------------\n"));

	// the buffer is no longer needed once both measurements are printed
	free(p);

	// returning the sum gives the loads an observable result
	return x;

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -