bandwidthtest.cu
来自「gpu编程」· CU 代码 · 共 829 行 · 第 1/2 页
CU
829 行
// NOTE(review): the statements down to the first closing brace are the tail of
// a function whose beginning lies outside this excerpt. They are reproduced
// token-for-token; only the lost line breaks have been restored.
    printf("\n");

    //print results
    if(printmode == CSV)
    {
        printResultsCSV(memSizes, bandwidths, count);
    }
    else
    {
        printResultsReadable(memSizes, bandwidths, count);
    }

    //clean up
    free(memSizes);
    free(bandwidths);
}

///////////////////////////////////////////////////////////////////////////////
// Allocate numBytes of pageable host memory, or terminate the program.
//
// Prints a message to stderr and exits with EXIT_FAILURE when malloc() fails.
// A NULL result for a zero-byte request is passed through unchanged, because
// malloc(0) is allowed to return NULL without that being an error.
///////////////////////////////////////////////////////////////////////////////
static void *
mallocOrExit(size_t numBytes)
{
    void *ptr = malloc( numBytes );

    if( NULL == ptr && 0 != numBytes )
    {
        fprintf(stderr, "Not enough memory available on host to run test!\n");
        exit(EXIT_FAILURE);
    }

    return ptr;
}

///////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//
//  kind        - which transfer direction to measure
//  printmode   - CSV selects printResultsCSV(), anything else selects
//                printResultsReadable()
//  memMode     - PAGEABLE or PINNED host memory (ignored for DEVICE_TO_DEVICE)
//  startDevice - first device index to test (inclusive)
//  endDevice   - last device index to test (inclusive)
//
// The bandwidth measured for a given transfer size is accumulated (summed)
// over all devices in [startDevice, endDevice] before being printed.
///////////////////////////////////////////////////////////////////////////////
void
testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice)
{
    //print info for user
    switch(kind)
    {
    case DEVICE_TO_HOST:
        printf("Device to Host Bandwidth for ");
        break;
    case HOST_TO_DEVICE:
        printf("Host to Device Bandwidth for ");
        break;
    case DEVICE_TO_DEVICE:
        printf("Device to Device Bandwidth\n");
        break;
    }
    if( DEVICE_TO_DEVICE != kind )
    {
        switch(memMode)
        {
        case PAGEABLE:
            printf("Pageable memory\n");
            break;
        case PINNED:
            printf("Pinned memory\n");
            break;
        }
    }

    //count the number of copies to make
    unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
        + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
        + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
        + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
        + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
        + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
        + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);

    unsigned int *memSizes = ( unsigned int * )mallocOrExit( count * sizeof( unsigned int ) );
    float *bandwidths = ( float * )mallocOrExit( count * sizeof(float) );

    // Before accumulating the cumulative bandwidth, zero both result arrays so
    // that any slot the sweep does not reach prints as 0 rather than garbage.
    for (unsigned int i = 0; i < count; i++)
    {
        memSizes[i] = 0;
        bandwidths[i] = 0.0f;
    }

    // Use the device asked by the user
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
    {
        // NOTE(review): the return value is left unchecked exactly as in the
        // original code; confirm whether a failure here should be fatal.
        cudaSetDevice(currentDevice);

        //Run the shmoo
        unsigned int iteration = 0;
        unsigned int memSize = 0;

        // "iteration < count" keeps the writes inside the result arrays even if
        // the SHMOO_* constants are changed so that the sweep below performs
        // more steps than the closed-form "count" above predicts.
        while( memSize <= SHMOO_MEMSIZE_MAX && iteration < count )
        {
            // the step size grows with the transfer size
            if( memSize < SHMOO_LIMIT_20KB )
            {
                memSize += SHMOO_INCREMENT_1KB;
            }
            else if( memSize < SHMOO_LIMIT_50KB )
            {
                memSize += SHMOO_INCREMENT_2KB;
            }
            else if( memSize < SHMOO_LIMIT_100KB )
            {
                memSize += SHMOO_INCREMENT_10KB;
            }
            else if( memSize < SHMOO_LIMIT_1MB )
            {
                memSize += SHMOO_INCREMENT_100KB;
            }
            else if( memSize < SHMOO_LIMIT_16MB )
            {
                memSize += SHMOO_INCREMENT_1MB;
            }
            else if( memSize < SHMOO_LIMIT_32MB )
            {
                memSize += SHMOO_INCREMENT_2MB;
            }
            else
            {
                memSize += SHMOO_INCREMENT_4MB;
            }

            memSizes[iteration] = memSize;
            switch(kind)
            {
            case DEVICE_TO_HOST:
                bandwidths[iteration] += testDeviceToHostTransfer( memSizes[iteration], memMode );
                break;
            case HOST_TO_DEVICE:
                bandwidths[iteration] += testHostToDeviceTransfer( memSizes[iteration], memMode );
                break;
            case DEVICE_TO_DEVICE:
                bandwidths[iteration] += testDeviceToDeviceTransfer( memSizes[iteration] );
                break;
            }
            iteration++;
            printf(".");
        }
    } // Complete the bandwidth computation on all the devices

    printf("\n");

    //print results
    if( CSV == printmode)
    {
        printResultsCSV(memSizes, bandwidths, count);
    }
    else
    {
        printResultsReadable(memSizes, bandwidths, count);
    }

    //clean up
    free(memSizes);
    free(bandwidths);
}

///////////////////////////////////////////////////////////////////////////////
//  test the bandwidth of a device to host memcopy of a specific size
//
//  memSize - number of bytes moved by each copy
//  memMode - PINNED allocates the host buffers with cudaMallocHost(),
//            otherwise they are allocated with malloc()
//
//  Returns the bandwidth over MEMCOPY_ITERATIONS copies in MB/s, where
//  1 MB = 2^20 bytes.
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode)
{
    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;
    unsigned char *h_idata = NULL;
    unsigned char *h_odata = NULL;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    //allocate host memory
    if( PINNED == memMode )
    {
        //pinned memory mode - use special function to get OS-pinned memory
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idata, memSize ) );
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );
    }
    else
    {
        //pageable memory mode - use malloc (exits on allocation failure)
        h_idata = (unsigned char *)mallocOrExit( memSize );
        h_odata = (unsigned char *)mallocOrExit( memSize );
    }

    //initialize the memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char) (i & 0xff);
    }

    // allocate device memory
    unsigned char* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));

    //initialize the device memory
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
                                cudaMemcpyHostToDevice) );

    //copy data from GPU to Host
    CUT_SAFE_CALL( cutStartTimer( timer));
    for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
    {
        CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_idata, memSize,
                                    cudaMemcpyDeviceToHost) );
    }

    //note:  Since Device to Host memcopies are blocking, there is no need
    //       for a cudaThreadSynchronize() here.

    //get the total elapsed time in ms
    CUT_SAFE_CALL( cutStopTimer( timer));
    elapsedTimeInMs = cutGetTimerValue( timer);

    //calculate bandwidth in MB/s
    bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                     (elapsedTimeInMs * (float)(1 << 20));

    //clean up memory
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaFreeHost(h_idata) );
        CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
    }
    else
    {
        free(h_idata);
        free(h_odata);
    }
    CUDA_SAFE_CALL(cudaFree(d_idata));

    return bandwidthInMBs;
}

///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
//
//  memSize - number of bytes moved by each copy
//  memMode - PINNED allocates the source buffer with cudaMallocHost(),
//            otherwise it is allocated with malloc()
//
//  Each of the MEMCOPY_ITERATIONS copies is timed individually; between
//  copies two CACHE_CLEAR_SIZE scratch buffers are swept, outside the timed
//  region.
//
//  Returns the bandwidth in MB/s, where 1 MB = 2^20 bytes.
///////////////////////////////////////////////////////////////////////////////
float
testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode)
{
    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    //allocate host memory
    unsigned char *h_odata = NULL;
    if( PINNED == memMode )
    {
        //pinned memory mode - use special function to get OS-pinned memory
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );
    }
    else
    {
        //pageable memory mode - use malloc (exits on allocation failure)
        h_odata = (unsigned char *)mallocOrExit( memSize );
    }
    unsigned char *h_cacheClear1 = (unsigned char *)mallocOrExit( CACHE_CLEAR_SIZE );
    unsigned char *h_cacheClear2 = (unsigned char *)mallocOrExit( CACHE_CLEAR_SIZE );

    //initialize the memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_odata[i] = (unsigned char) (i & 0xff);
    }
    for(unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++)
    {
        h_cacheClear1[i] = (unsigned char) (i & 0xff);
        h_cacheClear2[i] = (unsigned char) (0xff - (i & 0xff));
    }

    //allocate device memory
    unsigned char* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));

    //copy host memory to device memory
    for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
    {
        CUT_SAFE_CALL( cutStartTimer( timer));
        CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_odata, memSize,
                                    cudaMemcpyHostToDevice) );

        //Note: since Host to Device memory copies are blocking,
        //      there is no need for a cudaThreadSynchronize() here.

        //accumulate the total elapsed time in ms
        CUT_SAFE_CALL( cutStopTimer( timer));
        elapsedTimeInMs += cutGetTimerValue( timer);
        CUT_SAFE_CALL( cutResetTimer( timer));

        //prevent unrealistic caching effects by copying a large amount of data
        //(indexed by j so that the whole scratch buffer is actually swept;
        // indexing by the outer counter i would touch a single byte only)
        for(unsigned int j = 0 ; j < CACHE_CLEAR_SIZE / sizeof(unsigned char); j++)
        {
            h_cacheClear1[j] = (unsigned char) (h_cacheClear2[j] & j);
        }
    }

    //calculate bandwidth in MB/s
    bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                     (elapsedTimeInMs * (float)(1 << 20));

    //clean up memory
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
    }
    else
    {
        free(h_odata);
    }
    free(h_cacheClear1);
    free(h_cacheClear2);
    CUDA_SAFE_CALL(cudaFree(d_idata));

    return bandwidthInMBs;
}

///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
//
//  memSize - number of bytes moved by each copy
//
//  Returns the bandwidth over MEMCOPY_ITERATIONS copies in MB/s, where
//  1 MB = 2^20 bytes. The byte count is multiplied by 2.0f; presumably this
//  accounts for each copy both reading and writing device memory - confirm.
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToDeviceTransfer(unsigned int memSize)
{
    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    //allocate host memory (exits on allocation failure)
    unsigned char *h_idata = (unsigned char *)mallocOrExit( memSize );

    //initialize the host memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char) (i & 0xff);
    }

    //allocate device memory
    unsigned char *d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));
    unsigned char *d_odata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, memSize));

    //initialize memory
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
                                cudaMemcpyHostToDevice) );

    //run the memcopy
    CUT_SAFE_CALL( cutStartTimer( timer));
    for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
    {
        CUDA_SAFE_CALL( cudaMemcpy( d_odata, d_idata, memSize,
                                    cudaMemcpyDeviceToDevice) );
    }

    //Since device to device memory copies are non-blocking,
    //cudaThreadSynchronize() is required in order to get
    //proper timing.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    //get the total elapsed time in ms
    CUT_SAFE_CALL( cutStopTimer( timer));
    elapsedTimeInMs = cutGetTimerValue( timer);

    //calculate bandwidth in MB/s
    bandwidthInMBs = 2.0f * (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                     (elapsedTimeInMs * (float)(1 << 20));

    //clean up memory
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    free(h_idata);
    CUDA_SAFE_CALL(cudaFree(d_idata));
    CUDA_SAFE_CALL(cudaFree(d_odata));

    return bandwidthInMBs;
}

/////////////////////////////////////////////////////////
//print results in an easily read format
//
//  memSizes   - transfer sizes in bytes, one per result
//  bandwidths - measured bandwidths in MB/s, parallel to memSizes
//  count      - number of entries in both arrays
/////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, float *bandwidths, unsigned int count)
{
    printf("Transfer Size (Bytes)\tBandwidth(MB/s)\n");
    for(unsigned int i = 0; i < count; i++)
    {
        printf("%9u\t\t%.1f\n", memSizes[i], bandwidths[i]);
    }
    printf("\n");
    fflush(stdout);
}

///////////////////////////////////////////////////////////////////////////
//print results in CSV format
//
//  Emits one row of transfer sizes followed by one row of bandwidths; every
//  value, including the last of each row, is followed by a comma.
//
//  memSizes   - transfer sizes in bytes, one per result
//  bandwidths - measured bandwidths in MB/s, parallel to memSizes
//  count      - number of entries in both arrays
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, float *bandwidths, unsigned int count)
{
    printf("Transfer size (Bytes),");
    for(unsigned int i = 0; i < count; i++)
    {
        printf("%u,", memSizes[i]);
    }
    printf("\n");

    printf("Bandwidth (MB/s),");
    for(unsigned int i = 0; i < count; i++)
    {
        printf("%.1f,", bandwidths[i]);
    }
    printf("\n\n");
    fflush(stdout);
}

///////////////////////////////////////////////////////////////////////////
//Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void)
{
    printf("Usage:  bandwidthTest [OPTION]...\n");
    printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
    printf("\n");
    printf("Example:  measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
    printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");

    printf("\n");
    printf("Options:\n");
    printf("--help\tDisplay this help menu\n");
    printf("--csv\tPrint results as a CSV\n");
    printf("--device=[deviceno]\tSpecify the device device to be used\n");
    printf(" all - compute cumulative bandwidth on all the devices\n");
    printf(" 0,1,2,...,n - Specify any particular device to be used\n");
    printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
    printf(" pageable - pageable memory\n");
    printf(" pinned - non-pageable system memory\n");
    printf("--mode=[MODE]\tSpecify the mode to use\n");
    printf(" quick - performs a quick measurement\n");
    printf(" range - measures a user-specified range of values\n");
    printf(" shmoo - performs an intense shmoo of a large range of values\n");

    printf("--htod\tMeasure host to device transfers\n");
    printf("--dtoh\tMeasure device to host transfers\n");
    printf("--dtod\tMeasure device to device transfers\n");

    printf("Range mode options\n");
    printf("--start=[SIZE]\tStarting transfer size in bytes\n");
    printf("--end=[SIZE]\tEnding transfer size in bytes\n");
    printf("--increment=[SIZE]\tIncrement size in bytes\n");
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?