bandwidthtest.cu

来自「gpu编程」· CU 代码 · 共 829 行 · 第 1/2 页

CU
829
字号
        printf("\n");

        //print results
        if(printmode == CSV)
        {
            printResultsCSV(memSizes, bandwidths, count);
        }
        else
        {
            printResultsReadable(memSizes, bandwidths, count);
        }

        //clean up
        free(memSizes);
        free(bandwidths);
}

///////////////////////////////////////////////////////////////////////////////
// Allocate pageable host memory with malloc, terminating the program with a
// message on stderr if a non-empty request cannot be satisfied.
//   numBytes - number of bytes to allocate
//   purpose  - short description of the buffer, used only in the error message
// Returns the pointer obtained from malloc (release it with free()).
///////////////////////////////////////////////////////////////////////////////
static void *
allocHostOrExit(size_t numBytes, const char *purpose)
{
    void *ptr = malloc(numBytes);

    //malloc(0) is allowed to return NULL, so only a failed non-empty request
    //is treated as an error
    if( NULL == ptr && 0 != numBytes )
    {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory for %s\n",
                (unsigned long)numBytes, purpose);
        exit(EXIT_FAILURE);
    }

    return ptr;
}

///////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//
//   kind        - direction of the copy to measure
//   printmode   - CSV selects printResultsCSV, anything else the readable table
//   memMode     - PAGEABLE or PINNED host memory (not reported for
//                 device to device copies)
//   startDevice - first device to measure
//   endDevice   - last device to measure (inclusive)
//
// The bandwidth measured for each transfer size is summed over all devices in
// [startDevice, endDevice] and the totals are printed once at the end.
///////////////////////////////////////////////////////////////////////////////
void
testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice)
{
    //print info for user
    switch(kind)
    {
    case DEVICE_TO_HOST:    printf("Device to Host Bandwidth for ");
        break;
    case HOST_TO_DEVICE:    printf("Host to Device Bandwidth for ");
        break;
    case DEVICE_TO_DEVICE:  printf("Device to Device Bandwidth\n");
        break;
    }
    if( DEVICE_TO_DEVICE != kind )
    {   switch(memMode)
        {
        case PAGEABLE:  printf("Pageable memory\n");
            break;
        case PINNED:    printf("Pinned memory\n");
            break;
        }
    }

    //count the number of copies to make
    unsigned int count = 1 + (SHMOO_LIMIT_20KB  / SHMOO_INCREMENT_1KB)
                        + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
                        + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
                        + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
                        + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
                        + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
                        + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);

    unsigned int *memSizes = (unsigned int *)allocHostOrExit( count * sizeof( unsigned int ), "shmoo transfer sizes" );
    float *bandwidths = (float *)allocHostOrExit( count * sizeof( float ), "shmoo bandwidth results" );

    // Before calculating the cumulative bandwidth, zero both result arrays so
    // that the per-device sums start from 0 and any slot the sweep does not
    // reach is printed as 0 instead of uninitialized memory
    for (unsigned int i = 0; i < count; i++)
    {
        memSizes[i] = 0;
        bandwidths[i] = 0.0f;
    }

    // Use the device asked by the user
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
    {
        cudaSetDevice(currentDevice);

        //Run the shmoo
        unsigned int iteration = 0;
        unsigned int memSize = 0;

        //The size is stepped before each measurement, so the last measured
        //size can lie one increment above SHMOO_MEMSIZE_MAX.  The additional
        //bound on iteration guarantees that the result arrays are never
        //written past their end, whatever values the SHMOO_* constants have.
        while( memSize <= SHMOO_MEMSIZE_MAX && iteration < count )
        {
            if( memSize < SHMOO_LIMIT_20KB )
            {
                memSize += SHMOO_INCREMENT_1KB;
            }
            else if( memSize < SHMOO_LIMIT_50KB )
            {
                memSize += SHMOO_INCREMENT_2KB;
            }
            else if( memSize < SHMOO_LIMIT_100KB )
            {
                memSize += SHMOO_INCREMENT_10KB;
            }
            else if( memSize < SHMOO_LIMIT_1MB )
            {
                memSize += SHMOO_INCREMENT_100KB;
            }
            else if( memSize < SHMOO_LIMIT_16MB )
            {
                memSize += SHMOO_INCREMENT_1MB;
            }
            else if( memSize < SHMOO_LIMIT_32MB )
            {
                memSize += SHMOO_INCREMENT_2MB;
            }
            else
            {
                memSize += SHMOO_INCREMENT_4MB;
            }

            memSizes[iteration] = memSize;
            switch(kind)
            {
            case DEVICE_TO_HOST:    bandwidths[iteration] += testDeviceToHostTransfer( memSizes[iteration], memMode );
                break;
            case HOST_TO_DEVICE:    bandwidths[iteration] += testHostToDeviceTransfer( memSizes[iteration], memMode );
                break;
            case DEVICE_TO_DEVICE:  bandwidths[iteration] += testDeviceToDeviceTransfer( memSizes[iteration] );
                break;
            }
            iteration++;
            printf(".");
        }
    } // Complete the bandwidth computation on all the devices

    printf("\n");

    //print results
    if( CSV == printmode)
    {
        printResultsCSV(memSizes, bandwidths, count);
    }
    else
    {
        printResultsReadable(memSizes, bandwidths, count);
    }

    //clean up
    free(memSizes);
    free(bandwidths);
}

///////////////////////////////////////////////////////////////////////////////
//  test the bandwidth of a device to host memcopy of a specific size
//
//   memSize - number of bytes moved by each copy
//   memMode - PINNED allocates the host buffers with cudaMallocHost,
//             otherwise they come from malloc
//
// Returns the bandwidth in MB/s (1 MB = 2^20 bytes) computed from the total
// time of MEMCOPY_ITERATIONS back-to-back copies.
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode)
{
    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;
    unsigned char *h_idata = NULL;
    unsigned char *h_odata = NULL;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    //allocate host memory
    if( PINNED == memMode )
    {
        //pinned memory mode - use special function to get OS-pinned memory
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_idata, memSize ) );
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );
    }
    else
    {
        //pageable memory mode - use malloc (checked, because the buffer is
        //written to immediately below)
        h_idata = (unsigned char *)allocHostOrExit( memSize, "device to host source data" );
        h_odata = (unsigned char *)allocHostOrExit( memSize, "device to host destination data" );
    }

    //initialize the memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char) (i & 0xff);
    }

    // allocate device memory
    unsigned char* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));

    //initialize the device memory
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
                                cudaMemcpyHostToDevice) );

    //copy data from GPU to Host
    CUT_SAFE_CALL( cutStartTimer( timer));
    for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
    {
        CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_idata, memSize,
                                cudaMemcpyDeviceToHost) );
    }

    //note:  Since Device to Host memcopies are blocking, there is no need
    //       for a cudaThreadSynchronize() here.

    //get the total elapsed time in ms
    CUT_SAFE_CALL( cutStopTimer( timer));
    elapsedTimeInMs = cutGetTimerValue( timer);

    //calculate bandwidth in MB/s
    bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                                        (elapsedTimeInMs * (float)(1 << 20));

    //clean up memory
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaFreeHost(h_idata) );
        CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
    }
    else
    {
        free(h_idata);
        free(h_odata);
    }
    CUDA_SAFE_CALL(cudaFree(d_idata));

    return bandwidthInMBs;
}

///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
//
//   memSize - number of bytes moved by each copy
//   memMode - PINNED allocates the host buffer with cudaMallocHost,
//             otherwise it comes from malloc
//
// Each of the MEMCOPY_ITERATIONS copies is timed on its own and the times are
// summed; the cache-clearing pass between copies is not timed.
// Returns the bandwidth in MB/s (1 MB = 2^20 bytes).
///////////////////////////////////////////////////////////////////////////////
float
testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode)
{
    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    //allocate host memory
    unsigned char *h_odata = NULL;
    if( PINNED == memMode )
    {
        //pinned memory mode - use special function to get OS-pinned memory
        CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_odata, memSize ) );
    }
    else
    {
        //pageable memory mode - use malloc (checked, because the buffer is
        //written to immediately below)
        h_odata = (unsigned char *)allocHostOrExit( memSize, "host to device source data" );
    }
    unsigned char *h_cacheClear1 = (unsigned char *)allocHostOrExit( CACHE_CLEAR_SIZE, "cache clear buffer 1" );
    unsigned char *h_cacheClear2 = (unsigned char *)allocHostOrExit( CACHE_CLEAR_SIZE, "cache clear buffer 2" );

    //initialize the memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_odata[i] = (unsigned char) (i & 0xff);
    }
    for(unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++)
    {
        h_cacheClear1[i] = (unsigned char) (i & 0xff);
        h_cacheClear2[i] = (unsigned char) (0xff - (i & 0xff));
    }

    //allocate device memory
    unsigned char* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));

    //copy host memory to device memory
    for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
    {
        CUT_SAFE_CALL( cutStartTimer( timer));
        CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_odata, memSize,
                                cudaMemcpyHostToDevice) );

        //Note:  since Host to Device memory copies are blocking,
        //       there is no need for a cudaThreadSynchronize() here.

        //add this copy's elapsed time in ms to the running total
        CUT_SAFE_CALL( cutStopTimer( timer));
        elapsedTimeInMs += cutGetTimerValue( timer);
        CUT_SAFE_CALL( cutResetTimer( timer));

        //prevent unrealistic caching effects by copying a large amount of data.
        //The buffers must be indexed with the inner counter j: indexing with
        //the outer counter i would rewrite a single byte over and over and
        //leave the rest of the cache-clear buffers untouched.
        for(unsigned int j = 0 ; j < CACHE_CLEAR_SIZE / sizeof(unsigned char); j++)
        {
            h_cacheClear1[j] = (unsigned char) (h_cacheClear2[j] & j);
        }
    }

    //calculate bandwidth in MB/s
    bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                                        (elapsedTimeInMs * (float)(1 << 20));

    //clean up memory
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    if( PINNED == memMode )
    {
        CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
    }
    else
    {
        free(h_odata);
    }
    free(h_cacheClear1);
    free(h_cacheClear2);
    CUDA_SAFE_CALL(cudaFree(d_idata));

    return bandwidthInMBs;
}

///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
//
//   memSize - number of bytes moved by each copy
//
// Returns the bandwidth in MB/s (1 MB = 2^20 bytes) computed from the total
// time of MEMCOPY_ITERATIONS copies.  The byte count is multiplied by two in
// the formula below.
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToDeviceTransfer(unsigned int memSize)
{
    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInMBs = 0.0f;

    CUT_SAFE_CALL( cutCreateTimer( &timer ) );

    //allocate host memory (checked, because the buffer is written to
    //immediately below)
    unsigned char *h_idata = (unsigned char *)allocHostOrExit( memSize, "device to device source data" );

    //initialize the host memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char) (i & 0xff);
    }

    //allocate device memory
    unsigned char *d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));
    unsigned char *d_odata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, memSize));

    //initialize memory
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
                                cudaMemcpyHostToDevice) );

    //run the memcopy
    CUT_SAFE_CALL( cutStartTimer( timer));
    for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
    {
        CUDA_SAFE_CALL( cudaMemcpy( d_odata, d_idata, memSize,
                                cudaMemcpyDeviceToDevice) );
    }

    //Since device to device memory copies are non-blocking,
    //cudaThreadSynchronize() is required in order to get
    //proper timing.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    //get the total elapsed time in ms
    CUT_SAFE_CALL( cutStopTimer( timer));
    elapsedTimeInMs = cutGetTimerValue( timer);

    //calculate bandwidth in MB/s
    bandwidthInMBs = 2.0f * (1e3f * memSize * (float)MEMCOPY_ITERATIONS) /
                                        (elapsedTimeInMs * (float)(1 << 20));

    //clean up memory
    CUT_SAFE_CALL( cutDeleteTimer( timer));
    free(h_idata);
    CUDA_SAFE_CALL(cudaFree(d_idata));
    CUDA_SAFE_CALL(cudaFree(d_odata));

    return bandwidthInMBs;
}

/////////////////////////////////////////////////////////
//print results in an easily read format
//
//   memSizes   - transfer sizes in bytes, one per result
//   bandwidths - bandwidth values, one per result
//   count      - number of entries read from both arrays
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, float *bandwidths, unsigned int count)
{
    printf("Transfer Size (Bytes)\tBandwidth(MB/s)\n");
    for(unsigned int i = 0; i < count; i++)
    {
        printf("%9u\t\t%.1f\n", memSizes[i], bandwidths[i]);
    }
    printf("\n");
    fflush(stdout);
}

///////////////////////////////////////////////////////////////////////////
//print results in CSV format
//
//   memSizes   - transfer sizes in bytes, one per result
//   bandwidths - bandwidth values, one per result
//   count      - number of entries read from both arrays
//
// Writes one row of sizes followed by one row of bandwidths; every value,
// including the last of each row, is followed by a comma.
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, float *bandwidths, unsigned int count)
{
    printf("Transfer size (Bytes),");
    for(unsigned int i = 0; i < count; i++)
    {
        printf("%u,", memSizes[i]);
    }
    printf("\n");

    printf("Bandwidth (MB/s),");
    for(unsigned int i = 0; i < count; i++)
    {
        printf("%.1f,", bandwidths[i]);
    }
    printf("\n\n");
    fflush(stdout);
}

///////////////////////////////////////////////////////////////////////////
//Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void)
{
    printf("Usage:  bandwidthTest [OPTION]...\n");
    printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
    printf("\n");
    printf("Example:  measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
    printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");

    printf("\n");
    printf("Options:\n");
    printf("--help\tDisplay this help menu\n");
    printf("--csv\tPrint results as a CSV\n");
    printf("--device=[deviceno]\tSpecify the device device to be used\n");
    printf("  all - compute cumulative bandwidth on all the devices\n");
    printf("  0,1,2,...,n - Specify any particular device to be used\n");
    printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
    printf("  pageable - pageable memory\n");
    printf("  pinned   - non-pageable system memory\n");
    printf("--mode=[MODE]\tSpecify the mode to use\n");
    printf("  quick - performs a quick measurement\n");
    printf("  range - measures a user-specified range of values\n");
    printf("  shmoo - performs an intense shmoo of a large range of values\n");

    printf("--htod\tMeasure host to device transfers\n");
    printf("--dtoh\tMeasure device to host transfers\n");
    printf("--dtod\tMeasure device to device transfers\n");

    printf("Range mode options\n");
    printf("--start=[SIZE]\tStarting transfer size in bytes\n");
    printf("--end=[SIZE]\tEnding transfer size in bytes\n");
    printf("--increment=[SIZE]\tIncrement size in bytes\n");
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?