// transpose.cu
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/* Matrix transpose with Cuda
* Host code.
* This example transposes arbitrary-size matrices. It compares a naive
* transpose kernel that suffers from non-coalesced writes, to an optimized
* transpose with fully coalesced memory access and no bank conflicts. On
* a G80 GPU, the optimized transpose can be more than 10x faster for large
* matrices.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
// includes, kernels
#include <transpose_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
extern "C" void computeGold( float* reference, float* idata,
const unsigned int size_x, const unsigned int size_y );
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    // Run the transpose benchmark, then let the cutil harness handle
    // prompting/exit (CUT_EXIT does not return when it exits the process).
    runTest( argc, argv);
    CUT_EXIT(argc, argv);
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
    // Matrix dimensions: use a smaller problem under device emulation so
    // the run finishes in reasonable time. Both dimensions are multiples of
    // BLOCK_DIM (from transpose_kernel.cu), which the grid setup below
    // relies on.
#ifdef __DEVICE_EMULATION__
    const unsigned int size_x = 32;
    const unsigned int size_y = 128;
#else
    const unsigned int size_x = 256;
    const unsigned int size_y = 4096;
#endif

    // Bytes needed to hold one size_x x size_y matrix of floats.
    const unsigned int mem_size = sizeof(float) * size_x * size_y;

    // Fix: initialize the timer handle instead of passing an indeterminate
    // value to cutCreateTimer.
    unsigned int timer = 0;
    cutCreateTimer(&timer);
    CUT_DEVICE_INIT(argc, argv);

    // Allocate and initialize the host input matrix.
    // Fix: check the allocation instead of writing through a NULL pointer.
    float* h_idata = (float*) malloc(mem_size);
    if (h_idata == NULL)
    {
        fprintf(stderr, "Host allocation of %u bytes failed\n", mem_size);
        exit(EXIT_FAILURE);
    }
    srand(15235911);
    for( unsigned int i = 0; i < (size_x * size_y); ++i)
    {
        h_idata[i] = (float) i; // rand();
    }

    // Device buffers: input matrix and transposed output.
    float* d_idata;
    float* d_odata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, mem_size));
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, mem_size));

    // Copy host memory to device.
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, mem_size,
                                cudaMemcpyHostToDevice) );

    // Execution configuration: one BLOCK_DIM x BLOCK_DIM thread block per
    // tile of the matrix.
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // Warmup so we don't time CUDA startup.
    transpose_naive<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
    transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
    // Fix: kernel launches are asynchronous, so wait for the warmup work to
    // finish before starting the timer; otherwise the warmup execution is
    // charged to the first (naive) timed region.
    cudaThreadSynchronize();

    int numIterations = 1;
    printf("Transposing a %d by %d matrix of floats...\n", size_x, size_y);

    // Time the naive (non-coalesced write) transpose kernel.
    cutStartTimer(timer);
    for (int i = 0; i < numIterations; ++i)
    {
        transpose_naive<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
    }
    cudaThreadSynchronize();
    cutStopTimer(timer);
    float naiveTime = cutGetTimerValue(timer);

    // Time the optimized (coalesced, shared-memory) transpose kernel.
    cutResetTimer(timer);
    cutStartTimer(timer);
    for (int i = 0; i < numIterations; ++i)
    {
        transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
    }
    cudaThreadSynchronize();
    cutStopTimer(timer);
    float optimizedTime = cutGetTimerValue(timer);

    printf("Naive transpose average time: %0.3f ms\n", naiveTime / numIterations);
    printf("Optimized transpose average time: %0.3f ms\n\n", optimizedTime / numIterations);

    // Check if kernel execution generated an error.
    CUT_CHECK_ERROR("Kernel execution failed");

    // Copy the optimized kernel's result back and compare against a CPU
    // reference transpose. Fix: both host allocations are now checked.
    float* h_odata = (float*) malloc(mem_size);
    float* reference = (float*) malloc( mem_size);
    if (h_odata == NULL || reference == NULL)
    {
        fprintf(stderr, "Host allocation of %u bytes failed\n", mem_size);
        exit(EXIT_FAILURE);
    }
    CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, mem_size,
                                cudaMemcpyDeviceToHost) );
    computeGold( reference, h_idata, size_x, size_y);

    // cutComparef does the element-wise comparison of the two buffers.
    CUTBoolean res = cutComparef( reference, h_odata, size_x * size_y);
    printf( "Test %s\n", (1 == res) ? "PASSED" : "FAILED");

    // Cleanup host and device memory and the timer.
    free(h_idata);
    free(h_odata);
    free( reference);
    CUDA_SAFE_CALL(cudaFree(d_idata));
    CUDA_SAFE_CALL(cudaFree(d_odata));
    CUT_SAFE_CALL( cutDeleteTimer(timer));
}