📄 dct8x8.cu
字号:
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/**
**************************************************************************
* \file dct8x8.cu
* \brief Contains entry point, wrappers to host and device code and benchmark.
*
* This sample implements forward and inverse Discrete Cosine Transform to blocks
* of image pixels (of 8x8 size), as in JPEG standard. The typical work flow is as
* follows:
* 1. Run CPU version (Host code) and measure execution time;
* 2. Run CUDA version (Device code) and measure execution time;
* 3. Output execution timings and calculate CUDA speedup.
*/
#include "Common.h"
/**
* The number of DCT kernel calls
*/
#ifdef __DEVICE_EMULATION__
#define BENCHMARK_SIZE 1
#else
#define BENCHMARK_SIZE 10
#endif
/**
* Texture reference that is passed through this global variable into device code.
* This is done because any conventional passing through argument list way results
* in compiler internal error. 2008.03.11
*/
texture<float, 2, cudaReadModeElementType> TexSrc;
// includes kernels
#include "dct8x8_kernel1.cu"
#include "dct8x8_kernel2.cu"
/**
**************************************************************************
* Wrapper function for 1st gold version of DCT, quantization and IDCT implementations
*
* \param ImgSrc [IN] - Source byte image plane
* \param ImgDst [IN] - Quantized result byte image plane
* \param Stride [IN] - Stride for both source and result planes
* \param Size [IN] - Size of both planes
*
* \return Execution time in milliseconds
*/
float WrapperGold1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
//allocate float buffers for DCT and other data
int StrideF;
float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
//convert source image to float representation
CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size);
AddFloatPlane(-128.0f, ImgF1, StrideF, Size);
//create and start CUDA timer
unsigned int timerGold = 0;
CUT_SAFE_CALL(cutCreateTimer(&timerGold));
CUT_SAFE_CALL(cutResetTimer(timerGold));
//perform block-wise DCT processing and benchmarking
for (int i=0; i<BENCHMARK_SIZE; i++)
{
CUT_SAFE_CALL(cutStartTimer(timerGold));
computeDCT8x8Gold1(ImgF1, ImgF2, StrideF, Size);
CUT_SAFE_CALL(cutStopTimer(timerGold));
}
//stop and destroy CUDA timer
float TimerGoldSpan = cutGetAverageTimerValue(timerGold);
CUT_SAFE_CALL(cutDeleteTimer(timerGold));
//perform quantization
quantizeGold(ImgF2, StrideF, Size);
//perform block-wise IDCT processing
computeIDCT8x8Gold1(ImgF2, ImgF1, StrideF, Size);
//convert image back to byte representation
AddFloatPlane(128.0f, ImgF1, StrideF, Size);
CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size);
//free float buffers
FreePlane(ImgF1);
FreePlane(ImgF2);
//return time taken by the operation
return TimerGoldSpan;
}
/**
**************************************************************************
* Wrapper function for 2nd gold version of DCT, quantization and IDCT implementations
*
* \param ImgSrc [IN] - Source byte image plane
* \param ImgDst [IN] - Quantized result byte image plane
* \param Stride [IN] - Stride for both source and result planes
* \param Size [IN] - Size of both planes
*
* \return Execution time in milliseconds
*/
float WrapperGold2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
//allocate float buffers for DCT and other data
int StrideF;
float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
//convert source image to float representation
CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size);
AddFloatPlane(-128.0f, ImgF1, StrideF, Size);
//create and start CUDA timer
unsigned int timerGold = 0;
CUT_SAFE_CALL(cutCreateTimer(&timerGold));
CUT_SAFE_CALL(cutResetTimer(timerGold));
//perform block-wise DCT processing and benchmarking
for (int i=0; i<BENCHMARK_SIZE; i++)
{
CUT_SAFE_CALL(cutStartTimer(timerGold));
computeDCT8x8Gold2(ImgF1, ImgF2, StrideF, Size);
CUT_SAFE_CALL(cutStopTimer(timerGold));
}
//stop and destroy CUDA timer
float TimerGoldSpan = cutGetAverageTimerValue(timerGold);
CUT_SAFE_CALL(cutDeleteTimer(timerGold));
//perform quantization
quantizeGold(ImgF2, StrideF, Size);
//perform block-wise IDCT processing
computeIDCT8x8Gold2(ImgF2, ImgF1, StrideF, Size);
//convert image back to byte representation
AddFloatPlane(128.0f, ImgF1, StrideF, Size);
CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size);
//free float buffers
FreePlane(ImgF1);
FreePlane(ImgF2);
//return time taken by the operation
return TimerGoldSpan;
}
/**
**************************************************************************
* Wrapper function for 1st CUDA version of DCT, quantization and IDCT implementations
*
* \param ImgSrc [IN] - Source byte image plane
* \param ImgDst [IN] - Quantized result byte image plane
* \param Stride [IN] - Stride for both source and result planes
* \param Size [IN] - Size of both planes
*
* \return Execution time in milliseconds
*/
float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
//prepare channel format descriptor for passing texture into kernels
cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();
//allocate device memory
cudaArray *Src;
float *Dst;
size_t DstStride;
CUDA_SAFE_CALL(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
CUDA_SAFE_CALL(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height));
DstStride /= sizeof(float);
//convert source image to float representation
int ImgSrcFStride;
float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride);
CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size);
AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);
//copy from host memory to device
CUDA_SAFE_CALL(cudaMemcpy2DToArray(Src, 0, 0,
ImgSrcF, ImgSrcFStride * sizeof(float),
Size.width * sizeof(float), Size.height,
cudaMemcpyHostToDevice) );
//setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);
//create and start CUDA timer
unsigned int timerCUDA = 0;
CUT_SAFE_CALL(cutCreateTimer(&timerCUDA));
CUT_SAFE_CALL(cutResetTimer(timerCUDA));
//execute DCT kernel and benchmark
CUDA_SAFE_CALL(cudaBindTextureToArray(TexSrc, Src));
for (int i=0; i<BENCHMARK_SIZE; i++)
{
CUT_SAFE_CALL(cutStartTimer(timerCUDA));
CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(timerCUDA));
}
CUDA_SAFE_CALL(cudaUnbindTexture(TexSrc));
CUT_CHECK_ERROR("Kernel execution failed");
// finalize CUDA timer
float TimerCUDASpan = cutGetAverageTimerValue(timerCUDA);
CUT_SAFE_CALL(cutDeleteTimer(timerCUDA));
// execute Quantization kernel
CUDAkernelQuantizationEmulator<<< grid, threads >>>(Dst, (int) DstStride);
CUT_CHECK_ERROR("Kernel execution failed");
//copy quantized coefficients from host memory to device array
CUDA_SAFE_CALL(cudaMemcpy2DToArray(Src, 0, 0,
Dst, DstStride * sizeof(float),
Size.width * sizeof(float), Size.height,
cudaMemcpyDeviceToDevice) );
// execute IDCT kernel
CUDA_SAFE_CALL(cudaBindTextureToArray(TexSrc, Src));
CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
CUDA_SAFE_CALL(cudaUnbindTexture(TexSrc));
CUT_CHECK_ERROR("Kernel execution failed");
//copy quantized image block to host
CUDA_SAFE_CALL(cudaMemcpy2D(ImgSrcF, ImgSrcFStride * sizeof(float),
Dst, DstStride * sizeof(float),
Size.width * sizeof(float), Size.height,
cudaMemcpyDeviceToHost) );
//convert image back to byte representation
AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);
CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);
//clean up memory
CUDA_SAFE_CALL(cudaFreeArray(Src));
CUDA_SAFE_CALL(cudaFree(Dst));
FreePlane(ImgSrcF);
//return time taken by the operation
return TimerCUDASpan;
}
/**
**************************************************************************
* Wrapper function for 2nd CUDA version of DCT, quantization and IDCT implementations
*
* \param ImgSrc [IN] - Source byte image plane
* \param ImgDst [IN] - Quantized result byte image plane
* \param Stride [IN] - Stride for both source and result planes
* \param Size [IN] - Size of both planes
*
* \return Execution time in milliseconds
*/
float WrapperCUDA2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
//setup CUDA execution parameters
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -