📄 dct8x8.cu

📁 cuda开发环境下的矩阵运算
💻 CU
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.   This source code is a "commercial item" as
 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer  software"  and "commercial computer software
 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

/**
**************************************************************************
* \file dct8x8.cu
* \brief Contains entry point, wrappers to host and device code and benchmark.
*
* This sample implements forward and inverse Discrete Cosine Transform to blocks
* of image pixels (of 8x8 size), as in JPEG standard. The typical work flow is as 
* follows:
* 1. Run CPU version (Host code) and measure execution time;
* 2. Run CUDA version (Device code) and measure execution time;
* 3. Output execution timings and calculate CUDA speedup.
*/

#include "Common.h"


/**
*  The number of DCT kernel calls
*/
#ifdef __DEVICE_EMULATION__
#define BENCHMARK_SIZE	1
#else
#define BENCHMARK_SIZE	10
#endif


/**
*  Texture reference that is passed through this global variable into device code.
*  This is done because any conventional passing through argument list way results 
*  in compiler internal error. 2008.03.11
*/
texture<float, 2, cudaReadModeElementType> TexSrc;


// includes kernels
#include "dct8x8_kernel1.cu"
#include "dct8x8_kernel2.cu"


/**
**************************************************************************
*  Wrapper function for 1st gold version of DCT, quantization and IDCT implementations
*
* \param ImgSrc			[IN] - Source byte image plane
* \param ImgDst			[IN] - Quantized result byte image plane
* \param Stride			[IN] - Stride for both source and result planes
* \param Size			[IN] - Size of both planes
*  
* \return Execution time in milliseconds
*/
float WrapperGold1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
	//allocate float buffers for DCT and other data
	int StrideF;
	float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
	float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF);

	//convert source image to float representation
	CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size);
	AddFloatPlane(-128.0f, ImgF1, StrideF, Size);

	//create and start CUDA timer
	unsigned int timerGold = 0;
	CUT_SAFE_CALL(cutCreateTimer(&timerGold));
	CUT_SAFE_CALL(cutResetTimer(timerGold));

	//perform block-wise DCT processing and benchmarking
	for (int i=0; i<BENCHMARK_SIZE; i++)
	{
		CUT_SAFE_CALL(cutStartTimer(timerGold));
		computeDCT8x8Gold1(ImgF1, ImgF2, StrideF, Size);
		CUT_SAFE_CALL(cutStopTimer(timerGold));
	}

	//stop and destroy CUDA timer
	float TimerGoldSpan = cutGetAverageTimerValue(timerGold);
	CUT_SAFE_CALL(cutDeleteTimer(timerGold));

	//perform quantization
	quantizeGold(ImgF2, StrideF, Size);

	//perform block-wise IDCT processing
	computeIDCT8x8Gold1(ImgF2, ImgF1, StrideF, Size);

	//convert image back to byte representation
	AddFloatPlane(128.0f, ImgF1, StrideF, Size);
	CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size);

	//free float buffers
	FreePlane(ImgF1);
	FreePlane(ImgF2);

	//return time taken by the operation
	return TimerGoldSpan;
}


/**
**************************************************************************
*  Wrapper function for 2nd gold version of DCT, quantization and IDCT implementations
*
* \param ImgSrc			[IN] - Source byte image plane
* \param ImgDst			[IN] - Quantized result byte image plane
* \param Stride			[IN] - Stride for both source and result planes
* \param Size			[IN] - Size of both planes
*  
* \return Execution time in milliseconds
*/
float WrapperGold2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
	//allocate float buffers for DCT and other data
	int StrideF;
	float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
	float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF);

	//convert source image to float representation
	CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size);
	AddFloatPlane(-128.0f, ImgF1, StrideF, Size);

	//create and start CUDA timer
	unsigned int timerGold = 0;
	CUT_SAFE_CALL(cutCreateTimer(&timerGold));
	CUT_SAFE_CALL(cutResetTimer(timerGold));

	//perform block-wise DCT processing and benchmarking
	for (int i=0; i<BENCHMARK_SIZE; i++)
	{
		CUT_SAFE_CALL(cutStartTimer(timerGold));
		computeDCT8x8Gold2(ImgF1, ImgF2, StrideF, Size);
		CUT_SAFE_CALL(cutStopTimer(timerGold));
	}

	//stop and destroy CUDA timer
	float TimerGoldSpan = cutGetAverageTimerValue(timerGold);
	CUT_SAFE_CALL(cutDeleteTimer(timerGold));

	//perform quantization
	quantizeGold(ImgF2, StrideF, Size);

	//perform block-wise IDCT processing
	computeIDCT8x8Gold2(ImgF2, ImgF1, StrideF, Size);

	//convert image back to byte representation
	AddFloatPlane(128.0f, ImgF1, StrideF, Size);
	CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size);

	//free float buffers
	FreePlane(ImgF1);
	FreePlane(ImgF2);

	//return time taken by the operation
	return TimerGoldSpan;
}


/**
**************************************************************************
*  Wrapper function for 1st CUDA version of DCT, quantization and IDCT implementations
*
* \param ImgSrc			[IN] - Source byte image plane
* \param ImgDst			[IN] - Quantized result byte image plane
* \param Stride			[IN] - Stride for both source and result planes
* \param Size			[IN] - Size of both planes
*  
* \return Execution time in milliseconds
*/
float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
	//prepare channel format descriptor for passing texture into kernels
	cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();

	//allocate device memory
	cudaArray *Src;
	float *Dst;
	size_t DstStride;
	CUDA_SAFE_CALL(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
	CUDA_SAFE_CALL(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height));
	DstStride /= sizeof(float);

	//convert source image to float representation
	int ImgSrcFStride;
	float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride);
	CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size);
	AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);

	//copy from host memory to device
	CUDA_SAFE_CALL(cudaMemcpy2DToArray(Src, 0, 0,
									   ImgSrcF, ImgSrcFStride * sizeof(float), 
									   Size.width * sizeof(float), Size.height,
									   cudaMemcpyHostToDevice) );

	//setup execution parameters
	dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

	//create and start CUDA timer
	unsigned int timerCUDA = 0;
	CUT_SAFE_CALL(cutCreateTimer(&timerCUDA));
	CUT_SAFE_CALL(cutResetTimer(timerCUDA));

	//execute DCT kernel and benchmark
	CUDA_SAFE_CALL(cudaBindTextureToArray(TexSrc, Src));
	for (int i=0; i<BENCHMARK_SIZE; i++)
	{
		CUT_SAFE_CALL(cutStartTimer(timerCUDA));
		CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
		cudaThreadSynchronize();
		CUT_SAFE_CALL(cutStopTimer(timerCUDA));
	}
	CUDA_SAFE_CALL(cudaUnbindTexture(TexSrc));
	CUT_CHECK_ERROR("Kernel execution failed");

	// finalize CUDA timer
	float TimerCUDASpan = cutGetAverageTimerValue(timerCUDA);
	CUT_SAFE_CALL(cutDeleteTimer(timerCUDA));

	// execute Quantization kernel
	CUDAkernelQuantizationEmulator<<< grid, threads >>>(Dst, (int) DstStride);
	CUT_CHECK_ERROR("Kernel execution failed");

	//copy quantized coefficients from host memory to device array
	CUDA_SAFE_CALL(cudaMemcpy2DToArray(Src, 0, 0,
									   Dst, DstStride * sizeof(float),
									   Size.width * sizeof(float), Size.height,
									   cudaMemcpyDeviceToDevice) );

	// execute IDCT kernel
	CUDA_SAFE_CALL(cudaBindTextureToArray(TexSrc, Src));
	CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
	CUDA_SAFE_CALL(cudaUnbindTexture(TexSrc));
	CUT_CHECK_ERROR("Kernel execution failed");

	//copy quantized image block to host
	CUDA_SAFE_CALL(cudaMemcpy2D(ImgSrcF, ImgSrcFStride * sizeof(float), 
								Dst, DstStride * sizeof(float), 
								Size.width * sizeof(float), Size.height,
								cudaMemcpyDeviceToHost) );

	//convert image back to byte representation
	AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);
	CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);

	//clean up memory
	CUDA_SAFE_CALL(cudaFreeArray(Src));
	CUDA_SAFE_CALL(cudaFree(Dst));
	FreePlane(ImgSrcF);

	//return time taken by the operation
	return TimerCUDASpan;
}


/**
**************************************************************************
*  Wrapper function for 2nd CUDA version of DCT, quantization and IDCT implementations
*
* \param ImgSrc			[IN] - Source byte image plane
* \param ImgDst			[IN] - Quantized result byte image plane
* \param Stride			[IN] - Stride for both source and result planes
* \param Size			[IN] - Size of both planes
*  
* \return Execution time in milliseconds
*/
float WrapperCUDA2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
	//setup CUDA execution parameters
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -