📄 dct8x8.cu

📁 cuda开发环境下的矩阵运算
💻 CU
📖 第 1 页 / 共 2 页
字号:
上一页 12
	int CudaDeviceNum;
	cudaDeviceProp DeviceProperties;
	cudaGetDevice(&CudaDeviceNum);
	cudaGetDeviceProperties(&DeviceProperties, CudaDeviceNum);
	const int NumThreadsInWarp = DeviceProperties.warpSize;
	const int NumWarpsInBlock = 4; //chosen according to Occupancy Calculator
	const int NumThreadsInBlock = NumThreadsInWarp * NumWarpsInBlock;
	const int NumThreadsInBlock8x8 = BLOCK_SIZE;	//Each thread processes single row of 8x8 block, then column (row-column 1D DCT or IDCT)
	const int NumBlocks8x8InBlock = NumThreadsInBlock / NumThreadsInBlock8x8;
	const int NumBlocks8x8InImage = Size.width * Size.height / BLOCK_SIZE2;
	const int NumBlocksInImage   = NumBlocks8x8InImage / NumBlocks8x8InBlock;
	const int NumEndianBlocks8x8 = NumBlocks8x8InImage % NumBlocks8x8InBlock;
	size_t SharedMemAmount = NumBlocks8x8InBlock * BLOCK_SIZE2 * sizeof(float);
	if (SharedMemAmount > DeviceProperties.sharedMemPerBlock)
	{
		return -1;
	}
	int WidthInBlocks = Size.width / BLOCK_SIZE;
	int HeightInBlocks = Size.height / BLOCK_SIZE;
	float InvWidthInBlocksF = 1.0f / WidthInBlocks;

	//Configure parameters that handle case when not all blocks8x8 will be processed by Full Warps kernel
	//In this case the unprocessed area is processed by the previous kernel
	int LastUnprocessedBlockNum = NumBlocksInImage * NumBlocks8x8InBlock;
	int LastUnprocOffsetYInBlocks = LastUnprocessedBlockNum / WidthInBlocks;
	int LastUnprocOffsetXInBlocks = LastUnprocessedBlockNum % WidthInBlocks;;
	if (HeightInBlocks - LastUnprocOffsetYInBlocks > 1)
	{
		//ensuring we overlap the whole unprocessed region
		LastUnprocOffsetXInBlocks = 0;
	}
	dim3 ThreadsEndianBlocks(BLOCK_SIZE, BLOCK_SIZE);
	dim3 GridEndianBlocks(WidthInBlocks - LastUnprocOffsetXInBlocks, HeightInBlocks - LastUnprocOffsetYInBlocks);

	//setup execution parameters for quantization
	dim3 ThreadsSmallBlocks(BLOCK_SIZE, BLOCK_SIZE);
	dim3 GridSmallBlocks(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

	//prepare channel format descriptor for passing texture into kernels
	cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();

	//allocate device memory
	cudaArray *Src;
	float *Dst;
	size_t DstStride;
	CUDA_SAFE_CALL(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
	CUDA_SAFE_CALL(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height));
	DstStride /= sizeof(float);

	//convert source image to float representation
	int ImgSrcFStride;
	float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride);
	CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size);
	AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);

	//copy from host memory to device
	CUDA_SAFE_CALL(cudaMemcpy2DToArray( Src, 0, 0,
										ImgSrcF, ImgSrcFStride * sizeof(float), 
										Size.width * sizeof(float), Size.height,
										cudaMemcpyHostToDevice) );

	//create and start CUDA timer
	unsigned int timerCUDA = 0;
	CUT_SAFE_CALL(cutCreateTimer(&timerCUDA));
	CUT_SAFE_CALL(cutResetTimer(timerCUDA));

	//setup execution parameters
	dim3 ThreadsFullWarps(NumThreadsInBlock);
	dim3 GridFullWarps(NumBlocksInImage);

	//execute DCT kernel and benchmark
	CUDA_SAFE_CALL(cudaBindTextureToArray(TexSrc, Src));
	for (int i=0; i<BENCHMARK_SIZE; i++)
	{
		if (NumBlocksInImage > 0)
		{
			CUT_SAFE_CALL(cutStartTimer(timerCUDA));
			CUDAkernel2DCT<<< GridFullWarps, ThreadsFullWarps, SharedMemAmount >>>(Dst, (int) DstStride, NumBlocks8x8InBlock, WidthInBlocks, InvWidthInBlocksF);
			cudaThreadSynchronize();
			CUT_SAFE_CALL(cutStopTimer(timerCUDA));
		}
		//if the number of image blocks is multiple of 4 then this kernel call can be omitted
		if (NumEndianBlocks8x8 > 0)
		{
			CUDAkernel1DCT<<< GridEndianBlocks, ThreadsEndianBlocks >>>(Dst, (int) DstStride, LastUnprocOffsetXInBlocks, LastUnprocOffsetYInBlocks);
		}
	}
	CUDA_SAFE_CALL(cudaUnbindTexture(TexSrc));
	CUT_CHECK_ERROR("Kernel execution failed");

	// finalize CUDA timer
	float TimerCUDASpan = cutGetAverageTimerValue(timerCUDA);
	CUT_SAFE_CALL(cutDeleteTimer(timerCUDA));

	// execute Quantization kernel
	CUDAkernelQuantizationEmulator<<< GridSmallBlocks, ThreadsSmallBlocks >>>(Dst, (int) DstStride);
	CUT_CHECK_ERROR("Kernel execution failed");

	//copy quantized coefficients from host memory to device array
	CUDA_SAFE_CALL(cudaMemcpy2DToArray( Src, 0, 0,
										Dst, DstStride * sizeof(float),
										Size.width * sizeof(float), Size.height,
										cudaMemcpyDeviceToDevice) );

	// execute IDCT kernel
	CUDA_SAFE_CALL(cudaBindTextureToArray(TexSrc, Src));
	if (NumBlocksInImage > 0)
	{
		CUDAkernel2IDCT<<< GridFullWarps, ThreadsFullWarps, SharedMemAmount >>>(Dst, (int) DstStride, NumBlocks8x8InBlock, WidthInBlocks, InvWidthInBlocksF);
	}
	if (NumEndianBlocks8x8 > 0)
	{
		CUDAkernel1IDCT<<< GridEndianBlocks, ThreadsEndianBlocks >>>(Dst, (int) DstStride, LastUnprocOffsetXInBlocks, LastUnprocOffsetYInBlocks);
	}
	CUDA_SAFE_CALL(cudaUnbindTexture(TexSrc));
	CUT_CHECK_ERROR("Kernel execution failed");

	//copy quantized image block to host
	CUDA_SAFE_CALL(cudaMemcpy2D(ImgSrcF, ImgSrcFStride * sizeof(float), 
								Dst, DstStride * sizeof(float), 
								Size.width * sizeof(float), Size.height,
								cudaMemcpyDeviceToHost) );

	//convert image back to byte representation
	AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);
	CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);

	//clean up memory
	CUDA_SAFE_CALL(cudaFreeArray(Src));
	CUDA_SAFE_CALL(cudaFree(Dst));
	FreePlane(ImgSrcF);

	//return time taken by the operation
	return TimerCUDASpan;
}


/**
**************************************************************************
*  Program entry point
*
* \param argc		[IN] - Number of command-line arguments
* \param argv		[IN] - Array of command-line arguments
*  
* \return Status code
*/
int main(int argc, char** argv)
{
	//
	// Sample initialization
	//

	//initialize CUDA
	CUT_DEVICE_INIT(argc, argv);

	//source and results image filenames
	char SampleImageFname[] = "barbara.bmp";
	char SampleImageFnameResGold1[] = "barbara_gold1.bmp";
	char SampleImageFnameResGold2[] = "barbara_gold2.bmp";
	char SampleImageFnameResCUDA1[] = "barbara_cuda1.bmp";
	char SampleImageFnameResCUDA2[] = "barbara_cuda2.bmp";

	char *pSampleImageFpath = cutFindFilePath(SampleImageFname, argv[0]);

	//preload image (acquire dimensions)
	int ImgWidth, ImgHeight;
	ROI ImgSize;
	int res = PreLoadBmp(pSampleImageFpath, &ImgWidth, &ImgHeight);
	ImgSize.width = ImgWidth;
	ImgSize.height = ImgHeight;

	//CONSOLE INFORMATION: saying hello to user
	printf("CUDA sample DCT/IDCT implementation\n");
	printf("===================================\n");
	printf("Loading test image: %s... ", SampleImageFname);

	if (res)
	{
		printf("\nError: Image file not found or invalid!\n");
		printf("Press ENTER to exit...\n");
		getchar();

		//finalize
		CUT_EXIT(argc, argv);
		return 1;
	}

	//check image dimensions are multiples of BLOCK_SIZE
	if (ImgWidth % BLOCK_SIZE != 0 || ImgHeight % BLOCK_SIZE != 0)
	{
		printf("\nError: Input image dimensions must be multiples of 8!\n");
		printf("Press ENTER to exit...\n");
		getchar();

		//finalize
		CUT_EXIT(argc, argv);
		return 1;
	}
	printf("[%d x %d]... ", ImgWidth, ImgHeight);

	//allocate image buffers
	int ImgStride;
	byte *ImgSrc = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
	byte *ImgDstGold1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
	byte *ImgDstGold2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
	byte *ImgDstCUDA1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);
	byte *ImgDstCUDA2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride);

	//load sample image
	LoadBmpAsGray(pSampleImageFpath, ImgStride, ImgSize, ImgSrc);

	//
	// RUNNING WRAPPERS
	//

	//compute Gold 1 version of DCT/quantization/IDCT
	printf("Success\nRunning Gold 1 (CPU) version... ");
	float TimeGold1 = WrapperGold1(ImgSrc, ImgDstGold1, ImgStride, ImgSize);

	//compute Gold 2 version of DCT/quantization/IDCT
	printf("Success\nRunning Gold 2 (CPU) version... ");
	float TimeGold2 = WrapperGold2(ImgSrc, ImgDstGold2, ImgStride, ImgSize);

	//compute CUDA 1 version of DCT/quantization/IDCT
	printf("Success\nRunning CUDA 1 (GPU) version... ");
	float TimeCUDA1 = WrapperCUDA1(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize);

	//compute CUDA 2 version of DCT/quantization/IDCT
	printf("Success\nRunning CUDA 2 (GPU) version... ");
	float TimeCUDA2 = WrapperCUDA2(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize);

	//
	// Execution statistics, result saving and validation
	//

	//dump result of Gold 1 processing
	printf("Success\nDumping result to %s... ", SampleImageFnameResGold1);
	DumpBmpAsGray(SampleImageFnameResGold1, ImgDstGold1, ImgStride, ImgSize);

	//dump result of Gold 2 processing
	printf("Success\nDumping result to %s... ", SampleImageFnameResGold2);
	DumpBmpAsGray(SampleImageFnameResGold2, ImgDstGold2, ImgStride, ImgSize);

	//dump result of CUDA 1 processing
	printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA1);
	DumpBmpAsGray(SampleImageFnameResCUDA1, ImgDstCUDA1, ImgStride, ImgSize);

	//dump result of CUDA 2 processing
	printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA2);
	DumpBmpAsGray(SampleImageFnameResCUDA2, ImgDstCUDA2, ImgStride, ImgSize);

	//print speed info
	printf("Success\n\n");

#ifdef __DEVICE_EMULATION__
	printf("Processing time : not relevant in CUDA emulation mode\n");
#else
	printf("Processing time (CUDA 1) : %f ms \n", TimeCUDA1);
	printf("Processing time (CUDA 2) : %f ms \n", TimeCUDA2);
#endif

	//calculate PSNR between each pair of images
	float PSNR_Src_DstGold1      = CalculatePSNR(ImgSrc, ImgDstGold1, ImgStride, ImgSize);
	float PSNR_Src_DstGold2      = CalculatePSNR(ImgSrc, ImgDstGold2, ImgStride, ImgSize);
	float PSNR_Src_DstCUDA1      = CalculatePSNR(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize);
	float PSNR_Src_DstCUDA2      = CalculatePSNR(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize);
	float PSNR_DstGold1_DstCUDA1 = CalculatePSNR(ImgDstGold1, ImgDstCUDA1, ImgStride, ImgSize);
	float PSNR_DstGold2_DstCUDA2 = CalculatePSNR(ImgDstGold2, ImgDstCUDA2, ImgStride, ImgSize);

	printf("PSNR Original    <---> CPU(Gold 1) : %f\n", PSNR_Src_DstGold1);
	printf("PSNR Original    <---> CPU(Gold 2) : %f\n", PSNR_Src_DstGold2);
	printf("PSNR Original    <---> GPU(CUDA 1) : %f\n", PSNR_Src_DstCUDA1);
	printf("PSNR Original    <---> GPU(CUDA 2) : %f\n", PSNR_Src_DstCUDA2);
	printf("PSNR CPU(Gold 1) <---> GPU(CUDA 1) : %f\n", PSNR_DstGold1_DstCUDA1);
	printf("PSNR CPU(Gold 2) <---> GPU(CUDA 2) : %f\n", PSNR_DstGold2_DstCUDA2);

	if (PSNR_DstGold1_DstCUDA1 > 45 && PSNR_DstGold2_DstCUDA2 > 45)
	{
		printf("\nTEST PASSED!\n");
	}
	else
	{
		printf("\nTEST FAILED! (CPU and GPU results differ too much)\n");
	}

	//
	// Finalization
	//

	//release byte planes
	FreePlane(ImgSrc);
	FreePlane(ImgDstGold1);
	FreePlane(ImgDstGold2);
	FreePlane(ImgDstCUDA1);
	FreePlane(ImgDstCUDA2);

	//finalize
	CUT_EXIT(argc, argv);
	return 0;
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -