// dct8x8_kernel1.cu -- CUDA (.cu) source file, taken from the code collection
// "Matrix operations in the CUDA development environment".
/*
 * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users.   This source code is a "commercial item" as
 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 * "commercial computer  software"  and "commercial computer software
 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

/**
**************************************************************************
* \file dct8x8_kernel1.cu
* \brief Contains 1st CUDA implementations of DCT, IDCT and quantization routines, 
*        used in JPEG internal data processing. Device code.
*
* This code implements first CUDA versions of forward and inverse Discrete Cosine 
* Transform to blocks of image pixels (of 8x8 size), as in JPEG standard. The data 
* processing is done using floating point representation.
* The routine that performs quantization of DCT coefficients is also included.
*/

#pragma once

#include "Common.h"


/**
*  This unitary matrix performs discrete cosine transform of rows of the matrix to the left
*
*  8x8 table of DCT basis values kept in constant memory, stored row-major (64 floats).
*  Entry [n][k] (flat index n * 8 + k) equals c(k) * cos((2n + 1) * k * PI / 16) with
*  c(0) = 1 / (2 * sqrt(2)) and c(k) = 1 / 2 for k > 0, i.e. every column holds one cosine
*  basis function sampled at n = 0..7.
*
*  The kernels in this file read it both row-wise (index n * 8 + k with k running) and
*  column-wise (index running in steps of 8), which is how they obtain the matrix and its
*  transpose from the same table.
*
*  The literals carry more digits than a float stores; entries that should be equal in
*  magnitude differ only around the 15th decimal place, far below single-precision resolution.
*/
__constant__ float DCTv8matrix[] = {
	0.3535533905932738f,  0.4903926402016152f,  0.4619397662556434f,  0.4157348061512726f,  0.3535533905932738f,  0.2777851165098011f,  0.1913417161825449f,  0.0975451610080642f, 
	0.3535533905932738f,  0.4157348061512726f,  0.1913417161825449f, -0.0975451610080641f, -0.3535533905932737f, -0.4903926402016152f, -0.4619397662556434f, -0.2777851165098011f, 
	0.3535533905932738f,  0.2777851165098011f, -0.1913417161825449f, -0.4903926402016152f, -0.3535533905932738f,  0.0975451610080642f,  0.4619397662556433f,  0.4157348061512727f, 
	0.3535533905932738f,  0.0975451610080642f, -0.4619397662556434f, -0.2777851165098011f,  0.3535533905932737f,  0.4157348061512727f, -0.1913417161825450f, -0.4903926402016153f, 
	0.3535533905932738f, -0.0975451610080641f, -0.4619397662556434f,  0.2777851165098009f,  0.3535533905932738f, -0.4157348061512726f, -0.1913417161825453f,  0.4903926402016152f, 
	0.3535533905932738f, -0.2777851165098010f, -0.1913417161825452f,  0.4903926402016153f, -0.3535533905932733f, -0.0975451610080649f,  0.4619397662556437f, -0.4157348061512720f, 
	0.3535533905932738f, -0.4157348061512727f,  0.1913417161825450f,  0.0975451610080640f, -0.3535533905932736f,  0.4903926402016152f, -0.4619397662556435f,  0.2777851165098022f, 
	0.3535533905932738f, -0.4903926402016152f,  0.4619397662556433f, -0.4157348061512721f,  0.3535533905932733f, -0.2777851165098008f,  0.1913417161825431f, -0.0975451610080625f
};


// Temporary blocks: per-thread-block scratch buffers in shared memory, used by both the
// DCT and the IDCT kernel in this file. CurBlockLocal1 receives the block fetched from the
// texture; CurBlockLocal2 receives the result of the first matrix product.
// BLOCK_SIZE2 is not defined in this file; presumably it comes from Common.h and equals
// BLOCK_SIZE * BLOCK_SIZE (one float per element of a block) -- TODO confirm.
__shared__ float CurBlockLocal1[BLOCK_SIZE2];
__shared__ float CurBlockLocal2[BLOCK_SIZE2];


/**
**************************************************************************
*  Performs 1st implementation of 8x8 block-wise Forward Discrete Cosine Transform of the given 
*  image plane and outputs result to the array of coefficients.
*
*  The input pixels are not passed as an argument: they are fetched with tex2D() from the
*  texture TexSrc, which is declared outside this file. Every thread block processes one
*  image block and every thread produces exactly one coefficient of
*      DCTv8matrix^T * A * DCTv8matrix
*  where A is the block read from the texture.
*
*  Preconditions (none of them is checked inside the kernel):
*   - the kernel must be launched with BLOCK_SIZE x BLOCK_SIZE threads per block, because
*     threadIdx.x / threadIdx.y index the shared buffers and the matrix directly;
*   - BLOCK_SIZE must be 8 and equal to 1 << BLOCK_SIZE_LOG2 (DCTv8matrix has 8x8 entries);
*   - the grid plus the block offsets must only address blocks that lie completely inside
*     Dst, since there is no bounds check before the store.
*
* \param Dst			[OUT] - Coefficients plane
* \param ImgWidth		[IN] - Stride of Dst (in float elements)
* \param OffsetXBlocks	[IN] - Offset along X in blocks from which to perform processing
* \param OffsetYBlocks	[IN] - Offset along Y in blocks from which to perform processing
*  
* \return None
*/
__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
	// Block index, shifted by the caller-supplied block offset
	const int bx = blockIdx.x + OffsetXBlocks;
	const int by = blockIdx.y + OffsetYBlocks;

	// Thread index (current coefficient)
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;

	// Coordinates of this thread's element in the whole plane ...
	const int img_x = (bx << BLOCK_SIZE_LOG2) + tx;
	const int img_y = (by << BLOCK_SIZE_LOG2) + ty;

	// ... and its row-major position inside the 8x8 block
	const int local_row = ty << BLOCK_SIZE_LOG2;
	const int local_idx = local_row + tx;

	// Copy the current image pixel to the first block.
	// NOTE(review): the +0.5f presumably addresses the texel centre, which assumes TexSrc
	// uses unnormalized coordinates -- confirm against the texture setup on the host side.
	CurBlockLocal1[local_idx] = tex2D(TexSrc, (float)img_x + 0.5f, (float)img_y + 0.5f);

	// Every thread reads a whole column of the block below, so the copy must be complete
	__syncthreads();

	// Pass 1: (DCTv8matrixT * A)[ty][tx]; walking DCTv8matrix with stride BLOCK_SIZE
	// starting at column ty reads row ty of the transposed matrix
	float acc = 0.0f;
	#pragma unroll
	for (int i = 0; i < BLOCK_SIZE; i++)
	{
		acc += DCTv8matrix[i * BLOCK_SIZE + ty] * CurBlockLocal1[i * BLOCK_SIZE + tx];
	}
	CurBlockLocal2[local_idx] = acc;

	// Every thread reads a whole row of the intermediate product below
	__syncthreads();

	// Pass 2: ((DCTv8matrixT * A) * DCTv8matrix)[ty][tx]
	acc = 0.0f;
	#pragma unroll
	for (int i = 0; i < BLOCK_SIZE; i++)
	{
		acc += CurBlockLocal2[local_row + i] * DCTv8matrix[i * BLOCK_SIZE + tx];
	}

	// Store the coefficient straight from the register. No other thread needs this value,
	// so staging it in shared memory behind another barrier would only add latency.
	Dst[ FAST_INT_MUL(img_y, ImgWidth) + img_x ] = acc;
}


/**
**************************************************************************
*  Performs 1st implementation of 8x8 block-wise Inverse Discrete Cosine Transform of the given 
*  DCT coefficients plane and outputs result to the image array
*
*  The input coefficients are not passed as an argument: they are fetched with tex2D() from
*  the texture TexSrc, which is declared outside this file. Every thread block processes one
*  block of coefficients and every thread produces exactly one pixel of
*      DCTv8matrix * A * DCTv8matrix^T
*  where A is the coefficient block read from the texture.
*
*  Preconditions (none of them is checked inside the kernel):
*   - the kernel must be launched with BLOCK_SIZE x BLOCK_SIZE threads per block, because
*     threadIdx.x / threadIdx.y index the shared buffers and the matrix directly;
*   - BLOCK_SIZE must be 8 and equal to 1 << BLOCK_SIZE_LOG2 (DCTv8matrix has 8x8 entries);
*   - the grid plus the block offsets must only address blocks that lie completely inside
*     Dst, since there is no bounds check before the store.
*
* \param Dst			[OUT] - Image plane
* \param ImgWidth		[IN] - Stride of Dst (in float elements)
* \param OffsetXBlocks	[IN] - Offset along X in blocks from which to perform processing
* \param OffsetYBlocks	[IN] - Offset along Y in blocks from which to perform processing
*  
* \return None
*/
__global__ void CUDAkernel1IDCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
	// Block index, shifted by the caller-supplied block offset
	const int bx = blockIdx.x + OffsetXBlocks;
	const int by = blockIdx.y + OffsetYBlocks;

	// Thread index (current image pixel)
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;

	// Coordinates of this thread's element in the whole plane ...
	const int img_x = (bx << BLOCK_SIZE_LOG2) + tx;
	const int img_y = (by << BLOCK_SIZE_LOG2) + ty;

	// ... and its row-major position inside the 8x8 block
	const int local_row = ty << BLOCK_SIZE_LOG2;
	const int local_idx = local_row + tx;

	// Copy the current DCT coefficient to the first block.
	// NOTE(review): the +0.5f presumably addresses the texel centre, which assumes TexSrc
	// uses unnormalized coordinates -- confirm against the texture setup on the host side.
	CurBlockLocal1[local_idx] = tex2D(TexSrc, (float)img_x + 0.5f, (float)img_y + 0.5f);

	// Every thread reads a whole column of the block below, so the copy must be complete
	__syncthreads();

	// Pass 1: (DCTv8matrix * A)[ty][tx]; row ty of the matrix is read contiguously
	float acc = 0.0f;
	#pragma unroll
	for (int i = 0; i < BLOCK_SIZE; i++)
	{
		acc += DCTv8matrix[local_row + i] * CurBlockLocal1[i * BLOCK_SIZE + tx];
	}
	CurBlockLocal2[local_idx] = acc;

	// Every thread reads a whole row of the intermediate product below
	__syncthreads();

	// Pass 2: ((DCTv8matrix * A) * DCTv8matrixT)[ty][tx]; reading row tx of the matrix
	// contiguously yields column tx of the transposed matrix
	const int dct_row = tx << BLOCK_SIZE_LOG2;
	acc = 0.0f;
	#pragma unroll
	for (int i = 0; i < BLOCK_SIZE; i++)
	{
		acc += CurBlockLocal2[local_row + i] * DCTv8matrix[dct_row + i];
	}

	// Store the pixel straight from the register. No other thread needs this value,
	// so staging it in shared memory behind another barrier would only add latency.
	Dst[ FAST_INT_MUL(img_y, ImgWidth) + img_x ] = acc;
}


/**
*  JPEG quality=0_of_12 quantization matrix
*
*  8x8 table of quantization step sizes kept in constant memory, stored row-major.
*  CUDAkernelQuantizationEmulator reads entry ty * BLOCK_SIZE + tx, i.e. the step for the
*  DCT coefficient at row ty, column tx inside a block. The integer literals are converted
*  to float by the initialization.
*  NOTE(review): "quality=0_of_12" presumably names the lowest setting on a 0..12 quality
*  scale of some JPEG encoder; the origin of the values is not stated in this file.
*/
__constant__ float Q[] = {  
	32,  33,  51,  81,  66,  39,  34,  17,
	33,  36,  48,  47,  28,  23,  12,  12,
	51,  48,  47,  28,  23,  12,  12,  12,
	81,  47,  28,  23,  12,  12,  12,  12,
	66,  28,  23,  12,  12,  12,  12,  12,
	39,  23,  12,  12,  12,  12,  12,  12,
	34,  12,  12,  12,  12,  12,  12,  12,
	17,  12,  12,  12,  12,  12,  12,  12 
};


/**
**************************************************************************
*  Performs in-place quantization of given DCT coefficients plane using 
*  predefined quantization matrices
*
*  Each thread handles one coefficient: it divides the value by the matching entry of Q,
*  rounds the quotient to the nearest integer (halfway cases away from zero) and multiplies
*  by the same entry again, so the plane ends up holding the de-quantized values that a
*  quantize / de-quantize round trip would produce.
*
*  Preconditions (none of them is checked inside the kernel):
*   - the kernel must be launched with BLOCK_SIZE x BLOCK_SIZE threads per block, one
*     thread block per 8x8 block of coefficients (Q has 8x8 entries);
*   - the grid must only address blocks that lie completely inside SrcDst, since there is
*     no bounds check.
*
* \param SrcDst			[IN/OUT] - DCT coefficients plane
* \param ImgWidth		[IN] - Stride of SrcDst (in float elements)
*  
* \return None
*/
__global__ void CUDAkernelQuantizationEmulator(float *SrcDst, int ImgWidth)
{
	// Block index
	const int bx = blockIdx.x;
	const int by = blockIdx.y;

	// Thread index (current coefficient)
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;

	// Offset of the current coefficient in the plane, computed once for the load and the store
	const int coefIdx = (by * BLOCK_SIZE + ty) * ImgWidth + (bx * BLOCK_SIZE + tx);

	// Quantization step for this position inside the block
	const float step = Q[ ty * BLOCK_SIZE + tx ];

	// Quantize (divide and round) and immediately de-quantize (multiply back);
	// roundf keeps the rounding explicitly in single precision
	const float quantized = roundf( SrcDst[coefIdx] / step );

	// Copy the quantized coefficient back to the DCT-plane
	SrcDst[coefIdx] = quantized * step;
}
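// Keyboard-shortcut help of the web code viewer this listing was copied from
// (not part of the original source file):
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -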