📄 dct8x8_kernel1.cu
字号:
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/**
**************************************************************************
* \file dct8x8_kernel1.cu
* \brief Contains 1st CUDA implementations of DCT, IDCT and quantization routines,
* used in JPEG internal data processing. Device code.
*
* This code implements first CUDA versions of forward and inverse Discrete Cosine
* Transform to blocks of image pixels (of 8x8 size), as in JPEG standard. The data
* processing is done using floating point representation.
* The routine that performs quantization of DCT coefficients is also included.
*/
#pragma once
#include "Common.h"
/**
* This unitary matrix performs discrete cosine transform of rows of the matrix to the left
* (i.e. the product A * DCTv8matrix transforms the rows of A).
*
* The table lives in constant memory and holds 64 floats written out as 8 rows of 8 values
* (row-major). Every entry of the first column is 0.3535533905932738f (approximately
* 1/sqrt(8)); the remaining columns hold the cosine basis vectors of increasing frequency.
*
* CUDAkernel1DCT reads the table with index (i * BLOCK_SIZE + ty) to obtain the transposed
* matrix and with index (i * BLOCK_SIZE + tx) to obtain the matrix itself; CUDAkernel1IDCT
* walks it row by row.
*/
__constant__ float DCTv8matrix[] = {
0.3535533905932738f, 0.4903926402016152f, 0.4619397662556434f, 0.4157348061512726f, 0.3535533905932738f, 0.2777851165098011f, 0.1913417161825449f, 0.0975451610080642f,
0.3535533905932738f, 0.4157348061512726f, 0.1913417161825449f, -0.0975451610080641f, -0.3535533905932737f, -0.4903926402016152f, -0.4619397662556434f, -0.2777851165098011f,
0.3535533905932738f, 0.2777851165098011f, -0.1913417161825449f, -0.4903926402016152f, -0.3535533905932738f, 0.0975451610080642f, 0.4619397662556433f, 0.4157348061512727f,
0.3535533905932738f, 0.0975451610080642f, -0.4619397662556434f, -0.2777851165098011f, 0.3535533905932737f, 0.4157348061512727f, -0.1913417161825450f, -0.4903926402016153f,
0.3535533905932738f, -0.0975451610080641f, -0.4619397662556434f, 0.2777851165098009f, 0.3535533905932738f, -0.4157348061512726f, -0.1913417161825453f, 0.4903926402016152f,
0.3535533905932738f, -0.2777851165098010f, -0.1913417161825452f, 0.4903926402016153f, -0.3535533905932733f, -0.0975451610080649f, 0.4619397662556437f, -0.4157348061512720f,
0.3535533905932738f, -0.4157348061512727f, 0.1913417161825450f, 0.0975451610080640f, -0.3535533905932736f, 0.4903926402016152f, -0.4619397662556435f, 0.2777851165098022f,
0.3535533905932738f, -0.4903926402016152f, 0.4619397662556433f, -0.4157348061512721f, 0.3535533905932733f, -0.2777851165098008f, 0.1913417161825431f, -0.0975451610080625f
};
// Temporary blocks: two per-thread-block scratch buffers in shared memory, each holding
// BLOCK_SIZE2 floats (BLOCK_SIZE2 is defined outside this file; presumably
// BLOCK_SIZE * BLOCK_SIZE -- confirm in Common.h).
// CurBlockLocal1 receives the block fetched from the source texture.
__shared__ float CurBlockLocal1[BLOCK_SIZE2];
// CurBlockLocal2 receives the result of the first of the two matrix multiplications.
__shared__ float CurBlockLocal2[BLOCK_SIZE2];
/**
**************************************************************************
* Performs 1st implementation of 8x8 block-wise Forward Discrete Cosine Transform of the
* image plane read through the TexSrc texture and outputs result to the array of coefficients.
*
* Each thread block transforms one image block and each thread produces one coefficient,
* computing (DCTv8matrixT * A * DCTv8matrix)[ty][tx] where A is the block of source pixels.
*
* Launch assumptions (not checked here): the kernel is expected to run with
* BLOCK_SIZE x BLOCK_SIZE threads per block, and BLOCK_SIZE == (1 << BLOCK_SIZE_LOG2);
* both macros and TexSrc are declared outside this file -- confirm against Common.h.
* There is no bounds check on Dst, so the grid (plus the block offsets) must only cover
* blocks that lie completely inside the destination plane.
*
* \param Dst [OUT] - Coefficients plane
* \param ImgWidth [IN] - Stride of Dst
* \param OffsetXBlocks [IN] - Offset along X in blocks from which to perform processing
* \param OffsetYBlocks [IN] - Offset along Y in blocks from which to perform processing
*
* \return None
*/
__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
    // Block index
    const int bx = blockIdx.x + OffsetXBlocks;
    const int by = blockIdx.y + OffsetYBlocks;

    // Thread index (current coefficient)
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Position of this thread's element in the whole plane and inside the local block
    const int col = (bx << BLOCK_SIZE_LOG2) + tx;
    const int row = (by << BLOCK_SIZE_LOG2) + ty;
    const int localIndex = (ty << BLOCK_SIZE_LOG2) + tx;

    // Texture coordinates (+0.5 addresses the texel centre)
    const float tex_x = (float)col + 0.5f;
    const float tex_y = (float)row + 0.5f;

    // Copy current image pixel to the first block
    CurBlockLocal1[localIndex] = tex2D(TexSrc, tex_x, tex_y);

    // Every thread reads pixels loaded by other threads, so wait for the whole block
    __syncthreads();

    // First product: (DCTv8matrixT * A)[ty][tx]; column ty of the table acts as row ty of
    // the transposed matrix, column tx of the pixel block is walked top to bottom
    float curelem = 0.0f;
    #pragma unroll
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += DCTv8matrix[i * BLOCK_SIZE + ty] * CurBlockLocal1[i * BLOCK_SIZE + tx];
    }
    CurBlockLocal2[localIndex] = curelem;

    // The second product reads a whole row of the intermediate result, written by other threads
    __syncthreads();

    // Second product: ((DCTv8matrixT * A) * DCTv8matrix)[ty][tx]
    curelem = 0.0f;
    #pragma unroll
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + i] * DCTv8matrix[i * BLOCK_SIZE + tx];
    }

    // The coefficient is consumed only by the thread that computed it, so it is written to
    // the result array straight from the register: staging it in shared memory and issuing
    // another barrier would add cost without changing the output
    Dst[ FAST_INT_MUL(row, ImgWidth) + col ] = curelem;
}
/**
**************************************************************************
* Performs 1st implementation of 8x8 block-wise Inverse Discrete Cosine Transform of the
* DCT coefficients plane read through the TexSrc texture and outputs result to the image array.
*
* Each thread block transforms one block of coefficients and each thread produces one
* pixel, computing (DCTv8matrix * A * DCTv8matrixT)[ty][tx] where A is the block of
* source coefficients.
*
* Launch assumptions (not checked here): the kernel is expected to run with
* BLOCK_SIZE x BLOCK_SIZE threads per block, and BLOCK_SIZE == (1 << BLOCK_SIZE_LOG2);
* both macros and TexSrc are declared outside this file -- confirm against Common.h.
* There is no bounds check on Dst, so the grid (plus the block offsets) must only cover
* blocks that lie completely inside the destination plane.
*
* \param Dst [OUT] - Image plane
* \param ImgWidth [IN] - Stride of Dst
* \param OffsetXBlocks [IN] - Offset along X in blocks from which to perform processing
* \param OffsetYBlocks [IN] - Offset along Y in blocks from which to perform processing
*
* \return None
*/
__global__ void CUDAkernel1IDCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
    // Block index
    const int bx = blockIdx.x + OffsetXBlocks;
    const int by = blockIdx.y + OffsetYBlocks;

    // Thread index (current image pixel)
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Position of this thread's element in the whole plane and inside the local block
    const int col = (bx << BLOCK_SIZE_LOG2) + tx;
    const int row = (by << BLOCK_SIZE_LOG2) + ty;
    const int localIndex = (ty << BLOCK_SIZE_LOG2) + tx;

    // Texture coordinates (+0.5 addresses the texel centre)
    const float tex_x = (float)col + 0.5f;
    const float tex_y = (float)row + 0.5f;

    // Copy current DCT coefficient to the first block
    CurBlockLocal1[localIndex] = tex2D(TexSrc, tex_x, tex_y);

    // Every thread reads coefficients loaded by other threads, so wait for the whole block
    __syncthreads();

    // First product: (DCTv8matrix * A)[ty][tx]; row ty of the table times column tx of
    // the coefficient block
    float curelem = 0.0f;
    #pragma unroll
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += DCTv8matrix[(ty << BLOCK_SIZE_LOG2) + i] * CurBlockLocal1[i * BLOCK_SIZE + tx];
    }
    CurBlockLocal2[localIndex] = curelem;

    // The second product reads a whole row of the intermediate result, written by other threads
    __syncthreads();

    // Second product: ((DCTv8matrix * A) * DCTv8matrixT)[ty][tx]; row tx of the table
    // acts as column tx of the transposed matrix
    curelem = 0.0f;
    #pragma unroll
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + i] * DCTv8matrix[(tx << BLOCK_SIZE_LOG2) + i];
    }

    // The pixel is consumed only by the thread that computed it, so it is written to the
    // result array straight from the register: staging it in shared memory and issuing
    // another barrier would add cost without changing the output
    Dst[ FAST_INT_MUL(row, ImgWidth) + col ] = curelem;
}
/**
* JPEG quality=0_of_12 quantization matrix
*
* Stored in constant memory as 64 values written out as 8 rows of 8 (row-major); the
* integer literals are converted to float by the array's element type.
* CUDAkernelQuantizationEmulator looks up the step for a coefficient with
* Q[ ty * BLOCK_SIZE + tx ], i.e. row = thread Y index, column = thread X index.
*/
__constant__ float Q[] = {
32, 33, 51, 81, 66, 39, 34, 17,
33, 36, 48, 47, 28, 23, 12, 12,
51, 48, 47, 28, 23, 12, 12, 12,
81, 47, 28, 23, 12, 12, 12, 12,
66, 28, 23, 12, 12, 12, 12, 12,
39, 23, 12, 12, 12, 12, 12, 12,
34, 12, 12, 12, 12, 12, 12, 12,
17, 12, 12, 12, 12, 12, 12, 12
};
/**
**************************************************************************
* Emulates JPEG quantization in place: every DCT coefficient is divided by its entry of
* the quantization matrix Q, rounded, and multiplied by the same entry again, so the plane
* keeps its original scale but only holds multiples of the quantization steps.
*
* Each thread handles exactly one coefficient; the thread index inside the block selects
* the entry of Q. No bounds check is performed on SrcDst.
*
* \param SrcDst [IN/OUT] - DCT coefficients plane
* \param ImgWidth [IN] - Stride of SrcDst
*
* \return None
*/
__global__ void CUDAkernelQuantizationEmulator(float *SrcDst, int ImgWidth)
{
    // Coordinates of this thread's coefficient inside the whole plane
    const int planeX = (int)blockIdx.x * BLOCK_SIZE + (int)threadIdx.x;
    const int planeY = (int)blockIdx.y * BLOCK_SIZE + (int)threadIdx.y;

    // Linear offset of the coefficient; used for both the load and the store
    const int offset = planeY * ImgWidth + planeX;

    // Quantization step for this position inside the block
    const float step = Q[ (int)threadIdx.y * BLOCK_SIZE + (int)threadIdx.x ];

    // Quantize (divide and round), then de-quantize (multiply back)
    const float level = round( SrcDst[offset] / step );
    SrcDst[offset] = level * step;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -