📄 programcu.cu

📁 SiftGPU is an implementation of SIFT [1] for GPU. SiftGPU processes pixels parallely to build Gaussi
💻 CU
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
//////////////////////////////////////////////////////////////////////////////////////////

#define MULT_TBLOCK_DIMX 128
#define MULT_TBLOCK_DIMY 1
#define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
#define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)


texture<uint4, 1, cudaReadModeElementType> texDes1;
texture<uint4, 1, cudaReadModeElementType> texDes2;

void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp)
{
	int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY),  idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);

	int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
	__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
	int read_idx1 = idx01 * 8 +  threadIdx.x, read_idx2 = idx2 * 8;
	int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
	int cache_idx1 = IMUL(row4, 17) + (col4 << 2);

	///////////////////////////////////////////////////////////////
	//Load feature descriptors
	///////////////////////////////////////////////////////////////
#if MULT_BLOCK_DIMY == 16
	uint4 v = tex1Dfetch(texDes1, read_idx1);
	data1[cache_idx1]   = v.x;	data1[cache_idx1+1] = v.y;
	data1[cache_idx1+2] = v.z;	data1[cache_idx1+3] = v.w;
#elif MULT_BLOCK_DIMY == 8
	if(threadIdx.x < 64)
	{
		uint4 v = tex1Dfetch(texDes1, read_idx1);
		data1[cache_idx1]   = v.x;		data1[cache_idx1+1] = v.y;
		data1[cache_idx1+2] = v.z;		data1[cache_idx1+3] = v.w;
	}
#else
#error
#endif
	__syncthreads();

	///
	if(idx2 >= num2) return;
	///////////////////////////////////////////////////////////////////////////
	//compare descriptors

	int results[MULT_BLOCK_DIMY];
#pragma unroll
	for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;

#pragma unroll
	for(int i = 0; i < 8; ++i)
	{
		uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
		unsigned char* p2 = (unsigned char*)(&v);
#pragma unroll
		for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
		{
			unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));
			results[k] += 	 ( IMUL(p1[0], p2[0])	+ IMUL(p1[1], p2[1])  
							 + IMUL(p1[2], p2[2])  	+ IMUL(p1[3], p2[3])  
							 + IMUL(p1[4], p2[4])  	+ IMUL(p1[5], p2[5])  
							 + IMUL(p1[6], p2[6])  	+ IMUL(p1[7], p2[7])  
							 + IMUL(p1[8], p2[8])  	+ IMUL(p1[9], p2[9])  
							 + IMUL(p1[10], p2[10])	+ IMUL(p1[11], p2[11])
							 + IMUL(p1[12], p2[12])	+ IMUL(p1[13], p2[13])
							 + IMUL(p1[14], p2[14])	+ IMUL(p1[15], p2[15]));
		}
	}

	int dst_idx = IMUL(idx1, num2)  + idx2;
	if(d_temp)
	{
		int3 cmp_result = make_int3(0, -1, 0);

#pragma unroll
		for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
		{
			if(idx1 + i < num1)
			{
				cmp_result = results[i] > cmp_result.x? 
				make_int3(results[i], idx1 + i, cmp_result.x) : 
				make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
				d_result[dst_idx + IMUL(i, num2)] = results[i];
			}
		}
		d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; 
	}else
	{
#pragma unroll
		for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
		{
			if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
		}
	}

}


void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
{
	int num1 = des1->GetImgWidth() / 8;	
	int num2 = des2->GetImgWidth() / 8;
	dim3 grid(	(num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
		(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
	dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
	texDot->InitTexture( num2,num1);
	if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);
	des1->BindTexture(texDes1);
	des2->BindTexture(texDes2);

	MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2, 
												(texCRT? (int3*)texCRT->_cuData : NULL));
	ProgramCU::CheckErrorCUDA("MultiplyDescriptor");
}

texture<float, 1, cudaReadModeElementType> texLoc1;
texture<float2, 1, cudaReadModeElementType> texLoc2;
struct Matrix33{float mat[3][3];};



void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp,
										   Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)
{
	int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY);	
	int idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);

	int idx1 = idx01 + threadIdx.y;	
	int idx2 = idx02 + threadIdx.x;
	__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
	__shared__ float loc1[MULT_BLOCK_DIMY * 2];
	int read_idx1 = idx01 * 8 +  threadIdx.x ;
	int read_idx2 = idx2 * 8;
	int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
	int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
#if MULT_BLOCK_DIMY == 16
	uint4 v = tex1Dfetch(texDes1, read_idx1);
	data1[cache_idx1]   = v.x;
	data1[cache_idx1+1] = v.y;
	data1[cache_idx1+2] = v.z;
	data1[cache_idx1+3] = v.w;
#elif MULT_BLOCK_DIMY == 8
	if(threadIdx.x < 64)
	{
		uint4 v = tex1Dfetch(texDes1, read_idx1);
		data1[cache_idx1]   = v.x;
		data1[cache_idx1+1] = v.y;
		data1[cache_idx1+2] = v.z;
		data1[cache_idx1+3] = v.w;
	}
#else
#error
#endif
	__syncthreads();
	if(threadIdx.x < MULT_BLOCK_DIMY * 2)
	{
		loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);
	}
	__syncthreads();
	if(idx2 >= num2) return;
	int results[MULT_BLOCK_DIMY];
	/////////////////////////////////////////////////////////////////////////////////////////////
	//geometric verification
	/////////////////////////////////////////////////////////////////////////////////////////////
	int good_count = 0;
	float2 loc2 = tex1Dfetch(texLoc2, idx2);
#pragma unroll
	for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
	{

		if(idx1 + i < num1)
		{
			float* loci = loc1 + i * 2;
			float locx = loci[0], locy = loci[1];
			//homography
			float x[3], diff[2];
			x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];
			x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];
			x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];
			diff[0] = fabs(FDIV(x[0], x[2]) - loc2.x);
			diff[1] = fabs(FDIV(x[1], x[2]) - loc2.y);
			if(diff[0] < hdistmax && diff[1] < hdistmax)
			{
				//check fundamental matrix
				float fx1[3], ftx2[3], x2fx1, se; 
				fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];
				fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];
				//fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];

				ftx2[0] = F.mat[0][0] * locx + F.mat[1][0] * locy + F.mat[2][0];
				ftx2[1] = F.mat[0][1] * locx + F.mat[1][1] * locy + F.mat[2][1];
				ftx2[2] = F.mat[0][2] * locx + F.mat[1][2] * locy + F.mat[2][2];

				x2fx1 = locx * ftx2[0]  + locy * ftx2[1] + ftx2[2];
				se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);
				results[i] = se < fdistmax? 0: -262144;
			}else
			{
				results[i] = -262144;
			}
		}else
		{
			results[i] = -262144;
		}
		good_count += (results[i] >=0);
	}
	/////////////////////////////////////////////////////////////////////////////////////////////
	///compare feature descriptors anyway
	/////////////////////////////////////////////////////////////////////////////////////////////
	if(good_count > 0)
	{
#pragma unroll
		for(int i = 0; i < 8; ++i)
		{
			uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
			unsigned char* p2 = (unsigned char*)(&v);
#pragma unroll
			for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
			{
				unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));
				results[k] += 	 ( IMUL(p1[0], p2[0])	+ IMUL(p1[1], p2[1])  
								 + IMUL(p1[2], p2[2])  	+ IMUL(p1[3], p2[3])  
								 + IMUL(p1[4], p2[4])  	+ IMUL(p1[5], p2[5])  
								 + IMUL(p1[6], p2[6])  	+ IMUL(p1[7], p2[7])  
								 + IMUL(p1[8], p2[8])  	+ IMUL(p1[9], p2[9])  
								 + IMUL(p1[10], p2[10])	+ IMUL(p1[11], p2[11])
								 + IMUL(p1[12], p2[12])	+ IMUL(p1[13], p2[13])
								 + IMUL(p1[14], p2[14])	+ IMUL(p1[15], p2[15]));
			}
		}
	}
	int dst_idx = IMUL(idx1, num2)  + idx2;
	if(d_temp)
	{
		int3 cmp_result = make_int3(0, -1, 0);
#pragma unroll
		for(int i= 0; i < MULT_BLOCK_DIMY; ++i)
		{
			if(idx1 + i < num1)
			{
				cmp_result = results[i] > cmp_result.x? 
				make_int3(results[i], idx1 + i, cmp_result.x) : 
				make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
				d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
			}else
			{
				break;
			}
		}
		d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; 
	}else
	{
#pragma unroll
		for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
		{
			if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
			else break;
		}
	}

}


void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
		CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,
		float H[3][3], float hdistmax, float F[3][3], float fdistmax)
{
	int num1 = des1->GetImgWidth() / 8;	
	int num2 = des2->GetImgWidth() / 8;
	Matrix33 MatF, MatH;
	//copy the matrix
	memcpy(MatF.mat, F, 9 * sizeof(float));
	memcpy(MatH.mat, H, 9 * sizeof(float));
	//thread blocks
	dim3 grid(	(num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
		(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
	dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
	//intermediate results
	texDot->InitTexture( num2,num1);
	if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);
	loc1->BindTexture(texLoc1);	
	loc2->BindTexture(texLoc2);
	des1->BindTexture(texDes1);	
	des2->BindTexture(texDes2);
	MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2, 
												(texCRT? (int3*)texCRT->_cuData : NULL),
												MatH, hdistmax, MatF, fdistmax);
}


texture<int,  1, cudaReadModeElementType> texDOT;

#define ROWMATCH_BLOCK_WIDTH 32
#define ROWMATCH_BLOCK_HEIGHT 1

void __global__  RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)
{
#if ROWMATCH_BLOCK_HEIGHT == 1
	__shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
	__shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
	__shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
	int	row = blockIdx.y;
#else
	__shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
	__shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
	__shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
	int*	dotmax = x_dotmax[threadIdx.y];
	int*	dotnxt = x_dotnxt[threadIdx.y];
	int*	dotidx = x_dotidx[threadIdx.y];
	int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
#endif

	int base_address = IMUL(row , num2);
	int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;
	for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)
	{
		if(threadIdx.x + i < num2)
		{
			int v = tex1Dfetch(texDOT, base_address + threadIdx.x + i);//d_dot[base_address + threadIdx.x + i];//
			bool test = v > t_dotmax;
			t_dotnxt = test? t_dotmax : max(t_dotnxt, v);
			t_dotidx = test? (threadIdx.x + i) : t_dotidx;
			t_dotmax = test? v: t_dotmax;
		}
		__syncthreads();
	}
	dotmax[threadIdx.x] = t_dotmax;
	dotnxt[threadIdx.x] = t_dotnxt;
	dotidx[threadIdx.x] = t_dotidx;
	__syncthreads();
	
#pragma unroll
	for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)
	{
		if(threadIdx.x < step)
		{
			int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
			bool test =  v2 > v1;
			dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);
			dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];
			dotmax[threadIdx.x] = test? v2 : v1;
		}
		__syncthreads();
	}
	if(threadIdx.x == 0)
	{
		float dist =  acos(min(dotmax[0] * 0.000003814697265625f, 1.0));
		float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));
		//float ratio = dist / distn;
		d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//?  : -1;
	}

}


void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)
{
	int num1 = texDot->GetImgHeight();
	int num2 = texDot->GetImgWidth();
	dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
	dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
	texDot->BindTexture(texDOT);
	RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
		(int*)texMatch->_cuData, num2, distmax, ratiomax);
}

#define COLMATCH_BLOCK_WIDTH 32

//texture<int3,  1, cudaReadModeElementType> texCT;

void __global__  ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
{
	int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
	if(col >= num2) return;
	int3 result = d_crt[col];//tex1Dfetch(texCT, col);
	int read_idx = col + num2;
	for(int i = 1; i < height; ++i, read_idx += num2)
	{
		int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
		result = result.x < temp.x?
			make_int3(temp.x, temp.y, max(result.x, temp.z)) :
			make_int3(result.x, result.y, max(result.z, temp.x));
	}

	float dist =  acos(min(result.x * 0.000003814697265625f, 1.0));
	float distn = acos(min(result.z * 0.000003814697265625f, 1.0));
		//float ratio = dist / distn;
	d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//?  : -1;

}

void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)
{
	int height = texCRT->GetImgHeight();
	int num2 = texCRT->GetImgWidth();
	//texCRT->BindTexture(texCT);
    dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);    dim3 block(COLMATCH_BLOCK_WIDTH);
	ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
}

#endif
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -