📄 programcu.cu

📁 SiftGPU is an implementation of SIFT [1] for GPU. SiftGPU processes pixels parallely to build Gaussi
💻 CU
📖 第 1 页 / 共 4 页
字号:
		float vxn = tex1Dfetch(texC, index + 1);
		float vxp = tex1Dfetch(texC, index - 1);
		float vyp = tex1Dfetch(texC, index - width);
		float vyn = tex1Dfetch(texC, index + width);
		float dx = vxn - vxp, dy = vyn - vyp;
		float grd = 0.5f * sqrt(dx * dx  + dy * dy);
		float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
		d_got[index] = make_float2(grd, rot);
	}
}

void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height)
{
	int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
	int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
	if(col < width && row < height) 
	{
		int index = IMUL(row, width) + col;
		float vp = tex1Dfetch(texP, index);
		float v = tex1Dfetch(texC, index);
		d_dog[index] = v - vp;
	}
}

void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
{
	int width = gus->GetImgWidth(), height = gus->GetImgHeight();
	dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX,  (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
	dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
	gus->BindTexture(texC);
	(gus -1)->BindTexture(texP);
	if(got->_cuData)
		ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
	else
		ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
}
#define KEY_BLOCK_LOG_DIMX 3
#define KEY_BLOCK_LOG_DIMY 3
#define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)
#define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)

//4/5, 3/2 -> 33
//4, 1 -> 45
//4, 0 -> 60

#define READ_CMP_DOG_DATA(datai, tex, idx) \
		datai[0] = tex1Dfetch(tex, idx - 1);\
		datai[1] = tex1Dfetch(tex, idx);\
		datai[2] = tex1Dfetch(tex, idx + 1);\
		if(v > nmax)\
		{\
			   nmax = max(nmax, datai[0]);\
			   nmax = max(nmax, datai[1]);\
			   nmax = max(nmax, datai[2]);\
			   if(v < nmax) goto key_finish;\
		}else\
		{\
			   nmin = min(nmin, datai[0]);\
			   nmin = min(nmin, datai[1]);\
			   nmin = min(nmin, datai[2]);\
			   if(v > nmin) goto key_finish;\
		}


void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax, 
					float dog_threshold0,  float dog_threshold, float edge_threshold, int subpixel_localization)
{
       float data[3][3], v;
       float datap[3][3], datan[3][3];
       int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;
       int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;
       int index = IMUL(row, width) + col;
	   int idx[3] ={index - width, index, index + width};
       int in_image =0;
       float nmax, nmin, result = 0.0f;
	   float dx = 0, dy = 0, ds = 0;
	   bool offset_test_passed = true;

       if(row < rowmax && col < colmax)
       {
			in_image = 1;
			data[1][1] = v = tex1Dfetch(texC, idx[1]);
			if(fabs(v) <= dog_threshold0) goto key_finish;

			data[1][0] = tex1Dfetch(texC, idx[1] - 1);
			data[1][2] = tex1Dfetch(texC, idx[1] + 1);
			nmax = max(data[1][0], data[1][2]);
			nmin = min(data[1][0], data[1][2]);

			if(v <=nmax && v >= nmin) goto key_finish;
			//if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
			READ_CMP_DOG_DATA(data[0], texC, idx[0]);
			READ_CMP_DOG_DATA(data[2], texC, idx[2]);

			//edge supression
			float vx2 = v * 2.0f;
			float fxx = data[1][0] + data[1][2] - vx2;
			float fyy = data[0][1] + data[2][1] - vx2;
			float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);
			float temp1 = fxx * fyy - fxy * fxy;
			float temp2 = (fxx + fyy) * (fxx + fyy);
			if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;


			//read the previous level
			READ_CMP_DOG_DATA(datap[0], texP, idx[0]);
			READ_CMP_DOG_DATA(datap[1], texP, idx[1]);
			READ_CMP_DOG_DATA(datap[2], texP, idx[2]);


			//read the next level
			READ_CMP_DOG_DATA(datan[0], texN, idx[0]);
			READ_CMP_DOG_DATA(datan[1], texN, idx[1]);
			READ_CMP_DOG_DATA(datan[2], texN, idx[2]);
			
			if(subpixel_localization)
			{
				//subpixel localization
				float fx = 0.5f * (data[1][2] - data[1][0]);
				float fy = 0.5f * (data[2][1] - data[0][1]);
				float fs = 0.5f * (datan[1][1] - datap[1][1]);

				float fss = (datan[1][1] + datap[1][1] - vx2);
				float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);
				float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);

				//need to solve dx, dy, ds;
				// |-fx|     | fxx fxy fxs |   |dx|
				// |-fy|  =  | fxy fyy fys | * |dy|
				// |-fs|     | fxs fys fss |   |ds|
				float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
				float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
				float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
				float maxa = max(max(A0.x, A1.x), A2.x);
				if(maxa >= 1e-10)
				{
					if(maxa == A1.x)
					{
						float4 TEMP = A1; A1 = A0; A0 = TEMP;
					}else if(maxa == A2.x)
					{
						float4 TEMP = A2; A2 = A0; A0 = TEMP;
					}
					A0.y /= A0.x;	A0.z /= A0.x;	A0.w/= A0.x;
					A1.y -= A1.x * A0.y;	A1.z -= A1.x * A0.z;	A1.w -= A1.x * A0.w;
					A2.y -= A2.x * A0.y;	A2.z -= A2.x * A0.z;	A2.w -= A2.x * A0.w;
					if(abs(A2.y) > abs(A1.y))
					{
						float4 TEMP = A2;	A2 = A1; A1 = TEMP;
					}
					if(abs(A1.y) >= 1e-10) 
					{
						A1.z /= A1.y;	A1.w /= A1.y;
						A2.z -= A2.y * A1.z;	A2.w -= A2.y * A1.w;
						if(abs(A2.z) >= 1e-10) 
						{
							ds = A2.w / A2.z;
							dy = A1.w - ds * A1.z;
							dx = A0.w - ds * A0.z - dy * A0.y;

							offset_test_passed = 
								fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold
								&&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;
						}
					}
				}
			}
			if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;
       }
key_finish:
       if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
}


void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)
{
	int width = dog->GetImgWidth(), height = dog->GetImgHeight();
	float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;
	CuTexImage* dogp = dog - 1;
	CuTexImage* dogn = dog + 1;
	
	dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
	dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
	dogp->BindTexture(texP);
	dog ->BindTexture(texC);
	dogn->BindTexture(texN);
	Tedge = (Tedge+1)*(Tedge+1)/Tedge;
	ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width, width -1, height -1, 
		Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);

}

#define HIST_INIT_WIDTH 128


void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)
{
       int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
       int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
	   if(row < height && col < wd)
	   {
			int hidx = IMUL(row, wd) + col;
			int scol = col << 2;
			int sidx = IMUL(row, ws) + scol;
			int v[4] = {0, 0, 0, 0}; 
			if(row > 0 && row < height -1)
			{
#pragma unroll
				for(int i = 0; i < 4 ; ++i, ++scol)
				{
					float4 temp = tex1Dfetch(texDataF4, sidx +i);
					v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
				}
			}
			hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);

	   }
}



void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
{
	int ws = key->GetImgWidth(), hs = key->GetImgHeight();
	int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
	dim3 grid((wd  + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH,  hd);
	dim3 block(HIST_INIT_WIDTH, 1);
	key->BindTexture(texDataF4);
	InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd);
	//CheckHistInit(key, hist);
}



void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)
{
       int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
       int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
	   if(row < height && col < wd)
	   {
			int hidx = IMUL(row, wd) + col;
			int scol = col << 2;
			int sidx = IMUL(row, ws) + scol;
			int v[4] = {0, 0, 0, 0}; 
#pragma unroll
			for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
			{
				int4 temp = tex1Dfetch(texDataI4, sidx + i);
				v[i] = temp.x + temp.y + temp.z + temp.w;
			}
			d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
	   }
}

void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
{
	int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
	int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
	int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f));
	const int wi = min(7, max(temp , 0));
	hist1->BindTexture(texDataI4);

	const int BW = 1 << wi, BH =  1 << (7 - wi);
	dim3 grid((wd  + BW - 1)/ BW,  (hd + BH -1) / BH);
	dim3 block(BW, BH);
	ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd);
}

#define LISTGEN_BLOCK_DIM 128

void __global__ ListGen_Kernel(int4* d_list, int width)
{
	int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
    int4 pos = tex1Dfetch(texDataList, idx1);
	int idx2 = IMUL(pos.y, width) + pos.x;
	int4 temp = tex1Dfetch(texDataI4, idx2);
	int  sum1 = temp.x + temp.y;
	int  sum2 = sum1 + temp.z;
	pos.x <<= 2;
	if(pos.z >= sum2)
	{
		pos.x += 3;
		pos.z -= sum2;
	}else if(pos.z >= sum1)
	{
		pos.x += 2;
		pos.z -= sum1;
	}else if(pos.z >= temp.x)
	{
		pos.x += 1;
		pos.z -= temp.x;
	}
	d_list[idx1] = pos;
}
//input list (x, y) (x, y) ....
void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
{
	int len = list->GetImgWidth();
	list->BindTexture(texDataList);
	hist->BindTexture(texDataI4);
	dim3  grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
	dim3  block(LISTGEN_BLOCK_DIM);
	ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, hist->GetImgWidth());
}

void __global__ ComputeOrientation_Kernel(float4* d_list, 
										  int list_len,
										  int width, int height, 
										  float sigma, float sigma_step, 
										  float gaussian_factor, float sample_factor,
										  int num_orientation,
										  int existing_keypoint, 
										  int subpixel, 
										  int keepsign)
{
	const float ten_degree_per_radius = 5.7295779513082320876798154814105;
	const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;
	int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
	if(idx >= list_len) return;
	float4 key; 
	if(existing_keypoint)
	{
		key = tex1Dfetch(texDataF4, idx);		
	}else 
	{
		int4 ikey = tex1Dfetch(texDataList, idx);
		key.x = ikey.x + 0.5f;
		key.y = ikey.y + 0.5f;
		key.z = sigma;
		if(subpixel || keepsign)
		{
			float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x);
			if(subpixel)
			{
				key.x += offset.y;
				key.y += offset.z;
				key.z *= pow(sigma_step, offset.w);
			}
			if(keepsign) key.z *= offset.x;
		}
	}
	if(num_orientation == 0)
	{
		key.w = 0;
		d_list[idx] = key;
		return;
	}
	float vote[37];
	float gsigma = key.z * gaussian_factor;
	float win = fabs(key.z) * sample_factor;
	float dist_threshold = win * win + 0.5;
	float factor = -0.5f / (gsigma * gsigma);
	float xmin = max(1.5f, floor(key.x - win) + 0.5f);
	float ymin = max(1.5f, floor(key.y - win) + 0.5f);
	float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f);
	float ymax = min(height -1.5f, floor(key.y + win) + 0.5f);
#pragma unroll
	for(int i = 0; i < 36; ++i) vote[i] = 0.0f;
	for(float y = ymin; y <= ymax; y += 1.0f)
	{
		for(float x = xmin; x <= xmax; x += 1.0f)
		{
			float dx = x - key.x;
			float dy = y - key.y;
			float sq_dist  = dx * dx + dy * dy;
			if(sq_dist >= dist_threshold) continue;
			float2 got = tex2D(texDataF2, x, y);
			float weight = got.x * exp(sq_dist * factor);
			float fidx = floor(got.y * ten_degree_per_radius);
			int oidx = fidx;
			if(oidx < 0) oidx += 36;
			vote[oidx] += weight; 
		}
	}

	//filter the vote

	const float one_third = 1.0 /3.0;
#pragma unroll
	for(int i = 0; i < 6; ++i)
	{
		vote[36] = vote[0];
		float pre = vote[35];
#pragma unroll
		for(int j = 0; j < 36; ++j)
		{
			float temp = one_third * (pre + vote[j] + vote[j + 1]);
			pre = vote[j];			vote[j] = temp;
		}
	}

	vote[36] = vote[0];
	if(num_orientation == 1 || existing_keypoint)
	{
		int index_max = 0;
		float max_vote = vote[0];
#pragma unroll
		for(int i = 1; i < 36; ++i)
		{
			index_max =  vote[i] > max_vote? i : index_max;
			max_vote = max(max_vote, vote[i]);
		}
		float pre = vote[index_max == 0? 35 : index_max -1];
		float next = vote[index_max + 1];
		float weight = max_vote;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -