📄 marssort.cu

📁 GPU实现的MapReduce framework,对于学习并行编程和cuda平台的编程方面有着极好的参考价值
💻 CU
📖 第 1 页 / 共 4 页
字号:
	}
	else
	{
		bs_cmpbuf[tid].x =-1;
	}

    __syncthreads();

    // Parallel bitonic sort.
	int compareValue=0;
    for (int k = 2; k <= SHARED_MEM_INT2; k *= 2)
    {
        // Bitonic merge:
        for (int j = k / 2; j>0; j /= 2)
        {
            int ixj = tid ^ j;
            
            if (ixj > tid)
            {
                if ((tid & k) == 0)
                {
					compareValue=getCompareValue(d_rawData, bs_cmpbuf[tid], bs_cmpbuf[ixj]);
					//if (shared[tid] > shared[ixj])
					if(compareValue>0)
                    {
                        swap(bs_cmpbuf[tid], bs_cmpbuf[ixj]);
                    }
                }
                else
                {
					compareValue=getCompareValue(d_rawData, bs_cmpbuf[tid], bs_cmpbuf[ixj]);
                    //if (shared[tid] < shared[ixj])
					if(compareValue<0)
                    {
                        swap(bs_cmpbuf[tid], bs_cmpbuf[ixj]);
                    }
                }
            }
            
            __syncthreads();
        }
    }

    // Write result.
	/*if(tid<rLen)
	{
		d_output[tid] = bs_cmpbuf[tid+SHARED_MEM_INT2-rLen];
	}*/
	int startCopy=SHARED_MEM_INT2-rLen;
	if(tid>=startCopy)
	{
		d_output[tid-startCopy]=bs_cmpbuf[tid];
	}
}

__global__ void bitonicSortMultipleBlocks_kernel(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int* d_bound, int startBlock, int numBlock, cmp_type_t *d_output)
{
	__shared__ int bs_pStart;
	__shared__ int bs_pEnd;
	__shared__ int bs_numElement;
    __shared__ cmp_type_t bs_shared[SHARED_MEM_INT2];
	

    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
	const int bid=bx+by*gridDim.x;
	//const int numThread=blockDim.x;
	//const int resultID=(bx)*numThread+tid;
	if(bid>=numBlock) return;

	if(tid==0)
	{
		bs_pStart=d_bound[(bid+startBlock)<<1];
		bs_pEnd=d_bound[((bid+startBlock)<<1)+1];
		bs_numElement=bs_pEnd-bs_pStart;
		//if(bid==82&& bs_pStart==6339)
		//	printf("%d, %d, %d\n", bs_pStart, bs_pEnd, bs_numElement);
		
	}
	__syncthreads();
    // Copy input to shared mem.
	if(tid<bs_numElement)
	{
		bs_shared[tid] = d_values[tid+bs_pStart];
		//if(bid==82 && bs_pStart==6339)
		//	printf("tid %d, pos, %d, %d, %d, %d\n", tid,tid+bs_pStart, bs_pStart,bs_pEnd, d_values[tid+bs_pStart].x);
		//if(6342==tid+bs_pStart)
		//	printf(")))tid %d, pos, %d, %d, %d, %d\n", tid,tid+bs_pStart, bs_pStart,bs_pEnd, d_values[tid+bs_pStart].x);
	}
	else
	{
		bs_shared[tid].x =-1;
	}

    __syncthreads();

    // Parallel bitonic sort.
	int compareValue=0;
    for (int k = 2; k <= SHARED_MEM_INT2; k *= 2)
    {
        // Bitonic merge:
        for (int j = k / 2; j>0; j /= 2)
        {
            int ixj = tid ^ j;
            
            if (ixj > tid)
            {
                if ((tid & k) == 0)
                {
					compareValue=getCompareValue(d_rawData, bs_shared[tid], bs_shared[ixj]);
					//if (shared[tid] > shared[ixj])
					if(compareValue>0)
                    {
                        swap(bs_shared[tid], bs_shared[ixj]);
                    }
                }
                else
                {
					compareValue=getCompareValue(d_rawData, bs_shared[tid], bs_shared[ixj]);
                    //if (shared[tid] < shared[ixj])
					if(compareValue<0)
                    {
                        swap(bs_shared[tid], bs_shared[ixj]);
                    }
                }
            }
            
            __syncthreads();
        }
    }

    // Write result.
	//if(tid<bs_numElement)
	//{
	//	d_output[tid+bs_pStart] = bs_shared[tid+SHARED_MEM_INT2-bs_numElement];
	//}
	//int startCopy=SHARED_MEM_INT2-bs_numElement;
	if(tid>=bs_numElement)
	{
		d_output[tid-bs_numElement]=bs_shared[tid];
	}
}


__global__ void initialize_kernel(cmp_type_t* d_data, int startPos, int rLen, cmp_type_t value)
{
    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
	const int bid=bx+by*gridDim.x;
	const int numThread=blockDim.x;
	const int resultID=(bid)*numThread+tid;
	int pos=startPos+resultID;
	if(pos<rLen)
	d_data[pos]=value;
}
void bitonicSortMultipleBlocks(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int* d_bound, int numBlock, cmp_type_t * d_output)
{
	int numThreadsPerBlock_x=SHARED_MEM_INT2;
	int numThreadsPerBlock_y=1;
	int numBlock_x=NUM_BLOCK_PER_CHUNK_BITONIC_SORT;
	int numBlock_y=1;
	int numChunk=numBlock/numBlock_x;
	if(numBlock%numBlock_x!=0)
		numChunk++;

	dim3  thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
	dim3  grid( numBlock_x, numBlock_y , 1);
	int i=0;
	int start=0;
	int end=0;
	for(i=0;i<numChunk;i++)
	{
		start=i*numBlock_x;
		end=start+numBlock_x;
		if(end>numBlock)
			end=numBlock;
		//printf("bitonicSortMultipleBlocks_kernel: %d, range, %d, %d\n", i, start, end);
		bitonicSortMultipleBlocks_kernel<<<grid,thread>>>(d_rawData, totalLenInBytes, d_values, d_bound, start, end-start, d_output);
		cudaThreadSynchronize();
	}
//	cudaThreadSynchronize();
}


void bitonicSortSingleBlock(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int rLen, cmp_type_t * d_output)
{
	int numThreadsPerBlock_x=SHARED_MEM_INT2;
	int numThreadsPerBlock_y=1;
	int numBlock_x=1;
	int numBlock_y=1;
	

	dim3  thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
	dim3  grid( numBlock_x, numBlock_y , 1);
	bitonicSortSingleBlock_kernel<<<grid,thread>>>(d_rawData, totalLenInBytes, d_values, rLen, d_output);
	cudaThreadSynchronize();
}



void initialize(cmp_type_t *d_data, int rLen, cmp_type_t value)
{
	int numThreadsPerBlock_x=512;
	int numThreadsPerBlock_y=1;
	int numBlock_x=512;
	int numBlock_y=1;
	int chunkSize=numBlock_x*numThreadsPerBlock_x;
	int numChunk=rLen/chunkSize;
	if(rLen%chunkSize!=0)
		numChunk++;

	dim3  thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
	dim3  grid( numBlock_x, numBlock_y , 1);
	int i=0;
	int start=0;
	int end=0;
	for(i=0;i<numChunk;i++)
	{
		start=i*chunkSize;
		end=start+chunkSize;
		if(end>rLen)
			end=rLen;
		initialize_kernel<<<grid,thread>>>(d_data, start, rLen, value);
	} 
	cudaThreadSynchronize();
}
void bitonicSortGPU(void* d_rawData, int totalLenInBytes, cmp_type_t* d_Rin, int rLen, void *d_Rout)
{
	unsigned int numRecordsR;

	unsigned int size = rLen;
	unsigned int level = 0;
	while( size != 1 )
	{
		size = size/2;
		level++;
	}

	if( (1<<level) < rLen )
	{
		level++;
	}

	numRecordsR = (1<<level);
	if(rLen<=NUM_THREADS_CHUNK)
	{
		bitonicSortSingleBlock((void*)d_rawData, totalLenInBytes, d_Rin, rLen, (cmp_type_t*)d_Rout);
	}
	else
	if( rLen <= 256*1024 )
	{
		//unsigned int numRecordsR = rLen;
		
		unsigned int numThreadsSort = NUM_THREADS_CHUNK;
		if(numRecordsR<NUM_THREADS_CHUNK)
			numRecordsR=NUM_THREADS_CHUNK;
		unsigned int numBlocksXSort = numRecordsR/numThreadsSort;
		unsigned int numBlocksYSort = 1;
		dim3 gridSort( numBlocksXSort, numBlocksYSort );		
		unsigned int memSizeRecordsR = sizeof( cmp_type_t ) * numRecordsR;
		//copy the <offset, length> pairs.
		cmp_type_t* d_R;
		CUDA_SAFE_CALL( cudaMalloc( (void**) &d_R, memSizeRecordsR) );
		cmp_type_t tempValue;
		tempValue.x=tempValue.y=-1;
		initialize(d_R, numRecordsR, tempValue);
		CUDA_SAFE_CALL( cudaMemcpy( d_R, d_Rin, rLen*sizeof(cmp_type_t), cudaMemcpyDeviceToDevice) );
		

		for( int k = 2; k <= numRecordsR; k *= 2 )
		{
			for( int j = k/2; j > 0; j /= 2 )
			{
				bitonicKernel<<<gridSort, numThreadsSort>>>((void*)d_rawData, totalLenInBytes, d_R, numRecordsR, k, j);
			}
		}
		CUDA_SAFE_CALL( cudaMemcpy( d_Rout, d_R+(numRecordsR-rLen), sizeof(cmp_type_t)*rLen, cudaMemcpyDeviceToDevice) );
		cudaFree( d_R );
		cudaThreadSynchronize();
	}
	else
	{
		unsigned int numThreadsSort = NUM_THREADS_CHUNK;
		unsigned int numBlocksYSort = 1;
		unsigned int numBlocksXSort = (numRecordsR/numThreadsSort)/numBlocksYSort;
		if(numBlocksXSort>=(1<<16))
		{
			numBlocksXSort=(1<<15);
			numBlocksYSort=(numRecordsR/numThreadsSort)/numBlocksXSort;			
		}
		unsigned int numBlocksChunk = NUM_BLOCKS_CHUNK;
		unsigned int numThreadsChunk = NUM_THREADS_CHUNK;
		
		unsigned int chunkSize = numBlocksChunk*numThreadsChunk;
		unsigned int numChunksR = numRecordsR/chunkSize;

		dim3 gridSort( numBlocksXSort, numBlocksYSort );
		unsigned int memSizeRecordsR = sizeof( cmp_type_t ) * numRecordsR;

		cmp_type_t* d_R;
		CUDA_SAFE_CALL( cudaMalloc( (void**) &d_R, memSizeRecordsR) );
		cmp_type_t tempValue;
		tempValue.x=tempValue.y=-1;
		initialize(d_R, numRecordsR, tempValue);
		CUDA_SAFE_CALL( cudaMemcpy( d_R, d_Rin, rLen*sizeof(cmp_type_t), cudaMemcpyDeviceToDevice) );

		for( int chunkIdx = 0; chunkIdx < numChunksR; chunkIdx++ )
		{
			unitBitonicSortKernel<<< numBlocksChunk, numThreadsChunk>>>( (void*)d_rawData, totalLenInBytes, d_R, numRecordsR, chunkIdx );
		}

		int j;
		for( int k = numThreadsChunk*2; k <= numRecordsR; k *= 2 )
		{
			for( j = k/2; j > numThreadsChunk/2; j /= 2 )
			{
				bitonicKernel<<<gridSort, numThreadsSort>>>( (void*)d_rawData, totalLenInBytes, d_R, numRecordsR, k, j);
			}

			for( int chunkIdx = 0; chunkIdx < numChunksR; chunkIdx++ )
			{
				partBitonicSortKernel<<< numBlocksChunk, numThreadsChunk>>>((void*)d_rawData, totalLenInBytes, d_R, numRecordsR, chunkIdx, k/numThreadsSort );
			}
		}
		CUDA_SAFE_CALL( cudaMemcpy( d_Rout, d_R+(numRecordsR-rLen), sizeof(cmp_type_t)*rLen, cudaMemcpyDeviceToDevice) );
		cudaFree( d_R );
		cudaThreadSynchronize();
	}
}

__global__ void getIntYArray_kernel(int2* d_input, int startPos, int rLen, int* d_output)
{
    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
	const int bid=bx+by*gridDim.x;
	const int numThread=blockDim.x;
	const int resultID=(bid)*numThread+tid;
	int pos=startPos+resultID;
	if(pos<rLen)
	{
		int2 value=d_input[pos];
		d_output[pos]=value.y;
	}
}


__global__ void getXYArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_output)
{
    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
	const int bid=bx+by*gridDim.x;
	const int numThread=blockDim.x;
	const int resultID=(bid)*numThread+tid;
	int pos=startPos+resultID;
	if(pos<rLen)
	{
		cmp_type_t value=d_input[pos];
		d_output[pos].x=value.x;
		d_output[pos].y=value.y;
	}
}

__global__ void getZWArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_output)
{
    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
	const int bid=bx+by*gridDim.x;
	const int numThread=blockDim.x;
	const int resultID=(bid)*numThread+tid;
	int pos=startPos+resultID;
	if(pos<rLen)
	{
		cmp_type_t value=d_input[pos];
		d_output[pos].x=value.z;
		d_output[pos].y=value.w;
	}
}


__global__ void setXYArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_value)
{
    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
	const int bid=bx+by*gridDim.x;
	const int numThread=blockDim.x;
	const int resultID=(bid)*numThread+tid;
	int pos=startPos+resultID;
	if(pos<rLen)
	{
		cmp_type_t value=d_input[pos];
		value.x=d_value[pos].x;
		value.y=d_value[pos].y;
		d_input[pos]=value;
	}
}

__global__ void setZWArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_value)
{
    const int by = blockIdx.y;
	const int bx = blockIdx.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;	
	const int tid=tx+ty*blockDim.x;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -