📄 backup_simplestreams.cu

📁 md5_cuda编程
💻 CU
📖 第 1 页 / 共 2 页
字号:
上一页 12
	*ob++ = *ib++;
	ib = inblock; fb = fret;	/* R[i]=L[i] XOR f(R[i-1],key)  */
	*ob++ = *ib++ ^ *fb++;
	*ob++ = *ib++ ^ *fb++;
	*ob++ = *ib++ ^ *fb++;
	*ob++ = *ib++ ^ *fb++;
}

__device__ void endes(unsigned char *inblock, unsigned char *outblock)			/* encrypt 64-bit inblock	*/
{
	unsigned char iters[17][8];		/* workspace for each iteration */
	unsigned char swap[8];			/* place to interchange L and R */
	register int i;
	register unsigned char *s, *t;

	permute(inblock,iperm,iters[0]);/* apply initial permutation	*/
	for (i=0; i<16; i++)			/* 16 churning operations	*/
		iter(i,iters[i],iters[i+1]);
									/* don't re-copy to save space  */
	s = swap; t = &iters[16][4];	/* interchange left		*/
	*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
	t = &iters[16][0];				/* and right			*/
	*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
	permute(swap,fperm,outblock);   /* apply final permutation	*/
}

__device__ void dedes(unsigned char *inblock, unsigned char *outblock)	/* decrypt 64-bit inblock	*/
{	unsigned char iters[17][8];				/* workspace for each iteration */
	unsigned char swap[8];					/* place to interchange L and R */
	register int i;
	register unsigned char *s, *t;

	permute(inblock,iperm,iters[0]);/* apply initial permutation	*/
	for (i=0; i<16; i++)		/* 16 churning operations	*/
		iter(15-i,iters[i],iters[i+1]);
					/* reverse order from encrypting*/
	s = swap; t = &iters[16][4];	/* interchange left		*/
	*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
	t = &iters[16][0];		/* and right			*/
	*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
	permute(swap,fperm,outblock);   /* apply final permutation	*/
}

__global__ void kernel_endes(unsigned char *in, unsigned char *out, int size)			
{
	int start= (THREAD_NUM*blockIdx.x) + threadIdx.x;

	int idx =  start;

	__syncthreads();

	do{
		endes(in+idx, out+idx);
		__syncthreads();

		idx+= (THREAD_NUM<<THREAD_BLK_NUM_LOG2<<3);
	} while (idx < size);
}

__global__ void kernel_dedes(unsigned char *in, unsigned char *out, int size)	
{
	int start= (THREAD_NUM*blockIdx.x) + threadIdx.x;

	int idx =  start;

	__syncthreads();

	do{
		dedes(in+idx, out+idx);
		__syncthreads();

		idx+= (THREAD_NUM<<THREAD_BLK_NUM_LOG2<<3);
	} while (idx < size);
}

__host__ void cpy_const_data_from_host_to_device()
{
	cudaMemcpyToSymbol(iperm, iperm_host, sizeof(iperm));
	cudaMemcpyToSymbol(fperm, fperm_host, sizeof(fperm));
	cudaMemcpyToSymbol(s, s_host, sizeof(s));
	cudaMemcpyToSymbol(p32, p32_host, sizeof(p32));
	cudaMemcpyToSymbol(kn, kn_host, sizeof(kn));
}

void generate_input(unsigned char* input, int num)
{
	for (int i=0; i<num; i++)
		input[i]= 'a' + (i%26);
}

int correct_data(unsigned char *input,  unsigned char *out, const int n)
{
    for(int i = 0; i < n; i++)
        if(input[i] != 'a' + (i%26))
            return 0;
    return 1;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
__device__ int get_value(const int c);
unsigned int get_value_host(const int c);


__global__ void init_array(int *values, int *out, int size)
{ 
	int i= (THREAD_NUM*blockIdx.x) + threadIdx.x;

      do {
	    // 1) Copy input to .
    	int v = values[i];
		int o =0;

		// 2) Synchronization
	    __syncthreads();

		// 3) Process

		 o = get_value(v);


		 // 5) Write the result to device memory.
		 //values[i] = o; 
		 out[i] =o;

		// 4) Synchronization again
         __syncthreads();


		 i+= (THREAD_NUM<<THREAD_BLK_NUM_LOG2);//(THREAD_NUM)*THREAD_BLK_NUM; 

		 } while (i < size ); 

}

__device__ int get_value(const int in)
{
	unsigned int out=in;

	 for (int loop=0; loop<REPEAT_TIME; loop++)
	 {
		 out=(out*out) & 0xFFFF;
	 }

	return out;
}

unsigned int get_value_host(const int in)
{
	unsigned int out=in;

	 for (int loop=0; loop<REPEAT_TIME; loop++)
	 {
		 out=(out*out) & 0xFFFF;
	 }
	return out;

}

*/

int test();

int main(int argc, unsigned char *argv[])
{
    int nstreams = 8;               // number of streams for CUDA calls
    int nbytes = NUM * 8;   // number of data bytes
    dim3 threads, blocks;           // kernel launch configuration
    float elapsed_time=0, time_memcpy=0, time_kernel=0;   // timing variables

	test();

	unsigned char keyx[9]= "cxskq";
	desinit(keyx);			/* set up tables for DES */

    // check the compute capability of the device
    int num_devices=0;
    CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
    if(0==num_devices)
    {
        printf("your system does not have a CUDA capable device\n");
        return 1;
	}
    cudaDeviceProp device_properties;
    CUDA_SAFE_CALL( cudaGetDeviceProperties(&device_properties, 0) );
    if( (1 == device_properties.major) && (device_properties.minor < 1))
        printf("%s does not have compute capability 1.1 or later\n\n", device_properties.name);
	
	printf("////////////////////////////////////////////////\n");
	printf("64-bit_array_size=%d(M) (total size=%d (M))thread_blk_num=%d\n", NUM/1024/1024, 2*NUM/1024/1024, THREAD_BLK_NUM);

    // allocate host
    unsigned char *a = 0;                     // pointer to the array data in host memory
	unsigned char *out=0;
    // allocate host memory (pinned is required for achieve asynchronicity)
    CUDA_SAFE_CALL( cudaMallocHost((void**)&a, nbytes) );
	memset(a,0,nbytes);

    CUDA_SAFE_CALL( cudaMallocHost((void**)&out, nbytes) );
	memset(out,0,nbytes);

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
    for(int i = 0; i < nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );

    // create CUDA event handles
    cudaEvent_t start_event, stop_event;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );

    // allocate device memory
    unsigned char *d_a = 0, *d_o = 0;             // pointers to data and init value in the device memory
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_a, nbytes) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_o, nbytes) );

	cudaMemset(d_a, 0, nbytes);
	cudaMemset(d_o, 0, nbytes);

    /*
	// time memcopy from device
    cudaEventRecord(start_event, 0);     // record in stream-0, to ensure that all previous CUDA calls have completed

    CUDA_SAFE_CALL( cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice) );

    cudaMemcpyAsync(out, d_o, nbytes, cudaMemcpyDeviceToHost, streams[0]);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);   // block until the event is actually recorded
    CUDA_SAFE_CALL( cudaEventElapsedTime(&time_memcpy, start_event, stop_event) );
    printf("2-way memcopy :\t%.2f\n", time_memcpy);
    
    // time kernel
    threads=dim3(THREAD_NUM, 1);
	blocks=dim3(THREAD_BLK_NUM, 1);
    cudaEventRecord(start_event, 0);
	cpy_const_data_from_host_to_device();
	kernel_endes<<<blocks, threads, SHARED_MEM_SIZE, streams[0]>>>(d_a, d_o, nbytes/8);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&time_kernel, start_event, stop_event) );
    printf("kernel computing: %.2f\n", time_kernel);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads=dim3(THREAD_NUM, 1);
	blocks=dim3(1, 1);

	memset(a,0,nbytes);

	cudaEventRecord(start_event, 0);

    CUDA_SAFE_CALL( cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice) );

	cpy_const_data_from_host_to_device();
	kernel_endes<<<blocks, threads,SHARED_MEM_SIZE>>>(d_a, d_o, nbytes/8);
    cudaMemcpy(out, d_o, nbytes, cudaMemcpyDeviceToHost);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
	printf("Time to 2-way memcpy & execute:\n");
    printf("non-streamed:\t%.2f (%.2f expected = %.2f + %.2f)\n", elapsed_time,  time_kernel + time_memcpy,
		time_kernel, time_memcpy);
	*/

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads=dim3(THREAD_NUM,1);
    //blocks=dim3(n/(nstreams*threads.x),1);
	blocks=dim3(THREAD_BLK_NUM,1);
    //memset(a, c, nbytes);     // set host memory bits to all 1s, for testing correctness
    //cudaMemset(d_a, 0, nbytes); // set device memory to all 0s, for testing correctness

	generate_input(a,nbytes);

	clock_t start_clock_gpu, end_clock_gpu;
	start_clock_gpu = clock();

	cudaMemset(d_a, 0, nbytes); 
	cudaMemset(d_o, 0, nbytes); 
    cudaEventRecord(start_event, 0);

	// asynchronoously launch nstreams memcopies.  Note that memcopy in stream x will only
    //   commence executing when all previous CUDA calls in stream x have completed
    for(int i = 0; i < nstreams; i++)
        cudaMemcpyAsync(d_a + i * nbytes / nstreams, a + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);

	cpy_const_data_from_host_to_device();
    // asynchronously launch nstreams kernels, each operating on its own portion of data
    for(int i = 0; i < nstreams; i++)
		kernel_endes<<<blocks, threads, SHARED_MEM_SIZE, streams[i]>>>(d_a + i * nbytes / nstreams, d_o + i * nbytes / nstreams, nbytes/nstreams);

    // asynchronoously launch nstreams memcopies.  Note that memcopy in stream x will only
    //   commence executing when all previous CUDA calls in stream x have completed
    for(int i = 0; i < nstreams; i++)
        cudaMemcpyAsync(out + i * nbytes / nstreams, d_o + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);

	
	printf("Encoding ... \n" );
	for (int i=0; i<80; i++)
		printf("i=%d in=%x out=%x correct=%d\n", i, a[i] & 0xFF, out[i]& 0xFF, 0 );

	memset(a,0,nbytes);

	/// Decode
	// asynchronoously launch nstreams memcopies.  Note that memcopy in stream x will only
    //   commence executing when all previous CUDA calls in stream x have completed
    for(int i = 0; i < nstreams; i++)
        cudaMemcpyAsync(d_o + i * nbytes / nstreams, out + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);

	cpy_const_data_from_host_to_device();
    // asynchronously launch nstreams kernels, each operating on its own portion of data
    for(int i = 0; i < nstreams; i++)
		kernel_dedes<<<blocks, threads, SHARED_MEM_SIZE, streams[i]>>>(d_o + i * nbytes / nstreams, d_a + i * nbytes / nstreams, nbytes/nstreams);

    // asynchronoously launch nstreams memcopies.  Note that memcopy in stream x will only
    //   commence executing when all previous CUDA calls in stream x have completed
    for(int i = 0; i < nstreams; i++)
        cudaMemcpyAsync(a + i * nbytes / nstreams, d_a + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);

	cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

	end_clock_gpu = clock();
    printf("%d streams:\t%.2f (%.2f expected = %.2f + %.2f/%d)\n", nstreams, elapsed_time , time_kernel + time_memcpy / nstreams,
		time_kernel, time_memcpy,nstreams );

     printf("Time taken by GPU using clock()= %d ms\n", end_clock_gpu - start_clock_gpu);

	printf("Decoding ... \n" );
	for (int i=0; i<80; i++)
		printf("i=%d in=%x out=%x correct=%d\n", i, a[i] & 0xFF, out[i] & 0xFF, 0 );

    // check whether the output is correct
    printf("------------VERIFY USING CPU-------------------\n");

	//for (int i=0; i<80; i++)
	//	printf("i=%d in=%x out=%x correct=%x\n", i, a[i], out[i], get_value_host(a[i]));

	clock_t start_clock_cpu, end_clock_cpu;

	start_clock_cpu = clock();
	int verification = correct_data(a, out, nbytes);
	end_clock_cpu = clock();

    if(verification)
        printf("Test PASSED\n");
    else
        printf("Test FAILED\n");

     printf("Time taken by single threaded CPU using clock()= %d ms\n", end_clock_cpu - start_clock_cpu);

	 float faster_time = (end_clock_gpu - start_clock_gpu)<1 ? 1: (end_clock_cpu - start_clock_cpu) / (end_clock_gpu - start_clock_gpu);

     printf("GPU is %.2f (%d/%d) times faster than CPU.\n", faster_time,
		 (end_clock_cpu - start_clock_cpu) , (end_clock_gpu - start_clock_gpu));

    // release resources
    for(int i = 0; i < nstreams; i++)
        cudaStreamDestroy(streams[i]);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);
    //cudaFree(d_c);

    CUT_EXIT(argc, argv);

    return 0;
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -