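// File: simplestreams.cu
// Folder: md5_cuda programming (md5_cuda编程) - file type: CU
// (Listing-page header residue; the original page also showed a font-size control here.)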
//////////////////////////////////////////////////////////////////
// LICENSE: GPL V2
// AUTHOR: ding.yiming@gmail.com
//////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cutil.h>

#include "include.cu"
#include "string.h"

// Host-side routine defined outside this file; main() calls it before launching
// the kernels. NOTE(review): judging by the name it uploads constant data
// to the device -- confirm against its definition.
__host__ void cpy_const_data_from_host_to_device();


// 16-byte digest defined in another translation unit (or in include.cu);
// main() prints it as 32 hex digits. Presumably this is the target MD5 the
// kernels search for -- verify against the definition.
extern unsigned char h_md5[16];

// Buffer for a textual MD5 entered by the user. main() zeroes it; the code that
// would fill and parse it is currently commented out.
char g_input_md5[256];

///////////////////////////////////////////////////////////////////////////////////////////////////////////


// Prints one five-character candidate that the kernel packed into two
// consecutive 32-bit words: characters 0..3 are the bytes of words[0] from
// least to most significant, character 4 is the low byte of words[1].
// `prefix` is printed verbatim in front of the (historically misspelled) label.
static void print_packed_password(const char *prefix, const unsigned int *words)
{
    printf("%sPassowrd=%c%c%c%c%c\n",
           prefix,
           (int)(words[0] & 0xFF),
           (int)((words[0] & 0xFF00) >> 8),
           (int)((words[0] & 0xFF0000) >> 16),
           (int)((words[0] & 0xFF000000) >> 24),
           (int)(words[1] & 0xFF));
}

// Scans the result buffer copied back from the device and prints every record
// whose first word is non-zero.
//
// The buffer is treated as records of OUTPUT_INT_NUM words each:
//   word 0                    : bits 31..24 = match count, bits 23..0 = offset + 1
//   word 1                    : offset + 1 of the second match (read only if count > 1)
//   word OUTPUT_INT_NUM/2 ... : packed password characters, two words per match
// The printed "ChannelID" is the word index of the record, exactly as before.
static void report_stream_matches(const unsigned int *output, size_t total_ints)
{
    for (size_t i = 0; i < total_ints; i += OUTPUT_INT_NUM)
    {
        const unsigned int header = output[i];
        if (header == 0)
            continue;   // nothing was recorded in this slot

        const unsigned int matched_count = (header & 0xFF000000) >> 24;
        const unsigned int *packed = output + i + OUTPUT_INT_NUM / 2;

        printf("ChannelID=%d MatchedCount=%d \nOffset=%d \n",
               (int)i, (int)matched_count, (int)((header & 0xFFFFFF) - 1));
        print_packed_password("", packed);

        if (matched_count > 1)
        {
            printf("Offset=%d\n", (int)(output[i + 1] - 1));
            print_packed_password("Matched ", packed + 2);
        }
    }
}

// Program entry point: launches the MDString kernel RUN_COUNT * STREAM_NUM
// times, spread over STREAM_NUM CUDA streams, copies the per-stream results
// back to pinned host memory and prints every reported match together with
// progress and timing information.
//
// Returns 0 on success and 1 when no CUDA device is present or the host-side
// stream array cannot be allocated.
//
// NOTE(review): `unsigned char *argv[]` is not a standard signature for main;
// it is kept unchanged here because CUT_EXIT receives argv as-is.
int main(int argc, unsigned char *argv[])
{
    const int nstreams = STREAM_NUM;    // number of streams for CUDA calls
    // Total size of each result buffer. Kept in size_t so that neither the
    // product nor the per-stream offsets below can overflow an int.
    const size_t nbytes = sizeof(int) *STREAM_NUM*THREAD_NUM*THREAD_BLK_NUM*OUTPUT_INT_NUM;
    // nbytes contains the factor STREAM_NUM, so this division is exact and
    // i * stream_bytes equals the original i * nbytes / nstreams.
    const size_t stream_bytes = nbytes / nstreams;
    float elapsed_time = 0;             // GPU time of the last iteration (event based)

    // Make sure there is a CUDA device at all.
    int num_devices = 0;
    CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
    if (0 == num_devices)
    {
        printf("your system does not have a CUDA capable device\n");
        return 1;
    }

    // Compute capability below 1.1 only produces a warning; execution continues.
    cudaDeviceProp device_properties;
    CUDA_SAFE_CALL( cudaGetDeviceProperties(&device_properties, 0) );
    if ((1 == device_properties.major) && (device_properties.minor < 1))
        printf("%s does not have compute capability 1.1 or later\n\n", device_properties.name);

    printf("//////////////////// MD5 Crack by GPU ////////////////////////////\n");

    printf("Input 16-byte MD5 data array on a 5-char password ranging from 'A~Z,a~z,^-{\\}' : \n");
    memset(g_input_md5, 0, sizeof(g_input_md5));

    // Interactive entry of the hash is disabled; the h_md5 value defined
    // elsewhere is used as-is. The original input code is kept for reference.
    /*

    do {
        int correct=1;
        //scanf_s("%32s", g_input_md5,32);
        scanf("%s", g_input_md5);
        if (strlen(g_input_md5)!=32)
        {
            printf("Wrong input!\n");
            correct=0;
            continue;
        }

        for (int i=0; i<32; i++)
            if (!((g_input_md5[i]>='A' && g_input_md5[i]<='F')
                || (g_input_md5[i]>='a' && g_input_md5[i]<='f')
                || (g_input_md5[i]>='0' && g_input_md5[i]<='9')))

            {
                printf("Wrong input!\n");
                correct=0;
                break;
            }

        if (correct)
            break;
    } while (1);

    char *endchar=0;

    for (int i=0; i<16; i++)
    {
        char input[3];
        input[0]=g_input_md5[i*2];
        input[1]=g_input_md5[i*2+1];
        input[2]=0;
        h_md5[i] = strtoul(input,&endchar, 16);
    }
    */

    for (int i = 0; i < 16; i++)
        printf("%02x", h_md5[i]);
    printf("\n");
    printf("ThreadNum=%d, MD5CountPerThread=%d(K)\n", THREAD_NUM, NUM_PER_THREAD/1024);
    printf("ThreadBlockNum=%d Stream Number=%d\n", THREAD_BLK_NUM, STREAM_NUM);
    printf("ChannelNum(ThreadNum*ThreadBlockNum)=%d(K)\n", THREAD_NUM*THREAD_BLK_NUM/1024);
    printf("TotalNumber(ChannelNum*MD5CountPerThread*StreamNum)=%d(M)\n", (NUM_PER_THREAD/1024)*(THREAD_NUM*THREAD_BLK_NUM/1024)*STREAM_NUM);

    // Pinned host buffers (pinned memory is required for asynchronous copies).
    unsigned char *a = 0;       // host-side input buffer (currently only zeroed)
    unsigned char *out = 0;     // host-side copy of the kernel results
    CUDA_SAFE_CALL( cudaMallocHost((void**)&a, nbytes) );
    memset(a, 0, nbytes);

    CUDA_SAFE_CALL( cudaMallocHost((void**)&out, nbytes) );
    memset(out, 0, nbytes);

    // Allocate and initialize an array of stream handles.
    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
    if (streams == NULL)
    {
        fprintf(stderr, "failed to allocate %d stream handles\n", nstreams);
        cudaFreeHost(a);
        cudaFreeHost(out);
        return 1;
    }
    for (int i = 0; i < nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );

    // Create CUDA event handles used for GPU-side timing.
    cudaEvent_t start_event, stop_event;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );

    // Device buffers: d_a and d_o are both handed to the kernel; only d_o is
    // copied back to the host.
    unsigned char *d_a = 0, *d_o = 0;
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_a, nbytes) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_o, nbytes) );

    CUDA_SAFE_CALL( cudaMemset(d_a, 0, nbytes) );
    CUDA_SAFE_CALL( cudaMemset(d_o, 0, nbytes) );

    // Generate inputs
    //generate_input(a,nbytes);

    cpy_const_data_from_host_to_device();

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    const dim3 threads(THREAD_NUM, 1);      // kernel launch configuration
    const dim3 blocks(THREAD_BLK_NUM, 1);
    //memset(a, c, nbytes);     // set host memory bits to all 1s, for testing correctness
    //cudaMemset(d_a, 0, nbytes); // set device memory to all 0s, for testing correctness

    //generate_input(a,nbytes);

    const clock_t start_clock_gpu = clock();
    // Initialised so the total below is well defined even when RUN_COUNT is 0.
    clock_t end_clock_gpu = start_clock_gpu;

    // NOTE(review): this repeats the call made above; it is kept so the timed
    // region covers the same work as before.
    cpy_const_data_from_host_to_device();

    for (int count = 0; count < RUN_COUNT; count++)
    {
        // Clear both device buffers so results from the previous pass do not
        // show up again.
        CUDA_SAFE_CALL( cudaMemset(d_a, 0, nbytes) );
        CUDA_SAFE_CALL( cudaMemset(d_o, 0, nbytes) );

        CUDA_SAFE_CALL( cudaEventRecord(start_event, 0) );
        // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
        //   commence executing when all previous CUDA calls in stream x have completed
        //for(int i = 0; i < nstreams; i++)
        //    CUDA_SAFE_CALL(cudaMemcpyAsync(d_a + i * nbytes / nstreams, a + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]));

        // Asynchronously launch nstreams kernels, each operating on its own
        // slice of the buffers. The first argument is a running launch index.
        for (int i = 0; i < nstreams; i++)
        {
            MDString<<<blocks, threads, SHARED_MEM_SIZE, streams[i]>>>(count*nstreams+i,
                (unsigned int*)(d_a + i * stream_bytes),
                (unsigned int*)(d_o + i * stream_bytes));
        }
        // check for any errors
        CUT_CHECK_ERROR("Kernel execution failed");

        // Asynchronously copy each stream's result slice back; a copy in
        // stream x starts only after the kernel queued in stream x finished.
        for (int i = 0; i < nstreams; i++)
            CUDA_SAFE_CALL(cudaMemcpyAsync(out + i * stream_bytes, d_o + i * stream_bytes, stream_bytes, cudaMemcpyDeviceToHost, streams[i]));

        // stop_event is recorded on stream 0; with the legacy default-stream
        // semantics it completes only after the work queued in the other
        // streams, so waiting on it makes `out` safe to read.
        CUDA_SAFE_CALL( cudaEventRecord(stop_event, 0) );
        CUDA_SAFE_CALL( cudaEventSynchronize(stop_event) );
        CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

        end_clock_gpu = clock();

        report_stream_matches((const unsigned int *)out, nbytes / sizeof(int));

        // clock() counts in CLOCKS_PER_SEC units; convert to milliseconds and
        // pass an int so the argument matches the %d conversion.
        const int elapsed_ms = (int)((double)(end_clock_gpu - start_clock_gpu) * 1000.0 / CLOCKS_PER_SEC);
        printf("In progress ..., %d %% Completed, Elapsed Time %dms \n", (count+1)*100/RUN_COUNT, elapsed_ms);
    } // for count

    //printf("Time calculated by CUDA clock function =%.2fms \n", elapsed_time  );
    //printf("Time calculated gotten by CPU clock function =%dms \n", end_clock_gpu - start_clock_gpu  );
    const int total_ms = (int)((double)(end_clock_gpu - start_clock_gpu) * 1000.0 / CLOCKS_PER_SEC);
    printf("Total Elapsed Time =%dms \n", total_ms);

    // release resources
    for (int i = 0; i < nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamDestroy(streams[i]) );
    free(streams);
    CUDA_SAFE_CALL( cudaEventDestroy(start_event) );
    CUDA_SAFE_CALL( cudaEventDestroy(stop_event) );

    CUDA_SAFE_CALL( cudaFreeHost(a) );
    CUDA_SAFE_CALL( cudaFree(d_a) );
    CUDA_SAFE_CALL( cudaFreeHost(out) );
    CUDA_SAFE_CALL( cudaFree(d_o) );

    CUT_EXIT(argc, argv);

    return 0;
}
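// --- Listing-page metadata (not part of the program) ---
// File size: 32 K
// Uploaded by: zhoujun008
// Category: encryption / decryption
// Tags: #cuda #md #programming
// Viewer keyboard shortcuts: copy code Ctrl + C, search code Ctrl + F,
// full screen F11, toggle theme Ctrl + Shift + D, show shortcuts ?,
// larger font Ctrl + =, smaller font Ctrl + -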