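// File: simplestreams.cu
// (The line above and a "font size" label were header text from the web code viewer, not program source.)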
//////////////////////////////////////////////////////////////////
// LICENSE: GPL V2
// AUTHOR: ding.yiming@gmail.com
//////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cutil.h>
#include "include.cu"
#include "string.h"
// Host-side helper whose definition is not in this file's visible portion.
// main() calls it before launching any kernel.
// NOTE(review): presumably uploads constant tables / the target digest to the device - confirm against its definition.
__host__ void cpy_const_data_from_host_to_device();
// 16-byte MD5 value, defined in another translation unit. main() prints it as 32 hex digits;
// the only code that would overwrite it from user input is currently commented out.
extern unsigned char h_md5[16];
// Text buffer for a hex digest typed by the user. main() zeroes it; the code that reads
// into it and parses it is currently commented out.
char g_input_md5[256];
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Converts an interval measured with clock() into whole milliseconds.
static long clock_interval_ms(clock_t begin, clock_t end)
{
    return (long)((double)(end - begin) * 1000.0 / (double)CLOCKS_PER_SEC);
}

/*
 * Program entry point.
 *
 * Prints the configuration, allocates pinned host buffers, device buffers,
 * one CUDA stream per STREAM_NUM and two timing events, then runs RUN_COUNT
 * rounds. Each round zeroes the device buffers, launches one MDString kernel
 * per stream on that stream's slice of the buffers, copies the output slices
 * back asynchronously, waits for completion and prints every non-zero result
 * record found in the output.
 *
 * Parameters:
 *   argc, argv - forwarded unchanged to CUT_EXIT.
 *
 * Returns:
 *   0 on normal completion, 1 if no CUDA device is present or the stream
 *   handle array cannot be allocated.
 */
int main(int argc, unsigned char *argv[])
{
    const int nstreams = STREAM_NUM; // number of streams for CUDA calls
    // Total size in bytes of each of the input and output buffers. Kept in size_t so the
    // product and the per-stream offsets below are not truncated to int.
    const size_t nbytes = sizeof(int) *STREAM_NUM*THREAD_NUM*THREAD_BLK_NUM*OUTPUT_INT_NUM;
    // Bytes handled by one stream. nbytes is a multiple of STREAM_NUM by construction,
    // so i * stream_bytes equals the original i * nbytes / nstreams without the int overflow risk.
    const size_t stream_bytes = nbytes / (size_t)nstreams;
    float elapsed_time = 0; // GPU time of the most recent round, from the CUDA events

    // Make sure at least one CUDA device exists.
    int num_devices = 0;
    CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
    if (0 == num_devices)
    {
        printf("your system does not have a CUDA capable device\n");
        return 1;
    }

    // Warn (but keep going) when device 0 is compute capability 1.0.
    cudaDeviceProp device_properties;
    CUDA_SAFE_CALL( cudaGetDeviceProperties(&device_properties, 0) );
    if ((1 == device_properties.major) && (device_properties.minor < 1))
        printf("%s does not have compute capability 1.1 or later\n\n", device_properties.name);

    printf("//////////////////// MD5 Crack by GPU ////////////////////////////\n");
    printf("Input 16-byte MD5 data array on a 5-char password ranging from 'A~Z,a~z,^-{\\}' : \n");
    memset(g_input_md5, 0, sizeof(g_input_md5));

    // Interactive input is currently disabled: h_md5 is used exactly as provided by its
    // defining translation unit. The disabled code reads 32 hex digits into g_input_md5,
    // validates them and converts each digit pair into one byte of h_md5.
    /*
    do {
        int correct=1;
        //scanf_s("%32s", g_input_md5,32);
        scanf("%s", g_input_md5);
        if (strlen(g_input_md5)!=32)
        {
            printf("Wrong input!\n");
            correct=0;
            continue;
        }
        for (int i=0; i<32; i++)
            if (!((g_input_md5[i]>='A' && g_input_md5[i]<='F')
                || (g_input_md5[i]>='a' && g_input_md5[i]<='f')
                || (g_input_md5[i]>='0' && g_input_md5[i]<='9')))
            {
                printf("Wrong input!\n");
                correct=0;
                break;
            }
        if (correct)
            break;
    } while (1);
    char *endchar=0;
    for (int i=0; i<16; i++)
    {
        char input[3];
        input[0]=g_input_md5[i*2];
        input[1]=g_input_md5[i*2+1];
        input[2]=0;
        h_md5[i] = strtoul(input,&endchar, 16);
    }
    */

    // Echo the digest and the launch configuration.
    for (int i = 0; i < 16; i++)
        printf("%02x", h_md5[i]);
    printf("\n");
    printf("ThreadNum=%d, MD5CountPerThread=%d(K)\n", THREAD_NUM, NUM_PER_THREAD/1024);
    printf("ThreadBlockNum=%d Stream Number=%d\n", THREAD_BLK_NUM, STREAM_NUM);
    printf("ChannelNum(ThreadNum*ThreadBlockNum)=%d(K)\n", THREAD_NUM*THREAD_BLK_NUM/1024);
    printf("TotalNumber(ChannelNum*MD5CountPerThread*StreamNum)=%d(M)\n", (NUM_PER_THREAD/1024)*(THREAD_NUM*THREAD_BLK_NUM/1024)*STREAM_NUM);

    // Pinned host buffers (pinned memory is required for the copies to be asynchronous).
    unsigned char *a = 0;   // host input buffer; zero-filled and not otherwise used here
    unsigned char *out = 0; // host copy of the kernel output
    CUDA_SAFE_CALL( cudaMallocHost((void**)&a, nbytes) );
    memset(a, 0, nbytes);
    CUDA_SAFE_CALL( cudaMallocHost((void**)&out, nbytes) );
    memset(out, 0, nbytes);

    // Allocate and create the stream handles.
    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
    if (streams == NULL)
    {
        fprintf(stderr, "failed to allocate %d stream handles\n", nstreams);
        cudaFreeHost(a);
        cudaFreeHost(out);
        return 1;
    }
    for (int i = 0; i < nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );

    // Events used to time each round on the GPU.
    cudaEvent_t start_event, stop_event;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );

    // Device buffers: d_a is passed to the kernel as its second argument, d_o as its third.
    unsigned char *d_a = 0, *d_o = 0;
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_a, nbytes) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_o, nbytes) );
    CUDA_SAFE_CALL( cudaMemset(d_a, 0, nbytes) );
    CUDA_SAFE_CALL( cudaMemset(d_o, 0, nbytes) );

    // Generate inputs
    //generate_input(a,nbytes);
    cpy_const_data_from_host_to_device();

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    const dim3 threads(THREAD_NUM, 1);    // threads per block
    const dim3 blocks(THREAD_BLK_NUM, 1); // blocks per kernel launch
    const int total_ints = STREAM_NUM*THREAD_NUM*THREAD_BLK_NUM*OUTPUT_INT_NUM; // 32-bit words in the output buffer

    const clock_t start_clock_gpu = clock();
    // Initialised so the final report is well defined even when RUN_COUNT is 0.
    clock_t end_clock_gpu = start_clock_gpu;

    // Repeated inside the CPU-timed region.
    // NOTE(review): the earlier call looks redundant with this one - confirm before removing either.
    cpy_const_data_from_host_to_device();

    for (int count = 0; count < RUN_COUNT; count++)
    {
        CUDA_SAFE_CALL( cudaMemset(d_a, 0, nbytes) );
        CUDA_SAFE_CALL( cudaMemset(d_o, 0, nbytes) );
        CUDA_SAFE_CALL( cudaEventRecord(start_event, 0) );

        // Asynchronously launch one kernel per stream, each working on its own slice.
        // The first argument is a launch index that is unique across rounds and streams.
        for (int i = 0; i < nstreams; i++)
        {
            const size_t offset = (size_t)i * stream_bytes;
            MDString<<<blocks, threads, SHARED_MEM_SIZE, streams[i]>>>(count*nstreams+i,
                                                                        (unsigned int*)(d_a + offset),
                                                                        (unsigned int*)(d_o + offset));
        }
        // check for any errors
        CUT_CHECK_ERROR("Kernel execution failed");

        // Asynchronously copy each stream's output slice back. A copy in stream x only
        // starts once all previous CUDA calls in stream x have completed.
        for (int i = 0; i < nstreams; i++)
        {
            const size_t offset = (size_t)i * stream_bytes;
            CUDA_SAFE_CALL( cudaMemcpyAsync(out + offset, d_o + offset, stream_bytes, cudaMemcpyDeviceToHost, streams[i]) );
        }

        // Record the stop event on stream 0 and block the host until it has completed.
        // NOTE(review): this relies on stream 0 ordering after the work queued on the other
        // streams (legacy default-stream behaviour) - confirm if built with per-thread default streams.
        CUDA_SAFE_CALL( cudaEventRecord(stop_event, 0) );
        CUDA_SAFE_CALL( cudaEventSynchronize(stop_event) );
        CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
        end_clock_gpu = clock();

        // Scan the output one record (OUTPUT_INT_NUM words) at a time. As decoded here:
        //   word 0: bits 31..24 = match count, bits 23..0 = first offset + 1
        //   word 1: second offset + 1
        //   words at OUTPUT_INT_NUM/2 + {0,1}: first password (4 chars + 1 char)
        //   words at OUTPUT_INT_NUM/2 + {2,3}: second password (4 chars + 1 char)
        const unsigned int *output = (const unsigned int *)(out);
        for (int i = 0; i < total_ints; i += OUTPUT_INT_NUM)
        {
            if (output[i] == 0)
                continue; // nothing reported in this record

            const unsigned int matched = (output[i] & 0xFF000000) >> 24;
            const unsigned int *pw = output + i + OUTPUT_INT_NUM/2;

            printf("ChannelID=%d MatchedCount=%d \nOffset=%d \n", i, (int)matched,
                   (int)((output[i] & 0xFFFFFF) - 1));
            printf("Passowrd=%c%c%c%c%c\n",
                   (int)(pw[0] & 0xFF), (int)((pw[0] & 0xFF00) >> 8), (int)((pw[0] & 0xFF0000) >> 16),
                   (int)((pw[0] & 0xFF000000) >> 24),
                   (int)(pw[1] & 0xFF)
                   );
            if (matched > 1)
            {
                printf("Offset=%d\n", (int)(output[i+1] - 1));
                printf("Matched Passowrd=%c%c%c%c%c\n",
                       (int)(pw[2] & 0xFF), (int)((pw[2] & 0xFF00) >> 8), (int)((pw[2] & 0xFF0000) >> 16),
                       (int)((pw[2] & 0xFF000000) >> 24),
                       (int)(pw[3] & 0xFF)
                       );
            }
        }
        // clock() ticks are converted to milliseconds; the raw tick count only equals
        // milliseconds on platforms where CLOCKS_PER_SEC is 1000.
        printf("In progress ..., %d %% Completed, Elapsed Time %dms \n", (count+1)*100/RUN_COUNT,
               (int)clock_interval_ms(start_clock_gpu, end_clock_gpu));
    } // for count

    //printf("Time calculated by CUDA clock function =%.2fms \n", elapsed_time );
    //printf("Time calculated gotten by CPU clock function =%dms \n", end_clock_gpu - start_clock_gpu );
    printf("Total Elapsed Time =%dms \n", (int)clock_interval_ms(start_clock_gpu, end_clock_gpu));

    // release resources
    for (int i = 0; i < nstreams; i++)
        CUDA_SAFE_CALL( cudaStreamDestroy(streams[i]) );
    free(streams); // the handle array itself came from malloc
    CUDA_SAFE_CALL( cudaEventDestroy(start_event) );
    CUDA_SAFE_CALL( cudaEventDestroy(stop_event) );
    CUDA_SAFE_CALL( cudaFreeHost(a) );
    CUDA_SAFE_CALL( cudaFree(d_a) );
    CUDA_SAFE_CALL( cudaFreeHost(out) );
    CUDA_SAFE_CALL( cudaFree(d_o) );

    CUT_EXIT(argc, argv);
    return 0;
}
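// Keyboard shortcuts of the web code viewer (page residue, not part of the program):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -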