📄 cusvd.cu
字号:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cutil.h>
#define NUM (512) //NUM must be a multiple of 16 to obtain the best performance
#include "cuSVD_kernel.cu"
// qsort() comparator: orders floats from largest to smallest.
static int compareFloatDescending(const void* lhs, const void* rhs)
{
    const float a = *(const float*)lhs;
    const float b = *(const float*)rhs;
    return (a < b) - (a > b);
}

/*
 * Host driver for the bjrot kernel (defined in cuSVD_kernel.cu).
 *
 * Steps:
 *   1. Fill a NUM x NUM matrix with pseudo-random values in [0, 1] and build
 *      a NUM x NUM identity matrix.
 *   2. Upload both, together with a NUM-entry index table, to the device.
 *   3. Launch bjrot 2 * (NUM - 1) * 2 times, ping-ponging between the
 *      (d_w, d_u) and (d_w_temp, d_u_temp) buffer pairs.
 *   4. Download the results, compute the Euclidean norm of every row of w,
 *      and print the norms in descending order followed by the elapsed
 *      kernel time in milliseconds.
 *
 * NOTE(review): the row norms are presumably the singular values produced by
 * a one-sided Jacobi SVD -- confirm against cuSVD_kernel.cu.
 *
 * Returns EXIT_FAILURE if host memory cannot be allocated; otherwise ends
 * through CUT_EXIT.
 */
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT();

    const unsigned int num2 = NUM * NUM;
    const unsigned int iteration = 2 * (NUM - 1);
    printf("\n");

    // Launch configuration: NUM/2 blocks of 256 threads each.
    dim3 grid(NUM >> 1, 1, 1);
    dim3 threads(256, 1, 1);

    float* w;
    float* u;
    float* orign;
    float* unit;
    float* d_w;
    float* d_u;
    float* d_w_temp;
    float* d_u_temp;
    float* d_index;

    // Index table handed to bjrot: odd slots hold i-2, even slots hold i+2,
    // then three boundary entries are overwritten.
    // NOTE(review): presumably the column re-ordering applied between
    // sweeps -- confirm against the kernel.
    float index[NUM];
    for (int i = 0; i < NUM; i++)
    {
        if (i % 2)
        {
            index[i] = i - 2.0f;
        }
        else
        {
            index[i] = i + 2.0f;
        }
    }
    index[0] = 0.0f;
    index[1] = 2.0f;
    index[NUM - 2] = NUM - 1;

    w = (float*)malloc(num2 * sizeof(float));
    u = (float*)malloc(num2 * sizeof(float));
    orign = (float*)malloc(num2 * sizeof(float));
    unit = (float*)malloc(num2 * sizeof(float));
    if (w == NULL || u == NULL || orign == NULL || unit == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(w);
        free(u);
        free(orign);
        free(unit);
        return EXIT_FAILURE;
    }

    // The input matrix is generated with rand(); the original file-loading
    // path was disabled, so no file is opened here.
    for (int i = 0; i < NUM; i++)
    {
        for (int j = 0; j < NUM; j++)
        {
            orign[i * NUM + j] = (float)rand() / (float)RAND_MAX;
            w[i * NUM + j] = unit[i * NUM + j] = 0.0f;
        }
        unit[i * NUM + i] = 1.0f;
    }

    unsigned int timer = 0;
    float elapsedTimeInMs = 0.0f;

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_w, sizeof(float) * num2));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_u, sizeof(float) * num2));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_w_temp, sizeof(float) * num2));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_u_temp, sizeof(float) * num2));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_index, sizeof(float) * NUM));

    CUDA_SAFE_CALL(cudaMemcpy(d_index, index, sizeof(float) * NUM, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_u, unit, sizeof(float) * num2, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_w, orign, sizeof(float) * num2, cudaMemcpyHostToDevice));

    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));
    for (unsigned int i = 0; i < iteration; i++)
    {
        // Each pass writes into the buffer pair the next pass reads from.
        bjrot<<<grid, threads, 0>>>(d_w_temp, d_w, d_u_temp, d_u, d_index);
        bjrot<<<grid, threads, 0>>>(d_w, d_w_temp, d_u, d_u_temp, d_index);
    }
    // Kernel launches return immediately; surface launch errors and wait for
    // the device to finish so the timer covers execution, not just launch.
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_SAFE_CALL(cutStopTimer(timer));
    elapsedTimeInMs = cutGetTimerValue(timer);
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    // u is downloaded alongside w but is not inspected further here.
    CUDA_SAFE_CALL(cudaMemcpy(u, d_u, sizeof(float) * num2, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(w, d_w, sizeof(float) * num2, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_w));
    CUDA_SAFE_CALL(cudaFree(d_u));
    CUDA_SAFE_CALL(cudaFree(d_w_temp));
    CUDA_SAFE_CALL(cudaFree(d_u_temp));
    CUDA_SAFE_CALL(cudaFree(d_index));

    // Euclidean norm of every row of w.
    float wi[NUM];
    for (int i = 0; i < NUM; i++)
    {
        float sumOfSquares = 0.0f;
        for (int j = 0; j < NUM; j++)
        {
            sumOfSquares += w[i * NUM + j] * w[i * NUM + j];
        }
        wi[i] = sqrtf(sumOfSquares);
    }

    // Largest norm first.
    qsort(wi, NUM, sizeof(float), compareFloatDescending);

    for (int i = 0; i < NUM; i++)
        printf("%f ", wi[i]);
    printf("\n");

    free(w);
    free(u);
    free(orign);
    free(unit);

    printf("\n");
    printf("%f", elapsedTimeInMs);
    CUT_EXIT(argc, argv);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -