📄 backup_simplestreams.cu
字号:
/*
* This sample illustrates the usage of CUDA streams for overlapping
* kernel execution with device/host memcopies. The kernel is used to
* initialize an array to a specific value, after which the array is
* copied to the host (CPU) memory. To increase performance, multiple
* kernel/memcopy pairs are launched asynchronously, each pair in its
* own stream. Devices with Compute Capability 1.1 can overlap a kernel
* and a memcopy as long as they are issued in different streams. Kernels
* are serialized. Thus, if n pairs are launched, streamed approach
* can reduce the memcopy cost to the (1/n)th of a single copy of the entire
* data set.
*
* Additionally, this sample uses CUDA events to measure elapsed time for
* CUDA calls. Events are a part of CUDA API and provide a system independent
* way to measure execution times on CUDA devices with approximately 0.5
* microsecond precision.
*
* Elapsed times are averaged over nreps repetitions (10 by default).
*
*/
#include <stdio.h>
#include <cutil.h>
///////////// CONFIGURATION
#ifdef __DEVICE_EMULATION__
#define NUM 2048 // reduced workload for emulation (n should be divisible by 512*nstreams)
#define THREAD_NUM 32
#define THREAD_BLK_NUM 8
#define THREAD_BLK_NUM_LOG2 3
#else
#define NUM (1 * 1024 * 1024)
#define THREAD_NUM 32
#define THREAD_BLK_NUM 8
#define THREAD_BLK_NUM_LOG2 3
#endif
#define REPEAT_TIME 100
#define SHARED_MEM_SIZE 1024
__device__ __constant__ unsigned char iperm[16][16][8],fperm[16][16][8]; /* inital and final permutations*/
__device__ __constant__ unsigned char s[4][4096]; /* S1 thru S8 precomputed */
__device__ __constant__ unsigned char p32[4][256][4]; /* for permuting 32-bit f output*/
__device__ __constant__ unsigned char kn[16][6]; /* key selections */
unsigned char iperm_host[16][16][8],fperm_host[16][16][8]; /* inital and final permutations*/
unsigned char s_host[4][4096]; /* S1 thru S8 precomputed */
unsigned char p32_host[4][256][4]; /* for permuting 32-bit f output*/
unsigned char kn_host[16][6]; /* key selections */
/* End of DES algorithm (except for calling desinit below) */
__device__ void permute(unsigned char *inblock,unsigned char perm[16][16][8],unsigned char *outblock) /* permute inblock with perm */
{ register int i,j;
register unsigned char *ib, *ob; /* ptr to input or output block */
register unsigned char *p, *q;
for (i=0, ob = outblock; i<8; i++)
*ob++ = 0; /* clear output block */
ib = inblock;
for (j = 0; j < 16; j += 2, ib++) /* for each input nibble */
{ ob = outblock;
p = perm[j][(*ib >> 4) & 017];
q = perm[j + 1][*ib & 017];
for (i = 0; i < 8; i++) /* and each output byte */
*ob++ |= *p++ | *q++; /* OR the masks together*/
}
}
unsigned char ip[] /* initial permutation P */
= { 58, 50, 42, 34, 26, 18, 10, 2,
60, 52, 44, 36, 28, 20, 12, 4,
62, 54, 46, 38, 30, 22, 14, 6,
64, 56, 48, 40, 32, 24, 16, 8,
57, 49, 41, 33, 25, 17, 9, 1,
59, 51, 43, 35, 27, 19, 11, 3,
61, 53, 45, 37, 29, 21, 13, 5,
63, 55, 47, 39, 31, 23, 15, 7 };
unsigned char fp[] /* final permutation F */
= { 40, 8, 48, 16, 56, 24, 64, 32,
39, 7, 47, 15, 55, 23, 63, 31,
38, 6, 46, 14, 54, 22, 62, 30,
37, 5, 45, 13, 53, 21, 61, 29,
36, 4, 44, 12, 52, 20, 60, 28,
35, 3, 43, 11, 51, 19, 59, 27,
34, 2, 42, 10, 50, 18, 58, 26,
33, 1, 41, 9, 49, 17, 57, 25 };
/* expansion operation matrix */ /* rwo: unused */
/* unsigned char ei[] = { 32, 1, 2, 3, 4, 5,
4, 5, 6, 7, 8, 9,
8, 9, 10, 11, 12, 13,
12, 13, 14, 15, 16, 17,
16, 17, 18, 19, 20, 21,
20, 21, 22, 23, 24, 25,
24, 25, 26, 27, 28, 29,
28, 29, 30, 31, 32, 1 }; */
unsigned char pc1[] /* permuted choice table (key) */
= { 57, 49, 41, 33, 25, 17, 9,
1, 58, 50, 42, 34, 26, 18,
10, 2, 59, 51, 43, 35, 27,
19, 11, 3, 60, 52, 44, 36,
63, 55, 47, 39, 31, 23, 15,
7, 62, 54, 46, 38, 30, 22,
14, 6, 61, 53, 45, 37, 29,
21, 13, 5, 28, 20, 12, 4 };
__device__ unsigned char totrot[] /* number left rotations of pc1 */
= { 1,2,4,6,8,10,12,14,15,17,19,21,23,25,27,28 };
__device__ unsigned char pc1m[56]; /* place to modify pc1 into */
__device__ unsigned char pcr[56]; /* place to rotate pc1 into */
__device__ unsigned char pc2[] /* permuted choice key (table) */
= { 14, 17, 11, 24, 1, 5,
3, 28, 15, 6, 21, 10,
23, 19, 12, 4, 26, 8,
16, 7, 27, 20, 13, 2,
41, 52, 31, 37, 47, 55,
30, 40, 51, 45, 33, 48,
44, 49, 39, 56, 34, 53,
46, 42, 50, 36, 29, 32 };
__device__ unsigned char si[8][64] /* 48->32 bit compression tables*/
= { /* S[1] */
14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7,
0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8,
4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0,
15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13,
/* S[2] */
15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10,
3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5,
0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15,
13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9,
/* S[3] */
10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8,
13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1,
13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7,
1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12,
/* S[4] */
7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15,
13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9,
10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4,
3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14,
/* S[5] */
2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9,
14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6,
4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14,
11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3,
/* S[6] */
12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11,
10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8,
9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6,
4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13,
/* S[7] */
4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1,
13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6,
1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2,
6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12,
/* S[8] */
13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7,
1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2,
7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8,
2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11 };
__device__ unsigned char p32i[] /* 32-bit permutation function */
= { 16, 7, 20, 21,
29, 12, 28, 17,
1, 15, 23, 26,
5, 18, 31, 10,
2, 8, 24, 14,
32, 27, 3, 9,
19, 13, 30, 6,
22, 11, 4, 25 };
__device__ int bytebit[] /* bit 0 is left-most in byte */
= { 0200,0100,040,020,010,04,02,01 };
__device__ int nibblebit[] = { 010,04,02,01 };
__host__ void perminit(unsigned char perm[16][16][8],unsigned char p[64]) /* initialize a perm array */
{ register int l, j, k;
int i,m;
for (i=0; i<16; i++) /* each input nibble position */
for (j=0; j<16; j++) /* all possible input nibbles */
for (k=0; k<8; k++) /* each byte of the mask */
perm[i][j][k]=0;/* clear permutation array */
for (i=0; i<16; i++) /* each input nibble position */
for (j = 0; j < 16; j++)/* each possible input nibble */
for (k = 0; k < 64; k++)/* each output bit position */
{ l = p[k] - 1; /* where does this bit come from*/
if ((l >> 2) != i) /* does it come from input posn?*/
continue; /* if not, bit k is 0 */
if (!(j & nibblebit[l & 3]))
continue; /* any such bit in input? */
m = k & 07; /* which bit is this in the byte*/
perm[i][j][k>>3] |= bytebit[m];
}
}
__host__ int getcomp(int k,int v) /* 1 compression value for sinit*/
{ register int i,j; /* correspond to i and j in FIPS*/
i=((v&040)>>4)|(v&1); /* first and last bits make row */
j=(v&037)>>1; /* middle 4 bits are column */
return (int) si[k][(i<<4)+j]; /* result is ith row, jth col */
}
__host__ void sinit() /* initialize s1-s8 arrays */
{ register int i,j;
for (i=0; i<4; i++) /* each 12-bit position */
for (j=0; j<4096; j++) /* each possible 12-bit value */
s_host[i][j]=(getcomp(i*2,j>>6)<<4) |
(017&getcomp(i*2+1,j&077));
/* store 2 compressions per unsigned char*/
}
__host__ void kinit(unsigned char *key) /* initialize key schedule array; 64 bits (will use only 56) */
{ register int i,j,l;
int m;
for (j=0; j<56; j++) /* convert pc1 to bits of key */
{ l=pc1[j]-1; /* integer bit location */
m = l & 07; /* find bit */
pc1m[j]=(key[l>>3] & /* find which key byte l is in */
bytebit[m]) /* and which bit of that byte */
? 1 : 0; /* and store 1-bit result */
}
for (i=0; i<16; i++) /* for each key sched section */
for (j=0; j<6; j++) /* and each byte of the kn */
kn_host[i][j]=0; /* clear it for accumulation */
for (i=0; i<16; i++) /* key chunk for each iteration */
{ for (j=0; j<56; j++) /* rotate pc1 the right amount */
pcr[j] = pc1m[(l=j+totrot[i])<(j<28? 28 : 56) ? l: l-28];
/* rotate left and right halves independently */
for (j=0; j<48; j++) /* select bits individually */
if (pcr[pc2[j]-1]) /* check bit that goes to kn[j] */
{ l= j & 07;
kn_host[i][j>>3] |= bytebit[l];
} /* mask it in if it's there */
}
}
__host__ void p32init() /* initialize 32-bit permutation*/
{ register int l, j, k;
int i,m;
for (i=0; i<4; i++) /* each input byte position */
for (j=0; j<256; j++) /* all possible input bytes */
for (k=0; k<4; k++) /* each byte of the mask */
p32_host[i][j][k]=0; /* clear permutation array */
for (i=0; i<4; i++) /* each input byte position */
for (j=0; j<256; j++) /* each possible input byte */
for (k=0; k<32; k++) /* each output bit position */
{ l=p32i[k]-1; /* invert this bit (0-31) */
if ((l>>3)!=i) /* does it come from input posn?*/
continue; /* if not, bit k is 0 */
if (!(j&bytebit[l&07]))
continue; /* any such bit in input? */
m = k & 07; /* which bit is it? */
p32_host[i][j][k>>3] |= bytebit[m];
}
}
__host__ void desinit(unsigned char *key) /* initialize all des arrays */
{
perminit(iperm_host,ip); /* initial permutation */
perminit(fperm_host,fp); /* final permutation */
kinit(key); /* key schedule */
sinit(); /* compression functions */
p32init(); /* 32-bit permutation in f */
}
__device__ void expand(unsigned char *right,unsigned char *bigright) /* 32 to 48 bits with E oper */
{
register unsigned char *bb, *r, r0, r1, r2, r3;
bb = bigright;
r = right; r0 = *r++; r1 = *r++; r2 = *r++; r3 = *r++;
*bb++ = ((r3 & 0001) << 7) | /* 32 */
((r0 & 0370) >> 1) | /* 1 2 3 4 5 */
((r0 & 0030) >> 3); /* 4 5 */
*bb++ = ((r0 & 0007) << 5) | /* 6 7 8 */
((r1 & 0200) >> 3) | /* 9 */
((r0 & 0001) << 3) | /* 8 */
((r1 & 0340) >> 5); /* 9 10 11 */
*bb++ = ((r1 & 0030) << 3) | /* 12 13 */
((r1 & 0037) << 1) | /* 12 13 14 15 16 */
((r2 & 0200) >> 7); /* 17 */
*bb++ = ((r1 & 0001) << 7) | /* 16 */
((r2 & 0370) >> 1) | /* 17 18 19 20 21 */
((r2 & 0030) >> 3); /* 20 21 */
*bb++ = ((r2 & 0007) << 5) | /* 22 23 24 */
((r3 & 0200) >> 3) | /* 25 */
((r2 & 0001) << 3) | /* 24 */
((r3 & 0340) >> 5); /* 25 26 27 */
*bb++ = ((r3 & 0030) << 3) | /* 28 29 */
((r3 & 0037) << 1) | /* 28 29 30 31 32 */
((r0 & 0200) >> 7); /* 1 */
}
__device__ void contract(unsigned char *in48,unsigned char *out32) /* contract f from 48 to 32 bits*/
{ register unsigned char *c;
register unsigned char *i;
register int i0, i1, i2, i3, i4, i5;
i = in48;
i0 = *i++; i1 = *i++; i2 = *i++; i3 = *i++; i4 = *i++; i5 = *i++;
c = out32; /* do output a byte at a time */
*c++ = s[0][07777 & ((i0 << 4) | ((i1 >> 4) & 017 ))];
*c++ = s[1][07777 & ((i1 << 8) | ( i2 & 0377 ))];
*c++ = s[2][07777 & ((i3 << 4) | ((i4 >> 4) & 017 ))];
*c++ = s[3][07777 & ((i4 << 8) | ( i5 & 0377 ))];
}
__device__ void perm32(unsigned char *inblock,unsigned char *outblock) /* 32-bit permutation at end */
{ register int j;
/* register int i; */ /* rwo: unused */
register unsigned char *ib, *ob;
register unsigned char *q;
ob = outblock; /* clear output block */
*ob++ = 0; *ob++ = 0; *ob++ = 0; *ob++ = 0;
ib=inblock; /* ptr to 1st byte of input */
for (j=0; j<4; j++, ib++) /* for each input byte */
{ q = p32[j][*ib & 0377];
ob = outblock; /* and each output byte */
*ob++ |= *q++; /* OR the 16 masks together */
*ob++ |= *q++;
*ob++ |= *q++;
*ob++ |= *q++;
}
}
__device__ void f(unsigned char *right, int num, unsigned char *fret) /* critical cryptographic trans; index number of this iter */
{
register unsigned char *kb, *rb, *bb; /* ptr to key selection &c */
unsigned char bigright[6]; /* right expanded to 48 bits */
unsigned char result[6]; /* expand(R) XOR keyselect[num] */
unsigned char preout[4]; /* result of 32-bit permutation */
kb = kn[num]; /* fast version of iteration */
bb = bigright;
rb = result;
expand(right,bb); /* expand to 48 bits */
*rb++ = *bb++ ^ *kb++; /* expanded R XOR chunk of key */
*rb++ = *bb++ ^ *kb++;
*rb++ = *bb++ ^ *kb++;
*rb++ = *bb++ ^ *kb++;
*rb++ = *bb++ ^ *kb++;
*rb++ = *bb++ ^ *kb++;
contract(result,preout); /* use S fns to get 32 bits */
perm32(preout,fret); /* and do final 32-bit perm */
}
__device__ void iter(int num, unsigned char *inblock, unsigned char *outblock) /* 1 churning operation; 64 bits each */
{
unsigned char fret[4]; /* return from f(R[i-1],key) */
register unsigned char *ib, *ob, *fb;
ob = outblock; ib = &inblock[4];
f(ib, num, fret); /* the primary transformation */
*ob++ = *ib++; /* L[i] = R[i-1] */
*ob++ = *ib++;
*ob++ = *ib++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -