📄 ffdecsa.c
字号:
/* FFdecsa -- fast decsa algorithm * * Copyright (C) 2003-2004 fatih89r * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */#include <sys/types.h>#include <string.h>#include <stdio.h>#include <stdlib.h>#include "FFdecsa.h"#ifndef NULL#define NULL 0#endif//#define DEBUG#ifdef DEBUG#define DBG(a) a#else#define DBG(a)#endif//// parallelization stuff, large speed differences are possible// possible choices#define PARALLEL_32_4CHAR 320#define PARALLEL_32_4CHARA 321#define PARALLEL_32_INT 322#define PARALLEL_64_8CHAR 640#define PARALLEL_64_8CHARA 641#define PARALLEL_64_2INT 642#define PARALLEL_64_LONG 643#define PARALLEL_64_MMX 644#define PARALLEL_128_16CHAR 1280#define PARALLEL_128_16CHARA 1281#define PARALLEL_128_4INT 1282#define PARALLEL_128_2LONG 1283#define PARALLEL_128_2MMX 1284#define PARALLEL_128_SSE 1285#define PARALLEL_128_SSE2 1286//////// our choice //////////////// our choice //////////////// our choice //////////////// our choice ////////#ifndef PARALLEL_MODE#define PARALLEL_MODE PARALLEL_32_INT#endif//////// our choice //////////////// our choice //////////////// our choice //////////////// our choice ////////#include "parallel_generic.h"//// conditionals#if PARALLEL_MODE==PARALLEL_32_4CHAR#include "parallel_032_4char.h"#elif PARALLEL_MODE==PARALLEL_32_4CHARA#include "parallel_032_4charA.h"#elif PARALLEL_MODE==PARALLEL_32_INT#include "parallel_032_int.h"#elif PARALLEL_MODE==PARALLEL_64_8CHAR#include "parallel_064_8char.h"#elif PARALLEL_MODE==PARALLEL_64_8CHARA#include "parallel_064_8charA.h"#elif PARALLEL_MODE==PARALLEL_64_2INT#include "parallel_064_2int.h"#elif PARALLEL_MODE==PARALLEL_64_LONG#include "parallel_064_long.h"#elif PARALLEL_MODE==PARALLEL_64_MMX#include "parallel_064_mmx.h"#elif PARALLEL_MODE==PARALLEL_128_16CHAR#include "parallel_128_16char.h"#elif PARALLEL_MODE==PARALLEL_128_16CHARA#include "parallel_128_16charA.h"#elif PARALLEL_MODE==PARALLEL_128_4INT#include "parallel_128_4int.h"#elif PARALLEL_MODE==PARALLEL_128_2LONG#include "parallel_128_2long.h"#elif PARALLEL_MODE==PARALLEL_128_2MMX#include "parallel_128_2mmx.h"#elif PARALLEL_MODE==PARALLEL_128_SSE#include "parallel_128_sse.h"#elif PARALLEL_MODE==PARALLEL_128_SSE2#include "parallel_128_sse2.h"#else#error "unknown/undefined parallel mode"#endif// stuff depending on conditionals#define BYTES_PER_GROUP (GROUP_PARALLELISM/8)#define BYPG BYTES_PER_GROUP#define BITS_PER_GROUP GROUP_PARALLELISM#define BIPG BITS_PER_GROUP#ifndef MALLOC#define MALLOC(X) malloc(X)#endif#ifndef FREE#define FREE(X) free(X)#endif#ifndef MEMALIGN#define MEMALIGN#endif//// debug toolstatic void dump_mem(const char *string, const unsigned char *p, int len, int linelen){ int i; for(i=0;i<len;i++){ if(i%linelen==0&&i) fprintf(stderr,"\n"); if(i%linelen==0) fprintf(stderr,"%s %08x:",string,i); else{ if(i%8==0) fprintf(stderr," "); if(i%4==0) fprintf(stderr," "); } fprintf(stderr," %02x",p[i]); } if(i%linelen==0) fprintf(stderr,"\n");}//////////////////////////////////////////////////////////////////////////////////struct csa_key_t{ unsigned char ck[8];// used by stream int iA[8]; // iA[0] is for A1, iA[7] is for A8 int iB[8]; // iB[0] is for B1, iB[7] is for B8// used by stream (group) MEMALIGN group ck_g[8][8]; // [byte][bit:0=LSB,7=MSB] MEMALIGN group iA_g[8][4]; // [0 for A1][0 for LSB] MEMALIGN group iB_g[8][4]; // [0 for B1][0 for LSB]// used by block unsigned char kk[56];// used by block (group) MEMALIGN batch kkmulti[56]; // many times the same byte in every batch};struct csa_keys_t{ struct csa_key_t even; struct csa_key_t odd;};//-----stream cypher//-----key schedule for stream decypherstatic void key_schedule_stream( unsigned char *ck, // [In] ck[0]-ck[7] 8 bytes | Key. int *iA, // [Out] iA[0]-iA[7] 8 nibbles | Key schedule. int *iB) // [Out] iB[0]-iB[7] 8 nibbles | Key schedule.{ iA[0]=(ck[0]>>4)&0xf; iA[1]=(ck[0] )&0xf; iA[2]=(ck[1]>>4)&0xf; iA[3]=(ck[1] )&0xf; iA[4]=(ck[2]>>4)&0xf; iA[5]=(ck[2] )&0xf; iA[6]=(ck[3]>>4)&0xf; iA[7]=(ck[3] )&0xf; iB[0]=(ck[4]>>4)&0xf; iB[1]=(ck[4] )&0xf; iB[2]=(ck[5]>>4)&0xf; iB[3]=(ck[5] )&0xf; iB[4]=(ck[6]>>4)&0xf; iB[5]=(ck[6] )&0xf; iB[6]=(ck[7]>>4)&0xf; iB[7]=(ck[7] )&0xf;}//----- stream main function#define STREAM_INIT#include "stream.c"#undef STREAM_INIT#define STREAM_NORMAL#include "stream.c"#undef STREAM_NORMAL//-----block decypher//-----key schedule for block decypherstatic void key_schedule_block( unsigned char *ck, // [In] ck[0]-ck[7] 8 bytes | Key. unsigned char *kk) // [Out] kk[0]-kk[55] 56 bytes | Key schedule.{ static const unsigned char key_perm[0x40] = { 0x12,0x24,0x09,0x07,0x2A,0x31,0x1D,0x15, 0x1C,0x36,0x3E,0x32,0x13,0x21,0x3B,0x40, 0x18,0x14,0x25,0x27,0x02,0x35,0x1B,0x01, 0x22,0x04,0x0D,0x0E,0x39,0x28,0x1A,0x29, 0x33,0x23,0x34,0x0C,0x16,0x30,0x1E,0x3A, 0x2D,0x1F,0x08,0x19,0x17,0x2F,0x3D,0x11, 0x3C,0x05,0x38,0x2B,0x0B,0x06,0x0A,0x2C, 0x20,0x3F,0x2E,0x0F,0x03,0x26,0x10,0x37, }; int i,j,k; int bit[64]; int newbit[64]; int kb[7][8]; // 56 steps // 56 key bytes kk(55)..kk(0) by key schedule from ck // kb(6,0) .. kb(6,7) = ck(0) .. ck(7) kb[6][0] = ck[0]; kb[6][1] = ck[1]; kb[6][2] = ck[2]; kb[6][3] = ck[3]; kb[6][4] = ck[4]; kb[6][5] = ck[5]; kb[6][6] = ck[6]; kb[6][7] = ck[7]; // calculate kb[5] .. kb[0] for(i=5; i>=0; i--){ // 64 bit perm on kb for(j=0; j<8; j++){ for(k=0; k<8; k++){ bit[j*8+k] = (kb[i+1][j] >> (7-k)) & 1; newbit[key_perm[j*8+k]-1] = bit[j*8+k]; } } for(j=0; j<8; j++){ kb[i][j] = 0; for(k=0; k<8; k++){ kb[i][j] |= newbit[j*8+k] << (7-k); } } } // xor to give kk for(i=0; i<7; i++){ for(j=0; j<8; j++){ kk[i*8+j] = kb[i][j] ^ i; } }}//-----block utilsstatic inline __attribute__((always_inline)) void trasp_N_8 (unsigned char *in,unsigned char* out,int count){ int *ri=(int *)in; int *ibi=(int *)out; int j,i,k,g; // copy and first step for(g=0;g<count;g++){ ri[g]=ibi[2*g]; ri[GROUP_PARALLELISM+g]=ibi[2*g+1]; }//dump_mem("NE1 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);// now 01230123#define INTS_PER_ROW (GROUP_PARALLELISM/8*2) for(j=0;j<8;j+=4){ for(i=0;i<2;i++){ for(k=0;k<INTS_PER_ROW;k++){ unsigned int t,b; t=ri[INTS_PER_ROW*(j+i)+k]; b=ri[INTS_PER_ROW*(j+i+2)+k]; ri[INTS_PER_ROW*(j+i)+k]= (t&0x0000ffff) | ((b )<<16); ri[INTS_PER_ROW*(j+i+2)+k]= ((t )>>16) | (b&0xffff0000) ; } } }//dump_mem("NE2 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);// now 01010101 for(j=0;j<8;j+=2){ for(i=0;i<1;i++){ for(k=0;k<INTS_PER_ROW;k++){ unsigned int t,b; t=ri[INTS_PER_ROW*(j+i)+k]; b=ri[INTS_PER_ROW*(j+i+1)+k]; ri[INTS_PER_ROW*(j+i)+k]= (t&0x00ff00ff) | ((b&0x00ff00ff)<<8); ri[INTS_PER_ROW*(j+i+1)+k]= ((t&0xff00ff00)>>8) | (b&0xff00ff00); } } }//dump_mem("NE3 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);// now 00000000}static inline __attribute__((always_inline)) void trasp_8_N (unsigned char *in,unsigned char* out,int count){ int *ri=(int *)in; int *bdi=(int *)out; int j,i,k,g;#define INTS_PER_ROW (GROUP_PARALLELISM/8*2)//dump_mem("NE1 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);// now 00000000 for(j=0;j<8;j+=2){ for(i=0;i<1;i++){ for(k=0;k<INTS_PER_ROW;k++){ unsigned int t,b; t=ri[INTS_PER_ROW*(j+i)+k]; b=ri[INTS_PER_ROW*(j+i+1)+k]; ri[INTS_PER_ROW*(j+i)+k]= (t&0x00ff00ff) | ((b&0x00ff00ff)<<8); ri[INTS_PER_ROW*(j+i+1)+k]= ((t&0xff00ff00)>>8) | (b&0xff00ff00); } } }//dump_mem("NE2 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);// now 01010101 for(j=0;j<8;j+=4){ for(i=0;i<2;i++){ for(k=0;k<INTS_PER_ROW;k++){ unsigned int t,b; t=ri[INTS_PER_ROW*(j+i)+k]; b=ri[INTS_PER_ROW*(j+i+2)+k]; ri[INTS_PER_ROW*(j+i)+k]= (t&0x0000ffff) | ((b )<<16); ri[INTS_PER_ROW*(j+i+2)+k]= ((t )>>16) | (b&0xffff0000) ; } } }//dump_mem("NE3 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);// now 01230123 for(g=0;g<count;g++){ bdi[2*g]=ri[g]; bdi[2*g+1]=ri[GROUP_PARALLELISM+g]; }}//-----block main function// block groupstatic void block_decypher_group( batch *kkmulti, // [In] kkmulti[0]-kkmulti[55] 56 batches | Key schedule (each batch has repeated equal bytes). unsigned char *ib, // [In] (ib0,ib1,...ib7)...x32 32*8 bytes | Initialization vector. unsigned char *bd, // [Out] (bd0,bd1,...bd7)...x32 32*8 bytes | Block decipher. int count){ // int is faster than unsigned char. apparently not static const unsigned char block_sbox[0x100] = { 0x3A,0xEA,0x68,0xFE,0x33,0xE9,0x88,0x1A, 0x83,0xCF,0xE1,0x7F,0xBA,0xE2,0x38,0x12, 0xE8,0x27,0x61,0x95,0x0C,0x36,0xE5,0x70, 0xA2,0x06,0x82,0x7C,0x17,0xA3,0x26,0x49, 0xBE,0x7A,0x6D,0x47,0xC1,0x51,0x8F,0xF3, 0xCC,0x5B,0x67,0xBD,0xCD,0x18,0x08,0xC9, 0xFF,0x69,0xEF,0x03,0x4E,0x48,0x4A,0x84, 0x3F,0xB4,0x10,0x04,0xDC,0xF5,0x5C,0xC6, 0x16,0xAB,0xAC,0x4C,0xF1,0x6A,0x2F,0x3C, 0x3B,0xD4,0xD5,0x94,0xD0,0xC4,0x63,0x62, 0x71,0xA1,0xF9,0x4F,0x2E,0xAA,0xC5,0x56, 0xE3,0x39,0x93,0xCE,0x65,0x64,0xE4,0x58, 0x6C,0x19,0x42,0x79,0xDD,0xEE,0x96,0xF6, 0x8A,0xEC,0x1E,0x85,0x53,0x45,0xDE,0xBB, 0x7E,0x0A,0x9A,0x13,0x2A,0x9D,0xC2,0x5E, 0x5A,0x1F,0x32,0x35,0x9C,0xA8,0x73,0x30, 0x29,0x3D,0xE7,0x92,0x87,0x1B,0x2B,0x4B, 0xA5,0x57,0x97,0x40,0x15,0xE6,0xBC,0x0E, 0xEB,0xC3,0x34,0x2D,0xB8,0x44,0x25,0xA4, 0x1C,0xC7,0x23,0xED,0x90,0x6E,0x50,0x00, 0x99,0x9E,0x4D,0xD9,0xDA,0x8D,0x6F,0x5F, 0x3E,0xD7,0x21,0x74,0x86,0xDF,0x6B,0x05, 0x8E,0x5D,0x37,0x11,0xD2,0x28,0x75,0xD6, 0xA7,0x77,0x24,0xBF,0xF0,0xB0,0x02,0xB7, 0xF8,0xFC,0x81,0x09,0xB1,0x01,0x76,0x91, 0x7D,0x0F,0xC8,0xA0,0xF2,0xCB,0x78,0x60, 0xD1,0xF7,0xE0,0xB5,0x98,0x22,0xB3,0x20, 0x1D,0xA6,0xDB,0x7B,0x59,0x9F,0xAE,0x31, 0xFB,0xD3,0xB6,0xCA,0x43,0x72,0x07,0xF4, 0xD8,0x41,0x14,0x55,0x0D,0x54,0x8B,0xB9, 0xAD,0x46,0x0B,0xAF,0x80,0x52,0x2C,0xFA, 0x8C,0x89,0x66,0xFD,0xB2,0xA9,0x9B,0xC0, }; MEMALIGN unsigned char r[GROUP_PARALLELISM*(8+56)]; /* 56 because we will move back in memory while looping */ MEMALIGN unsigned char sbox_in[GROUP_PARALLELISM],sbox_out[GROUP_PARALLELISM],perm_out[GROUP_PARALLELISM]; int roff; int i,g,count_all=GROUP_PARALLELISM; roff=GROUP_PARALLELISM*56;#define FASTTRASP1#ifndef FASTTRASP1 for(g=0;g<count;g++){ // Init registers int j; for(j=0;j<8;j++){ r[roff+GROUP_PARALLELISM*j+g]=ib[8*g+j]; } }#else trasp_N_8((unsigned char *)&r[roff],(unsigned char *)ib,count);#endif//dump_mem("OLD r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM); // loop over kk[55]..kk[0] for(i=55;i>=0;i--){ { MEMALIGN batch tkkmulti=kkmulti[i]; batch *si=(batch *)sbox_in; batch *r6_N=(batch *)(r+roff+GROUP_PARALLELISM*6); for(g=0;g<count_all/BYTES_PER_BATCH;g++){ si[g]=B_FFXOR(tkkmulti,r6_N[g]); //FIXME: introduce FASTBATCH? } } // table lookup, this works on only one byte at a time // most difficult part of all // - can't be parallelized // - can't be synthetized through boolean terms (8 input bits are too many) for(g=0;g<count_all;g++){ sbox_out[g]=block_sbox[sbox_in[g]]; } // bit permutation { unsigned char *po=(unsigned char *)perm_out; unsigned char *so=(unsigned char *)sbox_out;//dump_mem("pre perm ",(unsigned char *)so,GROUP_PARALLELISM,GROUP_PARALLELISM); for(g=0;g<count_all;g+=BYTES_PER_BATCH){ MEMALIGN batch in,out; in=*(batch *)&so[g]; out=B_FFOR( B_FFOR( B_FFOR( B_FFOR( B_FFOR( B_FFSH8L(B_FFAND(in,B_FFN_ALL_29()),1), B_FFSH8L(B_FFAND(in,B_FFN_ALL_02()),6)), B_FFSH8L(B_FFAND(in,B_FFN_ALL_04()),3)), B_FFSH8R(B_FFAND(in,B_FFN_ALL_10()),2)), B_FFSH8R(B_FFAND(in,B_FFN_ALL_40()),6)), B_FFSH8R(B_FFAND(in,B_FFN_ALL_80()),4)); *(batch *)&po[g]=out; }//dump_mem("post perm",(unsigned char *)po,GROUP_PARALLELISM,GROUP_PARALLELISM); } roff-=GROUP_PARALLELISM; /* virtual shift of registers */#if 0/* one by one */ for(g=0;g<count_all;g++){ r[roff+GROUP_PARALLELISM*0+g]=r[roff+GROUP_PARALLELISM*8+g]^sbox_out[g]; r[roff+GROUP_PARALLELISM*6+g]^=perm_out[g]; r[roff+GROUP_PARALLELISM*4+g]^=r[roff+GROUP_PARALLELISM*0+g]; r[roff+GROUP_PARALLELISM*3+g]^=r[roff+GROUP_PARALLELISM*0+g]; r[roff+GROUP_PARALLELISM*2+g]^=r[roff+GROUP_PARALLELISM*0+g]; }#else for(g=0;g<count_all;g+=BEST_SPAN){ XOR_BEST_BY(&r[roff+GROUP_PARALLELISM*0+g],&r[roff+GROUP_PARALLELISM*8+g],&sbox_out[g]); XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*6+g],&perm_out[g]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -