/*
* MIRACL routines for arithmetic over GF(2^m),
* mrgf2m.c
*
* For algorithms used, see IEEE P1363 Standard, Appendix A
* unless otherwise stated.
*
* The time-critical routines are the multiplication routine multiply2()
* and (for AFFINE co-ordinates) the modular inverse routine inverse2()
* and the routines it calls.
*
* READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
*
* No assembly language used.
*
* Use utility irp.cpp to generate optimal code for function reduce2(.) below
*
* Space can be saved by removing unneeded functions and
* deleting unrequired functionality
*
* Copyright (c) 2000-2007 Shamus Software Ltd.
*/
#include <stdlib.h>
#include "miracl.h"
#ifdef MR_STATIC
#include <string.h>
#endif
#ifndef MR_NOFULLWIDTH
/* This does not make sense using floating-point! */
#define M1 (MIRACL-1)
#define M2 (MIRACL-2)
#define M3 (MIRACL-3)
#define M4 (MIRACL-4)
#define M5 (MIRACL-5)
#define M6 (MIRACL-6)
#define M7 (MIRACL-7)
#define M8 (MIRACL-8)

#define TOPBIT ((mr_small)1<<M1)
#define SECBIT ((mr_small)1<<M2)
#define THDBIT ((mr_small)1<<M3)
#define FRHBIT ((mr_small)1<<M4)
#define FIFBIT ((mr_small)1<<M5)
#define SIXBIT ((mr_small)1<<M6)
#define SEVBIT ((mr_small)1<<M7)
#include <emmintrin.h> /* SSE2 intrinsics are required by the mr_mul2()/MR_mul2() routines below */

/* The carry-free multiplication routine mr_mul2() is extremely time-critical, and expensive */
/*
#include <emmintrin.h>
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
__m64 tt[4],xg,rg;
mr_small q;
tt[0]=_m_from_int(0);
tt[1]=_m_from_int(a);
tt[2]=_m_psllqi(tt[1],1);
tt[3]=_m_pxor(tt[1],tt[2]);
rg=tt[b&3];
xg=tt[(b>>2)&3]; rg=_m_pxor(rg,_m_psllqi(xg,2));
xg=tt[(b>>4)&3]; rg=_m_pxor(rg,_m_psllqi(xg,4));
xg=tt[(b>>6)&3]; rg=_m_pxor(rg,_m_psllqi(xg,6));
xg=tt[(b>>8)&3]; rg=_m_pxor(rg,_m_psllqi(xg,8));
xg=tt[(b>>10)&3]; rg=_m_pxor(rg,_m_psllqi(xg,10));
xg=tt[(b>>12)&3]; rg=_m_pxor(rg,_m_psllqi(xg,12));
xg=tt[(b>>14)&3]; rg=_m_pxor(rg,_m_psllqi(xg,14));
xg=tt[(b>>16)&3]; rg=_m_pxor(rg,_m_psllqi(xg,16));
xg=tt[(b>>18)&3]; rg=_m_pxor(rg,_m_psllqi(xg,18));
xg=tt[(b>>20)&3]; rg=_m_pxor(rg,_m_psllqi(xg,20));
xg=tt[(b>>22)&3]; rg=_m_pxor(rg,_m_psllqi(xg,22));
xg=tt[(b>>24)&3]; rg=_m_pxor(rg,_m_psllqi(xg,24));
xg=tt[(b>>26)&3]; rg=_m_pxor(rg,_m_psllqi(xg,26));
xg=tt[(b>>28)&3]; rg=_m_pxor(rg,_m_psllqi(xg,28));
xg=tt[(b>>30)]; rg=_m_pxor(rg,_m_psllqi(xg,30));
*r=_m_to_int(rg);
q=_m_to_int(_m_psrlqi(rg,32));
_m_empty();
return q;
}
*/
/* This might be faster on a 16-bit processor with no variable shift instructions.
The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15); is just a 1-bit right shift on
the hi|lo value - should be really fast in assembly language
unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
{
unsigned short lo,hi,bit,w;
hi=0;
lo=x;
bit=-(lo&1);
lo>>=1;
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
*r=lo;
return hi;
}
*/
/* This might be faster on an 8-bit processor with no variable shift instructions.
The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7); is just a 1-bit right shift on
the hi|lo value - should be really fast in assembly language
unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
{
unsigned char lo,hi,bit,w;
hi=0;
lo=x;
bit=-(lo&1);
lo>>=1;
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
*r=lo;
return hi;
}
*/
/* Wouldn't it be nice if instruction sets supported a
   one-cycle "carry-free" multiplication instruction...
   The SmartMIPS architecture does - it's called maddp */
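/* On x86/x64 processors with the CLMUL extension such an instruction now exists
   (PCLMULQDQ). The sketch below - NOT part of MIRACL, just an illustration - shows
   how a 64x64-bit carry-free multiplication could be done with the
   _mm_clmulepi64_si128() intrinsic from <wmmintrin.h>. It assumes mr_small is a
   64-bit type and that the compiler is given PCLMUL support (e.g. -mpclmul)

#include <wmmintrin.h>
static mr_small mr_mul2_clmul(mr_small a,mr_small b,mr_small *r)
{
    mr_small lohi[2];
    __m128i aa,bb,pp;
    aa=_mm_loadl_epi64((__m128i *)&a);
    bb=_mm_loadl_epi64((__m128i *)&b);
    pp=_mm_clmulepi64_si128(aa,bb,0);
    _mm_storeu_si128((__m128i *)lohi,pp);
    *r=lohi[0];
    return lohi[1];
}
*/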
#ifndef MR_COMBA2
#if MIRACL==8
/* maybe use a small precomputed look-up table? */
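/* look[(x<<4)|y] holds the 8-bit carry-free product of the two 4-bit nibbles
   x and y; the full 8x8-bit product is then assembled schoolbook-fashion from
   the four nibble sub-products, with XOR in place of addition */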
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
static const mr_small look[256]=
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
0,3,6,5,12,15,10,9,24,27,30,29,20,23,18,17,
0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,
0,5,10,15,20,17,30,27,40,45,34,39,60,57,54,51,
0,6,12,10,24,30,20,18,48,54,60,58,40,46,36,34,
0,7,14,9,28,27,18,21,56,63,54,49,36,35,42,45,
0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,
0,9,18,27,36,45,54,63,72,65,90,83,108,101,126,119,
0,10,20,30,40,34,60,54,80,90,68,78,120,114,108,102,
0,11,22,29,44,39,58,49,88,83,78,69,116,127,98,105,
0,12,24,20,48,60,40,36,96,108,120,116,80,92,72,68,
0,13,26,23,52,57,46,35,104,101,114,127,92,81,70,75,
0,14,28,18,56,54,36,42,112,126,108,98,72,70,84,90,
0,15,30,17,60,51,34,45,120,119,102,105,68,75,90,85
};
mr_small x0,x1,y0,y1,m,p,q;
x0=a&0x0f;
x1=a&0xf0;
y0=b&0x0f;
y1=b&0xf0;
p=look[((x0<<4)|y0)];
m=look[(x1|y0)]^look[(y1|x0)];
q=look[(x1|(y1>>4))];
p^=(m<<4);
q^=(m>>4);
*r=p;
return q;
}
#else
int counter=0;
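/* Carry-free multiplication of two full 128-bit operands using SSE2 only.
   A 16-entry table tt[] of carry-free multiples of a is built first and b is
   then consumed four bits at a time, accumulating the 256-bit product into
   pp (low 128 bits, returned via *r) and qq (high 128 bits, the return value).
   The masks tb0..tb6 fold in the contribution of the top bits of a, which
   cannot be pre-shifted into the table without overflowing the registers */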
static __m128i MR_mul2(__m128i a,__m128i b, __m128i *r)
{
__m128i pp,qq,xx,tt[16],tb0,tb1,tb2,tb3,tb4,tb5,tb6,kb,m,ff;
int i,j;
kb=b;
m=_mm_set_epi32(0x1FFF,0xFFFF,0xFFFF,0xFFFF);
tt[0]=_mm_setzero_si128();
tt[1]=_mm_xor_si128(a,m); /* clear top 3 bits */
m=_mm_set_epi32(0,0,0xf0<<24,0);
tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
tt[3]=_mm_xor_si128(tt[1],tt[2]);
tt[5]=_mm_xor_si128(tt[1],tt[4]);
tt[6]=_mm_xor_si128(tt[2],tt[4]);
tt[7]=_mm_xor_si128(tt[6],tt[1]);
tt[9]=_mm_xor_si128(tt[8],tt[1]);
tt[10]=_mm_xor_si128(tt[8],tt[2]);
tt[11]=_mm_xor_si128(tt[10],tt[1]);
tt[12]=_mm_xor_si128(tt[8],tt[4]);
tt[13]=_mm_xor_si128(tt[12],tt[1]);
tt[14]=_mm_xor_si128(tt[8],tt[6]);
tt[15]=_mm_xor_si128(tt[14],tt[1]);
m=_mm_set_epi32(0x8000,0,0,0);
tb0=_mm_and_si128(m,a);
tb0=_mm_xor_si128(tb0,_mm_srli_si128(tb0,4));
tb0=_mm_xor_si128(tb0,_mm_srli_si128(tb0,8));
tb0=_mm_srai_epi32(tb0,31);
tb1=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,1),a),1);
tb1=_mm_xor_si128(tb1,_mm_srli_si128(tb1,4));
tb1=_mm_xor_si128(tb1,_mm_srli_si128(tb1,8));
tb1=_mm_srai_epi32(tb1,31);
tb2=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,2),a),2);
tb2=_mm_xor_si128(tb2,_mm_srli_si128(tb2,4));
tb2=_mm_xor_si128(tb2,_mm_srli_si128(tb2,8));
tb2=_mm_srai_epi32(tb2,31);
tb3=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,3),a),3);
tb3=_mm_xor_si128(tb3,_mm_srli_si128(tb3,4));
tb3=_mm_xor_si128(tb3,_mm_srli_si128(tb3,8));
tb3=_mm_srai_epi32(tb3,31);
tb4=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,4),a),4);
tb4=_mm_xor_si128(tb4,_mm_srli_si128(tb4,4));
tb4=_mm_xor_si128(tb4,_mm_srli_si128(tb4,8));
tb4=_mm_srai_epi32(tb4,31);
tb5=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,5),a),5);
tb5=_mm_xor_si128(tb5,_mm_srli_si128(tb5,4));
tb5=_mm_xor_si128(tb5,_mm_srli_si128(tb5,8));
tb5=_mm_srai_epi32(tb5,31);
tb6=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,6),a),6);
tb6=_mm_xor_si128(tb6,_mm_srli_si128(tb6,4));
tb6=_mm_xor_si128(tb6,_mm_srli_si128(tb6,8));
tb6=_mm_srai_epi32(tb6,31);
ff=_mm_set_epi32(0,0,0,0xff);
i=_mm_cvtsi128_si32(_mm_and_si128(b,ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,4),ff));
pp=qq=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,8),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,12),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,1)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,16),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,20),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,2)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,24),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,28),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,3)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,32),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,36),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,4)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,40),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,44),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,5)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,48),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,52),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,6)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,56),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,60),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,7)); qq=_mm_srli_si128(qq,1);
b=_mm_srli_si128(b,8);
i=_mm_cvtsi128_si32(_mm_and_si128(b,ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,4),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,8)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,8),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,12),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,9)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,16),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,20),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,10)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,24),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,28),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,11)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,32),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,36),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,12)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,40),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,44),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,13)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,48),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,52),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,14)); qq=_mm_srli_si128(qq,1);
i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,56),ff));
j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,60),ff));
xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));
qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,15)); qq=_mm_srli_si128(qq,1);
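/* finally fold in the contributions from the top bits of a via the tb0..tb6 masks */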
m=_mm_set_epi32(0,0xFF,0,0);
pp=_mm_xor_si128(pp, _mm_and_si128(tb0,_mm_slli_epi64(_mm_slli_si128(kb,15),7) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb0,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),7) )));
pp=_mm_xor_si128(pp, _mm_and_si128(tb1,_mm_slli_epi64(_mm_slli_si128(kb,15),6) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb1,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),6) )));
pp=_mm_xor_si128(pp, _mm_and_si128(tb2,_mm_slli_epi64(_mm_slli_si128(kb,15),5) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb2,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),5) )));
pp=_mm_xor_si128(pp, _mm_and_si128(tb3,_mm_slli_epi64(_mm_slli_si128(kb,15),4) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb3,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),4) )));
pp=_mm_xor_si128(pp, _mm_and_si128(tb4,_mm_slli_epi64(_mm_slli_si128(kb,15),3) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb4,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),3) )));
pp=_mm_xor_si128(pp, _mm_and_si128(tb5,_mm_slli_epi64(_mm_slli_si128(kb,15),2) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb5,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),2) )));
pp=_mm_xor_si128(pp, _mm_and_si128(tb6,_mm_slli_epi64(_mm_slli_si128(kb,15),1) ));
qq=_mm_xor_si128(qq,_mm_and_si128(tb6,_mm_or_si128(_mm_srli_epi64(kb,1), _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),1) )));
*r=pp;
return qq;
}
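/* Single-width carry-free multiplication of two mr_small words (assumes a
   64-bit word) using SSE2. The 128-bit registers hold the double-length
   product: a 16-entry table of carry-free multiples of a is built, b is
   consumed four bits at a time, *r receives the low word and the high word
   is returned */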
static inline mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
int i,j;
__m128i pp,tt[16],m;
mr_small xx[2];
m=_mm_set_epi32(0,0,0xf0<<24,0);
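/* m isolates the top nibble of the low 64-bit lane, so that bits shifted out of it can be carried across into the bottom of the high lane */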
tt[0]=_mm_setzero_si128();
tt[1]=_mm_loadl_epi64((__m128i *)&a);
tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
tt[3]=_mm_xor_si128(tt[1],tt[2]);
tt[5]=_mm_xor_si128(tt[1],tt[4]);
tt[6]=_mm_xor_si128(tt[2],tt[4]);
tt[7]=_mm_xor_si128(tt[6],tt[1]);
tt[9]=_mm_xor_si128(tt[8],tt[1]);
tt[10]=_mm_xor_si128(tt[8],tt[2]);
tt[11]=_mm_xor_si128(tt[10],tt[1]);
tt[12]=_mm_xor_si128(tt[8],tt[4]);
tt[13]=_mm_xor_si128(tt[12],tt[1]);
tt[14]=_mm_xor_si128(tt[8],tt[6]);
tt[15]=_mm_xor_si128(tt[14],tt[1]);
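/* scan b four bits at a time, XORing the appropriately shifted table entries into the accumulated product pp */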
i=b&0xF; j=(b>>4)&0xF;
pp=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) );
i=(b>>8)&0xF; j=(b>>12)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128( _mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,1) );
i=(b>>16)&0xF; j=(b>>20)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,2) );
i=(b>>24)&0xF; j=(b>>28)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,3) );
i=(b>>32)&0xF; j=(b>>36)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,4) );
i=(b>>40)&0xF; j=(b>>44)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,5) );
i=(b>>48)&0xF; j=(b>>52)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,6) );
i=(b>>56)&0xF; j=(b>>60);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) ) ,7) );
_mm_store_si128((__m128i*)xx,pp);
*r=xx[0];
return xx[1];
/* mr_small kb,t[16],xx[2];
mr_small x,q,p;
mr_utype tb0;
#if MIRACL > 32
mr_utype tb1,tb2,tb3,tb4,tb5,tb6;
#endif
*/
/*printf("a= %lx\n",a);*/
/*
kb=b;
tt[0]=_mm_setzero_si128();
tt[8]=_mm_loadl_epi64((__m128i *)&a);
tt[8]=_mm_slli_epi64(tt[8],7);