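/* File: mrgf2m.c  (code-viewer page header; the "font size:" label that followed is viewer UI text, not source) */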
__m128i m,r,s,p,q,xe,xo;
__m64 a2,a1,a0,top;
if (x==y)
{
modsquare2(_MIPP_ x,w);
return;
}
#ifdef MR_COUNT_OPS
fpm2++;
#endif
if (x->len==0 || y->len==0)
{
zero(w);
return;
}
m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */
/* precompute a small table */
t[0]=_mm_set1_epi32(0);
xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);
xo=_mm_set_epi32(0,0,0,x->w[1]);
t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[3]=_mm_xor_si128(t[2],t[1]);
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[5]=_mm_xor_si128(t[4],t[1]);
t[6]=_mm_xor_si128(t[4],t[2]);
t[7]=_mm_xor_si128(t[4],t[3]);
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[9]=_mm_xor_si128(t[8],t[1]);
t[10]=_mm_xor_si128(t[8],t[2]);
t[11]=_mm_xor_si128(t[8],t[3]);
t[12]=_mm_xor_si128(t[8],t[4]);
t[13]=_mm_xor_si128(t[8],t[5]);
t[14]=_mm_xor_si128(t[8],t[6]);
t[15]=_mm_xor_si128(t[8],t[7]);
b=y->w[0];
i=b&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4); /* net shift left 4 */
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
p=q=r; q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,1);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,2);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,3);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[1];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,4);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,5);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,6);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,7);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[2];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,8);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,9);
p=_mm_xor_si128(p,r);
q=_mm_srli_si128(q,7); /* only 79 bits, so we are done */
/* modular reduction - x^79+x^9+1 */
a0=_mm_movepi64_pi64(p);
a1=_mm_movepi64_pi64(_mm_srli_si128(p,8));
a2=_mm_movepi64_pi64(q);
a1=_m_pxor(a1,_m_psrlqi(a2,15));
a1=_m_pxor(a1,_m_psrlqi(a2,6));
a0=_m_pxor(a0,_m_psllqi(a2,49));
a0=_m_pxor(a0,_m_psllqi(a2,58));
top=_m_psrlqi(a1,15);
a0=_m_pxor(a0,top);
top=_m_psllqi(top,15);
a0=_m_pxor(a0,_m_psrlqi(top,6));
a1=_m_pxor(a1,top);
w->w[2]=_m_to_int(a1);
if (w->len>3)
{ /* Yes I know its crazy, but its needed to fix the broken /O2 optimizer */
for (i=3;i<w->len;i++) w->w[i]=0;
}
w->w[0]=_m_to_int(a0);
a0=_m_psrlqi(a0,32);
w->w[1]=_m_to_int(a0);
w->len=3;
if (w->w[2]==0) mr_lzero(w);
_m_empty();
}
#endif
#ifndef SP103
#ifndef SP79
/*#ifndef SP271 */
void modmult2(_MIPD_ big x,big y,big w)
{ /* w = x*y mod f
   *
   * Shortcut cases are tried in this order before the general path:
   *   - either operand pointer is NULL      -> w is zeroed
   *   - x and y are the same object         -> delegated to modsquare2()
   *   - y has length 0                      -> w is zeroed
   *   - y is the single word 1              -> w is a copy of x
   * Otherwise the product is formed in the workspace mr_mip->w0,
   * reduced in place there, and then copied out to w.
   */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

    if (x==NULL || y==NULL)
    {
        zero(w);
    }
    else if (x==y)
    {
        modsquare2(_MIPP_ x,w);
    }
    else if (y->len==0)
    {
        zero(w);
    }
    else if (y->len==1 && y->w[0]==1)
    {
        copy(x,w);
    }
    else
    {
#ifdef MR_COUNT_OPS
        /* only the general multiply-and-reduce path is counted */
        fpm2++;
#endif
        multiply2(_MIPP_ x,y,mr_mip->w0);
        reduce2(_MIPP_ mr_mip->w0,mr_mip->w0);
        copy(mr_mip->w0,w);
    }
}
#endif
#endif
/*#endif*/
/* Will be *much* faster if M,A,(B and C) are all odd */
/* This could/should be optimized for a particular irreducible polynomial and fixed A, B and C */
void sqroot2(_MIPD_ big x,big y)
{
int i,M,A,B,C;
int k,n,h,s,a,aw,ab,bw,bb,cw,cb;
#if MIRACL != 32
int mm,j;
#endif
mr_small *wk,w,we,wo;
BOOL slow;
/* Using Harley's trick */
static const mr_small evens[16]=
{0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15};
static const mr_small odds[16]=
{0,4,1,5,8,12,9,13,2,6,3,7,10,14,11,15};
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
M=mr_mip->M;
A=mr_mip->AA;
if (A==0)
{
mr_berror(_MIPP_ MR_ERR_NO_BASIS);
return;
}
B=mr_mip->BB;
C=mr_mip->CC;
slow=FALSE;
if (B)
{
if (M%2!=1 || A%2!=1 || B%2!=1 || C%2!=1) slow=TRUE;
}
else
{
if (M%2!=1 || A%2!=1) slow=TRUE;
}
if (slow)
{
copy(x,y);
for (i=1;i<mr_mip->M;i++)
modsquare2(_MIPP_ y,y);
return;
}
bb=cb=cw=bw=0;
/* M, A (B and C) are all odd - so use fast
Fong, Hankerson, Lopez and Menezes method */
if (x==y)
{
copy (x,mr_mip->w0);
wk=mr_mip->w0->w;
}
else
{
wk=x->w;
}
zero(y);
#if MIRACL==8
if (M==271 && A==207 && B==175 && C==111)
{
y->len=34;
for (i=0;i<34;i++)
{
n=i/2;
w=wk[i];
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
i++;
w=wk[i];
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
y->w[n]^=we;
y->w[n+17]=wo;
y->w[n+13]^=wo;
y->w[n+11]^=wo;
y->w[n+7]^=wo;
}
if (y->w[33]==0) mr_lzero(y);
return;
}
#endif
#if MIRACL==32
if (M==1223 && A==255)
{
y->len=39;
for (i=0;i<39;i++)
{
n=i/2;
w=wk[i];
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
i++;
if (i<39)
{
w=wk[i];
we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
}
y->w[n]^=we;
y->w[20+n-1]^=wo<<4;
y->w[20+n]^=wo>>28;
y->w[n+4]^=wo;
}
if (y->w[38]==0) mr_lzero(y);
return;
}
#endif
#if MIRACL==64
if (M==1223 && A==255)
{
y->len=20;
for (i=0;i<20;i++)
{
n=i/2;
w=wk[i];
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
i++;
w=wk[i];
we|=evens[((w&0x5)+((w&0x50)>>3))]<<32;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<32;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<36;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<36;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<40;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<40;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<44;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<44;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<48;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<48;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<52;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<52;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<56;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<56;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<60;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<60;
y->w[n]^=we;
y->w[10+n-1]^=wo<<36;
y->w[10+n]^=wo>>28;
y->w[n+2]^=wo;
}
if (y->w[19]==0) mr_lzero(y);
return;
}
#endif
k=1+(M/MIRACL);
h=(k+1)/2;
a=(A+1)/2;
aw=a/MIRACL;
ab=a%MIRACL;
if (B)
{
a=(B+1)/2;
bw=a/MIRACL;
bb=a%MIRACL;
a=(C+1)/2;
cw=a/MIRACL;
cb=a%MIRACL;
}
s=h*MIRACL-1-(M-1)/2;
y->len=k;
for (i=0;i<k;i++)
{
n=i/2;
w=wk[i];
#if MIRACL == 32
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
#else
mm=0;
we=wo=0;
for (j=0;j<MIRACL/8;j++)
{
we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
mm+=4; w>>=8;
}
#endif
i++;
if (i<k)
{
w=wk[i];
#if MIRACL == 32
we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
#else
for (j=0;j<MIRACL/8;j++)
{
we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
mm+=4; w>>=8;
}
#endif
}
y->w[n]^=we;
if (s==0) y->w[h+n]=wo;
else
{
y->w[h+n-1]^=wo<<(MIRACL-s);
y->w[h+n]^=wo>>s; /* abutt odd bits to even */
}
if (ab==0) y->w[n+aw]^=wo;
else
{
y->w[n+aw]^=wo<<ab;
y->w[n+aw+1]^=wo>>(MIRACL-ab);
}
if (B)
{
if (bb==0) y->w[n+bw]^=wo;
else
{
y->w[n+bw]^=wo<<bb;
y->w[n+bw+1]^=wo>>(MIRACL-bb);
}
if (cb==0) y->w[n+cw]^=wo;
else
{
y->w[n+cw]^=wo
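/* [Code-viewer page residue - not part of mrgf2m.c]
   Keyboard shortcuts:
     Copy code            Ctrl + C
     Search code          Ctrl + F
     Full-screen mode     F11
     Toggle theme         Ctrl + Shift + D
     Show shortcuts       ?
     Increase font size   Ctrl + =
     Decrease font size   Ctrl + -
*/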