/*
* MIRACL routines for arithmetic over GF(2^m),
* mrgf2m.c
*
* For algorithms used, see IEEE P1363 Standard, Appendix A
* unless otherwise stated.
*
* The time-critical routines are the multiplication routine multiply2()
* and (for AFFINE co-ordinates), the modular inverse routine inverse2()
* and the routines it calls.
*
* READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
*
* No assembly language used.
*
* Use utility irp.cpp to generate optimal code for function reduce2(.) below
*
* Space can be saved by removing unneeded functions and
* deleting unrequired functionality.
* For example in reduce2(.) remove code for those irreducible polynomials
* which will not be used by your code.
*
* Copyright (c) 2000-2007 Shamus Software Ltd.
*/
#include <stdlib.h>
#include "miracl.h"
#ifdef MR_STATIC
#include <string.h>
#endif
#ifdef MR_COUNT_OPS
extern int fpm2,fpi2;
#endif
/* must use /arch:SSE2 in compilation */
#ifdef _M_IX86_FP
#if _M_IX86_FP >= 2
#define MR_SSE2_INTRINSICS
#endif
#endif
/* must use -msse2 in compilation */
#ifdef __SSE2__
#define MR_SSE2_INTRINSICS
#endif
#ifdef MR_SSE2_INTRINSICS
#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif
#if MIRACL==64
#define MR_SSE2_64
/* Can use SSE2 registers for 64-bit manipulations */
#endif
#endif
#ifndef MR_NOFULLWIDTH
/* This does not make sense using floating-point! */
/* This is extremely time-critical, and expensive */
/* Some experimental MMX code for x86-32. Seems to be slower than the standard code (on a PIV anyway).. */
#ifdef MR_MMX_x86_32
#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif
/* Carry-free (GF(2)) multiplication of two 32-bit words using MMX.
   Returns the high 32 bits of the 64-bit product a*b; the low 32 bits
   are written to *r.  tt[] holds the four products of a by each 2-bit
   digit (0,1,2,3); the multiplier b is consumed 2 bits at a time, each
   partial product shifted into position and XORed into rg.
   NOTE(review): no _m_empty()/_mm_empty() is issued before returning,
   so the x87 FPU state is left in MMX mode - presumably the caller
   deals with this; verify before mixing with floating-point code. */
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
__m64 rg,tt[4];
mr_small q;
tt[0]=_m_from_int(0);       /* a*0 */
tt[1]=_m_from_int(a);       /* a*1 */
tt[2]=_m_psllqi(tt[1],1);   /* a*2 */
tt[3]=_m_pxor(tt[1],tt[2]); /* a*3 = a*2 ^ a*1 */
rg=tt[b&3];
rg=_m_pxor(rg,_m_psllqi(tt[(b>>2)&3],2));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>4)&3],4));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>6)&3],6));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>8)&3],8));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>10)&3],10));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>12)&3],12));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>14)&3],14));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>16)&3],16));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>18)&3],18));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>20)&3],20));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>22)&3],22));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>24)&3],24));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>26)&3],26));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>28)&3],28));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>30)],30)); /* top digit: b>>30 is already <4, no mask needed */
*r=_m_to_int(rg);                  /* low half of product */
q=_m_to_int(_m_psrlqi(rg,32));     /* high half of product */
return q;
}
#else
/* This might be faster on a 16-bit processor with no variable shift instructions.
The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15); is just a 1-bit right shift on
the hi|lo value - should be really fast in assembly language
unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
{
unsigned short lo,hi,bit,w;
hi=0;
lo=x;
bit=-(lo&1);
lo>>=1;
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
*r=lo;
return hi;
}
*/
/* This might be faster on an 8-bit processor with no variable shift instructions.
The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7); is just a 1-bit right shift on
the hi|lo value - should be really fast in assembly language
unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
{
unsigned char lo,hi,bit,w;
hi=0;
lo=x;
bit=-(lo&1);
lo>>=1;
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
*r=lo;
return hi;
}
*/
/* wouldn't it be nice if instruction sets supported a
one cycle "carry-free" multiplication instruction ...
The SmartMips does - its called maddp */
#ifndef MR_COMBA2
#if MIRACL==8
/* maybe use a small precomputed look-up table? */
/* Carry-free (GF(2)) multiplication of two 8-bit words via one level
   of Karatsuba over 4-bit halves.  Returns the high byte of the 16-bit
   product a*b; the low byte is written to *r.  look[] is the full
   nibble-by-nibble GF(2) product table: look[(u<<4)|v] = u (x) v. */
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    static const mr_small look[256]=
    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
    0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
    0,3,6,5,12,15,10,9,24,27,30,29,20,23,18,17,
    0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,
    0,5,10,15,20,17,30,27,40,45,34,39,60,57,54,51,
    0,6,12,10,24,30,20,18,48,54,60,58,40,46,36,34,
    0,7,14,9,28,27,18,21,56,63,54,49,36,35,42,45,
    0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,
    0,9,18,27,36,45,54,63,72,65,90,83,108,101,126,119,
    0,10,20,30,40,34,60,54,80,90,68,78,120,114,108,102,
    0,11,22,29,44,39,58,49,88,83,78,69,116,127,98,105,
    0,12,24,20,48,60,40,36,96,108,120,116,80,92,72,68,
    0,13,26,23,52,57,46,35,104,101,114,127,92,81,70,75,
    0,14,28,18,56,54,36,42,112,126,108,98,72,70,84,90,
    0,15,30,17,60,51,34,45,120,119,102,105,68,75,90,85
    };
    mr_small ahi,blo,mid,lo,hi;
    ahi=a&0xf0;     /* high nibble of a, kept in the high position */
    blo=b&0x0f;     /* low nibble of b, kept in the low position */
    a<<=4;          /* low nibble of a, moved to the high position */
    b>>=4;          /* high nibble of b, moved to the low position */
    lo=look[(a|blo)];   /* a0 (x) b0 */
    hi=look[(ahi|b)];   /* a1 (x) b1 */
    /* Karatsuba middle term: (a0^a1)(x)(b0^b1) ^ a0b0 ^ a1b1 */
    mid=look[a^b^ahi^blo]^lo^hi;
    lo^=(mid<<4);
    hi^=(mid>>4);
    *r=lo;
    return hi;
}
#else
#ifdef MR_SSE2_64
/* Carry-free (GF(2)) multiplication of two 64-bit words using SSE2.
   Returns the high 64 bits of the 128-bit product a*b; the low 64 bits
   are written to *r.  tt[] holds the 16 products of a by each 4-bit
   digit, built as 65-bit quantities spilling into the upper SSE2 lane;
   b is then consumed two 4-bit digits at a time, the paired partial
   product being byte-shifted into place and XORed into pp. */
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
int i,j;
mr_small lo,hi;
__m128i pp,tt[16],m;
/* mask for the top nibble of the low 64-bit lane; the old expression
   0xf0<<24 left-shifted a positive int out of range, which is
   undefined behaviour in C - use the value directly instead */
m=_mm_set_epi32(0,0,(int)0xF0000000,0);
tt[0]=_mm_setzero_si128();
tt[1]=_mm_loadl_epi64((__m128i *)&a);
/* a<<1, a<<2, a<<3: the bits shifted out of the low lane are carried
   into the high lane via the masked 1-byte shift */
tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
tt[3]=_mm_xor_si128(tt[1],tt[2]);
tt[5]=_mm_xor_si128(tt[1],tt[4]);
tt[6]=_mm_xor_si128(tt[2],tt[4]);
tt[7]=_mm_xor_si128(tt[6],tt[1]);
tt[9]=_mm_xor_si128(tt[8],tt[1]);
tt[10]=_mm_xor_si128(tt[8],tt[2]);
tt[11]=_mm_xor_si128(tt[10],tt[1]);
tt[12]=_mm_xor_si128(tt[8],tt[4]);
tt[13]=_mm_xor_si128(tt[12],tt[1]);
tt[14]=_mm_xor_si128(tt[8],tt[6]);
tt[15]=_mm_xor_si128(tt[14],tt[1]);
/* Thanks to Darrel Hankerson, who pointed out an optimization for this code ... */
i=(int)(b&0xF); j=(int)((b>>4)&0xF);
pp=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60)) );
i=(int)((b>>8)&0xF); j=(int)((b>>12)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128( _mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,1) );
i=(int)((b>>16)&0xF); j=(int)((b>>20)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,2) );
i=(int)((b>>24)&0xF); j=(int)((b>>28)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,3) );
i=(int)((b>>32)&0xF); j=(int)((b>>36)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,4) );
i=(int)((b>>40)&0xF); j=(int)((b>>44)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,5) );
i=(int)((b>>48)&0xF); j=(int)((b>>52)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,6) );
i=(int)((b>>56)&0xF); j=(int)(b>>60);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
           _mm_or_si128(_mm_slli_epi64(tt[j],4),
           _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
        ) ,7) );
/* Extract the two 64-bit halves with proper stores.  The previous
   ((unsigned long *)&pp)[i] cast violated strict aliasing and broke
   on LLP64 targets (Win64), where unsigned long is only 32 bits. */
_mm_storel_epi64((__m128i *)&lo,pp);
_mm_storel_epi64((__m128i *)&hi,_mm_srli_si128(pp,8));
*r=lo;
return hi;
}
#else
/* Carry-free (GF(2)) multiplication of two words, portable version.
   Returns the high word of the double-length product a*b; the low word
   is written to *r.  A table t[] of the products of a by every 2-bit
   digit (4-bit digit when MIRACL==64) is built first, then b is
   consumed digit by digit.  Because the table entries a<<1..a<<3 can
   overflow the word, the lost top bit(s) of a are remembered in tb0
   (and tb1,tb2 for 64-bit words) and the missing partial products are
   XORed back in at the end. */
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
int k;
mr_small kb,t[16];
mr_small x,q,p;
mr_utype tb0; /* signed: arithmetic right shift below yields a 0/all-ones mask */
#if MIRACL > 32
mr_utype tb1,tb2;
#endif
kb=b; /* keep original b - the generic loop below destroys it */
#if MIRACL <= 32
t[0]=0; /* small look up table */
t[3]=t[2]=a<<1; /* it can overflow.... */
t[1]=t[2]>>1;
t[3]^=t[1];
tb0=(mr_utype)(a&TOPBIT); /* remember top bit */
tb0>>=M1; /* all ones if top bit is one */
#else
t[0]=0; /* larger look-up table */
t[8]=a<<3;
t[4]=t[8]>>1;
t[2]=t[4]>>1;
t[1]=t[2]>>1;
t[3]=t[5]=t[7]=t[9]=t[11]=t[13]=t[15]=t[1];
t[3]^=t[2];
t[5]^=t[4];
t[9]^=t[8];
t[6]=t[3]<<1;
t[7]^=t[6];
t[10]=t[5]<<1;
t[11]^=t[10];
t[12]=t[6]<<1;
t[13]^=t[12];
t[14]=t[7]<<1;
t[15]^=t[14];
tb0=(a&TOPBIT); /* remember top bits */
tb0>>=M1; /* all bits one, if this bit is set in a */
tb1=(a&SECBIT)<<1;
tb1>>=M1;
tb2=(a&THDBIT)<<2;
tb2>>=M1;
#endif
/* fully unrolled digit loops, one per supported word size; p
   accumulates the low word of the product, q the high word */
#if MIRACL == 8
#define UNWOUNDM
p=q=t[b&3]; q>>=2;
x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
x=t[(b>>6)]; q^=x; p^=(x<<6); q>>=2;
#endif
#if MIRACL == 16
#define UNWOUNDM
p=q=t[b&3]; q>>=2;
x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
x=t[(b>>6)&3]; q^=x; p^=(x<<6); q>>=2;
x=t[(b>>8)&3]; q^=x; p^=(x<<8); q>>=2;
x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
x=t[(b>>14)]; q^=x; p^=(x<<14); q>>=2;
#endif
#if MIRACL == 32
#define UNWOUNDM
p=q=t[b&3]; q>>=2;
x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2; /* 8 ASM 80386 instructions */
x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2; /* but only 4 ARM instructions! */
x=t[(b>>6)&3]; q^=x; p^=(x<<6); q>>=2;
x=t[(b>>8)&3]; q^=x; p^=(x<<8); q>>=2;
x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
x=t[(b>>14)&3]; q^=x; p^=(x<<14); q>>=2;
x=t[(b>>16)&3]; q^=x; p^=(x<<16); q>>=2;
x=t[(b>>18)&3]; q^=x; p^=(x<<18); q>>=2;
x=t[(b>>20)&3]; q^=x; p^=(x<<20); q>>=2;
x=t[(b>>22)&3]; q^=x; p^=(x<<22); q>>=2;
x=t[(b>>24)&3]; q^=x; p^=(x<<24); q>>=2;
x=t[(b>>26)&3]; q^=x; p^=(x<<26); q>>=2;
x=t[(b>>28)&3]; q^=x; p^=(x<<28); q>>=2;
x=t[(b>>30)]; q^=x; p^=(x<<30); q>>=2;
#endif
#if MIRACL == 64
#define UNWOUNDM
p=q=t[b&0xf]; q>>=4;
x=t[(b>>4)&0xf]; q^=x; p^=(x<<4); q>>=4;
x=t[(b>>8)&0xf]; q^=x; p^=(x<<8); q>>=4;
x=t[(b>>12)&0xf]; q^=x; p^=(x<<12); q>>=4;
x=t[(b>>16)&0xf]; q^=x; p^=(x<<16); q>>=4;
x=t[(b>>20)&0xf]; q^=x; p^=(x<<20); q>>=4;
x=t[(b>>24)&0xf]; q^=x; p^=(x<<24); q>>=4;
x=t[(b>>28)&0xf]; q^=x; p^=(x<<28); q>>=4;
x=t[(b>>32)&0xf]; q^=x; p^=(x<<32); q>>=4;
x=t[(b>>36)&0xf]; q^=x; p^=(x<<36); q>>=4;
x=t[(b>>40)&0xf]; q^=x; p^=(x<<40); q>>=4;
x=t[(b>>44)&0xf]; q^=x; p^=(x<<44); q>>=4;
x=t[(b>>48)&0xf]; q^=x; p^=(x<<48); q>>=4;
x=t[(b>>52)&0xf]; q^=x; p^=(x<<52); q>>=4;
x=t[(b>>56)&0xf]; q^=x; p^=(x<<56); q>>=4;
x=t[(b>>60)]; q^=x; p^=(x<<60); q>>=4;
#endif
#ifndef UNWOUNDM
/* generic fall-back: consume four 2-bit digits of b per iteration,
   shifting the p|q accumulator right as we go */
q=p=(mr_small)0;
for (k=0;k<MIRACL;k+=8)
{
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2;
q>>=2;
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2;
q>>=2;
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2;
q>>=2;
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2;
q>>=2;
}
#endif
#if MIRACL <= 32
p^=(tb0&(kb<<M1)); /* compensate for top bit */
q^=(tb0&(kb>>1)); /* don't break pipeline.. */
#else
p^=(tb0&(kb<<M1));
q^=(tb0&(kb>>1));
p^=(tb1&(kb<<M2));
q^=(tb1&(kb>>2));
p^=(tb2&(kb<<M3));
q^=(tb2&(kb>>3));
#endif
*r=p; /* low word of product */
return q; /* high word of product */
}
#endif
#endif
#endif
#endif
/* Number of significant bits in x: the (1-based) position of the
   highest set bit, or 0 when x is zero.  Relies on the MIRACL
   invariant that the top word of a non-zero big is non-zero. */
static int numbits(big x)
{
int nw,nb;
mr_small top,mask;
nw=x->len;
if (nw==0) return 0;
top=x->w[nw-1];
nb=nw*MIRACL;
for (mask=TOPBIT;(top&mask)==0;mask>>=1) nb--;
return nb;
}
/* Degree of the polynomial x over GF(2); returns -1 when x=0 */
int degree2(big x)
{
int bits=numbits(x);
return bits-1;
}
/*
static int zerobits(big x)
{
int m,n,k;
mr_small *gx,lsb,bit=1;
k=x->len;
if (k==0) return (-1);
gx=x->w;
for (m=0;m<k;m++)
{
if (gx[m]==0) continue;