mrgf2mnew.c

A fairly new and powerful RSA algorithm source file; easy to use.
/*
 *   MIRACL routines for arithmetic over GF(2^m), 
 *   mrgf2m.c
 *
 *   For algorithms used, see IEEE P1363 Standard, Appendix A
 *   unless otherwise stated.
 *
 *   The time-critical routines are the multiplication routine multiply2()
 *   and (for AFFINE co-ordinates) the modular inverse routine inverse2()
 *   and the routines it calls.
 *
 *   READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
 *
 *   No assembly language used.
 *
 *   Use utility irp.cpp to generate optimal code for function reduce2(.) below
 *
 *   Space can be saved by removing unneeded functions and 
 *   deleting unwanted functionality.
 *
 *   Copyright (c) 2000-2007 Shamus Software Ltd.
 */
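
/*
 *   Background: in GF(2^m) arithmetic, multiplication is "carry-free" -
 *   words are treated as polynomials over GF(2), partial products are
 *   combined with XOR, and there are no carries between bit positions.
 *   For example (illustrative only, not part of MIRACL):
 *
 *       (x^2+x+1) * (x+1)  =  0b111 "times" 0b011
 *                          =  (0b111<<1) ^ 0b111  =  0b1110 ^ 0b0111
 *                          =  0b1001  =  x^3 + 1
 *
 *   A minimal, portable shift-and-xor sketch of the word-level multiplication
 *   that the various mr_mul2() versions below implement in optimized form
 *   (ref_mul2 is a hypothetical name, not a MIRACL routine):
 *
 *   static mr_small ref_mul2(mr_small a,mr_small b,mr_small *r)
 *   {
 *       mr_small lo=0,hi=0;
 *       int i;
 *       for (i=0;i<MIRACL;i++)
 *       {
 *           if (b&((mr_small)1<<i))
 *           {
 *               lo^=(mr_small)(a<<i);
 *               if (i>0) hi^=(mr_small)(a>>(MIRACL-i));
 *           }
 *       }
 *       *r=lo;
 *       return hi;
 *   }
 */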

#include <stdlib.h> 
#include "miracl.h"
#ifdef MR_STATIC
#include <string.h>
#endif

#ifndef MR_NOFULLWIDTH
                     /* This does not make sense using floating-point! */

#define M1 (MIRACL-1)
#define M2 (MIRACL-2)
#define M3 (MIRACL-3)
#define M4 (MIRACL-4)
#define M5 (MIRACL-5)
#define M6 (MIRACL-6)
#define M7 (MIRACL-7)
#define M8 (MIRACL-8)
#define TOPBIT ((mr_small)1<<M1)
#define SECBIT ((mr_small)1<<M2)
#define THDBIT ((mr_small)1<<M3)
#define FRHBIT ((mr_small)1<<M4)
#define FIFBIT ((mr_small)1<<M5)
#define SIXBIT ((mr_small)1<<M6)
#define SEVBIT ((mr_small)1<<M7)

#include <emmintrin.h>  /* SSE2 intrinsics (__m128i etc.) used by the routines below */

/* This is extremely time-critical, and expensive */

/*
#include <emmintrin.h>

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    __m64 tt[4],xg,rg;
    mr_small q;

    tt[0]=_m_from_int(0);
    tt[1]=_m_from_int(a);
    tt[2]=_m_psllqi(tt[1],1);
    tt[3]=_m_pxor(tt[1],tt[2]);

    rg=tt[b&3]; 
    xg=tt[(b>>2)&3]; rg=_m_pxor(rg,_m_psllqi(xg,2));
    xg=tt[(b>>4)&3]; rg=_m_pxor(rg,_m_psllqi(xg,4));
    xg=tt[(b>>6)&3]; rg=_m_pxor(rg,_m_psllqi(xg,6));
    xg=tt[(b>>8)&3]; rg=_m_pxor(rg,_m_psllqi(xg,8));
    xg=tt[(b>>10)&3]; rg=_m_pxor(rg,_m_psllqi(xg,10));
    xg=tt[(b>>12)&3]; rg=_m_pxor(rg,_m_psllqi(xg,12));
    xg=tt[(b>>14)&3]; rg=_m_pxor(rg,_m_psllqi(xg,14));
    xg=tt[(b>>16)&3]; rg=_m_pxor(rg,_m_psllqi(xg,16));
    xg=tt[(b>>18)&3]; rg=_m_pxor(rg,_m_psllqi(xg,18));
    xg=tt[(b>>20)&3]; rg=_m_pxor(rg,_m_psllqi(xg,20));
    xg=tt[(b>>22)&3]; rg=_m_pxor(rg,_m_psllqi(xg,22));
    xg=tt[(b>>24)&3]; rg=_m_pxor(rg,_m_psllqi(xg,24));
    xg=tt[(b>>26)&3]; rg=_m_pxor(rg,_m_psllqi(xg,26));
    xg=tt[(b>>28)&3]; rg=_m_pxor(rg,_m_psllqi(xg,28));
    xg=tt[(b>>30)]; rg=_m_pxor(rg,_m_psllqi(xg,30));

    *r=_m_to_int(rg);
    q=_m_to_int(_m_psrlqi(rg,32));

    _m_empty();

    return q;
}

*/

/* This might be faster on a 16-bit processor with no variable shift instructions. 
   The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15); is just a 1-bit right shift on 
   the hi|lo value - should be really fast in assembly language

unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
{
    unsigned short lo,hi,bit,w;
    hi=0;  
    lo=x;   
    bit=-(lo&1); 
    lo>>=1;

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);   
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    *r=lo;
    return hi;
}
      
*/


/* This might be faster on an 8-bit processor with no variable shift instructions. 
   The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7); is just a 1-bit right shift on 
   the hi|lo value - should be really fast in assembly language

unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
{
    unsigned char lo,hi,bit,w;
    hi=0;  
    lo=x;   
    bit=-(lo&1); 
    lo>>=1;

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1); 
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);   
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    *r=lo;
    return hi;
}
      
*/

/* Wouldn't it be nice if instruction sets supported a 
   one-cycle "carry-free" multiplication instruction ... 
   The SmartMips does - it's called maddp                 */
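
/* Modern x86 processors do provide one (PCLMULQDQ). As an illustration only -
   not part of this file, and assuming mr_small is a 64-bit type, a 64-bit build,
   and a compiler flag such as -mpclmul - mr_mul2 could then be a single intrinsic
   call: _mm_clmulepi64_si128 returns the full 128-bit carry-free product, whose
   low and high 64-bit halves are extracted with _mm_cvtsi128_si64.

#include <wmmintrin.h>

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    __m128i z=_mm_clmulepi64_si128(_mm_cvtsi64_si128((long long)a),
                                   _mm_cvtsi64_si128((long long)b),0);
    *r=(mr_small)_mm_cvtsi128_si64(z);
    return (mr_small)_mm_cvtsi128_si64(_mm_srli_si128(z,8));
}

*/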

#ifndef MR_COMBA2

#if MIRACL==8

/* maybe use a small precomputed look-up table? */

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    static const mr_small look[256]=
    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
     0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
     0,3,6,5,12,15,10,9,24,27,30,29,20,23,18,17,
     0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,
     0,5,10,15,20,17,30,27,40,45,34,39,60,57,54,51,
     0,6,12,10,24,30,20,18,48,54,60,58,40,46,36,34,
     0,7,14,9,28,27,18,21,56,63,54,49,36,35,42,45,
     0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,
     0,9,18,27,36,45,54,63,72,65,90,83,108,101,126,119,
     0,10,20,30,40,34,60,54,80,90,68,78,120,114,108,102,
     0,11,22,29,44,39,58,49,88,83,78,69,116,127,98,105,
     0,12,24,20,48,60,40,36,96,108,120,116,80,92,72,68,
     0,13,26,23,52,57,46,35,104,101,114,127,92,81,70,75,
     0,14,28,18,56,54,36,42,112,126,108,98,72,70,84,90,
     0,15,30,17,60,51,34,45,120,119,102,105,68,75,90,85
    };
    mr_small x0,x1,y0,y1,m,p,q;
    x0=a&0x0f;
    x1=a&0xf0;
    y0=b&0x0f;
    y1=b&0xf0;   p=look[((x0<<4)|y0)];
    m=look[(x1|y0)]^look[(y1|x0)];
    q=look[(x1|(y1>>4))];

    p^=(m<<4);
    q^=(m>>4);

    *r=p;
    return q;
}
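
/* The look[] table above holds the carry-free (GF(2)) products of all pairs of
   4-bit values, indexed as look[(x<<4)|y]. For reference, a sketch of how such
   a table could be generated offline (illustrative only, not part of the build;
   gen_look is a hypothetical helper):

static void gen_look(mr_small *look)
{
    int x,y,k;
    mr_small p;
    for (x=0;x<16;x++) for (y=0;y<16;y++)
    {
        p=0;
        for (k=0;k<4;k++) if (y&(1<<k)) p^=(mr_small)(x<<k);
        look[(x<<4)|y]=p;
    }
}

*/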

#else

int counter=0;

static  __m128i MR_mul2(__m128i a,__m128i b, __m128i *r)
{
    __m128i pp,qq,xx,tt[16],tb0,tb1,tb2,tb3,tb4,tb5,tb6,kb,m,ff;
	int i,j;

	kb=b;

	m=_mm_set_epi32(0x1FFF,0xFFFF,0xFFFF,0xFFFF);  
	tt[0]=_mm_setzero_si128();
	tt[1]=_mm_xor_si128(a,m);  /* clear top 3 bits */
	m=_mm_set_epi32(0,0,0xf0<<24,0);  
	tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
	tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
	tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
	tt[3]=_mm_xor_si128(tt[1],tt[2]);
	tt[5]=_mm_xor_si128(tt[1],tt[4]);
	tt[6]=_mm_xor_si128(tt[2],tt[4]);
	tt[7]=_mm_xor_si128(tt[6],tt[1]);
	tt[9]=_mm_xor_si128(tt[8],tt[1]);
	tt[10]=_mm_xor_si128(tt[8],tt[2]);
	tt[11]=_mm_xor_si128(tt[10],tt[1]);
	tt[12]=_mm_xor_si128(tt[8],tt[4]);
	tt[13]=_mm_xor_si128(tt[12],tt[1]);
	tt[14]=_mm_xor_si128(tt[8],tt[6]);
	tt[15]=_mm_xor_si128(tt[14],tt[1]);
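
	/* tt[n] = carry-free product of tt[1] with the 4-bit value n.
	   tb0..tb6 below are masks derived from individual high-order bits of a;
	   their contribution to the product is folded back in at the end */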
 
	m=_mm_set_epi32(0x8000,0,0,0);
	tb0=_mm_and_si128(m,a);
	tb0=_mm_xor_si128(tb0,_mm_srli_si128(tb0,4));
	tb0=_mm_xor_si128(tb0,_mm_srli_si128(tb0,8));
	tb0=_mm_srai_epi32(tb0,31);

	tb1=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,1),a),1);
	tb1=_mm_xor_si128(tb1,_mm_srli_si128(tb1,4));
	tb1=_mm_xor_si128(tb1,_mm_srli_si128(tb1,8));
	tb1=_mm_srai_epi32(tb1,31);

	tb2=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,2),a),2);
	tb2=_mm_xor_si128(tb2,_mm_srli_si128(tb2,4));
	tb2=_mm_xor_si128(tb2,_mm_srli_si128(tb2,8));
	tb2=_mm_srai_epi32(tb2,31);

	tb3=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,3),a),3);
	tb3=_mm_xor_si128(tb3,_mm_srli_si128(tb3,4));
	tb3=_mm_xor_si128(tb3,_mm_srli_si128(tb3,8));
	tb3=_mm_srai_epi32(tb3,31);

	tb4=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,4),a),4);
	tb4=_mm_xor_si128(tb4,_mm_srli_si128(tb4,4));
	tb4=_mm_xor_si128(tb4,_mm_srli_si128(tb4,8));
	tb4=_mm_srai_epi32(tb4,31);

	tb5=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,5),a),5);
	tb5=_mm_xor_si128(tb5,_mm_srli_si128(tb5,4));
	tb5=_mm_xor_si128(tb5,_mm_srli_si128(tb5,8));
	tb5=_mm_srai_epi32(tb5,31);

	tb6=_mm_slli_epi32(_mm_and_si128(_mm_srli_epi32(m,6),a),6);
	tb6=_mm_xor_si128(tb6,_mm_srli_si128(tb6,4));
	tb6=_mm_xor_si128(tb6,_mm_srli_si128(tb6,8));
	tb6=_mm_srai_epi32(tb6,31);
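
	/* main loop: consume b one byte (two 4-bit table look-ups) at a time,
	   accumulating the low half of the product in pp and the high half in qq */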

	ff=_mm_set_epi32(0,0,0,0xff);
	i=_mm_cvtsi128_si32(_mm_and_si128(b,ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,4),ff));
	pp=qq=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))); qq=_mm_srli_si128(qq,1);

	i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,8),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,12),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,1)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,16),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,20),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,2)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,24),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,28),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,3)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,32),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,36),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,4)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,40),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,44),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,5)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,48),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,52),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,6)); qq=_mm_srli_si128(qq,1);

	i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,56),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,60),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,7)); qq=_mm_srli_si128(qq,1);

    b=_mm_srli_si128(b,8);

	i=_mm_cvtsi128_si32(_mm_and_si128(b,ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,4),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,8)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,8),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,12),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,9)); qq=_mm_srli_si128(qq,1);

	i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,16),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,20),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,10)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,24),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,28),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,11)); qq=_mm_srli_si128(qq,1);

	i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,32),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,36),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,12)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,40),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,44),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,13)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,48),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,52),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,14)); qq=_mm_srli_si128(qq,1);

    i=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,56),ff));
	j=_mm_cvtsi128_si32(_mm_and_si128(_mm_slli_epi64(b,60),ff));
	xx=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)));  
	qq=_mm_xor_si128(qq,xx); pp=_mm_xor_si128(pp,_mm_slli_si128(xx,15)); qq=_mm_srli_si128(qq,1);
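
	/* fold in the contribution associated with the high-order bits of a,
	   using the tb0..tb6 masks computed above */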

	m=_mm_set_epi32(0,0xFF,0,0);
	pp=_mm_xor_si128(pp,  _mm_and_si128(tb0,_mm_slli_epi64(_mm_slli_si128(kb,15),7) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb0,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),7)    )));

	pp=_mm_xor_si128(pp,  _mm_and_si128(tb1,_mm_slli_epi64(_mm_slli_si128(kb,15),6) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb1,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),6)    )));

	pp=_mm_xor_si128(pp,  _mm_and_si128(tb2,_mm_slli_epi64(_mm_slli_si128(kb,15),5) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb2,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),5)    )));

	pp=_mm_xor_si128(pp,  _mm_and_si128(tb3,_mm_slli_epi64(_mm_slli_si128(kb,15),4) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb3,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),4)    )));

	pp=_mm_xor_si128(pp,  _mm_and_si128(tb4,_mm_slli_epi64(_mm_slli_si128(kb,15),3) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb4,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),3)    )));

	pp=_mm_xor_si128(pp,  _mm_and_si128(tb5,_mm_slli_epi64(_mm_slli_si128(kb,15),2) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb5,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),2)    )));

	pp=_mm_xor_si128(pp,  _mm_and_si128(tb6,_mm_slli_epi64(_mm_slli_si128(kb,15),1) ));
	qq=_mm_xor_si128(qq,_mm_and_si128(tb6,_mm_or_si128(_mm_srli_epi64(kb,1),  _mm_slli_epi64(_mm_srli_si128(_mm_and_si128(kb,m),1),1)    )));

	*r=pp;
	return qq;
}

static inline mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    int i,j;
	__m128i pp,tt[16],m;
	mr_small xx[2];

 m=_mm_set_epi32(0,0,0xf0<<24,0);  
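
/* build tt[n] = carry-free product of a with the 4-bit value n, held as a 128-bit value */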

tt[0]=_mm_setzero_si128();
tt[1]=_mm_loadl_epi64((__m128i *)&a);
tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
tt[3]=_mm_xor_si128(tt[1],tt[2]);
tt[5]=_mm_xor_si128(tt[1],tt[4]);
tt[6]=_mm_xor_si128(tt[2],tt[4]);
tt[7]=_mm_xor_si128(tt[6],tt[1]);
tt[9]=_mm_xor_si128(tt[8],tt[1]);
tt[10]=_mm_xor_si128(tt[8],tt[2]);
tt[11]=_mm_xor_si128(tt[10],tt[1]);
tt[12]=_mm_xor_si128(tt[8],tt[4]);
tt[13]=_mm_xor_si128(tt[12],tt[1]);
tt[14]=_mm_xor_si128(tt[8],tt[6]);
tt[15]=_mm_xor_si128(tt[14],tt[1]);
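
/* process b four bits at a time: each byte of b contributes tt[i] ^ (tt[j]<<4),
   shifted into place by whole bytes with _mm_slli_si128 */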

i=b&0xF; j=(b>>4)&0xF;
pp=_mm_xor_si128(tt[i],  _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4)) );
i=(b>>8)&0xF; j=(b>>12)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128( _mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))  )   ,1) );
i=(b>>16)&0xF; j=(b>>20)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))   )   ,2) );
i=(b>>24)&0xF; j=(b>>28)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))   )   ,3) );
i=(b>>32)&0xF; j=(b>>36)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))   )   ,4) );
i=(b>>40)&0xF; j=(b>>44)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))    )   ,5) );
i=(b>>48)&0xF; j=(b>>52)&0xF;
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))   )   ,6) );
i=(b>>56)&0xF; j=(b>>60);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[j]),1),4))   )   ,7) );

_mm_store_si128((__m128i*)xx,pp);

    *r=xx[0];
    return xx[1];


/*    mr_small kb,t[16],xx[2];

    mr_small x,q,p;
    mr_utype tb0;
#if MIRACL > 32
    mr_utype tb1,tb2,tb3,tb4,tb5,tb6;
#endif
*/
/*printf("a= %lx\n",a);*/
/*
    kb=b;

tt[0]=_mm_setzero_si128();
tt[8]=_mm_loadl_epi64((__m128i *)&a);
tt[8]=_mm_slli_epi64(tt[8],7);
