📄 mrcomba.tpl
字号:
/*
* MIRACL Comba's method for ultimate speed modular multiplication
* mrcomba.tpl
*
* See "Exponentiation Cryptosystems on the IBM PC", IBM Systems
* Journal Vol. 29 No. 4 1990. Comba's method has been extended to
* implement Montgomery reduction.
*
* Here the inner loops of the basic multiplication, squaring and
* Montgomery's redc() functions are completely unravelled, and
* reorganised for maximum possible speed.
*
* This approach is recommended for maximum speed where parameters
* are fixed and compute resources are constrained. The processor must
* support an unsigned multiply instruction, and should have a carry flag.
*
* This file is a template. To fill in the gaps and create mrcomba.c,
* you must run the mex.c program to insert the C or assembly language
* macros from the appropriate .mcs file. For use with C, MR_NOASM must
* be defined in mirdef.h.
*
* This method would appear to be particularly useful for implementing
* fast Elliptic Curve Cryptosystems over GF(p) and fast 1024-bit RSA
* decryption.
*
* The #define MR_COMBA in mirdef.h determines the FIXED size of
* modulus to be used. This *must* be determined at compile time.
*
* Note that this module can generate a *lot* of code for large values
* of MR_COMBA. This should have a maximum value of 8-16. Any larger
* than that and you should define MR_KCM instead - see mrkcm.tpl
*
* Note that on some processors it is *VITAL* that arrays be aligned on
* 4-byte boundaries
*
* **** This code does not like -fomit-frame-pointer using GCC ***********
*
* Copyright (c) 1988-2001 Shamus Software Ltd.
*/
#include "miracl.h"
#ifdef MR_SSE2_INTRINSICS
#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif
#endif
#ifdef MR_COMBA
/*
 * Register-name and operand-size aliases, selected by the value of INLINE_ASM.
 *
 * Each variant defines the same set of names:
 *   N        - 2 alongside WORD PTR, 4 alongside DWORD PTR
 *              (presumably the word size in bytes - confirm against the
 *              .mcs macros that use it)
 *   POINTER  - operand-size specifier (WORD PTR or DWORD PTR)
 *   PBP, PBX, PSI, PDI           - aliases for the bp/bx/si/di family
 *   DSI, DDI, DBP, DAX, DCX, DDX - aliases for the si/di/bp/ax/cx/dx family
 *
 * NOTE(review): nothing in the visible part of this file references these
 * names directly; presumably they are consumed by the inline assembly that
 * mex inserts from the .mcs file. The P- and D- prefixes look like
 * "pointer" and "data" register roles - confirm.
 */
/* INLINE_ASM == 1: 16-bit register names throughout, WORD PTR operands */
#if INLINE_ASM == 1
#define N 2
#define POINTER WORD PTR
#define PBP bp
#define PBX bx
#define PSI si
#define PDI di
#define DSI si
#define DDI di
#define DBP bp
#define DAX ax
#define DCX cx
#define DDX dx
#endif
/* INLINE_ASM == 2: P* aliases keep the 16-bit names, D* aliases use the
   32-bit (e-prefixed) names, DWORD PTR operands */
#if INLINE_ASM == 2
#define N 4
#define POINTER DWORD PTR
#define PBP bp
#define PBX bx
#define PSI si
#define PDI di
#define DSI esi
#define DDI edi
#define DBP ebp
#define DAX eax
#define DCX ecx
#define DDX edx
#endif
/* INLINE_ASM == 3: 32-bit (e-prefixed) register names throughout,
   DWORD PTR operands */
#if INLINE_ASM == 3
#define N 4
#define POINTER DWORD PTR
#define PBP ebp
#define PBX ebx
#define PSI esi
#define PDI edi
#define DSI esi
#define DDI edi
#define DBP ebp
#define DAX eax
#define DCX ecx
#define DDX edx
#endif
/* NOTE! z must be distinct from x and y */
/*
 * comba_mult - fixed-size Comba multiplication template: z = x * y.
 *
 * x, y : operands; their word arrays are read through the pointers a and b.
 * z    : result; its word array is written through the pointer c.
 *        Must not be the same big as x or y.
 *
 * The length of z is forced to 2*MR_COMBA words. The multiplication itself
 * is not written out here: the MULTIPLY marker line in the body is a
 * placeholder that the mex program replaces with macro code from the
 * appropriate .mcs file (see the file header comment).
 * NOTE(review): the operands are presumably expected to hold MR_COMBA
 * words each - confirm against the callers.
 */
void comba_mult(_MIPD_ big x,big y,big z)
{ /* comba multiplier */
int i;
mr_small *a,*b,*c;
/* The conditionally declared locals below are not referenced by any visible
   statement; presumably they are scratch variables for the macro code that
   mex inserts for the matching build configuration - confirm against the
   .mcs file in use. */
#ifdef MR_SSE2_INTRINSICS
__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm7;
#endif
#ifdef MR_WIN64
mr_small lo,hi,sumlo,sumhi,extra;
#endif
/* Note the nesting: the MR_NOASM declarations sit in the #else branch, so
   they are only considered when MR_ITANIUM is not defined. */
#ifdef MR_ITANIUM
register mr_small lo1,hi1,lo2,hi2,sumlo,sumhi,extra,ma,mb;
#else
#ifdef MR_NOASM
#ifdef mr_qltype
mr_large pp1;
mr_vlarge sum;
#else
register mr_small extra,s0,s1;
mr_large pp1,pp2,sum;
#endif
#endif
#endif
/* With MR_OS_THREADS the miracl instance pointer is fetched via get_mip().
   NOTE(review): no visible statement uses mr_mip here - presumably needed
   by macros or inserted code; confirm. */
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
/* Zero any words of z at index 2*MR_COMBA and above that fall within its
   current length (len masked with MR_OBITS), because the length is about
   to be set to exactly 2*MR_COMBA words. */
for (i=2*MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;
z->len=2*MR_COMBA;
/* a, b and c are the names the inserted macro code works with. */
a=x->w; b=y->w; c=z->w;
/*** MULTIPLY ***/ /* multiply a by b, result in c */
/* If the top word of the product is zero, hand z to mr_lzero()
   (presumably strips leading zero words from the length - confirm). */
if (z->w[2*MR_COMBA-1]==0) mr_lzero(z);
}
/* NOTE! z and x must be distinct */
/*
 * comba_square - fixed-size Comba squaring template: z = x * x.
 *
 * x : operand; its word array is read through the pointer a.
 * z : result; its word array is written through the pointer c.
 *     Must not be the same big as x.
 *
 * The length of z is forced to 2*MR_COMBA words. The squaring itself is not
 * written out here: the SQUARE marker line in the body is a placeholder that
 * the mex program replaces with macro code from the appropriate .mcs file
 * (see the file header comment).
 * NOTE(review): the operand is presumably expected to hold MR_COMBA words -
 * confirm against the callers.
 */
void comba_square(_MIPD_ big x,big z)
{ /* super comba squarer */
int i;
mr_small *a,*c;
/* The conditionally declared locals below are not referenced by any visible
   statement; presumably they are scratch variables for the macro code that
   mex inserts for the matching build configuration - confirm against the
   .mcs file in use. Unlike comba_mult, the MR_ITANIUM and MR_NOASM groups
   here are independent #ifdef blocks. */
#ifdef MR_WIN64
mr_small lo,hi,sumlo,sumhi,extra,cy;
#endif
#ifdef MR_ITANIUM
register mr_small lo1,hi1,lo2,hi2,sumlo,sumhi,extra,ma,mb;
#endif
#ifdef MR_NOASM
#ifdef mr_qltype
mr_large pp1;
mr_vlarge sum;
#else
register mr_small extra,s0,s1;
mr_large pp1,pp2,sum;
#endif
#endif
/* With MR_OS_THREADS the miracl instance pointer is fetched via get_mip().
   NOTE(review): no visible statement uses mr_mip here - presumably needed
   by macros or inserted code; confirm. */
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
/* Zero any words of z at index 2*MR_COMBA and above that fall within its
   current length (len masked with MR_OBITS), because the length is about
   to be set to exactly 2*MR_COMBA words. */
for (i=2*MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;
z->len=2*MR_COMBA;
/* a and c are the names the inserted macro code works with. */
a=x->w; c=z->w;
/*** SQUARE ***/ /* squares a, result in c */
/* If the top word of the square is zero, hand z to mr_lzero()
   (presumably strips leading zero words from the length - confirm). */
if (z->w[2*MR_COMBA-1]==0) mr_lzero(z);
}
/* NOTE! t and z must be distinct! */
void comba_redc(_MIPD_ big t,big z)
{ /* super comba Montgomery redc() function */
mr_small carry,su;
#ifdef MR_ITANIUM
register mr_small lo1,hi1,lo2,hi2,sumlo,sumhi,extra,ma,mb,sp,u;
#endif
#ifdef MR_WIN64
mr_small lo,hi,sumlo,sumhi,extra,ma,mb,u;
#endif
#ifdef MR_NOASM
mr_large u;
#ifndef MR_SPECIAL
#ifdef mr_qltype
register mr_small sp;
mr_large pp1;
mr_vlarge sum;
#else
register mr_small sp,extra,s0,s1;
mr_large pp1,pp2,sum;
#endif
#endif
#endif
unsigned int i;
big w,modulus;
mr_small *a,*b;
#ifndef MR_SPECIAL
BOOL need_subtract;
mr_small ndash;
#endif
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
#ifdef MR_SPECIAL
/* !!! Implement here a "special" fast method for modular reduction,
for a particular modulus. Implemented here for 2^192-2^64-1
and 2^224-2^96+1 on a 32 bit processor.
See for example "Software Implementation of the NIST Elliptic
Curves Over Prime Fields", Brown et al., Report #36, 2000 available
from www.cacr.math.uwaterloo.ca
The generated code can be manually optimised further.....
*/
int overshoot;
mr_small k[MR_COMBA],sn,tt,v;
#ifdef MR_PSEUDO_MERSENNE_142
mr_small sh,sl;
#endif
mr_small *c;
modulus=mr_mip->modulus;
for (i=MR_COMBA;i<(int)(z->len&MR_OBITS);i++) z->w[i]=0;
/* zero(z); */
z->len=MR_COMBA;
#ifdef MR_PSEUDO_MERSENNE_142
/* special code for 2^142-111 */
sn=111;
sh=t->w[4]>>14;
t->w[4]&=0x3FFF;
sl=t->w[5]>>14;
t->w[5]<<=18; t->w[5]|=sh;
sh=t->w[6]>>14;
t->w[6]<<=18; t->w[6]|=sl;
sl=t->w[7]>>14;
t->w[7]<<=18; t->w[7]|=sh;
sh=t->w[8]>>14;
t->w[8]<<=18; t->w[8]|=sl;
t->w[9]=sh;
a=&(t->w[5]);
b=k;
c=z->w;
/*** PMULT ***/
a=c;
k[0]=(c[4]>>14)*111;
c[4]&=0x3FFF;
/*** INCREMENT ***/
b=t->w;
/*** INCREMENT ***/
b=modulus->w;
while (z->w[4]>>14)
{
/*** DECREMENT ***/
}
if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
{
if (mr_compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#ifdef MR_PSEUDO_MERSENNE
sn=(mr_small)0-modulus->w[0]; /* Modulus is 2^{MIRACL*MR_COMBA}-c. Here we calculate c */
/* .. where c MUST be a word sized ... */
a=&(t->w[MR_COMBA]);
b=k;
c=z->w;
/*** PMULT ***/
a=c;
/*** INCREMENT ***/
overshoot=carry;
b=t->w;
/*** INCREMENT ***/
overshoot+=carry;
b=modulus->w;
while(overshoot>0)
{
/*** DECREMENT ***/
overshoot-=carry;
}
if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
{
if (mr_compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#ifdef MR_GENERALIZED_MERSENNE
#if MIRACL==64
#if MR_COMBA == 3
/* Special Code for 2^192-2^64-1 - assuming 64-bit processor */
a=t->w; b=k; c=z->w;
k[0]=k[1]=a[3]; k[2]=0;
/*** ADDITION ***/
overshoot=carry;
a=c; c=t->w;
k[0]=0;k[1]=k[2]=c[4];
/*** INCREMENT ***/
overshoot+=carry;
k[0]=k[1]=k[2]=c[5];
/*** INCREMENT ***/
overshoot+=carry;
b=modulus->w;
while(overshoot>0)
{
/*** DECREMENT ***/
overshoot-=carry;
}
if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
{
if (mr_compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#if MR_COMBA == 2
/* Special code for 2^127-1 - for 64-bit processor */
a=t->w;
k[0]=a[2]; k[1]=a[3];
a=b=k;
/*** INCREMENT ***/
a=t->w;
k[0]+=(a[1]>>63); k[1]|=(a[1]&0x8000000000000000);
c=z->w;
/*** ADDITION ***/
a=z->w;
b=modulus->w;
if (z->w[1]>=modulus->w[1])
{
if (mr_compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#endif
#if MIRACL==8
#if MR_COMBA==32
a=t->w; b=k; c=z->w;
k[0]=k[1]=k[2]=k[3]=k[4]=k[5]=k[6]=k[7]=k[8]=k[9]=k[10]=k[11]=0;
k[12]=a[44]; k[13]=a[45]; k[14]=a[46]; k[15]=a[47];
k[16]=a[48]; k[17]=a[49]; k[18]=a[50]; k[19]=a[51];
k[20]=a[52]; k[21]=a[53]; k[22]=a[54]; k[23]=a[55];
k[24]=a[56]; k[25]=a[57]; k[26]=a[58]; k[27]=a[59];
k[28]=a[60]; k[29]=a[61]; k[30]=a[62]; k[31]=a[63];
/*** ADDITION ***/
overshoot=carry;
a=c; c=t->w;
/*** INCREMENT ***/
overshoot+=carry;
k[0]=k[1]=k[2]=k[3]=k[4]=k[5]=k[6]=k[7]=k[8]=k[9]=k[10]=k[11]=0;
k[12]=c[48]; k[13]=c[49]; k[14]=c[50]; k[15]=c[51];
k[16]=c[52]; k[17]=c[53]; k[18]=c[54]; k[19]=c[55];
k[20]=c[56]; k[21]=c[57]; k[22]=c[58]; k[23]=c[59];
k[24]=c[60]; k[25]=c[61]; k[26]=c[62]; k[27]=c[63];
k[28]=k[29]=k[30]=k[31]=0;
/*** INCREMENT ***/
overshoot+=carry;
/*** INCREMENT ***/
overshoot+=carry;
k[0]=c[32]; k[1]=c[33]; k[2]=c[34]; k[3]=c[35];
k[4]=c[36]; k[5]=c[37]; k[6]=c[38]; k[7]=c[39];
k[8]=c[40]; k[9]=c[41]; k[10]=c[42]; k[11]=c[43];
k[12]=k[13]=k[14]=k[15]=k[16]=k[17]=k[18]=k[19]=k[20]=k[21]=k[22]=k[23]=0;
k[24]=c[56]; k[25]=c[57]; k[26]=c[58]; k[27]=c[59];
k[28]=c[60]; k[29]=c[61]; k[30]=c[62]; k[31]=c[63];
/*** INCREMENT ***/
overshoot+=carry;
k[0]=c[36]; k[1]=c[37]; k[2]=c[38]; k[3]=c[39];
k[4]=c[40]; k[5]=c[41]; k[6]=c[42]; k[7]=c[43];
k[8]=c[44]; k[9]=c[45]; k[10]=c[46]; k[11]=c[47];
k[12]=c[52]; k[13]=c[53]; k[14]=c[54]; k[15]=c[55];
k[16]=c[56]; k[17]=c[57]; k[18]=c[58]; k[19]=c[59];
k[20]=c[60]; k[21]=c[61]; k[22]=c[62]; k[23]=c[63];
k[24]=c[52]; k[25]=c[53]; k[26]=c[54]; k[27]=c[55];
k[28]=c[32]; k[29]=c[33]; k[30]=c[34]; k[31]=c[35];
/*** INCREMENT ***/
overshoot+=carry;
k[0]=c[44]; k[1]=c[45]; k[2]=c[46]; k[3]=c[47];
k[4]=c[48]; k[5]=c[49]; k[6]=c[50]; k[7]=c[51];
k[8]=c[52]; k[9]=c[53]; k[10]=c[54]; k[11]=c[55];
k[12]=k[13]=k[14]=k[15]=k[16]=k[17]=k[18]=k[19]=k[20]=k[21]=k[22]=k[23]=0;
k[24]=c[32]; k[25]=c[33]; k[26]=c[34]; k[27]=c[35];
k[28]=c[40]; k[29]=c[41]; k[30]=c[42]; k[31]=c[43];
/*** DECREMENT ***/
overshoot-=carry;
k[0]=c[48]; k[1]=c[49]; k[2]=c[50]; k[3]=c[51];
k[4]=c[52]; k[5]=c[53]; k[6]=c[54]; k[7]=c[55];
k[8]=c[56]; k[9]=c[57]; k[10]=c[58]; k[11]=c[59];
k[12]=c[60]; k[13]=c[61]; k[14]=c[62]; k[15]=c[63];
k[16]=k[17]=k[18]=k[19]=k[20]=k[21]=k[22]=k[23]=0;
k[24]=c[36]; k[25]=c[37]; k[26]=c[38]; k[27]=c[39];
k[28]=c[44]; k[29]=c[45]; k[30]=c[46]; k[31]=c[47];
/*** DECREMENT ***/
overshoot-=carry;
k[0]=c[52]; k[1]=c[53]; k[2]=c[54]; k[3]=c[55];
k[4]=c[56]; k[5]=c[57]; k[6]=c[58]; k[7]=c[59];
k[8]=c[60]; k[9]=c[61]; k[10]=c[62]; k[11]=c[63];
k[12]=c[32]; k[13]=c[33]; k[14]=c[34]; k[15]=c[35];
k[16]=c[36]; k[17]=c[37]; k[18]=c[38]; k[19]=c[39];
k[20]=c[40]; k[21]=c[41]; k[22]=c[42]; k[23]=c[43];
k[24]=k[25]=k[26]=k[27]=0;
k[28]=c[48]; k[29]=c[49]; k[30]=c[50]; k[31]=c[51];
/*** DECREMENT ***/
overshoot-=carry;
k[0]=c[56]; k[1]=c[57]; k[2]=c[58]; k[3]=c[59];
k[4]=c[60]; k[5]=c[61]; k[6]=c[62]; k[7]=c[63];
k[8]=k[9]=k[10]=k[11]=0;
k[12]=c[36]; k[13]=c[37]; k[14]=c[38]; k[15]=c[39];
k[16]=c[40]; k[17]=c[41]; k[18]=c[42]; k[19]=c[43];
k[20]=c[44]; k[21]=c[45]; k[22]=c[46]; k[23]=c[47];
k[24]=k[25]=k[26]=k[27]=0;
k[28]=c[52]; k[29]=c[53]; k[30]=c[54]; k[31]=c[55];
/*** DECREMENT ***/
overshoot-=carry;
b=modulus->w;
while (overshoot>0)
{
/*** DECREMENT ***/
overshoot-=carry;
}
while (overshoot<0)
{
/*** INCREMENT ***/
overshoot+=carry;
}
if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
{
if (compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#if MR_COMBA == 16
/* Special code for 2^127-1 - for 8-bit processor */
a=t->w;
k[0]=a[16]; k[1]=a[17]; k[2]=a[18]; k[3]=a[19];
k[4]=a[20]; k[5]=a[21]; k[6]=a[22]; k[7]=a[23];
k[8]=a[24]; k[9]=a[25]; k[10]=a[26]; k[11]=a[27];
k[12]=a[28]; k[13]=a[29]; k[14]=a[30]; k[15]=a[31];
a=b=k;
/*** DOUBLEIT ***/
a=t->w;
k[0]+=(a[15]>>7); k[15]|=(a[15]&0x80);
c=z->w;
/*** ADDITION ***/
a=z->w;
b=modulus->w;
/* if MSB is 1, try and deal with it here */
tt=(mr_small)(z->w[15]>>M1);
v=z->w[0]+tt;
if (v>=z->w[0])
{
z->w[15]-=(mr_small)(tt<<M1);
z->w[0]=v;
}
/* This is probably not going to happen now... */
if (z->w[15]>=modulus->w[15])
{
if (mr_compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#if MR_COMBA==20
/* 2^160-2^112+2^64+1 */
/* faster way - keep a[20] to a[39] in registers r2-r21.
*/
a=t->w; b=k; c=z->w;
k[0]=k[8]=a[38]; k[2]=k[4]=k[6]=k[10]=k[12]=0; k[14]=a[20]; k[16]=a[22]; k[18]=a[24];
k[1]=k[9]=a[39]; k[3]=k[5]=k[7]=k[11]=k[13]=0; k[15]=a[21]; k[17]=a[23]; k[19]=a[25];
/*** ADDITION ***/
overshoot=carry;
a=c; c=t->w;
k[0]=k[2]=k[4]=k[6]=0; k[8]=k[14]=k[16]=c[34]; k[10]=c[36]; k[12]=c[24]; k[18]=0;
k[1]=k[3]=k[5]=k[7]=0; k[9]=k[15]=k[17]=c[35]; k[11]=c[37]; k[13]=c[25]; k[19]=0;
/*** DECREMENT ***/
overshoot-=carry;
k[4]=c[24]; k[6]=k[12]=k[14]=c[32]; k[0]=k[8]=c[20]; k[2]=k[10]=c[22]; k[16]=k[18]=c[36];
k[5]=c[25]; k[7]=k[13]=k[15]=c[33]; k[1]=k[9]=c[21]; k[3]=k[11]=c[23]; k[17]=k[19]=c[37];
/*** DECREMENT ***/
overshoot-=carry;
k[0]=k[6]=k[8]=c[26]; k[4]=k[10]=k[12]=c[30]; k[2]=k[14]=k[16]=k[18]=0;
k[1]=k[7]=k[9]=c[27]; k[5]=k[11]=k[13]=c[31]; k[3]=k[15]=k[17]=k[19]=0;
/*** DECREMENT ***/
overshoot-=carry;
k[2]=k[8]=k[10]=c[28]; k[0]=k[4]=k[6]=k[16]=0; k[12]=k[14]=k[18]=c[38];
k[3]=k[9]=k[11]=c[29]; k[1]=k[5]=k[7]=k[17]=0; k[13]=k[15]=k[19]=c[39];
/*** DECREMENT ***/
overshoot-=carry;
b=modulus->w;
while(overshoot>0)
{
/*** DECREMENT ***/
overshoot-=carry;
}
while (overshoot<0)
{
/*** INCREMENT ***/
overshoot+=carry;
}
if (z->w[MR_COMBA-1]>=modulus->w[MR_COMBA-1])
{
if (mr_compare(z,modulus)>=0)
{
/*** DECREMENT ***/
}
}
if (z->w[MR_COMBA-1]==0) mr_lzero(z);
#endif
#endif
#if MIRACL==16
#if MR_COMBA==10
/* 2^160-2^112+2^64+1 */
a=t->w; b=k; c=z->w;
k[0]=k[4]=a[19]; k[1]=k[2]=k[3]=k[5]=k[6]=0; k[7]=a[10]; k[8]=a[11]; k[9]=a[12];
/*** ADDITION ***/
overshoot=carry;
a=c; c=t->w;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -