📄 mrmuldv.any
字号:
/*
* MIRACL - various implementations of routines muldiv, muldvm, muldvd
* muldvd2 and imuldiv
* mrmuldv.c
*
* THIS FILE CONTAINS MANY VERSIONS OF THESE ROUTINES
* COPY THIS FILE TO MRMULDV.C AND DELETE THOSE PARTS IRRELEVANT TO
* YOUR REQUIREMENTS.
*
* NOTE: - This file and its contents are not needed
* if MR_NOASM is defined in mirdef.h
*
* muldiv() calculates (a*b+c)/m and (a*b+c)%m as quickly as possible. Should
* ideally be written in assembly language of target machine for speed
* The problem is to avoid overflow in the calculation of the intermediate
* product a*b+c.
*
* If using a floating-point underlying type, and rounding can be
* controlled, it makes sense to pre-calculate
* the inverse of the modulus m, and multiply instead of divide
* In this situation a function imuldiv() is also needed.
*
* muldvm() and muldvd() routines are necessary to support full-width number
* base working. They are not needed if MR_NOFULLWIDTH is defined in mirdef.h.
*
* muldvm - returns (a*base+c)/m and remainder
* muldvd - returns (a*b+c)/base and remainder
*
* NOTE: New to version 4.2, new routine muldvd2() is required.
* See C version below for specification
* Versions of this are easily developed from existing muldvd() programs
*
* In most applications muldvd2() will be the time critical routine.
*
* Note that full-width base working may not be possible for all processors.
* For example it cannot be used on a VAX, or RS/6000 with mr_utypes defined
* as ints. This is because the instruction set does not support
* unsigned multiply and divide instructions. In such cases ALWAYS use a
* maximum base of MAXBASE in mirsys(), rather than 0.
*
* Since parameter passing and returning is time-consuming, these routines
* should be generated 'inline', if compiler allows it. Parameter passing
* by register will also be faster than via the stack. For even faster
* operation, use in-line assembly to speed up the inner loops of routines
* pmul(), sdiv(), multiply() and divide(). See these routines for details
* of Microsoft/Borland C inline 80x86 assembly, which gives a substantial speed-up.
*
* NOTE: All other things being equal, versions of MIRACL with 32-bit mr_utypes
* will run 3-4 times faster than versions with 16-bit mr_utypes, even for medium
* precision arithmetic, such as used in Public Key systems.
*
* Note that a portable C version of 'muldiv' may not possible with some
* 32-bit compilers if ints and longs are both 32-bits and there is no
* 64-bit type. Fortunately these days there usually is such a type - called
* perhaps long long, or maybe __int64. See also the Blakely-Sloan
* method below. In any case the portable versions may be used if mr_utypes
* are defined as shorts, usually 16 bits. This would amount however to
* using the 32-bit processor in a 16 bit mode and would be very inefficient
* - up to 4 times slower. See mirdef.haf
*
* First the standard portable versions, for use when there is a double
* length type capable of holding the product of two mr_utype types.
* For example 32 and 16 bits types respectively.
* Note that if MR_NOASM is defined in mirdef.h, these routines are
* implemented in mrcore.c, and do not need to be extracted from here.
*
* This is followed by various other assembly language implementations for
* popular processors, computers and compilers.
*
**************************************************************
/* Standard C version of mrmuldv.c */
#include <stdio.h>
#include "miracl.h"
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
mr_small q;
mr_large ldres,dble=(mr_large)a*b+c;
q=(mr_small)MR_LROUND(dble/m);
*rp=(mr_small)(dble-(mr_large)q*m);
return q;
}
#ifdef MR_FP_ROUNDING
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
mr_small q;
mr_large ldres,dble=(mr_large)a*b+c;
q=(mr_small)MR_LROUND(dble*im);
*rp=(mr_small)(dble-(mr_large)q*m);
return q;
}
#endif
#ifndef MR_NOFULLWIDTH
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{
mr_small q;
union doubleword dble;
dble.h[MR_BOT]=c;
dble.h[MR_TOP]=a;
q=(mr_small)(dble.d/m);
*rp=(mr_small)(dble.d-(mr_large)q*m);
return q;
}
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
union doubleword dble;
dble.d=(mr_large)a*b+c;
*rp=dble.h[MR_BOT];
return dble.h[MR_TOP];
}
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
union doubleword dble;
dble.d=(mr_large)a*b+*c+*rp;
*rp=dble.h[MR_BOT];
*c=dble.h[MR_TOP];
}
#endif
****************************************************************
//
// Version of muldiv() for use with underlying type a double
// and using the FP co-processor on a Pentium, and the gcc compiler.
// In this case MR_NOFULLWIDTH is defined.
// This is much better than compiling the above, but fprem and fdiv
// are still very slow.
//
.file "mrmuldv.s"
.text
.globl _muldiv
_muldiv:
pushl %ebx
fldl 8(%esp)
fmull 16(%esp)
movl 40(%esp),%ebx
faddl 24(%esp)
fldl 32(%esp)
fld %st(1)
// NOTE: If rounding control is possible, set rounding to "chop"
// and replace lines below with these
// In this case #define MR_FP_ROUNDING will be defined in mirdef.h
//
// fdiv %st(1),%st
// fistpq 8(%esp)
// fildq 8(%esp)
// fmul %st,%st(1)
// fxch %st(2)
// fsubp %st,%st(1)
// fstpl (%ebx)
fprem
fstl (%ebx)
fsubrp %st,%st(2)
fdivrp %st,%st(1)
popl %ebx
ret
//
// If MR_FP_ROUNDING is defined, this function will be needed for Pentium
//
.globl _imuldiv
_imuldiv:
pushl %ebx
fldl 8(%esp)
fmull 16(%esp)
movl 52(%esp),%ebx
faddl 24(%esp)
fldl 32(%esp)
fld %st(1)
fldt 40(%esp)
fmulp %st,%st(1)
fistpq 8(%esp)
fildq 8(%esp)
fmul %st,%st(1)
fxch %st(2)
fsubp %st,%st(1)
fstpl (%ebx)
popl %ebx
ret
************************************************************************
/*
* Borland C++ 32-bit compiler (BCC32) version of the above.
* Uses inline assembly feature. Suitable for Win32 Apps
* Also compatible with Microsoft Visual C++ 32-bit compiler
* BUT change TBYTE to QWORD
*/
#include "mirdef.h"
#define ASM _asm
double muldiv(double a,double b,double c,double m,double *rp)
{
ASM fld QWORD PTR a
ASM fmul QWORD PTR b
ASM mov ebx,DWORD PTR rp
ASM fadd QWORD PTR c
ASM fld QWORD PTR m
ASM fld st(1)
#ifdef MR_FP_ROUNDING
ASM fdiv st,st(1)
ASM fistp QWORD PTR [ebx]
ASM fild QWORD PTR [ebx]
ASM fmul st(1),st
ASM fxch st(2)
ASM fsubrp st(1),st
ASM fstp QWORD PTR [ebx]
#else
ASM fprem
ASM fst QWORD PTR [ebx]
ASM fsubp st(2),st
ASM fdivp st(1),st
#endif
}
#ifdef MR_FP_ROUNDING
double imuldiv(double a,double b,double c,double m,long double im,double *rp)
{
ASM fld QWORD PTR a
ASM fmul QWORD PTR b
ASM fld QWORD PTR m
ASM fxch st(1)
ASM fadd QWORD PTR c
ASM mov ebx,DWORD PTR rp
ASM fxch st(1)
ASM fld st(1)
ASM fld TBYTE PTR im /* QWORD for Microsoft */
ASM fmulp st(1),st
ASM fistp QWORD PTR [ebx]
ASM fild QWORD PTR [ebx]
ASM fmul st(1),st
ASM fxch st(2)
ASM fsubrp st(1),st
ASM fstp QWORD PTR [ebx]
}
#endif
*********************************************************************
;
; VAX11 version for Dec C compiler
; with 32 bit int using 64-bit quadword
; for the intermediate product.
;
; Use with mirdef.h32 - but define MR_NOFULLWIDTH
; Use mirsys(...,MAXBASE) instead of mirsys(...,0) in your programs.
;
; Why ...(MIRACL-2) instead of ...(MIRACL-1) ? That's a negative
; number for division by mr_base!
;
; The problem is that the emul and ediv instructions work only
; for signed types
;
.entry muldiv,0
subl #4,sp
emul 4(ap),8(ap),12(ap),r0 ;a*b+c
ediv 16(ap),r0,r0,@20(ap) ;quo. in r0, rem. in *rp
ret
.end
;
; Fullwidth base working not possible on VAX, so no muldvm() or muldvd()
;
;
**********************************************************************
#
# Version of muldiv.c for IBM RS/6000
# This processor has no unsigned multiply/divide
# so full-width base not possible, so no muldvm() or muldvd()
#
# Use with mirdef.h32 but define MR_NOFULLWIDTH definition.
# Use mirsys(...,MAXBASE) instead of mirsys(...,0) in your programs.
#
# Note this version was developed from very inadequate RS/6000
# documentation. It may not be optimal, and it may not always work
# (although it works fine for me!)
#
.file "mrmuldv.s"
.globl .muldiv[PR]
.csect .muldiv[PR]
# parameters are passed in registers 3,4,5,6 and 7
# the mq register holds the low 32-bits for mul/div
mul 3,4,3 # q=a*b
mfmq 4 # get low part from mq
a 4,5,4 # add in c
aze 3,3 # add carry to high part
mtmq 4 # move low part to mq
div 3,3,6 # q=(a*b+c)/m
mfmq 4 # get remainder
st 4,0(7) # store remainder
# quotient is returned in register 3
brl
************************************************************************
/* Here's another portable method which might be considered for processors
* like the VAX and RS6000. The idea is due to Peter Montgomery. */
#include "mirdef.h"
typedef unsigned mr_utype uint;
uint muldiv(a,b,c,m,rp)
uint a,b,c,m,*rp;
{
int q,r;
q=(int)(0.5+((double)a*(double)b+(double)c)/(double)m);
r=(int)(((uint)a*(uint)b+(uint)c)-(uint)m*(uint)q);
if (r < 0)
{
r+=m;
q--;
}
*rp=r;
return q;
}
**********************************************************************
;
; IBM-PC versions - small memory model only
; Easily modified for other memory models
;
; For large code models (e.g. medium)
;
; change _TEXT to mrmuldv_TEXT (in three places)
; change NEAR to FAR
; change [bp+4] to [bp+6]
; change [bp+6] to [bp+8]
; change [bp+8] to [bp+10]
; change [bp+10] to [bp+12]
; change [bp+12] to [bp+14]
;
; For large data models, see Turbo C version below for required modification
;
; Microsoft C compiler V4.0+
; Written for MS macro-assembler
;
ASSUME CS:_TEXT
_TEXT SEGMENT BYTE PUBLIC 'CODE'
PUBLIC _muldiv
_muldiv PROC NEAR
push bp ;standard C linkage
mov bp,sp
mov ax,[bp+4] ;get a
mul WORD PTR [bp+6] ;multiply by b
add ax,[bp+8] ;add c to low word
adc dx,0h ;add carry to high word
div WORD PTR [bp+10] ;divide by m
mov bx,[bp+12] ;get address for remainder
mov [bx],dx ;store remainder
pop bp ;standard C return
ret ;quotient in ax
_muldiv endP
PUBLIC _muldvm
_muldvm PROC NEAR
push bp ;standard C linkage
mov bp,sp
mov dx,[bp+4] ;get a
mov ax,[bp+6] ;add in c
div WORD PTR [bp+8] ;divide by m
mov bx,[bp+10] ;get address for remainder
mov [bx],dx ;store remainder
pop bp ;standard C return
ret ;quotient in ax
_muldvm endP
PUBLIC _muldvd
_muldvd PROC NEAR
push bp ;standard C linkage
mov bp,sp
mov ax,[bp+4] ;get a
mul WORD PTR [bp+6] ;multiply by b
add ax,[bp+8] ;add c to low word
adc dx,0h ;add carry to high word
mov bx,[bp+10] ;get address for remainder
mov [bx],ax ;store remainder
mov ax,dx
pop bp ;standard C return
ret ;quotient in ax
_muldvd endP
PUBLIC _muldvd2
_muldvd2 PROC NEAR
push bp ;standard C linkage
mov bp,sp
push si
mov ax,[bp+4] ;get a
mul WORD PTR [bp+6] ;multiply by b
mov bx,[bp+8] ;get address for c
add ax,[bx] ;add c
adc dx,0h ;add carry to high word
mov si,[bp+10] ;get address for remainder
add ax,[si] ;add rp
adc dx,0h ;add carry to high word
mov [si],ax ;store remainder
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -