/* mrmuldv.any -- collected assembler / inline-assembler implementations
   of the MIRACL primitives muldiv, muldvm, muldvd and muldvd2 */
q= (mr_small)am*bm;
middle=(mr_small)al*bm;
middle2=(mr_small)bl*am;
middle+=middle2; /* combine them - carefully */
if (middle<middle2) q+=((mr_small)1<<hshift);
r+=(middle << hshift);
if ((r>>hshift)<(unsigned mr_hltype)middle) q++;
q+=(middle>>hshift);
r+=*c;
if (r<*c) q++;
r+=*rp;
if (r<*rp) q++;
*rp=r;
*c=q;
}
*************************************************************************
/* SPARC assembler version of above. Note that when Full-width base
working is used, then muldvd() is the most time-critical of these
three routines. Use with above Blakely-Sloan C versions of muldvm
and muldiv (Assumes mr_utype is 32 bit int) */
! _muldvd(a,b,c,rp): 32x32->64 unsigned multiply-accumulate for SPARCs
! without a UMUL instruction.  Computes a*b+c; the low 32 bits are
! stored at *rp and the high 32 bits returned in %o0.
! On entry: a=%o0, b=%o1, c=%o2, rp=%o3.
.global _muldvd
_muldvd:
mov %o1,%y          ! multiplier b -> Y register
andcc %g0,%g0,%o4   ! zero the partial product; clear N and V flags
nop                 ! allow for the Y-register write latency
nop
! 32 multiply-step instructions: each step shifts one bit of b out of
! Y and conditionally adds the multiplicand a into the partial product.
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%g0,%o4  ! final step aligns the 64-bit product
tst %o0             ! mulscc is a signed step; if a's top bit is set
bge 1f              ! it was treated as negative, so ...
nop                 ! (branch delay slot)
add %o4,%o1,%o4     ! ... add b to the high word to fix to unsigned
1:
rd %y,%o1           ! low 32 bits of a*b
addcc %o1,%o2,%o1   ! add the carry-in c to the low word
st %o1,[%o3]        ! *rp = low word
retl
addxcc %o4,%g0,%o0  ! (delay slot) return high word plus carry in %o0
**************************************************************************
/* If you have a "decent" SPARC which supports UMUL and UDIV instructions
then the following will be much faster. Cut and paste what follows
into mrmuldv.s. See miracl.mak make file
Aside: God, I hate the Sparc, with its slippery ill-defined Instruction
set. Not all implementations support UMUL and UDIV, so it's safer
to use the method above.
Note: Sometimes the routine name needs a preceding underscore,
so it may be necessary to change for example muldvd to _muldvd
through-out. Depends on the Unix version
*/
! muldvd(a,b,c,rp) using the hardware UMUL instruction.
! Returns the high word of a*b+c in %o0; low word stored at *rp (%o3).
.global muldvd
muldvd:
umul %o0,%o1,%o0    ! %o0 = low(a*b); Y = high(a*b)
rd %y,%o1           ! %o1 = high word of product
addcc %o0,%o2,%o0   ! add carry-in c to the low word
st %o0,[%o3]        ! *rp = low word
retl
addx %o1,%g0,%o0    ! (delay slot) return high word plus carry
! muldvd2(a,b,c,rp): double-length multiply-accumulate.
! Computes a*b + *c + *rp; the low word is stored back at *rp (%o3)
! and the carry-out high word at *c (%o2).
.global muldvd2
muldvd2:
umul %o0,%o1,%o0    ! %o0 = low(a*b); Y = high(a*b)
rd %y,%o1           ! %o1 = high(a*b)
ld [%o2],%o5        ! fetch carry-in *c
addcc %o0,%o5,%o0   ! low += *c
ld [%o3],%o5        ! fetch accumulator *rp
addx %o1,%g0,%o1    ! fold carry into the high word
addcc %o0,%o5,%o0   ! low += *rp
st %o0,[%o3]        ! *rp = low word
addx %o1,%g0,%o1    ! fold second carry into the high word
retl
st %o1,[%o2]        ! (delay slot) *c = high word
! muldvm(a,c,m,rp): divides the double-length value with high word a
! and low word c by m.  Quotient returned in %o0; remainder stored
! at *rp (%o3).
.global muldvm
muldvm:
mov %o0,%y          ! high word a -> Y for the 64/32 divide
nop                 ! allow for the Y-register write latency
nop
nop
udiv %o1,%o2,%o0    ! %o0 = {Y:c} / m
umul %o0,%o2,%o2    ! %o2 = low word of quotient*m
sub %o1,%o2,%o1     ! remainder = c - low(q*m) (exact mod 2^32 since r < m)
retl
st %o1,[%o3]        ! (delay slot) *rp = remainder
! muldiv(a,b,c,m,rp): returns q = (a*b+c)/m in %o0 and stores the
! remainder (a*b+c) mod m at *rp (%o4).
.global muldiv
muldiv:
umul %o0,%o1,%o1    ! %o1 = low(a*b); Y = high(a*b)
rd %y,%o0           ! %o0 = high(a*b)
addcc %o1,%o2,%o1   ! add c to the low word
addx %o0,%g0,%o0    ! fold carry into the high word
mov %o0,%y          ! high word -> Y for the 64/32 divide
nop                 ! allow for the Y-register write latency
nop
nop
udiv %o1,%o3,%o0    ! %o0 = (a*b+c) / m
umul %o0,%o3,%o2    ! %o2 = low word of quotient*m
sub %o1,%o2,%o1     ! remainder = low - low(q*m)
retl
st %o1,[%o4]        ! (delay slot) *rp = remainder
/* In-line assembly for SPARC using double type */
#include <stdio.h>
#include "miracl.h"
/* muldiv(a,b,c,m,rp) with mr_small carried in a double: returns
 * q = (a*b+c)/m and stores the remainder a*b+c - q*m at *rp.
 * The product a*b+c is formed exactly in a SPARC quad-precision
 * (128-bit) FP register, divided by m, and then adding and subtracting
 * the MR_MAGIC constant rounds the quotient to a whole number.
 * NOTE(review): this relies on round-to-nearest FP mode and on
 * MR_MAGIC from miracl.h being large enough - confirm. */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
mr_small q;
mr_large ldres,dble;            /* declared but not referenced here */
static mr_small magic=MR_MAGIC; /* rounding constant from miracl.h */
__asm__ __volatile__ (
"fdmulq %1,%2,%%f0\n"           /* f0(quad) = a*b, exact */
"fdtoq %3,%%f4\n"               /* f4 = (quad)c */
"faddq %%f0,%%f4,%%f0\n"        /* f0 = a*b + c */
"fdtoq %4,%%f4\n"               /* f4 = (quad)m */
"fdivq %%f0,%%f4,%%f4\n"        /* f4 = (a*b+c)/m */
"fdtoq %5,%%f8\n"               /* f8 = (quad)magic */
"faddq %%f4,%%f8,%%f4\n"        /* add then subtract magic: rounds */
"fsubq %%f4,%%f8,%%f4\n"        /* the quotient to an integer value */
"fqtod %%f4,%0\n"               /* q = (double)quotient */
"fdmulq %0,%4,%%f8\n"           /* f8 = q*m, exact in quad */
"fsubq %%f0,%%f8,%%f0\n"        /* f0 = remainder = a*b+c - q*m */
"fqtod %%f0,%%f10\n"            /* narrow remainder back to double */
"std %%f10,[%6]\n"              /* *rp = remainder */
: "=f"(q)
: "f"(a),"f"(b),"f"(c),"f"(m),"f"(magic),"r"(rp)
: "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
);
return q;
}
#ifdef MR_FP_ROUNDING
/* imuldiv(a,b,c,m,im,rp): as muldiv above, but the caller supplies
 * im, a precomputed quad-precision reciprocal of m, so the quotient
 * is obtained by multiplication instead of division (faster when the
 * same modulus is reused).  Same magic-constant rounding trick.
 * NOTE(review): assumes round-to-nearest FP mode - confirm. */
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
mr_small q;
mr_large ldres,dble;            /* declared but not referenced here */
static mr_small magic=MR_MAGIC; /* rounding constant from miracl.h */
__asm__ __volatile__ (
"fdmulq %1,%2,%%f0\n"           /* f0(quad) = a*b, exact */
"fdtoq %3,%%f4\n"               /* f4 = (quad)c */
"faddq %%f0,%%f4,%%f0\n"        /* f0 = a*b + c */
"fmulq %4,%%f0,%%f4\n"          /* f4 = im*(a*b+c): multiply by 1/m */
"fdtoq %6,%%f8\n"               /* f8 = (quad)magic */
"faddq %%f4,%%f8,%%f4\n"        /* add then subtract magic: rounds */
"fsubq %%f4,%%f8,%%f4\n"        /* the quotient to an integer value */
"fqtod %%f4,%0\n"               /* q = (double)quotient */
"fdmulq %0,%5,%%f8\n"           /* f8 = q*m, exact in quad */
"fsubq %%f0,%%f8,%%f0\n"        /* f0 = remainder = a*b+c - q*m */
"fqtod %%f0,%%f10\n"            /* narrow remainder back to double */
"std %%f10,[%7]\n"              /* *rp = remainder */
: "=f"(q)
: "f"(a),"f"(b),"f"(c),"f"(im),"f"(m),"f"(magic),"r"(rp)
: "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
);
return q;
}
#endif
/* before leaving the SPARC, here is an interesting idea
Specify the underlying type as the 64-bit long long, as supported by the
GCC compiler. Use the Blakely-Sloan Portable Code above, with mr_hltype
defined as a long. This has been tried and works, getting 64-bit
behaviour from a 32-bit processor! It's slower than the 32-bit code above,
but if the 64-bit mrmuldvd() were rewritten in fast assembler.....? */
************************************************************************
#########################################################################################
#
# mrmuldv.s
# author: G. Garth Feb 1996
#
# implementation of modular multiplication for smalls
# using Blakely-Sloan division algorithm
# for Motorola 601 and 604 RISC PowerPC 32-bit processors
# see IEEE trans. Computers C-34, No. 3, March 1985 pp. 290-292
#
# see also PowerPC Microprocessor Developer's guide
# by Bunda, Potter & Shadowen, SAMS 1995, Appendix A p. 177
#
# intended for use in MIRACL library as assembly language implementation
# of routines muldiv, muldvm and muldvd
# written for Apple MPW PPC Assembler for Macintosh PPC computers
#
# Division Algorithm Pseudo Code
# given: integers A,B,C,D and M where D = A * B + C
# this algorithm computes Q and R such that
# D = M * Q + R
# Constraints:
# A,B,C,M < 2^H where H is word length in bits
# 0 <= Q,R < M; 0 < D < 2^(2*H)
#
# let K = # of bits in D
#
# R = Q = 0;
# for(T = K - 1; T >= 0; T--)
# {
# R <<= 1;
# Q <<= 1;
# if(D[T] == 1)
# {
# R += 1;
# }
# while(R >= M)
# {
# R -= M;
# Q += 1;
# }
# }
#
#########################################################################################
# PowerOpen/MPW ABI boilerplate: export the function descriptors ([DS])
# and code sections ([PR]) for the three routines, create their TOC
# entries, and lay down each two-word descriptor (entry point address
# followed by the TOC anchor).
export muldiv[DS]
export .muldiv[PR]
export muldvm[DS]
export .muldvm[PR]
export muldvd[DS]
export .muldvd[PR]
toc
tc muldiv[TC],muldiv[DS]
tc muldvm[TC],muldvm[DS]
tc muldvd[TC],muldvd[DS]
csect muldiv[DS]
dc.l .muldiv[PR]
dc.l TOC[tc0]
csect muldvm[DS]
dc.l .muldvm[PR]
dc.l TOC[tc0]
csect muldvd[DS]
dc.l .muldvd[PR]
dc.l TOC[tc0]
#
# unsigned int muldiv(a,b,c,m,rp)
# unsigned int a,b,c,m,*rp;
# returns q = int[(a*b+c)/m] and *rp = (a*b+c) mod m
# when called a -> (r3), b -> (r4), c -> (r5), m -> (r6), rp -> (r7)
# upon return q -> (r3), *rp -> [(r12)]
# registers used: r3 thru r12
#
# .muldiv: q = (a*b+c)/m, remainder stored at *rp.  Fast path uses the
# hardware divwu when the 64-bit dividend a*b+c fits in one word;
# otherwise falls through to a bit-serial shift/subtract division loop
# (Blakely-Sloan, see file header pseudo code).
csect .muldiv[PR]
function .muldiv[PR]
or r12,r7,r7 ;(r12) <- remainder address
mulhwu r8,r3,r4 ;(r8) <- a * b high word
mullw r9,r3,r4 ;(r9 ) <- a * b low word
addc r4,r5,r9 ;(r4) <- a * b + c dividend.lo
addze r3,r8 ;(r3) <- (r8) + XERca dividend.hi
subic. r5,r3,0 ;test for zero dividend.hi
bne divlong ;
;here if dividend is single word
divwu r3,r4,r6 ;(r3) <- quotient
mullw r7,r6,r3; ;(r7) <- r6 * int (r4 / r6)
subf r5,r7,r4 ;(r5) <- remainder.lo
stw r5,0x0000(r12) ;[(r12)] <- remainder
blr ;that's all for single word division
divlong:
# Precompute the two's complement of the 64-bit divisor {0:m} in
# (r7:r8) so the loop can "subtract" with addc/adde and get the
# comparison result for free in the condition register.
xor r7,r7,r7 ;zero divisor.hi
nor r7,r7,r7 ;calc ~divisor.hi
subfic r8,r6,0 ;(r8) <- -divisor.lo, set CA
addze r7,r7 ;(r7) <- ~divisor.hi + CA
or r11,r4,r4 ;(r11) <- dividend.lo
or r4,r3,r3 ;(r4) <- dividend.hi
;try to shift ahead, skipping unnecessary
;shifting loops
cntlzw r10,r4 ;find order of dividend.hi
subfic r9,r10,32 ;calc shift = 32 - order
slw r4,r4,r10 ;shift ahead dividend.hi
srw r3,r11,r9 ;get shifted part of dividend.lo
or r4,r4,r3 ;combine with dividend.hi
slw r11,r11,r10 ;shift ahead dividend.lo
addi r9,r9,33 ;setup for looping
mtctr r9 ;
xor r3,r3,r3 ;clear quotient.lo
xor r5,r5,r5 ;clear shift.hi
xor r6,r6,r6 ;clear shift.lo
b ldiff ;skip first round of shifting
align 6 ;align loop to 64-byte boundary
lshift:
# One division step: shift the 128-bit register chain
# {shift.hi, shift.lo, dividend.hi, dividend.lo} left one bit,
# then test whether the divisor can be subtracted from shift.
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
ldiff:
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
blt lloop ;loop if diff < 0
or r6,r10,r10 ;shift.lo = diff.lo
or r5,r9,r9 ;shift.hi = diff.hi
ori r3,r3,1 ;set bit in quotient
lloop:
bdnz lshift ;loop until done
stw r6,0x0000(r12) ;store remainder in rp address
blr ;return
#
# unsigned int muldvm(a,c,m,rp)
# unsigned int a,c,m,*rp;
# returns q = int[(a*base+c)/m] and *rp = (a*base+c) mod m
# when called a -> (r3), c -> (r4), m -> (r5), rp -> (r6)
# upon return q -> (r3), *rp -> [(r12)]
# registers used: r3 thru r12
#
# .muldvm: q = (a*base+c)/m with a as the high word and c as the low
# word of the 64-bit dividend; remainder stored at *rp.  Same bit-serial
# shift/subtract division loop as .muldiv's divlong path.
csect .muldvm[PR]
function .muldvm[PR]
or r12,r6,r6 ;(r12) <- remainder address
or r6,r5,r5 ;(r6) <- m
# Two's complement of the 64-bit divisor {0:m} in (r7:r8) so the loop
# can subtract via addc/adde.
xor r7,r7,r7 ;zero divisor.hi
nor r7,r7,r7 ;calc ~divisor.hi
subfic r8,r6,0 ;(r8) <- calc -divisor.lo, set CA
addze r7,r7 ;(r7) <- ~divisor.hi += CA
or r11,r4,r4 ;(r11) <- dividend.lo
or r4,r3,r3 ;(r4) <- dividend.hi
;try to shift ahead, skipping unnecessary
;shifting loops
cntlzw r10,r4 ;find order of dividend.hi
subfic r9,r10,32 ;calc shift = 32 - order
slw r4,r4,r10 ;shift ahead dividend.hi
srw r3,r11,r9 ;get shifted part of dividend.lo
or r4,r4,r3 ;combine with dividend.hi
slw r11,r11,r10 ;shift ahead dividend.lo
addi r9,r9,33 ;setup for looping
mtctr r9 ;
xor r3,r3,r3 ;clear quotient.lo
xor r5,r5,r5 ;clear shift.hi
xor r6,r6,r6 ;clear shift.lo
b sdiff ;skip first round of shifting
align 6 ;align loop to 64-byte boundary
sshift:
# Shift the 128-bit {shift.hi, shift.lo, dividend.hi, dividend.lo}
# chain left one bit per iteration.
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
sdiff:
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
blt sloop ;loop if diff < 0
or r6,r10,r10 ;shift.lo = diff.lo
or r5,r9,r9 ;shift.hi = diff.hi
ori r3,r3,1 ;set bit in quotient
sloop:
bdnz sshift
stw r6,0x0000(r12) ;store remainder in rp address
blr ;return
#
# unsigned int muldvd(a,b,c,rp)
# unsigned int a,b,c,*rp;
# returns q = int[(a*b+c)/base] and *rp = (a*b+c) mod base
# when called a -> (r3), b -> (r4), c -> (r5), rp -> (r6)
# upon return q -> (r3), *rp -> [(r6)]
# registers used: r3 thru r8
#
# .muldvd: q = high word of a*b+c (i.e. division by base 2^32 is just
# taking the high word); the low word is stored at *rp.  A single
# mulhwu/mullw pair with carry propagation - the most time-critical
# of the three routines when full-width base is used.
csect .muldvd[PR]
function .muldvd[PR]
mulhwu r7,r3,r4 ;(r7) <- a * b high word
mullw r8,r3,r4 ;(r8) <- a * b low word
addc r4,r8,r5 ;(r4) <- a * b + c
addze r3,r7 ;(r3) <- (r7) + XERca
stw r4,0x0000(r6) ;store remainder -> (r6)
blr ;return
*****************************************************************************/
/* Itanium code for Intel compiler, with mr_small a 64-bit long */
#include "miracl.h"
mr_small muldiv(a,b,c,m,rp)
mr_small a,b,c,m;
mr_small *rp;
{ /* Blakely-Sloan */
int i;
mr_small d,q=0,r=0;
/* NOTE: the remainder of this Itanium Blakely-Sloan muldiv was lost
   during extraction (it was replaced by web-viewer UI text: keyboard
   shortcut help).  The bit-by-bit division loop over d = a*b+c and
   the function's closing brace are missing and must be restored from
   the original MIRACL distribution. */