/* mrmuldv.any -- collected assembler / inline-assembler implementations
   of the MIRACL primitives muldiv, muldvm, muldvd and muldvd2 */
q= (mr_small)am*bm;
middle=(mr_small)al*bm;
middle2=(mr_small)bl*am;
middle+=middle2; /* combine them - carefully */
if (middle<middle2) q+=((mr_small)1<<hshift);
r+=(middle << hshift);
if ((r>>hshift)<(unsigned mr_hltype)middle) q++;
q+=(middle>>hshift);
r+=*c;
if (r<*c) q++;
r+=*rp;
if (r<*rp) q++;
*rp=r;
*c=q;
}
*************************************************************************
/* SPARC assembler version of above. Note that when Full-width base
working is used, then muldvd() is the most time-critical of these
three routines. Use with above Blakely-Sloan C versions of muldvm
and muldiv (Assumes mr_utype is 32 bit int) */
! _muldvd(a,b,c,rp): 32x32->64 unsigned multiply-accumulate for SPARCs
! without a UMUL instruction.  Computes a*b+c; the low 32 bits are
! stored at *rp and the high 32 bits returned in %o0.
! On entry: a=%o0, b=%o1, c=%o2, rp=%o3.
.global _muldvd
_muldvd:
mov %o1,%y          ! multiplier b -> Y register
andcc %g0,%g0,%o4   ! zero the partial product; clear N and V flags
nop                 ! allow for the Y-register write latency
nop
! 32 multiply-step instructions: each step shifts one bit of b out of
! Y and conditionally adds the multiplicand a into the partial product.
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%g0,%o4  ! final step aligns the 64-bit product
tst %o0             ! mulscc is a signed step; if a's top bit is set
bge 1f              ! it was treated as negative, so ...
nop                 ! (branch delay slot)
add %o4,%o1,%o4     ! ... add b to the high word to fix to unsigned
1:
rd %y,%o1           ! low 32 bits of a*b
addcc %o1,%o2,%o1   ! add the carry-in c to the low word
st %o1,[%o3]        ! *rp = low word
retl
addxcc %o4,%g0,%o0  ! (delay slot) return high word plus carry in %o0
**************************************************************************
/* If you have a "decent" SPARC which supports UMUL and UDIV instructions
then the following will be much faster. Cut and paste what follows
into mrmuldv.s. See miracl.mak make file
Aside: God, I hate the Sparc, with its slippery ill-defined Instruction
set. Not all implementations support UMUL and UDIV, so it's safer
to use the method above.
Note: Sometimes the routine name needs a preceding underscore,
so it may be necessary to change for example muldvd to _muldvd
through-out. Depends on the Unix version
*/
! muldvd(a,b,c,rp) using the hardware UMUL instruction.
! Returns the high word of a*b+c in %o0; low word stored at *rp (%o3).
.global muldvd
muldvd:
umul %o0,%o1,%o0    ! %o0 = low(a*b); Y = high(a*b)
rd %y,%o1           ! %o1 = high word of product
addcc %o0,%o2,%o0   ! add carry-in c to the low word
st %o0,[%o3]        ! *rp = low word
retl
addx %o1,%g0,%o0    ! (delay slot) return high word plus carry
! muldvd2(a,b,c,rp): double-length multiply-accumulate.
! Computes a*b + *c + *rp; the low word is stored back at *rp (%o3)
! and the carry-out high word at *c (%o2).
.global muldvd2
muldvd2:
umul %o0,%o1,%o0    ! %o0 = low(a*b); Y = high(a*b)
rd %y,%o1           ! %o1 = high(a*b)
ld [%o2],%o5        ! fetch carry-in *c
addcc %o0,%o5,%o0   ! low += *c
ld [%o3],%o5        ! fetch accumulator *rp
addx %o1,%g0,%o1    ! fold carry into the high word
addcc %o0,%o5,%o0   ! low += *rp
st %o0,[%o3]        ! *rp = low word
addx %o1,%g0,%o1    ! fold second carry into the high word
retl
st %o1,[%o2]        ! (delay slot) *c = high word
! muldvm(a,c,m,rp): divides the double-length value with high word a
! and low word c by m.  Quotient returned in %o0; remainder stored
! at *rp (%o3).
.global muldvm
muldvm:
mov %o0,%y          ! high word a -> Y for the 64/32 divide
nop                 ! allow for the Y-register write latency
nop
nop
udiv %o1,%o2,%o0    ! %o0 = {Y:c} / m
umul %o0,%o2,%o2    ! %o2 = low word of quotient*m
sub %o1,%o2,%o1     ! remainder = c - low(q*m) (exact mod 2^32 since r < m)
retl
st %o1,[%o3]        ! (delay slot) *rp = remainder
! muldiv(a,b,c,m,rp): returns q = (a*b+c)/m in %o0 and stores the
! remainder (a*b+c) mod m at *rp (%o4).
.global muldiv
muldiv:
umul %o0,%o1,%o1    ! %o1 = low(a*b); Y = high(a*b)
rd %y,%o0           ! %o0 = high(a*b)
addcc %o1,%o2,%o1   ! add c to the low word
addx %o0,%g0,%o0    ! fold carry into the high word
mov %o0,%y          ! high word -> Y for the 64/32 divide
nop                 ! allow for the Y-register write latency
nop
nop
udiv %o1,%o3,%o0    ! %o0 = (a*b+c) / m
umul %o0,%o3,%o2    ! %o2 = low word of quotient*m
sub %o1,%o2,%o1     ! remainder = low - low(q*m)
retl
st %o1,[%o4]        ! (delay slot) *rp = remainder
/* In-line assembly for SPARC using double type */
#include <stdio.h>
#include "miracl.h"
/* muldiv(a,b,c,m,rp) with mr_small carried in a double: returns
 * q = (a*b+c)/m and stores the remainder a*b+c - q*m at *rp.
 * The product a*b+c is formed exactly in a SPARC quad-precision
 * (128-bit) FP register, divided by m, and then adding and subtracting
 * the MR_MAGIC constant rounds the quotient to a whole number.
 * NOTE(review): this relies on round-to-nearest FP mode and on
 * MR_MAGIC from miracl.h being large enough - confirm. */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
mr_small q;
mr_large ldres,dble;            /* declared but not referenced here */
static mr_small magic=MR_MAGIC; /* rounding constant from miracl.h */
__asm__ __volatile__ (
"fdmulq %1,%2,%%f0\n"           /* f0(quad) = a*b, exact */
"fdtoq %3,%%f4\n"               /* f4 = (quad)c */
"faddq %%f0,%%f4,%%f0\n"        /* f0 = a*b + c */
"fdtoq %4,%%f4\n"               /* f4 = (quad)m */
"fdivq %%f0,%%f4,%%f4\n"        /* f4 = (a*b+c)/m */
"fdtoq %5,%%f8\n"               /* f8 = (quad)magic */
"faddq %%f4,%%f8,%%f4\n"        /* add then subtract magic: rounds */
"fsubq %%f4,%%f8,%%f4\n"        /* the quotient to an integer value */
"fqtod %%f4,%0\n"               /* q = (double)quotient */
"fdmulq %0,%4,%%f8\n"           /* f8 = q*m, exact in quad */
"fsubq %%f0,%%f8,%%f0\n"        /* f0 = remainder = a*b+c - q*m */
"fqtod %%f0,%%f10\n"            /* narrow remainder back to double */
"std %%f10,[%6]\n"              /* *rp = remainder */
: "=f"(q)
: "f"(a),"f"(b),"f"(c),"f"(m),"f"(magic),"r"(rp)
: "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
);
return q;
}
#ifdef MR_FP_ROUNDING
/* imuldiv(a,b,c,m,im,rp): as muldiv above, but the caller supplies
 * im, a precomputed quad-precision reciprocal of m, so the quotient
 * is obtained by multiplication instead of division (faster when the
 * same modulus is reused).  Same magic-constant rounding trick.
 * NOTE(review): assumes round-to-nearest FP mode - confirm. */
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
mr_small q;
mr_large ldres,dble;            /* declared but not referenced here */
static mr_small magic=MR_MAGIC; /* rounding constant from miracl.h */
__asm__ __volatile__ (
"fdmulq %1,%2,%%f0\n"           /* f0(quad) = a*b, exact */
"fdtoq %3,%%f4\n"               /* f4 = (quad)c */
"faddq %%f0,%%f4,%%f0\n"        /* f0 = a*b + c */
"fmulq %4,%%f0,%%f4\n"          /* f4 = im*(a*b+c): multiply by 1/m */
"fdtoq %6,%%f8\n"               /* f8 = (quad)magic */
"faddq %%f4,%%f8,%%f4\n"        /* add then subtract magic: rounds */
"fsubq %%f4,%%f8,%%f4\n"        /* the quotient to an integer value */
"fqtod %%f4,%0\n"               /* q = (double)quotient */
"fdmulq %0,%5,%%f8\n"           /* f8 = q*m, exact in quad */
"fsubq %%f0,%%f8,%%f0\n"        /* f0 = remainder = a*b+c - q*m */
"fqtod %%f0,%%f10\n"            /* narrow remainder back to double */
"std %%f10,[%7]\n"              /* *rp = remainder */
: "=f"(q)
: "f"(a),"f"(b),"f"(c),"f"(im),"f"(m),"f"(magic),"r"(rp)
: "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
);
return q;
}
#endif
/* before leaving the SPARC, here is an interesting idea
Specify the underlying type as the 64-bit long long, as supported by the
GCC compiler. Use the Blakely-Sloan Portable Code above, with mr_hltype
defined as a long. This has been tried and works, getting 64-bit
behaviour from a 32-bit processor! It's slower than the 32-bit code above,
but if the 64-bit mrmuldvd() were rewritten in fast assembler.....? */
************************************************************************
#########################################################################################
#
# mrmuldv.s
# author: G. Garth Feb 1996
#
# implementation of modular multiplication for smalls
# using Blakely-Sloan division algorithm
# for Motorola 601 and 604 RISC PowerPC 32-bit processors
# see IEEE trans. Computers C-34, No. 3, March 1985 pp. 290-292
#
# see also PowerPC Microprocessor Developer's guide
# by Bunda, Potter & Shadowen, SAMS 1995, Appendix A p. 177
#
# intended for use in MIRACL library as assembly language implementation
# of routines muldiv, muldvm and muldvd
# written for Apple MPW PPC Assembler for Macintosh PPC computers
#
# Division Algorithm Pseudo Code
# given: integers A,B,C,D and M where D = A * B + C
# this algorithm computes Q and R such that
# D = M * Q + R
# Constraints:
# A,B,C,M < 2^H where H is word length in bits
# 0 <= Q,R < M; 0 < D < 2^(2*H)
#
# let K = # of bits in D
#
# R = Q = 0;
# for(T = K - 1; T >= 0; T--)
# {
# R <<= 1;
# Q <<= 1;
# if(D[T] == 1)
# {
# R += 1;
# }
# while(R >= M)
# {
# R -= M;
# Q += 1;
# }
# }
#
#########################################################################################
# PowerOpen/MPW ABI boilerplate: export the function descriptors ([DS])
# and code sections ([PR]) for the three routines, create their TOC
# entries, and lay down each two-word descriptor (entry point address
# followed by the TOC anchor).
export muldiv[DS]
export .muldiv[PR]
export muldvm[DS]
export .muldvm[PR]
export muldvd[DS]
export .muldvd[PR]
toc
tc muldiv[TC],muldiv[DS]
tc muldvm[TC],muldvm[DS]
tc muldvd[TC],muldvd[DS]
csect muldiv[DS]
dc.l .muldiv[PR]
dc.l TOC[tc0]
csect muldvm[DS]
dc.l .muldvm[PR]
dc.l TOC[tc0]
csect muldvd[DS]
dc.l .muldvd[PR]
dc.l TOC[tc0]
#
# unsigned int muldiv(a,b,c,m,rp)
# unsigned int a,b,c,m,*rp;
# returns q = int[(a*b+c)/m] and *rp = (a*b+c) mod m
# when called a -> (r3), b -> (r4), c -> (r5), m -> (r6), rp -> (r7)
# upon return q -> (r3), *rp -> [(r12)]
# registers used: r3 thru r12
#
# .muldiv: q = (a*b+c)/m, remainder stored at *rp.  Fast path uses the
# hardware divwu when the 64-bit dividend a*b+c fits in one word;
# otherwise falls through to a bit-serial shift/subtract division loop
# (Blakely-Sloan, see file header pseudo code).
csect .muldiv[PR]
function .muldiv[PR]
or r12,r7,r7 ;(r12) <- remainder address
mulhwu r8,r3,r4 ;(r8) <- a * b high word
mullw r9,r3,r4 ;(r9 ) <- a * b low word
addc r4,r5,r9 ;(r4) <- a * b + c dividend.lo
addze r3,r8 ;(r3) <- (r8) + XERca dividend.hi
subic. r5,r3,0 ;test for zero dividend.hi
bne divlong ;
;here if dividend is single word
divwu r3,r4,r6 ;(r3) <- quotient
mullw r7,r6,r3; ;(r7) <- r6 * int (r4 / r6)
subf r5,r7,r4 ;(r5) <- remainder.lo
stw r5,0x0000(r12) ;[(r12)] <- remainder
blr ;that's all for single word division
divlong:
# Precompute the two's complement of the 64-bit divisor {0:m} in
# (r7:r8) so the loop can "subtract" with addc/adde and get the
# comparison result for free in the condition register.
xor r7,r7,r7 ;zero divisor.hi
nor r7,r7,r7 ;calc ~divisor.hi
subfic r8,r6,0 ;(r8) <- -divisor.lo, set CA
addze r7,r7 ;(r7) <- ~divisor.hi + CA
or r11,r4,r4 ;(r11) <- dividend.lo
or r4,r3,r3 ;(r4) <- dividend.hi
;try to shift ahead, skipping unnecessary
;shifting loops
cntlzw r10,r4 ;find order of dividend.hi
subfic r9,r10,32 ;calc shift = 32 - order
slw r4,r4,r10 ;shift ahead dividend.hi
srw r3,r11,r9 ;get shifted part of dividend.lo
or r4,r4,r3 ;combine with dividend.hi
slw r11,r11,r10 ;shift ahead dividend.lo
addi r9,r9,33 ;setup for looping
mtctr r9 ;
xor r3,r3,r3 ;clear quotient.lo
xor r5,r5,r5 ;clear shift.hi
xor r6,r6,r6 ;clear shift.lo
b ldiff ;skip first round of shifting
align 6 ;align loop to 64-byte boundary
lshift:
# One division step: shift the 128-bit register chain
# {shift.hi, shift.lo, dividend.hi, dividend.lo} left one bit,
# then test whether the divisor can be subtracted from shift.
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
ldiff:
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
blt lloop ;loop if diff < 0
or r6,r10,r10 ;shift.lo = diff.lo
or r5,r9,r9 ;shift.hi = diff.hi
ori r3,r3,1 ;set bit in quotient
lloop:
bdnz lshift ;loop until done
stw r6,0x0000(r12) ;store remainder in rp address
blr ;return
#
# unsigned int muldvm(a,c,m,rp)
# unsigned int a,c,m,*rp;
# returns q = int[(a*base+c)/m] and *rp = (a*base+c) mod m
# when called a -> (r3), c -> (r4), m -> (r5), rp -> (r6)
# upon return q -> (r3), *rp -> [(r12)]
# registers used: r3 thru r12
#
# .muldvm: q = (a*base+c)/m with a as the high word and c as the low
# word of the 64-bit dividend; remainder stored at *rp.  Same bit-serial
# shift/subtract division loop as .muldiv's divlong path.
csect .muldvm[PR]
function .muldvm[PR]
or r12,r6,r6 ;(r12) <- remainder address
or r6,r5,r5 ;(r6) <- m
# Two's complement of the 64-bit divisor {0:m} in (r7:r8) so the loop
# can subtract via addc/adde.
xor r7,r7,r7 ;zero divisor.hi
nor r7,r7,r7 ;calc ~divisor.hi
subfic r8,r6,0 ;(r8) <- calc -divisor.lo, set CA
addze r7,r7 ;(r7) <- ~divisor.hi += CA
or r11,r4,r4 ;(r11) <- dividend.lo
or r4,r3,r3 ;(r4) <- dividend.hi
;try to shift ahead, skipping unnecessary
;shifting loops
cntlzw r10,r4 ;find order of dividend.hi
subfic r9,r10,32 ;calc shift = 32 - order
slw r4,r4,r10 ;shift ahead dividend.hi
srw r3,r11,r9 ;get shifted part of dividend.lo
or r4,r4,r3 ;combine with dividend.hi
slw r11,r11,r10 ;shift ahead dividend.lo
addi r9,r9,33 ;setup for looping
mtctr r9 ;
xor r3,r3,r3 ;clear quotient.lo
xor r5,r5,r5 ;clear shift.hi
xor r6,r6,r6 ;clear shift.lo
b sdiff ;skip first round of shifting
align 6 ;align loop to 64-byte boundary
sshift:
# Shift the 128-bit {shift.hi, shift.lo, dividend.hi, dividend.lo}
# chain left one bit per iteration.
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
sdiff:
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
blt sloop ;loop if diff < 0
or r6,r10,r10 ;shift.lo = diff.lo
or r5,r9,r9 ;shift.hi = diff.hi
ori r3,r3,1 ;set bit in quotient
sloop:
bdnz sshift
stw r6,0x0000(r12) ;store remainder in rp address
blr ;return
#
# unsigned int muldvd(a,b,c,rp)
# unsigned int a,b,c,*rp;
# returns q = int[(a*b+c)/base] and *rp = (a*b+c) mod base
# when called a -> (r3), b -> (r4), c -> (r5), rp -> (r6)
# upon return q -> (r3), *rp -> [(r6)]
# registers used: r3 thru r8
#
# .muldvd: q = high word of a*b+c (i.e. division by base 2^32 is just
# taking the high word); the low word is stored at *rp.  A single
# mulhwu/mullw pair with carry propagation - the most time-critical
# of the three routines when full-width base is used.
csect .muldvd[PR]
function .muldvd[PR]
mulhwu r7,r3,r4 ;(r7) <- a * b high word
mullw r8,r3,r4 ;(r8) <- a * b low word
addc r4,r8,r5 ;(r4) <- a * b + c
addze r3,r7 ;(r3) <- (r7) + XERca
stw r4,0x0000(r6) ;store remainder -> (r6)
blr ;return
*****************************************************************************/
/* Itanium code for Intel compiler, with mr_small a 64-bit long */
#include "miracl.h"
mr_small muldiv(a,b,c,m,rp)
mr_small a,b,c,m;
mr_small *rp;
{ /* Blakely-Sloan */
int i;
mr_small d,q=0,r=0;
/* NOTE: the remainder of this Itanium Blakely-Sloan muldiv was lost
   during extraction (it was replaced by web-viewer UI text: keyboard
   shortcut help).  The bit-by-bit division loop over d = a*b+c and
   the function's closing brace are missing and must be restored from
   the original MIRACL distribution. */