bench.cpp

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C++ 代码 · 共 557 行 · 第 1/2 页

CPP
557
字号
#ifndef __WATCOM_INT64__
#include <stdio.h>

int main() {
    FILE *fp = fopen( "bench.chk", "r" );
    int c;
    for(;;) {
        c = fgetc( fp );
        if( c == EOF ) break;
        putchar( c );
    }
    fclose( fp );
    return 0;
}
#else
/*
                This program can be used to check how well
        gcc optimizes some 64-bit arithmetic constructs.

                Procedure NR_MUL does modular multiplication of
        multiple precision operands, as used (for example)
        by the RSA cryptosystem.
        In my integer factorization program, 50-90% of the
        time is spent here.  An argument to the procedure
        specifies one of three algorithms for arithmetic:

                MULT_DBLEINT -  Use long long data type.
                                This algorithm takes the
                                64-bit product of 32-bit
                                arguments, and manipulates
                                such products.
                                It needs addition, multiplication,
                                shifting, masking.

                MULT_HALFINT -  To multiply single-precision
                                arguments, each is broken
                                into half-sized pieces, which
                                are individually multiplied.

                MULT_DOUBLE_PRECISION - Floating point arithmetic
                                is used for intermediate calculations.

                This version of NR_MUL is a simplified version of the
        real one, which includes checks for squaring a number
        and whose choice of algorithm is made at compilation
        time rather than execution time.
        More importantly, the real one allows radix 2^30 for
        MULT_DBLEINT, 2^28 for MULT_HALFINT, but only 2^26
        for MULT_DOUBLE_PRECISION (assuming 53-bit
        double precision floating mantissa, as in IEEE).
        So a 100-digit (= 332-bit) number will need
        length 12 for MULT_DBLEINT, 13 for MULT_HALFINT,
        but 14 for MULT_DOUBLE_PRECISION.

                On a DEC 5000 (MIPS R3000 architecture), the last output
        line shows times of 11249, 937, and 703 microseconds
        when lng = 14, for MULT_DBLEINT, MULT_HALFINT,
        and MULT_DOUBLE_PRECISION, respectively.
        The pixstats output shows 59% of the time in bmul and __muldi3,
        18% in badd and __adddi3, 17% in NR_MUL, 4% in bzero, 2% in __lshrdi3

                Presently 20% of the time is being spent in badd,
        __adddi3, and __lshrdi3; inlining these routines
        (as you have done for gcc 2) will cause a 15%-25% speed-up.
        When bmul, bzero, and __muldi3 are similarly inlined,
        the overall speed-up should be 2-fold to 3-fold,
        on machines with hardware integer multiply,
        and the MULT_DBLEINT time will hopefully
        be comparable with the others, if not better.

                Thank you for your consideration.

                        Peter L. Montgomery
                        Department of Mathematics
                        University of California
                        Los Angeles, CA 90024-1555 USA
                        pmontgom@math.ucla.edu

*/

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
        Typedefs for 32-bit and 64-bit integers.
        A leading "s" identifies signed data;
        a trailing "c" identifies const data.
*/
#define LNGMAX 15

typedef unsigned long       mp_digit_t;
typedef   signed long      smp_digit_t;
typedef unsigned __int64    mp_digit2_t;
typedef   signed int        mp_lng_t;      /* For subscripts, lengths, etc. */

typedef const  mp_digit_t   mp_digit_tc;
typedef const  mp_digit2_t  mp_digit2_tc;
typedef const  mp_lng_t     mp_lng_tc;

#define CONSTP(type) type*const
                        /* Constant pointer to (possibly non-const) data */

#define LG2MPB  26
#define MP_BASE (1L << LG2MPB)
                /* Radix for multiple precision arithmetic */
                /* This assumes double precision floating is */
                /* at least 2*26 + 1 = 53 bits wide. */
#define MP_MASK (MP_BASE - 1)
#define LG2MPB_CHUNK (LG2MPB/2)
#if LG2MPB != 2*LG2MPB_CHUNK
    #error "LG2MPB must be even"
#endif

static const double MP_BASE_D = (double)MP_BASE;
static const double MP_BASE_DINV = 1.0/(double)MP_BASE;


#define MULT_DBLEINT 0
                /* Use long long data type for intermediate results */
#define MULT_HALFINT 1
                /* Partition operands into two half-size integers */
#define MULT_DOUBLE_PRECISION 2
                /* Use double precision floating point arithmetic */

#define MAX_MULT_ALG 2
static char *desc[1+MAX_MULT_ALG] = {"Long long", "Half ints", "Floating"};

void NR_MUL(CONSTP(mp_digit_tc) ar1,
            CONSTP(mp_digit_t ) ar2,
            CONSTP(mp_digit_tc) ar3,            /* Modulus */
            CONSTP(mp_digit_t ) ar4,            /* Product */
                   mp_digit_tc  inv3,
                   mp_lng_tc    lng,
                        int     algorithm)
{
/*
C               ar4 = (ar1*ar2)/MP_BASE**lng mod ar3.
C               We know inv3*ar3(0) == -1 mod MP_BASE.

C               See "Modular Multiplication Without Trial Division", by
C               Peter Montgomery, Mathematics of Computation, April, 1985.


C       General algorithm:
C
C           tmparr(0:lng-1) = 0
C           do j = 0, lng-1     ! j = 0 iteration is coded separately
C               mul2 = ar1(j)
C               quad = mul2*ar2(0) + tmparr(0)
C               mul3 = IAND(quad * inv3, MP_MASK)
C               carry = (quad + mul3*ar3(0))/MP_BASE   (exact)
C               do i = 1, lng-1
C
C                               0 <= mul2       <= MP_BASE-1
C                               0 <= mul3       <= MP_BASE-1
C                               0 <= ar2(i)     <= MP_BASE-1
C                               0 <= ar3(i)     <= MP_BASE-1
C                               0 <= tmparr(i)  <= MP_BASE-1 if i < lng-1
C                               0 <= carry      <= 2*(MP_BASE-1)
C
C                               0 <= quad       <= 2*MP_BASE**2-MP_BASE-1
C                                                            if i < lng-1
C                               0 <= tmparr(lng-1)<= 2*MP_BASE-1
C                               0 <= quad        <= 2*MP_BASE**2-1
C                                                            if i = lng-1
C                   quad = mul2*ar2(i) + mul3*ar3(i) + tmparr(i) + carry
C                   tmparr(i-1) = IAND(quad, MP_MASK)
C                   carry = RSHIFT(quad, LG2MPB)
C               end do
C               tmparr(lng-1) = carry
C Note tmparr(lng-1) may be as large as 2*MP_BASE-1.
C           end do
C
C           if (tmparr >= ar3) then
C               ar4 = tmparr - ar3      (multiple precision subtract)
C           else
C               ar4 = tmparr
C           end if
C
C - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
                   mp_digit_t   tmparr[LNGMAX]; /* variable length */
            CONSTP(mp_digit_tc) par1    = ar1;
   register CONSTP(mp_digit_tc) par2    = ar2;
   register CONSTP(mp_digit_tc) par3    = ar3;
   register          mp_lng_t   i;
                     mp_lng_t   j;

    switch(algorithm) {
default: fprintf(stderr, "Bad algorithm value - %d\n", algorithm);
         exit(1);
         break;
case MULT_DBLEINT:      /* Use long long type */
#define NN_DPRODUU(nnm1, nnm2) (((mp_digit2_t)(nnm1))*((mp_digit2_t)(nnm2)))
    {
        register mp_digit_t  mul2, mul3;
        register mp_digit2_t quad, carry;

        mul2 = par1[0];
        quad = NN_DPRODUU(mul2, par2[0]);
        mul3 = (mp_digit_t)(quad & MP_MASK);
        mul3 = (mp_digit_t)(NN_DPRODUU(mul3, inv3) & MP_MASK);
        carry = (quad + NN_DPRODUU(mul3, par3[0])) >> LG2MPB;
        if (lng == 1) {
            tmparr[0] = (mp_digit_t)carry;
        } else {
            i = 1;
            do {
                quad =   NN_DPRODUU(mul2, par2[i])
                       + NN_DPRODUU(mul3, par3[i]) + carry;
                tmparr[i-1] = quad & MP_MASK;
                carry = quad >> LG2MPB;
            } while (++i < lng);
            tmparr[lng-1] = (mp_digit_t)carry;

            j = 1;
            do {
                mul2 = par1[j];
                quad = NN_DPRODUU(mul2, par2[0]) + (mp_digit2_t)tmparr[0];
                mul3 = (mp_digit_t)(quad & MP_MASK);
                mul3 = (mp_digit_t)(NN_DPRODUU(mul3, inv3) & MP_MASK);
                carry = (quad + NN_DPRODUU(mul3, par3[0]) >> LG2MPB);
                i = 1;
                do {
                    quad = NN_DPRODUU(mul2, par2[i]) + NN_DPRODUU(mul3, par3[i])
                           + (mp_digit2_t)(tmparr[i]) + carry;
                    tmparr[i-1] = quad & MP_MASK;
                    carry = quad >> LG2MPB;
                } while (++i < lng);
                tmparr[lng-1] = (mp_digit_t)carry;
            } while (++j < lng);
        } /* lng > 1 */
    }
    break;      /* MULT_DBLEINT */
#undef NN_DPRODUU

case MULT_HALFINT:              /* Partition everything into half-integers */

#define NN_LCHUNKU(nnm1) ((nnm1) & ((1L << LG2MPB_CHUNK)-1))
                /* Lower half of 32-bit integer */
#define NN_HCHUNKU(nnm1) ((nnm1) >> (LG2MPB_CHUNK))
                /* Upper half of 32-bit integer */
    {
        mp_digit_tc inv3h  = NN_HCHUNKU(inv3);
        mp_digit_tc inv3l  = NN_LCHUNKU(inv3);
        mp_digit_tc ar20h  = NN_HCHUNKU(par2[0]);
        mp_digit_tc ar20l  = NN_LCHUNKU(par2[0]);
        mp_digit_tc ar30h  = NN_HCHUNKU(par3[0]);
        mp_digit_tc ar30l  = NN_LCHUNKU(par3[0]);

                 mp_digit_t mul2h  = NN_HCHUNKU(par1[0]);
                 mp_digit_t mul2l  = NN_LCHUNKU(par1[0]);
        register mp_digit_t quadh  = mul2h*ar20h;
        register mp_digit_t quadm  = mul2h*ar20l + mul2l*ar20h;
        register mp_digit_t quadl  = mul2l*ar20l;
                 mp_digit_t quadll = NN_LCHUNKU(quadl);
                 mp_digit_t quadlh = NN_LCHUNKU(quadm + NN_HCHUNKU(quadl));
                 mp_digit_t mul3l  = quadll*inv3l;
                 mp_digit_t mul3h  = quadll*inv3h + quadlh*inv3l;
        register mp_digit_t carry;

        mul3h  = NN_LCHUNKU(mul3h + NN_HCHUNKU(mul3l));
        mul3l  = NN_LCHUNKU(mul3l);
        quadh += mul3h*ar30h;
        quadm += mul3h*ar30l + mul3l*ar30h;
        quadl += mul3l*ar30l;
        quadm += NN_HCHUNKU(quadl);
        carry  = quadh + NN_HCHUNKU(quadm);

        if (lng == 1) {
            tmparr[0] = carry;
        } else {
            i = 1;
            do {
                mp_digit_tc ar2h  = NN_HCHUNKU(ar2[i]);
                mp_digit_tc ar2l  = NN_LCHUNKU(ar2[i]);
                mp_digit_tc ar3h  = NN_HCHUNKU(ar3[i]);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?