bench.cpp
来自「开放源码的编译器open watcom 1.6.0版的源代码」· C++ 代码 · 共 557 行 · 第 1/2 页
CPP
557 行
#ifndef __WATCOM_INT64__
#include <stdio.h>
int main() {
FILE *fp = fopen( "bench.chk", "r" );
int c;
for(;;) {
c = fgetc( fp );
if( c == EOF ) break;
putchar( c );
}
fclose( fp );
return 0;
}
#else
/*
This program can be used to check how well
gcc optimizes some 64-bit arithmetic constructs.
Procedure NR_MUL does modular multiplication of
multiple precision operands, as used (for example)
by the RSA cryptosystem.
In my integer factorization program, 50-90% of the
time is spent here. An argument to the procedure
specifies one of three algorithms for arithmetic:
MULT_DBLEINT - Use long long data type.
This algorithm takes the
64-bit product of 32-bit
arguments, and manipulates
such products.
It needs addition, multiplication,
shifting, masking.
MULT_HALFINT - To multiply single-precision
arguments, each is broken
into half-sized pieces, which
are individually multiplied.
MULT_DOUBLE_PRECISION - Floating point arithmetic
is used for intermediate calculations.
This version of NR_MUL is a simplified version of the
real one, which includes checks for squaring a number
and whose choice of algorithm is made at compilation
time rather than execution time.
More importantly, the real one allows radix 2^30 for
MULT_DBLEINT, 2^28 for MULT_HALFINT, but only 2^26
for MULT_DOUBLE_PRECISION (assuming 53-bit
double precision floating mantissa, as in IEEE).
So a 100-digit (= 332-bit) number will need
length 12 for MULT_DBLEINT, 13 for MULT_HALFINT,
but 14 for MULT_DOUBLE_PRECISION.
On a DEC 5000 (MIPS R3000 architecture), the last output
line shows times of 11249, 937, and 703 microseconds
when lng = 14, for MULT_DBLEINT, MULT_HALFINT,
and MULT_DOUBLE_PRECISION, respectively.
The pixstats output shows 59% of the time in bmul and __muldi3,
18% in badd and __adddi3, 17% in NR_MUL, 4% in bzero, 2% in __lshrdi3
Presently 20% of the time is being spent in badd,
__adddi3, and __lshrdi3; inlining these routines
(as you have done for gcc 2) will cause a 15%-25% speed-up.
When bmul, bzero, and __muldi3 are similarly inlined,
the overall speed-up should be 2-fold to 3-fold,
on machines with hardware integer multiply,
and the MULT_DBLEINT time will hopefully
be comparable with the others, if not better.
Thank you for your consideration.
Peter L. Montgomery
Department of Mathematics
University of California
Los Angeles, CA 90024-1555 USA
pmontgom@math.ucla.edu
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
Typedefs for 32-bit and 64-bit integers.
A leading "s" identifies signed data;
a trailing "c" identifies const data.
*/
#define LNGMAX 15
typedef unsigned long mp_digit_t;
typedef signed long smp_digit_t;
typedef unsigned __int64 mp_digit2_t;
typedef signed int mp_lng_t; /* For subscripts, lengths, etc. */
typedef const mp_digit_t mp_digit_tc;
typedef const mp_digit2_t mp_digit2_tc;
typedef const mp_lng_t mp_lng_tc;
#define CONSTP(type) type*const
/* Constant pointer to (possibly non-const) data */
#define LG2MPB 26
#define MP_BASE (1L << LG2MPB)
/* Radix for multiple precision arithmetic */
/* This assumes double precision floating is */
/* at least 2*26 + 1 = 53 bits wide. */
#define MP_MASK (MP_BASE - 1)
#define LG2MPB_CHUNK (LG2MPB/2)
#if LG2MPB != 2*LG2MPB_CHUNK
#error "LG2MPB must be even"
#endif
static const double MP_BASE_D = (double)MP_BASE;
static const double MP_BASE_DINV = 1.0/(double)MP_BASE;
#define MULT_DBLEINT 0
/* Use long long data type for intermediate results */
#define MULT_HALFINT 1
/* Partition operands into two half-size integers */
#define MULT_DOUBLE_PRECISION 2
/* Use double precision floating point arithmetic */
#define MAX_MULT_ALG 2
static char *desc[1+MAX_MULT_ALG] = {"Long long", "Half ints", "Floating"};
void NR_MUL(CONSTP(mp_digit_tc) ar1,
CONSTP(mp_digit_t ) ar2,
CONSTP(mp_digit_tc) ar3, /* Modulus */
CONSTP(mp_digit_t ) ar4, /* Product */
mp_digit_tc inv3,
mp_lng_tc lng,
int algorithm)
{
/*
C ar4 = (ar1*ar2)/MP_BASE**lng mod ar3.
C We know inv3*ar3(0) == -1 mod MP_BASE.
C See "Modular Multiplication Without Trial Division", by
C Peter Montgomery, Mathematics of Computation, April, 1985.
C General algorithm:
C
C tmparr(0:lng-1) = 0
C do j = 0, lng-1 ! j = 0 iteration is coded separately
C mul2 = ar1(j)
C quad = mul2*ar2(0) + tmparr(0)
C mul3 = IAND(quad * inv3, MP_MASK)
C carry = (quad + mul3*ar3(0))/MP_BASE (exact)
C do i = 1, lng-1
C
C 0 <= mul2 <= MP_BASE-1
C 0 <= mul3 <= MP_BASE-1
C 0 <= ar2(i) <= MP_BASE-1
C 0 <= ar3(i) <= MP_BASE-1
C 0 <= tmparr(i) <= MP_BASE-1 if i < lng-1
C 0 <= carry <= 2*(MP_BASE-1)
C
C 0 <= quad <= 2*MP_BASE**2-MP_BASE-1
C if i < lng-1
C 0 <= tmparr(lng-1)<= 2*MP_BASE-1
C 0 <= quad <= 2*MP_BASE**2-1
C if i = lng-1
C quad = mul2*ar2(i) + mul3*ar3(i) + tmparr(i) + carry
C tmparr(i-1) = IAND(quad, MP_MASK)
C carry = RSHIFT(quad, LG2MPB)
C end do
C tmparr(lng-1) = carry
C Note tmparr(lng-1) may be as large as 2*MP_BASE-1.
C end do
C
C if (tmparr >= ar3) then
C ar4 = tmparr - ar3 (multiple precision subtract)
C else
C ar4 = tmparr
C end if
C
C - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
mp_digit_t tmparr[LNGMAX]; /* variable length */
CONSTP(mp_digit_tc) par1 = ar1;
register CONSTP(mp_digit_tc) par2 = ar2;
register CONSTP(mp_digit_tc) par3 = ar3;
register mp_lng_t i;
mp_lng_t j;
switch(algorithm) {
default: fprintf(stderr, "Bad algorithm value - %d\n", algorithm);
exit(1);
break;
case MULT_DBLEINT: /* Use long long type */
#define NN_DPRODUU(nnm1, nnm2) (((mp_digit2_t)(nnm1))*((mp_digit2_t)(nnm2)))
{
register mp_digit_t mul2, mul3;
register mp_digit2_t quad, carry;
mul2 = par1[0];
quad = NN_DPRODUU(mul2, par2[0]);
mul3 = (mp_digit_t)(quad & MP_MASK);
mul3 = (mp_digit_t)(NN_DPRODUU(mul3, inv3) & MP_MASK);
carry = (quad + NN_DPRODUU(mul3, par3[0])) >> LG2MPB;
if (lng == 1) {
tmparr[0] = (mp_digit_t)carry;
} else {
i = 1;
do {
quad = NN_DPRODUU(mul2, par2[i])
+ NN_DPRODUU(mul3, par3[i]) + carry;
tmparr[i-1] = quad & MP_MASK;
carry = quad >> LG2MPB;
} while (++i < lng);
tmparr[lng-1] = (mp_digit_t)carry;
j = 1;
do {
mul2 = par1[j];
quad = NN_DPRODUU(mul2, par2[0]) + (mp_digit2_t)tmparr[0];
mul3 = (mp_digit_t)(quad & MP_MASK);
mul3 = (mp_digit_t)(NN_DPRODUU(mul3, inv3) & MP_MASK);
carry = (quad + NN_DPRODUU(mul3, par3[0]) >> LG2MPB);
i = 1;
do {
quad = NN_DPRODUU(mul2, par2[i]) + NN_DPRODUU(mul3, par3[i])
+ (mp_digit2_t)(tmparr[i]) + carry;
tmparr[i-1] = quad & MP_MASK;
carry = quad >> LG2MPB;
} while (++i < lng);
tmparr[lng-1] = (mp_digit_t)carry;
} while (++j < lng);
} /* lng > 1 */
}
break; /* MULT_DBLEINT */
#undef NN_DPRODUU
case MULT_HALFINT: /* Partition everything into half-integers */
#define NN_LCHUNKU(nnm1) ((nnm1) & ((1L << LG2MPB_CHUNK)-1))
/* Lower half of 32-bit integer */
#define NN_HCHUNKU(nnm1) ((nnm1) >> (LG2MPB_CHUNK))
/* Upper half of 32-bit integer */
{
mp_digit_tc inv3h = NN_HCHUNKU(inv3);
mp_digit_tc inv3l = NN_LCHUNKU(inv3);
mp_digit_tc ar20h = NN_HCHUNKU(par2[0]);
mp_digit_tc ar20l = NN_LCHUNKU(par2[0]);
mp_digit_tc ar30h = NN_HCHUNKU(par3[0]);
mp_digit_tc ar30l = NN_LCHUNKU(par3[0]);
mp_digit_t mul2h = NN_HCHUNKU(par1[0]);
mp_digit_t mul2l = NN_LCHUNKU(par1[0]);
register mp_digit_t quadh = mul2h*ar20h;
register mp_digit_t quadm = mul2h*ar20l + mul2l*ar20h;
register mp_digit_t quadl = mul2l*ar20l;
mp_digit_t quadll = NN_LCHUNKU(quadl);
mp_digit_t quadlh = NN_LCHUNKU(quadm + NN_HCHUNKU(quadl));
mp_digit_t mul3l = quadll*inv3l;
mp_digit_t mul3h = quadll*inv3h + quadlh*inv3l;
register mp_digit_t carry;
mul3h = NN_LCHUNKU(mul3h + NN_HCHUNKU(mul3l));
mul3l = NN_LCHUNKU(mul3l);
quadh += mul3h*ar30h;
quadm += mul3h*ar30l + mul3l*ar30h;
quadl += mul3l*ar30l;
quadm += NN_HCHUNKU(quadl);
carry = quadh + NN_HCHUNKU(quadm);
if (lng == 1) {
tmparr[0] = carry;
} else {
i = 1;
do {
mp_digit_tc ar2h = NN_HCHUNKU(ar2[i]);
mp_digit_tc ar2l = NN_LCHUNKU(ar2[i]);
mp_digit_tc ar3h = NN_HCHUNKU(ar3[i]);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?