📄 adjlogs.c

📁 MPICH是MPI的重要研究,提供了一系列的接口函数,为并行计算的实现提供了编程环境.
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
int type;{int i;if (type == ALOG_EVENT_PAIR_A1) return 1;for (i=0; i<a1p; i++)     if (type == a1event[i]) return 1;return 0;}int is_a2_event( type )int type;{int i;if (type == ALOG_EVENT_PAIR_A2) return 1;for (i=0; i<a2p; i++)     if (type == a2event[i]) return 1;return 0;}int is_b1_event( type )int type;{int i;if (type == ALOG_EVENT_PAIR_B1) return 1;for (i=0; i<b1p; i++)     if (type == b1event[i]) return 1;return 0;}unsigned long GlobalTime( time, p, nsync )unsigned long time;int           p, nsync;{unsigned long gtime, stime1, stime2;unsigned long frac;unsigned long tdiff;unsigned long ScaleLong();/* Problem: since times are UNSIGNED, we have to be careful about how they   are adjusted.  time - synctime may not be positive.  We make sure that   all of the subexpressions are unsigned longs */if (time >= globaloffset[p]) {    tdiff = time - globaloffset[p];    frac  = ScaleLong( numer[p], denom[p], tdiff );    gtime = frac + globaloffset[0];    }else {    tdiff = globaloffset[p] - time;    frac  = ScaleLong( numer[p], denom[p], tdiff );    if (frac > globaloffset[0]) printf( "Oops!\n" );    gtime = globaloffset[0] - frac;    }return gtime;}/*    This routine takes offset events and solves for the offsets.  The    approach is:    Let the global time be given by (local_time - offset)*scale ,    with a different offset and scale on each processor.  Each processor    originates exactly one communication event (except processor 0),    generating an a1 and a2 event.  A corresponding number of b2 events    are generated, but note that one processor may have more than 1 b2    event (if using Dunnigan's synchronization, there will be np-1 b2 events    on processor 0, and none anywhere else).    These events are:   pi   a1 (send to nbr)                        (recv) a2   pj                     (recv) b1 (send back)    We base the analysis on the assumption that in the GLOBAL time    repreresentation, a2-a1 is twice the time to do a (send) and    a (recv).  This is equivalent to assuming that global((a1+a2)/2) ==    global(b1).  Then, with the unknowns the offsets (the scales    are assumed known from the syncevent calculation), the matrix is    1    -s0 s1       ....       -sj ... si    where si is the scale for the i'th processor (note s0 = 1).    The right hand sides are (1/2)(a1(i)+a2(i)) *s(i) - b1(j)*s(j).    Because of the triangular nature of the matrix, this reduces to       o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j))    Note that if s(i)==s(j) and b1 == (a1+a2)/2, this gives o(i)==o(j).     */void ComputeOffsets( np )int np;{int i, j;unsigned long d1, delta;unsigned long ScaleLong();/* If there aren't enough events, return */if (noffsetevents != np - 1) {    if (noffsetevents != 0) 	fprintf( stderr, 	   "Incorrect number of offset events to compute clock offsets\n" );    else	fprintf( stderr, "No clock offset events\n" );    return;    }/* Take globaloffset[0] from sync */for (i=1; i<np; i++) {    /* o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j)) */    j  = offsetevents[i].p1;        /* Compute a1(i)+a2(i)/2.  Do this by adding half the difference;       this insures that we avoid overflow */    d1 = (offsetevents[i].a2 - offsetevents[i].a1)/2;    d1 = offsetevents[i].a1 + d1;    /* We form (b1-o(j))(s(j)/s(i)) by noting that       s(j)/s(i) == denom(i)/denom(j) (since numer(i)==numer(j)) */    delta = ScaleLong( denom[i], denom[j],                       offsetevents[i].b1 - globaloffset[j] );    globaloffset[i] = d1 - delta;    }}#include <mp.h>static MINT *prod, *qq, *rr;static int  mpallocated = 0;unsigned long ScaleLong( n, d, v )unsigned long n, d, v;{char buf[40];char *s;MINT *nn, *dd, *vv;unsigned long q, r;if (!mpallocated) {    prod = itom(0);    if (!prod) {	fprintf( stderr, "Could not allocate mp int\n" );	exit(0);	}    qq   = itom(0);    if (!qq) {	fprintf( stderr, "Could not allocate mp int\n" );	exit(0);	}    rr   = itom(0);    if (!rr) {	fprintf( stderr, "Could not allocate mp int\n" );	exit(0);	}    mpallocated = 1;    }sprintf( buf, "%x", n );nn = xtom(buf);if (!nn) {    fprintf( stderr, "Could not allocate mp int\n" );    exit(0);    }sprintf( buf, "%x", v );vv = xtom(buf);if (!vv) {    fprintf( stderr, "Could not allocate mp int\n" );    exit(0);    }sprintf( buf, "%x", d );dd = xtom(buf);if (!dd) {    fprintf( stderr, "Could not allocate mp int\n" );    exit(0);    }mult(nn,vv,prod);mdiv(prod,dd,qq,rr);s = mtox(qq);sscanf( s, "%x", &q );free( s );s = mtox(rr);sscanf( s, "%x", &r );free( s );/* Free the locals */mfree( nn );mfree( dd );mfree( vv );return q;}/* Here is not-quite working code for multiple precision arithmetic */#ifdef DO_MULTIPLE_ARITH/*    This routine takes a value v and scales it by (n/d).  This    routine handles integer overflow by using the following formulas:   Let h(u) = high 16 bits of u, and l(u) = low 16 bits of u.   Then v *(n/d) =    (l(v)+h(v))*(l(u)+h(u))/d    = l(v)l(n)/d + (l(n)h(v)+l(v)h(n))/d + h(v)h(n)/d      == a1/d + a2/d + a3/d   In order to keep the values in-range, we define low(u)=l(u) and   high(u) = h(u) >> 16.  Then this formula becomes (with high substituted   for h):   a1/d + (a2<<16)/d + (a3<<32)/d   Now, when doing the integer division, we need to propagate the remainders.   Let the result be r.  Then   rd = a1 + (a2<<16) + (a3<<32)   if a1 = k1 d + b1, (a2 << 16) = (k2 d + b2), and (a3 << 32) = (k3 d + b3),   then      r d = (k1 d + b1) + (k2 d + b2) + (k3 d + b3);       = (k1 + k2 + k3) d + (b1 + b2 + b3)          and so   r   = (k1 + k2 + k3) + (b1 + b2 + b3) / d   To compute (k2,b2) and (k3,b3), we do:   (a2<<16)/d:   a2 = p2 d + c2   a2 << 16 = p2 d << 16 + c2 << 16            = (p2 << 16) d + c2 << 16   Let c2 << 16 = r2 d + s2, then (finally!)   a2 << 16 = (p2 << 16 + r2) d + s2   (a3 << 32)/d:   a3 = p3 d + c3   a3 << 32 = p3 d << 32 + c3 << 32            = (p3 << 32) d + c3 << 32   Computing c3 << 32 = r3 d + s3 is a challange, particularly   since we need only the low 32 bits (the high 32 bits will be 0)   We do this in stages as well:   c3 << 32 = (c3 << 16) << 16;    (c3 << 16) = t3 d + u3   (c3 << 32) = (t3 << 16)d + u3 << 16              = (t3 << 16 + y3)d + z3,	      == r3 d + s3   where u3 << 16 = y3 d + z3   Then   a3 << 32 = (p3 << 32 + r3) d + s3 */void DivLong();/*    ScaleDecomp - convert (a << p) = alpha d + beta, with beta < d   This works by recursively:   a = b d + r,   a<<p = (b << p)d + (r<<p)   then process r<<p to bd + r' etc until b == 0 */void ScaleDecomp( a, p, d, alpha, beta )int           p;unsigned long a, d, *alpha, *beta;{unsigned long b, r;unsigned long Alpha, Beta;int      p1;Alpha = 0; Beta = 0;b     = a / d;r     = a % d;Alpha = b << p;/* We need to gingerly deal with r, since shifting it by much   may make it too large, particularly if d is nearly 32 bits.     What we need is r << p = gamma d + delta, with r < d.  This   is really the hard part. We can not assume that d is much   smaller than 32 bits, so this is tricky. */DivLong( r, d, (unsigned long)(1 << p), &b, &r );Alpha += b;*beta = r;#ifdef FOOwhile (p > 1 && r > 0) {    p1    = p/2;    r     = (r << p1);    b     = r / d;    r     = r % d;    Alpha += b << (p-p1);    p      = (p - p1);    }*alpha = Alpha;*beta  = r << p;#endif}#define LOWBITS(a) (unsigned long)((a)&lowmask)#define HIGHBITS(a) (unsigned long)( ((a) >> 16 ) & lowmask )#include <mp.h>unsigned long ScaleLong( n, d, v )unsigned long n, d, v;{#ifdef FOO#define LOWBITS(a) (unsigned long)((a)&lowmask)#define HIGHBITS(a) (unsigned long)( ((a) >> 16 ) & lowmask )unsigned long a1, a21, a22, a3, k1, k21, k22, k3, b1, b21, b22, b3;DivLong( n, d, v, &k1, &b1 );return k1 + b1/d;a1  = LOWBITS(v)*LOWBITS(n);a21 = LOWBITS(v)*HIGHBITS(n);a22 = LOWBITS(n)*HIGHBITS(v);a3  = HIGHBITS(v)*HIGHBITS(n);k1 = a1 / d;b1 = a1 % d;ScaleDecomp( a21, 16, d, &k21, &b21 );ScaleDecomp( a22, 16, d, &k22, &b22 );ScaleDecomp( a3,  32, d, &k3,  &b3 );return (k1 + k21 + k22 + k3) + (b1 + b21 + b22 + b3) / d;#elsechar buf[40];MINT *nn, *dd, *vv, *prod, *qq, *rr;unsigned long q, r;sprintf( buf, "%x", n );nn = xtom(buf);sprintf( buf, "%x", v );vv = xtom(buf);sprintf( buf, "%x", d );dd = xtom(buf);prod = itom(0);qq   = itom(0);rr   = itom(0);mult(nn,vv,prod);mdiv(prod,dd,qq,rr);sscanf( mtox(qq), "%x", &q );sscanf( mtox(rr), "%x", &r );return q;#endif}/*  Represent nv = alpha d + beta */void DivLong( n, d, v, alpha, beta )unsigned long n, d, v;unsigned long *alpha, *beta;{unsigned long a1, a21, a22, a3, k1, k21, k22, k3, b1, b21, b22, b3;a1  = LOWBITS(v)*LOWBITS(n);a21 = LOWBITS(v)*HIGHBITS(n);a22 = LOWBITS(n)*HIGHBITS(v);a3  = HIGHBITS(v)*HIGHBITS(n);k1 = a1 / d;b1 = a1 % d;ScaleDecomp( a21, 16, d, &k21, &b21 );ScaleDecomp( a22, 16, d, &k22, &b22 );ScaleDecomp( a3,  32, d, &k3,  &b3 );*alpha = k1 + k21 + k22 + k3;*beta  = b1 + b21 + b22 + b3;}#endif
上一页 12
💿 文件大小 15603 K
👤 上传用户 yufei66900
📂 所属分类并行计算
🏷️ 相关标签

#MPICH #MPI #接口函数 #并行计算
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -