nb_kernel030_ia64_single.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 970 行 · 第 1/2 页
S
970 行
/*
 * $Id: nb_kernel030_ia64_single.S,v 1.2 2005/01/25 12:11:51 lindahl Exp $
 *
 * Gromacs 4.0                         Copyright (c) 1991-2003
 * David van der Spoel, Erik Lindahl, University of Groningen.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */

/*
 * File layout notes (added in review):
 *   - This file is run through the C preprocessor before assembly, so every
 *     #define / #ifdef / #include must sit on its own physical line and a
 *     `//` comment ends at the end of its line.
 *   - All names below are plain textual aliases for ia64 registers; many of
 *     them intentionally map to the same physical register (for example
 *     Tmp1, NN0, nriCount and spillPtr2 are all t0).  Whether two aliases
 *     are ever live at the same time can only be judged from the procedure
 *     body, not from this table.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/*
 * The ia64-assembly Gromacs inner loops would not have been
 * possible without a lot of support, tutoring and optimization
 * suggestions from John Worley at Hewlett-Packard.
 */

/* Each thread locks a counter and grabs a couple of neighborlists.
 * Available sizes for this chunk: 1,2,4,8, or 16
 */
#define THREAD_CHUNK_SIZE       8
#define JJNR_PREFETCH_DISTANCE  128

// ia64 General Register definitions:
#define zero    r0      /* permanent zero */
#define gp      r1      /* global data pointer */
#define at0     r2      /* temp, target of addi */
#define at1     r3      /* temp, target of addi */
#define S0      r4      /* callee saves register */
#define S1      r5      /* callee saves register */
#define S2      r6      /* callee saves register */
#define S3      r7      /* callee saves register */
#define v0      r8      /* 1st fixed point return value/ptr */
#define v1      r9      /* 2nd fixed return value/ptr */
#define v2      r10     /* 3rd fixed return value/ptr */
#define v3      r11     /* 4th fixed return value/ptr */
#define sp      r12     /* memory stack pointer */
#define tp      r13     /* thread pointer */
#define t0      r14     /* caller saves register */
#define t1      r15     /* caller saves register */
#define t2      r16     /* caller saves register */
#define t3      r17     /* caller saves register */
#define t4      r18     /* caller saves register */
#define t5      r19     /* caller saves register */
#define t6      r20     /* caller saves register */
#define t7      r21     /* caller saves register */
#define t8      r22     /* caller saves register */
#define t9      r23     /* caller saves register */
#define t10     r24     /* caller saves register */
#define t11     r25     /* caller saves register */
#define t12     r26     /* caller saves register */
#define t13     r27     /* caller saves register */
#define t14     r28     /* caller saves register */
#define t15     r29     /* caller saves register */
#define t16     r30     /* caller saves register */
#define t17     r31     /* caller saves register */

// ia64 Floating-point register definitions
#define fZero   f0      /* permanent floating point 0.0 */
#define fOne    f1      /* permanent floating point 1.0 */
#define fs0     f2      /* callee saves register */
#define fs1     f3      /* callee saves register */
#define fs2     f4      /* callee saves register */
#define fs3     f5      /* callee saves register */
#define ft0     f6      /* caller saves register */
#define ft1     f7      /* caller saves register */
#define fa0     f8      /* argument register 0 */
#define fa1     f9      /* argument register 1 */
#define fa2     f10     /* argument register 2 */
#define fa3     f11     /* argument register 3 */
#define fa4     f12     /* argument register 4 */
#define fa5     f13     /* argument register 5 */
#define fa6     f14     /* argument register 6 */
#define fa7     f15     /* argument register 7 */
#define fv0     f8      /* return value register 0 */
#define fv1     f9      /* return value register 1 */
#define fv2     f10     /* return value register 2 */
#define fv3     f11     /* return value register 3 */
#define fv4     f12     /* return value register 4 */
#define fv5     f13     /* return value register 5 */
#define fv6     f14     /* return value register 6 */
#define fv7     f15     /* return value register 7 */
#define fs4     f16     /* callee saves register */
#define fs5     f17     /* callee saves register */
#define fs6     f18     /* callee saves register */
#define fs7     f19     /* callee saves register */
#define fs8     f20     /* callee saves register */
#define fs9     f21     /* callee saves register */
#define fs10    f22     /* callee saves register */
#define fs11    f23     /* callee saves register */
#define fs12    f24     /* callee saves register */
#define fs13    f25     /* callee saves register */
#define fs14    f26     /* callee saves register */
#define fs15    f27     /* callee saves register */
#define fs16    f28     /* callee saves register */
#define fs17    f29     /* callee saves register */
#define fs18    f30     /* callee saves register */
#define fs19    f31     /* callee saves register */

// ia64 predicate register definitions
#define pone    p0      /* permanent one predicate */
#define pTrue   p0      /* permanent one predicate */
#define ps0     p1      /* callee saves predicate */
#define ps1     p2      /* callee saves predicate */
#define ps2     p3      /* callee saves predicate */
#define ps3     p4      /* callee saves predicate */
#define ps4     p5      /* callee saves predicate */
#define pt0     p6      /* caller saves predicate */
#define pt1     p7      /* caller saves predicate */
#define pt2     p8      /* caller saves predicate */
#define pt3     p9      /* caller saves predicate */
#define pt4     p10     /* caller saves predicate */
#define pt5     p11     /* caller saves predicate */
#define pt6     p12     /* caller saves predicate */
#define pt7     p13     /* caller saves predicate */
#define pt8     p14     /* caller saves predicate */
#define pt9     p15     /* caller saves predicate */

// ia64 branch register definitions
#define rb      b0      /* return link */
#define bs0     b1      /* callee saves branch register */
#define bs1     b2      /* callee saves branch register */
#define bs2     b3      /* callee saves branch register */
#define bs3     b4      /* callee saves branch register */
#define bs4     b5      /* callee saves branch register */
#define bt0     b6      /* caller saves branch register */
#define bt1     b7      /* caller saves branch register */

        .text

/*
 * Symbolic names used by the kernel, kept in their original order.
 * inN / locN are the stacked input and local registers of the frame
 * declared by .regstk below; names of the form X[n] refer to the
 * rotating-register arrays declared by .rotr / .rotf / .rotp below.
 */
#define CHARGE          t10
#define FACTION         t9
#define FActII          loc8
#define FActIX          fs1
#define FActIY          fs2
#define FActIZ          fs3
#define FIX             fs4
#define FIY             fs5
#define FIZ             fs6
#define FSHIFT          t6
#define FShiftIS        loc9
#define FShiftX         fa5
#define FShiftY         fa6
#define FShiftZ         fa7
#define ICharge         fs5
#define InnerCnt        t17
#define II              t13
#define II3             in7
#define IQ              fs5
#define IS              t12
#define IS3             in6
#define IX              fa2
#define IY              fa3
#define IZ              fa4
/* In_* name the eight stacked input registers in0-in7. */
#define In_FSHIFT       in6
#define In_GID          in7
#define In_IINR         in1
#define In_JINDEX       in2
#define In_JJNR         in3
#define In_NRI          in0
#define In_SHIFT        in4
#define In_SHIFTVEC     in5
#define NRI             loc12
#define IINR            loc13
#define JINDEX          loc14
#define JJNR            loc15
#define SHIFT           loc16
#define GID             loc17
#define COUNT           loc18
#define JX              DX[0]
#define JY              DY[0]
#define JZ              DZ[0]
#define LCSave          at0
#define NJ0             t14
#define NJ1             t15
#define POSITION        t8
#define PRSave          at1
#define PosX            f88
#define PosY            f89
#define PosZ            f90
#define SHIFTVEC        t5
#define VC              t11
#define VCPtr           ggid
#define VNBTotal        fs0
#define Vvdw6           C6[2]
#define Vvdw12          C12[2]
/* NOTE(review): no RInv6 array is declared in .rotf below (only RInv[6]);
 * this alias would not assemble if it were ever expanded - confirm it is
 * unused in the procedure body. */
#define RInv12          RInv6[1]
#define argPtr          loc23
#define chargePtr       v2
#define Tmp1            t0
#define Tmp2            t17
#define Tmp3            loc11
#define Tmp4            t2
#define Tmp5            t3
#define fHALF           ft0
#define f3_8            ft1
#define f5_16           fa0
#define fSIX            fa1
#define fillP0          v0
#define fillP1          v1
#define NN0             t0
#define NN1             loc11
#define ggid            loc10
#define gidPtr          t7
#define iinrPtr         t1
#define jindexPtr       t2
#define jjnrPtr         t3
#define jnr             t16
#define jnr3            v0
#define nriCount        t0
#define pCont           pt0
#define pDone           pt1
#define pJJNR           pt2
#define pMore           pt3
#define pLast           pt4
#define posPtr          v3
#define Facel           fa4
#define Tabscale        fa1
#define shX             fa2
#define shY             fa3
#define shZ             fa4
#define shiftPtr        t4
#define shiftVPtr       v1
#define spillPtr        v0      /* used to be t7 */
#define spillPtr2       t0
#define xPFS            at0
#define TYPE            loc19
#define NTYPE           loc20
#define typePtr         loc21
#define NBFP            loc22
#define NTI             loc24
#define VNBPtr          loc25
#define VFTab           loc26
#define Nouter          loc27
#define Ninner          loc28
#define OuterIter       loc29
#define InnerIter       loc30
#define VNB             loc31
/* NOTE(review): no nA array is declared in .rotr/.rotf below, and n0 is
 * declared with only two elements (n0[2]), so eps1/eps2 index past its
 * declared size.  Presumably these four aliases are leftovers that are
 * never expanded - confirm against the procedure body. */
#define nnnA            nA[0]
#define R               RT[0]
#define eps0            n0[1]
#define eps1            n0[2]
#define eps2            n0[3]

/*
 * Register stack frame: 8 inputs, 32 locals, 0 outputs, 16 rotating.
 * The _N* macros and the literal operands of .regstk carry the same four
 * numbers and must be kept in sync by hand.
 */
#define _NINPUTS        8
#define _NLOCALS        32
#define _NOUTPUT        0
#define _NROTATE        16
        .regstk 8, 32, 0, 16

/* Rotating general, floating-point and predicate register arrays. */
        .rotr   FActPtr[7], TypeJ[4], nnn[2]
        .rotf   DX[7], DY[7], DZ[7], FActX[3], FActY[3], FActZ[3], RSqr[3], RInv[6], RInvT[2], RInvU[2], RInvErr[2], FScalar[2], C6[5], C12[5], Disp_Y[3], Disp_F[3], Disp_G[3], Disp_H[2], Rep_Y[3], Rep_F[3], Rep_G[3], Rep_H[2], eps[2], RT[2], n0[2]
        .rotp   pPipe[7]

/* PIPE_DEPTH matches the length of the pPipe[] predicate array above. */
#define PIPE_DEPTH      7

/* EXP(n) adds 0xffff to n; presumably the biased form of exponent n as
 * expected by setf.exp - confirm against the architecture manual. */
#define EXP(n)          (0xffff + (n))

/* Byte offsets (relative to sp) used to locate further arguments in memory. */
#define POS_STK_OFFSET          0x10
#define FACTION_STK_OFFSET      0x18
#define CHARGE_STK_OFFSET       0x20
#define FACEL_STK_OFFSET        0x28
#define KRF_STK_OFFSET          0x30
/* NOTE(review): the directive below is split by a stray line break inherited
 * from the source wrapping; its macro name (CRF_STK_OFFSET) starts the next
 * physical line and the two lines must be rejoined before preprocessing. */
#define 
CRF_STK_OFFSET 0x38#define VC_STK_OFFSET 0x40#define TYPE_STK_OFFSET 0x48#define NTYPE_STK_OFFSET 0x50#define NBFP_STK_OFFSET 0x58#define VNB_STK_OFFSET 0x60#define TABSCALE_STK_OFFSET 0x68#define VFTAB_STK_OFFSET 0x70#define INVSQRTA_STK_OFFSET 0x78#define DVDA_STK_OFFSET 0x80#define GBTABSCALE_STK_OFFSET 0x88#define GBTAB_STK_OFFSET 0x90#define NTHREADS_STK_OFFSET 0x98#define COUNT_STK_OFFSET 0xA0#define MTX_STK_OFFSET 0xA8#define OUTERITER_STK_OFFSET 0xB0#define INNERITER_STK_OFFSET 0xB8#define WORK_STK_OFFSET 0xC0// The mutex call parameter isnt used at all in assembly... .global nb_kernel030_ia64_single .proc nb_kernel030_ia64_single .align 32nb_kernel030_ia64_single:// INIT 1 { .mmi alloc xPFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE mov spillPtr = sp mov Tmp1 = EXP(-1) } { .mfi nop 0x0 nop 0x0 add argPtr = TABSCALE_STK_OFFSET, sp } ;;// INIT 2 { .mlx ld8 loc28 = [argPtr], COUNT_STK_OFFSET - TABSCALE_STK_OFFSET movl Tmp3 = 0x3ec00000 } { .mii stf.spill [spillPtr] = fs0, -16 nop 0x0 add sp = -6 * 16, sp } ;;// INIT 3 { .mlx stf.spill [spillPtr] = fs1, -16 movl Tmp4 = 0x3ea00000 } { .mmi setf.exp fHALF = Tmp1 ld8 COUNT = [argPtr], POS_STK_OFFSET - COUNT_STK_OFFSET mov PRSave = pr } ;;// INIT 4 { .mfi ld8 POSITION = [argPtr], FACTION_STK_OFFSET - POS_STK_OFFSET nop 0x0 nop 0x0 } { .mmi nop 0x0 stf.spill [spillPtr] = fs2, -16 mov pr.rot = 0x0 } ;;// INIT 5 { .mmi ld8 FACTION = [argPtr], NTYPE_STK_OFFSET - FACTION_STK_OFFSET setf.s f3_8 = Tmp3 mov SHIFTVEC = In_SHIFTVEC } { .mmi stf.spill [spillPtr] = fs3, -16 ld4 NRI = [In_NRI] mov Nouter = 0 } ;;// INIT 6 { .mmi ld8 NTYPE = [argPtr], TYPE_STK_OFFSET - NTYPE_STK_OFFSET setf.s f5_16 = Tmp4 mov FSHIFT = In_FSHIFT } { .mfi stf.spill [spillPtr] = fs4, -16 nop 0x0 mov GID = In_GID } ;;// INIT 7 { .mmf ld8 TYPE = [argPtr], NBFP_STK_OFFSET - TYPE_STK_OFFSET mov SHIFT = In_SHIFT fnorm Facel = Facel } { .mmi stf.spill [spillPtr] = fs5, -16 ldfs Tabscale = [loc28] nop 0x0 } ;;// INIT 8 { .mmf ld8 NBFP = [argPtr], 
VFTAB_STK_OFFSET - NBFP_STK_OFFSET mov JJNR = In_JJNR fnorm f3_8 = f3_8 } { .mii stf.spill [spillPtr] = fs6, -16 mov IINR = In_IINR mov JINDEX = In_JINDEX } ;;// INIT 9 { .mmf ld8 VFTab = [argPtr], VNB_STK_OFFSET - VFTAB_STK_OFFSET stf.spill [spillPtr] = fs7, -16 fnorm f5_16 = f5_16 } ;; // INIT 10 { .mfi ld8 VNB = [argPtr], OUTERITER_STK_OFFSET - VNB_STK_OFFSET fnorm fHALF = fHALF mov LCSave = ar.lc } ;;// INIT 11 { .mmi ld8 OuterIter = [argPtr], INNERITER_STK_OFFSET - OUTERITER_STK_OFFSET ld4 NTYPE = [NTYPE] mov Ninner = 0 } ;;// INIT 12 { .mfi ld8 InnerIter = [argPtr], WORK_STK_OFFSET - INNERITER_STK_OFFSET nop 0x0 nop 0x0 } ;;// 20 bundles used for init - still aligned. threadLoop:// THREAD PROLOGUE 1 { .mfi fetchadd4.rel NN0 = [COUNT], THREAD_CHUNK_SIZE nop 0x0 nop 0x0 } { .mfi setf.sig f33 = NTYPE nop 0x0 nop 0x0 } ;; // THREAD PROLOGUE 2 - at least 12 cycle latency hole before this bundle (fetchadd4) { .mmi cmp.lt pCont, pDone = NN0, NRI shladd gidPtr = NN0, 2, GID adds NN1 = THREAD_CHUNK_SIZE, NN0 } { .mmi shladd jindexPtr = NN0, 2, JINDEX shladd shiftPtr = NN0, 2, SHIFT shladd iinrPtr = NN0, 2, IINR } ;; // THREAD PROLOGUE 3 { .mmi (pCont) ld4 II = [iinrPtr], 4 (pCont) ld4 IS = [shiftPtr], 4 cmp.ge pLast, pMore = NN1, NRI } { .mib (pCont) ld4 NJ0 = [jindexPtr], 4 (pCont) adds Tmp2 = 1, NN0 (pDone) br.cond.spnt.few finish } ;; // THREAD PROLOGUE 4 { .mmi ld4 ggid = [gidPtr], 4 shladd II3 = II, 1, II shladd IS3 = IS, 1, IS } { .mmi ld4 NJ1 = [jindexPtr], 4 nop 0x0 shladd jjnrPtr = NJ0, 2, JJNR } ;;// THREAD PROLOGUE 5 { .mmi cmp.lt pCont, pDone = Tmp2, NRI shladd FShiftIS = IS3, 2, FSHIFT shladd typePtr = II, 2, TYPE } { .mmi
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?