nb_kernel410nf_ia64_double.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,012 行 · 第 1/2 页
S
1,012 行
{ .mmi ld4 ggid = [gidPtr], 4 shladd II3 = II, 1, II shladd IS3 = IS, 1, IS } { .mmi ld4 NJ1 = [jindexPtr], 4 shladd chargePtr = II, 3, CHARGE shladd jjnrPtr = NJ0, 2, JJNR } ;;// THREAD PROLOGUE 5 { .mmi cmp.lt pCont, pDone = Tmp2, NRI nop 0x0 shladd typePtr = II, 2, TYPE } { .mmi shladd posPtr = II3, 3, POSITION nop 0x0 shladd shiftVPtr = IS3, 3, SHIFTVEC } ;;// THREAD PROLOGUE 6 { .mfi nop 0x0 nop 0x0 shladd isaPtr = II, 3, INVSQRTA } ;;// THREAD PROLOGUE 7 { .mmi ld4 jnr = [jjnrPtr], 4 (pCont) ld4 IS = [shiftPtr], 4 nop 0x0 } { .mmi (pCont) ld4 II = [iinrPtr], 4 ld4 NTI = [typePtr] (pLast) mov NN1 = NRI } ;;// 12 bundles in thread prologue - still alignedouterLoop_nf: // At this point in the outer loop, the following values are ready // // FActII Pointer to FACTION XYZ for II // FShiftIS Pointer to FSHIFT XYZ for IS // shiftVPtr Pointer to current shift XYZ values // posPtr Pointer to current XYZ position // chargePtr Pointer to current atom charge // ggid Index for Vc array // jjnr Pointer to next neighbor index // jnr Current jnr value // NJ0, NJ1 Bounds of current neighbor list // // Load up all the floating-point values (yes, McKinley can do 4 FP loads // per cycle) and initialize the loop counters and predicates. Compute // the initial position <x, y, z> and charge. 
If this isn't the last time // through the loop, start loading the next value for NJ1 - we already // moved the previous NJ1 -> NJ0.// OUTER PROLOGUE 1 { .mfi nop 0x0 mov FIX = f0 add Nouter = 1, Nouter } { .mmf ldfd shX = [shiftVPtr], 8 ldfd PosX = [posPtr], 8 mov FIY = f0 } ;;// OUTER PROLOGUE 2 { .mmf setf.sig f32 = NTI ldfd shY = [shiftVPtr], 8 nop 0x0 } { .mfi ldfd PosY = [posPtr], 8 nop 0x0 nop 0x0 } ;; { .mmf ldfd shZ = [shiftVPtr] ldfd PosZ = [posPtr] mov FIZ = f0 } { .mmi nop 0x0 nop 0x0 shladd VNBPtr = ggid, 3, VNB } ;;// OUTER PROLOGUE 4 { .mmf nop 0x0 nop 0x0 xma.l f32 = f32, f33, fZero } { .mmi sub InnerCnt = NJ1, NJ0, 1 nop 0x0 shladd VCPtr = ggid, 3, VC } ;;// OUTER PROLOGUE 5 { .mmi nop 0x0 nop 0x0 mov NJ0 = NJ1 } ;;// OUTER PROLOGUE 6 { .mmf ldfd ICharge = [chargePtr], 8 ldfd VNBTotal = [VNBPtr] fadd IX = shX, PosX } ;;// OUTER PROLOGUE 7 { .mfi ldfd VCTotal = [VCPtr] fadd IY = shY, PosY add NN0 = 1, NN0 } { .mmi (pCont) ld4 NJ1 = [jindexPtr], 4 ldfd isaI = [isaPtr], 4 // This may seem strange, but we set the first stage of the // pipe to execute this way because setting pr.rot doesn't take // into account how much the predicates have rotated. If this is // the first time through, we cleared all the pipeline predicates // in the initialization. If not, flushing the pipeline set all // the pipeline predicates to 0 cmp.eq pPipe[0], p0 = zero, zero } ;;// OUTER PROLOGUE 8 { .mfi cmp.lt pCont, pDone = NN0, NN1 fadd IZ = shZ, PosZ mov ar.lc = InnerCnt } ;;// OUTER PROLOGUE 9 { .mfi getf.sig NTI = f32 fmpy IQ = ICharge, Facel mov ar.ec = PIPE_DEPTH } ;;// 14 bundles in outer loop - still aligned. // The inner loop is a 6-stage pipeline. 
The serial sequence of float ops // is folded into an 18-cycle loop (18 * 2 = 36 float slots, one empty), // then divided // into 6 stages.innerLoop_nf:// INNER LOOP 1 { .mfi (pPipe[2]) ldfd C6[0] = [TypeJ[2]], 8 (pPipe[1]) fsub DZ[1] = IZ, DZ[1] (pPipe[0]) shladd jnr3 = jnr, 1, jnr } // We march through jjnr[] sequentially, so it's usually a good idea // to preload the next value. However, we don't want to do this if // (1) we're in the epilogue or (2) this is the last time through and // there are no more atoms to inspect. Thus, we keep track of the loop // trip and use the logic below to see if we should load ahead .pred.rel "mutex", pCont, pDone { .mfi (pCont) cmp.ge pJJNR, p0 = InnerCnt, zero (pPipe[2]) fmpy RInvErr[0] = RSqr[1], RInv[1] (pDone) cmp.gt pJJNR, p0 = InnerCnt, zero } ;;// INNER LOOP 2 { .mfi (pPipe[2]) ldfd C12[0] = [TypeJ[2]] (pPipe[3]) fmpy RInvT[1] = fHALF, RInv[2] (pPipe[0]) shladd isaPtr = jnr, 3, INVSQRTA } { .mfi (pPipe[4]) getf.sig nnn = n0[1] (pPipe[4]) fcvt.xf n0[1] = n0[1] (pPipe[0]) shladd posPtr = jnr3, 3, POSITION } ;;// INNER LOOP 3 { .mfi (pPipe[0]) ldfd JX = [posPtr], 8 (pPipe[1]) fmpy RSqr[0] = DX[1], DX[1] (pPipe[0]) shladd TypeJ[0] = jnr, 2, TYPE } { .mfi nop 0x0 (pPipe[3]) fnma RInvErr[1] = RInvErr[1], RInv[2], fOne (pPipe[0]) add Ninner = 1, Ninner } ;;// INNER LOOP 4 { .mfi (pPipe[0]) ldfd JY = [posPtr], 8 (pPipe[3]) fmpy RT[0] = RSqr[2], isaJ[3] (pPipe[1]) add TypeJ[1] = NTI, TypeJ[1] } { .mfi nop 0x0 (pPipe[4]) fmpy RInv6[1] = RInv6[1], RInv[3] (pPipe[0]) shladd chargePtr = jnr, 3, CHARGE } ;;// INNER LOOP 5 { .mfi (pPipe[0]) ldfd JZ = [posPtr], 8 (pPipe[2]) fnma RInvErr[0] = RInvErr[0], RInv[1], fOne nop 0x0 } { .mfi (pJJNR) ld4 jnr = [jjnrPtr], 4 (pPipe[2]) fmpy isaJ[2] = isaJ[2], GBTabscale (pPipe[0]) add InnerCnt = -1, InnerCnt } ;;// INNER LOOP 6 { .mfi (pPipe[0]) ldfd isaJ[0] = [isaPtr] (pPipe[1]) fmpy isaJ[1] = isaJ[1], isaI nop 0x0 } { .mfi nop 0x0 (pPipe[5]) fma G[1] = eps[1], H[1], G[1] nop 0x0 } ;;// INNER LOOP 7 
{ .mfi (pPipe[0]) ld4 TypeJ[0] = [TypeJ[0]] (pPipe[1]) fma RSqr[0] = DY[1], DY[1], RSqr[0] (pPipe[1]) shladd TypeJ[1] = TypeJ[1], 4, NBFP } { .mfi nop 0x0 (pPipe[3]) fma RInv[2] = RInvT[1], RInvErr[1], RInv[2] (pJJNR) add jjnrPtr = JJNR_PREFETCH_DISTANCE, jjnrPtr } ;;// INNER LOOP 8 { .mfi (pPipe[0]) ldfd Charge[0] = [chargePtr] (pPipe[4]) fnma VNBTotal = C6[2], RInv6[1], VNBTotal (pPipe[4]) shladd nnn = nnn, 2, zero } { .mfi nop 0x0 (pPipe[4]) fmpy RInv6[1] = RInv6[1], RInv6[1] nop 0x0 } ;;// INNER LOOP 9 { .mfi (pJJNR) lfetch.nta [jjnrPtr] (pPipe[2]) fma RInvT[0] = RInvErr[0], f3_8, fHALF nop 0x0 } { .mfi nop 0x0 (pPipe[2]) fmpy RInvU[0] = RInv[1], RInvErr[0] (pPipe[4]) shladd nnn = nnn, 3, GBTab } ;;// INNER LOOP 10 { .mfi (pPipe[4]) ldfpd Y[0], F[0] = [nnn], 16 (pPipe[2]) fmpy Charge[2] = Charge[2], IQ nop 0x0 } { .mfi nop 0x0 (pPipe[5]) fma F[1] = eps[1], G[1], F[1] nop 0x0 } ;;// INNER LOOP 11 { .mfi (pPipe[4]) ldfpd G[0], H[0] = [nnn] (pPipe[1]) fma RSqr[0] = DZ[1], DZ[1], RSqr[0] (pJJNR) add jjnrPtr = -JJNR_PREFETCH_DISTANCE, jjnrPtr } { .mfi nop 0x0 (pPipe[3]) fmpy RT[0] = RT[0], RInv[2] nop 0x0 } ;;// INNER LOOP 12 { .mfi nop 0x0 (pPipe[1]) fmpy Charge[1] = Charge[1], isaJ[1] nop 0x0 } { .mfi nop 0x0 (pPipe[3]) fmpy RInv[2] = RInv[2], RInv[2] nop 0x0 } ;;// INNER LOOP 13 { .mfi nop 0x0 (pPipe[2]) fma RInv[1] = RInvU[0], RInvT[0], RInv[1] nop 0x0 } { .mfi nop 0x0 (pPipe[4]) fsub eps[0] = RT[1], n0[1] nop 0x0 } ;;// INNER LOOP 14 { .mfi nop 0x0 nop 0x0 nop 0x0 } { .mfi nop 0x0 (pPipe[5]) fma Y[1] = eps[1], F[1], Y[1] nop 0x0 } ;;// INNER LOOP 15 { .mfi nop 0x0 (pPipe[1]) frsqrta RInv[0], p0 = RSqr[0] nop 0x0 } { .mfi nop 0x0 (pPipe[3]) fcvt.fx.trunc n0[0] = RT[0] nop 0x0 } ;;// INNER LOOP 16 { .mfi nop 0x0 (pPipe[3]) fmpy RInv6[0] = RInv[2], RInv[2] nop 0x0 } { .mfi nop 0x0 (pPipe[4]) fma VNBTotal = C12[2], RInv6[1], VNBTotal nop 0x0 } ;;// INNER LOOP 17 { .mfi nop 0x0 (pPipe[0]) fsub DX[0] = IX, DX[0] nop 0x0 } { .mfi nop 0x0 (pPipe[2]) fmpy RInvErr[0] = 
RInv[1], RSqr[1] nop 0x0 } ;;// INNER LOOP 18 { .mfi nop 0x0 (pPipe[0]) fsub DY[0] = IY, DY[0] nop 0x0 } { .mfb nop 0x0 (pPipe[5]) fma VCTotal = Charge[5], Y[1], VCTotal br.ctop.sptk.many innerLoop_nf } ;;// End of modulo-scheduled inner loop // Having finished the loop, we now compute various quantities to // store. In parallel, start computing some of the values // for the next loop trip, if we're going there.// OUTER EPILOGUE 1 { .mfi (pCont) shladd typePtr = II, 2, TYPE nop 0x0 (pCont) shladd II3 = II, 1, II } { .mfi (pCont) shladd chargePtr = II, 3, CHARGE nop 0x0 (pCont) shladd IS3 = IS, 1, IS } ;;// OUTER EPILOGUE 2 { .mfi nop 0x0 nop 0x0 nop 0x0 } ;;// OUTER EPILOGUE 3 { .mfi (pCont) ld4 IS = [shiftPtr], 4 nop 0x0 (pCont) shladd isaPtr = II, 3, INVSQRTA } { .mmf (pCont) setf.sig f33 = NTYPE nop 0x0 nop 0x0 } ;;// OUTER EPILOGUE 4 { .mfi (pCont) ld4 NTI = [typePtr] nop 0x0 (pCont) shladd shiftVPtr = IS3, 3, SHIFTVEC } { .mfi nop 0x0 nop 0x0 (pCont) shladd posPtr = II3, 3, POSITION } ;;// OUTER EPILOGUE 6 { .mmi stfd [VCPtr] = VCTotal (pCont) ld4 ggid = [gidPtr], 4 nop 0x0 } ;;// OUTER EPILOGUE 7 { .mmi nop 0x0 (pCont) ld4 II = [iinrPtr] ,4 nop 0x0 }// OUTER EPILOGUE 8 { .mib stfd [VNBPtr] = VNBTotal nop 0x0 (pCont) br.cond.sptk.many outerLoop_nf } ;; // Finish if this was the last chunk, or do another thread-loop iteration// THREAD EPILOGUE 1 { .mib nop 0x0 nop 0x0 (pMore) br.cond.sptk.many threadLoop_nf } ;; // Ready to exit - restore the floating-point registers we saved, the // loop counter, and the predicates, then we're done. 
Note that the // stack pointer has the address of the last saved FP register.finish_nf:// EXIT 1 { .mmi mov fillP0 = sp add fillP1 = 16, sp nop 0x0 } { .mmi st4 [OuterIter] = Nouter st4 [InnerIter] = Ninner nop 0x0 } ;;// EXIT 2 { .mmi ldf.fill fs13 = [fillP0], 32 ldf.fill fs12 = [fillP1], 32 nop 0x0 } ;;// EXIT 3 { .mmi ldf.fill fs11 = [fillP0], 32 ldf.fill fs10 = [fillP1], 32 nop 0x0 } ;;// EXIT 4 { .mmi ldf.fill fs9 = [fillP0], 32 ldf.fill fs8 = [fillP1], 32 nop 0x0 } ;;// EXIT 5 { .mmi ldf.fill fs7 = [fillP0], 32 ldf.fill fs6 = [fillP1], 32 add sp = 13 * 16, sp } ;;// EXIT 6 { .mmi ldf.fill fs5 = [fillP0], 32 ldf.fill fs4 = [fillP1], 32 mov ar.lc = LCSave } ;;// EXIT 7 { .mmi ldf.fill fs3 = [fillP0], 32 ldf.fill fs2 = [fillP1], 32 mov pr = PRSave, 0x1ffff } ;;// EXIT 8 { .mmb ldf.fill fs1 = [fillP0], 32 ldf.fill fs0 = [fillP1], 32 br.ret.sptk.few rp } ;; .endp nb_kernel410nf_ia64_double
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?