nb_kernel100nf_ia64_double.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 786 行 · 第 1/2 页
S
786 行
// ---------------------------------------------------------------------------
// Tail of procedure nb_kernel100nf_ia64_double (closed by the .endp below).
//
// The procedure header, the symbolic register / constant names used here
// (NN0, COUNT, pCont, pPipe[], PIPE_DEPTH, ...) and the threadLoop_nf label
// targeted by THREAD EPILOGUE 1 are defined outside this excerpt.
//
// Layout: one instruction per line, grouped into explicitly templated
// bundles ({ .mfi ... }); ";;" marks a stop between instruction groups.
// Sections, in order:
//   THREAD PROLOGUE  - fetch a chunk start index from [COUNT], set up pointers
//   outerLoop_nf     - OUTER PROLOGUE: load shift/position/charge, arm ar.lc/ar.ec
//   innerLoop_nf     - modulo-scheduled loop closed by br.ctop
//   OUTER EPILOGUE   - store VCTotal, branch back to outerLoop_nf if pCont
//   THREAD EPILOGUE  - branch back to threadLoop_nf if pMore
//   finish_nf        - store iteration counters, restore fs0-fs7/ar.lc/pr, return
// ---------------------------------------------------------------------------

// THREAD PROLOGUE 1
{ .mfi
                fetchadd4.rel NN0 = [COUNT], THREAD_CHUNK_SIZE
                nop 0x0
                nop 0x0
}
{ .mfi
                // alignment bundle
                nop 0x0
                nop 0x0
                nop 0x0
}
;;

// THREAD PROLOGUE 2 - at least 12 cycle latency hole before this bundle (fetchadd4)
{ .mmi
                cmp.lt pCont, pDone = NN0, NRI
                shladd gidPtr = NN0, 2, GID
                adds NN1 = THREAD_CHUNK_SIZE, NN0
}
{ .mmi
                shladd jindexPtr = NN0, 2, JINDEX
                shladd shiftPtr = NN0, 2, SHIFT
                shladd iinrPtr = NN0, 2, IINR
}
;;

// THREAD PROLOGUE 3
{ .mmi
    (pCont)     ld4 II = [iinrPtr], 4
    (pCont)     ld4 IS = [shiftPtr], 4
                cmp.ge pLast, pMore = NN1, NRI
}
{ .mib
    (pCont)     ld4 NJ0 = [jindexPtr], 4
    (pCont)     adds Tmp2 = 1, NN0
    (pDone)     br.cond.spnt.few finish_nf
}
;;

// THREAD PROLOGUE 4
{ .mmi
                ld4 ggid = [gidPtr], 4
                shladd II3 = II, 1, II
                shladd IS3 = IS, 1, IS
}
{ .mmi
                ld4 NJ1 = [jindexPtr], 4
                shladd chargePtr = II, 3, CHARGE
                shladd jjnrPtr = NJ0, 2, JJNR
}
;;

// THREAD PROLOGUE 5
{ .mmi
                cmp.lt pCont, pDone = Tmp2, NRI
                nop 0x0
    (pLast)     mov NN1 = NRI
}
{ .mmi
                shladd posPtr = II3, 3, POSITION
                nop 0x0
                shladd shiftVPtr = IS3, 3, SHIFTVEC
}
;;

// THREAD PROLOGUE 6
{ .mmi
                ld4 jnr = [jjnrPtr], 4
    (pCont)     ld4 IS = [shiftPtr], 4
                nop 0x0
}
{ .mmi
    (pCont)     ld4 II = [iinrPtr], 4
                nop 0x0
                nop 0x0
}
;;

// 12 bundles in thread prologue - still aligned

outerLoop_nf:
        // At this point in the outer loop, the following values are ready
        //
        //   FActII      Pointer to FACTION XYZ for II
        //   FShiftIS    Pointer to FSHIFT XYZ for IS
        //   shiftVPtr   Pointer to current shift XYZ values
        //   posPtr      Pointer to current XYZ position
        //   chargePtr   Pointer to current atom charge
        //   ggid        Index for Vc array
        //   jjnr        Pointer to next neighbor index
        //   jnr         Current jnr value
        //   NJ0, NJ1    Bounds of current neighbor list
        //
        // NOTE(review): FActII and FShiftIS are not referenced anywhere in the
        // visible code, and the neighbor-index pointer is named jjnrPtr in the
        // instructions below - this list may be stale; confirm.
        //
        // Load up all the floating-point values (yes, McKinley can do 4 FP loads
        // per cycle) and initialize the loop counters and predicates.  Compute
        // the initial position <x, y, z> and charge.  If this isn't the last time
        // through the loop, start loading the next value for NJ1 - we already
        // moved the previous NJ1 -> NJ0.

// OUTER PROLOGUE 1
{ .mmf
                ldfd shX = [shiftVPtr], 8
                ldfd PosX = [posPtr], 8
                mov FIX = f0
}
{ .mfi
                nop 0x0
                mov FIY = f0
                add Nouter = 1, Nouter
}
;;

// OUTER PROLOGUE 2
{ .mmf
                ldfd shY = [shiftVPtr], 8
                ldfd PosY = [posPtr], 8
                mov FIZ = f0
}
{ .mmi
                nop 0x0
                nop 0x0
                shladd VCPtr = ggid, 3, VC
}
;;

// OUTER PROLOGUE 3
{ .mmi
                ldfd shZ = [shiftVPtr]
                ldfd PosZ = [posPtr]
                sub InnerCnt = NJ1, NJ0, 1
}
{ .mmi
                nop 0x0
                nop 0x0
                mov NJ0 = NJ1
}
;;

// OUTER PROLOGUE 4
{ .mmf
                ldfd ICharge = [chargePtr]
                ldfd VCTotal = [VCPtr]
                fadd IX = shX, PosX
}
;;

// OUTER PROLOGUE 5
{ .mfi
                add NN0 = 1, NN0
                fadd IY = shY, PosY
                // This may seem strange, but we set the first stage of the
                // pipe to execute this way because setting pr.rot doesn't take
                // into account how much the predicates have rotated.  If this is
                // the first time through, we cleared all the pipeline predicates
                // in the initialization.  If not, flushing the pipeline set all
                // the pipeline predicates to 0
                cmp.eq pPipe[0], p0 = zero, zero
}
;;

// OUTER PROLOGUE 6
{ .mfi
                cmp.lt pCont, pDone = NN0, NN1
                fadd IZ = shZ, PosZ
                mov ar.lc = InnerCnt
}
;;

// OUTER PROLOGUE 7
{ .mfi
    (pCont)     ld4 NJ1 = [jindexPtr], 4
                fmpy IQ = ICharge, Facel
                mov ar.ec = PIPE_DEPTH
}
;;

// 10 bundles in outer loop - still aligned.

        // The inner loop is a 6-stage pipeline.  The serial sequence of float ops
        // is folded into a 12-cycle loop (12 * 2 = 24 float ops), then divided
        // into 5 stages.
        //
        // NOTE(review): the loop body below consists of nine stop-delimited
        // groups (INNER LOOP 1-9) and is predicated on pPipe[0] through
        // pPipe[7]; the stage and cycle counts quoted above do not obviously
        // match - confirm against the definition of PIPE_DEPTH.

innerLoop_nf:

// INNER LOOP 1
{ .mfi
    (pPipe[0])  shladd chargePtr = jnr, 3, CHARGE
    (pPipe[2])  fsub DY[2] = IY, DY[2]
    (pPipe[0])  shladd jnr3 = jnr, 1, jnr
}
        // We march through jjnr[] sequentially, so it's usually a good idea
        // to preload the next value.  However, we don't want to do this if
        // (1) we're in the epilogue or (2) this is the last time through and
        // there are no more atoms to inspect.  Thus, we keep track of the loop
        // trip and use the logic below to see if we should load ahead
                .pred.rel "mutex", pCont, pDone
{ .mfi
    (pCont)     cmp.ge pJJNR, p0 = InnerCnt, zero
    (pPipe[5])  fma RInvT[0] = RInvErr[1], f3_8, fHALF
    (pDone)     cmp.gt pJJNR, p0 = InnerCnt, zero
}
;;

// INNER LOOP 2
{ .mfi
    (pJJNR)     ld4 jnr = [jjnrPtr], 4
    (pPipe[2])  fsub DZ[2] = IZ, DZ[2]
                nop 0x0
}
{ .mfi
    (pPipe[0])  shladd posPtr = jnr3, 3, POSITION
    (pPipe[4])  fmpy RInvErr[0] = RInv[1], RSqr[2]
                nop 0x0
}
;;

// INNER LOOP 3
{ .mfi
    (pPipe[0])  ldfd JX = [posPtr], 8
    (pPipe[2])  fmpy RSqr[0] = DX[2], DX[2]
    (pPipe[0])  add InnerCnt = -1, InnerCnt
}
{ .mfi
                nop 0x0
    (pPipe[3])  fma RSqr[1] = DZ[3], DZ[3], RSqr[1]
    (pPipe[0])  add Ninner = 1, Ninner
}
;;

// INNER LOOP 4
{ .mfi
    (pPipe[0])  ldfd JY = [posPtr], 8
    (pPipe[6])  fmpy RInvT[1] = fHALF, RInv[3]
    (pJJNR)     add jjnrPtr = JJNR_PREFETCH_DISTANCE, jjnrPtr
}
{ .mfi
                nop 0x0
    (pPipe[5])  fmpy RInvU[0] = RInv[2], RInvErr[1]
                nop 0x0
}
;;

// INNER LOOP 5
{ .mfi
    (pPipe[0])  ldfd JZ = [posPtr], 8
    (pPipe[7])  fma RInv[4] = RInvT[2], RInvErr[3], RInv[4]
                nop 0x0
}
{ .mfi
                nop 0x0
    (pPipe[6])  fmpy RInvErr[2] = RSqr[4], RInv[3]
                nop 0x0
}
;;

// INNER LOOP 6
{ .mfi
    (pJJNR)     lfetch.nta [jjnrPtr]
    (pPipe[4])  fnma RInvErr[0] = RInvErr[0], RInv[1], fOne
                nop 0x0
}
{ .mfi
                nop 0x0
    (pPipe[3])  fmpy Charge[3] = IQ, Charge[3]
                nop 0x0
}
;;

// INNER LOOP 7
{ .mfi
                nop 0x0
    (pPipe[2])  fma RSqr[0] = DY[2], DY[2], RSqr[0]
    (pJJNR)     add jjnrPtr = -JJNR_PREFETCH_DISTANCE, jjnrPtr
}
{ .mfi
                nop 0x0
    (pPipe[3])  frsqrta RInv[0], p0 = RSqr[1]
                nop 0x0
}
;;

// INNER LOOP 8
{ .mfi
                nop 0x0
    (pPipe[1])  fsub DX[1] = IX, DX[1]
                nop 0x0
}
{ .mfi
                nop 0x0
    (pPipe[5])  fma RInv[2] = RInvT[0], RInvU[0], RInv[2]
                nop 0x0
}
;;

// INNER LOOP 9
{ .mfi
    (pPipe[0])  ldfd QCharge = [chargePtr]
    (pPipe[6])  fnma RInvErr[2] = RInvErr[2], RInv[3], fOne
                nop 0x0
}
{ .mfb
                nop 0x0
    (pPipe[7])  fma VCTotal = Charge[7], RInv[4], VCTotal
                br.ctop.sptk.many innerLoop_nf
}
;;

        // End of modulo-scheduled inner loop

        // Having finished the loop, we now compute various quantities to
        // store.  In parallel, start computing some of the values
        // for the next loop trip, if we're going there.

// OUTER EPILOGUE 1
{ .mfi
                nop 0x0
                nop 0x0
    (pCont)     shladd II3 = II, 1, II
}
{ .mfi
    (pCont)     shladd chargePtr = II, 3, CHARGE
                nop 0x0
    (pCont)     shladd IS3 = IS, 1, IS
}
;;

// OUTER EPILOGUE 2
{ .mfi
    (pCont)     ld4 II = [iinrPtr], 4
                nop 0x0
                nop 0x0
}
{ .mfi
    (pCont)     ld4 IS = [shiftPtr], 4
                nop 0x0
                nop 0x0
}
;;

// OUTER EPILOGUE 3
{ .mfi
                nop 0x0
                nop 0x0
    (pCont)     shladd shiftVPtr = IS3, 3, SHIFTVEC
}
{ .mfi
                nop 0x0
                nop 0x0
    (pCont)     shladd posPtr = II3, 3, POSITION
}
;;

// OUTER EPILOGUE 4
{ .mmb
                stfd [VCPtr] = VCTotal
    (pCont)     ld4 ggid = [gidPtr], 4
    (pCont)     br.cond.sptk.many outerLoop_nf
}
;;

        // Finish if this was the last chunk, or do another thread-loop iteration

// THREAD EPILOGUE 1
{ .mib
                nop 0x0
                nop 0x0
    (pMore)     br.cond.sptk.many threadLoop_nf
}
;;

        // Ready to exit - restore the floating-point registers we saved, the
        // loop counter, and the predicates, then we're done.  Note that the
        // stack pointer has the address of the last saved FP register.

finish_nf:

// EXIT 1
{ .mmi
                mov fillP0 = sp
                add fillP1 = 16, sp
                nop 0x0
}
{ .mmi
                st4 [OuterIter] = Nouter
                st4 [InnerIter] = Ninner
                nop 0x0
}
;;

// EXIT 2
{ .mmi
                ldf.fill fs7 = [fillP0], 32
                ldf.fill fs6 = [fillP1], 32
                mov ar.lc = LCSave
}
;;

// EXIT 3
{ .mmi
                ldf.fill fs5 = [fillP0], 32
                ldf.fill fs4 = [fillP1], 32
                mov pr = PRSave, 0x1ffff
}
;;

// EXIT 4
{ .mmi
                ldf.fill fs3 = [fillP0], 32
                ldf.fill fs2 = [fillP1], 32
                add sp = 7 * 16, sp
}
;;

// EXIT 5
{ .mmb
                ldf.fill fs1 = [fillP0]
                ldf.fill fs0 = [fillP1]
                br.ret.sptk.few rp
}
;;

                .endp nb_kernel100nf_ia64_double
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?