nb_kernel410nf_ia64_double.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,012 行 · 第 1/2 页

S
1,012
字号
	{ .mmi						ld4				ggid = [gidPtr], 4		shladd			II3 = II, 1, II		shladd			IS3 = IS, 1, IS	}	{ .mmi		ld4				NJ1 = [jindexPtr], 4		shladd			chargePtr = II, 3, CHARGE		shladd			jjnrPtr = NJ0, 2, JJNR	} ;;//  THREAD PROLOGUE 5		{ .mmi		cmp.lt			pCont, pDone = Tmp2, NRI								nop				0x0				shladd			typePtr = II, 2, TYPE	}		{ .mmi		shladd			posPtr    = II3, 3, POSITION		nop				0x0				shladd			shiftVPtr = IS3, 3, SHIFTVEC		} ;;//  THREAD PROLOGUE 6		{	.mfi		nop				0x0				nop				0x0				shladd			isaPtr = II, 3, INVSQRTA	} ;;//	THREAD PROLOGUE 7	{ .mmi				ld4				jnr = [jjnrPtr], 4	(pCont)	ld4			IS = [shiftPtr], 4		nop				0x0	}		{ .mmi	(pCont) ld4			II = [iinrPtr], 4			ld4			NTI = [typePtr]	(pLast)	mov			NN1 = NRI	} ;;//  12 bundles in thread prologue - still alignedouterLoop_nf:	//	At this point in the outer loop, the following values are ready	//	//		FActII		Pointer to FACTION XYZ for II	//		FShiftIS	Pointer to FSHIFT XYZ for IS	//		shiftVPtr	Pointer to current shift XYZ values	//		posPtr		Pointer to current XYZ position	//		chargePtr	Pointer to current atom charge	//		ggid		Index for Vc array	//		jjnr		Pointer to next neighbor index	//		jnr			Current jnr value	//		NJ0, NJ1	Bounds of current neighbor list	//	//	Load up all the floating-point values (yes, McKinley can do 4 FP loads	//	per cycle) and initialize the loop counters and predicates. Compute	//	the initial position <x, y, z> and charge. If this isn't the last time	//	through the loop, start loading the next value for NJ1 - we already	//	moved the previous NJ1 -> NJ0.//	OUTER PROLOGUE 1	{	.mfi								nop 		0x0		mov		FIX = f0		add		Nouter = 1, Nouter	}	{	.mmf		ldfd		shX = [shiftVPtr], 8		ldfd		PosX = [posPtr], 8		mov			FIY = f0	} ;;//	OUTER PROLOGUE 2	{	.mmf		setf.sig	f32 = NTI		ldfd		shY = [shiftVPtr], 8		nop			0x0	}	{	.mfi		ldfd		PosY = [posPtr], 8		nop			0x0		nop			0x0			} ;;	{	.mmf								ldfd		shZ = [shiftVPtr]		ldfd		PosZ = [posPtr]		mov			FIZ = f0	}	{	.mmi		nop			0x0		nop			0x0				shladd		VNBPtr = ggid, 3, VNB	} ;;//	OUTER PROLOGUE 4	{	.mmf			nop			0x0		nop			0x0				xma.l		f32 = f32, f33, fZero	}	{ 	.mmi		sub			InnerCnt = NJ1, NJ0, 1		nop			0x0				shladd		VCPtr = ggid, 3, VC	} ;;//	OUTER PROLOGUE 5	{	.mmi		nop			0x0		nop			0x0				mov			NJ0 = NJ1	} ;;//	OUTER PROLOGUE 6	{	.mmf				ldfd		ICharge = [chargePtr], 8		ldfd		VNBTotal = [VNBPtr]		fadd		IX = shX, PosX	} ;;//	OUTER PROLOGUE 7	{	.mfi		ldfd		VCTotal = [VCPtr]		fadd		IY = shY, PosY		add			NN0 = 1, NN0	}	{	.mmi	(pCont)	ld4		NJ1 = [jindexPtr], 4		ldfd		isaI = [isaPtr], 4		//	This may seem strange, but we set the first stage of the		//	pipe to execute this way because setting pr.rot doesn't take		//	into account how much the predicates have rotated. If this is		//	the first time through, we cleared all the pipeline predicates		//	in the initialization. If not, flushing the pipeline set all		//	the pipeline predicates to 0		cmp.eq		pPipe[0], p0 = zero, zero	} ;;//	OUTER PROLOGUE 8	{	.mfi				cmp.lt		pCont, pDone = NN0, NN1		fadd		IZ = shZ, PosZ		mov		    ar.lc = InnerCnt	} ;;//	OUTER PROLOGUE 9	{	.mfi				getf.sig	NTI = f32		fmpy		IQ = ICharge, Facel		mov			ar.ec = PIPE_DEPTH	} ;;// 14 bundles in outer loop - still aligned.	//	The inner loop is a 6-stage pipeline. The serial sequence of float ops	//	is folded into a 17-cycle loop (17 * 2 = 34 float ops, one empty),     //  then divided	//	into 5 stages.innerLoop_nf://	INNER LOOP 1	{	.mfi		(pPipe[2])	ldfd	C6[0] = [TypeJ[2]], 8	(pPipe[1])	fsub	DZ[1] = IZ, DZ[1]	(pPipe[0])	shladd	jnr3 = jnr, 1, jnr	}	//	We march through jjnr[] sequentially, so it's usually a good idea	//	to preload the next value. However, we don't want to do this if	//	(1) we're in the epilogue or (2) this is the last time through and	//	there are no more atoms to inspect. Thus, we keep track of the loop	//	trip and use the logic below to see if we should load ahead	.pred.rel "mutex", pCont, pDone	{	.mfi	(pCont)		cmp.ge	pJJNR, p0 = InnerCnt, zero	(pPipe[2])	fmpy	RInvErr[0] = RSqr[1], RInv[1]	(pDone)		cmp.gt	pJJNR, p0 = InnerCnt, zero	} ;;//	INNER LOOP 2	{	.mfi		(pPipe[2])	ldfd	C12[0] = [TypeJ[2]]	(pPipe[3])	fmpy	RInvT[1] = fHALF, RInv[2]	(pPipe[0])	shladd	isaPtr = jnr, 3, INVSQRTA	}	{	.mfi	(pPipe[4])	getf.sig nnn = n0[1]	(pPipe[4])	fcvt.xf n0[1] = n0[1]	(pPipe[0])	shladd	posPtr = jnr3, 3, POSITION	} ;;//	INNER LOOP 3	{	.mfi										(pPipe[0])	ldfd	JX = [posPtr], 8	(pPipe[1])	fmpy	RSqr[0] = DX[1], DX[1]	(pPipe[0])	shladd  TypeJ[0] = jnr, 2, TYPE	}	{  	.mfi				nop		0x0	(pPipe[3])	fnma	RInvErr[1] = RInvErr[1], RInv[2], fOne	(pPipe[0])	add	Ninner = 1, Ninner	} ;;//	INNER LOOP 4	{	.mfi		(pPipe[0])	ldfd	JY = [posPtr], 8	(pPipe[3])	fmpy	RT[0] = RSqr[2], isaJ[3]	(pPipe[1])	add		TypeJ[1] = NTI, TypeJ[1]		}	{	.mfi				nop		0x0	(pPipe[4])	fmpy	RInv6[1] = RInv6[1], RInv[3]	(pPipe[0])	shladd	chargePtr = jnr, 3, CHARGE	} ;;//	INNER LOOP 5	{	.mfi										(pPipe[0])	ldfd	JZ = [posPtr], 8	(pPipe[2])	fnma	RInvErr[0] = RInvErr[0], RInv[1], fOne					nop		0x0	}	{	.mfi	(pJJNR)		ld4		jnr = [jjnrPtr], 4	(pPipe[2])	fmpy	isaJ[2] = isaJ[2], GBTabscale	(pPipe[0])	add		InnerCnt = -1, InnerCnt	} ;;//	INNER LOOP 6	{	.mfi										(pPipe[0])	ldfd	isaJ[0] = [isaPtr]	(pPipe[1])	fmpy	isaJ[1] = isaJ[1], isaI				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[5])	fma		G[1] = eps[1], H[1], G[1]				nop		0x0	} ;;//	INNER LOOP 7	{	.mfi										(pPipe[0])	ld4 	TypeJ[0] = [TypeJ[0]]				(pPipe[1])	fma		RSqr[0] = DY[1], DY[1], RSqr[0]	(pPipe[1])	shladd	TypeJ[1] = TypeJ[1], 4, NBFP	}	{	.mfi				nop		0x0	(pPipe[3])	fma		RInv[2] = RInvT[1], RInvErr[1], RInv[2]	(pJJNR)     add     jjnrPtr = JJNR_PREFETCH_DISTANCE, jjnrPtr	} ;;//	INNER LOOP 8	{	.mfi										(pPipe[0])	ldfd	Charge[0] = [chargePtr]							(pPipe[4])	fnma	VNBTotal = C6[2], RInv6[1], VNBTotal	(pPipe[4])	shladd	nnn = nnn, 2, zero	}	{	.mfi				nop		0x0		(pPipe[4])	fmpy	RInv6[1] = RInv6[1], RInv6[1]				nop		0x0	} ;;//	INNER LOOP 9	{	.mfi										(pJJNR)     lfetch.nta  [jjnrPtr]	(pPipe[2])	fma		RInvT[0] = RInvErr[0], f3_8, fHALF				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[2])	fmpy	RInvU[0] = RInv[1], RInvErr[0]	(pPipe[4])	shladd	nnn = nnn, 3, GBTab	} ;;//	INNER LOOP 10	{	.mfi										(pPipe[4])	ldfpd	Y[0], F[0] = [nnn], 16	(pPipe[2])	fmpy	Charge[2] = Charge[2], IQ				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[5])	fma		F[1] = eps[1], G[1], F[1]				nop		0x0	} ;;//	INNER LOOP 11	{	.mfi										(pPipe[4])	ldfpd	G[0], H[0] = [nnn]	(pPipe[1])	fma		RSqr[0] = DZ[1], DZ[1], RSqr[0]	(pJJNR)     add     jjnrPtr = -JJNR_PREFETCH_DISTANCE, jjnrPtr	}	{	.mfi				nop		0x0	(pPipe[3])	fmpy	RT[0] = RT[0], RInv[2]				nop		0x0	} ;;//	INNER LOOP 12	{	.mfi													nop		0x0	(pPipe[1])	fmpy	Charge[1] = Charge[1], isaJ[1]				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[3])	fmpy	RInv[2] = RInv[2], RInv[2]				nop		0x0	} ;;//	INNER LOOP 13	{	.mfi													nop		0x0	(pPipe[2])	fma		RInv[1] = RInvU[0], RInvT[0], RInv[1]						nop		0x0	}	{	.mfi				nop		0x0	(pPipe[4])	fsub	eps[0] = RT[1], n0[1]				nop		0x0	} ;;//	INNER LOOP 14	{	.mfi													nop		0x0				nop		0x0				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[5])	fma     Y[1] = eps[1], F[1], Y[1]				nop		0x0	} ;;//	INNER LOOP 15	{	.mfi													nop		0x0	(pPipe[1])	frsqrta RInv[0], p0 = RSqr[0]				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[3])	fcvt.fx.trunc n0[0] = RT[0]				nop		0x0	} ;;//	INNER LOOP 16	{	.mfi													nop		0x0	(pPipe[3])	fmpy	RInv6[0] = RInv[2], RInv[2]				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[4])	fma		VNBTotal = C12[2], RInv6[1], VNBTotal				nop		0x0	} ;;//	INNER LOOP 17	{	.mfi													nop		0x0	(pPipe[0])	fsub	DX[0] = IX, DX[0]				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[2])	fmpy	RInvErr[0] = RInv[1], RSqr[1]				nop		0x0	} ;;//	INNER LOOP 18	{	.mfi													nop		0x0	(pPipe[0])	fsub	DY[0] = IY, DY[0]				nop		0x0	}	{	.mfb				nop		0x0	(pPipe[5])	fma 	VCTotal = Charge[5], Y[1], VCTotal			br.ctop.sptk.many innerLoop_nf	} ;;// 	End of modulo-scheduled inner loop	//	Having finshed the loop, we now compute various quantities to	//	store. In paralllel, start computing computing some of the values	//	for the next loop trip, if we're going there.//	OUTER EPILOGUE 1    {   .mfi	(pCont)	shladd		typePtr = II, 2, TYPE			nop			0x0	(pCont)	shladd		II3 = II, 1, II    }	{	.mfi									(pCont)	shladd		chargePtr = II, 3, CHARGE		    	nop		0x0	(pCont)	shladd		IS3 = IS, 1, IS    } ;;//	OUTER EPILOGUE 2	{	.mfi		nop				0x0		nop				0x0		nop				0x0	} ;;//	OUTER EPILOGUE 3    {   .mfi	(pCont)	ld4			IS = [shiftPtr], 4			nop				0x0	(pCont)	shladd 		isaPtr = II, 3, INVSQRTA	}    {   .mmf	(pCont)	setf.sig	f33 = NTYPE			nop			0x0			nop				0x0	} ;;// 	OUTER EPILOGUE 4    {   .mfi	(pCont)	ld4				NTI = [typePtr]	  				nop				0x0	(pCont)	shladd	shiftVPtr = IS3, 3, SHIFTVEC							}     {   .mfi		nop 0x0			nop				0x0	(pCont)	shladd	posPtr = II3, 3, POSITION	} ;;//	OUTER EPILOGUE 6   {   .mmi		stfd    [VCPtr] = VCTotal	(pCont)		ld4     ggid = [gidPtr], 4 		nop 	0x0	} ;;//	OUTER EPILOGUE 7	{	.mmi		nop		0x0	(pCont)	ld4	II = [iinrPtr] ,4		nop		0x0	}//	OUTER EPILOGUE 8	{	.mib		stfd    [VNBPtr] = VNBTotal		nop		0x0	(pCont)	br.cond.sptk.many	outerLoop_nf	} ;;	// Finish if this was the last chunk, or do another thread-loop iteration//  THREAD EPILOGUE 1	{ .mib						nop				0x0		nop				0x0	(pMore) br.cond.sptk.many threadLoop_nf	} ;;		//	Ready to exit - restore the floating-point registers we saved, the	//	loop counter, and the predicates, then we're done. Note that the	//	stack pointer has the address of the last saved FP register.finish_nf://  EXIT 1	{	.mmi		mov			fillP0 = sp		add			fillP1 = 16, sp		nop			0x0	}  	{	.mmi		st4			[OuterIter] = Nouter		st4			[InnerIter] = Ninner		nop			0x0	} ;;//  EXIT 2	{	.mmi		ldf.fill		fs13 = [fillP0], 32		ldf.fill		fs12 = [fillP1], 32		nop				0x0	} ;;//  EXIT 3	{	.mmi		ldf.fill		fs11 = [fillP0], 32		ldf.fill		fs10 = [fillP1], 32		nop				0x0	} ;;//  EXIT 4	{	.mmi		ldf.fill		fs9 = [fillP0], 32		ldf.fill		fs8 = [fillP1], 32		nop				0x0	} ;;//  EXIT 5	{	.mmi		ldf.fill		fs7 = [fillP0], 32		ldf.fill		fs6 = [fillP1], 32		add				sp = 13 * 16, sp	} ;;//  EXIT 6	{	.mmi		ldf.fill		fs5 = [fillP0], 32		ldf.fill		fs4 = [fillP1], 32		mov				ar.lc = LCSave	} ;;//  EXIT 7	{	.mmi		ldf.fill		fs3 = [fillP0], 32		ldf.fill		fs2 = [fillP1], 32		mov				pr = PRSave, 0x1ffff	} ;;//  EXIT 8	{	.mmb		ldf.fill		fs1 = [fillP0], 32		ldf.fill		fs0 = [fillP1], 32		br.ret.sptk.few	rp	} ;;	.endp	 nb_kernel410nf_ia64_double

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?