nb_kernel030nf_ia64_double.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 893 行 · 第 1/2 页

S
893
字号
	} ;;    //  THREAD PROLOGUE 2 - at least 12 cycle latency hole before this bundle (fetchadd4)	{	.mmi				cmp.lt			pCont, pDone = NN0, NRI		shladd			gidPtr = NN0, 2, GID		adds			NN1 = THREAD_CHUNK_SIZE, NN0	}	{	.mmi		shladd			jindexPtr = NN0, 2, JINDEX		shladd   		shiftPtr  = NN0, 2, SHIFT		shladd			iinrPtr   = NN0, 2, IINR	} ;; //  THREAD PROLOGUE 3 		{ .mmi					(pCont) ld4			II = [iinrPtr], 4	(pCont) ld4			IS = [shiftPtr], 4		cmp.ge			pLast, pMore = NN1, NRI	}	{ .mib	(pCont) ld4			NJ0 = [jindexPtr], 4	(pCont) adds		Tmp2 = 1, NN0	(pDone) br.cond.spnt.few finish_nf	} ;; 		//  THREAD PROLOGUE 4		{ .mmi						ld4				ggid = [gidPtr], 4		shladd			II3 = II, 1, II		shladd			IS3 = IS, 1, IS	}	{ .mmi		ld4				NJ1 = [jindexPtr], 4		nop				0x0		shladd			jjnrPtr = NJ0, 2, JJNR	} ;;//  THREAD PROLOGUE 5		{ .mmi		cmp.lt			pCont, pDone = Tmp2, NRI								nop				0x0		shladd			typePtr = II, 2, TYPE	}		{ .mmi		shladd			posPtr    = II3, 3, POSITION		nop				0x0		shladd			shiftVPtr = IS3, 3, SHIFTVEC		} ;;//  THREAD PROLOGUE 6		{ .mmi						ld4				jnr = [jjnrPtr], 4	(pCont)	ld4			IS = [shiftPtr], 4		nop				0x0	}		{ .mmi	(pCont) ld4			II = [iinrPtr], 4			ld4			NTI = [typePtr]	(pLast)	mov			NN1 = NRI	} ;;//  12 bundles in thread prologue - still alignedouterLoop_nf:	//	At this point in the outer loop, the following values are ready	//	//		FActII		Pointer to FACTION XYZ for II	//		FShiftIS	Pointer to FSHIFT XYZ for IS	//		shiftVPtr	Pointer to current shift XYZ values	//		posPtr		Pointer to current XYZ position	//		chargePtr	Pointer to current atom charge	//		ggid		Index for Vc array	//		jjnr		Pointer to next neighbor index	//		jnr			Current jnr value	//		NJ0, NJ1	Bounds of current neighbor list	//	//	Load up all the floating-point values (yes, McKinley can do 4 FP loads	//	per cycle) and initialize the loop counters and predicates. Compute	//	the initial position <x, y, z> and charge. 
If this isn't the last time	//	through the loop, start loading the next value for NJ1 - we already	//	moved the previous NJ1 -> NJ0.//	OUTER PROLOGUE 1	{	.mfi								nop 		0x0		mov			FIX = f0		add			NN0 = 1, NN0	}	{	.mmf		ldfd		shX = [shiftVPtr], 8		ldfd		PosX = [posPtr], 8		mov			FIY = f0	} ;;//	OUTER PROLOGUE 2	{	.mmf		setf.sig	f32 = NTI		ldfd		shY = [shiftVPtr], 8		nop			0x0	}	{	.mfi		ldfd		PosY = [posPtr], 8		nop			0x0		add		Nouter = 1, Nouter	} ;;	{	.mmf								ldfd		shZ = [shiftVPtr]		ldfd		PosZ = [posPtr]		mov			FIZ = f0	}	{	.mmi		nop			0x0		nop			0x0		shladd		VNBPtr = ggid, 3, VNB	} ;;//	OUTER PROLOGUE 4	{	.mmf			nop			0x0		nop			0x0		xma.l		f32 = f32, f33, fZero	}	{ 	.mii		ldfd		VNBTotal = [VNBPtr]		sub			InnerCnt = NJ1, NJ0, 1		mov			NJ0 = NJ1	} ;;//	OUTER PROLOGUE 5	{	.mmf		nop			0x0		nop			0x0		fadd		IX = shX, PosX	} ;;//	OUTER PROLOGUE 6	{	.mfi	(pCont)	ld4		NJ1 = [jindexPtr], 4		fadd		IY = shY, PosY		//	This may seem strange, but we set the first stage of the		//	pipe to execute this way because setting pr.rot doesn't take		//	into account how much the predicates have rotated. If this is		//	the first time through, we cleared all the pipeline predicates		//	in the initialization. If not, flushing the pipeline set all		//	the pipeline predicates to 0		cmp.eq		pPipe[0], p0 = zero, zero	} ;;//	OUTER PROLOGUE 7	{	.mfi				cmp.lt		pCont, pDone = NN0, NN1		fadd		IZ = shZ, PosZ		mov		    ar.lc = InnerCnt	} ;;//	OUTER PROLOGUE 8	{	.mfi				getf.sig	NTI = f32		nop			0x0		mov			ar.ec = PIPE_DEPTH	} ;;// 14 bundles in outer loop - still aligned.	//	The inner loop is a 6-stage pipeline. The serial sequence of float ops	//	is folded into a 17-cycle loop (17 * 2 = 34 float ops, one empty),     //  then divided	//	into 5 stages.innerLoop_nf://	INNER LOOP 1	{	.mfi					nop		0x0	(pPipe[1])	fsub	DX[1] = IX, DX[1]	(pPipe[0])	shladd	jnr3 = jnr, 1, jnr	}	//	We march through jjnr[] sequentially, so it's usually a good idea	//	to preload the next value. 
However, we don't want to do this if	//	(1) we're in the epilogue or (2) this is the last time through and	//	there are no more atoms to inspect. Thus, we keep track of the loop	//	trip and use the logic below to see if we should load ahead	.pred.rel "mutex", pCont, pDone	{	.mfi	(pCont)		cmp.ge	pJJNR, p0 = InnerCnt, zero	(pPipe[4])	fma 	RInv[2] = RInvErr[2], RInvT[2], RInv[2]	(pDone)		cmp.gt	pJJNR, p0 = InnerCnt, zero	} ;;//	INNER LOOP 2	{	.mfi					nop		0x0	(pPipe[1])	fsub	DY[1] = IY, DY[1]	(pPipe[0])	add	Ninner = 1, Ninner	}	{	.mfi	(pPipe[0])	shladd	posPtr = jnr3, 3, POSITION	(pPipe[2])	frsqrta RInv[0], p0 = RSqr[1]				nop		0x0	} ;;//	INNER LOOP 3	{	.mfi										(pPipe[0])	ldfd	JX = [posPtr], 8	(pPipe[1])	fsub	DZ[1] = IZ, DZ[1]	(pPipe[0])	shladd  TypeJ[0] = jnr, 2, TYPE	}	{  	.mfi				nop		0x0	(pPipe[3])	fma 	RInv[1] = RInvU[1], RInvT[1], RInv[1]				nop		0x0	} ;;//	INNER LOOP 4	{	.mfi	(pPipe[0])	ldfd	JY = [posPtr], 8	(pPipe[7])	fma     Disp_Y[2] = eps2, Disp_F[2], Disp_Y[2]	(pPipe[5])	shladd  nnn[1] = nnn[1], 2, zero	}	{	.mfi				nop		0x0	(pPipe[7])	fma		Rep_F[2] = eps2, Rep_G[2], Rep_F[2]	(pPipe[0])	add		InnerCnt = -1, InnerCnt	} ;;//	INNER LOOP 5	{	.mfi										(pPipe[0])	ldfd	JZ = [posPtr], 8	(pPipe[1])	fmpy	RSqr[0] = DX[1], DX[1]	(pPipe[5])	shladd  nnn[1] = nnn[1], 4, VFTab	}	{	.mfi	(pJJNR)		ld4		jnr = [jjnrPtr], 4	(pPipe[4])	fmpy	RT[2] = RT[2], RInv[2]				nop		0x0	} ;;//	INNER LOOP 6	{	.mfi		(pPipe[5])	ldfpd	Disp_Y[0], Disp_F[0] = [nnn[1]], 16	(pPipe[2])	fmpy	RInvErr[0] = RInv[0], RSqr[1]				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[5])	fsub	eps0 = RT[3], n0[1]				nop		0x0	} ;;//	INNER LOOP 7	{	.mfi										(pPipe[5])	ldfpd	Disp_G[0], Disp_H[0] = [nnn[1]], 16	(pPipe[2])	fmpy	RT[0] = RSqr[1], Tabscale	(pPipe[2])	shladd  TypeJ[2] = TypeJ[2], 4, NBFP	}	{	.mfi				nop		0x0	(pPipe[3])	fmpy	RInvErr[1] = RInv[1], RSqr[2]	(pJJNR)     add     jjnrPtr = JJNR_PREFETCH_DISTANCE, jjnrPtr	} ;;//	INNER LOOP 8	{	.mfi		(pPipe[5])	ldfpd	Rep_Y[0], Rep_F[0] = [nnn[1]], 16	(pPipe[7])	
fma		VNBTotal = C6[5], Disp_Y[2], VNBTotal 				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[7])	fma     Rep_Y[2] = eps2, Rep_F[2], Rep_Y[2]				nop		0x0	} ;;//	INNER LOOP 9	{	.mfi										(pPipe[5])	ldfpd	Rep_G[0], Rep_H[0] = [nnn[1]]	(pPipe[1])	fma		RSqr[0] = DY[1], DY[1], RSqr[0]				nop		0x0	}	{	.mfi	(pPipe[0])	ld4 	TypeJ[0] = [TypeJ[0]]	(pPipe[4])	fcvt.fx.trunc n0[0] = RT[2]				nop		0x0	} ;;//	INNER LOOP 10	{	.mfi			(pPipe[2])	ldfd	C6[0] = [TypeJ[2]], 8	(pPipe[2])	fnma	RInvErr[0] = RInvErr[0], RInv[0], fOne				nop		0x0	}	{	.mfi	(pJJNR)     lfetch.nta  [jjnrPtr]	(pPipe[3])	fmpy	RInvT[1] = RInv[1], fHALF				nop		0x0	} ;;//	INNER LOOP 11	{	.mfi			(pPipe[2])	ldfd	C12[0] = [TypeJ[2]]	(pPipe[3])	fnma	RInvErr[1] = RInvErr[1], RInv[1], fOne		(pJJNR)     add     jjnrPtr = -JJNR_PREFETCH_DISTANCE, jjnrPtr	}	{	.mfi				nop		0x0	(pPipe[6])	fma		Disp_G[1] = eps1, Disp_H[1], Disp_G[1]				nop		0x0	} ;;//	INNER LOOP 12	{	.mfi				nop		0x0	(pPipe[6])	fma		Rep_G[1] = eps1, Rep_H[1], Rep_G[1]				nop		0x0	}	{	.mfi				nop		0x0	(pPipe[7])	fma		VNBTotal = C12[5], Rep_Y[2], VNBTotal 				nop		0x0	} ;;//	INNER LOOP 13	{	.mfi				nop		0x0	(pPipe[1])	fma		RSqr[0] = DZ[1], DZ[1], RSqr[0]				nop		0x0	}	{	.mfb	(pPipe[4])	getf.sig	nnn[0] = n0[0]	(pPipe[4])	fcvt.xf 	n0[0] = n0[0]				nop		0x0	} ;;//	INNER LOOP 14	{	.mfi				nop		0x0	(pPipe[2])	fmpy	RInvU[0] = RInv[0], RInvErr[0]	(pPipe[1])	add  	TypeJ[1] = TypeJ[1], NTI	}	{	.mfb				nop		0x0	(pPipe[2])	fma		RInvT[0] = RInvErr[0], f3_8, fHALF				nop		0x0	} ;;//	INNER LOOP 15	{	.mfi				nop		0x0	(pPipe[6])	fma		Disp_F[1] = eps1, Disp_G[1], Disp_F[1]				nop		0x0	}	{	.mfb				nop		0x0				nop		0x0		br.ctop.sptk.many innerLoop_nf	} ;;// 	End of modulo-scheduled inner loop	//	Having finished the loop, we now compute various quantities to	//	store. 
In parallel, start computing some of the values	//	for the next loop trip, if we're going there.//	OUTER EPILOGUE 1    {   .mfi	(pCont)	shladd	typePtr = II, 2, TYPE			nop		0x0	(pCont)	shladd	II3 = II, 1, II    }	{	.mfi											nop		0x0			nop		0x0	(pCont)	shladd	IS3 = IS, 1, IS    } ;;//	OUTER EPILOGUE 2    {   .mfi	(pCont)	ld4	IS = [shiftPtr], 4		nop		0x0		nop		0x0	}    {   .mmf	(pCont)	setf.sig	f33 = NTYPE	(pCont)	ld4	II = [iinrPtr] ,4		nop		0x0	} ;;// 	OUTER EPILOGUE 3    {   .mfi	(pCont)	ld4				NTI = [typePtr]	  			nop		0x0	(pCont)	shladd	shiftVPtr = IS3, 3, SHIFTVEC							}     {   .mfi		nop 0x0		nop	0x0	(pCont)	shladd	posPtr = II3, 3, POSITION	} ;;//	OUTER EPILOGUE 4    {   .mmb		stfd    [VNBPtr] = VNBTotal	(pCont)		ld4     ggid = [gidPtr], 4 	(pCont)	br.cond.sptk.many	outerLoop_nf	} ;;	// Finish if this was the last chunk, or do another thread-loop iteration//  THREAD EPILOGUE 1	{ .mib						nop				0x0		nop				0x0	(pMore) br.cond.sptk.many threadLoop_nf	} ;;		//	Ready to exit - restore the floating-point registers we saved, the	//	loop counter, and the predicates, then we're done. Note that the	//	stack pointer has the address of the last saved FP register.finish_nf://  EXIT 1	{	.mmi		mov			fillP0 = sp		add			fillP1 = 16, sp		mov			ar.lc = LCSave	}  	{	.mmi		st4			[OuterIter] = Nouter		st4			[InnerIter] = Ninner		nop			0x0	} ;;//  EXIT 2	{	.mmi		ldf.fill		fs6 = [fillP0], 32		ldf.fill		fs5 = [fillP1], 32		mov				pr = PRSave, 0x1ffff	} ;;//  EXIT 3	{	.mmi		ldf.fill		fs4 = [fillP0], 32		ldf.fill		fs3 = [fillP1], 32		nop				0x0	} ;;//  EXIT 4	{	.mmi		ldf.fill		fs2 = [fillP0], 32		ldf.fill		fs1 = [fillP1], 32		add				sp = 6 * 16, sp	} ;;//  EXIT 8	{	.mmb		ldf.fill		fs0 = [fillP0]		nop				0x0		br.ret.sptk.few	rp	} ;;	.endp	 nb_kernel030nf_ia64_double

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?