nb_kernel100nf_ia64_double.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 786 行 · 第 1/2 页

S
786
字号
//  ---------------------------------------------------------------------------
//  nb_kernel100nf_ia64_double - visible part: thread loop body, outer loop,
//  modulo-scheduled inner loop and procedure exit.
//
//  What the code below does (all names are symbolic aliases that are defined
//  outside this excerpt, as is the threadLoop_nf label that THREAD EPILOGUE 1
//  branches back to):
//
//    thread prologue  Atomically claims a chunk of outer indices with
//                     fetchadd4 on [COUNT]: NN0 = first index,
//                     NN1 = min(NN0 + THREAD_CHUNK_SIZE, NRI). Leaves through
//                     finish_nf when NN0 >= NRI.
//    outer loop       For each index: loads the shift vector and the position
//                     (3 doubles each), forms IX/IY/IZ = shift + position,
//                     IQ = ICharge * Facel, loads the running VCTotal from
//                     VC + 8 * ggid and programs ar.lc / ar.ec.
//    inner loop       For each jnr: loads 3 doubles at POSITION + 24 * jnr and
//                     one double at CHARGE + 8 * jnr, builds RSqr from the
//                     three differences, takes an frsqrta seed plus fma/fnma
//                     refinement steps, and accumulates
//                     VCTotal += Charge * RInv.
//    outer epilogue   Stores VCTotal back to [VCPtr], sets up the next index.
//    exit             Stores the iteration counters, refills fs0-fs7, restores
//                     ar.lc, pr and sp, returns.
//
//  NOTE(review): this text was recovered from a copy in which all line breaks
//  had been lost; the layout (one instruction per line) was restored by hand.
//  Any non-assembly residue surrounding this block in the file must be
//  removed before assembling.
//  ---------------------------------------------------------------------------

//  THREAD PROLOGUE 1
        {       .mfi
                fetchadd4.rel   NN0 = [COUNT], THREAD_CHUNK_SIZE
                nop             0x0
                nop             0x0
        }
        {       .mfi // alignment bundle
                nop             0x0
                nop             0x0
                nop             0x0
        } ;;
//  THREAD PROLOGUE 2 - at least 12 cycle latency hole before this bundle (fetchadd4)
        {       .mmi
                cmp.lt          pCont, pDone = NN0, NRI
                shladd          gidPtr = NN0, 2, GID
                adds            NN1 = THREAD_CHUNK_SIZE, NN0
        }
        {       .mmi
                shladd          jindexPtr = NN0, 2, JINDEX
                shladd          shiftPtr  = NN0, 2, SHIFT
                shladd          iinrPtr   = NN0, 2, IINR
        } ;;
//  THREAD PROLOGUE 3
        {       .mmi
        (pCont) ld4             II = [iinrPtr], 4
        (pCont) ld4             IS = [shiftPtr], 4
                cmp.ge          pLast, pMore = NN1, NRI
        }
        {       .mib
        (pCont) ld4             NJ0 = [jindexPtr], 4
        (pCont) adds            Tmp2 = 1, NN0
        (pDone) br.cond.spnt.few finish_nf
        } ;;
//  THREAD PROLOGUE 4
        {       .mmi
                ld4             ggid = [gidPtr], 4
                shladd          II3 = II, 1, II             // II3 = 3 * II
                shladd          IS3 = IS, 1, IS             // IS3 = 3 * IS
        }
        {       .mmi
                ld4             NJ1 = [jindexPtr], 4
                shladd          chargePtr = II, 3, CHARGE
                shladd          jjnrPtr = NJ0, 2, JJNR
        } ;;
//  THREAD PROLOGUE 5
        {       .mmi
                cmp.lt          pCont, pDone = Tmp2, NRI
                nop             0x0
        (pLast) mov             NN1 = NRI                   // clamp chunk end to NRI
        }
        {       .mmi
                shladd          posPtr    = II3, 3, POSITION
                nop             0x0
                shladd          shiftVPtr = IS3, 3, SHIFTVEC
        } ;;
//  THREAD PROLOGUE 6
        {       .mmi
                ld4             jnr = [jjnrPtr], 4
        (pCont) ld4             IS = [shiftPtr], 4
                nop             0x0
        }
        {       .mmi
        (pCont) ld4             II = [iinrPtr], 4
                nop             0x0
                nop             0x0
        } ;;
//  12 bundles in thread prologue - still aligned

outerLoop_nf:
        //      At this point in the outer loop, the following values are ready
        //
        //              FActII          Pointer to FACTION XYZ for II
        //              FShiftIS        Pointer to FSHIFT XYZ for IS
        //              shiftVPtr       Pointer to current shift XYZ values
        //              posPtr          Pointer to current XYZ position
        //              chargePtr       Pointer to current atom charge
        //              ggid            Index for Vc array
        //              jjnrPtr         Pointer to next neighbor index
        //              jnr             Current jnr value
        //              NJ0, NJ1        Bounds of current neighbor list
        //
        //      NOTE(review): FActII and FShiftIS are not referenced anywhere
        //      in the code below; the two entries look like leftovers and
        //      should be confirmed or removed.
        //
        //      Load up all the floating-point values (yes, McKinley can do 4 FP loads
        //      per cycle) and initialize the loop counters and predicates. Compute
        //      the initial position <x, y, z> and charge. If this isn't the last time
        //      through the loop, start loading the next value for NJ1 - we already
        //      moved the previous NJ1 -> NJ0.
//      OUTER PROLOGUE 1
        {       .mmf
                ldfd            shX = [shiftVPtr], 8
                ldfd            PosX = [posPtr], 8
                mov             FIX = f0
        }
        {       .mfi
                nop             0x0
                mov             FIY = f0
                add             Nouter = 1, Nouter
        } ;;
//      OUTER PROLOGUE 2
        {       .mmf
                ldfd            shY = [shiftVPtr], 8
                ldfd            PosY = [posPtr], 8
                mov             FIZ = f0
        }
        {       .mmi
                nop             0x0
                nop             0x0
                shladd          VCPtr = ggid, 3, VC
        } ;;
//      OUTER PROLOGUE 3
        {       .mmi
                ldfd            shZ = [shiftVPtr]
                ldfd            PosZ = [posPtr]
                sub             InnerCnt = NJ1, NJ0, 1      // NJ1 - NJ0 - 1, i.e. trip count - 1 for ar.lc
        }
        {       .mmi
                nop             0x0
                nop             0x0
                mov             NJ0 = NJ1                   // same group as the sub above, which still reads the old NJ0
        } ;;
//      OUTER PROLOGUE 4
        {       .mmf
                ldfd            ICharge = [chargePtr]
                ldfd            VCTotal = [VCPtr]
                fadd            IX = shX, PosX
        } ;;
//      OUTER PROLOGUE 5
        {       .mfi
                add             NN0 = 1, NN0
                fadd            IY = shY, PosY
                //      This may seem strange, but we set the first stage of the
                //      pipe to execute this way because setting pr.rot doesn't take
                //      into account how much the predicates have rotated. If this is
                //      the first time through, we cleared all the pipeline predicates
                //      in the initialization. If not, flushing the pipeline set all
                //      the pipeline predicates to 0
                cmp.eq          pPipe[0], p0 = zero, zero
        } ;;
//      OUTER PROLOGUE 6
        {       .mfi
                cmp.lt          pCont, pDone = NN0, NN1     // pCont: another outer index remains in this chunk
                fadd            IZ = shZ, PosZ
                mov             ar.lc = InnerCnt
        } ;;
//      OUTER PROLOGUE 7
        {       .mfi
        (pCont) ld4             NJ1 = [jindexPtr], 4
                fmpy            IQ = ICharge, Facel
                mov             ar.ec = PIPE_DEPTH
        } ;;
// 10 bundles in outer loop - still aligned.

        //      The inner loop is a 6-stage pipeline. The serial sequence of float ops
        //      is folded into a 12-cycle loop (12 * 2 = 24 float ops), then divided
        //      into 5 stages.
        //
        //      NOTE(review): the body below consists of nine instruction groups
        //      and uses stage predicates pPipe[0] through pPipe[7], so the stage
        //      and cycle counts quoted above look stale - confirm against the
        //      definition of PIPE_DEPTH.
innerLoop_nf:
//      INNER LOOP 1
        {       .mfi
        (pPipe[0])      shladd  chargePtr = jnr, 3, CHARGE
        (pPipe[2])      fsub    DY[2] = IY, DY[2]
        (pPipe[0])      shladd  jnr3 = jnr, 1, jnr          // jnr3 = 3 * jnr
        }
        //      We march through jjnr[] sequentially, so it's usually a good idea
        //      to preload the next value. However, we don't want to do this if
        //      (1) we're in the epilogue or (2) this is the last time through and
        //      there are no more atoms to inspect. Thus, we keep track of the loop
        //      trip and use the logic below to see if we should load ahead
        .pred.rel "mutex", pCont, pDone
        {       .mfi
        (pCont)         cmp.ge  pJJNR, p0 = InnerCnt, zero
        (pPipe[5])      fma     RInvT[0] = RInvErr[1], f3_8, fHALF
        (pDone)         cmp.gt  pJJNR, p0 = InnerCnt, zero
        } ;;
//      INNER LOOP 2
        {       .mfi
        (pJJNR)         ld4     jnr = [jjnrPtr], 4
        (pPipe[2])      fsub    DZ[2] = IZ, DZ[2]
                        nop     0x0
        }
        {       .mfi
        (pPipe[0])      shladd  posPtr = jnr3, 3, POSITION
        (pPipe[4])      fmpy    RInvErr[0] = RInv[1], RSqr[2]
                        nop     0x0
        } ;;
//      INNER LOOP 3
        {       .mfi
        (pPipe[0])      ldfd    JX = [posPtr], 8
        (pPipe[2])      fmpy    RSqr[0] = DX[2], DX[2]
        (pPipe[0])      add     InnerCnt = -1, InnerCnt
        }
        {       .mfi
                        nop     0x0
        (pPipe[3])      fma     RSqr[1] = DZ[3], DZ[3], RSqr[1]
        (pPipe[0])      add     Ninner = 1, Ninner
        } ;;
//      INNER LOOP 4
        {       .mfi
        (pPipe[0])      ldfd    JY = [posPtr], 8
        (pPipe[6])      fmpy    RInvT[1] = fHALF, RInv[3]
        (pJJNR)         add     jjnrPtr = JJNR_PREFETCH_DISTANCE, jjnrPtr   // temporary bump for the lfetch in INNER LOOP 6
        }
        {       .mfi
                        nop     0x0
        (pPipe[5])      fmpy    RInvU[0] = RInv[2], RInvErr[1]
                        nop     0x0
        } ;;
//      INNER LOOP 5
        {       .mfi
        (pPipe[0])      ldfd    JZ = [posPtr], 8
        (pPipe[7])      fma     RInv[4] = RInvT[2], RInvErr[3], RInv[4]
                        nop     0x0
        }
        {       .mfi
                        nop     0x0
        (pPipe[6])      fmpy    RInvErr[2] = RSqr[4], RInv[3]
                        nop     0x0
        } ;;
//      INNER LOOP 6
        {       .mfi
        (pJJNR)         lfetch.nta  [jjnrPtr]
        (pPipe[4])      fnma    RInvErr[0] = RInvErr[0], RInv[1], fOne
                        nop     0x0
        }
        {       .mfi
                        nop     0x0
        (pPipe[3])      fmpy    Charge[3] = IQ, Charge[3]
                        nop     0x0
        } ;;
//      INNER LOOP 7
        {       .mfi
                        nop     0x0
        (pPipe[2])      fma     RSqr[0] = DY[2], DY[2], RSqr[0]
        (pJJNR)         add     jjnrPtr = -JJNR_PREFETCH_DISTANCE, jjnrPtr  // undo the bump from INNER LOOP 4
        }
        {       .mfi
                        nop     0x0
        (pPipe[3])      frsqrta RInv[0], p0 = RSqr[1]
                        nop     0x0
        } ;;
//      INNER LOOP 8
        {       .mfi
                        nop     0x0
        (pPipe[1])      fsub    DX[1] = IX, DX[1]
                        nop     0x0
        }
        {       .mfi
                        nop     0x0
        (pPipe[5])      fma     RInv[2] = RInvT[0], RInvU[0], RInv[2]
                        nop     0x0
        } ;;
//      INNER LOOP 9
        {       .mfi
        (pPipe[0])      ldfd    QCharge = [chargePtr]
        (pPipe[6])      fnma    RInvErr[2] = RInvErr[2], RInv[3], fOne
                        nop     0x0
        }
        {       .mfb
                        nop     0x0
        (pPipe[7])      fma     VCTotal = Charge[7], RInv[4], VCTotal
                        br.ctop.sptk.many innerLoop_nf
        } ;;
        //      End of modulo-scheduled inner loop

        //      Having finished the loop, we now compute various quantities to
        //      store. In parallel, start computing some of the values
        //      for the next loop trip, if we're going there.
//      OUTER EPILOGUE 1
        {       .mfi
                nop             0x0
                nop             0x0
        (pCont) shladd          II3 = II, 1, II
        }
        {       .mfi
        (pCont) shladd          chargePtr = II, 3, CHARGE
                nop             0x0
        (pCont) shladd          IS3 = IS, 1, IS
        } ;;
//      OUTER EPILOGUE 2
        {       .mfi
        (pCont) ld4             II = [iinrPtr], 4
                nop             0x0
                nop             0x0
        }
        {       .mfi
        (pCont) ld4             IS = [shiftPtr], 4
                nop             0x0
                nop             0x0
        } ;;
//      OUTER EPILOGUE 3
        {       .mfi
                nop             0x0
                nop             0x0
        (pCont) shladd          shiftVPtr = IS3, 3, SHIFTVEC
        }
        {       .mfi
                nop             0x0
                nop             0x0
        (pCont) shladd          posPtr = II3, 3, POSITION
        } ;;
//      OUTER EPILOGUE 4
        {       .mmb
                stfd            [VCPtr] = VCTotal
        (pCont) ld4             ggid = [gidPtr], 4
        (pCont) br.cond.sptk.many       outerLoop_nf
        } ;;

        // Finish if this was the last chunk, or do another thread-loop iteration
//  THREAD EPILOGUE 1
        {       .mib
                nop             0x0
                nop             0x0
        (pMore) br.cond.sptk.many threadLoop_nf
        } ;;

        //      Ready to exit - restore the floating-point registers we saved, the
        //      loop counter, and the predicates, then we're done. Note that the
        //      stack pointer has the address of the last saved FP register.
finish_nf:
//  EXIT 1
        {       .mmi
                mov             fillP0 = sp
                add             fillP1 = 16, sp
                nop             0x0
        }
        {       .mmi
                st4             [OuterIter] = Nouter
                st4             [InnerIter] = Ninner
                nop             0x0
        } ;;
//  EXIT 2
        {       .mmi
                ldf.fill        fs7 = [fillP0], 32
                ldf.fill        fs6 = [fillP1], 32
                mov             ar.lc = LCSave
        } ;;
//  EXIT 3
        {       .mmi
                ldf.fill        fs5 = [fillP0], 32
                ldf.fill        fs4 = [fillP1], 32
                mov             pr = PRSave, 0x1ffff
        } ;;
//  EXIT 4
        {       .mmi
                ldf.fill        fs3 = [fillP0], 32
                ldf.fill        fs2 = [fillP1], 32
                nop             0x0
        } ;;
//  EXIT 5
        {       .mmi
                ldf.fill        fs1 = [fillP0]              // old sp + 96
                ldf.fill        fs0 = [fillP1]              // old sp + 112
                nop             0x0
        } ;;
//  EXIT 6
        //      sp is released only after the last fill has been issued in an
        //      earlier instruction group. Raising sp before the fs1 fill would
        //      leave that slot 16 bytes below the new sp, where anything that
        //      uses the stack asynchronously could overwrite it.
        {       .mib
                nop             0x0
                add             sp = 7 * 16, sp
                br.ret.sptk.few rp
        } ;;
        .endp    nb_kernel100nf_ia64_double

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?