📄 nb_kernel010_ia32_3dnow.intel_syntax.s

📁 最著名最快的分子模拟软件
💻 S
📖 第 1 页 / 共 2 页
字号:
上一页 12
	;# ---------------------------------------------------------------------
	;# Tail of the routine that uses the nb010_ symbols. Its entry point,
	;# argument offsets and inner loops are defined earlier in the file
	;# (outside this excerpt); only the line structure is restored here.
	;# ---------------------------------------------------------------------
	;# update i particle force
	movq mm0,  [esp + nb010_fix]
	movd mm1,  [esp + nb010_fiz]
	pfadd mm0, mm2
	pfadd mm1, mm3
	movq [esp + nb010_fix], mm0
	movd [esp + nb010_fiz], mm1
	;# update j particle force
	movq mm0,  [edi + eax*4]
	movd mm1,  [edi + eax*4+ 8]
	pfsub mm0, mm2
	pfsub mm1, mm3
	movq [edi + eax*4], mm0
	movd [edi + eax*4 +8], mm1
	;# done!
.nb010_updateouterdata:
	mov   ecx, [esp + nb010_ii3]
	movq  mm6, [edi + ecx*4]		;# increment i force
	movd  mm7, [edi + ecx*4 + 8]
	pfadd mm6, [esp + nb010_fix]
	pfadd mm7, [esp + nb010_fiz]
	movq  [edi + ecx*4],    mm6
	movd  [edi + ecx*4 +8], mm7

	mov   ebx, [ebp + nb010_fshift]		;# increment fshift force
	mov   edx, [esp + nb010_is3]
	movq  mm6, [ebx + edx*4]
	movd  mm7, [ebx + edx*4 + 8]
	pfadd mm6, [esp + nb010_fix]
	pfadd mm7, [esp + nb010_fiz]
	movq  [ebx + edx*4],     mm6
	movd  [ebx + edx*4 + 8], mm7

	;# get n from stack
	mov   esi, [esp + nb010_n]
	;# get group index for i particle
	mov   edx, [ebp + nb010_gid]		;# base of gid[]
	mov   edx, [edx + esi*4]		;# ggid=gid[n]

	movq  mm7, [esp + nb010_Vvdwtot]
	pfacc mm7,mm7				;# sum the two halves of the total potential

	mov   eax, [ebp + nb010_Vvdw]
	movd  mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd  [eax + edx*4], mm6		;# increment Vvdw[gid]

	;# finish if last
	mov   ecx, [esp + nb010_nn1]
	;# esi already loaded with n
	inc   esi
	sub   ecx, esi
	jecxz .nb010_outerend

	;# not last, iterate outer loop once more!
	mov   [esp + nb010_n], esi
	jmp   .nb010_outer
.nb010_outerend:
	;# check if more outer neighborlists remain
	mov   ecx, [esp + nb010_nri]
	;# esi already loaded with n above
	sub   ecx, esi
	jecxz .nb010_end
	;# non-zero, do one more workunit
	jmp   .nb010_threadloop
.nb010_end:
	femms
	mov   eax, [esp + nb010_nouter]
	mov   ebx, [esp + nb010_ninner]
	mov   ecx, [ebp + nb010_outeriter]
	mov   edx, [ebp + nb010_inneriter]
	mov   [ecx], eax
	mov   [edx], ebx

	add   esp, 132
	pop   edi
	pop   esi
	pop   edx
	pop   ecx
	pop   ebx
	pop   eax
	leave
	ret


;# -------------------------------------------------------------------------
;# nb_kernel010nf_ia32_3dnow  (also exported with a leading underscore)
;#
;# Potential-only variant: for every outer index n handed to this thread it
;# sums  c12*rinv^12 - c6*rinv^6  over the j entries listed in
;# jjnr[jindex[n] .. jindex[n+1]-1] and adds the sum to Vvdw[gid[n]].
;# No force array is written.
;#
;# Arguments are read from the stack at [ebp + offset] (offsets below).
;# Work is claimed in chunks of 10 outer indices by a lock cmpxchg on *count.
;# On exit the accumulated outer/inner iteration counts are stored through
;# the outeriter / inneriter pointers.
;#
;# eax, ebx, ecx, edx, esi and edi are pushed on entry and popped on exit.
;# MMX/3DNow! state is bracketed by femms on entry and on exit.
;# Local stack frame: 80 bytes, laid out by the second .equiv group below.
;# -------------------------------------------------------------------------
.globl nb_kernel010nf_ia32_3dnow
.globl _nb_kernel010nf_ia32_3dnow
nb_kernel010nf_ia32_3dnow:
_nb_kernel010nf_ia32_3dnow:
	;# argument offsets relative to ebp
.equiv		nb010nf_p_nri,		8
.equiv		nb010nf_iinr,		12
.equiv		nb010nf_jindex,		16
.equiv		nb010nf_jjnr,		20
.equiv		nb010nf_shift,		24
.equiv		nb010nf_shiftvec,	28
.equiv		nb010nf_fshift,		32
.equiv		nb010nf_gid,		36
.equiv		nb010nf_pos,		40
.equiv		nb010nf_faction,	44
.equiv		nb010nf_charge,		48
.equiv		nb010nf_p_facel,	52
.equiv		nb010nf_p_krf,		56
.equiv		nb010nf_p_crf,		60
.equiv		nb010nf_Vc,		64
.equiv		nb010nf_type,		68
.equiv		nb010nf_p_ntype,	72
.equiv		nb010nf_vdwparam,	76
.equiv		nb010nf_Vvdw,		80
.equiv		nb010nf_p_tabscale,	84
.equiv		nb010nf_VFtab,		88
.equiv		nb010nf_invsqrta,	92
.equiv		nb010nf_dvda,		96
.equiv          nb010nf_p_gbtabscale,   100
.equiv          nb010nf_GBtab,          104
.equiv          nb010nf_p_nthreads,     108
.equiv          nb010nf_count,          112
.equiv          nb010nf_mtx,            116
.equiv          nb010nf_outeriter,      120
.equiv          nb010nf_inneriter,      124
.equiv          nb010nf_work,           128
	;# stack offsets for local variables (relative to esp)
.equiv		nb010nf_is3,		0
.equiv		nb010nf_ii3,		4
.equiv		nb010nf_ix,		8
.equiv		nb010nf_iy,		12
.equiv		nb010nf_iz,		16
.equiv		nb010nf_Vvdwtot,	20
.equiv		nb010nf_c6,		28
.equiv		nb010nf_c12,		36
.equiv		nb010nf_ntia,		44
.equiv		nb010nf_innerjjnr,	48
.equiv		nb010nf_innerk,		52
.equiv          nb010nf_n,              56 ;# idx for outer loop
.equiv          nb010nf_nn1,            60 ;# number of outer iterations
.equiv          nb010nf_nri,            64
.equiv          nb010nf_ntype,          68
.equiv          nb010nf_nouter,         72
.equiv          nb010nf_ninner,         76

	push ebp
	mov  ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub  esp, 80			;# local stack space
	femms

	;# dereference the nri / ntype pointers once and cache the values
	mov  ecx, [ebp + nb010nf_p_nri]
	mov  edx, [ebp + nb010nf_p_ntype]
	mov  ecx, [ecx]
	mov  edx, [edx]
	mov  [esp + nb010nf_nri], ecx
	mov  [esp + nb010nf_ntype], edx

	;# zero iteration counters
	mov  eax, 0
	mov  [esp + nb010nf_nouter], eax
	mov  [esp + nb010nf_ninner], eax

.nb010nf_threadloop:
	mov   esi, [ebp + nb010nf_count]	;# pointer to sync counter
	mov   eax, [esi]
.nb010nf_spinlock:
	mov   ebx, eax				;# ebx=*count=nn0
	add   ebx, 10				;# ebx=nn1=nn0+10
	lock
	cmpxchg [esi], ebx			;# write nn1 to *counter if it has not
						;# changed, else reread *counter to eax
	pause					;# -> better p4 performance
	jnz   .nb010nf_spinlock			;# ZF clear: exchange failed, retry

	;# if(nn1>nri) nn1=nri
	mov   ecx, [esp + nb010nf_nri]
	mov   edx, ecx
	sub   ecx, ebx
	cmovle ebx, edx				;# if(nn1>nri) nn1=nri
	;# Cleared the spinlock if we got here.
	;# eax contains nn0, ebx contains nn1.
	mov   [esp + nb010nf_n], eax
	mov   [esp + nb010nf_nn1], ebx
	sub   ebx, eax				;# number of outer lists, in ebx
	mov   esi, eax				;# copy n to esi (mov leaves the flags of sub intact)
	jg    .nb010nf_outerstart
	jmp   .nb010nf_end

.nb010nf_outerstart:
	;# ebx contains number of outer iterations
	add   ebx, [esp + nb010nf_nouter]
	mov   [esp + nb010nf_nouter], ebx

.nb010nf_outer:
	mov   eax, [ebp + nb010nf_shift]	;# eax = pointer into shift[]
	mov   ebx, [eax+esi*4]			;# ebx=shift[n]

	lea   ebx, [ebx + ebx*2]		;# ebx=3*is
	mov   [esp + nb010nf_is3],ebx		;# store is3

	mov   eax, [ebp + nb010nf_shiftvec]	;# eax = base of shiftvec[]

	movq  mm0, [eax + ebx*4]		;# move shX/shY to mm0 and shZ to mm1
	movd  mm1, [eax + ebx*4 + 8]

	mov   ecx, [ebp + nb010nf_iinr]		;# ecx = pointer into iinr[]
	mov   ebx, [ecx+esi*4]			;# ebx = ii

	mov   edx, [ebp + nb010nf_type]
	mov   edx, [edx + ebx*4]
	imul  edx, [esp + nb010nf_ntype]
	shl   edx, 1
	mov   [esp + nb010nf_ntia], edx		;# ntia = 2*ntype*type[ii]

	lea   ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov   eax, [ebp + nb010nf_pos]		;# eax = base of pos[]

	pfadd mm0, [eax + ebx*4]		;# ix = shX + posX (and iy too)
	movd  mm3, [eax + ebx*4 + 8]		;# cant use direct memory add for 4 bytes (iz)
	mov   [esp + nb010nf_ii3], ebx
	pfadd mm1, mm3
	movq  [esp + nb010nf_ix], mm0		;# 8-byte store covers ix and iy
	movd  [esp + nb010nf_iz], mm1

	;# clear total potential
	pxor  mm7,mm7
	movq  [esp + nb010nf_Vvdwtot], mm7

	mov   eax, [ebp + nb010nf_jindex]
	mov   ecx, [eax + esi*4]		;# jindex[n]
	mov   edx, [eax + esi*4 + 4]		;# jindex[n+1]
	sub   edx, ecx				;# number of innerloop atoms

	mov   esi, [ebp + nb010nf_pos]
	mov   eax, [ebp + nb010nf_jjnr]
	shl   ecx, 2
	add   eax, ecx
	mov   [esp + nb010nf_innerjjnr], eax	;# pointer to jjnr[nj0]
	mov   ecx, edx
	sub   edx,  2
	add   ecx, [esp + nb010nf_ninner]
	mov   [esp + nb010nf_ninner], ecx
	mov   [esp + nb010nf_innerk], edx	;# number of innerloop atoms minus 2
	add   edx, 0				;# set flags from edx
	jge   .nb010nf_unroll_loop
	jmp   .nb010nf_finish_inner

.nb010nf_unroll_loop:
	;# paired innerloop starts here
	mov   ecx, [esp + nb010nf_innerjjnr]	;# pointer to jjnr[k]
	mov   eax, [ecx]
	mov   ebx, [ecx + 4]			;# eax/ebx=jnr
	add dword ptr [esp + nb010nf_innerjjnr],  8 ;# advance pointer (unrolled 2)
	prefetch [ecx + 16]			;# prefetch data - trial and error says 16 is best

	mov   ecx, [ebp + nb010nf_type]
	mov   edx, [ecx + eax*4]		;# type [jnr1]
	mov   ecx, [ecx + ebx*4]		;# type [jnr2]

	mov   esi, [ebp + nb010nf_vdwparam]	;# base of vdwparam
	shl   edx, 1
	shl   ecx, 1
	add   edx, [esp + nb010nf_ntia]		;# tja = ntia + 2*type
	add   ecx, [esp + nb010nf_ntia]

	movq  mm5, [esi + edx*4]		;# mm5 = 1st c6 / c12
	movq  mm7, [esi + ecx*4]		;# mm7 = 2nd c6 / c12
	movq  mm6,mm5
	punpckldq mm5,mm7			;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7			;# mm6 = 1st c12 / 2nd c12
	movq  [esp + nb010nf_c6], mm5
	movq  [esp + nb010nf_c12], mm6

	lea   eax, [eax + eax*2]		;# replace jnr with j3
	lea   ebx, [ebx + ebx*2]

	mov   esi, [ebp + nb010nf_pos]

	movq  mm0, [esp + nb010nf_ix]
	movd  mm1, [esp + nb010nf_iz]
	movq  mm4, [esi + eax*4]		;# fetch first j coordinates
	movd  mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0				;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4				;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5				;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5				;# first rsq in lower mm4

	movq  mm6, [esi + ebx*4]		;# fetch second j coordinates
	movd  mm7, [esi + ebx*4 + 8]
	pfsubr mm6,mm0				;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6				;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7				;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7				;# second rsq in lower mm6

	pfrcp mm0, mm4				;# lookup reciprocal seed
	pfrcp mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6			;# now 4 has rsq and 0 the seed for both pairs
	;# amd 3dnow N-R iteration to get full precision
	pfrcpit1 mm4,mm0
	pfrcpit2 mm4,mm0
	;# mm4 now contains invsq
	;# do potential
	movq  mm0, mm4
	pfmul mm4, mm0
	pfmul mm4, mm0				;# mm4=rinvsix
	movq  mm5, mm4
	pfmul mm5, mm5				;# mm5=rinvtwelve

	pfmul mm5, [esp + nb010nf_c12]
	pfmul mm4, [esp + nb010nf_c6]
	movq  mm6, mm5				;# mm6 is Vvdw12-Vvdw6
	pfsub mm6, mm4
	;# update Vvdwtot
	pfadd mm6, [esp + nb010nf_Vvdwtot]	;# add the earlier value
	movq  [esp + nb010nf_Vvdwtot], mm6	;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + nb010nf_innerk],  2
	jl    .nb010nf_finish_inner
	jmp   .nb010nf_unroll_loop

.nb010nf_finish_inner:
	;# innerk is negative here; its low bit is set when one j entry is left
	and dword ptr [esp + nb010nf_innerk],  1
	jnz   .nb010nf_single_inner
	jmp   .nb010nf_updateouterdata

.nb010nf_single_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments
	mov   eax, [esp + nb010nf_innerjjnr]
	mov   eax, [eax]			;# eax=jnr offset

	mov   esi, [ebp + nb010nf_vdwparam]
	mov   ecx, [ebp + nb010nf_type]
	mov   edx, [ecx + eax*4]		;# type [jnr1]
	shl   edx, 1
	add   edx, [esp + nb010nf_ntia]		;# tja = ntia + 2*type
	movd  mm5, [esi + edx*4]		;# mm5 = 1st c6
	movq  [esp + nb010nf_c6], mm5
	movd  mm5, [esi + edx*4 + 4]		;# mm5 = 1st c12
	movq  [esp + nb010nf_c12], mm5

	mov   esi, [ebp + nb010nf_pos]
	lea   eax, [eax + eax*2]

	movq  mm0, [esp + nb010nf_ix]
	movd  mm1, [esp + nb010nf_iz]
	movq  mm4, [esi + eax*4]
	movd  mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5				;# mm4=rsq

	pfrcp mm0,mm4
	pfrcpit1 mm4,mm0
	pfrcpit2 mm4,mm0			;# mm4=invsq
	;# calculate potential
	movq  mm0, mm4
	pfmul mm4, mm0
	pfmul mm4, mm0				;# mm4=rinvsix
	movq  mm5, mm4
	pfmul mm5, mm5				;# mm5=rinvtwelve

	pfmul mm5, [esp + nb010nf_c12]
	pfmul mm4, [esp + nb010nf_c6]
	movq  mm6, mm5				;# mm6 is Vvdw12-Vvdw6
	pfsub mm6, mm4
	;# update Vvdwtot
	pfadd mm6, [esp + nb010nf_Vvdwtot]	;# add the earlier value
	movq  [esp + nb010nf_Vvdwtot], mm6	;# store the sum

.nb010nf_updateouterdata:
	;# get n from stack
	mov   esi, [esp + nb010nf_n]
	;# get group index for i particle
	mov   edx, [ebp + nb010nf_gid]		;# base of gid[]
	mov   edx, [edx + esi*4]		;# ggid=gid[n]

	movq  mm7, [esp + nb010nf_Vvdwtot]
	pfacc mm7,mm7				;# sum the two halves of the total potential

	mov   eax, [ebp + nb010nf_Vvdw]
	movd  mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd  [eax + edx*4], mm6		;# increment Vvdw[gid]

	;# finish if last
	mov   ecx, [esp + nb010nf_nn1]
	;# esi already loaded with n
	inc   esi
	sub   ecx, esi
	jecxz .nb010nf_outerend

	;# not last, iterate outer loop once more!
	mov   [esp + nb010nf_n], esi
	jmp   .nb010nf_outer
.nb010nf_outerend:
	;# check if more outer neighborlists remain
	mov   ecx, [esp + nb010nf_nri]
	;# esi already loaded with n above
	sub   ecx, esi
	jecxz .nb010nf_end
	;# non-zero, do one more workunit
	jmp   .nb010nf_threadloop
.nb010nf_end:
	femms

	;# report iteration counts through the outeriter / inneriter pointers
	mov   eax, [esp + nb010nf_nouter]
	mov   ebx, [esp + nb010nf_ninner]
	mov   ecx, [ebp + nb010nf_outeriter]
	mov   edx, [ebp + nb010nf_inneriter]
	mov   [ecx], eax
	mov   [edx], ebx

	add   esp, 80
	pop   edi
	pop   esi
	pop   edx
	pop   ecx
	pop   ebx
	pop   eax
	leave
	ret
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -