⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 nb_kernel312_ia32_3dnow.intel_syntax.s

📁 最著名最快的分子模拟软件
💻 S
📖 第 1 页 / 共 3 页
字号:
	add   edx, ecx	mov   eax, [ebp + nb312nf_vdwparam]	movd  mm0, [eax + edx*4]	movd  mm1, [eax + edx*4 + 4]	movq  [esp + nb312nf_c6], mm0	movq  [esp + nb312nf_c12], mm1	movd  mm5, [edi]	punpckldq mm5,mm5	movq  [esp + nb312nf_tsc], mm5.nb312nf_threadloop:        mov   esi, [ebp + nb312nf_count]          ;# pointer to sync counter        mov   eax, [esi].nb312nf_spinlock:        mov   ebx, eax                          ;# ebx=*count=nn0        add   ebx, 1                           ;# ebx=nn1=nn0+10        lock        cmpxchg [esi], ebx                      ;# write nn1 to *counter,                                                ;# if it hasnt changed.                                                ;# or reread *counter to eax.        pause                                   ;# -> better p4 performance        jnz .nb312nf_spinlock        ;# if(nn1>nri) nn1=nri        mov ecx, [esp + nb312nf_nri]        mov edx, ecx        sub ecx, ebx        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri        ;# Cleared the spinlock if we got here.        ;# eax contains nn0, ebx contains nn1.        mov [esp + nb312nf_n], eax        mov [esp + nb312nf_nn1], ebx        sub ebx, eax                            ;# calc number of outer lists	mov esi, eax				;# copy n to esi        jg  .nb312nf_outerstart        jmp .nb312nf_end.nb312nf_outerstart:	;# ebx contains number of outer iterations	add ebx, [esp + nb312nf_nouter]        mov [esp + nb312nf_nouter], ebx	.nb312nf_outer:	mov   eax, [ebp + nb312nf_shift]      ;# eax = pointer into shift[] 	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 		lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 	mov   [esp + nb312nf_is3],ebx    	;# store is3 	mov   eax, [ebp + nb312nf_shiftvec]   ;# eax = base of shiftvec[] 		movq  mm5, [eax + ebx*4]	;# move shX/shY to mm5 and shZ to mm6. 	movd  mm6, [eax + ebx*4 + 8]	movq  mm0, mm5	movq  mm1, mm5	movq  mm2, mm6	punpckldq mm0,mm0	    ;# also expand shX,Y,Z in mm0--mm2. 	punpckhdq mm1,mm1	punpckldq mm2,mm2				mov   ecx, [ebp + nb312nf_iinr]       ;# ecx = pointer into iinr[] 		mov   ebx, [ecx+esi*4]	    ;# ebx=ii 	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 	mov   eax, [ebp + nb312nf_pos]    ;# eax = base of pos[] 	pfadd mm5, [eax + ebx*4]    ;# ix = shX + posX (and iy too) 	movd  mm7, [eax + ebx*4 + 8]    ;# cant use direct memory add for 4 bytes (iz) 	mov   [esp + nb312nf_ii3], ebx	    ;# (use mm7 as temp. storage for iz.) 	pfadd mm6, mm7	movq  [esp + nb312nf_ixO], mm5		movq  [esp + nb312nf_izO], mm6	movd  mm3, [eax + ebx*4 + 12]	movd  mm4, [eax + ebx*4 + 16]	movd  mm5, [eax + ebx*4 + 20]	punpckldq  mm3, [eax + ebx*4 + 24]	punpckldq  mm4, [eax + ebx*4 + 28]	punpckldq  mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high 		pfadd mm0, mm3	pfadd mm1, mm4	pfadd mm2, mm5			movq [esp + nb312nf_ixH], mm0		movq [esp + nb312nf_iyH], mm1		movq [esp + nb312nf_izH], mm2		;# clear vctot and i forces 	pxor  mm7,mm7	movq  [esp + nb312nf_vctot], mm7	movq  [esp + nb312nf_Vvdwtot], mm7	mov   eax, [ebp + nb312nf_jindex]	mov   ecx, [eax+esi*4]	     ;# jindex[n] 	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 	sub   edx, ecx               ;# number of innerloop atoms 	mov   [esp + nb312nf_innerk], edx    ;# number of innerloop atoms 	add   edx, [esp + nb312nf_ninner]	mov   [esp + nb312nf_ninner], edx	mov   esi, [ebp + nb312nf_pos]	mov   eax, [ebp + nb312nf_jjnr]	shl   ecx, 2	add   eax, ecx	mov   [esp + nb312nf_innerjjnr], eax     ;# pointer to jjnr[nj0] .nb312nf_inner_loop:	;# a single j particle iteration here - compare with the unrolled code for comments. 	mov   eax, [esp + nb312nf_innerjjnr]	mov   eax, [eax]	;# eax=jnr offset 	add dword ptr [esp + nb312nf_innerjjnr],  4 ;# advance pointer 	lea   eax, [eax + eax*2]	movq  mm0, [esi + eax*4]	movd  mm1, [esi + eax*4 + 8]	;# copy & expand to mm2-mm4 for the H interactions 	movq  mm2, mm0	movq  mm3, mm0	movq  mm4, mm1	punpckldq mm2,mm2	punpckhdq mm3,mm3	punpckldq mm4,mm4		pfsubr mm0, [esp + nb312nf_ixO]	pfsubr mm1, [esp + nb312nf_izO]			pfmul mm0,mm0	pfmul mm1,mm1	pfacc mm0, mm0	pfadd mm0, mm1		;# mm0=rsqO 		punpckldq mm2, mm2	punpckldq mm3, mm3	punpckldq mm4, mm4  ;# mm2-mm4 is jx-jz 	pfsubr mm2, [esp + nb312nf_ixH]	pfsubr mm3, [esp + nb312nf_iyH]	pfsubr mm4, [esp + nb312nf_izH] ;# mm2-mm4 is dxH-dzH 		pfmul mm2,mm2	pfmul mm3,mm3	pfmul mm4,mm4	pfadd mm3,mm2	pfadd mm3,mm4		;# mm3=rsqH 	movq [esp + nb312nf_tmprsqH], mm3    	pfrsqrt mm1,mm0    	movq mm2,mm1    	pfmul mm1,mm1    	pfrsqit1 mm1,mm0				    	pfrcpit2 mm1,mm2	;# mm1=invsqrt  	pfmul mm0, mm1		;# mm0=rsq  	pfmul mm0, [esp + nb312nf_tsc]	pf2iw mm4, mm0	movd [esp + nb312nf_n1], mm4	pi2fd mm4,mm4	pfsub mm0, mm4               ;# now mm0 is eps and mm4 n0 	movq  mm2, mm0	pfmul mm2, mm2		;# mm0 is eps, mm2 eps2 	;# coulomb table 	mov edx, [ebp + nb312nf_VFtab]	mov ecx, [esp + nb312nf_n1]	shl ecx, 2	;# load all values we need 	movd mm4, [edx + ecx*4]	movd mm5, [edx + ecx*4 + 4]	movd mm6, [edx + ecx*4 + 8]	movd mm7, [edx + ecx*4 + 12]		pfmul mm6, mm0  ;# mm6 = Geps 			pfmul mm7, mm2	;# mm7 = Heps2 		pfadd mm5, mm6	pfadd mm5, mm7	;# mm5 = Fp 	pfmul mm5, mm0  ;# mm5=eps*Fp 	pfadd mm5, mm4	;#  mm5= VV 	pfmul mm5, [esp + nb312nf_qqOO]	;# vcoul=qq*VV 	;# update vctot directly 	pfadd mm5, [esp + nb312nf_vctot]	movq [esp + nb312nf_vctot], mm5		movq mm5, mm1	pfmul mm5,mm5	movq mm4, mm5	pfmul mm4,mm5	pfmul mm4,mm5	movq mm5, mm4	pfmul mm5,mm5	;# mm4=rinvsix, mm5=rinvtwelve 	pfmul mm4, [esp + nb312nf_c6]	pfmul mm5, [esp + nb312nf_c12]	movq mm6,mm5	pfsub mm6,mm4	;# update Vvdwtot  	pfadd mm6, [esp + nb312nf_Vvdwtot]      ;# add the earlier value 	movq [esp + nb312nf_Vvdwtot], mm6       ;# store the sum       		;# time for hydrogens! 	movq mm0, [esp + nb312nf_tmprsqH]	pfrsqrt mm1, mm0	pswapd mm0,mm0	pfrsqrt mm2, mm0	pswapd mm0,mm0	punpckldq mm1,mm2	;# seeds are in mm1 now, and rsq in mm0. 	movq mm2, mm1	pfmul mm1,mm1    	pfrsqit1 mm1,mm0				    	pfrcpit2 mm1,mm2	;# mm1=invsqrt 		pfmul mm0,mm1		;# mm0=r 	pfmul mm0, [esp + nb312nf_tsc]	pf2iw mm4, mm0	movq [esp + nb312nf_n1], mm4	pi2fd mm4,mm4	pfsub mm0, mm4               ;# now mm0 is eps and mm4 n0 	movq  mm2, mm0	pfmul mm2, mm2		;# mm0 is eps, mm2 eps2 		;# coulomb table 	mov edx, [ebp + nb312nf_VFtab]	mov ecx, [esp + nb312nf_n1]	shl ecx, 2	;# load all values we need 	movd mm4, [edx + ecx*4]	movd mm5, [edx + ecx*4 + 4]	movd mm6, [edx + ecx*4 + 8]	movd mm7, [edx + ecx*4 + 12]	mov ecx, [esp + nb312nf_n1+4]	shl ecx, 2	punpckldq mm4, [edx + ecx*4]	punpckldq mm5, [edx + ecx*4 + 4]	punpckldq mm6, [edx + ecx*4 + 8]	punpckldq mm7, [edx + ecx*4 + 12]		pfmul mm6, mm0  ;# mm6 = Geps 			pfmul mm7, mm2	;# mm7 = Heps2 		pfadd mm5, mm6	pfadd mm5, mm7	;# mm5 = Fp 	pfmul mm5, mm0  ;# mm5=eps*Fp 	pfadd mm5, mm4	;#  mm5= VV 	pfmul mm5, [esp + nb312nf_qqOH]	;# vcoul=qq*VV 	;# update vctot 	pfadd mm5, [esp + nb312nf_vctot]	movq [esp + nb312nf_vctot], mm5		;# interactions with j H1 	movq  mm0, [esi + eax*4 + 12]	movd  mm1, [esi + eax*4 + 20]	;# copy & expand to mm2-mm4 for the H interactions 	movq  mm2, mm0	movq  mm3, mm0	movq  mm4, mm1	punpckldq mm2,mm2	punpckhdq mm3,mm3	punpckldq mm4,mm4		pfsubr mm0, [esp + nb312nf_ixO]	pfsubr mm1, [esp + nb312nf_izO]			pfmul mm0,mm0	pfmul mm1,mm1	pfacc mm0, mm1	pfadd mm0, mm1		;# mm0=rsqO 		punpckldq mm2, mm2	punpckldq mm3, mm3	punpckldq mm4, mm4  ;# mm2-mm4 is jx-jz 	pfsubr mm2, [esp + nb312nf_ixH]	pfsubr mm3, [esp + nb312nf_iyH]	pfsubr mm4, [esp + nb312nf_izH] ;# mm2-mm4 is dxH-dzH 		pfmul mm2,mm2	pfmul mm3,mm3	pfmul mm4,mm4	pfadd mm3,mm2	pfadd mm3,mm4		;# mm3=rsqH 	movq [esp + nb312nf_tmprsqH], mm3    pfrsqrt mm1,mm0    movq mm2,mm1    pfmul mm1,mm1    pfrsqit1 mm1,mm0				    pfrcpit2 mm1,mm2	;# mm1=invsqrt 	pfmul mm0, mm1		;# mm0=rsq  		pfmul mm0, [esp + nb312nf_tsc]	pf2iw mm4, mm0	movd [esp + nb312nf_n1], mm4	pi2fd mm4,mm4	pfsub mm0, mm4               ;# now mm0 is eps and mm4 n0 	movq  mm2, mm0	pfmul mm2, mm2		;# mm0 is eps, mm2 eps2 	;# coulomb table 	mov edx, [ebp + nb312nf_VFtab]	mov ecx, [esp + nb312nf_n1]	shl ecx, 2	;# load all values we need 	movd mm4, [edx + ecx*4]	movd mm5, [edx + ecx*4 + 4]	movd mm6, [edx + ecx*4 + 8]	movd mm7, [edx + ecx*4 + 12]		pfmul mm6, mm0  ;# mm6 = Geps 			pfmul mm7, mm2	;# mm7 = Heps2 		pfadd mm5, mm6	pfadd mm5, mm7	;# mm5 = Fp 	pfmul mm5, mm0  ;# mm5=eps*Fp 	pfadd mm5, mm4	;#  mm5= VV 	pfmul mm5, [esp + nb312nf_qqOH]	;# vcoul=qq*VV 	;# update vctot  directly, force is moved to mm3 	pfadd mm5, [esp + nb312nf_vctot]	movq [esp + nb312nf_vctot], mm5		movq mm0, [esp + nb312nf_tmprsqH]	pfrsqrt mm1, mm0	pswapd mm0,mm0	pfrsqrt mm2, mm0	pswapd mm0,mm0	punpckldq mm1,mm2	;# seeds are in mm1 now, and rsq in mm0. 	movq mm2, mm1	pfmul mm1,mm1    pfrsqit1 mm1,mm0				    pfrcpit2 mm1,mm2	;# mm1=invsqrt 		pfmul mm0,mm1		;# mm0=r 	pfmul mm0, [esp + nb312nf_tsc]	pf2iw mm4, mm0	movq [esp + nb312nf_n1], mm4	pi2fd mm4,mm4	pfsub mm0, mm4               ;# now mm0 is eps and mm4 n0 	movq  mm2, mm0	pfmul mm2, mm2		;# mm0 is eps, mm2 eps2 		;# coulomb table 	mov edx, [ebp + nb312nf_VFtab]	mov ecx, [esp + nb312nf_n1]	shl ecx, 2	;# load all values we need 	movd mm4, [edx + ecx*4]	movd mm5, [edx + ecx*4 + 4]	movd mm6, [edx + ecx*4 + 8]	movd mm7, [edx + ecx*4 + 12]	mov ecx, [esp + nb312nf_n1+4]	shl ecx, 2	punpckldq mm4, [edx + ecx*4]	punpckldq mm5, [edx + ecx*4 + 4]	punpckldq mm6, [edx + ecx*4 + 8]	punpckldq mm7, [edx + ecx*4 + 12]		pfmul mm6, mm0  ;# mm6 = Geps 			pfmul mm7, mm2	;# mm7 = Heps2 		pfadd mm5, mm6	pfadd mm5, mm7	;# mm5 = Fp 	pfmul mm5, mm0  ;# mm5=eps*Fp 	pfadd mm5, mm4	;#  mm5= VV 	pfmul mm5, [esp + nb312nf_qqHH]	;# vcoul=qq*VV 	;# update vctot 	pfadd mm5, [esp + nb312nf_vctot]	movq [esp + nb312nf_vctot], mm5		;# interactions with j H2 	movq  mm0, [esi + eax*4 + 24]	movd  mm1, [esi + eax*4 + 32]	;# copy & expand to mm2-mm4 for the H interactions 	movq  mm2, mm0	movq  mm3, mm0	movq  mm4, mm1	punpckldq mm2,mm2	punpckhdq mm3,mm3	punpckldq mm4,mm4	pfsubr mm0, [esp + nb312nf_ixO]	pfsubr mm1, [esp + nb312nf_izO]			pfmul mm0,mm0	pfmul mm1,mm1	pfacc mm0, mm1	pfadd mm0, mm1		;# mm0=rsqO 		punpckldq mm2, mm2	punpckldq mm3, mm3	punpckldq mm4, mm4  ;# mm2-mm4 is jx-jz 	pfsubr mm2, [esp + nb312nf_ixH]	pfsubr mm3, [esp + nb312nf_iyH]	pfsubr mm4, [esp + nb312nf_izH] ;# mm2-mm4 is dxH-dzH 		pfmul mm2,mm2	pfmul mm3,mm3	pfmul mm4,mm4	pfadd mm3,mm2	pfadd mm3,mm4		;# mm3=rsqH 	movq [esp + nb312nf_tmprsqH], mm3    pfrsqrt mm1,mm0    movq mm2,mm1    pfmul mm1,mm1    pfrsqit1 mm1,mm0				    pfrcpit2 mm1,mm2	;# mm1=invsqrt 	pfmul mm0, mm1	pfmul mm0, [esp + nb312nf_tsc]	pf2iw mm4, mm0	movd [esp + nb312nf_n1], mm4	pi2fd mm4,mm4	pfsub mm0, mm4               ;# now mm0 is eps and mm4 n0 	movq  mm2, mm0	pfmul mm2, mm2		;# mm0 is eps, mm2 eps2 	;# coulomb table 	mov edx, [ebp + nb312nf_VFtab]	mov ecx, [esp + nb312nf_n1]	shl ecx, 2	;# load all values we need 	movd mm4, [edx + ecx*4]	movd mm5, [edx + ecx*4 + 4]	movd mm6, [edx + ecx*4 + 8]	movd mm7, [edx + ecx*4 + 12]		pfmul mm6, mm0  ;# mm6 = Geps 			pfmul mm7, mm2	;# mm7 = Heps2 		pfadd mm5, mm6	pfadd mm5, mm7	;# mm5 = Fp 	pfmul mm5, mm0  ;# mm5=eps*Fp 	pfadd mm5, mm4	;#  mm5= VV 	pfmul mm5, [esp + nb312nf_qqOH]	;# vcoul=qq*VV 	;# update vctot directly 	pfadd mm5, [esp + nb312nf_vctot]	movq [esp + nb312nf_vctot], mm5		movq mm0, [esp + nb312nf_tmprsqH]	pfrsqrt mm1, mm0	pswapd mm0,mm0	pfrsqrt mm2, mm0	pswapd mm0,mm0	punpckldq mm1,mm2	;# seeds are in mm1 now, and rsq in mm0. 	movq mm2, mm1	pfmul mm1,mm1    pfrsqit1 mm1,mm0				    pfrcpit2 mm1,mm2	;# mm1=invsqrt 		pfmul mm0,mm1		;# mm0=r 	pfmul mm0, [esp + nb312nf_tsc]	pf2iw mm4, mm0	movq [esp + nb312nf_n1], mm4	pi2fd mm4,mm4	pfsub mm0, mm4               ;# now mm0 is eps and mm4 n0 	movq  mm2, mm0	pfmul mm2, mm2		;# mm0 is eps, mm2 eps2 		;# coulomb table 	mov edx, [ebp + nb312nf_VFtab]	mov ecx, [esp + nb312nf_n1]	shl ecx, 2	;# load all values we need 	movd mm4, [edx + ecx*4]	movd mm5, [edx + ecx*4 + 4]	movd mm6, [edx + ecx*4 + 8]	movd mm7, [edx + ecx*4 + 12]	mov ecx, [esp + nb312nf_n1+4]	shl ecx, 2	punpckldq mm4, [edx + ecx*4]	punpckldq mm5, [edx + ecx*4 + 4]	punpckldq mm6, [edx + ecx*4 + 8]	punpckldq mm7, [edx + ecx*4 + 12]		pfmul mm6, mm0  ;# mm6 = Geps 			pfmul mm7, mm2	;# mm7 = Heps2 		pfadd mm5, mm6	pfadd mm5, mm7	;# mm5 = Fp 	pfmul mm5, mm0  ;# mm5=eps*Fp 	pfadd mm5, mm4	;#  mm5= VV 	pfmul mm5, [esp + nb312nf_qqHH]	;# vcoul=qq*VV 	;# update vctot 	pfadd mm5, [esp + nb312nf_vctot]	movq [esp + nb312nf_vctot], mm5			;#  done  - one more? 	dec dword ptr [esp + nb312nf_innerk]	jz  .nb312nf_updateouterdata	jmp .nb312nf_inner_loop	.nb312nf_updateouterdata:		;# get n from stack	mov esi, [esp + nb312nf_n]        ;# get group index for i particle         mov   edx, [ebp + nb312nf_gid]      	;# base of gid[]        mov   edx, [edx + esi*4]		;# ggid=gid[n]	movq  mm7, [esp + nb312nf_vctot]     	pfacc mm7,mm7	          ;# get and sum the two parts of total potential 	mov   eax, [ebp + nb312nf_Vc]	movd  mm6, [eax + edx*4] 	pfadd mm6, mm7	movd  [eax + edx*4], mm6          ;# increment vc[gid] 	movq  mm7, [esp + nb312nf_Vvdwtot]     	pfacc mm7,mm7	          ;# get and sum the two parts of total potential 	mov   eax, [ebp + nb312nf_Vvdw]	movd  mm6, [eax + edx*4] 	pfadd mm6, mm7	movd  [eax + edx*4], mm6          ;# increment Vvdwtot[gid]        	;# finish if last         mov ecx, [esp + nb312nf_nn1]	;# esi already loaded with n	inc esi        sub ecx, esi        jecxz .nb312nf_outerend        ;# not last, iterate outer loop once more!          mov [esp + nb312nf_n], esi        jmp .nb312nf_outer.nb312nf_outerend:        ;# check if more outer neighborlists remain        mov   ecx, [esp + nb312nf_nri]	;# esi already loaded with n above        sub   ecx, esi        jecxz .nb312nf_end        ;# non-zero, do one more workunit        jmp   .nb312nf_threadloop.nb312nf_end:	femms	mov eax, [esp + nb312nf_nouter] 		mov ebx, [esp + nb312nf_ninner]	mov ecx, [ebp + nb312nf_outeriter]	mov edx, [ebp + nb312nf_inneriter]	mov [ecx], eax	mov [edx], ebx	add esp, 152	pop edi	pop esi    	pop edx    	pop ecx    	pop ebx    	pop eax	leave	ret

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -