nb_kernel430_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,666 行 · 第 1/4 页

S
1,666
字号
	;# ------------------------------------------------------------------
	;# Packed (two j atoms per pass) part of the inner loop.
	;# Lane 0 of every xmm register belongs to the first j atom, lane 1
	;# to the second one.  On entry the following index registers are
	;# consumed as-is:
	;#   r12 / r13 : offsets into the GB table   (atom 1 / atom 2)
	;#   r14 / r15 : offsets into the VF table   (atom 1 / atom 2)
	;#   r8  / r9  : offsets into vdwparam       (atom 1 / atom 2)
	;#   r10 / r11 : offsets into faction        (atom 1 / atom 2)
	;#   rax / rbx : offsets into dvda           (atom 1 / atom 2)
	;# ------------------------------------------------------------------
	mov       rsi, [rbp + nb430_GBtab]
	mov       rdi, [rbp + nb430_VFtab]

	;# Fetch Y,F for Coulomb-GB (-> xmm0/xmm1), dispersion (-> xmm4/xmm5)
	;# and repulsion (-> xmm8/xmm9); xmm12-xmm14 hold the second atom's
	;# pair until the transpose below.
	movapd    xmm0,  [rsi + r12*8]              ;# Y1c F1c
	movapd    xmm12, [rsi + r13*8]              ;# Y2c F2c
	movapd    xmm4,  [rdi + r14*8]              ;# Y1d F1d
	movapd    xmm13, [rdi + r15*8]              ;# Y2d F2d
	movapd    xmm8,  [rdi + r14*8 + 32]         ;# Y1r F1r
	movapd    xmm14, [rdi + r15*8 + 32]         ;# Y2r F2r
	movapd    xmm1, xmm0
	movapd    xmm5, xmm4
	movapd    xmm9, xmm8
	unpcklpd  xmm0, xmm12                       ;# Y1c Y2c
	unpckhpd  xmm1, xmm12                       ;# F1c F2c
	unpcklpd  xmm4, xmm13                       ;# Y1d Y2d
	unpckhpd  xmm5, xmm13                       ;# F1d F2d
	unpcklpd  xmm8, xmm14                       ;# Y1r Y2r
	unpckhpd  xmm9, xmm14                       ;# F1r F2r

	;# Same transpose for G,H: Coulomb-GB -> xmm2/xmm3,
	;# dispersion -> xmm6/xmm7, repulsion -> xmm10/xmm11.
	movapd    xmm2,  [rsi + r12*8 + 16]         ;# G1c H1c
	movapd    xmm12, [rsi + r13*8 + 16]         ;# G2c H2c
	movapd    xmm6,  [rdi + r14*8 + 16]         ;# G1d H1d
	movapd    xmm13, [rdi + r15*8 + 16]         ;# G2d H2d
	movapd    xmm10, [rdi + r14*8 + 48]         ;# G1r H1r
	movapd    xmm14, [rdi + r15*8 + 48]         ;# G2r H2r
	movapd    xmm3, xmm2
	movapd    xmm7, xmm6
	movapd    xmm11, xmm10
	unpcklpd  xmm2, xmm12                       ;# G1c G2c
	unpckhpd  xmm3, xmm12                       ;# H1c H2c
	unpcklpd  xmm6, xmm13                       ;# G1d G2d
	unpckhpd  xmm7, xmm13                       ;# H1d H2d
	unpcklpd  xmm10, xmm14                      ;# G1r G2r
	unpckhpd  xmm11, xmm14                      ;# H1r H2r

	;# Tables are now in registers:
	;#   Coulomb-GB Y,F,G,H = xmm0..xmm3
	;#   dispersion Y,F,G,H = xmm4..xmm7
	;#   repulsion  Y,F,G,H = xmm8..xmm11
	mov       rdi, [rbp + nb430_vdwparam]

	movapd    xmm12, [rsp + nb430_epsgb]        ;# eps for the GB table
	movapd    xmm13, [rsp + nb430_eps]          ;# eps for the VF table

	mulpd     xmm3, xmm12                       ;# H*eps
	mulpd     xmm7, xmm13
	mulpd     xmm11, xmm13
	mulpd     xmm2, xmm12                       ;# G*eps
	mulpd     xmm6, xmm13
	mulpd     xmm10, xmm13
	mulpd     xmm3, xmm12                       ;# H*eps^2
	mulpd     xmm7, xmm13
	mulpd     xmm11, xmm13

	;# The vdwparam loads are interleaved with the arithmetic:
	;# xmm14 takes the entry at +0 and xmm15 the entry at +8 of each
	;# atom's parameter pair (low lane first).
	movlpd    xmm14, [rdi + r8*8]
	movlpd    xmm15, [rdi + r8*8 + 8]

	addpd     xmm1, xmm2                        ;# F + G*eps
	addpd     xmm5, xmm6
	addpd     xmm9, xmm10
	addpd     xmm1, xmm3                        ;# Fp = F + G*eps + H*eps^2
	addpd     xmm5, xmm7
	addpd     xmm9, xmm11
	addpd     xmm3, xmm3                        ;# 2*H*eps^2
	addpd     xmm7, xmm7
	addpd     xmm11, xmm11

	movhpd    xmm14, [rdi + r9*8]               ;# high lane: second atom
	movhpd    xmm15, [rdi + r9*8 + 8]

	addpd     xmm3, xmm2                        ;# 2*H*eps^2 + G*eps
	addpd     xmm7, xmm6
	addpd     xmm11, xmm10
	addpd     xmm3, xmm1                        ;# FF = Fp + 2*H*eps^2 + G*eps
	addpd     xmm7, xmm5
	addpd     xmm11, xmm9
	mulpd     xmm1, xmm12                       ;# eps*Fp
	mulpd     xmm5, xmm13
	mulpd     xmm9, xmm13
	addpd     xmm1, xmm0                        ;# VV = Y + eps*Fp
	addpd     xmm5, xmm4
	addpd     xmm9, xmm8
	mulpd     xmm1, [rsp + nb430_qq]            ;# vcoul = VV*qq
	mulpd     xmm5, xmm14                       ;# vnb6
	mulpd     xmm9, xmm15                       ;# vnb12
	mulpd     xmm3, [rsp + nb430_qq]            ;# fij   = FF*qq
	mulpd     xmm7, xmm14                       ;# fijD
	mulpd     xmm11, xmm15                      ;# fijR
	addpd     xmm11, xmm7                       ;# fijD + fijR
	mulpd     xmm11, [rsp + nb430_tsc]          ;# (fijD + fijR)*tabscale

	;# Vvdwtot += vnb6 + vnb12
	addpd     xmm5, [rsp + nb430_Vvdwtot]
	addpd     xmm5, xmm9
	movapd    [rsp + nb430_Vvdwtot], xmm5

	mov       rsi, [rbp + nb430_dvda]

	;# dVda contribution
	mulpd     xmm3, [rsp + nb430_gbscale]       ;# fijC = qq*FF*gbscale
	movapd    xmm6, xmm3
	mulpd     xmm6, [rsp + nb430_r]
	addpd     xmm6, xmm1                        ;# vcoul + fijC*r
	addpd     xmm3, xmm11                       ;# fijC + fijD + fijR

	;# vctot += vcoul
	addpd     xmm1, [rsp + nb430_vctot]
	movapd    [rsp + nb430_vctot], xmm1

	;# Negate: xmm6 = xmm7 = -(vcoul + fijC*r)
	xorpd     xmm7, xmm7
	subpd     xmm7, xmm6
	movapd    xmm6, xmm7

	;# Gather the current j forces: x -> xmm0, y -> xmm1, z -> xmm2
	mov       rdi, [rbp + nb430_faction]
	movlpd    xmm0, [rdi + r10*8]
	movlpd    xmm1, [rdi + r10*8 + 8]
	movlpd    xmm2, [rdi + r10*8 + 16]
	movhpd    xmm0, [rdi + r11*8]
	movhpd    xmm1, [rdi + r11*8 + 8]
	movhpd    xmm2, [rdi + r11*8 + 16]

	;# dvdasum += -(vcoul + fijC*r)
	addpd     xmm7, [rsp + nb430_dvdasum]
	movapd    [rsp + nb430_dvdasum], xmm7

	;# Per-atom dvda update: low lane -> dvda[rax], high lane -> dvda[rbx]
	movhlps   xmm7, xmm6
	addsd     xmm6, [rsi + rax*8]
	addsd     xmm7, [rsi + rbx*8]
	movsd     [rsi + rax*8], xmm6
	movsd     [rsi + rbx*8], xmm7

	;# xmm4 = -(fijC + fijD + fijR)*rinv, then scaled by dx, dy, dz
	xorpd     xmm4, xmm4
	mulpd     xmm3, [rsp + nb430_rinv]
	subpd     xmm4, xmm3

	movapd    xmm9, xmm4
	movapd    xmm10, xmm4
	movapd    xmm11, xmm4

	mulpd     xmm9, [rsp + nb430_dx]
	mulpd     xmm10, [rsp + nb430_dy]
	mulpd     xmm11, [rsp + nb430_dz]

	;# Add the force components to the gathered j forces ...
	addpd     xmm0, xmm9
	addpd     xmm1, xmm10
	addpd     xmm2, xmm11

	;# ... and to the running i-force accumulators
	addpd     xmm9, [rsp + nb430_fix]
	addpd     xmm10, [rsp + nb430_fiy]
	addpd     xmm11, [rsp + nb430_fiz]

	;# Write back: first atom's j force, the i accumulators, then the
	;# second atom's j force
	movlpd    [rdi + r10*8], xmm0
	movlpd    [rdi + r10*8 + 8], xmm1
	movlpd    [rdi + r10*8 + 16], xmm2

	movapd    [rsp + nb430_fix], xmm9
	movapd    [rsp + nb430_fiy], xmm10
	movapd    [rsp + nb430_fiz], xmm11

	movhpd    [rdi + r11*8], xmm0
	movhpd    [rdi + r11*8 + 8], xmm1
	movhpd    [rdi + r11*8 + 16], xmm2

	;# Loop-control decision follows.
;# ----------------------------------------------------------------------
;# Inner-loop control and the scalar (single j atom) remainder path.
;#
;#   innerk -= 2; while the result is >= 0 branch back to
;#   .nb430_unroll_loop, otherwise fall through to .nb430_checksingle.
;#   .nb430_checksingle runs .nb430_dosingle only when bit 0 of innerk is
;#   set, else it goes straight to .nb430_updateouterdata.
;#
;# .nb430_dosingle performs the same table interpolation as the packed
;# path, but on the low lane only (sd instructions).  Accumulators on the
;# stack are written back with movsd so their high lane is left alone.
;# ----------------------------------------------------------------------
	sub       dword ptr [rsp + nb430_innerk], 2
	jge       .nb430_unroll_loop                ;# >= 0: another packed pass

.nb430_checksingle:
	mov       edx, [rsp + nb430_innerk]
	and       edx, 1
	jz        .nb430_updateouterdata            ;# bit 0 clear: nothing left

.nb430_dosingle:
	;# NOTE(review): rdx is not read again in the visible code; the load
	;# is kept because the code after .nb430_updateouterdata is outside
	;# this view.
	mov       rdx, [rbp + nb430_invsqrta]
	mov       rcx, [rsp + nb430_innerjjnr]
	mov       eax, [rcx]                        ;# rax = j atom index

	;# isaprod = invsqrta[j] * isai ; gbscale = isaprod * gbtsc
	mov       rsi, [rbp + nb430_invsqrta]
	movsd     xmm2, [rsi + rax*8]
	mulsd     xmm2, [rsp + nb430_isai]
	movapd    [rsp + nb430_isaprod], xmm2
	movapd    xmm1, xmm2
	mulsd     xmm1, [rsp + nb430_gbtsc]
	movapd    [rsp + nb430_gbscale], xmm1

	;# qq = charge[j] * isaprod * iq
	mulsd     xmm2, [rsp + nb430_iq]
	mov       rsi, [rbp + nb430_charge]         ;# base of charge[]
	movsd     xmm3, [rsi + rax*8]
	mulsd     xmm3, xmm2
	movapd    [rsp + nb430_qq], xmm3

	;# vdwparam index = 2*type[j] + ntia ; fetch the two parameters
	mov       rsi, [rbp + nb430_type]
	mov       r8d, [rsi + rax*4]
	mov       rsi, [rbp + nb430_vdwparam]
	shl       r8d, 1
	mov       edi, [rsp + nb430_ntia]
	add       r8d, edi
	movsd     xmm4, [rsi + r8*8]
	movsd     xmm6, [rsi + r8*8 + 8]
	movapd    [rsp + nb430_c6], xmm4
	movapd    [rsp + nb430_c12], xmm6

	mov       rsi, [rbp + nb430_pos]            ;# base of pos[]
	lea       r10, [rax + rax*2]                ;# j3 = 3*j

	;# j coordinates -> xmm4 (x), xmm5 (y), xmm6 (z)
	movsd     xmm4, [rsi + r10*8]
	movsd     xmm5, [rsi + r10*8 + 8]
	movsd     xmm6, [rsi + r10*8 + 16]

	;# dr = pos[j] - (ix, iy, iz)
	subsd     xmm4, [rsp + nb430_ix]
	subsd     xmm5, [rsp + nb430_iy]
	subsd     xmm6, [rsp + nb430_iz]

	;# keep dr for the force projection further down
	movapd    [rsp + nb430_dx], xmm4
	movapd    [rsp + nb430_dy], xmm5
	movapd    [rsp + nb430_dz], xmm6

	;# rsq = dx*dx + dy*dy + dz*dz  (result in xmm4)
	mulsd     xmm4, xmm4
	mulsd     xmm5, xmm5
	mulsd     xmm6, xmm6
	addsd     xmm4, xmm5
	addsd     xmm4, xmm6

	;# Single-precision rsqrt estimate, widened back to double (xmm2),
	;# then refined twice with lu' = half * lu * (three - rsq*lu*lu).
	cvtsd2ss  xmm5, xmm4
	rsqrtss   xmm5, xmm5
	cvtss2sd  xmm2, xmm5                        ;# lu (seed) in low xmm2

	movapd    xmm5, xmm2                        ;# copy of lu
	mulsd     xmm2, xmm2                        ;# lu*lu
	movapd    xmm1, [rsp + nb430_three]
	mulsd     xmm2, xmm4                        ;# rsq*lu*lu
	movapd    xmm0, [rsp + nb430_half]
	subsd     xmm1, xmm2                        ;# three - rsq*lu*lu
	mulsd     xmm1, xmm5
	mulsd     xmm1, xmm0                        ;# xmm1 = iter1 of rinv (new lu)

	movapd    xmm5, xmm1                        ;# copy of lu
	mulsd     xmm1, xmm1                        ;# lu*lu
	movapd    xmm2, [rsp + nb430_three]
	mulsd     xmm1, xmm4                        ;# rsq*lu*lu
	movapd    xmm0, [rsp + nb430_half]
	subsd     xmm2, xmm1                        ;# three - rsq*lu*lu
	mulsd     xmm2, xmm5
	mulsd     xmm0, xmm2                        ;# xmm0 = iter2 of rinv

	mulsd     xmm4, xmm0                        ;# r = rsq*rinv
	movapd    [rsp + nb430_r], xmm4
	movapd    [rsp + nb430_rinv], xmm0

	;# Scaled distances: xmm4 = r*gbscale (GB), xmm8 = r*tsc (LJ)
	movapd    xmm8, xmm4                        ;# r
	mulsd     xmm4, [rsp + nb430_gbscale]
	mulsd     xmm8, [rsp + nb430_tsc]

	;# integer part (truncated) of each scaled distance
	cvttsd2si r12d, xmm4                        ;# gb
	cvttsd2si r14d, xmm8                        ;# lj

	;# the same integer parts as doubles
	cvtsi2sd  xmm6, r12d                        ;# gb
	cvtsi2sd  xmm10, r14d                       ;# lj

	;# table offsets: GB index * 4 in r12, LJ index * 8 in r14
	shl       r12d, 2                           ;# gb
	shl       r14d, 3                           ;# lj

	;# eps = fractional part of each scaled distance
	subsd     xmm4, xmm6                        ;# gb
	subsd     xmm8, xmm10                       ;# lj
	movapd    [rsp + nb430_epsgb], xmm4         ;# gb eps
	movapd    [rsp + nb430_eps], xmm8           ;# lj eps

	mov       rsi, [rbp + nb430_GBtab]
	mov       rdi, [rbp + nb430_VFtab]

	;# Table fetch.  Each movapd brings in a pair; movhlps drops the
	;# second element of the pair into the low lane of the next register.
	movapd    xmm0,  [rsi + r12*8]              ;# Y1c F1c
	movapd    xmm4,  [rdi + r14*8]              ;# Y1d F1d
	movapd    xmm8,  [rdi + r14*8 + 32]         ;# Y1r F1r
	movhlps   xmm1, xmm0                        ;# F (Coulomb-GB)
	movhlps   xmm5, xmm4                        ;# F (dispersion)
	movhlps   xmm9, xmm8                        ;# F (repulsion)

	movapd    xmm2,  [rsi + r12*8 + 16]         ;# G1c H1c
	movapd    xmm6,  [rdi + r14*8 + 16]         ;# G1d H1d
	movapd    xmm10, [rdi + r14*8 + 48]         ;# G1r H1r
	movhlps   xmm3, xmm2                        ;# H (Coulomb-GB)
	movhlps   xmm7, xmm6                        ;# H (dispersion)
	movhlps   xmm11, xmm10                      ;# H (repulsion)

	;# Tables are now in registers (low lane):
	;#   Coulomb-GB Y,F,G,H = xmm0..xmm3
	;#   dispersion Y,F,G,H = xmm4..xmm7
	;#   repulsion  Y,F,G,H = xmm8..xmm11
	movapd    xmm12, [rsp + nb430_epsgb]
	movapd    xmm13, [rsp + nb430_eps]

	mulsd     xmm3, xmm12                       ;# H*eps
	mulsd     xmm7, xmm13
	mulsd     xmm11, xmm13
	mulsd     xmm2, xmm12                       ;# G*eps
	mulsd     xmm6, xmm13
	mulsd     xmm10, xmm13
	mulsd     xmm3, xmm12                       ;# H*eps^2
	mulsd     xmm7, xmm13
	mulsd     xmm11, xmm13
	addsd     xmm1, xmm2                        ;# F + G*eps
	addsd     xmm5, xmm6
	addsd     xmm9, xmm10
	addsd     xmm1, xmm3                        ;# Fp = F + G*eps + H*eps^2
	addsd     xmm5, xmm7
	addsd     xmm9, xmm11
	addsd     xmm3, xmm3                        ;# 2*H*eps^2
	addsd     xmm7, xmm7
	addsd     xmm11, xmm11
	addsd     xmm3, xmm2                        ;# 2*H*eps^2 + G*eps
	addsd     xmm7, xmm6
	addsd     xmm11, xmm10
	addsd     xmm3, xmm1                        ;# FF = Fp + 2*H*eps^2 + G*eps
	addsd     xmm7, xmm5
	addsd     xmm11, xmm9
	mulsd     xmm1, xmm12                       ;# eps*Fp
	mulsd     xmm5, xmm13
	mulsd     xmm9, xmm13
	addsd     xmm1, xmm0                        ;# VV = Y + eps*Fp
	addsd     xmm5, xmm4
	addsd     xmm9, xmm8
	mulsd     xmm1, [rsp + nb430_qq]            ;# vcoul = VV*qq
	mulsd     xmm5, [rsp + nb430_c6]            ;# vnb6
	mulsd     xmm9, [rsp + nb430_c12]           ;# vnb12
	mulsd     xmm3, [rsp + nb430_qq]            ;# fij   = FF*qq
	mulsd     xmm7, [rsp + nb430_c6]            ;# fijD
	mulsd     xmm11, [rsp + nb430_c12]          ;# fijR
	addsd     xmm11, xmm7                       ;# fijD + fijR
	mulsd     xmm11, [rsp + nb430_tsc]          ;# (fijD + fijR)*tabscale

	;# Vvdwtot += vnb6 + vnb12   (low lane only)
	addsd     xmm5, [rsp + nb430_Vvdwtot]
	addsd     xmm5, xmm9
	movsd     [rsp + nb430_Vvdwtot], xmm5

	mov       rsi, [rbp + nb430_dvda]

	;# dVda contribution
	mulsd     xmm3, [rsp + nb430_gbscale]       ;# fijC = qq*FF*gbscale
	movapd    xmm6, xmm3
	mulsd     xmm6, [rsp + nb430_r]
	addsd     xmm6, xmm1                        ;# vcoul + fijC*r
	addsd     xmm3, xmm11                       ;# fijC + fijD + fijR

	;# vctot += vcoul   (low lane only)
	addsd     xmm1, [rsp + nb430_vctot]
	movsd     [rsp + nb430_vctot], xmm1

	;# Negate: xmm6 = xmm7 = -(vcoul + fijC*r)
	xorpd     xmm7, xmm7
	subsd     xmm7, xmm6
	movapd    xmm6, xmm7

	;# dvdasum += -(vcoul + fijC*r)   (low lane only)
	addsd     xmm7, [rsp + nb430_dvdasum]
	movsd     [rsp + nb430_dvdasum], xmm7

	;# dvda[j] += -(vcoul + fijC*r)
	addsd     xmm6, [rsi + rax*8]
	movsd     [rsi + rax*8], xmm6

	;# xmm4 = -(fijC + fijD + fijR)*rinv, then scaled by dx, dy, dz
	xorpd     xmm4, xmm4
	mulsd     xmm3, [rsp + nb430_rinv]
	subsd     xmm4, xmm3

	movapd    xmm9, xmm4
	movapd    xmm10, xmm4
	movapd    xmm11, xmm4

	mulsd     xmm9, [rsp + nb430_dx]
	mulsd     xmm10, [rsp + nb430_dy]
	mulsd     xmm11, [rsp + nb430_dz]

	;# keep a copy of the components for the j-force update
	movapd    xmm3, xmm9
	movapd    xmm4, xmm10
	movapd    xmm5, xmm11

	;# i-force accumulators += components   (low lane only)
	addsd     xmm9, [rsp + nb430_fix]
	addsd     xmm10, [rsp + nb430_fiy]
	addsd     xmm11, [rsp + nb430_fiz]
	movsd     [rsp + nb430_fix], xmm9
	movsd     [rsp + nb430_fiy], xmm10
	movsd     [rsp + nb430_fiz], xmm11

	;# faction[j3 .. j3+2] += components
	mov       rdi, [rbp + nb430_faction]
	addsd     xmm3, [rdi + r10*8]
	addsd     xmm4, [rdi + r10*8 + 8]
	addsd     xmm5, [rdi + r10*8 + 16]
	movsd     [rdi + r10*8], xmm3
	movsd     [rdi + r10*8 + 8], xmm4
	movsd     [rdi + r10*8 + 16], xmm5

.nb430_updateouterdata:
	mov       ecx, [rsp + nb430_ii3]

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?