nb_kernel234_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,365 行 · 第 1/5 页

S
2,365
字号
	movapd xmm6, [rsi + r10*8 + 16]     ;# G1d H1d 		movapd xmm10, [rsi + r10*8 + 48]	;# G1r H1r 		movhlps xmm7, xmm6	movhlps xmm11, xmm10	;# tables ready, in xmm4-xmm7 and xmm8-xmm11    mulsd  xmm7, xmm1    ;# Heps    mulsd  xmm11, xmm1     mulsd  xmm6, xmm1   ;# Geps    mulsd  xmm10, xmm1     mulsd  xmm7, xmm1   ;# Heps2    mulsd  xmm11, xmm1     addsd  xmm5, xmm6  ;# F+Geps    addsd  xmm9, xmm10     addsd  xmm5, xmm7   ;# F+Geps+Heps2 = Fp    addsd  xmm9, xmm11     addsd  xmm7, xmm7    ;# 2*Heps2    addsd  xmm11, xmm11    addsd  xmm7, xmm6   ;# 2*Heps2+Geps    addsd  xmm11, xmm10        addsd  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps    addsd  xmm11, xmm9    mulsd  xmm5, xmm1  ;# eps*Fp    mulsd  xmm9, xmm1    addsd  xmm5, xmm4 ;# VV    addsd  xmm9, xmm8    mulsd  xmm5, [rsp + nb234_c6]  ;# VV*c6 = vnb6    mulsd  xmm9, [rsp + nb234_c12]  ;# VV*c12 = vnb12    addsd  xmm5, xmm9    addsd  xmm5, [rsp + nb234_Vvdwtot]    movsd [rsp + nb234_Vvdwtot], xmm5            mulsd  xmm7, [rsp + nb234_c6]   ;# FF*c6 = fnb6    mulsd  xmm11, [rsp + nb234_c12]   ;# FF*c12  = fnb12    addsd  xmm7, xmm11        mulsd  xmm7, [rsp + nb234_tsc]    mulsd  xmm7, xmm2    xorpd  xmm9, xmm9        subpd  xmm9, xmm7    mulsd xmm13, xmm9    mulsd xmm14, xmm9    mulsd xmm15, xmm9        mov rdi, [rbp + nb234_faction]    movapd xmm0, [rsp + nb234_fixO]    movapd xmm1, [rsp + nb234_fiyO]    movapd xmm2, [rsp + nb234_fizO]        ;# accumulate i forces    addsd xmm0, xmm13    addsd xmm1, xmm14    addsd xmm2, xmm15    movsd [rsp + nb234_fixO], xmm0    movsd [rsp + nb234_fiyO], xmm1    movsd [rsp + nb234_fizO], xmm2    	;# the fj's - start by accumulating forces from memory 	addsd xmm13, [rdi + rax*8]	addsd xmm14, [rdi + rax*8 + 8]	addsd xmm15, [rdi + rax*8 + 16]	movsd [rdi + rax*8], xmm13	movsd [rdi + rax*8 + 8], xmm14	movsd [rdi + rax*8 + 16], xmm15    ;# done with OO interaction    	;# move j H1 coordinates to local temp variables     mov rsi, [rbp + nb234_pos]    movsd xmm0, [rsi + rax*8 + 24]     movsd xmm1, [rsi + rax*8 + 32]     movsd xmm2, [rsi + rax*8 + 40]     ;# xmm0 = H1x    ;# xmm1 = H1y    ;# xmm2 = H1z            movsd xmm3, xmm0    movsd xmm4, xmm1    movsd xmm5, xmm2    movsd xmm6, xmm0    movsd xmm7, xmm1    movsd xmm8, xmm2        subsd xmm0, [rsp + nb234_ixH1]    subsd xmm1, [rsp + nb234_iyH1]    subsd xmm2, [rsp + nb234_izH1]    subsd xmm3, [rsp + nb234_ixH2]    subsd xmm4, [rsp + nb234_iyH2]    subsd xmm5, [rsp + nb234_izH2]    subsd xmm6, [rsp + nb234_ixM]    subsd xmm7, [rsp + nb234_iyM]    subsd xmm8, [rsp + nb234_izM]    	movsd [rsp + nb234_dxH1H1], xmm0	movsd [rsp + nb234_dyH1H1], xmm1	movsd [rsp + nb234_dzH1H1], xmm2	mulsd  xmm0, xmm0	mulsd  xmm1, xmm1	mulsd  xmm2, xmm2	movsd [rsp + nb234_dxH2H1], xmm3	movsd [rsp + nb234_dyH2H1], xmm4	movsd [rsp + nb234_dzH2H1], xmm5	mulsd  xmm3, xmm3	mulsd  xmm4, xmm4	mulsd  xmm5, xmm5	movsd [rsp + nb234_dxMH1], xmm6	movsd [rsp + nb234_dyMH1], xmm7	movsd [rsp + nb234_dzMH1], xmm8	mulsd  xmm6, xmm6	mulsd  xmm7, xmm7	mulsd  xmm8, xmm8	addsd  xmm0, xmm1	addsd  xmm0, xmm2	addsd  xmm3, xmm4	addsd  xmm3, xmm5    addsd  xmm6, xmm7    addsd  xmm6, xmm8	;# start doing invsqrt for jH1 atoms    cvtsd2ss xmm1, xmm0    cvtsd2ss xmm4, xmm3    cvtsd2ss xmm7, xmm6	rsqrtss xmm1, xmm1	rsqrtss xmm4, xmm4    rsqrtss xmm7, xmm7    cvtss2sd xmm1, xmm1    cvtss2sd xmm4, xmm4    cvtss2sd xmm7, xmm7		movsd  xmm2, xmm1	movsd  xmm5, xmm4    movsd  xmm8, xmm7    	mulsd   xmm1, xmm1 ;# lu*lu	mulsd   xmm4, xmm4 ;# lu*lu    mulsd   xmm7, xmm7 ;# lu*lu			movsd  xmm9, [rsp + nb234_three]	movsd  xmm10, xmm9    movsd  xmm11, xmm9	mulsd   xmm1, xmm0 ;# rsq*lu*lu	mulsd   xmm4, xmm3 ;# rsq*lu*lu     mulsd   xmm7, xmm6 ;# rsq*lu*lu		subsd   xmm9, xmm1	subsd   xmm10, xmm4    subsd   xmm11, xmm7 ;# 3-rsq*lu*lu	mulsd   xmm9, xmm2	mulsd   xmm10, xmm5    mulsd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)	movsd  xmm15, [rsp + nb234_half]	mulsd   xmm9, xmm15  ;# first iteration for rinvH1H1 	mulsd   xmm10, xmm15 ;# first iteration for rinvH2H1    mulsd   xmm11, xmm15 ;# first iteration for rinvMH1	    ;# second iteration step    	movsd  xmm2, xmm9	movsd  xmm5, xmm10    movsd  xmm8, xmm11    	mulsd   xmm2, xmm2 ;# lu*lu	mulsd   xmm5, xmm5 ;# lu*lu    mulsd   xmm8, xmm8 ;# lu*lu			movsd  xmm1, [rsp + nb234_three]	movsd  xmm4, xmm1    movsd  xmm7, xmm1	mulsd   xmm2, xmm0 ;# rsq*lu*lu	mulsd   xmm5, xmm3 ;# rsq*lu*lu     mulsd   xmm8, xmm6 ;# rsq*lu*lu		subsd   xmm1, xmm2	subsd   xmm4, xmm5    subsd   xmm7, xmm8 ;# 3-rsq*lu*lu	mulsd   xmm9, xmm1	mulsd   xmm10, xmm4    mulsd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)	movsd  xmm15, [rsp + nb234_half]	mulsd   xmm9, xmm15  ;#  rinvH1H1 	mulsd   xmm10, xmm15 ;#   rinvH2H1    mulsd   xmm11, xmm15 ;#   rinvMH1		;# H1 interactions     ;# rsq in xmm0,xmm3,xmm6      ;# rinv in xmm9, xmm10, xmm11    movsd xmm1, xmm9 ;# copy of rinv    movsd xmm4, xmm10    movsd xmm7, xmm11    movsd xmm2, [rsp + nb234_krf]        mulsd  xmm9, xmm9   ;# rinvsq    mulsd  xmm10, xmm10    mulsd  xmm11, xmm11    mulsd  xmm0, xmm2  ;# k*rsq    mulsd  xmm3, xmm2    mulsd  xmm6, xmm2    movsd  xmm2, xmm0 ;# copy of k*rsq    movsd  xmm5, xmm3    movsd  xmm8, xmm6    addsd  xmm2, xmm1  ;# rinv+krsq    addsd  xmm5, xmm4    addsd  xmm8, xmm7    movsd  xmm14, [rsp + nb234_crf]    subsd  xmm2, xmm14   ;# rinv+krsq-crf    subsd  xmm5, xmm14    subsd  xmm8, xmm14    movsd  xmm12, [rsp + nb234_qqHH]    movsd  xmm13, [rsp + nb234_qqMH]        mulsd  xmm2, xmm12 ;# voul=qq*(rinv+ krsq-crf)    mulsd  xmm5, xmm12 ;# voul=qq*(rinv+ krsq-crf)    mulsd  xmm8, xmm13 ;# voul=qq*(rinv+ krsq-crf)    addsd  xmm0, xmm0 ;# 2*krsq    addsd  xmm3, xmm3     addsd  xmm6, xmm6     subsd  xmm1, xmm0 ;# rinv-2*krsq    subsd  xmm4, xmm3    subsd  xmm7, xmm6    mulsd  xmm1, xmm12   ;# (rinv-2*krsq)*qq    mulsd  xmm4, xmm12    mulsd  xmm7, xmm13    addsd  xmm2, [rsp + nb234_vctot]    addsd  xmm5, xmm8    addsd  xmm2, xmm5    movsd  [rsp + nb234_vctot], xmm2        mulsd  xmm9, xmm1   ;# fscal    mulsd  xmm10, xmm4    mulsd  xmm11, xmm7    ;# move j H1 forces to xmm0-xmm2    mov rdi, [rbp + nb234_faction]	movsd xmm0, [rdi + rax*8 + 24]	movsd xmm1, [rdi + rax*8 + 32]	movsd xmm2, [rdi + rax*8 + 40]    movsd xmm7, xmm9    movsd xmm8, xmm9    movsd xmm13, xmm11    movsd xmm14, xmm11    movsd xmm15, xmm11    movsd xmm11, xmm10    movsd xmm12, xmm10	mulsd xmm7, [rsp + nb234_dxH1H1]	mulsd xmm8, [rsp + nb234_dyH1H1]	mulsd xmm9, [rsp + nb234_dzH1H1]	mulsd xmm10, [rsp + nb234_dxH2H1]	mulsd xmm11, [rsp + nb234_dyH2H1]	mulsd xmm12, [rsp + nb234_dzH2H1]	mulsd xmm13, [rsp + nb234_dxMH1]	mulsd xmm14, [rsp + nb234_dyMH1]	mulsd xmm15, [rsp + nb234_dzMH1]    addsd xmm0, xmm7    addsd xmm1, xmm8    addsd xmm2, xmm9    addsd xmm7, [rsp + nb234_fixH1]    addsd xmm8, [rsp + nb234_fiyH1]    addsd xmm9, [rsp + nb234_fizH1]    addsd xmm0, xmm10    addsd xmm1, xmm11    addsd xmm2, xmm12    addsd xmm10, [rsp + nb234_fixH2]    addsd xmm11, [rsp + nb234_fiyH2]    addsd xmm12, [rsp + nb234_fizH2]    addsd xmm0, xmm13    addsd xmm1, xmm14    addsd xmm2, xmm15    addsd xmm13, [rsp + nb234_fixM]    addsd xmm14, [rsp + nb234_fiyM]    addsd xmm15, [rsp + nb234_fizM]    movsd [rsp + nb234_fixH1], xmm7    movsd [rsp + nb234_fiyH1], xmm8    movsd [rsp + nb234_fizH1], xmm9    movsd [rsp + nb234_fixH2], xmm10    movsd [rsp + nb234_fiyH2], xmm11    movsd [rsp + nb234_fizH2], xmm12    movsd [rsp + nb234_fixM], xmm13    movsd [rsp + nb234_fiyM], xmm14    movsd [rsp + nb234_fizM], xmm15       ;# store back j H1 forces from xmm0-xmm2	movsd [rdi + rax*8 + 24], xmm0	movsd [rdi + rax*8 + 32], xmm1	movsd [rdi + rax*8 + 40], xmm2	;# move j H2 coordinates to local temp variables     mov rsi, [rbp + nb234_pos]    movsd xmm0, [rsi + rax*8 + 48]     movsd xmm1, [rsi + rax*8 + 56]     movsd xmm2, [rsi + rax*8 + 64]     ;# xmm0 = H2x    ;# xmm1 = H2y    ;# xmm2 = H2z            movsd xmm3, xmm0    movsd xmm4, xmm1    movsd xmm5, xmm2    movsd xmm6, xmm0    movsd xmm7, xmm1    movsd xmm8, xmm2        subsd xmm0, [rsp + nb234_ixH1]    subsd xmm1, [rsp + nb234_iyH1]    subsd xmm2, [rsp + nb234_izH1]    subsd xmm3, [rsp + nb234_ixH2]    subsd xmm4, [rsp + nb234_iyH2]    subsd xmm5, [rsp + nb234_izH2]    subsd xmm6, [rsp + nb234_ixM]    subsd xmm7, [rsp + nb234_iyM]    subsd xmm8, [rsp + nb234_izM]    	movsd [rsp + nb234_dxH1H2], xmm0	movsd [rsp + nb234_dyH1H2], xmm1	movsd [rsp + nb234_dzH1H2], xmm2	mulsd  xmm0, xmm0	mulsd  xmm1, xmm1	mulsd  xmm2, xmm2	movsd [rsp + nb234_dxH2H2], xmm3	movsd [rsp + nb234_dyH2H2], xmm4	movsd [rsp + nb234_dzH2H2], xmm5	mulsd  xmm3, xmm3	mulsd  xmm4, xmm4	mulsd  xmm5, xmm5	movsd [rsp + nb234_dxMH2], xmm6	movsd [rsp + nb234_dyMH2], xmm7	movsd [rsp + nb234_dzMH2], xmm8	mulsd  xmm6, xmm6	mulsd  xmm7, xmm7	mulsd  xmm8, xmm8	addsd  xmm0, xmm1	addsd  xmm0, xmm2	addsd  xmm3, xmm4	addsd  xmm3, xmm5    addsd  xmm6, xmm7    addsd  xmm6, xmm8	;# start doing invsqrt for jH2 atoms    cvtsd2ss xmm1, xmm0    cvtsd2ss xmm4, xmm3    cvtsd2ss xmm7, xmm6	rsqrtss xmm1, xmm1	rsqrtss xmm4, xmm4    rsqrtss xmm7, xmm7    cvtss2sd xmm1, xmm1    cvtss2sd xmm4, xmm4    cvtss2sd xmm7, xmm7		movsd  xmm2, xmm1	movsd  xmm5, xmm4    movsd  xmm8, xmm7    	mulsd   xmm1, xmm1 ;# lu*lu	mulsd   xmm4, xmm4 ;# lu*lu    mulsd   xmm7, xmm7 ;# lu*lu			movsd  xmm9, [rsp + nb234_three]	movsd  xmm10, xmm9    movsd  xmm11, xmm9	mulsd   xmm1, xmm0 ;# rsq*lu*lu	mulsd   xmm4, xmm3 ;# rsq*lu*lu     mulsd   xmm7, xmm6 ;# rsq*lu*lu		subsd   xmm9, xmm1	subsd   xmm10, xmm4    subsd   xmm11, xmm7 ;# 3-rsq*lu*lu	mulsd   xmm9, xmm2	mulsd   xmm10, xmm5    mulsd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)	movsd  xmm15, [rsp + nb234_half]	mulsd   xmm9, xmm15  ;# first iteration for rinvH1H2 	mulsd   xmm10, xmm15 ;# first iteration for rinvH2H2    mulsd   xmm11, xmm15 ;# first iteration for rinvMH2    ;# second iteration step    	movsd  xmm2, xmm9	movsd  xmm5, xmm10    movsd  xmm8, xmm11    	mulsd   xmm2, xmm2 ;# lu*lu	mulsd   xmm5, xmm5 ;# lu*lu    mulsd   xmm8, xmm8 ;# lu*lu			movsd  xmm1, [rsp + nb234_three]	movsd  xmm4, xmm1    movsd  xmm7, xmm1	mulsd   xmm2, xmm0 ;# rsq*lu*lu	mulsd   xmm5, xmm3 ;# rsq*lu*lu     mulsd   xmm8, xmm6 ;# rsq*lu*lu		subsd   xmm1, xmm2	subsd   xmm4, xmm5    subsd   xmm7, xmm8 ;# 3-rsq*lu*lu	mulsd   xmm9, xmm1	mulsd   xmm10, xmm4    mulsd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)	movsd  xmm15, [rsp + nb234_half]	mulsd   xmm9, xmm15  ;#  rinvH1H2	mulsd   xmm10, xmm15 ;#   rinvH2H2    mulsd   xmm11, xmm15 ;#   rinvMH2		;# H2 interactions     ;# rsq in xmm0,xmm3,xmm6      ;# rinv in xmm9, xmm10, xmm11    movsd xmm1, xmm9 ;# copy of rinv    movsd xmm4, xmm10    movsd xmm7, xmm11    movsd xmm2, [rsp + nb234_krf]        mulsd  xmm9, xmm9   ;# rinvsq    mulsd  xmm10, xmm10    mulsd  xmm11, xmm11    mulsd  xmm0, xmm2  ;# k*rsq    mulsd  xmm3, xmm2    mulsd  xmm6, xmm2    movsd xmm2, xmm0 ;# copy of k*rsq    movsd xmm5, xmm3    movsd xmm8, xmm6    addsd  xmm2, xmm1  ;# rinv+krsq    addsd  xmm5, xmm4    addsd  xmm8, xmm7    movsd xmm14, [rsp + nb234_crf]    subsd  xmm2, xmm14   ;# rinv+krsq-crf    subsd  xmm5, xmm14    subsd  xmm8, xmm14    movsd xmm12, [rsp + nb234_qqHH]    movsd xmm13, [rsp + nb234_qqMH]        mulsd  xmm2, xmm12 ;# xmm6=voul=qq*(rinv+ krsq-crf)    mulsd  xmm5, xmm12 ;# xmm6=voul=qq*(rinv+ krsq-crf)    mulsd  xmm8, xmm13 ;# xmm6=voul=qq*(rinv+ krsq-crf)    addsd  xmm0, xmm0 ;# 2*krsq    addsd  xmm3, xmm3     addsd  xmm6, xmm6     subsd  xmm1, xmm0 ;# rinv-2*krsq    subsd  xmm4, xmm3    subsd  xmm7, xmm6    mulsd  xmm1, xmm12   ;# (rinv-2*krsq)*qq    mulsd  xmm4, xmm12    mulsd  xmm7, xmm13    addsd  xmm2, [rsp + nb234_vctot]    addsd  xmm5, xmm8    addsd  xmm2, xmm5    movsd  [rsp + nb234_vctot], xmm2        mulsd  xmm9, xmm1   ;# fscal    mulsd  xmm10, xmm4    mulsd  xmm11, xmm7    ;# move j H2 forces to xmm0-xmm2    mov rdi, [rbp + nb234_faction]	movsd xmm0, [rdi + rax*8 + 48]	movsd xmm1, [rdi + rax*8 + 56]

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?