nb_kernel312_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,377 行 · 第 1/5 页
S
2,377 行
rsqrtps xmm7, xmm7 cvtps2pd xmm1, xmm1 cvtps2pd xmm4, xmm4 cvtps2pd xmm7, xmm7 movapd xmm2, xmm1 movapd xmm5, xmm4 movapd xmm8, xmm7 mulpd xmm1, xmm1 ;# lu*lu mulpd xmm4, xmm4 ;# lu*lu mulpd xmm7, xmm7 ;# lu*lu movapd xmm9, [rsp + nb312_three] movapd xmm10, xmm9 movapd xmm11, xmm9 mulpd xmm1, xmm0 ;# rsq*lu*lu mulpd xmm4, xmm3 ;# rsq*lu*lu mulpd xmm7, xmm6 ;# rsq*lu*lu subpd xmm9, xmm1 subpd xmm10, xmm4 subpd xmm11, xmm7 ;# 3-rsq*lu*lu mulpd xmm9, xmm2 mulpd xmm10, xmm5 mulpd xmm11, xmm8 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb312_half] mulpd xmm9, xmm15 ;# first iteration for rinvOO mulpd xmm10, xmm15 ;# first iteration for rinvH1O mulpd xmm11, xmm15 ;# first iteration for rinvH2O ;# second iteration step movapd xmm2, xmm9 movapd xmm5, xmm10 movapd xmm8, xmm11 mulpd xmm2, xmm2 ;# lu*lu mulpd xmm5, xmm5 ;# lu*lu mulpd xmm8, xmm8 ;# lu*lu movapd xmm1, [rsp + nb312_three] movapd xmm4, xmm1 movapd xmm7, xmm1 mulpd xmm2, xmm0 ;# rsq*lu*lu mulpd xmm5, xmm3 ;# rsq*lu*lu mulpd xmm8, xmm6 ;# rsq*lu*lu subpd xmm1, xmm2 subpd xmm4, xmm5 subpd xmm7, xmm8 ;# 3-rsq*lu*lu mulpd xmm9, xmm1 mulpd xmm10, xmm4 mulpd xmm11, xmm7 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb312_half] mulpd xmm9, xmm15 ;# rinvOO mulpd xmm10, xmm15 ;# rinvH1O mulpd xmm11, xmm15 ;# rinvH2O movapd [rsp + nb312_rinvOO], xmm9 movapd [rsp + nb312_rinvH1O], xmm10 movapd [rsp + nb312_rinvH2O], xmm11 ;# O interactions ;# rsq in xmm0,xmm3,xmm6 ;# rinv in xmm9, xmm10, xmm11 movapd xmm1, [rsp + nb312_tsc] mulpd xmm0, xmm9 ;# r mulpd xmm3, xmm10 mulpd xmm6, xmm11 mulpd xmm0, xmm1 ;# rtab mulpd xmm3, xmm1 mulpd xmm6, xmm1 ;# truncate and convert to integers cvttpd2dq xmm1, xmm0 cvttpd2dq xmm4, xmm3 cvttpd2dq xmm7, xmm6 ;# convert back to float cvtdq2pd xmm2, xmm1 cvtdq2pd xmm5, xmm4 cvtdq2pd xmm8, xmm7 ;# multiply by 4 pslld xmm1, 2 pslld xmm4, 2 pslld xmm7, 2 ;# move to integer registers pshufd xmm13, xmm1, 1 pshufd xmm14, xmm4, 1 pshufd xmm15, xmm7, 1 movd r8d, xmm1 movd r10d, xmm4 movd r12d, xmm7 movd r9d, xmm13 
movd r11d, xmm14 movd r13d, xmm15 mov rsi, [rbp + nb312_VFtab] ;# calculate eps subpd xmm0, xmm2 subpd xmm3, xmm5 subpd xmm6, xmm8 movapd [rsp + nb312_epsO], xmm0 movapd [rsp + nb312_epsH1], xmm3 movapd [rsp + nb312_epsH2], xmm6 ;# Load LOTS of table data movlpd xmm0, [rsi + r8*8] movlpd xmm1, [rsi + r8*8 + 8] movlpd xmm2, [rsi + r8*8 + 16] movlpd xmm3, [rsi + r8*8 + 24] movlpd xmm4, [rsi + r10*8] movlpd xmm5, [rsi + r10*8 + 8] movlpd xmm6, [rsi + r10*8 + 16] movlpd xmm7, [rsi + r10*8 + 24] movlpd xmm8, [rsi + r12*8] movlpd xmm9, [rsi + r12*8 + 8] movlpd xmm10, [rsi + r12*8 + 16] movlpd xmm11, [rsi + r12*8 + 24] movhpd xmm0, [rsi + r9*8] movhpd xmm1, [rsi + r9*8 + 8] movhpd xmm2, [rsi + r9*8 + 16] movhpd xmm3, [rsi + r9*8 + 24] movhpd xmm4, [rsi + r11*8] movhpd xmm5, [rsi + r11*8 + 8] movhpd xmm6, [rsi + r11*8 + 16] movhpd xmm7, [rsi + r11*8 + 24] movhpd xmm8, [rsi + r13*8] movhpd xmm9, [rsi + r13*8 + 8] movhpd xmm10, [rsi + r13*8 + 16] movhpd xmm11, [rsi + r13*8 + 24] ;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11 mulpd xmm3, [rsp + nb312_epsO] ;# Heps mulpd xmm7, [rsp + nb312_epsH1] mulpd xmm11, [rsp + nb312_epsH2] mulpd xmm2, [rsp + nb312_epsO] ;# Geps mulpd xmm6, [rsp + nb312_epsH1] mulpd xmm10, [rsp + nb312_epsH2] mulpd xmm3, [rsp + nb312_epsO] ;# Heps2 mulpd xmm7, [rsp + nb312_epsH1] mulpd xmm11, [rsp + nb312_epsH2] addpd xmm1, xmm2 ;# F+Geps addpd xmm5, xmm6 addpd xmm9, xmm10 addpd xmm1, xmm3 ;# F+Geps+Heps2 = Fp addpd xmm5, xmm7 addpd xmm9, xmm11 addpd xmm3, xmm3 ;# 2*Heps2 addpd xmm7, xmm7 addpd xmm11, xmm11 addpd xmm3, xmm2 ;# 2*Heps2+Geps addpd xmm7, xmm6 addpd xmm11, xmm10 addpd xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps addpd xmm7, xmm5 addpd xmm11, xmm9 mulpd xmm1, [rsp + nb312_epsO] ;# eps*Fp mulpd xmm5, [rsp + nb312_epsH1] mulpd xmm9, [rsp + nb312_epsH2] addpd xmm1, xmm0 ;# VV addpd xmm5, xmm4 addpd xmm9, xmm8 mulpd xmm1, [rsp + nb312_qqOO] ;# VV*qq = vcoul mulpd xmm5, [rsp + nb312_qqOH] mulpd xmm9, [rsp + nb312_qqOH] mulpd xmm3, [rsp + 
nb312_qqOO] ;# FF*qq = fij mulpd xmm7, [rsp + nb312_qqOH] mulpd xmm11, [rsp + nb312_qqOH] ;# calculate LJ movapd xmm12, [rsp + nb312_rinvOO] mulpd xmm12, xmm12 ;# rinvsq movapd xmm13, xmm12 ;# rinvsq mulpd xmm12, xmm12 ;# rinv4 mulpd xmm12, xmm13 ;# rinv6 movapd xmm13, xmm12 ;# rinv6 mulpd xmm12, xmm12 ;# rinv12 mulpd xmm13, [rsp + nb312_c6] mulpd xmm12, [rsp + nb312_c12] movapd xmm14, xmm12 subpd xmm14, xmm13 addpd xmm14, [rsp + nb312_Vvdwtot] mulpd xmm13, [rsp + nb312_six] mulpd xmm12, [rsp + nb312_twelve] movapd [rsp + nb312_Vvdwtot], xmm14 subpd xmm12, xmm13 ;# LJ fscal mulpd xmm12, [rsp + nb312_rinvOO] movapd xmm4, xmm12 ;# accumulate vctot addpd xmm1, [rsp + nb312_vctot] addpd xmm5, xmm9 addpd xmm1, xmm5 movapd [rsp + nb312_vctot], xmm1 xorpd xmm8, xmm8 xorpd xmm12, xmm12 movapd xmm5, [rsp + nb312_tsc] mulpd xmm3, xmm5 ;# fscal mulpd xmm7, xmm5 mulpd xmm11, xmm5 subpd xmm4, xmm3 subpd xmm8, xmm7 subpd xmm12, xmm11 mulpd xmm4, [rsp + nb312_rinvOO] mulpd xmm8, [rsp + nb312_rinvH1O] mulpd xmm12, [rsp + nb312_rinvH2O] ;# move j O forces to xmm0-xmm2 mov rdi, [rbp + nb312_faction] movlpd xmm0, [rdi + rax*8] movlpd xmm1, [rdi + rax*8 + 8] movlpd xmm2, [rdi + rax*8 + 16] movhpd xmm0, [rdi + rbx*8] movhpd xmm1, [rdi + rbx*8 + 8] movhpd xmm2, [rdi + rbx*8 + 16] movapd xmm3, xmm4 movapd xmm5, xmm4 movapd xmm7, xmm8 movapd xmm9, xmm8 movapd xmm10, xmm12 movapd xmm11, xmm12 mulpd xmm3, [rsp + nb312_dxOO] mulpd xmm4, [rsp + nb312_dyOO] mulpd xmm5, [rsp + nb312_dzOO] mulpd xmm7, [rsp + nb312_dxH1O] mulpd xmm8, [rsp + nb312_dyH1O] mulpd xmm9, [rsp + nb312_dzH1O] mulpd xmm10, [rsp + nb312_dxH2O] mulpd xmm11, [rsp + nb312_dyH2O] mulpd xmm12, [rsp + nb312_dzH2O] addpd xmm0, xmm3 addpd xmm1, xmm4 addpd xmm2, xmm5 addpd xmm3, [rsp + nb312_fixO] addpd xmm4, [rsp + nb312_fiyO] addpd xmm5, [rsp + nb312_fizO] addpd xmm0, xmm7 addpd xmm1, xmm8 addpd xmm2, xmm9 addpd xmm7, [rsp + nb312_fixH1] addpd xmm8, [rsp + nb312_fiyH1] addpd xmm9, [rsp + nb312_fizH1] addpd xmm0, xmm10 addpd xmm1, 
xmm11 addpd xmm2, xmm12 addpd xmm10, [rsp + nb312_fixH2] addpd xmm11, [rsp + nb312_fiyH2] addpd xmm12, [rsp + nb312_fizH2] movapd [rsp + nb312_fixO], xmm3 movapd [rsp + nb312_fiyO], xmm4 movapd [rsp + nb312_fizO], xmm5 movapd [rsp + nb312_fixH1], xmm7 movapd [rsp + nb312_fiyH1], xmm8 movapd [rsp + nb312_fizH1], xmm9 movapd [rsp + nb312_fixH2], xmm10 movapd [rsp + nb312_fiyH2], xmm11 movapd [rsp + nb312_fizH2], xmm12 ;# store back j O forces from xmm0-xmm2 movlpd [rdi + rax*8], xmm0 movlpd [rdi + rax*8 + 8], xmm1 movlpd [rdi + rax*8 + 16], xmm2 movhpd [rdi + rbx*8], xmm0 movhpd [rdi + rbx*8 + 8], xmm1 movhpd [rdi + rbx*8 + 16], xmm2 ;# move j H1 coordinates to local temp variables mov rsi, [rbp + nb312_pos] movlpd xmm0, [rsi + rax*8 + 24] movlpd xmm1, [rsi + rax*8 + 32] movlpd xmm2, [rsi + rax*8 + 40] movhpd xmm0, [rsi + rbx*8 + 24] movhpd xmm1, [rsi + rbx*8 + 32] movhpd xmm2, [rsi + rbx*8 + 40] ;# xmm0 = H1x ;# xmm1 = H1y ;# xmm2 = H1z movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 movapd xmm6, xmm0 movapd xmm7, xmm1 movapd xmm8, xmm2 subpd xmm0, [rsp + nb312_ixO] subpd xmm1, [rsp + nb312_iyO] subpd xmm2, [rsp + nb312_izO] subpd xmm3, [rsp + nb312_ixH1] subpd xmm4, [rsp + nb312_iyH1] subpd xmm5, [rsp + nb312_izH1] subpd xmm6, [rsp + nb312_ixH2] subpd xmm7, [rsp + nb312_iyH2] subpd xmm8, [rsp + nb312_izH2] movapd [rsp + nb312_dxOH1], xmm0 movapd [rsp + nb312_dyOH1], xmm1 movapd [rsp + nb312_dzOH1], xmm2 mulpd xmm0, xmm0 mulpd xmm1, xmm1 mulpd xmm2, xmm2 movapd [rsp + nb312_dxH1H1], xmm3 movapd [rsp + nb312_dyH1H1], xmm4 movapd [rsp + nb312_dzH1H1], xmm5 mulpd xmm3, xmm3 mulpd xmm4, xmm4 mulpd xmm5, xmm5 movapd [rsp + nb312_dxH2H1], xmm6 movapd [rsp + nb312_dyH2H1], xmm7 movapd [rsp + nb312_dzH2H1], xmm8 mulpd xmm6, xmm6 mulpd xmm7, xmm7 mulpd xmm8, xmm8 addpd xmm0, xmm1 addpd xmm0, xmm2 addpd xmm3, xmm4 addpd xmm3, xmm5 addpd xmm6, xmm7 addpd xmm6, xmm8 ;# start doing invsqrt for jH1 atoms cvtpd2ps xmm1, xmm0 cvtpd2ps xmm4, xmm3 cvtpd2ps xmm7, xmm6 rsqrtps 
xmm1, xmm1 rsqrtps xmm4, xmm4 rsqrtps xmm7, xmm7 cvtps2pd xmm1, xmm1 cvtps2pd xmm4, xmm4 cvtps2pd xmm7, xmm7 movapd xmm2, xmm1 movapd xmm5, xmm4 movapd xmm8, xmm7 mulpd xmm1, xmm1 ;# lu*lu mulpd xmm4, xmm4 ;# lu*lu mulpd xmm7, xmm7 ;# lu*lu movapd xmm9, [rsp + nb312_three] movapd xmm10, xmm9 movapd xmm11, xmm9 mulpd xmm1, xmm0 ;# rsq*lu*lu mulpd xmm4, xmm3 ;# rsq*lu*lu mulpd xmm7, xmm6 ;# rsq*lu*lu subpd xmm9, xmm1 subpd xmm10, xmm4 subpd xmm11, xmm7 ;# 3-rsq*lu*lu mulpd xmm9, xmm2 mulpd xmm10, xmm5 mulpd xmm11, xmm8 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb312_half] mulpd xmm9, xmm15 ;# first iteration for rinvOH1 mulpd xmm10, xmm15 ;# first iteration for rinvH1H1 mulpd xmm11, xmm15 ;# first iteration for rinvH2OH1 ;# second iteration step movapd xmm2, xmm9 movapd xmm5, xmm10 movapd xmm8, xmm11 mulpd xmm2, xmm2 ;# lu*lu mulpd xmm5, xmm5 ;# lu*lu mulpd xmm8, xmm8 ;# lu*lu movapd xmm1, [rsp + nb312_three] movapd xmm4, xmm1 movapd xmm7, xmm1 mulpd xmm2, xmm0 ;# rsq*lu*lu mulpd xmm5, xmm3 ;# rsq*lu*lu mulpd xmm8, xmm6 ;# rsq*lu*lu subpd xmm1, xmm2 subpd xmm4, xmm5 subpd xmm7, xmm8 ;# 3-rsq*lu*lu mulpd xmm9, xmm1 mulpd xmm10, xmm4 mulpd xmm11, xmm7 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb312_half] mulpd xmm9, xmm15 ;# rinvOH1 mulpd xmm10, xmm15 ;# rinvH1H1 mulpd xmm11, xmm15 ;# rinvH2H1 movapd [rsp + nb312_rinvOH1], xmm9 movapd [rsp + nb312_rinvH1H1], xmm10 movapd [rsp + nb312_rinvH2H1], xmm11 ;# H1 interactions ;# rsq in xmm0,xmm3,xmm6 ;# rinv in xmm9, xmm10, xmm11 movapd xmm1, [rsp + nb312_tsc] mulpd xmm0, xmm9 ;# r mulpd xmm3, xmm10 mulpd xmm6, xmm11 mulpd xmm0, xmm1 ;# rtab mulpd xmm3, xmm1 mulpd xmm6, xmm1 ;# truncate and convert to integers cvttpd2dq xmm1, xmm0 cvttpd2dq xmm4, xmm3 cvttpd2dq xmm7, xmm6 ;# convert back to float cvtdq2pd xmm2, xmm1 cvtdq2pd xmm5, xmm4 cvtdq2pd xmm8, xmm7 ;# multiply by 4 pslld xmm1, 2 pslld xmm4, 2 pslld xmm7, 2 ;# move to integer registers pshufd xmm13, xmm1, 1 pshufd xmm14, xmm4, 1 pshufd xmm15, xmm7, 1 movd r8d, xmm1 movd 
r10d, xmm4 movd r12d, xmm7 movd r9d, xmm13 movd r11d, xmm14 movd r13d, xmm15 mov rsi, [rbp + nb312_VFtab] ;# calculate eps subpd xmm0, xmm2 subpd xmm3, xmm5 subpd xmm6, xmm8 movapd [rsp + nb312_epsO], xmm0 movapd [rsp + nb312_epsH1], xmm3 movapd [rsp + nb312_epsH2], xmm6 ;# Load LOTS of table data movlpd xmm0, [rsi + r8*8] movlpd xmm1, [rsi + r8*8 + 8] movlpd xmm2, [rsi + r8*8 + 16] movlpd xmm3, [rsi + r8*8 + 24] movlpd xmm4, [rsi + r10*8] movlpd xmm5, [rsi + r10*8 + 8] movlpd xmm6, [rsi + r10*8 + 16]
⌨️ 快捷键说明
复制代码:Ctrl + C
搜索代码:Ctrl + F
全屏模式:F11
增大字号:Ctrl + =
减小字号:Ctrl + -
显示快捷键:?