nb_kernel333_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,340 行 · 第 1/5 页
S
2,340 行
;# Excerpt from the middle of a larger routine (no entry label or epilogue is visible here).
;# Using scalar double-precision SSE2 ops, the visible code handles one j atom (index in rax):
;#   1. loads its charge, vdw parameters (c6/c12) and coordinates;
;#   2. evaluates the tabulated dispersion/repulsion term against the i-site "O"
;#      (VFtab entries at byte offsets +32..+88 of a 12-double table point);
;#   3. evaluates the tabulated Coulomb term against the i-sites H1, H2 and M
;#      (VFtab entries at byte offsets +0..+24), scaled by qqH / qqM;
;#   4. accumulates Vvdwtot / vctot and the i- and j-forces.
;# 1/sqrt(rsq) is seeded with single-precision rsqrtss and refined by two
;# lu = 0.5*lu*(3 - rsq*lu*lu) steps, using the nb333_three and nb333_half stack slots.
add qword ptr [rsp + nb333_innerjjnr], 8 ;# advance pointer (unrolled 2) mov rsi, [rbp + nb333_charge] ;# base of charge[] movsd xmm3, [rsi + rax*8] movapd xmm4, xmm3 mulsd xmm3, [rsp + nb333_iqM] mulsd xmm4, [rsp + nb333_iqH] movapd [rsp + nb333_qqM], xmm3 movapd [rsp + nb333_qqH], xmm4 mov rsi, [rbp + nb333_type] mov r8d, [rsi + rax*4] mov rsi, [rbp + nb333_vdwparam] shl r8d, 1 mov edi, [rsp + nb333_ntia] add r8d, edi movlpd xmm6, [rsi + r8*8] ;# c6a movlpd xmm7, [rsi + r8*8 + 8] ;# c12a movapd [rsp + nb333_c6], xmm6 movapd [rsp + nb333_c12], xmm7 mov rsi, [rbp + nb333_pos] ;# base of pos[] lea rax, [rax + rax*2] ;# replace jnr with j3 ;# move j coordinates to local temp variables movsd xmm0, [rsi + rax*8] movsd xmm1, [rsi + rax*8 + 8] movsd xmm2, [rsi + rax*8 + 16] ;# xmm0 = jx ;# xmm1 = jy ;# xmm2 = jz ;# O interaction ;# copy to xmm3-xmm5 movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 subsd xmm3, [rsp + nb333_ixO] subsd xmm4, [rsp + nb333_iyO] subsd xmm5, [rsp + nb333_izO] movapd [rsp + nb333_dxO], xmm3 movapd [rsp + nb333_dyO], xmm4 movapd [rsp + nb333_dzO], xmm5 mulsd xmm3, xmm3 mulsd xmm4, xmm4 mulsd xmm5, xmm5 addsd xmm3, xmm4 addsd xmm3, xmm5 ;# xmm3=rsq cvtsd2ss xmm5, xmm3 rsqrtss xmm5, xmm5 cvtss2sd xmm15, xmm5 ;# lu in low xmm15 ;# lookup seed in xmm15 movapd xmm5, xmm15 ;# copy of lu mulsd xmm15, xmm15 ;# lu*lu movapd xmm7, [rsp + nb333_three] mulsd xmm15, xmm3 ;# rsq*lu*lu movapd xmm6, [rsp + nb333_half] subsd xmm7, xmm15 ;# 3-rsq*lu*lu mulsd xmm7, xmm5 mulsd xmm7, xmm6 ;# xmm7=iter1 of rinv (new lu) movapd xmm5, xmm7 ;# copy of lu mulsd xmm7, xmm7 ;# lu*lu movapd xmm15, [rsp + nb333_three] mulsd xmm7, xmm3 ;# rsq*lu*lu movapd xmm6, [rsp + nb333_half] subsd xmm15, xmm7 ;# 3-rsq*lu*lu mulsd xmm15, xmm5 mulsd xmm15, xmm6 ;# xmm15=rinv mulsd xmm3, xmm15 ;# xmm3=r ;# xmm15=rinv ;# xmm3=r mulsd xmm3, [rsp + nb333_tsc] ;# rtab ;# truncate and convert to integers cvttsd2si r10d, xmm3 ;# convert back to float cvtsi2sd xmm4, r10d ;# multiply by 4 shl 
r10d, 2 ;# calculate eps subsd xmm3, xmm4 ;# xmm3=eps ;# multiply by 3 lea r10, [r10 + r10*2] ;# xmm3=eps ;# xmm15=rinv mov rsi, [rbp + nb333_VFtab] movsd xmm4, [rsi + r10*8 + 32] movsd xmm5, [rsi + r10*8 + 40] movsd xmm6, [rsi + r10*8 + 48] movsd xmm7, [rsi + r10*8 + 56] movsd xmm8, [rsi + r10*8 + 64] movsd xmm9, [rsi + r10*8 + 72] movsd xmm10, [rsi + r10*8 + 80] movsd xmm11, [rsi + r10*8 + 88] ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11 mulsd xmm7, xmm3 ;# Heps mulsd xmm11, xmm3 mulsd xmm6, xmm3 ;# Geps mulsd xmm10, xmm3 mulsd xmm7, xmm3 ;# Heps2 mulsd xmm11, xmm3 addsd xmm5, xmm6 ;# F+Geps addsd xmm9, xmm10 addsd xmm5, xmm7 ;# F+Geps+Heps2 = Fp addsd xmm9, xmm11 addsd xmm7, xmm7 ;# 2*Heps2 addsd xmm11, xmm11 addsd xmm7, xmm6 ;# 2*Heps2+Geps addsd xmm11, xmm10 addsd xmm7, xmm5 ;# FF = Fp + 2*Heps2 + Geps addsd xmm11, xmm9 mulsd xmm5, xmm3 ;# eps*Fp mulsd xmm9, xmm3 movapd xmm12, [rsp + nb333_c6] movapd xmm13, [rsp + nb333_c12] addsd xmm5, xmm4 ;# VV addsd xmm9, xmm8 mulsd xmm5, xmm12 ;# VV*c6 = vnb6 mulsd xmm9, xmm13 ;# VV*c12 = vnb12 addsd xmm5, xmm9 addsd xmm5, [rsp + nb333_Vvdwtot] movsd [rsp + nb333_Vvdwtot], xmm5 mulsd xmm7, xmm12 ;# FF*c6 = fnb6 mulsd xmm11, xmm13 ;# FF*c12 = fnb12 addsd xmm7, xmm11 mulsd xmm7, [rsp + nb333_tsc] mulsd xmm7, xmm15 ;# -fscal xorpd xmm9, xmm9 subsd xmm9, xmm7 ;# fscal movapd xmm10, xmm9 movapd xmm11, xmm9 mulsd xmm9, [rsp + nb333_dxO] ;# fx/fy/fz mulsd xmm10, [rsp + nb333_dyO] mulsd xmm11, [rsp + nb333_dzO] ;# save j force temporarily movapd [rsp + nb333_fjx], xmm9 movapd [rsp + nb333_fjy], xmm10 movapd [rsp + nb333_fjz], xmm11 ;# increment i O force addsd xmm9, [rsp + nb333_fixO] addsd xmm10, [rsp + nb333_fiyO] addsd xmm11, [rsp + nb333_fizO] movsd [rsp + nb333_fixO], xmm9 movsd [rsp + nb333_fiyO], xmm10 movsd [rsp + nb333_fizO], xmm11 ;# finished O LJ interaction. ;# do H1, H2, and M interactions in parallel. ;# xmm0-xmm2 still contain j coordinates. 
movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 movapd xmm6, xmm0 movapd xmm7, xmm1 movapd xmm8, xmm2 subsd xmm0, [rsp + nb333_ixH1] subsd xmm1, [rsp + nb333_iyH1] subsd xmm2, [rsp + nb333_izH1] subsd xmm3, [rsp + nb333_ixH2] subsd xmm4, [rsp + nb333_iyH2] subsd xmm5, [rsp + nb333_izH2] subsd xmm6, [rsp + nb333_ixM] subsd xmm7, [rsp + nb333_iyM] subsd xmm8, [rsp + nb333_izM] movapd [rsp + nb333_dxH1], xmm0 movapd [rsp + nb333_dyH1], xmm1 movapd [rsp + nb333_dzH1], xmm2 mulsd xmm0, xmm0 mulsd xmm1, xmm1 mulsd xmm2, xmm2 movapd [rsp + nb333_dxH2], xmm3 movapd [rsp + nb333_dyH2], xmm4 movapd [rsp + nb333_dzH2], xmm5 mulsd xmm3, xmm3 mulsd xmm4, xmm4 mulsd xmm5, xmm5 movapd [rsp + nb333_dxM], xmm6 movapd [rsp + nb333_dyM], xmm7 movapd [rsp + nb333_dzM], xmm8 mulsd xmm6, xmm6 mulsd xmm7, xmm7 mulsd xmm8, xmm8 addsd xmm0, xmm1 addsd xmm0, xmm2 addsd xmm3, xmm4 addsd xmm3, xmm5 addsd xmm6, xmm7 addsd xmm6, xmm8 ;# start doing invsqrt for j atoms cvtsd2ss xmm1, xmm0 cvtsd2ss xmm4, xmm3 cvtsd2ss xmm7, xmm6 rsqrtss xmm1, xmm1 rsqrtss xmm4, xmm4 rsqrtss xmm7, xmm7 cvtss2sd xmm1, xmm1 cvtss2sd xmm4, xmm4 cvtss2sd xmm7, xmm7 movapd xmm2, xmm1 movapd xmm5, xmm4 movapd xmm8, xmm7 mulsd xmm1, xmm1 ;# lu*lu mulsd xmm4, xmm4 ;# lu*lu mulsd xmm7, xmm7 ;# lu*lu movapd xmm9, [rsp + nb333_three] movapd xmm10, xmm9 movapd xmm11, xmm9 mulsd xmm1, xmm0 ;# rsq*lu*lu mulsd xmm4, xmm3 ;# rsq*lu*lu mulsd xmm7, xmm6 ;# rsq*lu*lu subsd xmm9, xmm1 subsd xmm10, xmm4 subsd xmm11, xmm7 ;# 3-rsq*lu*lu mulsd xmm9, xmm2 mulsd xmm10, xmm5 mulsd xmm11, xmm8 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb333_half] mulsd xmm9, xmm15 ;# first iteration for rinvH1 mulsd xmm10, xmm15 ;# first iteration for rinvH2 mulsd xmm11, xmm15 ;# first iteration for rinvM ;# second iteration step movapd xmm2, xmm9 movapd xmm5, xmm10 movapd xmm8, xmm11 mulsd xmm2, xmm2 ;# lu*lu mulsd xmm5, xmm5 ;# lu*lu mulsd xmm8, xmm8 ;# lu*lu movapd xmm1, [rsp + nb333_three] movapd xmm4, xmm1 movapd xmm7, xmm1 mulsd xmm2, xmm0 ;# 
rsq*lu*lu mulsd xmm5, xmm3 ;# rsq*lu*lu mulsd xmm8, xmm6 ;# rsq*lu*lu subsd xmm1, xmm2 subsd xmm4, xmm5 subsd xmm7, xmm8 ;# 3-rsq*lu*lu mulsd xmm9, xmm1 mulsd xmm10, xmm4 mulsd xmm11, xmm7 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb333_half] mulsd xmm9, xmm15 ;# rinvH1 mulsd xmm10, xmm15 ;# rinvH2 mulsd xmm11, xmm15 ;# rinvM movapd [rsp + nb333_rinvH1], xmm9 movapd [rsp + nb333_rinvH2], xmm10 movapd [rsp + nb333_rinvM], xmm11 ;# interactions ;# rsq in xmm0,xmm3,xmm6 ;# rinv in xmm9, xmm10, xmm11 movapd xmm1, [rsp + nb333_tsc] mulsd xmm0, xmm9 ;# r mulsd xmm3, xmm10 mulsd xmm6, xmm11 mulsd xmm0, xmm1 ;# rtab mulsd xmm3, xmm1 mulsd xmm6, xmm1 ;# truncate and convert to integers cvttsd2si r8d, xmm0 cvttsd2si r10d, xmm3 cvttsd2si r12d, xmm6 ;# convert back to float cvtsi2sd xmm2, r8d cvtsi2sd xmm5, r10d cvtsi2sd xmm8, r12d ;# multiply by 4 shl r8d, 2 shl r10d, 2 shl r12d, 2 mov rsi, [rbp + nb333_VFtab] lea r8, [r8 + r8*2] lea r10, [r10 + r10*2] lea r12, [r12 + r12*2] ;# calculate eps subsd xmm0, xmm2 subsd xmm3, xmm5 subsd xmm6, xmm8 movapd xmm12, xmm0 ;# epsH1 movapd xmm13, xmm3 ;# epsH2 movapd xmm14, xmm6 ;# epsM ;# Load LOTS of table data movsd xmm0, [rsi + r8*8] movsd xmm1, [rsi + r8*8 + 8] movsd xmm2, [rsi + r8*8 + 16] movsd xmm3, [rsi + r8*8 + 24] movsd xmm4, [rsi + r10*8] movsd xmm5, [rsi + r10*8 + 8] movsd xmm6, [rsi + r10*8 + 16] movsd xmm7, [rsi + r10*8 + 24] movsd xmm8, [rsi + r12*8] movsd xmm9, [rsi + r12*8 + 8] movsd xmm10, [rsi + r12*8 + 16] movsd xmm11, [rsi + r12*8 + 24] ;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11 mulsd xmm3, xmm12 ;# Heps mulsd xmm7, xmm13 mulsd xmm11, xmm14 mulsd xmm2, xmm12 ;# Geps mulsd xmm6, xmm13 mulsd xmm10, xmm14 mulsd xmm3, xmm12 ;# Heps2 mulsd xmm7, xmm13 mulsd xmm11, xmm14 addsd xmm1, xmm2 ;# F+Geps addsd xmm5, xmm6 addsd xmm9, xmm10 addsd xmm1, xmm3 ;# F+Geps+Heps2 = Fp addsd xmm5, xmm7 addsd xmm9, xmm11 addsd xmm3, xmm3 ;# 2*Heps2 addsd xmm7, xmm7 addsd xmm11, xmm11 addsd xmm3, xmm2 ;# 2*Heps2+Geps addsd 
xmm7, xmm6 addsd xmm11, xmm10 addsd xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps addsd xmm7, xmm5 addsd xmm11, xmm9 mulsd xmm1, xmm12 ;# eps*Fp mulsd xmm5, xmm13 mulsd xmm9, xmm14 movapd xmm12, [rsp + nb333_qqH] movapd xmm13, [rsp + nb333_qqM] addsd xmm1, xmm0 ;# VV addsd xmm5, xmm4 addsd xmm9, xmm8 mulsd xmm1, xmm12 ;# VV*qq = vcoul (qqH for H1/H2, qqM for M) mulsd xmm5, xmm12 mulsd xmm9, xmm13 mulsd xmm3, xmm12 ;# FF*qq = fij mulsd xmm7, xmm12 mulsd xmm11, xmm13 ;# accumulate vctot addsd xmm1, [rsp + nb333_vctot] addsd xmm5, xmm9 addsd xmm1, xmm5 movsd [rsp + nb333_vctot], xmm1 movapd xmm10, [rsp + nb333_tsc] mulsd xmm3, xmm10 ;# fscal mulsd xmm7, xmm10 mulsd xmm10, xmm11 xorpd xmm4, xmm4 xorpd xmm8, xmm8 xorpd xmm11, xmm11 subsd xmm4, xmm3 subsd xmm8, xmm7 subsd xmm11, xmm10 mulsd xmm4, [rsp + nb333_rinvH1] mulsd xmm8, [rsp + nb333_rinvH2] mulsd xmm11, [rsp + nb333_rinvM] ;# move j forces to xmm0-xmm2 mov rdi, [rbp + nb333_faction] movsd xmm0, [rdi + rax*8] movsd xmm1, [rdi + rax*8 + 8] movsd xmm2, [rdi + rax*8 + 16] movapd xmm3, xmm4 movapd xmm5, xmm4 movapd xmm7, xmm8 movapd xmm9, xmm8 movapd xmm10, xmm11 movapd xmm12, xmm11 ;# add forces from O interaction addsd xmm0, [rsp + nb333_fjx] addsd xmm1, [rsp + nb333_fjy] addsd xmm2, [rsp + nb333_fjz] mulsd xmm3, [rsp + nb333_dxH1] mulsd xmm4, [rsp + nb333_dyH1] mulsd xmm5, [rsp + nb333_dzH1] mulsd xmm7, [rsp + nb333_dxH2] mulsd xmm8, [rsp + nb333_dyH2] mulsd xmm9, [rsp + nb333_dzH2] mulsd xmm10, [rsp + nb333_dxM] mulsd xmm11, [rsp + nb333_dyM] mulsd xmm12, [rsp + nb333_dzM] addsd xmm0, xmm3 addsd xmm1, xmm4 addsd xmm2, xmm5 addsd xmm3, [rsp + nb333_fixH1] addsd xmm4, [rsp + nb333_fiyH1] addsd xmm5, [rsp + nb333_fizH1] addsd xmm0, xmm7 addsd xmm1, xmm8 addsd xmm2, xmm9 addsd xmm7, [rsp + nb333_fixH2] addsd xmm8, [rsp + nb333_fiyH2] addsd xmm9, [rsp + nb333_fizH2] addsd xmm0, xmm10 addsd xmm1, xmm11 addsd xmm2, xmm12 addsd xmm10, [rsp + nb333_fixM] addsd xmm11, [rsp + nb333_fiyM] addsd xmm12, [rsp + nb333_fizM] movsd [rsp + nb333_fixH1], xmm3
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?