nb_kernel430_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,666 行 · 第 1/4 页
S
1,666 行
;# ---------------------------------------------------------------------
;# Fragment of the nb430 inner loop (GNU as, Intel syntax, SSE2 doubles).
;# The function prologue, the head of .nb430_unroll_loop and everything
;# after the first instruction of .nb430_updateouterdata are outside
;# this excerpt.
;#
;# Sections visible here:
;#   1. Tail of the unrolled (two j-atoms per pass) loop: table lookup,
;#      cubic-spline evaluation, energy/force/dvda accumulation.
;#   2. .nb430_checksingle - decides whether one odd j-atom remains.
;#   3. .nb430_dosingle    - scalar (low-lane) version of the same math.
;#
;# Register roles on entry to this fragment, as used by the code below:
;#   r12, r13 : GB table offsets for j-atom 1 / 2   (indexed *8)
;#   r14, r15 : VF table offsets for j-atom 1 / 2   (indexed *8)
;#   r8,  r9  : vdwparam offsets for j-atom 1 / 2   (indexed *8)
;#   r10, r11 : faction offsets (j3) for j-atom 1 / 2 (indexed *8)
;#   rax, rbx : j-atom indices used for dvda[]      (indexed *8)
;#   rbp      : base for the nb430_* pointer arguments
;#   rsp      : base for the nb430_* local scratch slots
;# ---------------------------------------------------------------------

        mov     rsi, [rbp + nb430_GBtab]
        mov     rdi, [rbp + nb430_VFtab]

        ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
        movapd  xmm0,  [rsi + r12*8]            ;# Y1c F1c
        movapd  xmm12, [rsi + r13*8]            ;# Y2c F2c
        movapd  xmm4,  [rdi + r14*8]            ;# Y1d F1d
        movapd  xmm13, [rdi + r15*8]            ;# Y2d F2d
        movapd  xmm8,  [rdi + r14*8 + 32]       ;# Y1r F1r
        movapd  xmm14, [rdi + r15*8 + 32]       ;# Y2r F2r
        movapd  xmm1, xmm0
        movapd  xmm5, xmm4
        movapd  xmm9, xmm8
        ;# transpose: gather the same coefficient of both j-atoms in one register
        unpcklpd xmm0, xmm12                    ;# Y1c Y2c
        unpckhpd xmm1, xmm12                    ;# F1c F2c
        unpcklpd xmm4, xmm13                    ;# Y1d Y2d
        unpckhpd xmm5, xmm13                    ;# F1d F2d
        unpcklpd xmm8, xmm14                    ;# Y1r Y2r
        unpckhpd xmm9, xmm14                    ;# F1r F2r

        movapd  xmm2,  [rsi + r12*8 + 16]       ;# G1c H1c
        movapd  xmm12, [rsi + r13*8 + 16]       ;# G2c H2c
        movapd  xmm6,  [rdi + r14*8 + 16]       ;# G1d H1d
        movapd  xmm13, [rdi + r15*8 + 16]       ;# G2d H2d
        movapd  xmm10, [rdi + r14*8 + 48]       ;# G1r H1r
        movapd  xmm14, [rdi + r15*8 + 48]       ;# G2r H2r
        movapd  xmm3, xmm2
        movapd  xmm7, xmm6
        movapd  xmm11, xmm10
        unpcklpd xmm2, xmm12                    ;# G1c G2c
        unpckhpd xmm3, xmm12                    ;# H1c H2c
        unpcklpd xmm6, xmm13                    ;# G1d G2d
        unpckhpd xmm7, xmm13                    ;# H1d H2d
        unpcklpd xmm10, xmm14                   ;# G1r G2r
        unpckhpd xmm11, xmm14                   ;# H1r H2r
        ;# table data ready. Coul GB in xmm0-xmm3, disp in xmm4-xmm7, rep. in xmm8-xmm11

        mov     rdi, [rbp + nb430_vdwparam]
        movapd  xmm12, [rsp + nb430_epsgb]
        movapd  xmm13, [rsp + nb430_eps]

        ;# The three splines (coulomb/GB, dispersion, repulsion) are evaluated
        ;# interleaved; the c6/c12 loads are mixed into the arithmetic.
        mulpd   xmm3, xmm12                     ;# Heps
        mulpd   xmm7, xmm13
        mulpd   xmm11, xmm13
        mulpd   xmm2, xmm12                     ;# Geps
        mulpd   xmm6, xmm13
        mulpd   xmm10, xmm13
        mulpd   xmm3, xmm12                     ;# Heps2
        mulpd   xmm7, xmm13
        mulpd   xmm11, xmm13

        movlpd  xmm14, [rdi + r8*8]             ;# c6  of j-atom 1 (low lane)
        movlpd  xmm15, [rdi + r8*8 + 8]         ;# c12 of j-atom 1 (low lane)

        addpd   xmm1, xmm2                      ;# F+Geps
        addpd   xmm5, xmm6
        addpd   xmm9, xmm10
        addpd   xmm1, xmm3                      ;# F+Geps+Heps2 = Fp
        addpd   xmm5, xmm7
        addpd   xmm9, xmm11
        addpd   xmm3, xmm3                      ;# 2*Heps2
        addpd   xmm7, xmm7
        addpd   xmm11, xmm11

        movhpd  xmm14, [rdi + r9*8]             ;# c6  of j-atom 2 (high lane)
        movhpd  xmm15, [rdi + r9*8 + 8]         ;# c12 of j-atom 2 (high lane)

        addpd   xmm3, xmm2                      ;# 2*Heps2+Geps
        addpd   xmm7, xmm6
        addpd   xmm11, xmm10
        addpd   xmm3, xmm1                      ;# FF = Fp + 2*Heps2 + Geps
        addpd   xmm7, xmm5
        addpd   xmm11, xmm9
        mulpd   xmm1, xmm12                     ;# eps*Fp
        mulpd   xmm5, xmm13
        mulpd   xmm9, xmm13
        addpd   xmm1, xmm0                      ;# VV
        addpd   xmm5, xmm4
        addpd   xmm9, xmm8
        mulpd   xmm1, [rsp + nb430_qq]          ;# VV*qq = vcoul
        mulpd   xmm5, xmm14                     ;# vnb6
        mulpd   xmm9, xmm15                     ;# vnb12
        mulpd   xmm3, [rsp + nb430_qq]          ;# FF*qq = fij
        mulpd   xmm7, xmm14                     ;# fijD
        mulpd   xmm11, xmm15                    ;# fijR

        addpd   xmm11, xmm7                     ;# fijD+fijR
        mulpd   xmm11, [rsp + nb430_tsc]        ;# (fijD+fijR)*tabscale

        ;# accumulate Vvdwtot
        addpd   xmm5, [rsp + nb430_Vvdwtot]
        addpd   xmm5, xmm9
        movapd  [rsp + nb430_Vvdwtot], xmm5

        mov     rsi, [rbp + nb430_dvda]

        ;# Calculate dVda
        mulpd   xmm3, [rsp + nb430_gbscale]     ;# fijC=qq*FF*gbscale
        movapd  xmm6, xmm3
        mulpd   xmm6, [rsp + nb430_r]
        addpd   xmm6, xmm1                      ;# vcoul+fijC*r

        addpd   xmm3, xmm11                     ;# fijC+fijD+fijR

        ;# increment vctot
        addpd   xmm1, [rsp + nb430_vctot]
        movapd  [rsp + nb430_vctot], xmm1

        ;# xmm6=(vcoul+fijC*r); negate it (0 - x) and keep a copy in xmm6
        xorpd   xmm7, xmm7
        subpd   xmm7, xmm6
        movapd  xmm6, xmm7

        ;# the fj's - start by combining forces from memory
        mov     rdi, [rbp + nb430_faction]
        movlpd  xmm0, [rdi + r10*8]
        movlpd  xmm1, [rdi + r10*8 + 8]
        movlpd  xmm2, [rdi + r10*8 + 16]
        movhpd  xmm0, [rdi + r11*8]
        movhpd  xmm1, [rdi + r11*8 + 8]
        movhpd  xmm2, [rdi + r11*8 + 16]

        ;# update dvdasum
        addpd   xmm7, [rsp + nb430_dvdasum]
        movapd  [rsp + nb430_dvdasum], xmm7

        ;# update j atoms dvdaj
        movhlps xmm7, xmm6                      ;# low xmm7 = high lane of xmm6 (j-atom 2)
        addsd   xmm6, [rsi + rax*8]
        addsd   xmm7, [rsi + rbx*8]
        movsd   [rsi + rax*8], xmm6
        movsd   [rsi + rbx*8], xmm7

        ;# scalar force = -(fijC+fijD+fijR)*rinv, then project on dx/dy/dz
        xorpd   xmm4, xmm4
        mulpd   xmm3, [rsp + nb430_rinv]
        subpd   xmm4, xmm3

        movapd  xmm9, xmm4
        movapd  xmm10, xmm4
        movapd  xmm11, xmm4

        mulpd   xmm9, [rsp + nb430_dx]
        mulpd   xmm10, [rsp + nb430_dy]
        mulpd   xmm11, [rsp + nb430_dz]

        addpd   xmm0, xmm9
        addpd   xmm1, xmm10
        addpd   xmm2, xmm11

        ;# accumulate i forces
        addpd   xmm9, [rsp + nb430_fix]
        addpd   xmm10, [rsp + nb430_fiy]
        addpd   xmm11, [rsp + nb430_fiz]
        movlpd  [rdi + r10*8], xmm0
        movlpd  [rdi + r10*8 + 8], xmm1
        movlpd  [rdi + r10*8 + 16], xmm2

        movapd  [rsp + nb430_fix], xmm9
        movapd  [rsp + nb430_fiy], xmm10
        movapd  [rsp + nb430_fiz], xmm11

        movhpd  [rdi + r11*8], xmm0
        movhpd  [rdi + r11*8 + 8], xmm1
        movhpd  [rdi + r11*8 + 16], xmm2

        ;# should we do one more iteration?
        sub     dword ptr [rsp + nb430_innerk], 2
        jl      .nb430_checksingle
        jmp     .nb430_unroll_loop

.nb430_checksingle:
        ;# bit 0 of the (now negative) counter tells whether one j-atom is left
        mov     edx, [rsp + nb430_innerk]
        and     edx, 1
        jnz     .nb430_dosingle
        jmp     .nb430_updateouterdata

.nb430_dosingle:
        ;# NOTE(review): rsi and rdi loaded here are overwritten below before
        ;# being read; rdx is not read anywhere in the visible part of this
        ;# path. Kept as-is - confirm against the code after
        ;# .nb430_updateouterdata before removing.
        mov     rsi, [rbp + nb430_charge]
        mov     rdx, [rbp + nb430_invsqrta]
        mov     rdi, [rbp + nb430_pos]
        mov     rcx, [rsp + nb430_innerjjnr]
        mov     eax, [rcx]                      ;# j-atom index

        ;# load isaj
        mov     rsi, [rbp + nb430_invsqrta]
        movsd   xmm2, [rsi + rax*8]
        mulsd   xmm2, [rsp + nb430_isai]
        movapd  [rsp + nb430_isaprod], xmm2
        movapd  xmm1, xmm2
        mulsd   xmm1, [rsp + nb430_gbtsc]
        movapd  [rsp + nb430_gbscale], xmm1

        mulsd   xmm2, [rsp + nb430_iq]
        mov     rsi, [rbp + nb430_charge]       ;# base of charge[]
        movsd   xmm3, [rsi + rax*8]
        mulsd   xmm3, xmm2
        movapd  [rsp + nb430_qq], xmm3

        mov     rsi, [rbp + nb430_type]
        mov     r8d, [rsi + rax*4]
        mov     rsi, [rbp + nb430_vdwparam]
        shl     r8d, 1
        mov     edi, [rsp + nb430_ntia]
        add     r8d, edi                        ;# r8 = 2*type[j] + ntia

        movsd   xmm4, [rsi + r8*8]
        movsd   xmm6, [rsi + r8*8 + 8]
        movapd  [rsp + nb430_c6], xmm4
        movapd  [rsp + nb430_c12], xmm6

        mov     rsi, [rbp + nb430_pos]          ;# base of pos[]

        lea     r10, [rax + rax*2]              ;# j3
        ;# move coordinate to xmm4-xmm6
        movsd   xmm4, [rsi + r10*8]
        movsd   xmm5, [rsi + r10*8 + 8]
        movsd   xmm6, [rsi + r10*8 + 16]

        ;# NOTE(review): rdi is reloaded with VFtab and later with faction
        ;# again before any read, so this load looks redundant.
        mov     rdi, [rbp + nb430_faction]

        ;# calc dr
        subsd   xmm4, [rsp + nb430_ix]
        subsd   xmm5, [rsp + nb430_iy]
        subsd   xmm6, [rsp + nb430_iz]

        ;# store dr
        movapd  [rsp + nb430_dx], xmm4
        movapd  [rsp + nb430_dy], xmm5
        movapd  [rsp + nb430_dz], xmm6

        ;# square it
        mulsd   xmm4, xmm4
        mulsd   xmm5, xmm5
        mulsd   xmm6, xmm6
        addsd   xmm4, xmm5
        addsd   xmm4, xmm6
        ;# rsq in xmm4

        ;# single-precision rsqrt seed, refined below in double precision by
        ;# two steps of   lu' = half * lu * (three - rsq*lu*lu)
        cvtsd2ss xmm5, xmm4
        rsqrtss xmm5, xmm5
        cvtss2sd xmm2, xmm5                     ;# lu in low xmm2

        ;# lookup seed in xmm2
        movapd  xmm5, xmm2                      ;# copy of lu
        mulsd   xmm2, xmm2                      ;# lu*lu
        movapd  xmm1, [rsp + nb430_three]
        mulsd   xmm2, xmm4                      ;# rsq*lu*lu
        movapd  xmm0, [rsp + nb430_half]
        subsd   xmm1, xmm2                      ;# three-rsq*lu*lu
        mulsd   xmm1, xmm5
        mulsd   xmm1, xmm0                      ;# xmm1=iter1 of rinv (new lu)

        movapd  xmm5, xmm1                      ;# copy of lu
        mulsd   xmm1, xmm1                      ;# lu*lu
        movapd  xmm2, [rsp + nb430_three]
        mulsd   xmm1, xmm4                      ;# rsq*lu*lu
        movapd  xmm0, [rsp + nb430_half]
        subsd   xmm2, xmm1                      ;# three-rsq*lu*lu
        mulsd   xmm2, xmm5
        mulsd   xmm0, xmm2                      ;# xmm0=iter2 of rinv

        mulsd   xmm4, xmm0                      ;# xmm4=r
        movapd  [rsp + nb430_r], xmm4
        movapd  [rsp + nb430_rinv], xmm0

        movapd  xmm8, xmm4                      ;# r
        mulsd   xmm4, [rsp + nb430_gbscale]
        mulsd   xmm8, [rsp + nb430_tsc]

        ;# truncate and convert to integers
        cvttsd2si r12d, xmm4                    ;# gb
        cvttsd2si r14d, xmm8                    ;# lj

        ;# convert back to float
        cvtsi2sd xmm6, r12d                     ;# gb
        cvtsi2sd xmm10, r14d                    ;# lj

        ;# multiply by 4 and 8, respectively
        shl     r12d, 2                         ;# gb
        shl     r14d, 3                         ;# lj

        ;# GB index: r12   LJ index: r14

        ;# calculate eps (fractional part of the scaled distance)
        subsd   xmm4, xmm6                      ;# gb
        subsd   xmm8, xmm10                     ;# lj
        movapd  [rsp + nb430_epsgb], xmm4       ;# gb eps
        movapd  [rsp + nb430_eps], xmm8         ;# lj eps

        mov     rsi, [rbp + nb430_GBtab]
        mov     rdi, [rbp + nb430_VFtab]

        ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
        ;# (movhlps copies the high lane of each pair into the low lane of the
        ;#  next register, so every coefficient sits in a low lane)
        movapd  xmm0, [rsi + r12*8]             ;# Y1c F1c
        movapd  xmm4, [rdi + r14*8]             ;# Y1d F1d
        movapd  xmm8, [rdi + r14*8 + 32]        ;# Y1r F1r
        movhlps xmm1, xmm0
        movhlps xmm5, xmm4
        movhlps xmm9, xmm8

        movapd  xmm2, [rsi + r12*8 + 16]        ;# G1c H1c
        movapd  xmm6, [rdi + r14*8 + 16]        ;# G1d H1d
        movapd  xmm10, [rdi + r14*8 + 48]       ;# G1r H1r
        movhlps xmm3, xmm2
        movhlps xmm7, xmm6
        movhlps xmm11, xmm10
        ;# table data ready. Coul GB in xmm0-xmm3, disp in xmm4-xmm7, rep. in xmm8-xmm11

        movapd  xmm12, [rsp + nb430_epsgb]
        movapd  xmm13, [rsp + nb430_eps]

        mulsd   xmm3, xmm12                     ;# Heps
        mulsd   xmm7, xmm13
        mulsd   xmm11, xmm13
        mulsd   xmm2, xmm12                     ;# Geps
        mulsd   xmm6, xmm13
        mulsd   xmm10, xmm13
        mulsd   xmm3, xmm12                     ;# Heps2
        mulsd   xmm7, xmm13
        mulsd   xmm11, xmm13

        addsd   xmm1, xmm2                      ;# F+Geps
        addsd   xmm5, xmm6
        addsd   xmm9, xmm10
        addsd   xmm1, xmm3                      ;# F+Geps+Heps2 = Fp
        addsd   xmm5, xmm7
        addsd   xmm9, xmm11
        addsd   xmm3, xmm3                      ;# 2*Heps2
        addsd   xmm7, xmm7
        addsd   xmm11, xmm11
        addsd   xmm3, xmm2                      ;# 2*Heps2+Geps
        addsd   xmm7, xmm6
        addsd   xmm11, xmm10
        addsd   xmm3, xmm1                      ;# FF = Fp + 2*Heps2 + Geps
        addsd   xmm7, xmm5
        addsd   xmm11, xmm9
        mulsd   xmm1, xmm12                     ;# eps*Fp
        mulsd   xmm5, xmm13
        mulsd   xmm9, xmm13
        addsd   xmm1, xmm0                      ;# VV
        addsd   xmm5, xmm4
        addsd   xmm9, xmm8
        mulsd   xmm1, [rsp + nb430_qq]          ;# VV*qq = vcoul
        mulsd   xmm5, [rsp + nb430_c6]          ;# vnb6
        mulsd   xmm9, [rsp + nb430_c12]         ;# vnb12
        mulsd   xmm3, [rsp + nb430_qq]          ;# FF*qq = fij
        mulsd   xmm7, [rsp + nb430_c6]          ;# fijD
        mulsd   xmm11, [rsp + nb430_c12]        ;# fijR

        addsd   xmm11, xmm7                     ;# fijD+fijR
        mulsd   xmm11, [rsp + nb430_tsc]        ;# (fijD+fijR)*tabscale

        ;# accumulate Vvdwtot (movsd store: only the low lane of the slot changes)
        addsd   xmm5, [rsp + nb430_Vvdwtot]
        addsd   xmm5, xmm9
        movsd   [rsp + nb430_Vvdwtot], xmm5

        mov     rsi, [rbp + nb430_dvda]

        ;# Calculate dVda
        mulsd   xmm3, [rsp + nb430_gbscale]     ;# fijC=qq*FF*gbscale
        movapd  xmm6, xmm3
        mulsd   xmm6, [rsp + nb430_r]
        addsd   xmm6, xmm1                      ;# vcoul+fijC*r

        addsd   xmm3, xmm11                     ;# fijC+fijD+fijR

        ;# increment vctot
        addsd   xmm1, [rsp + nb430_vctot]
        movsd   [rsp + nb430_vctot], xmm1

        ;# xmm6=(vcoul+fijC*r); negate it (0 - x) and keep a copy in xmm6
        xorpd   xmm7, xmm7
        subsd   xmm7, xmm6
        movapd  xmm6, xmm7

        ;# update dvdasum
        addsd   xmm7, [rsp + nb430_dvdasum]
        movsd   [rsp + nb430_dvdasum], xmm7

        ;# update j atoms dvdaj
        addsd   xmm6, [rsi + rax*8]
        movsd   [rsi + rax*8], xmm6

        ;# scalar force = -(fijC+fijD+fijR)*rinv, then project on dx/dy/dz
        xorpd   xmm4, xmm4
        mulsd   xmm3, [rsp + nb430_rinv]
        subsd   xmm4, xmm3

        movapd  xmm9, xmm4
        movapd  xmm10, xmm4
        movapd  xmm11, xmm4

        mulsd   xmm9, [rsp + nb430_dx]
        mulsd   xmm10, [rsp + nb430_dy]
        mulsd   xmm11, [rsp + nb430_dz]

        movapd  xmm3, xmm9
        movapd  xmm4, xmm10
        movapd  xmm5, xmm11

        ;# accumulate i forces
        addsd   xmm9, [rsp + nb430_fix]
        addsd   xmm10, [rsp + nb430_fiy]
        addsd   xmm11, [rsp + nb430_fiz]
        movsd   [rsp + nb430_fix], xmm9
        movsd   [rsp + nb430_fiy], xmm10
        movsd   [rsp + nb430_fiz], xmm11

        mov     rdi, [rbp + nb430_faction]
        ;# the fj's - start by accumulating forces from memory
        addsd   xmm3, [rdi + r10*8]
        addsd   xmm4, [rdi + r10*8 + 8]
        addsd   xmm5, [rdi + r10*8 + 16]
        movsd   [rdi + r10*8], xmm3
        movsd   [rdi + r10*8 + 8], xmm4
        movsd   [rdi + r10*8 + 16], xmm5

.nb430_updateouterdata:
        mov     ecx, [rsp + nb430_ii3]
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?