nb_kernel331_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,353 行 · 第 1/5 页
S
2,353 行
;# NOTE(review): this chunk starts in the middle of nb_kernel331_x86_64_sse2.
;# The code that sets up xmm0-xmm11, rax, rbp and the nb331_* stack slots is
;# outside this view. Syntax is GNU as Intel syntax, one statement per line,
;# with ";#" introducing comments.

        mulsd xmm11, xmm7 ;# lu*(3-rsq*lu*lu)

        movapd xmm15, [rsp + nb331_half]
        mulsd xmm9, xmm15 ;# rinvO
        mulsd xmm10, xmm15 ;# rinvH1
        mulsd xmm11, xmm15 ;# rinvH2

        movapd [rsp + nb331_rinvO], xmm9
        movapd [rsp + nb331_rinvH1], xmm10
        movapd [rsp + nb331_rinvH2], xmm11

        ;# interactions
        ;# rsq in xmm0,xmm3,xmm6
        ;# rinv in xmm9, xmm10, xmm11

        movapd xmm1, [rsp + nb331_tsc]
        mulsd xmm0, xmm9 ;# r
        mulsd xmm3, xmm10
        mulsd xmm6, xmm11
        mulsd xmm0, xmm1 ;# rtab
        mulsd xmm3, xmm1
        mulsd xmm6, xmm1

        ;# truncate and convert to integers
        cvttsd2si r8d, xmm0
        cvttsd2si r10d, xmm3
        cvttsd2si r12d, xmm6

        ;# convert back to float
        cvtsi2sd xmm2, r8d
        cvtsi2sd xmm5, r10d
        cvtsi2sd xmm8, r12d

        ;# multiply by 4
        shl r8d, 2
        shl r10d, 2
        shl r12d, 2

        ;# multiply by 3 (table index is now 12*n, used below with scale 8)
        lea r8, [r8 + r8*2]
        lea r10, [r10 + r10*2]
        lea r12, [r12 + r12*2]

        mov rsi, [rbp + nb331_VFtab]

        ;# calculate eps
        subsd xmm0, xmm2
        subsd xmm3, xmm5
        subsd xmm6, xmm8

        movapd xmm12, xmm0 ;# eps for the r8-indexed entry
        movapd xmm13, xmm3 ;# eps for the r10-indexed entry
        movapd xmm14, xmm6 ;# eps for the r12-indexed entry

        ;# Load LOTS of table data
        movsd xmm0, [rsi + r8*8]
        movsd xmm1, [rsi + r8*8 + 8]
        movsd xmm2, [rsi + r8*8 + 16]
        movsd xmm3, [rsi + r8*8 + 24]
        movsd xmm4, [rsi + r10*8]
        movsd xmm5, [rsi + r10*8 + 8]
        movsd xmm6, [rsi + r10*8 + 16]
        movsd xmm7, [rsi + r10*8 + 24]
        movsd xmm8, [rsi + r12*8]
        movsd xmm9, [rsi + r12*8 + 8]
        movsd xmm10, [rsi + r12*8 + 16]
        movsd xmm11, [rsi + r12*8 + 24]
        ;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

        mulsd xmm3, xmm12 ;# Heps
        mulsd xmm7, xmm13
        mulsd xmm11, xmm14
        mulsd xmm2, xmm12 ;# Geps
        mulsd xmm6, xmm13
        mulsd xmm10, xmm14
        mulsd xmm3, xmm12 ;# Heps2
        mulsd xmm7, xmm13
        mulsd xmm11, xmm14

        addsd xmm1, xmm2 ;# F+Geps
        addsd xmm5, xmm6
        addsd xmm9, xmm10
        addsd xmm1, xmm3 ;# F+Geps+Heps2 = Fp
        addsd xmm5, xmm7
        addsd xmm9, xmm11
        addsd xmm3, xmm3 ;# 2*Heps2
        addsd xmm7, xmm7
        addsd xmm11, xmm11
        addsd xmm3, xmm2 ;# 2*Heps2+Geps
        addsd xmm7, xmm6
        addsd xmm11, xmm10
        addsd xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps
        addsd xmm7, xmm5
        addsd xmm11, xmm9
        mulsd xmm1, xmm12 ;# eps*Fp
        mulsd xmm5, xmm13
        mulsd xmm9, xmm14
        addsd xmm1, xmm0 ;# VV
        addsd xmm5, xmm4
        addsd xmm9, xmm8
        mulsd xmm1, [rsp + nb331_qqO] ;# VV*qq = vcoul
        mulsd xmm5, [rsp + nb331_qqH]
        mulsd xmm9, [rsp + nb331_qqH]
        mulsd xmm3, [rsp + nb331_qqO] ;# FF*qq = fij
        mulsd xmm7, [rsp + nb331_qqH]
        mulsd xmm11, [rsp + nb331_qqH]

        ;# accumulate vctot
        addsd xmm1, [rsp + nb331_vctot]
        addsd xmm5, xmm9
        addsd xmm1, xmm5
        movsd [rsp + nb331_vctot], xmm1

        movapd xmm2, xmm7
        movapd xmm1, xmm11

        ;# fij coul in xmm3, xmm2, xmm1

        ;# calculate LJ table (entries 32..88 bytes past the r8-indexed record)
        movsd xmm4, [rsi + r8*8 + 32]
        movsd xmm5, [rsi + r8*8 + 40]
        movsd xmm6, [rsi + r8*8 + 48]
        movsd xmm7, [rsi + r8*8 + 56]
        movsd xmm8, [rsi + r8*8 + 64]
        movsd xmm9, [rsi + r8*8 + 72]
        movsd xmm10, [rsi + r8*8 + 80]
        movsd xmm11, [rsi + r8*8 + 88]
        ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

        ;# xmm12 = epsO

        mulsd xmm7, xmm12 ;# Heps
        mulsd xmm11, xmm12
        mulsd xmm6, xmm12 ;# Geps
        mulsd xmm10, xmm12
        mulsd xmm7, xmm12 ;# Heps2
        mulsd xmm11, xmm12
        ;# Scalar add: xmm5/xmm6 were loaded with movsd (upper lane zero) and
        ;# only the low lane is used below.
        addsd xmm5, xmm6 ;# F+Geps
        addsd xmm9, xmm10
        addsd xmm5, xmm7 ;# F+Geps+Heps2 = Fp
        addsd xmm9, xmm11
        addsd xmm7, xmm7 ;# 2*Heps2
        addsd xmm11, xmm11
        addsd xmm7, xmm6 ;# 2*Heps2+Geps
        addsd xmm11, xmm10

        addsd xmm7, xmm5 ;# FF = Fp + 2*Heps2 + Geps
        addsd xmm11, xmm9
        mulsd xmm5, xmm12 ;# eps*Fp
        mulsd xmm9, xmm12
        movapd xmm12, [rsp + nb331_c6]
        movapd xmm13, [rsp + nb331_c12]
        addsd xmm5, xmm4 ;# VV
        addsd xmm9, xmm8

        mulsd xmm5, xmm12 ;# VV*c6 = vnb6
        mulsd xmm9, xmm13 ;# VV*c12 = vnb12
        addsd xmm5, xmm9
        addsd xmm5, [rsp + nb331_Vvdwtot]
        movsd [rsp + nb331_Vvdwtot], xmm5

        mulsd xmm7, xmm12 ;# FF*c6 = fnb6
        mulsd xmm11, xmm13 ;# FF*c12 = fnb12
        addsd xmm7, xmm11

        addsd xmm3, xmm7 ;# add the LJ force term to the xmm3 coulomb term
        movapd xmm10, [rsp + nb331_tsc]

        mulsd xmm3, xmm10 ;# fscal
        mulsd xmm2, xmm10
        mulsd xmm1, xmm10

        ;# move j forces to xmm11-xmm13
        mov rdi, [rbp + nb331_faction]
        movsd xmm11, [rdi + rax*8]
        movsd xmm12, [rdi + rax*8 + 8]
        movsd xmm13, [rdi + rax*8 + 16]

        ;# negate the three scalar forces: xmm0/xmm4/xmm8 = 0 - fscal
        xorpd xmm0, xmm0
        xorpd xmm4, xmm4
        xorpd xmm8, xmm8

        subsd xmm0, xmm3
        subsd xmm4, xmm2
        subsd xmm8, xmm1

        mulsd xmm0, [rsp + nb331_rinvO]
        mulsd xmm4, [rsp + nb331_rinvH1]
        mulsd xmm8, [rsp + nb331_rinvH2]

        ;# replicate each scalar for its x/y/z components
        movapd xmm1, xmm0
        movapd xmm2, xmm0
        movapd xmm3, xmm4
        movapd xmm5, xmm4
        movapd xmm6, xmm8
        movapd xmm7, xmm8

        mulsd xmm0, [rsp + nb331_dxO]
        mulsd xmm1, [rsp + nb331_dyO]
        mulsd xmm2, [rsp + nb331_dzO]
        mulsd xmm3, [rsp + nb331_dxH1]
        mulsd xmm4, [rsp + nb331_dyH1]
        mulsd xmm5, [rsp + nb331_dzH1]
        mulsd xmm6, [rsp + nb331_dxH2]
        mulsd xmm7, [rsp + nb331_dyH2]
        mulsd xmm8, [rsp + nb331_dzH2]

        ;# each component is added to both the j force and the i accumulator
        addsd xmm11, xmm0
        addsd xmm12, xmm1
        addsd xmm13, xmm2
        addsd xmm0, [rsp + nb331_fixO]
        addsd xmm1, [rsp + nb331_fiyO]
        addsd xmm2, [rsp + nb331_fizO]

        addsd xmm11, xmm3
        addsd xmm12, xmm4
        addsd xmm13, xmm5
        addsd xmm3, [rsp + nb331_fixH1]
        addsd xmm4, [rsp + nb331_fiyH1]
        addsd xmm5, [rsp + nb331_fizH1]

        addsd xmm11, xmm6
        addsd xmm12, xmm7
        addsd xmm13, xmm8
        addsd xmm6, [rsp + nb331_fixH2]
        addsd xmm7, [rsp + nb331_fiyH2]
        addsd xmm8, [rsp + nb331_fizH2]

        movsd [rsp + nb331_fixO], xmm0
        movsd [rsp + nb331_fiyO], xmm1
        movsd [rsp + nb331_fizO], xmm2
        movsd [rsp + nb331_fixH1], xmm3
        movsd [rsp + nb331_fiyH1], xmm4
        movsd [rsp + nb331_fizH1], xmm5
        movsd [rsp + nb331_fixH2], xmm6
        movsd [rsp + nb331_fiyH2], xmm7
        movsd [rsp + nb331_fizH2], xmm8

        ;# store back j forces from xmm11-xmm13
        movsd [rdi + rax*8], xmm11
        movsd [rdi + rax*8 + 8], xmm12
        movsd [rdi + rax*8 + 16], xmm13

;# Outer-loop bookkeeping: sum the two lanes of every i-force accumulator,
;# subtract the sums from faction[] at index ii3 and from fshift[] at index
;# is3, add vctot/Vvdwtot into Vc[ggid]/Vvdw[ggid], then loop or finish.
.nb331_updateouterdata:
        mov ecx, [rsp + nb331_ii3]
        mov rdi, [rbp + nb331_faction]
        mov rsi, [rbp + nb331_fshift]
        mov edx, [rsp + nb331_is3]

        ;# accumulate Oi forces in xmm0, xmm1, xmm2
        movapd xmm0, [rsp + nb331_fixO]
        movapd xmm1, [rsp + nb331_fiyO]
        movapd xmm2, [rsp + nb331_fizO]

        movhlps xmm3, xmm0
        movhlps xmm4, xmm1
        movhlps xmm5, xmm2
        addsd xmm0, xmm3
        addsd xmm1, xmm4
        addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2

        ;# increment i force (movsd from memory overwrites all of xmm3-xmm5)
        movsd xmm3, [rdi + rcx*8]
        movsd xmm4, [rdi + rcx*8 + 8]
        movsd xmm5, [rdi + rcx*8 + 16]
        subsd xmm3, xmm0
        subsd xmm4, xmm1
        subsd xmm5, xmm2
        movsd [rdi + rcx*8], xmm3
        movsd [rdi + rcx*8 + 8], xmm4
        movsd [rdi + rcx*8 + 16], xmm5

        ;# accumulate force in xmm6/xmm7 for fshift
        movapd xmm6, xmm0
        movsd xmm7, xmm2
        unpcklpd xmm6, xmm1 ;# xmm6 = (x sum, y sum), xmm7 low = z sum

        ;# accumulate H1i forces in xmm0, xmm1, xmm2
        movapd xmm0, [rsp + nb331_fixH1]
        movapd xmm1, [rsp + nb331_fiyH1]
        movapd xmm2, [rsp + nb331_fizH1]

        movhlps xmm3, xmm0
        movhlps xmm4, xmm1
        movhlps xmm5, xmm2
        addsd xmm0, xmm3
        addsd xmm1, xmm4
        addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2

        ;# increment i force
        movsd xmm3, [rdi + rcx*8 + 24]
        movsd xmm4, [rdi + rcx*8 + 32]
        movsd xmm5, [rdi + rcx*8 + 40]
        subsd xmm3, xmm0
        subsd xmm4, xmm1
        subsd xmm5, xmm2
        movsd [rdi + rcx*8 + 24], xmm3
        movsd [rdi + rcx*8 + 32], xmm4
        movsd [rdi + rcx*8 + 40], xmm5

        ;# accumulate force in xmm6/xmm7 for fshift
        addsd xmm7, xmm2
        unpcklpd xmm0, xmm1
        addpd xmm6, xmm0

        ;# accumulate H2i forces in xmm0, xmm1, xmm2
        movapd xmm0, [rsp + nb331_fixH2]
        movapd xmm1, [rsp + nb331_fiyH2]
        movapd xmm2, [rsp + nb331_fizH2]

        movhlps xmm3, xmm0
        movhlps xmm4, xmm1
        movhlps xmm5, xmm2
        addsd xmm0, xmm3
        addsd xmm1, xmm4
        addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2

        ;# increment i force (movsd from memory overwrites all of xmm3-xmm5)
        movsd xmm3, [rdi + rcx*8 + 48]
        movsd xmm4, [rdi + rcx*8 + 56]
        movsd xmm5, [rdi + rcx*8 + 64]
        subsd xmm3, xmm0
        subsd xmm4, xmm1
        subsd xmm5, xmm2
        movsd [rdi + rcx*8 + 48], xmm3
        movsd [rdi + rcx*8 + 56], xmm4
        movsd [rdi + rcx*8 + 64], xmm5

        ;# accumulate force in xmm6/xmm7 for fshift
        addsd xmm7, xmm2
        unpcklpd xmm0, xmm1
        addpd xmm6, xmm0

        ;# increment fshift force
        movlpd xmm3, [rsi + rdx*8]
        movhpd xmm3, [rsi + rdx*8 + 8]
        movsd xmm4, [rsi + rdx*8 + 16]
        subpd xmm3, xmm6
        subsd xmm4, xmm7
        movlpd [rsi + rdx*8], xmm3
        movhpd [rsi + rdx*8 + 8], xmm3
        movsd [rsi + rdx*8 + 16], xmm4

        ;# get n from stack
        mov esi, [rsp + nb331_n]
        ;# get group index for i particle
        mov rdx, [rbp + nb331_gid] ;# base of gid[]
        mov edx, [rdx + rsi*4] ;# ggid=gid[n]

        ;# accumulate total potential energy and update it
        movapd xmm7, [rsp + nb331_vctot]
        ;# accumulate
        movhlps xmm6, xmm7
        addsd xmm7, xmm6 ;# low xmm7 has the sum now

        ;# add earlier value from mem
        mov rax, [rbp + nb331_Vc]
        addsd xmm7, [rax + rdx*8]
        ;# move back to mem
        movsd [rax + rdx*8], xmm7

        ;# accumulate total lj energy and update it
        movapd xmm7, [rsp + nb331_Vvdwtot]
        ;# accumulate
        movhlps xmm6, xmm7
        addsd xmm7, xmm6 ;# low xmm7 has the sum now

        ;# add earlier value from mem
        mov rax, [rbp + nb331_Vvdw]
        addsd xmm7, [rax + rdx*8]
        ;# move back to mem
        movsd [rax + rdx*8], xmm7

        ;# finish if last
        mov ecx, [rsp + nb331_nn1]
        ;# esi already loaded with n
        inc esi
        sub ecx, esi
        jz .nb331_outerend

        ;# not last, iterate outer loop once more!
        mov [rsp + nb331_n], esi
        jmp .nb331_outer

.nb331_outerend:
        ;# check if more outer neighborlists remain
        mov ecx, [rsp + nb331_nri]
        ;# esi already loaded with n above
        sub ecx, esi
        jz .nb331_end
        ;# non-zero, do one more workunit
        jmp .nb331_threadloop

;# Write the nouter/ninner counters through the outeriter/inneriter pointers,
;# release the 872-byte local frame and restore the pushed registers.
.nb331_end:
        mov eax, [rsp + nb331_nouter]
        mov ebx, [rsp + nb331_ninner]
        mov rcx, [rbp + nb331_outeriter]
        mov rdx, [rbp + nb331_inneriter]
        mov [rcx], eax
        mov [rdx], ebx

        add rsp, 872
        emms

        pop r15
        pop r14
        pop r13
        pop r12
        pop rbx
        pop rbp
        ret


;# NOTE(review): only the symbol declarations and the first part of the
;# offset tables of nb_kernel331nf_x86_64_sse2 are visible in this chunk.
.globl nb_kernel331nf_x86_64_sse2
.globl _nb_kernel331nf_x86_64_sse2
nb_kernel331nf_x86_64_sse2:
_nb_kernel331nf_x86_64_sse2:
;# Room for return address and rbp (16 bytes)
.equiv nb331nf_fshift, 16
.equiv nb331nf_gid, 24
.equiv nb331nf_pos, 32
.equiv nb331nf_faction, 40
.equiv nb331nf_charge, 48
.equiv nb331nf_p_facel, 56
.equiv nb331nf_argkrf, 64
.equiv nb331nf_argcrf, 72
.equiv nb331nf_Vc, 80
.equiv nb331nf_type, 88
.equiv nb331nf_p_ntype, 96
.equiv nb331nf_vdwparam, 104
.equiv nb331nf_Vvdw, 112
.equiv nb331nf_p_tabscale, 120
.equiv nb331nf_VFtab, 128
.equiv nb331nf_invsqrta, 136
.equiv nb331nf_dvda, 144
.equiv nb331nf_p_gbtabscale, 152
.equiv nb331nf_GBtab, 160
.equiv nb331nf_p_nthreads, 168
.equiv nb331nf_count, 176
.equiv nb331nf_mtx, 184
.equiv nb331nf_outeriter, 192
.equiv nb331nf_inneriter, 200
.equiv nb331nf_work, 208
        ;# stack offsets for local variables
        ;# bottom of stack is cache-aligned for sse use
.equiv nb331nf_ixO, 0
.equiv nb331nf_iyO, 16
.equiv nb331nf_izO, 32
.equiv nb331nf_ixH1, 48
.equiv nb331nf_iyH1, 64
.equiv nb331nf_izH1, 80
.equiv nb331nf_ixH2, 96
.equiv nb331nf_iyH2, 112
.equiv nb331nf_izH2, 128
.equiv nb331nf_iqO, 144
.equiv nb331nf_iqH, 160
.equiv nb331nf_qqO, 176
.equiv nb331nf_qqH, 192
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?