nb_kernel133_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,375 行 · 第 1/5 页
S
2,375 行
;# ---------------------------------------------------------------------------
;# nb133 kernel fragment (GNU as, .intel_syntax, SSE2 scalar-double path).
;#
;# NOTE(review): this listing starts in the middle of the routine (inside an
;# inverse-square-root refinement) and stops after the first "pop" of the
;# epilogue; the prologue, the remaining epilogue and the targets
;# .nb133_outer / .nb133_threadloop are outside this fragment.
;#
;# The statements below were restored to one instruction per line. In GNU as
;# ';' ends a statement and '#' starts a comment that runs to end of line, so
;# in the collapsed form every instruction following the first ";#" on a
;# physical line was treated as comment text, and the three labels were fused
;# onto the preceding operand.
;#
;# Visible stages:
;#   1. Two Newton-Raphson refinements of rsqrtss seeds; results are stored to
;#      nb133_rinvH2 and nb133_rinvM, and the third result is kept in xmm7.
;#   2. Table lookups at [rsi + rbx*8 + 0..56] scaled by nb133_c6 / nb133_c12,
;#      accumulated into nb133_Vvdwtot, with the scalar force in xmm3.
;#   3. 1/r * qq terms for H1, H2 and M accumulated into nb133_vctot, with
;#      per-site i-force accumulators and the j force updated.
;#   4. .nb133_updateouterdata: reduce accumulators, update faction, fshift,
;#      Vc[] and Vvdw[], then loop or fall through to the exit.
;# ---------------------------------------------------------------------------

    ;# continue refinement of the lookup seed lu in xmm2 (rsq in xmm5)
    movapd xmm3, xmm2
    mulsd  xmm2, xmm2
    movapd xmm1, [rsp + nb133_three]
    mulsd  xmm2, xmm5                   ;# rsq*lu*lu
    subsd  xmm1, xmm2                   ;# 3-rsq*lu*lu
    mulsd  xmm1, xmm3                   ;# lu*(3-rsq*lu*lu)
    mulsd  xmm1, [rsp + nb133_half]     ;# iter1 (new lu)

    movapd xmm3, xmm1
    mulsd  xmm1, xmm1                   ;# lu*lu
    mulsd  xmm5, xmm1                   ;# rsq*lu*lu
    movapd xmm1, [rsp + nb133_three]
    subsd  xmm1, xmm5                   ;# 3-rsq*lu*lu
    mulsd  xmm1, xmm3                   ;# lu*(3-rsq*lu*lu)
    mulsd  xmm1, [rsp + nb133_half]     ;# rinv
    movapd [rsp + nb133_rinvH2], xmm1

    ;# rsqM - seed in xmm2
    cvtsd2ss xmm2, xmm4
    rsqrtss  xmm2, xmm2
    cvtss2sd xmm2, xmm2

    movapd xmm3, xmm2
    mulsd  xmm2, xmm2
    movapd xmm1, [rsp + nb133_three]
    mulsd  xmm2, xmm4                   ;# rsq*lu*lu
    subsd  xmm1, xmm2                   ;# 3-rsq*lu*lu
    mulsd  xmm1, xmm3                   ;# lu*(3-rsq*lu*lu)
    mulsd  xmm1, [rsp + nb133_half]     ;# iter1 (new lu)

    movapd xmm3, xmm1
    mulsd  xmm1, xmm1                   ;# lu*lu
    mulsd  xmm4, xmm1                   ;# rsq*lu*lu
    movapd xmm1, [rsp + nb133_three]
    subsd  xmm1, xmm4                   ;# 3-rsq*lu*lu
    mulsd  xmm1, xmm3                   ;# lu*(3-rsq*lu*lu)
    mulsd  xmm1, [rsp + nb133_half]     ;# rinv
    movapd [rsp + nb133_rinvM], xmm1

    ;# rsqO - put seed in xmm2
    cvtsd2ss xmm2, xmm7
    rsqrtss  xmm2, xmm2
    cvtss2sd xmm2, xmm2

    movsd  xmm3, xmm2
    mulsd  xmm2, xmm2
    movsd  xmm4, [rsp + nb133_three]
    mulsd  xmm2, xmm7                   ;# rsq*lu*lu
    subsd  xmm4, xmm2                   ;# 3-rsq*lu*lu
    mulsd  xmm4, xmm3                   ;# lu*(3-rsq*lu*lu)
    mulsd  xmm4, [rsp + nb133_half]     ;# iter1 (new lu)

    movsd  xmm3, xmm4
    mulsd  xmm4, xmm4                   ;# lu*lu
    mulsd  xmm7, xmm4                   ;# rsq*lu*lu
    movsd  xmm4, [rsp + nb133_three]
    subsd  xmm4, xmm7                   ;# 3-rsq*lu*lu
    mulsd  xmm4, xmm3                   ;# lu*(3-rsq*lu*lu)
    mulsd  xmm4, [rsp + nb133_half]     ;# rinv
    movsd  xmm7, xmm4                   ;# rinvO in xmm7

    movsd  xmm4, [rsp + nb133_rsqO]
    movapd xmm0, xmm7                   ;# keep rinvO in xmm0 for the final force scaling

    ;# LJ table interaction.
    mulsd  xmm4, xmm7                   ;# xmm4=r
    mulsd  xmm4, [rsp + nb133_tsc]

    cvttsd2si ebx, xmm4                 ;# ebx = lu idx
    cvtsi2sd  xmm5, ebx
    subpd  xmm4, xmm5
    movapd xmm1, xmm4                   ;# xmm1=eps
    movapd xmm2, xmm1
    mulpd  xmm2, xmm2                   ;# xmm2=eps2

    shl    ebx, 3                       ;# idx*8; combined with the *8 scale below
    mov    rsi, [rbp + nb133_VFtab]

    ;# dispersion
    movlpd xmm4, [rsi + rbx*8]          ;# Y1
    movhpd xmm4, [rsi + rbx*8 + 8]      ;# Y1 F1
    movapd xmm5, xmm4
    unpcklpd xmm4, xmm3                 ;# Y1 Y2
    unpckhpd xmm5, xmm3                 ;# F1 F2

    movlpd xmm6, [rsi + rbx*8 + 16]     ;# G1
    movhpd xmm6, [rsi + rbx*8 + 24]     ;# G1 H1
    movapd xmm7, xmm6
    unpcklpd xmm6, xmm3                 ;# G1 G2
    unpckhpd xmm7, xmm3                 ;# H1 H2
    ;# dispersion table ready, in xmm4-xmm7
    mulsd  xmm6, xmm1                   ;# xmm6=Geps
    mulsd  xmm7, xmm2                   ;# xmm7=Heps2
    addsd  xmm5, xmm6
    addsd  xmm5, xmm7                   ;# xmm5=Fp
    mulsd  xmm7, [rsp + nb133_two]      ;# two*Heps2
    addsd  xmm7, xmm6
    addsd  xmm7, xmm5                   ;# xmm7=FF
    mulsd  xmm5, xmm1                   ;# xmm5=eps*Fp
    addsd  xmm5, xmm4                   ;# xmm5=VV

    movsd  xmm4, [rsp + nb133_c6]
    mulsd  xmm7, xmm4                   ;# fijD
    mulsd  xmm5, xmm4                   ;# Vvdw6

    ;# put scalar force on stack. Update Vvdwtot directly
    addsd  xmm5, [rsp + nb133_Vvdwtot]
    xorpd  xmm3, xmm3
    mulsd  xmm7, [rsp + nb133_tsc]
    subsd  xmm3, xmm7                   ;# xmm3 = -fijD*tsc
    movsd  [rsp + nb133_fstmp], xmm3
    movsd  [rsp + nb133_Vvdwtot], xmm5

    ;# repulsion
    movlpd xmm4, [rsi + rbx*8 + 32]     ;# Y1
    movhpd xmm4, [rsi + rbx*8 + 40]     ;# Y1 F1
    movapd xmm5, xmm4
    unpcklpd xmm4, xmm3                 ;# Y1 Y2
    unpckhpd xmm5, xmm3                 ;# F1 F2

    movlpd xmm6, [rsi + rbx*8 + 48]     ;# G1
    movhpd xmm6, [rsi + rbx*8 + 56]     ;# G1 H1
    movapd xmm7, xmm6
    unpcklpd xmm6, xmm3                 ;# G1 G2
    unpckhpd xmm7, xmm3                 ;# H1 H2

    ;# table ready, in xmm4-xmm7
    mulsd  xmm6, xmm1                   ;# xmm6=Geps
    mulsd  xmm7, xmm2                   ;# xmm7=Heps2
    addsd  xmm5, xmm6
    addsd  xmm5, xmm7                   ;# xmm5=Fp
    mulsd  xmm7, [rsp + nb133_two]      ;# two*Heps2
    addsd  xmm7, xmm6
    addsd  xmm7, xmm5                   ;# xmm7=FF
    mulsd  xmm5, xmm1                   ;# xmm5=eps*Fp
    addsd  xmm5, xmm4                   ;# xmm5=VV

    movsd  xmm4, [rsp + nb133_c12]
    mulsd  xmm7, xmm4
    mulsd  xmm5, xmm4

    addsd  xmm5, [rsp + nb133_Vvdwtot]
    movsd  xmm3, [rsp + nb133_fstmp]
    mulsd  xmm7, [rsp + nb133_tsc]
    subsd  xmm3, xmm7
    movsd  [rsp + nb133_Vvdwtot], xmm5

    mulsd  xmm3, xmm0                   ;# scale accumulated force by rinvO saved in xmm0

    movsd  xmm0, [rsp + nb133_dxO]
    movsd  xmm1, [rsp + nb133_dyO]
    movsd  xmm2, [rsp + nb133_dzO]

    mov    rdi, [rbp + nb133_faction]
    mulsd  xmm0, xmm3
    mulsd  xmm1, xmm3
    mulsd  xmm2, xmm3

    ;# update O forces
    movapd xmm3, [rsp + nb133_fixO]
    movapd xmm4, [rsp + nb133_fiyO]
    movapd xmm7, [rsp + nb133_fizO]
    addsd  xmm3, xmm0
    addsd  xmm4, xmm1
    addsd  xmm7, xmm2
    movsd  [rsp + nb133_fixO], xmm3
    movsd  [rsp + nb133_fiyO], xmm4
    movsd  [rsp + nb133_fizO], xmm7
    ;# update j forces with water O
    movsd  [rsp + nb133_fjx], xmm0
    movsd  [rsp + nb133_fjy], xmm1
    movsd  [rsp + nb133_fjz], xmm2

    ;# H1 interactions
    movsd  xmm6, [rsp + nb133_rinvH1]
    movsd  xmm4, xmm6
    mulsd  xmm4, xmm4                   ;# xmm6=rinv, xmm4=rinvsq
    mulsd  xmm6, [rsp + nb133_qqH]      ;# vcoul
    mulsd  xmm4, xmm6                   ;# fscal
    addsd  xmm6, [rsp + nb133_vctot]
    movsd  [rsp + nb133_vctot], xmm6

    movapd xmm0, [rsp + nb133_dxH1]
    movapd xmm1, [rsp + nb133_dyH1]
    movapd xmm2, [rsp + nb133_dzH1]
    mulsd  xmm0, xmm4
    mulsd  xmm1, xmm4
    mulsd  xmm2, xmm4

    ;# update H1 forces
    movapd xmm3, [rsp + nb133_fixH1]
    movapd xmm4, [rsp + nb133_fiyH1]
    movapd xmm7, [rsp + nb133_fizH1]
    addsd  xmm3, xmm0
    addsd  xmm4, xmm1
    addsd  xmm7, xmm2
    movsd  [rsp + nb133_fixH1], xmm3
    movsd  [rsp + nb133_fiyH1], xmm4
    movsd  [rsp + nb133_fizH1], xmm7
    ;# update j forces with water H1
    addsd  xmm0, [rsp + nb133_fjx]
    addsd  xmm1, [rsp + nb133_fjy]
    addsd  xmm2, [rsp + nb133_fjz]
    movsd  [rsp + nb133_fjx], xmm0
    movsd  [rsp + nb133_fjy], xmm1
    movsd  [rsp + nb133_fjz], xmm2

    ;# H2 interactions
    movsd  xmm6, [rsp + nb133_rinvH2]
    movsd  xmm4, xmm6
    mulsd  xmm4, xmm4                   ;# xmm6=rinv, xmm4=rinvsq
    mulsd  xmm6, [rsp + nb133_qqH]      ;# vcoul
    mulsd  xmm4, xmm6                   ;# fscal
    addsd  xmm6, [rsp + nb133_vctot]
    movsd  [rsp + nb133_vctot], xmm6

    movapd xmm0, [rsp + nb133_dxH2]
    movapd xmm1, [rsp + nb133_dyH2]
    movapd xmm2, [rsp + nb133_dzH2]
    mulsd  xmm0, xmm4
    mulsd  xmm1, xmm4
    mulsd  xmm2, xmm4

    ;# update H2 forces
    movapd xmm3, [rsp + nb133_fixH2]
    movapd xmm4, [rsp + nb133_fiyH2]
    movapd xmm7, [rsp + nb133_fizH2]
    addsd  xmm3, xmm0
    addsd  xmm4, xmm1
    addsd  xmm7, xmm2
    movsd  [rsp + nb133_fixH2], xmm3
    movsd  [rsp + nb133_fiyH2], xmm4
    movsd  [rsp + nb133_fizH2], xmm7
    ;# update j forces with water H2
    addsd  xmm0, [rsp + nb133_fjx]
    addsd  xmm1, [rsp + nb133_fjy]
    addsd  xmm2, [rsp + nb133_fjz]
    movsd  [rsp + nb133_fjx], xmm0
    movsd  [rsp + nb133_fjy], xmm1
    movsd  [rsp + nb133_fjz], xmm2

    ;# M interactions
    movsd  xmm6, [rsp + nb133_rinvM]
    movsd  xmm4, xmm6
    mulsd  xmm4, xmm4                   ;# xmm6=rinv, xmm4=rinvsq
    mulsd  xmm6, [rsp + nb133_qqM]      ;# vcoul
    mulsd  xmm4, xmm6                   ;# fscal
    addsd  xmm6, [rsp + nb133_vctot]
    movsd  [rsp + nb133_vctot], xmm6

    movapd xmm0, [rsp + nb133_dxM]
    movapd xmm1, [rsp + nb133_dyM]
    movapd xmm2, [rsp + nb133_dzM]
    mulsd  xmm0, xmm4
    mulsd  xmm1, xmm4
    mulsd  xmm2, xmm4

    ;# update M forces
    movapd xmm3, [rsp + nb133_fixM]
    movapd xmm4, [rsp + nb133_fiyM]
    movapd xmm7, [rsp + nb133_fizM]
    addsd  xmm3, xmm0
    addsd  xmm4, xmm1
    addsd  xmm7, xmm2
    movsd  [rsp + nb133_fixM], xmm3
    movsd  [rsp + nb133_fiyM], xmm4
    movsd  [rsp + nb133_fizM], xmm7

    mov    rdi, [rbp + nb133_faction]
    ;# update j forces
    addsd  xmm0, [rsp + nb133_fjx]
    addsd  xmm1, [rsp + nb133_fjy]
    addsd  xmm2, [rsp + nb133_fjz]

    ;# read-modify-write of three consecutive doubles at faction + rax*8
    movlpd xmm3, [rdi + rax*8]
    movlpd xmm4, [rdi + rax*8 + 8]
    movlpd xmm5, [rdi + rax*8 + 16]
    addsd  xmm3, xmm0
    addsd  xmm4, xmm1
    addsd  xmm5, xmm2
    movlpd [rdi + rax*8], xmm3
    movlpd [rdi + rax*8 + 8], xmm4
    movlpd [rdi + rax*8 + 16], xmm5

.nb133_updateouterdata:
    mov    ecx, [rsp + nb133_ii3]
    mov    rdi, [rbp + nb133_faction]
    mov    rsi, [rbp + nb133_fshift]
    mov    edx, [rsp + nb133_is3]

    ;# accumulate Oi forces in xmm0, xmm1, xmm2
    movapd xmm0, [rsp + nb133_fixO]
    movapd xmm1, [rsp + nb133_fiyO]
    movapd xmm2, [rsp + nb133_fizO]

    ;# fold the high double of each accumulator onto the low double
    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd  xmm0, xmm3
    addsd  xmm1, xmm4
    addsd  xmm2, xmm5                   ;# sum is in low xmm0-xmm2

    ;# increment i force (the accumulated value is subtracted from memory)
    movsd  xmm3, [rdi + rcx*8]
    movsd  xmm4, [rdi + rcx*8 + 8]
    movsd  xmm5, [rdi + rcx*8 + 16]
    subsd  xmm3, xmm0
    subsd  xmm4, xmm1
    subsd  xmm5, xmm2
    movsd  [rdi + rcx*8], xmm3
    movsd  [rdi + rcx*8 + 8], xmm4
    movsd  [rdi + rcx*8 + 16], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    movapd xmm6, xmm0
    movsd  xmm7, xmm2
    unpcklpd xmm6, xmm1                 ;# xmm6 = (x, y), xmm7 low = z

    ;# accumulate H1i forces in xmm0, xmm1, xmm2
    movapd xmm0, [rsp + nb133_fixH1]
    movapd xmm1, [rsp + nb133_fiyH1]
    movapd xmm2, [rsp + nb133_fizH1]

    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd  xmm0, xmm3
    addsd  xmm1, xmm4
    addsd  xmm2, xmm5                   ;# sum is in low xmm0-xmm2

    ;# increment i force
    movsd  xmm3, [rdi + rcx*8 + 24]
    movsd  xmm4, [rdi + rcx*8 + 32]
    movsd  xmm5, [rdi + rcx*8 + 40]
    subsd  xmm3, xmm0
    subsd  xmm4, xmm1
    subsd  xmm5, xmm2
    movsd  [rdi + rcx*8 + 24], xmm3
    movsd  [rdi + rcx*8 + 32], xmm4
    movsd  [rdi + rcx*8 + 40], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    addsd  xmm7, xmm2
    unpcklpd xmm0, xmm1
    addpd  xmm6, xmm0

    ;# accumulate H2i forces in xmm0, xmm1, xmm2
    movapd xmm0, [rsp + nb133_fixH2]
    movapd xmm1, [rsp + nb133_fiyH2]
    movapd xmm2, [rsp + nb133_fizH2]

    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd  xmm0, xmm3
    addsd  xmm1, xmm4
    addsd  xmm2, xmm5                   ;# sum is in low xmm0-xmm2

    ;# increment i force
    movsd  xmm3, [rdi + rcx*8 + 48]
    movsd  xmm4, [rdi + rcx*8 + 56]
    movsd  xmm5, [rdi + rcx*8 + 64]
    subsd  xmm3, xmm0
    subsd  xmm4, xmm1
    subsd  xmm5, xmm2
    movsd  [rdi + rcx*8 + 48], xmm3
    movsd  [rdi + rcx*8 + 56], xmm4
    movsd  [rdi + rcx*8 + 64], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    addsd  xmm7, xmm2
    unpcklpd xmm0, xmm1
    addpd  xmm6, xmm0

    ;# accumulate Mi forces in xmm0, xmm1, xmm2
    movapd xmm0, [rsp + nb133_fixM]
    movapd xmm1, [rsp + nb133_fiyM]
    movapd xmm2, [rsp + nb133_fizM]

    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd  xmm0, xmm3
    addsd  xmm1, xmm4
    addsd  xmm2, xmm5                   ;# sum is in low xmm0-xmm2

    ;# increment i force
    movsd  xmm3, [rdi + rcx*8 + 72]
    movsd  xmm4, [rdi + rcx*8 + 80]
    movsd  xmm5, [rdi + rcx*8 + 88]
    subsd  xmm3, xmm0
    subsd  xmm4, xmm1
    subsd  xmm5, xmm2
    movsd  [rdi + rcx*8 + 72], xmm3
    movsd  [rdi + rcx*8 + 80], xmm4
    movsd  [rdi + rcx*8 + 88], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    addsd  xmm7, xmm2
    unpcklpd xmm0, xmm1
    addpd  xmm6, xmm0

    ;# increment fshift force
    movlpd xmm3, [rsi + rdx*8]
    movhpd xmm3, [rsi + rdx*8 + 8]
    movsd  xmm4, [rsi + rdx*8 + 16]
    subpd  xmm3, xmm6
    subsd  xmm4, xmm7
    movlpd [rsi + rdx*8], xmm3
    movhpd [rsi + rdx*8 + 8], xmm3
    movsd  [rsi + rdx*8 + 16], xmm4

    ;# get n from stack
    mov    esi, [rsp + nb133_n]
    ;# get group index for i particle
    mov    rdx, [rbp + nb133_gid]       ;# base of gid[]
    mov    edx, [rdx + rsi*4]           ;# ggid=gid[n]

    ;# accumulate total potential energy and update it
    movapd xmm7, [rsp + nb133_vctot]
    ;# accumulate
    movhlps xmm6, xmm7
    addsd  xmm7, xmm6                   ;# low xmm7 has the sum now

    ;# add earlier value from mem
    mov    rax, [rbp + nb133_Vc]
    addsd  xmm7, [rax + rdx*8]
    ;# move back to mem
    movsd  [rax + rdx*8], xmm7

    ;# accumulate total lj energy and update it
    movapd xmm7, [rsp + nb133_Vvdwtot]
    ;# accumulate
    movhlps xmm6, xmm7
    addsd  xmm7, xmm6                   ;# low xmm7 has the sum now

    ;# add earlier value from mem
    mov    rax, [rbp + nb133_Vvdw]
    addsd  xmm7, [rax + rdx*8]
    ;# move back to mem
    movsd  [rax + rdx*8], xmm7

    ;# finish if last
    mov    ecx, [rsp + nb133_nn1]
    ;# esi already loaded with n
    inc    esi
    sub    ecx, esi
    jz     .nb133_outerend

    ;# not last, iterate outer loop once more!
    mov    [rsp + nb133_n], esi
    jmp    .nb133_outer

.nb133_outerend:
    ;# check if more outer neighborlists remain
    mov    ecx, [rsp + nb133_nri]
    ;# esi already loaded with n above
    sub    ecx, esi
    jz     .nb133_end
    ;# non-zero, do one more workunit
    jmp    .nb133_threadloop

.nb133_end:
    ;# store the iteration counters through the caller-supplied pointers
    mov    eax, [rsp + nb133_nouter]
    mov    ebx, [rsp + nb133_ninner]
    mov    rcx, [rbp + nb133_outeriter]
    mov    rdx, [rbp + nb133_inneriter]
    mov    [rcx], eax
    mov    [rdx], ebx

    add    rsp, 1080                    ;# release the local stack area
    emms

    pop    r15
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?