nb_kernel113_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,046 行 · 第 1/4 页
S
2,046 行
mulpd xmm2, xmm2 movapd [rsp + nb113_dxH2], xmm3 movapd [rsp + nb113_dyH2], xmm4 movapd [rsp + nb113_dzH2], xmm5 mulpd xmm3, xmm3 mulpd xmm4, xmm4 mulpd xmm5, xmm5 movapd [rsp + nb113_dxM], xmm6 movapd [rsp + nb113_dyM], xmm7 movapd [rsp + nb113_dzM], xmm8 mulpd xmm6, xmm6 mulpd xmm7, xmm7 mulpd xmm8, xmm8 addpd xmm0, xmm1 addpd xmm0, xmm2 addpd xmm3, xmm4 addpd xmm3, xmm5 addpd xmm6, xmm7 addpd xmm6, xmm8 ;# start doing invsqrt for j atoms cvtpd2ps xmm1, xmm0 cvtpd2ps xmm4, xmm3 cvtpd2ps xmm7, xmm6 rsqrtps xmm1, xmm1 rsqrtps xmm4, xmm4 rsqrtps xmm7, xmm7 cvtps2pd xmm1, xmm1 cvtps2pd xmm4, xmm4 cvtps2pd xmm7, xmm7 movapd xmm2, xmm1 movapd xmm5, xmm4 movapd xmm8, xmm7 mulpd xmm1, xmm1 ;# lu*lu mulpd xmm4, xmm4 ;# lu*lu mulpd xmm7, xmm7 ;# lu*lu movapd xmm9, [rsp + nb113_three] movapd xmm10, xmm9 movapd xmm11, xmm9 mulpd xmm1, xmm0 ;# rsq*lu*lu mulpd xmm4, xmm3 ;# rsq*lu*lu mulpd xmm7, xmm6 ;# rsq*lu*lu subpd xmm9, xmm1 subpd xmm10, xmm4 subpd xmm11, xmm7 ;# 3-rsq*lu*lu mulpd xmm9, xmm2 mulpd xmm10, xmm5 mulpd xmm11, xmm8 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb113_half] mulpd xmm9, xmm15 ;# first iteration for rinvH1 mulpd xmm10, xmm15 ;# first iteration for rinvH2 mulpd xmm11, xmm15 ;# first iteration for rinvM ;# second iteration step movapd xmm2, xmm9 movapd xmm5, xmm10 movapd xmm8, xmm11 mulpd xmm2, xmm2 ;# lu*lu mulpd xmm5, xmm5 ;# lu*lu mulpd xmm8, xmm8 ;# lu*lu movapd xmm1, [rsp + nb113_three] movapd xmm4, xmm1 movapd xmm7, xmm1 mulpd xmm2, xmm0 ;# rsq*lu*lu mulpd xmm5, xmm3 ;# rsq*lu*lu mulpd xmm8, xmm6 ;# rsq*lu*lu subpd xmm1, xmm2 subpd xmm4, xmm5 subpd xmm7, xmm8 ;# 3-rsq*lu*lu mulpd xmm9, xmm1 mulpd xmm10, xmm4 mulpd xmm11, xmm7 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb113_half] mulpd xmm9, xmm15 ;# rinvH1 mulpd xmm10, xmm15 ;# rinvH2 mulpd xmm11, xmm15 ;# rinvM ;# interactions movapd xmm0, xmm9 movapd xmm1, xmm10 movapd xmm2, xmm11 mulpd xmm9, xmm9 mulpd xmm10, xmm10 mulpd xmm11, xmm11 mulpd xmm0, [rsp + nb113_qqH] mulpd xmm1, [rsp + nb113_qqH] 
mulpd xmm2, [rsp + nb113_qqM] mulpd xmm9, xmm0 mulpd xmm10, xmm1 mulpd xmm11, xmm2 addpd xmm0, [rsp + nb113_vctot] addpd xmm1, xmm2 addpd xmm0, xmm1 movapd [rsp + nb113_vctot], xmm0 ;# move j forces to xmm0-xmm2 mov rdi, [rbp + nb113_faction] movlpd xmm0, [rdi + rax*8] movlpd xmm1, [rdi + rax*8 + 8] movlpd xmm2, [rdi + rax*8 + 16] movhpd xmm0, [rdi + rbx*8] movhpd xmm1, [rdi + rbx*8 + 8] movhpd xmm2, [rdi + rbx*8 + 16] movapd xmm7, xmm9 movapd xmm8, xmm9 movapd xmm13, xmm11 movapd xmm14, xmm11 movapd xmm15, xmm11 movapd xmm11, xmm10 movapd xmm12, xmm10 ;# add forces from O interaction addpd xmm0, [rsp + nb113_fjx] addpd xmm1, [rsp + nb113_fjy] addpd xmm2, [rsp + nb113_fjz] mulpd xmm7, [rsp + nb113_dxH1] mulpd xmm8, [rsp + nb113_dyH1] mulpd xmm9, [rsp + nb113_dzH1] mulpd xmm10, [rsp + nb113_dxH2] mulpd xmm11, [rsp + nb113_dyH2] mulpd xmm12, [rsp + nb113_dzH2] mulpd xmm13, [rsp + nb113_dxM] mulpd xmm14, [rsp + nb113_dyM] mulpd xmm15, [rsp + nb113_dzM] addpd xmm0, xmm7 addpd xmm1, xmm8 addpd xmm2, xmm9 addpd xmm7, [rsp + nb113_fixH1] addpd xmm8, [rsp + nb113_fiyH1] addpd xmm9, [rsp + nb113_fizH1] addpd xmm0, xmm10 addpd xmm1, xmm11 addpd xmm2, xmm12 addpd xmm10, [rsp + nb113_fixH2] addpd xmm11, [rsp + nb113_fiyH2] addpd xmm12, [rsp + nb113_fizH2] addpd xmm0, xmm13 addpd xmm1, xmm14 addpd xmm2, xmm15 addpd xmm13, [rsp + nb113_fixM] addpd xmm14, [rsp + nb113_fiyM] addpd xmm15, [rsp + nb113_fizM] movapd [rsp + nb113_fixH1], xmm7 movapd [rsp + nb113_fiyH1], xmm8 movapd [rsp + nb113_fizH1], xmm9 movapd [rsp + nb113_fixH2], xmm10 movapd [rsp + nb113_fiyH2], xmm11 movapd [rsp + nb113_fizH2], xmm12 movapd [rsp + nb113_fixM], xmm13 movapd [rsp + nb113_fiyM], xmm14 movapd [rsp + nb113_fizM], xmm15 ;# store back j forces from xmm0-xmm2 movlpd [rdi + rax*8], xmm0 movlpd [rdi + rax*8 + 8], xmm1 movlpd [rdi + rax*8 + 16], xmm2 movhpd [rdi + rbx*8], xmm0 movhpd [rdi + rbx*8 + 8], xmm1 movhpd [rdi + rbx*8 + 16], xmm2 ;# should we do one more iteration? 
sub dword ptr [rsp + nb113_innerk], 2 jl .nb113_checksingle jmp .nb113_unroll_loop.nb113_checksingle: mov edx, [rsp + nb113_innerk] and edx, 1 jnz .nb113_dosingle jmp .nb113_updateouterdata.nb113_dosingle: mov rdx, [rsp + nb113_innerjjnr] ;# pointer to jjnr[k] mov eax, [rdx] add qword ptr [rsp + nb113_innerjjnr], 4 mov rsi, [rbp + nb113_charge] ;# base of charge[] xorpd xmm3, xmm3 movlpd xmm3, [rsi + rax*8] movapd xmm4, xmm3 mulpd xmm3, [rsp + nb113_iqM] mulpd xmm4, [rsp + nb113_iqH] movapd [rsp + nb113_qqM], xmm3 movapd [rsp + nb113_qqH], xmm4 mov rsi, [rbp + nb113_type] mov r8d, [rsi + rax*4] mov rsi, [rbp + nb113_vdwparam] shl r8d, 1 mov edi, [rsp + nb113_ntia] add r8d, edi movsd xmm6, [rsi + r8*8] ;# c6a movsd xmm7, [rsi + r8*8 + 8] ;# c12a movapd [rsp + nb113_c6], xmm6 movapd [rsp + nb113_c12], xmm7 mov rsi, [rbp + nb113_pos] ;# base of pos[] lea rax, [rax + rax*2] ;# replace jnr with j3 ;# move coordinates to xmm0-xmm2 and xmm4-xmm6 movlpd xmm4, [rsi + rax*8] movlpd xmm5, [rsi + rax*8 + 8] movlpd xmm6, [rsi + rax*8 + 16] movapd xmm0, xmm4 movapd xmm1, xmm5 movapd xmm2, xmm6 ;# calc dr subsd xmm4, [rsp + nb113_ixO] subsd xmm5, [rsp + nb113_iyO] subsd xmm6, [rsp + nb113_izO] ;# store dr movapd [rsp + nb113_dxO], xmm4 movapd [rsp + nb113_dyO], xmm5 movapd [rsp + nb113_dzO], xmm6 ;# square it mulsd xmm4,xmm4 mulsd xmm5,xmm5 mulsd xmm6,xmm6 addsd xmm4, xmm5 addsd xmm4, xmm6 movapd xmm7, xmm4 ;# rsqO in xmm7 ;# move j coords to xmm4-xmm6 movapd xmm4, xmm0 movapd xmm5, xmm1 movapd xmm6, xmm2 ;# calc dr subsd xmm4, [rsp + nb113_ixH1] subsd xmm5, [rsp + nb113_iyH1] subsd xmm6, [rsp + nb113_izH1] ;# store dr movapd [rsp + nb113_dxH1], xmm4 movapd [rsp + nb113_dyH1], xmm5 movapd [rsp + nb113_dzH1], xmm6 ;# square it mulsd xmm4,xmm4 mulsd xmm5,xmm5 mulsd xmm6,xmm6 addsd xmm6, xmm5 addsd xmm6, xmm4 ;# rsqH1 in xmm6 ;# move j coords to xmm3-xmm5 movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 ;# calc dr subsd xmm3, [rsp + nb113_ixH2] subsd xmm4, [rsp + nb113_iyH2] 
subsd xmm5, [rsp + nb113_izH2] ;# store dr movapd [rsp + nb113_dxH2], xmm3 movapd [rsp + nb113_dyH2], xmm4 movapd [rsp + nb113_dzH2], xmm5 ;# square it mulsd xmm3,xmm3 mulsd xmm4,xmm4 mulsd xmm5,xmm5 addsd xmm5, xmm4 addsd xmm5, xmm3 ;# move j coords to xmm4-xmm2 movapd xmm4, xmm0 movapd xmm3, xmm1 ;# xmm2 already contains z ;# calc dr subsd xmm4, [rsp + nb113_ixM] subsd xmm3, [rsp + nb113_iyM] subsd xmm2, [rsp + nb113_izM] ;# store dr movapd [rsp + nb113_dxM], xmm4 movapd [rsp + nb113_dyM], xmm3 movapd [rsp + nb113_dzM], xmm2 ;# square it mulpd xmm2,xmm2 mulpd xmm3,xmm3 mulpd xmm4,xmm4 addpd xmm4, xmm3 addpd xmm4, xmm2 ;# rsqM in xmm4, rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 ;# start with rsqH1 - put seed in xmm2 cvtsd2ss xmm2, xmm6 rsqrtss xmm2, xmm2 cvtss2sd xmm2, xmm2 movapd xmm3, xmm2 mulsd xmm2, xmm2 movapd xmm1, [rsp + nb113_three] mulsd xmm2, xmm6 ;# rsq*lu*lu subsd xmm1, xmm2 ;# 30-rsq*lu*lu mulsd xmm1, xmm3 ;# lu*(3-rsq*lu*lu) mulsd xmm1, [rsp + nb113_half] ;# iter1 ( new lu) movapd xmm3, xmm1 mulsd xmm1, xmm1 ;# lu*lu mulsd xmm6, xmm1 ;# rsq*lu*lu movapd xmm1, [rsp + nb113_three] subsd xmm1, xmm6 ;# 3-rsq*lu*lu mulsd xmm1, xmm3 ;# lu*( 3-rsq*lu*lu) mulsd xmm1, [rsp + nb113_half] ;# rinv movapd [rsp + nb113_rinvH1], xmm1 ;# rsqH2 - seed in xmm2 cvtsd2ss xmm2, xmm5 rsqrtss xmm2, xmm2 cvtss2sd xmm2, xmm2 movapd xmm3, xmm2 mulsd xmm2, xmm2 movapd xmm1, [rsp + nb113_three] mulsd xmm2, xmm5 ;# rsq*lu*lu subsd xmm1, xmm2 ;# 30-rsq*lu*lu mulsd xmm1, xmm3 ;# lu*(3-rsq*lu*lu) mulsd xmm1, [rsp + nb113_half] ;# iter1 ( new lu) movapd xmm3, xmm1 mulsd xmm1, xmm1 ;# lu*lu mulsd xmm5, xmm1 ;# rsq*lu*lu movapd xmm1, [rsp + nb113_three] subsd xmm1, xmm5 ;# 3-rsq*lu*lu mulsd xmm1, xmm3 ;# lu*( 3-rsq*lu*lu) mulsd xmm1, [rsp + nb113_half] ;# rinv movapd [rsp + nb113_rinvH2], xmm1 ;# rsqM - seed in xmm2 cvtsd2ss xmm2, xmm4 rsqrtss xmm2, xmm2 cvtss2sd xmm2, xmm2 movapd xmm3, xmm2 mulsd xmm2, xmm2 movapd xmm1, [rsp + nb113_three] mulsd xmm2, xmm4 ;# rsq*lu*lu subsd xmm1, 
xmm2 ;# 30-rsq*lu*lu mulsd xmm1, xmm3 ;# lu*(3-rsq*lu*lu) mulsd xmm1, [rsp + nb113_half] ;# iter1 ( new lu) movapd xmm3, xmm1 mulsd xmm1, xmm1 ;# lu*lu mulsd xmm4, xmm1 ;# rsq*lu*lu movapd xmm1, [rsp + nb113_three] subsd xmm1, xmm4 ;# 3-rsq*lu*lu mulsd xmm1, xmm3 ;# lu*( 3-rsq*lu*lu) mulsd xmm1, [rsp + nb113_half] ;# rinv movapd [rsp + nb113_rinvM], xmm1 ;# do O interactions directly. xmm7=rsq cvtsd2ss xmm2, xmm7 movapd xmm6, xmm7 rcpps xmm2, xmm2 cvtss2sd xmm2, xmm2 movapd xmm1, [rsp + nb113_two] movapd xmm0, xmm1 mulsd xmm7, xmm2 subsd xmm1, xmm7 mulsd xmm2, xmm1 ;# iter1 mulsd xmm6, xmm2 subsd xmm0, xmm6 mulsd xmm0, xmm2 ;# xmm0=rinvsq movapd xmm1, xmm0 mulsd xmm1, xmm1 ;# rinv4 mulsd xmm1, xmm0 ;#rinvsix movapd xmm2, xmm1 mulsd xmm2, xmm2 ;# rinvtwelve mulsd xmm1, [rsp + nb113_c6] mulsd xmm2, [rsp + nb113_c12] movapd xmm3, xmm2 subsd xmm3, xmm1 ;# Vvdw=Vvdw12-Vvdw6 addsd xmm3, [rsp + nb113_Vvdwtot] mulsd xmm1, [rsp + nb113_six] mulsd xmm2, [rsp + nb113_twelve] subsd xmm2, xmm1 mulsd xmm2, xmm0 movapd xmm4, xmm2 ;# total fsO movsd [rsp + nb113_Vvdwtot], xmm3 movapd xmm0, [rsp + nb113_dxO] movapd xmm1, [rsp + nb113_dyO] movapd xmm2, [rsp + nb113_dzO] mulsd xmm0, xmm4 mulsd xmm1, xmm4 mulsd xmm2, xmm4 ;# update O forces movapd xmm3, [rsp + nb113_fixO] movapd xmm4, [rsp + nb113_fiyO] movapd xmm7, [rsp + nb113_fizO] addsd xmm3, xmm0 addsd xmm4, xmm1 addsd xmm7, xmm2 movsd [rsp + nb113_fixO], xmm3 movsd [rsp + nb113_fiyO], xmm4 movsd [rsp + nb113_fizO], xmm7 ;# update j forces with water O movsd [rsp + nb113_fjx], xmm0 movsd [rsp + nb113_fjy], xmm1 movsd [rsp + nb113_fjz], xmm2 ;# H1 interactions movapd xmm6, [rsp + nb113_rinvH1] movapd xmm4, xmm6 mulsd xmm4, xmm4 ;# xmm6=rinv, xmm4=rinvsq mulsd xmm6, [rsp + nb113_qqH] ;# xmm6=vcoul mulsd xmm4, xmm6 ;# total fsH1 in xmm4 addsd xmm6, [rsp + nb113_vctot] movapd xmm0, [rsp + nb113_dxH1] movapd xmm1, [rsp + nb113_dyH1] movapd xmm2, [rsp + nb113_dzH1] movsd [rsp + nb113_vctot], xmm6 mulsd xmm0, xmm4 mulsd xmm1, xmm4 mulsd 
xmm2, xmm4 ;# update H1 forces movapd xmm3, [rsp + nb113_fixH1] movapd xmm4, [rsp + nb113_fiyH1] movapd xmm7, [rsp + nb113_fizH1] addsd xmm3, xmm0 addsd xmm4, xmm1 addsd xmm7, xmm2 movsd [rsp + nb113_fixH1], xmm3 movsd [rsp + nb113_fiyH1], xmm4 movsd [rsp + nb113_fizH1], xmm7 ;# update j forces with water H1 addsd xmm0, [rsp + nb113_fjx] addsd xmm1, [rsp + nb113_fjy] addsd xmm2, [rsp + nb113_fjz] movsd [rsp + nb113_fjx], xmm0 movsd [rsp + nb113_fjy], xmm1 movsd [rsp + nb113_fjz], xmm2 ;# H2 interactions movapd xmm5, [rsp + nb113_rinvH2] movapd xmm4, xmm5 mulsd xmm4, xmm4 ;# xmm5=rinv, xmm4=rinvsq mulsd xmm5, [rsp + nb113_qqH] ;# xmm5=vcoul mulsd xmm4, xmm5 ;# total fsH1 in xmm4 addsd xmm5, [rsp + nb113_vctot] movapd xmm0, [rsp + nb113_dxH2] movapd xmm1, [rsp + nb113_dyH2] movapd xmm2, [rsp + nb113_dzH2] movsd [rsp + nb113_vctot], xmm5 mulsd xmm0, xmm4 mulsd xmm1, xmm4 mulsd xmm2, xmm4 ;# update H2 forces movapd xmm3, [rsp + nb113_fixH2] movapd xmm4, [rsp + nb113_fiyH2] movapd xmm7, [rsp + nb113_fizH2] addsd xmm3, xmm0 addsd xmm4, xmm1 addsd xmm7, xmm2 movsd [rsp + nb113_fixH2], xmm3 movsd [rsp + nb113_fiyH2], xmm4 movsd [rsp + nb113_fizH2], xmm7 ;# update j forces with water H2 addsd xmm0, [rsp + nb113_fjx] addsd xmm1, [rsp + nb113_fjy] addsd xmm2, [rsp + nb113_fjz] movsd [rsp + nb113_fjx], xmm0 movsd [rsp + nb113_fjy], xmm1 movsd [rsp + nb113_fjz], xmm2 ;# M interactions movapd xmm5, [rsp + nb113_rinvM] movapd xmm4, xmm5 mulsd xmm4, xmm4 ;# xmm5=rinv, xmm4=rinvsq mulsd xmm5, [rsp + nb113_qqM] ;# xmm5=vcoul mulsd xmm4, xmm5 ;# total fsH1 in xmm4 addsd xmm5, [rsp + nb113_vctot] movapd xmm0, [rsp + nb113_dxM]
⌨️ 快捷键说明
复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?