nb_kernel112_x86_64_sse.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,220 行 · 第 1/5 页
S
2,220 行
;# ---------------------------------------------------------------------
;# Excerpt of the nb112 inner loops (GNU as, Intel syntax, `;#` comments).
;#
;# Contents, in order:
;#   1. tail of the 4-way unrolled loop: j-H2 against i-O/H1/H2
;#      (this excerpt starts part-way through the distance computation)
;#   2. .nb112_single_check    - decide whether leftover j entries remain
;#   3. .nb112_single_loop     - one j water per iteration
;#   4. .nb112_updateouterdata - beginning only; continues past the excerpt
;#
;# One instruction per line: a `;#` comment runs to end of line, so any
;# instruction sharing a line after a comment would be silently dropped.
;#
;# NOTE(review): in part 1, rdi is presumably the force array base and
;# rax/rbx/rcx/rdx the 3*index of j waters a/b/c/d; they are set before
;# this excerpt begins - confirm against the preceding code.
;# ---------------------------------------------------------------------

        ;# finish dx/dy/dz for the H1-H2 and H2-H2 pairs
        subps    xmm5, [rsp + nb112_izH1]
        subps    xmm6, [rsp + nb112_ixH2]
        subps    xmm7, [rsp + nb112_iyH2]
        subps    xmm8, [rsp + nb112_izH2]

        ;# save the distance vectors, then square them in place
        movaps   [rsp + nb112_dxOH2], xmm0
        movaps   [rsp + nb112_dyOH2], xmm1
        movaps   [rsp + nb112_dzOH2], xmm2
        mulps    xmm0, xmm0
        mulps    xmm1, xmm1
        mulps    xmm2, xmm2
        movaps   [rsp + nb112_dxH1H2], xmm3
        movaps   [rsp + nb112_dyH1H2], xmm4
        movaps   [rsp + nb112_dzH1H2], xmm5
        mulps    xmm3, xmm3
        mulps    xmm4, xmm4
        mulps    xmm5, xmm5
        movaps   [rsp + nb112_dxH2H2], xmm6
        movaps   [rsp + nb112_dyH2H2], xmm7
        movaps   [rsp + nb112_dzH2H2], xmm8
        mulps    xmm6, xmm6
        mulps    xmm7, xmm7
        mulps    xmm8, xmm8

        ;# rsq = dx*dx + dy*dy + dz*dz  (xmm0: O-H2, xmm3: H1-H2, xmm6: H2-H2)
        addps    xmm0, xmm1
        addps    xmm0, xmm2
        addps    xmm3, xmm4
        addps    xmm3, xmm5
        addps    xmm6, xmm7
        addps    xmm6, xmm8

        ;# start doing invsqrt for jH2 atoms
        ;# lu = rsqrtps(rsq), refined as lu*(three - rsq*lu*lu)*half
        rsqrtps  xmm1, xmm0
        rsqrtps  xmm4, xmm3
        rsqrtps  xmm7, xmm6
        movaps   xmm2, xmm1                  ;# keep a copy of lu
        movaps   xmm5, xmm4
        movaps   xmm8, xmm7
        mulps    xmm1, xmm1                  ;# lu*lu
        mulps    xmm4, xmm4                  ;# lu*lu
        mulps    xmm7, xmm7                  ;# lu*lu
        movaps   xmm9, [rsp + nb112_three]
        movaps   xmm10, xmm9
        movaps   xmm11, xmm9
        mulps    xmm1, xmm0                  ;# rsq*lu*lu
        mulps    xmm4, xmm3                  ;# rsq*lu*lu
        mulps    xmm7, xmm6                  ;# rsq*lu*lu
        subps    xmm9, xmm1
        subps    xmm10, xmm4
        subps    xmm11, xmm7                 ;# 3-rsq*lu*lu
        mulps    xmm9, xmm2
        mulps    xmm10, xmm5
        mulps    xmm11, xmm8                 ;# lu*(3-rsq*lu*lu)
        movaps   xmm0, [rsp + nb112_half]
        mulps    xmm9, xmm0                  ;# rinvOH2
        mulps    xmm10, xmm0                 ;# rinvH1H2
        mulps    xmm11, xmm0                 ;# rinvH2H2

        ;# H2 interactions
        movaps   xmm0, xmm9
        movaps   xmm1, xmm10
        movaps   xmm2, xmm11
        mulps    xmm9, xmm9                  ;# rinvsq
        mulps    xmm10, xmm10
        mulps    xmm11, xmm11
        mulps    xmm0, [rsp + nb112_qqOH]    ;# rinv*qq
        mulps    xmm1, [rsp + nb112_qqHH]
        mulps    xmm2, [rsp + nb112_qqHH]
        mulps    xmm9, xmm0                  ;# rinvsq*rinv*qq (scales dx/dy/dz below)
        mulps    xmm10, xmm1
        mulps    xmm11, xmm2
        addps    xmm0, [rsp + nb112_vctot]   ;# add the three rinv*qq terms into vctot
        addps    xmm1, xmm2
        addps    xmm0, xmm1
        movaps   [rsp + nb112_vctot], xmm0

        ;# move j H2 forces to local temp variables
        ;# (lane labels follow the original comments)
        movlps   xmm0, [rdi + rax*4 + 24]    ;# jxH2a jyH2a  -     -
        movlps   xmm1, [rdi + rcx*4 + 24]    ;# jxH2c jyH2c  -     -
        movhps   xmm0, [rdi + rbx*4 + 24]    ;# jxH2a jyH2a jxH2b jyH2b
        movhps   xmm1, [rdi + rdx*4 + 24]    ;# jxH2c jyH2c jxH2d jyH2d

        movss    xmm2, [rdi + rax*4 + 32]    ;# jzH2a  -  -  -
        movss    xmm3, [rdi + rcx*4 + 32]    ;# jzH2c  -  -  -
        movss    xmm7, [rdi + rbx*4 + 32]    ;# jzH2b  -  -  -
        movss    xmm8, [rdi + rdx*4 + 32]    ;# jzH2d  -  -  -
        movlhps  xmm2, xmm7                  ;# jzH2a  -  jzH2b  -
        movlhps  xmm3, xmm8                  ;# jzH2c  -  jzH2d  -
        shufps   xmm2, xmm3, 136             ;# 10001000 => jzH2a jzH2b jzH2c jzH2d

        ;# xmm0: jxH2a jyH2a jxH2b jyH2b
        ;# xmm1: jxH2c jyH2c jxH2d jyH2d
        ;# xmm2: jzH2a jzH2b jzH2c jzH2d

        ;# replicate the three scalars so each can be multiplied by dx, dy and dz
        movaps   xmm7, xmm9
        movaps   xmm8, xmm9
        movaps   xmm13, xmm11
        movaps   xmm14, xmm11
        movaps   xmm15, xmm11
        movaps   xmm11, xmm10
        movaps   xmm12, xmm10
        mulps    xmm7, [rsp + nb112_dxOH2]
        mulps    xmm8, [rsp + nb112_dyOH2]
        mulps    xmm9, [rsp + nb112_dzOH2]
        mulps    xmm10, [rsp + nb112_dxH1H2]
        mulps    xmm11, [rsp + nb112_dyH1H2]
        mulps    xmm12, [rsp + nb112_dzH1H2]
        mulps    xmm13, [rsp + nb112_dxH2H2]
        mulps    xmm14, [rsp + nb112_dyH2H2]
        mulps    xmm15, [rsp + nb112_dzH2H2]

        ;# xmm3/xmm4 collect the x/y sums for j-H2; z is summed straight into xmm2.
        ;# In the same pass each term is added to the matching i-atom accumulator.
        movaps   xmm3, xmm7
        movaps   xmm4, xmm8
        addps    xmm2, xmm9
        addps    xmm7, [rsp + nb112_fixO]
        addps    xmm8, [rsp + nb112_fiyO]
        addps    xmm9, [rsp + nb112_fizO]

        addps    xmm3, xmm10
        addps    xmm4, xmm11
        addps    xmm2, xmm12
        addps    xmm10, [rsp + nb112_fixH1]
        addps    xmm11, [rsp + nb112_fiyH1]
        addps    xmm12, [rsp + nb112_fizH1]

        addps    xmm3, xmm13
        addps    xmm4, xmm14
        addps    xmm2, xmm15
        addps    xmm13, [rsp + nb112_fixH2]
        addps    xmm14, [rsp + nb112_fiyH2]
        addps    xmm15, [rsp + nb112_fizH2]

        movaps   [rsp + nb112_fixO], xmm7
        movaps   [rsp + nb112_fiyO], xmm8
        movaps   [rsp + nb112_fizO], xmm9
        movaps   [rsp + nb112_fixH1], xmm10
        movaps   [rsp + nb112_fiyH1], xmm11
        movaps   [rsp + nb112_fizH1], xmm12
        movaps   [rsp + nb112_fixH2], xmm13
        movaps   [rsp + nb112_fiyH2], xmm14
        movaps   [rsp + nb112_fizH2], xmm15

        ;# xmm3 = x sums, xmm4 = y sums for a..d; interleave them into (x,y)
        ;# pairs to match the layout loaded into xmm0 (a,b) and xmm1 (c,d)
        movaps   xmm5, xmm3
        unpcklps xmm3, xmm4                  ;# xa ya xb yb
        unpckhps xmm5, xmm4                  ;# xc yc xd yd
        addps    xmm0, xmm3
        addps    xmm1, xmm5

        movhlps  xmm3, xmm2                  ;# fH2zc fH2zd

        ;# write the updated j H2 x/y pairs and z values back
        movlps   [rdi + rax*4 + 24], xmm0
        movhps   [rdi + rbx*4 + 24], xmm0
        movlps   [rdi + rcx*4 + 24], xmm1
        movhps   [rdi + rdx*4 + 24], xmm1
        movss    [rdi + rax*4 + 32], xmm2
        movss    [rdi + rcx*4 + 32], xmm3
        shufps   xmm2, xmm2, 1               ;# bring lane 1 (b) down to lane 0
        shufps   xmm3, xmm3, 1               ;# bring lane 1 (d) down to lane 0
        movss    [rdi + rbx*4 + 32], xmm2
        movss    [rdi + rdx*4 + 32], xmm3

        ;# should we do one more iteration?
        ;# innerk -= 4; a negative result means fewer than 4 entries were left
        sub      dword ptr [rsp + nb112_innerk], 4
        jl       .nb112_single_check
        jmp      .nb112_unroll_loop

.nb112_single_check:
        ;# undo the last subtraction; non-zero => leftover entries to do singly
        add      dword ptr [rsp + nb112_innerk], 4
        jnz      .nb112_single_loop
        jmp      .nb112_updateouterdata

.nb112_single_loop:
        mov      rdx, [rsp + nb112_innerjjnr]        ;# pointer to jjnr[k]
        mov      eax, [rdx]                          ;# j index (32-bit load zero-extends rax)
        add      qword ptr [rsp + nb112_innerjjnr], 4 ;# advance to the next entry

        mov      rsi, [rbp + nb112_pos]
        lea      rax, [rax + rax*2]                  ;# rax = 3*j

        ;# fetch j coordinates
        xorps    xmm0, xmm0
        xorps    xmm1, xmm1
        xorps    xmm2, xmm2
        movss    xmm0, [rsi + rax*4]         ;# jxO  -  -  -
        movss    xmm1, [rsi + rax*4 + 4]     ;# jyO  -  -  -
        movss    xmm2, [rsi + rax*4 + 8]     ;# jzO  -  -  -
        movlps   xmm6, [rsi + rax*4 + 12]    ;# xmm6 = jxH1 jyH1   -    -
        movss    xmm7, [rsi + rax*4 + 20]    ;# xmm7 = jzH1   -     -    -
        movhps   xmm6, [rsi + rax*4 + 24]    ;# xmm6 = jxH1 jyH1 jxH2 jyH2
        movss    xmm5, [rsi + rax*4 + 32]    ;# xmm5 = jzH2   -     -    -

        ;# have all coords, time for some shuffling.
        shufps   xmm6, xmm6, 216             ;# 11011000 => xmm6 = jxH1 jxH2 jyH1 jyH2
        unpcklps xmm7, xmm5                  ;# xmm7 = jzH1 jzH2  -  -
        movlhps  xmm0, xmm6                  ;# xmm0 = jxO 0 jxH1 jxH2
        shufps   xmm1, xmm6, 228             ;# 11100100 => xmm1 = jyO 0 jyH1 jyH2
        shufps   xmm2, xmm7, 68              ;# 01000100 => xmm2 = jzO 0 jzH1 jzH2

        ;# store all j coordinates in jO
        movaps   [rsp + nb112_jxO], xmm0
        movaps   [rsp + nb112_jyO], xmm1
        movaps   [rsp + nb112_jzO], xmm2

        ;# distance vector from i-O to each j atom
        subps    xmm0, [rsp + nb112_ixO]
        subps    xmm1, [rsp + nb112_iyO]
        subps    xmm2, [rsp + nb112_izO]
        movaps   [rsp + nb112_dxOO], xmm0
        movaps   [rsp + nb112_dyOO], xmm1
        movaps   [rsp + nb112_dzOO], xmm2
        mulps    xmm0, xmm0
        mulps    xmm1, xmm1
        mulps    xmm2, xmm2
        addps    xmm0, xmm1
        addps    xmm0, xmm2                  ;# have rsq in xmm0

        ;# do invsqrt
        rsqrtps  xmm1, xmm0
        movaps   xmm2, xmm1                  ;# keep a copy of lu
        mulps    xmm1, xmm1
        movaps   xmm3, [rsp + nb112_three]
        mulps    xmm1, xmm0
        subps    xmm3, xmm1
        mulps    xmm3, xmm2
        mulps    xmm3, [rsp + nb112_half]    ;# rinv iO - j water

        xorps    xmm1, xmm1
        movaps   xmm0, xmm3
        xorps    xmm4, xmm4
        mulps    xmm0, xmm0                  ;# xmm0=rinvsq

        ;# fetch charges to xmm4 (temporary): lane 0 = qqOO, lanes 2-3 = qqOH
        movss    xmm4, [rsp + nb112_qqOO]
        movss    xmm1, xmm0                  ;# lane 0 = rinvsq, other lanes stay 0
        movhps   xmm4, [rsp + nb112_qqOH]
        mulss    xmm1, xmm0
        mulps    xmm3, xmm4                  ;# xmm3=vcoul
        mulss    xmm1, xmm0                  ;# xmm1(0)=rinvsix
        movaps   xmm2, xmm1                  ;# zero everything else in xmm2
        mulss    xmm2, xmm2                  ;# xmm2=rinvtwelve
        mulss    xmm1, [rsp + nb112_c6]
        mulss    xmm2, [rsp + nb112_c12]
        movaps   xmm4, xmm2
        subss    xmm4, xmm1                  ;# Vvdwtot=Vvdw12-Vvdw6
        addps    xmm4, [rsp + nb112_Vvdwtot]
        mulss    xmm1, [rsp + nb112_six]
        mulss    xmm2, [rsp + nb112_twelve]
        movaps   [rsp + nb112_Vvdwtot], xmm4
        subss    xmm2, xmm1                  ;# fsD+ fsR
        addps    xmm2, xmm3                  ;# fsC+ fsD+ fsR

        addps    xmm3, [rsp + nb112_vctot]
        mulps    xmm0, xmm2                  ;# total fscal
        movaps   [rsp + nb112_vctot], xmm3

        movaps   xmm1, xmm0
        movaps   xmm2, xmm0
        mulps    xmm0, [rsp + nb112_dxOO]
        mulps    xmm1, [rsp + nb112_dyOO]
        mulps    xmm2, [rsp + nb112_dzOO]

        ;# initial update for j forces
        xorps    xmm3, xmm3
        xorps    xmm4, xmm4
        xorps    xmm5, xmm5
        addps    xmm3, xmm0
        addps    xmm4, xmm1
        addps    xmm5, xmm2
        movaps   [rsp + nb112_fjxO], xmm3
        movaps   [rsp + nb112_fjyO], xmm4
        movaps   [rsp + nb112_fjzO], xmm5
        addps    xmm0, [rsp + nb112_fixO]
        addps    xmm1, [rsp + nb112_fiyO]
        addps    xmm2, [rsp + nb112_fizO]
        movaps   [rsp + nb112_fixO], xmm0
        movaps   [rsp + nb112_fiyO], xmm1
        movaps   [rsp + nb112_fizO], xmm2

        ;# done with i O. Now do i H1 & H2 simultaneously.
        ;# Reload the j coordinates and subtract the i H1 / i H2 positions.
        movaps   xmm0, [rsp + nb112_jxO]
        movaps   xmm1, [rsp + nb112_jyO]
        movaps   xmm2, [rsp + nb112_jzO]
        movaps   xmm3, xmm0
        movaps   xmm4, xmm1
        movaps   xmm5, xmm2
        subps    xmm0, [rsp + nb112_ixH1]
        subps    xmm1, [rsp + nb112_iyH1]
        subps    xmm2, [rsp + nb112_izH1]
        subps    xmm3, [rsp + nb112_ixH2]
        subps    xmm4, [rsp + nb112_iyH2]
        subps    xmm5, [rsp + nb112_izH2]
        movaps   [rsp + nb112_dxH1O], xmm0
        movaps   [rsp + nb112_dyH1O], xmm1
        movaps   [rsp + nb112_dzH1O], xmm2
        movaps   [rsp + nb112_dxH2O], xmm3
        movaps   [rsp + nb112_dyH2O], xmm4
        movaps   [rsp + nb112_dzH2O], xmm5
        mulps    xmm0, xmm0
        mulps    xmm1, xmm1
        mulps    xmm2, xmm2
        mulps    xmm3, xmm3
        mulps    xmm4, xmm4
        mulps    xmm5, xmm5
        addps    xmm0, xmm1
        addps    xmm4, xmm3
        addps    xmm0, xmm2                  ;# have rsqH1 in xmm0
        addps    xmm4, xmm5                  ;# have rsqH2 in xmm4

        ;# do invsqrt
        rsqrtps  xmm1, xmm0
        rsqrtps  xmm5, xmm4
        movaps   xmm2, xmm1                  ;# keep a copy of lu (H1)
        movaps   xmm6, xmm5                  ;# keep a copy of lu (H2)
        mulps    xmm1, xmm1
        mulps    xmm5, xmm5
        movaps   xmm3, [rsp + nb112_three]
        movaps   xmm7, xmm3
        mulps    xmm1, xmm0
        mulps    xmm5, xmm4
        subps    xmm3, xmm1
        subps    xmm7, xmm5
        mulps    xmm3, xmm2
        mulps    xmm7, xmm6
        mulps    xmm3, [rsp + nb112_half]    ;# rinv H1 - j water
        mulps    xmm7, [rsp + nb112_half]    ;# rinv H2 - j water

        ;# assemble charges in xmm6: lane 0 = qqOH, lanes 2-3 = qqHH
        xorps    xmm6, xmm6

        ;# do coulomb interaction
        movaps   xmm0, xmm3
        movss    xmm6, [rsp + nb112_qqOH]
        movaps   xmm4, xmm7
        movhps   xmm6, [rsp + nb112_qqHH]
        mulps    xmm0, xmm0                  ;# rinvsq
        mulps    xmm4, xmm4                  ;# rinvsq
        mulps    xmm3, xmm6                  ;# vcoul
        mulps    xmm7, xmm6                  ;# vcoul
        movaps   xmm2, xmm3
        addps    xmm2, xmm7                  ;# total vcoul
        mulps    xmm0, xmm3                  ;# fscal

        addps    xmm2, [rsp + nb112_vctot]
        mulps    xmm7, xmm4                  ;# fscal
        movaps   [rsp + nb112_vctot], xmm2

        movaps   xmm1, xmm0
        movaps   xmm2, xmm0
        mulps    xmm0, [rsp + nb112_dxH1O]
        mulps    xmm1, [rsp + nb112_dyH1O]
        mulps    xmm2, [rsp + nb112_dzH1O]

        ;# update forces H1 - j water
        movaps   xmm3, [rsp + nb112_fjxO]
        movaps   xmm4, [rsp + nb112_fjyO]
        movaps   xmm5, [rsp + nb112_fjzO]
        addps    xmm3, xmm0
        addps    xmm4, xmm1
        addps    xmm5, xmm2
        movaps   [rsp + nb112_fjxO], xmm3
        movaps   [rsp + nb112_fjyO], xmm4
        movaps   [rsp + nb112_fjzO], xmm5
        addps    xmm0, [rsp + nb112_fixH1]
        addps    xmm1, [rsp + nb112_fiyH1]
        addps    xmm2, [rsp + nb112_fizH1]
        movaps   [rsp + nb112_fixH1], xmm0
        movaps   [rsp + nb112_fiyH1], xmm1
        movaps   [rsp + nb112_fizH1], xmm2

        ;# do forces H2 - j water
        movaps   xmm0, xmm7
        movaps   xmm1, xmm7
        movaps   xmm2, xmm7
        mulps    xmm0, [rsp + nb112_dxH2O]
        mulps    xmm1, [rsp + nb112_dyH2O]
        mulps    xmm2, [rsp + nb112_dzH2O]
        movaps   xmm3, [rsp + nb112_fjxO]
        movaps   xmm4, [rsp + nb112_fjyO]
        movaps   xmm5, [rsp + nb112_fjzO]
        addps    xmm3, xmm0
        addps    xmm4, xmm1
        addps    xmm5, xmm2
        mov      rsi, [rbp + nb112_faction]  ;# rsi now addresses the force array
        movaps   [rsp + nb112_fjxO], xmm3
        movaps   [rsp + nb112_fjyO], xmm4
        movaps   [rsp + nb112_fjzO], xmm5
        addps    xmm0, [rsp + nb112_fixH2]
        addps    xmm1, [rsp + nb112_fiyH2]
        addps    xmm2, [rsp + nb112_fizH2]
        movaps   [rsp + nb112_fixH2], xmm0
        movaps   [rsp + nb112_fiyH2], xmm1
        movaps   [rsp + nb112_fizH2], xmm2

        ;# update j water forces from local variables
        movlps   xmm0, [rsi + rax*4]         ;# O  x,y
        movlps   xmm1, [rsi + rax*4 + 12]    ;# H1 x,y
        movhps   xmm1, [rsi + rax*4 + 24]    ;# H2 x,y
        movaps   xmm3, [rsp + nb112_fjxO]
        movaps   xmm4, [rsp + nb112_fjyO]
        movaps   xmm5, [rsp + nb112_fjzO]
        movaps   xmm6, xmm5
        movaps   xmm7, xmm5
        shufps   xmm6, xmm6, 2               ;# 00000010 => lane 2 (H1 z) in lane 0
        shufps   xmm7, xmm7, 3               ;# 00000011 => lane 3 (H2 z) in lane 0
        addss    xmm5, [rsi + rax*4 + 8]
        addss    xmm6, [rsi + rax*4 + 20]
        addss    xmm7, [rsi + rax*4 + 32]
        movss    [rsi + rax*4 + 8], xmm5
        movss    [rsi + rax*4 + 20], xmm6
        movss    [rsi + rax*4 + 32], xmm7
        movaps   xmm5, xmm3
        unpcklps xmm3, xmm4                  ;# low half: O x,y
        unpckhps xmm5, xmm4                  ;# H1 x,y  H2 x,y
        addps    xmm0, xmm3
        addps    xmm1, xmm5
        movlps   [rsi + rax*4], xmm0
        movlps   [rsi + rax*4 + 12], xmm1
        movhps   [rsi + rax*4 + 24], xmm1

        ;# one leftover entry consumed; loop until innerk reaches zero
        dec      dword ptr [rsp + nb112_innerk]
        jz       .nb112_updateouterdata
        jmp      .nb112_single_loop

.nb112_updateouterdata:
        ;# (only the beginning of this section is visible in this excerpt)
        mov      ecx, [rsp + nb112_ii3]
        mov      rdi, [rbp + nb112_faction]
        mov      rsi, [rbp + nb112_fshift]
        mov      edx, [rsp + nb112_is3]

        ;# accumulate Oi forces in xmm0, xmm1, xmm2
        movaps   xmm0, [rsp + nb112_fixO]
        movaps   xmm1, [rsp + nb112_fiyO]
        movaps   xmm2, [rsp + nb112_fizO]

        movhlps  xmm3, xmm0
        movhlps  xmm4, xmm1
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?