nb_kernel112_x86_64_sse.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,220 行 · 第 1/5 页
S
2,220 行
movhlps xmm5, xmm2 addps xmm0, xmm3 addps xmm1, xmm4 addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 shufps xmm3, xmm3, 1 shufps xmm4, xmm4, 1 shufps xmm5, xmm5, 1 addss xmm0, xmm3 addss xmm1, xmm4 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 ;# increment i force movss xmm3, [rdi + rcx*4] movss xmm4, [rdi + rcx*4 + 4] movss xmm5, [rdi + rcx*4 + 8] subss xmm3, xmm0 subss xmm4, xmm1 subss xmm5, xmm2 movss [rdi + rcx*4], xmm3 movss [rdi + rcx*4 + 4], xmm4 movss [rdi + rcx*4 + 8], xmm5 ;# accumulate force in xmm6/xmm7 for fshift movaps xmm6, xmm0 movss xmm7, xmm2 movlhps xmm6, xmm1 shufps xmm6, xmm6, 8 ;# 00001000 ;# accumulate H1i forces in xmm0, xmm1, xmm2 movaps xmm0, [rsp + nb112_fixH1] movaps xmm1, [rsp + nb112_fiyH1] movaps xmm2, [rsp + nb112_fizH1] movhlps xmm3, xmm0 movhlps xmm4, xmm1 movhlps xmm5, xmm2 addps xmm0, xmm3 addps xmm1, xmm4 addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 shufps xmm3, xmm3, 1 shufps xmm4, xmm4, 1 shufps xmm5, xmm5, 1 addss xmm0, xmm3 addss xmm1, xmm4 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 ;# increment i force movss xmm3, [rdi + rcx*4 + 12] movss xmm4, [rdi + rcx*4 + 16] movss xmm5, [rdi + rcx*4 + 20] subss xmm3, xmm0 subss xmm4, xmm1 subss xmm5, xmm2 movss [rdi + rcx*4 + 12], xmm3 movss [rdi + rcx*4 + 16], xmm4 movss [rdi + rcx*4 + 20], xmm5 ;# accumulate force in xmm6/xmm7 for fshift addss xmm7, xmm2 movlhps xmm0, xmm1 shufps xmm0, xmm0, 8 ;# 00001000 addps xmm6, xmm0 ;# accumulate H2i forces in xmm0, xmm1, xmm2 movaps xmm0, [rsp + nb112_fixH2] movaps xmm1, [rsp + nb112_fiyH2] movaps xmm2, [rsp + nb112_fizH2] movhlps xmm3, xmm0 movhlps xmm4, xmm1 movhlps xmm5, xmm2 addps xmm0, xmm3 addps xmm1, xmm4 addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 shufps xmm3, xmm3, 1 shufps xmm4, xmm4, 1 shufps xmm5, xmm5, 1 addss xmm0, xmm3 addss xmm1, xmm4 addss 
xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 ;# increment i force movss xmm3, [rdi + rcx*4 + 24] movss xmm4, [rdi + rcx*4 + 28] movss xmm5, [rdi + rcx*4 + 32] subss xmm3, xmm0 subss xmm4, xmm1 subss xmm5, xmm2 movss [rdi + rcx*4 + 24], xmm3 movss [rdi + rcx*4 + 28], xmm4 movss [rdi + rcx*4 + 32], xmm5 ;# accumulate force in xmm6/xmm7 for fshift addss xmm7, xmm2 movlhps xmm0, xmm1 shufps xmm0, xmm0, 8 ;# 00001000 addps xmm6, xmm0 ;# increment fshift force movlps xmm3, [rsi + rdx*4] movss xmm4, [rsi + rdx*4 + 8] subps xmm3, xmm6 subss xmm4, xmm7 movlps [rsi + rdx*4], xmm3 movss [rsi + rdx*4 + 8], xmm4 ;# get n from stack mov esi, [rsp + nb112_n] ;# get group index for i particle mov rdx, [rbp + nb112_gid] ;# base of gid[] mov edx, [rdx + rsi*4] ;# ggid=gid[n] ;# accumulate total potential energy and update it movaps xmm7, [rsp + nb112_vctot] ;# accumulate movhlps xmm6, xmm7 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now movaps xmm6, xmm7 shufps xmm6, xmm6, 1 addss xmm7, xmm6 ;# add earlier value from mem mov rax, [rbp + nb112_Vc] addss xmm7, [rax + rdx*4] ;# move back to mem movss [rax + rdx*4], xmm7 ;# accumulate total lj energy and update it movaps xmm7, [rsp + nb112_Vvdwtot] ;# accumulate movhlps xmm6, xmm7 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now movaps xmm6, xmm7 shufps xmm6, xmm6, 1 addss xmm7, xmm6 ;# add earlier value from mem mov rax, [rbp + nb112_Vvdw] addss xmm7, [rax + rdx*4] ;# move back to mem movss [rax + rdx*4], xmm7 ;# finish if last mov ecx, [rsp + nb112_nn1] ;# esi already loaded with n inc esi sub ecx, esi jz .nb112_outerend ;# not last, iterate outer loop once more! 
mov [rsp + nb112_n], esi jmp .nb112_outer.nb112_outerend: ;# check if more outer neighborlists remain mov ecx, [rsp + nb112_nri] ;# esi already loaded with n above sub ecx, esi jz .nb112_end ;# non-zero, do one more workunit jmp .nb112_threadloop.nb112_end: mov eax, [rsp + nb112_nouter] mov ebx, [rsp + nb112_ninner] mov rcx, [rbp + nb112_outeriter] mov rdx, [rbp + nb112_inneriter] mov [rcx], eax mov [rdx], ebx add rsp, 1592 emms pop r15 pop r14 pop r13 pop r12 pop rbx pop rbp ret .globl nb_kernel112nf_x86_64_sse.globl _nb_kernel112nf_x86_64_ssenb_kernel112nf_x86_64_sse: _nb_kernel112nf_x86_64_sse: ;# Room for return address and rbp (16 bytes).equiv nb112nf_fshift, 16.equiv nb112nf_gid, 24.equiv nb112nf_pos, 32.equiv nb112nf_faction, 40.equiv nb112nf_charge, 48.equiv nb112nf_p_facel, 56.equiv nb112nf_argkrf, 64.equiv nb112nf_argcrf, 72.equiv nb112nf_Vc, 80.equiv nb112nf_type, 88.equiv nb112nf_p_ntype, 96.equiv nb112nf_vdwparam, 104.equiv nb112nf_Vvdw, 112.equiv nb112nf_p_tabscale, 120.equiv nb112nf_VFtab, 128.equiv nb112nf_invsqrta, 136.equiv nb112nf_dvda, 144.equiv nb112nf_p_gbtabscale, 152.equiv nb112nf_GBtab, 160.equiv nb112nf_p_nthreads, 168.equiv nb112nf_count, 176.equiv nb112nf_mtx, 184.equiv nb112nf_outeriter, 192.equiv nb112nf_inneriter, 200.equiv nb112nf_work, 208 ;# stack offsets for local variables ;# bottom of stack is cache-aligned for sse use .equiv nb112nf_ixO, 0.equiv nb112nf_iyO, 16.equiv nb112nf_izO, 32.equiv nb112nf_ixH1, 48.equiv nb112nf_iyH1, 64.equiv nb112nf_izH1, 80.equiv nb112nf_ixH2, 96.equiv nb112nf_iyH2, 112.equiv nb112nf_izH2, 128.equiv nb112nf_jxO, 144.equiv nb112nf_jyO, 160.equiv nb112nf_jzO, 176.equiv nb112nf_jxH1, 192.equiv nb112nf_jyH1, 208.equiv nb112nf_jzH1, 224.equiv nb112nf_jxH2, 240.equiv nb112nf_jyH2, 256.equiv nb112nf_jzH2, 272.equiv nb112nf_qqOO, 288.equiv nb112nf_qqOH, 304.equiv nb112nf_qqHH, 320.equiv nb112nf_c6, 336.equiv nb112nf_c12, 352.equiv nb112nf_vctot, 368.equiv nb112nf_Vvdwtot, 384.equiv nb112nf_half, 400.equiv 
nb112nf_three, 416.equiv nb112nf_rsqOO, 432.equiv nb112nf_rsqOH1, 448.equiv nb112nf_rsqOH2, 464.equiv nb112nf_rsqH1O, 480.equiv nb112nf_rsqH1H1, 496.equiv nb112nf_rsqH1H2, 512.equiv nb112nf_rsqH2O, 528.equiv nb112nf_rsqH2H1, 544.equiv nb112nf_rsqH2H2, 560.equiv nb112nf_rinvOO, 576.equiv nb112nf_rinvOH1, 592.equiv nb112nf_rinvOH2, 608.equiv nb112nf_rinvH1O, 624.equiv nb112nf_rinvH1H1, 640.equiv nb112nf_rinvH1H2, 656.equiv nb112nf_rinvH2O, 672.equiv nb112nf_rinvH2H1, 688.equiv nb112nf_rinvH2H2, 704.equiv nb112nf_is3, 720.equiv nb112nf_ii3, 724.equiv nb112nf_nri, 740.equiv nb112nf_iinr, 748.equiv nb112nf_jindex, 756.equiv nb112nf_jjnr, 764.equiv nb112nf_shift, 772.equiv nb112nf_shiftvec, 780.equiv nb112nf_facel, 788.equiv nb112nf_innerjjnr, 796.equiv nb112nf_innerk, 804.equiv nb112nf_n, 812.equiv nb112nf_nn1, 816.equiv nb112nf_nouter, 820.equiv nb112nf_ninner, 824 push rbp mov rbp, rsp push rbx sub rsp, 840 emms ;# zero 32-bit iteration counters mov eax, 0 mov [rsp + nb112nf_nouter], eax mov [rsp + nb112nf_ninner], eax mov edi, [rdi] mov [rsp + nb112nf_nri], edi mov [rsp + nb112nf_iinr], rsi mov [rsp + nb112nf_jindex], rdx mov [rsp + nb112nf_jjnr], rcx mov [rsp + nb112nf_shift], r8 mov [rsp + nb112nf_shiftvec], r9 mov rsi, [rbp + nb112nf_p_facel] movss xmm0, [rsi] movss [rsp + nb112nf_facel], xmm0 ;# assume we have at least one i particle - start directly mov rcx, [rsp + nb112nf_iinr] ;# rcx = pointer into iinr[] mov ebx, [rcx] ;# ebx =ii mov rdx, [rbp + nb112nf_charge] movss xmm3, [rdx + rbx*4] movss xmm4, xmm3 movss xmm5, [rdx + rbx*4 + 4] mov rsi, [rbp + nb112nf_p_facel] movss xmm0, [rsi] movss xmm6, [rsp + nb112nf_facel] mulss xmm3, xmm3 mulss xmm4, xmm5 mulss xmm5, xmm5 mulss xmm3, xmm6 mulss xmm4, xmm6 mulss xmm5, xmm6 shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [rsp + nb112nf_qqOO], xmm3 movaps [rsp + nb112nf_qqOH], xmm4 movaps [rsp + nb112nf_qqHH], xmm5 xorps xmm0, xmm0 mov rdx, [rbp + nb112nf_type] mov ecx, [rdx + rbx*4] shl ecx, 1 
mov edx, ecx mov rdi, [rbp + nb112nf_p_ntype] imul ecx, [rdi] ;# rcx = ntia = 2*ntype*type[ii0] add edx, ecx mov rax, [rbp + nb112nf_vdwparam] movlps xmm0, [rax + rdx*4] movaps xmm1, xmm0 shufps xmm0, xmm0, 0 shufps xmm1, xmm1, 85 ;# 01010101 movaps [rsp + nb112nf_c6], xmm0 movaps [rsp + nb112nf_c12], xmm1 ;# create constant floating-point factors on stack mov eax, 0x3f000000 ;# half in IEEE (hex) mov [rsp + nb112nf_half], eax movss xmm1, [rsp + nb112nf_half] shufps xmm1, xmm1, 0 ;# splat to all elements movaps xmm2, xmm1 addps xmm2, xmm2 ;# one movaps xmm3, xmm2 addps xmm2, xmm2 ;# two addps xmm3, xmm2 ;# three movaps [rsp + nb112nf_half], xmm1 movaps [rsp + nb112nf_three], xmm3.nb112nf_threadloop: mov rsi, [rbp + nb112nf_count] ;# pointer to sync counter mov eax, [rsi].nb112nf_spinlock: mov ebx, eax ;# ebx=*count=nn0 add ebx, 1 ;# ebx=nn1=nn0+10 lock cmpxchg [esi], ebx ;# write nn1 to *counter, ;# if it hasnt changed. ;# or reread *counter to eax. pause ;# -> better p4 performance jnz .nb112nf_spinlock ;# if(nn1>nri) nn1=nri mov ecx, [rsp + nb112nf_nri] mov edx, ecx sub ecx, ebx cmovle ebx, edx ;# if(nn1>nri) nn1=nri ;# Cleared the spinlock if we got here. ;# eax contains nn0, ebx contains nn1. 
mov [rsp + nb112nf_n], eax mov [rsp + nb112nf_nn1], ebx sub ebx, eax ;# calc number of outer lists mov esi, eax ;# copy n to esi jg .nb112nf_outerstart jmp .nb112nf_end.nb112nf_outerstart: ;# ebx contains number of outer iterations add ebx, [rsp + nb112nf_nouter] mov [rsp + nb112nf_nouter], ebx.nb112nf_outer: mov rax, [rsp + nb112nf_shift] ;# rax = pointer into shift[] mov ebx, [rax + rsi*4] ;# rbx=shift[n] lea rbx, [rbx + rbx*2] ;# rbx=3*is mov [rsp + nb112nf_is3],ebx ;# store is3 mov rax, [rsp + nb112nf_shiftvec] ;# rax = base of shiftvec[] movss xmm0, [rax + rbx*4] movss xmm1, [rax + rbx*4 + 4] movss xmm2, [rax + rbx*4 + 8] mov rcx, [rsp + nb112nf_iinr] ;# rcx = pointer into iinr[] mov ebx, [rcx + rsi*4] ;# ebx =ii lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 mov rax, [rbp + nb112nf_pos] ;# rax = base of pos[] mov [rsp + nb112nf_ii3], ebx movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 addss xmm3, [rax + rbx*4] addss xmm4, [rax + rbx*4 + 4] addss xmm5, [rax + rbx*4 + 8] shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [rsp + nb112nf_ixO], xmm3 movaps [rsp + nb112nf_iyO], xmm4 movaps [rsp + nb112nf_izO], xmm5 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 addss xmm0, [rax + rbx*4 + 12] addss xmm1, [rax + rbx*4 + 16] addss xmm2, [rax + rbx*4 + 20] addss xmm3, [rax + rbx*4 + 24] addss xmm4, [rax + rbx*4 + 28]
⌨️ 快捷键说明
复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?