nb_kernel202_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,314 行 · 第 1/5 页
S
2,314 行
;# increment i force movsd xmm3, [rdi + rcx*8 + 24] movsd xmm4, [rdi + rcx*8 + 32] movsd xmm5, [rdi + rcx*8 + 40] subsd xmm3, xmm0 subsd xmm4, xmm1 subsd xmm5, xmm2 movsd [rdi + rcx*8 + 24], xmm3 movsd [rdi + rcx*8 + 32], xmm4 movsd [rdi + rcx*8 + 40], xmm5 ;# accumulate force in xmm6/xmm7 for fshift addsd xmm7, xmm2 unpcklpd xmm0, xmm1 addpd xmm6, xmm0 ;# accumulate H2i forces in xmm0, xmm1, xmm2 movapd xmm0, [rsp + nb202_fixH2] movapd xmm1, [rsp + nb202_fiyH2] movapd xmm2, [rsp + nb202_fizH2] movhlps xmm3, xmm0 movhlps xmm4, xmm1 movhlps xmm5, xmm2 addsd xmm0, xmm3 addsd xmm1, xmm4 addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2 movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 ;# increment i force movsd xmm3, [rdi + rcx*8 + 48] movsd xmm4, [rdi + rcx*8 + 56] movsd xmm5, [rdi + rcx*8 + 64] subsd xmm3, xmm0 subsd xmm4, xmm1 subsd xmm5, xmm2 movsd [rdi + rcx*8 + 48], xmm3 movsd [rdi + rcx*8 + 56], xmm4 movsd [rdi + rcx*8 + 64], xmm5 ;# accumulate force in xmm6/xmm7 for fshift addsd xmm7, xmm2 unpcklpd xmm0, xmm1 addpd xmm6, xmm0 ;# increment fshift force movlpd xmm3, [rsi + rdx*8] movhpd xmm3, [rsi + rdx*8 + 8] movsd xmm4, [rsi + rdx*8 + 16] subpd xmm3, xmm6 subsd xmm4, xmm7 movlpd [rsi + rdx*8], xmm3 movhpd [rsi + rdx*8 + 8], xmm3 movsd [rsi + rdx*8 + 16], xmm4 ;# get n from stack mov esi, [rsp + nb202_n] ;# get group index for i particle mov rdx, [rbp + nb202_gid] ;# base of gid[] mov edx, [rdx + rsi*4] ;# ggid=gid[n] ;# accumulate total potential energy and update it movapd xmm7, [rsp + nb202_vctot] ;# accumulate movhlps xmm6, xmm7 addsd xmm7, xmm6 ;# low xmm7 has the sum now ;# add earlier value from mem mov rax, [rbp + nb202_Vc] addsd xmm7, [rax + rdx*8] ;# move back to mem movsd [rax + rdx*8], xmm7 ;# finish if last mov ecx, [rsp + nb202_nn1] ;# esi already loaded with n inc esi sub ecx, esi jz .nb202_outerend ;# not last, iterate outer loop once more! 
mov [rsp + nb202_n], esi jmp .nb202_outer.nb202_outerend: ;# check if more outer neighborlists remain mov ecx, [rsp + nb202_nri] ;# esi already loaded with n above sub ecx, esi jz .nb202_end ;# non-zero, do one more workunit jmp .nb202_threadloop.nb202_end: mov eax, [rsp + nb202_nouter] mov ebx, [rsp + nb202_ninner] mov rcx, [rbp + nb202_outeriter] mov rdx, [rbp + nb202_inneriter] mov [rcx], eax mov [rdx], ebx add rsp, 1544 emms pop r15 pop r14 pop r13 pop r12 pop rbx pop rbp ret .globl nb_kernel202nf_x86_64_sse2.globl _nb_kernel202nf_x86_64_sse2nb_kernel202nf_x86_64_sse2: _nb_kernel202nf_x86_64_sse2: ;# Room for return address and rbp (16 bytes).equiv nb202nf_fshift, 16.equiv nb202nf_gid, 24.equiv nb202nf_pos, 32.equiv nb202nf_faction, 40.equiv nb202nf_charge, 48.equiv nb202nf_p_facel, 56.equiv nb202nf_argkrf, 64.equiv nb202nf_argcrf, 72.equiv nb202nf_Vc, 80.equiv nb202nf_type, 88.equiv nb202nf_p_ntype, 96.equiv nb202nf_vdwparam, 104.equiv nb202nf_Vvdw, 112.equiv nb202nf_p_tabscale, 120.equiv nb202nf_VFtab, 128.equiv nb202nf_invsqrta, 136.equiv nb202nf_dvda, 144.equiv nb202nf_p_gbtabscale, 152.equiv nb202nf_GBtab, 160.equiv nb202nf_p_nthreads, 168.equiv nb202nf_count, 176.equiv nb202nf_mtx, 184.equiv nb202nf_outeriter, 192.equiv nb202nf_inneriter, 200.equiv nb202nf_work, 208 ;# stack offsets for local variables ;# bottom of stack is cache-aligned for sse use .equiv nb202nf_ixO, 0.equiv nb202nf_iyO, 16.equiv nb202nf_izO, 32.equiv nb202nf_ixH1, 48.equiv nb202nf_iyH1, 64.equiv nb202nf_izH1, 80.equiv nb202nf_ixH2, 96.equiv nb202nf_iyH2, 112.equiv nb202nf_izH2, 128.equiv nb202nf_jxO, 144.equiv nb202nf_jyO, 160.equiv nb202nf_jzO, 176.equiv nb202nf_jxH1, 192.equiv nb202nf_jyH1, 208.equiv nb202nf_jzH1, 224.equiv nb202nf_jxH2, 240.equiv nb202nf_jyH2, 256.equiv nb202nf_jzH2, 272.equiv nb202nf_qqOO, 288.equiv nb202nf_qqOH, 304.equiv nb202nf_qqHH, 320.equiv nb202nf_vctot, 336.equiv nb202nf_half, 352.equiv nb202nf_three, 368.equiv nb202nf_rsqOO, 384.equiv nb202nf_rsqOH1, 
400.equiv nb202nf_rsqOH2, 416.equiv nb202nf_rsqH1O, 432.equiv nb202nf_rsqH1H1, 448.equiv nb202nf_rsqH1H2, 464.equiv nb202nf_rsqH2O, 480.equiv nb202nf_rsqH2H1, 496.equiv nb202nf_rsqH2H2, 512.equiv nb202nf_rinvOO, 528.equiv nb202nf_rinvOH1, 544.equiv nb202nf_rinvOH2, 560.equiv nb202nf_rinvH1O, 576.equiv nb202nf_rinvH1H1, 592.equiv nb202nf_rinvH1H2, 608.equiv nb202nf_rinvH2O, 624.equiv nb202nf_rinvH2H1, 640.equiv nb202nf_rinvH2H2, 656.equiv nb202nf_krf, 672.equiv nb202nf_crf, 688.equiv nb202nf_is3, 704.equiv nb202nf_ii3, 708.equiv nb202nf_nri, 712.equiv nb202nf_iinr, 720.equiv nb202nf_jindex, 728.equiv nb202nf_jjnr, 736.equiv nb202nf_shift, 744.equiv nb202nf_shiftvec, 752.equiv nb202nf_facel, 760.equiv nb202nf_innerjjnr, 768.equiv nb202nf_innerk, 776.equiv nb202nf_n, 780.equiv nb202nf_nn1, 784.equiv nb202nf_nouter, 788.equiv nb202nf_ninner, 792 push rbp mov rbp, rsp push rbx emms push r12 push r13 push r14 push r15 sub rsp, 808 ;# local variable stack space (n*16+8) ;# zero 32-bit iteration counters mov eax, 0 mov [rsp + nb202nf_nouter], eax mov [rsp + nb202nf_ninner], eax mov edi, [rdi] mov [rsp + nb202nf_nri], edi mov [rsp + nb202nf_iinr], rsi mov [rsp + nb202nf_jindex], rdx mov [rsp + nb202nf_jjnr], rcx mov [rsp + nb202nf_shift], r8 mov [rsp + nb202nf_shiftvec], r9 mov rsi, [rbp + nb202nf_p_facel] movsd xmm0, [rsi] movsd [rsp + nb202nf_facel], xmm0 mov rsi, [rbp + nb202nf_argkrf] mov rdi, [rbp + nb202nf_argcrf] movsd xmm1, [rsi] movsd xmm2, [rdi] shufpd xmm1, xmm1, 0 shufpd xmm2, xmm2, 0 movapd [rsp + nb202nf_krf], xmm1 movapd [rsp + nb202nf_crf], xmm2 ;# create constant floating-point factors on stack mov eax, 0x00000000 ;# lower half of double half IEEE (hex) mov ebx, 0x3fe00000 mov [rsp + nb202nf_half], eax mov [rsp + nb202nf_half+4], ebx movsd xmm1, [rsp + nb202nf_half] shufpd xmm1, xmm1, 0 ;# splat to all elements movapd xmm3, xmm1 addpd xmm3, xmm3 ;# one movapd xmm2, xmm3 addpd xmm2, xmm2 ;# two addpd xmm3, xmm2 ;# three movapd [rsp + nb202nf_half], xmm1 
;# ---------------------------------------------------------------------
;# nb_kernel202nf_x86_64_sse2: end of constant setup, one-time charge
;# products, and the work-unit claim loop.
;#
;# Reads : xmm3 (3.0 in both lanes, built by the instructions just above),
;#         iinr[0], charge[ii], charge[ii+1], facel, *count, nri
;# Writes: nb202nf_three, nb202nf_qqOO, nb202nf_qqOH, nb202nf_qqHH, *count
;# Leaves: eax = nn0 (first list index claimed)
;#         ebx = nn1 = min(nn0 + 1, nri)
;#         edx = nri, ecx = nri - (nn0 + 1), rsi = pointer to sync counter
;# ---------------------------------------------------------------------
        movapd  [rsp + nb202nf_three], xmm3     ;# store the splatted constant three

        ;# assume we have at least one i particle - start directly
        mov     rcx, [rsp + nb202nf_iinr]       ;# rcx = pointer into iinr[]
        mov     ebx, [rcx]                      ;# ebx = ii (32-bit write also clears upper rbx)
        mov     rdx, [rbp + nb202nf_charge]     ;# rdx = base of charge[]
        movsd   xmm3, [rdx + rbx*8]             ;# xmm3 = charge[ii]
        movsd   xmm4, xmm3                      ;# xmm4 = charge[ii]
        movsd   xmm5, [rdx + rbx*8 + 8]         ;# xmm5 = charge[ii+1]
        movsd   xmm6, [rsp + nb202nf_facel]     ;# xmm6 = facel

        mulsd   xmm3, xmm3                      ;# charge[ii]   * charge[ii]
        mulsd   xmm4, xmm5                      ;# charge[ii]   * charge[ii+1]
        mulsd   xmm5, xmm5                      ;# charge[ii+1] * charge[ii+1]
        mulsd   xmm3, xmm6                      ;# scale all three products by facel
        mulsd   xmm4, xmm6
        mulsd   xmm5, xmm6

        shufpd  xmm3, xmm3, 0                   ;# copy the low double into both lanes
        shufpd  xmm4, xmm4, 0
        shufpd  xmm5, xmm5, 0
        movapd  [rsp + nb202nf_qqOO], xmm3
        movapd  [rsp + nb202nf_qqOH], xmm4
        movapd  [rsp + nb202nf_qqHH], xmm5

.nb202nf_threadloop:
        mov     rsi, [rbp + nb202nf_count]      ;# pointer to sync counter
        mov     eax, [rsi]                      ;# eax = current *count
.nb202nf_spinlock:
        mov     ebx, eax                        ;# ebx = *count = nn0
        add     ebx, 1                          ;# ebx = nn1 = nn0 + 1
        ;# Write nn1 to *count if it still equals eax (ZF=1); otherwise the
        ;# current *count is reloaded into eax (ZF=0) and we retry.
        ;# The counter must be addressed through the full 64-bit rsi: [esi]
        ;# would use only the low 32 bits of the pointer in 64-bit mode.
        lock cmpxchg [rsi], ebx
        pause                                   ;# -> better p4 performance; leaves flags untouched
        jnz     .nb202nf_spinlock               ;# ZF still comes from cmpxchg

        ;# if(nn1>nri) nn1=nri
        mov     ecx, [rsp + nb202nf_nri]
        mov     edx, ecx                        ;# edx = nri
        sub     ecx, ebx                        ;# ecx = nri - nn1, sets flags for cmovle
        cmovle  ebx, edx                        ;# clamp nn1 to nri
        ;# Cleared the spinlock if we got here.
        ;# eax contains nn0, ebx contains nn1.
mov [rsp + nb202nf_n], eax mov [rsp + nb202nf_nn1], ebx sub ebx, eax ;# calc number of outer lists mov esi, eax ;# copy n to esi jg .nb202nf_outerstart jmp .nb202nf_end.nb202nf_outerstart: ;# ebx contains number of outer iterations add ebx, [rsp + nb202nf_nouter] mov [rsp + nb202nf_nouter], ebx.nb202nf_outer: mov rax, [rsp + nb202nf_shift] ;# rax = pointer into shift[] mov ebx, [rax+rsi*4] ;# rbx=shift[n] lea rbx, [rbx + rbx*2] ;# rbx=3*is mov rax, [rsp + nb202nf_shiftvec] ;# rax = base of shiftvec[] movsd xmm0, [rax + rbx*8] movsd xmm1, [rax + rbx*8 + 8] movsd xmm2, [rax + rbx*8 + 16] mov rcx, [rsp + nb202nf_iinr] ;# rcx = pointer into iinr[] mov ebx, [rcx+rsi*4] ;# ebx =ii lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 mov rax, [rbp + nb202nf_pos] ;# rax = base of pos[] mov [rsp + nb202nf_ii3], ebx movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 addsd xmm3, [rax + rbx*8] addsd xmm4, [rax + rbx*8 + 8] addsd xmm5, [rax + rbx*8 + 16] shufpd xmm3, xmm3, 0 shufpd xmm4, xmm4, 0 shufpd xmm5, xmm5, 0 movapd [rsp + nb202nf_ixO], xmm3 movapd [rsp + nb202nf_iyO], xmm4 movapd [rsp + nb202nf_izO], xmm5 movsd xmm3, xmm0 movsd xmm4, xmm1 movsd xmm5, xmm2 addsd xmm0, [rax + rbx*8 + 24] addsd xmm1, [rax + rbx*8 + 32] addsd xmm2, [rax + rbx*8 + 40] addsd xmm3, [rax + rbx*8 + 48] addsd xmm4, [rax + rbx*8 + 56] addsd xmm5, [rax + rbx*8 + 64] shufpd xmm0, xmm0, 0 shufpd xmm1, xmm1, 0 shufpd xmm2, xmm2, 0 shufpd xmm3, xmm3, 0 shufpd xmm4, xmm4, 0 shufpd xmm5, xmm5, 0 movapd [rsp + nb202nf_ixH1], xmm0 movapd [rsp + nb202nf_iyH1], xmm1 movapd [rsp + nb202nf_izH1], xmm2 movapd [rsp + nb202nf_ixH2], xmm3 movapd [rsp + nb202nf_iyH2], xmm4 movapd [rsp + nb202nf_izH2], xmm5 ;# clear vctot xorpd xmm4, xmm4 movapd [rsp + nb202nf_vctot], xmm4 mov rax, [rsp + nb202nf_jindex] mov ecx, [rax + rsi*4] ;# jindex[n] mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] sub edx, ecx ;# number of innerloop atoms mov rsi, [rbp + nb202nf_pos] mov rax, [rsp + nb202nf_jjnr] shl ecx, 2 add rax, rcx mov [rsp + 
nb202nf_innerjjnr], rax ;# pointer to jjnr[nj0] mov ecx, edx sub edx, 2 add ecx, [rsp + nb202nf_ninner] mov [rsp + nb202nf_ninner], ecx add edx, 0 mov [rsp + nb202nf_innerk], edx ;# number of innerloop atoms jge .nb202nf_unroll_loop jmp .nb202nf_checksingle.nb202nf_unroll_loop: ;# twice unrolled innerloop here mov rdx, [rsp + nb202nf_innerjjnr] ;# pointer to jjnr[k] mov eax, [rdx] mov ebx, [rdx + 4] add qword ptr [rsp + nb202nf_innerjjnr], 8 ;# advance pointer (unrolled 2) mov rsi, [rbp + nb202nf_pos] ;# base of pos[] lea rax, [rax + rax*2] ;# replace jnr with j3 lea rbx, [rbx + rbx*2] ;# move j coordinates to local temp variables movlpd xmm2, [rsi + rax*8] movlpd xmm3, [rsi + rax*8 + 8] movlpd xmm4, [rsi + rax*8 + 16] movlpd xmm5, [rsi + rax*8 + 24] movlpd xmm6, [rsi + rax*8 + 32] movlpd xmm7, [rsi + rax*8 + 40] movhpd xmm2, [rsi + rbx*8] movhpd xmm3, [rsi + rbx*8 + 8] movhpd xmm4, [rsi + rbx*8 + 16] movhpd xmm5, [rsi + rbx*8 + 24] movhpd xmm6, [rsi + rbx*8 + 32] movhpd xmm7, [rsi + rbx*8 + 40] movapd [rsp + nb202nf_jxO], xmm2 movapd [rsp + nb202nf_jyO], xmm3 movapd [rsp + nb202nf_jzO], xmm4 movapd [rsp + nb202nf_jxH1], xmm5 movapd [rsp + nb202nf_jyH1], xmm6 movapd [rsp + nb202nf_jzH1], xmm7 movlpd xmm2, [rsi + rax*8 + 48] movlpd xmm3, [rsi + rax*8 + 56] movlpd xmm4, [rsi + rax*8 + 64] movhpd xmm2, [rsi + rbx*8 + 48] movhpd xmm3, [rsi + rbx*8 + 56] movhpd xmm4, [rsi + rbx*8 + 64] movapd [rsp + nb202nf_jxH2], xmm2 movapd [rsp + nb202nf_jyH2], xmm3 movapd [rsp + nb202nf_jzH2], xmm4 movapd xmm0, [rsp + nb202nf_ixO] movapd xmm1, [rsp + nb202nf_iyO] movapd xmm2, [rsp + nb202nf_izO] movapd xmm3, [rsp + nb202nf_ixO] movapd xmm4, [rsp + nb202nf_iyO] movapd xmm5, [rsp + nb202nf_izO] subpd xmm0, [rsp + nb202nf_jxO] subpd xmm1, [rsp + nb202nf_jyO] subpd xmm2, [rsp + nb202nf_jzO] subpd xmm3, [rsp + nb202nf_jxH1] subpd xmm4, [rsp + nb202nf_jyH1] subpd xmm5, [rsp + nb202nf_jzH1] mulpd xmm0, xmm0 mulpd xmm1, xmm1 mulpd xmm2, xmm2 mulpd xmm3, xmm3 mulpd xmm4, xmm4 mulpd xmm5, 
xmm5 addpd xmm0, xmm1 addpd xmm0, xmm2 addpd xmm3, xmm4 addpd xmm3, xmm5 movapd [rsp + nb202nf_rsqOO], xmm0 movapd [rsp + nb202nf_rsqOH1], xmm3 movapd xmm0, [rsp + nb202nf_ixO] movapd xmm1, [rsp + nb202nf_iy
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?