nb_kernel110_x86_64_sse2.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,111 行 · 第 1/3 页
S
1,111 行
## now update f_i
## Tail of the two-pair unrolled inner loop: fold this iteration's forces
## (xmm9-xmm11) into the i-atom accumulators xmm13-xmm15, then store the
## low halves of xmm6-xmm8 at index rax and the high halves at index rbx.
## NOTE(review): rdi/rax/rbx are set up outside this excerpt - presumably
## the force array base and the two j-atom offsets; confirm.
        addpd   %xmm9,%xmm13
        addpd   %xmm10,%xmm14
        addpd   %xmm11,%xmm15
        movlpd  %xmm6,(%rdi,%rax,8)
        movlpd  %xmm7,8(%rdi,%rax,8)
        movlpd  %xmm8,16(%rdi,%rax,8)
        movhpd  %xmm6,(%rdi,%rbx,8)
        movhpd  %xmm7,8(%rdi,%rbx,8)
        movhpd  %xmm8,16(%rdi,%rbx,8)

        ## should we do one more iteration?
        subl    $2,nb110_innerk(%rsp)
        jl      _nb_kernel110_x86_64_sse2.nb110_checksingle
        jmp     _nb_kernel110_x86_64_sse2.nb110_unroll_loop

_nb_kernel110_x86_64_sse2.nb110_checksingle:
        ## Bit 0 of innerk set => one leftover pair to process.
        movl    nb110_innerk(%rsp),%edx
        andl    $1,%edx
        jnz     _nb_kernel110_x86_64_sse2.nb110_dosingle
        jmp     _nb_kernel110_x86_64_sse2.nb110_updateouterdata

_nb_kernel110_x86_64_sse2.nb110_dosingle:
        ## Scalar (low-lane only) version of the pair interaction for one j atom.
        movq    nb110_innerjjnr(%rsp),%rcx
        movl    (%rcx),%eax                     ## eax = jnr

        movq    nb110_charge(%rbp),%rsi         ## base of charge[]
        movsd   (%rsi,%rax,8),%xmm3
        mulsd   nb110_iq(%rsp),%xmm3            ## qq = iq*charge[jnr]

        movq    nb110_type(%rbp),%rsi
        movl    (%rsi,%rax,4),%r8d              ## r8d = type[jnr]
        movq    nb110_vdwparam(%rbp),%rsi
        shll    %r8d                            ## two doubles (c6,c12) per entry
        movl    nb110_ntia(%rsp),%edi
        addl    %edi,%r8d                       ## r8d = ntia + 2*type[jnr]
        movsd   (%rsi,%r8,8),%xmm4              ## c6
        movsd   8(%rsi,%r8,8),%xmm6             ## c12
        movapd  %xmm4,nb110_c6(%rsp)
        movapd  %xmm6,nb110_c12(%rsp)

        movq    nb110_pos(%rbp),%rsi            ## base of pos[]
        lea     (%rax,%rax,2),%rax              ## replace jnr with j3 = 3*jnr

        ## load the single j atom's x/y/z into low xmm4-xmm6
        movsd   (%rsi,%rax,8),%xmm4
        movsd   8(%rsi,%rax,8),%xmm5
        movsd   16(%rsi,%rax,8),%xmm6

        ## calc dr = pos[j] - i position
        subsd   nb110_ix(%rsp),%xmm4
        subsd   nb110_iy(%rsp),%xmm5
        subsd   nb110_iz(%rsp),%xmm6

        ## store dr (scaled into the force vector further down)
        movapd  %xmm4,%xmm9
        movapd  %xmm5,%xmm10
        movapd  %xmm6,%xmm11

        ## square it
        mulsd   %xmm4,%xmm4
        mulsd   %xmm5,%xmm5
        mulsd   %xmm6,%xmm6
        addsd   %xmm5,%xmm4
        addsd   %xmm6,%xmm4                     ## rsq in xmm4

        ## single-precision reciprocal-sqrt estimate as the seed
        cvtsd2ss %xmm4,%xmm5
        rsqrtss %xmm5,%xmm5
        cvtss2sd %xmm5,%xmm2                    ## lu in low xmm2

        ## first refinement: lu' = half*lu*(three - rsq*lu*lu)
        movapd  %xmm2,%xmm5                     ## copy of lu
        mulsd   %xmm2,%xmm2                     ## lu*lu
        movapd  nb110_three(%rsp),%xmm1
        mulsd   %xmm4,%xmm2                     ## rsq*lu*lu
        movapd  nb110_half(%rsp),%xmm0
        subsd   %xmm2,%xmm1                     ## three - rsq*lu*lu
        mulsd   %xmm5,%xmm1
        mulsd   %xmm0,%xmm1                     ## xmm1 = iter1 of rinv (new lu)

        ## second refinement, same formula, result goes to xmm0
        movapd  %xmm1,%xmm5                     ## copy of lu
        mulsd   %xmm1,%xmm1                     ## lu*lu
        movapd  nb110_three(%rsp),%xmm2
        mulsd   %xmm4,%xmm1                     ## rsq*lu*lu
        movapd  nb110_half(%rsp),%xmm0
        subsd   %xmm1,%xmm2                     ## three - rsq*lu*lu
        mulsd   %xmm5,%xmm2
        mulsd   %xmm2,%xmm0                     ## xmm0 = rinv

        movapd  %xmm0,%xmm4
        mulsd   %xmm4,%xmm4                     ## xmm4 = rinvsq
        movapd  %xmm4,%xmm1
        mulsd   %xmm4,%xmm1
        mulsd   %xmm4,%xmm1                     ## xmm1 = rinvsix
        movapd  %xmm1,%xmm2
        mulsd   %xmm2,%xmm2                     ## xmm2 = rinvtwelve

        mulsd   %xmm0,%xmm3                     ## xmm3 = vcoul = qq*rinv
        mulsd   nb110_c6(%rsp),%xmm1            ## Vvdw6
        mulsd   nb110_c12(%rsp),%xmm2           ## Vvdw12
        movapd  %xmm2,%xmm5
        subsd   %xmm1,%xmm5                     ## Vvdw = Vvdw12 - Vvdw6
        addsd   nb110_Vvdwtot(%rsp),%xmm5
        mulsd   nb110_six(%rsp),%xmm1
        mulsd   nb110_twelve(%rsp),%xmm2
        subsd   %xmm1,%xmm2
        addsd   %xmm3,%xmm2
        mulsd   %xmm2,%xmm4                     ## xmm4 = total fscal
        addsd   %xmm3,%xmm12                    ## add to vctot
        movsd   %xmm5,nb110_Vvdwtot(%rsp)       ## only the low 8 bytes are rewritten

        movq    nb110_faction(%rbp),%rdi
        mulsd   %xmm4,%xmm9                     ## force vector = fscal*dr
        mulsd   %xmm4,%xmm10
        mulsd   %xmm4,%xmm11

        ## now update f_i
        addsd   %xmm9,%xmm13
        addsd   %xmm10,%xmm14
        addsd   %xmm11,%xmm15

        ## the fj's - start by accumulating forces from memory
        addsd   (%rdi,%rax,8),%xmm9
        addsd   8(%rdi,%rax,8),%xmm10
        addsd   16(%rdi,%rax,8),%xmm11
        movsd   %xmm9,(%rdi,%rax,8)
        movsd   %xmm10,8(%rdi,%rax,8)
        movsd   %xmm11,16(%rdi,%rax,8)

_nb_kernel110_x86_64_sse2.nb110_updateouterdata:
        movl    nb110_ii3(%rsp),%ecx
        movq    nb110_faction(%rbp),%rdi
        movq    nb110_fshift(%rbp),%rsi
        movl    nb110_is3(%rsp),%edx

        ## accumulate i forces in xmm13, xmm14, xmm15:
        ## add the high lane of each accumulator onto its low lane
        movhlps %xmm13,%xmm3
        movhlps %xmm14,%xmm4
        movhlps %xmm15,%xmm5
        addsd   %xmm3,%xmm13
        addsd   %xmm4,%xmm14
        addsd   %xmm5,%xmm15                    ## sum is in low xmm13-xmm15

        ## increment i force (the accumulated sum is subtracted from faction[ii3..])
        movsd   (%rdi,%rcx,8),%xmm3
        movsd   8(%rdi,%rcx,8),%xmm4
        movsd   16(%rdi,%rcx,8),%xmm5
        subsd   %xmm13,%xmm3
        subsd   %xmm14,%xmm4
        subsd   %xmm15,%xmm5
        movsd   %xmm3,(%rdi,%rcx,8)
        movsd   %xmm4,8(%rdi,%rcx,8)
        movsd   %xmm5,16(%rdi,%rcx,8)

        ## increment fshift force (same sum subtracted from fshift[is3..])
        movsd   (%rsi,%rdx,8),%xmm3
        movsd   8(%rsi,%rdx,8),%xmm4
        movsd   16(%rsi,%rdx,8),%xmm5
        subsd   %xmm13,%xmm3
        subsd   %xmm14,%xmm4
        subsd   %xmm15,%xmm5
        movsd   %xmm3,(%rsi,%rdx,8)
        movsd   %xmm4,8(%rsi,%rdx,8)
        movsd   %xmm5,16(%rsi,%rdx,8)

        ## get n from stack
        movl    nb110_n(%rsp),%esi
## get group index for i particle
## (continues the outer-loop update; esi holds n on entry to this part)
        movq    nb110_gid(%rbp),%rdx            ## base of gid[]
        movl    (%rdx,%rsi,4),%edx              ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movhlps %xmm12,%xmm6
        addsd   %xmm6,%xmm12                    ## low xmm12 has the sum now

        ## add earlier value from mem
        movq    nb110_Vc(%rbp),%rax
        addsd   (%rax,%rdx,8),%xmm12
        ## move back to mem
        movsd   %xmm12,(%rax,%rdx,8)

        ## accumulate total lj energy and update it
        movapd  nb110_Vvdwtot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addsd   %xmm6,%xmm7                     ## low xmm7 has the sum now

        ## add earlier value from mem
        movq    nb110_Vvdw(%rbp),%rax
        addsd   (%rax,%rdx,8),%xmm7
        ## move back to mem
        movsd   %xmm7,(%rax,%rdx,8)

        ## finish if last
        movl    nb110_nn1(%rsp),%ecx
        ## esi already loaded with n
        incl    %esi
        subl    %esi,%ecx                       ## zero when n+1 == nn1
        jz      _nb_kernel110_x86_64_sse2.nb110_outerend

        ## not last, iterate outer loop once more!
        movl    %esi,nb110_n(%rsp)
        jmp     _nb_kernel110_x86_64_sse2.nb110_outer

_nb_kernel110_x86_64_sse2.nb110_outerend:
        ## check if more outer neighborlists remain
        movl    nb110_nri(%rsp),%ecx
        ## esi already loaded with n above
        subl    %esi,%ecx                       ## zero when n == nri
        jz      _nb_kernel110_x86_64_sse2.nb110_end
        ## non-zero, do one more workunit
        jmp     _nb_kernel110_x86_64_sse2.nb110_threadloop

_nb_kernel110_x86_64_sse2.nb110_end:
        ## Report the iteration counters through the caller's pointers.
        movl    nb110_nouter(%rsp),%eax
        movl    nb110_ninner(%rsp),%ebx
        movq    nb110_outeriter(%rbp),%rcx
        movq    nb110_inneriter(%rbp),%rdx
        movl    %eax,(%rcx)
        movl    %ebx,(%rdx)

        ## Release locals and restore the saved registers.
        ## NOTE(review): 408 must match the subq in this function's prologue,
        ## which is outside this excerpt - confirm.
        addq    $408,%rsp
        emms

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx
        pop     %rbp
        ret


## ---------------------------------------------------------------------
## nb_kernel110nf_x86_64_sse2
## Entry point exported under both the plain and the underscore-prefixed
## symbol name.  Register arguments are consumed in the prologue below:
##   rdi -> 32-bit nri (dereferenced), rsi = iinr, rdx = jindex,
##   rcx = jjnr, r8 = shift, r9 = shiftvec.
## The remaining arguments are read from the caller's stack through rbp
## using the nb110nf_* offsets defined here.
## ---------------------------------------------------------------------
.globl nb_kernel110nf_x86_64_sse2
.globl _nb_kernel110nf_x86_64_sse2
nb_kernel110nf_x86_64_sse2:
_nb_kernel110nf_x86_64_sse2:
        ## Room for return address and rbp (16 bytes)
.set nb110nf_fshift, 16
.set nb110nf_gid, 24
.set nb110nf_pos, 32
.set nb110nf_faction, 40
.set nb110nf_charge, 48
.set nb110nf_p_facel, 56
.set nb110nf_argkrf, 64
.set nb110nf_argcrf, 72
.set nb110nf_Vc, 80
.set nb110nf_type, 88
.set nb110nf_p_ntype, 96
.set nb110nf_vdwparam, 104
.set nb110nf_Vvdw, 112
.set nb110nf_p_tabscale, 120
.set nb110nf_VFtab, 128
.set nb110nf_invsqrta, 136
.set nb110nf_dvda, 144
.set nb110nf_p_gbtabscale, 152
.set nb110nf_GBtab, 160
.set nb110nf_p_nthreads, 168
.set nb110nf_count, 176
.set nb110nf_mtx, 184
.set nb110nf_outeriter, 192
.set nb110nf_inneriter, 200
.set nb110nf_work, 208
        ## stack offsets for local variables
        ## bottom of stack is cache-aligned for sse2 use
.set nb110nf_ix, 0
.set nb110nf_iy, 16
.set nb110nf_iz, 32
.set nb110nf_iq, 48
.set nb110nf_c6, 64
.set nb110nf_c12, 80
.set nb110nf_vctot, 96
.set nb110nf_Vvdwtot, 112
.set nb110nf_half, 128
.set nb110nf_three, 144
.set nb110nf_is3, 160
.set nb110nf_ii3, 164
.set nb110nf_nri, 168
.set nb110nf_iinr, 176
.set nb110nf_jindex, 184
.set nb110nf_jjnr, 192
.set nb110nf_shift, 200
.set nb110nf_shiftvec, 208
.set nb110nf_facel, 216
.set nb110nf_innerjjnr, 224
.set nb110nf_ntia, 232
.set nb110nf_innerk, 236
.set nb110nf_n, 240
.set nb110nf_nn1, 244
.set nb110nf_ntype, 248
.set nb110nf_nouter, 252
.set nb110nf_ninner, 256

        ## Prologue: frame pointer plus five saved registers (48 bytes of
        ## pushes), then 280 bytes of locals; with the return address that
        ## totals 336 bytes, so rsp is 16-byte aligned for the movapd
        ## accesses to the locals, assuming the caller followed the usual
        ## 16-byte alignment at the call.
        push    %rbp
        movq    %rsp,%rbp
        push    %rbx

        emms

        push    %r12
        push    %r13
        push    %r14
        push    %r15
        subq    $280,%rsp                       ## local variable stack space (n*16+8)

        ## zero 32-bit iteration counters
        movl    $0,%eax
        movl    %eax,nb110nf_nouter(%rsp)
        movl    %eax,nb110nf_ninner(%rsp)

        ## copy the register arguments into the local frame
        movl    (%rdi),%edi                     ## nri is passed by pointer
        movl    %edi,nb110nf_nri(%rsp)
        movq    %rsi,nb110nf_iinr(%rsp)
        movq    %rdx,nb110nf_jindex(%rsp)
        movq    %rcx,nb110nf_jjnr(%rsp)
        movq    %r8,nb110nf_shift(%rsp)
        movq    %r9,nb110nf_shiftvec(%rsp)

        ## ntype and facel are passed by pointer on the stack
        movq    nb110nf_p_ntype(%rbp),%rdi
        movl    (%rdi),%edi
        movl    %edi,nb110nf_ntype(%rsp)
        movq    nb110nf_p_facel(%rbp),%rsi
        movsd   (%rsi),%xmm0
        movsd   %xmm0,nb110nf_facel(%rsp)

        ## create constant floating-point factors on stack
        ## 0x3fe00000:00000000 is the IEEE-754 double 0.5
        movl    $0x00000000,%eax                ## lower half of double half IEEE (hex)
        movl    $0x3fe00000,%ebx                ## upper half
        movl    %eax,nb110nf_half(%rsp)
        movl    %ebx,nb110nf_half+4(%rsp)
        movsd   nb110nf_half(%rsp),%xmm1
        shufpd  $0,%xmm1,%xmm1                  ## splat to all elements
        movapd  %xmm1,%xmm3
        addpd   %xmm3,%xmm3                     ## one   = half + half
        movapd  %xmm3,%xmm2
        addpd   %xmm2,%xmm2                     ## two   = one + one
        addpd   %xmm2,%xmm3                     ## three = one + two
        movapd  %xmm1,nb110nf_half(%rsp)
        movapd  %xmm3,nb110nf_three(%rsp)

_nb_kernel110nf_x86_64_sse2.nb110nf_threadloop:
## Claim the next outer-list index from the shared counter: read *count,
## then retry a locked compare-exchange until the incremented value is stored.
        movq    nb110nf_count(%rbp),%rsi        ## pointer to sync counter
        movl    (%rsi),%eax
_nb_kernel110nf_x86_64_sse2.nb110nf_spinlock:
        movl    %eax,%ebx                       ## ebx=*count=nn0
        addl    $1,%ebx                         ## ebx=nn1=nn0+1
        lock cmpxchgl %ebx,(%rsi)               ## write nn1 to *counter,
                                                ## if it hasnt changed.
                                                ## or reread *counter to eax.
        pause                                   ## -> better p4 performance
                                                ## (pause leaves the flags from cmpxchg intact)
        jnz     _nb_kernel110nf_x86_64_sse2.nb110nf_spinlock

        ## clamp: if nri-nn1 <= 0 then nn1=nri
        movl    nb110nf_nri(%rsp),%ecx
        movl    %ecx,%edx
        subl    %ebx,%ecx
        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri

        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl    %eax,nb110nf_n(%rsp)
        movl    %ebx,nb110nf_nn1(%rsp)
        subl    %eax,%ebx                       ## calc number of outer lists
        movl    %eax,%esi                       ## copy n to esi
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?