nb_kernel331_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,353 行 · 第 1/5 页

S
2,353
字号
    ;# -------------------------------------------------------------------
    ;# Tail of the nb331 kernel: end of the single-j-particle interaction,
    ;# the per-i-particle write-back (.nb331_updateouterdata), the outer
    ;# loop control and the routine epilogue, followed by the entry labels
    ;# and offset tables of nb_kernel331nf_x86_64_sse2.
    ;#
    ;# NOTE(review): this view starts in the middle of the routine. The
    ;# entry label, the prologue, the nb331_* offset definitions and the
    ;# jump targets .nb331_outer / .nb331_threadloop are not visible here.
    ;# rbp-relative operands are the routine's arguments and rsp-relative
    ;# operands its 16-byte-spaced local slots (see the nf tables below
    ;# for the same layout convention).
    ;# -------------------------------------------------------------------

    mulsd   xmm11, xmm7                 ;# lu*(3-rsq*lu*lu)
    movapd  xmm15, [rsp + nb331_half]
    mulsd   xmm9,  xmm15                ;# rinvO
    mulsd   xmm10, xmm15                ;# rinvH1
    mulsd   xmm11, xmm15                ;# rinvH2

    movapd  [rsp + nb331_rinvO],  xmm9
    movapd  [rsp + nb331_rinvH1], xmm10
    movapd  [rsp + nb331_rinvH2], xmm11

    ;# interactions
    ;# rsq in xmm0,xmm3,xmm6
    ;# rinv in xmm9, xmm10, xmm11

    ;# NOTE(review): xmm9 has not changed since the store three
    ;# instructions above, so this second store of rinvO is redundant.
    movapd  [rsp + nb331_rinvO], xmm9
    movapd  xmm1, [rsp + nb331_tsc]

    mulsd   xmm0, xmm9                  ;# r = rsq*rinv
    mulsd   xmm3, xmm10
    mulsd   xmm6, xmm11
    mulsd   xmm0, xmm1                  ;# rtab = r*tsc
    mulsd   xmm3, xmm1
    mulsd   xmm6, xmm1

    ;# truncate and convert to integers
    cvttsd2si r8d,  xmm0
    cvttsd2si r10d, xmm3
    cvttsd2si r12d, xmm6

    ;# convert back to float
    cvtsi2sd  xmm2, r8d
    cvtsi2sd  xmm5, r10d
    cvtsi2sd  xmm8, r12d

    ;# multiply by 4
    shl     r8d,  2
    shl     r10d, 2
    shl     r12d, 2

    ;# multiply by 3 -> index*12; together with the *8 scale in the
    ;# addressing below, each table point spans 12 doubles (96 bytes).
    lea     r8,  [r8 + r8*2]
    lea     r10, [r10 + r10*2]
    lea     r12, [r12 + r12*2]

    mov     rsi, [rbp + nb331_VFtab]

    ;# calculate eps (fractional part: rtab - trunc(rtab))
    subsd   xmm0, xmm2
    subsd   xmm3, xmm5
    subsd   xmm6, xmm8

    movapd  xmm12, xmm0                 ;# eps for O
    movapd  xmm13, xmm3                 ;# eps for H1
    movapd  xmm14, xmm6                 ;# eps for H2

    ;# Load LOTS of table data (first four doubles of each table point)
    movsd   xmm0,  [rsi + r8*8]
    movsd   xmm1,  [rsi + r8*8 + 8]
    movsd   xmm2,  [rsi + r8*8 + 16]
    movsd   xmm3,  [rsi + r8*8 + 24]

    movsd   xmm4,  [rsi + r10*8]
    movsd   xmm5,  [rsi + r10*8 + 8]
    movsd   xmm6,  [rsi + r10*8 + 16]
    movsd   xmm7,  [rsi + r10*8 + 24]

    movsd   xmm8,  [rsi + r12*8]
    movsd   xmm9,  [rsi + r12*8 + 8]
    movsd   xmm10, [rsi + r12*8 + 16]
    movsd   xmm11, [rsi + r12*8 + 24]
    ;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

    mulsd   xmm3,  xmm12                ;# Heps
    mulsd   xmm7,  xmm13
    mulsd   xmm11, xmm14
    mulsd   xmm2,  xmm12                ;# Geps
    mulsd   xmm6,  xmm13
    mulsd   xmm10, xmm14
    mulsd   xmm3,  xmm12                ;# Heps2
    mulsd   xmm7,  xmm13
    mulsd   xmm11, xmm14

    addsd   xmm1,  xmm2                 ;# F+Geps
    addsd   xmm5,  xmm6
    addsd   xmm9,  xmm10
    addsd   xmm1,  xmm3                 ;# F+Geps+Heps2 = Fp
    addsd   xmm5,  xmm7
    addsd   xmm9,  xmm11
    addsd   xmm3,  xmm3                 ;# 2*Heps2
    addsd   xmm7,  xmm7
    addsd   xmm11, xmm11
    addsd   xmm3,  xmm2                 ;# 2*Heps2+Geps
    addsd   xmm7,  xmm6
    addsd   xmm11, xmm10
    addsd   xmm3,  xmm1                 ;# FF = Fp + 2*Heps2 + Geps
    addsd   xmm7,  xmm5
    addsd   xmm11, xmm9
    mulsd   xmm1,  xmm12                ;# eps*Fp
    mulsd   xmm5,  xmm13
    mulsd   xmm9,  xmm14
    addsd   xmm1,  xmm0                 ;# VV
    addsd   xmm5,  xmm4
    addsd   xmm9,  xmm8
    mulsd   xmm1,  [rsp + nb331_qqO]    ;# VV*qq = vcoul
    mulsd   xmm5,  [rsp + nb331_qqH]
    mulsd   xmm9,  [rsp + nb331_qqH]
    mulsd   xmm3,  [rsp + nb331_qqO]    ;# FF*qq = fij
    mulsd   xmm7,  [rsp + nb331_qqH]
    mulsd   xmm11, [rsp + nb331_qqH]

    ;# accumulate vctot (O + H1 + H2 contributions)
    addsd   xmm1, [rsp + nb331_vctot]
    addsd   xmm5, xmm9
    addsd   xmm1, xmm5
    movsd   [rsp + nb331_vctot], xmm1

    movapd  xmm2, xmm7
    movapd  xmm1, xmm11

    ;# fij coul in xmm3, xmm2, xmm1

    ;# calculate LJ table (doubles 4..11 of the O table point only)
    movsd   xmm4,  [rsi + r8*8 + 32]
    movsd   xmm5,  [rsi + r8*8 + 40]
    movsd   xmm6,  [rsi + r8*8 + 48]
    movsd   xmm7,  [rsi + r8*8 + 56]
    movsd   xmm8,  [rsi + r8*8 + 64]
    movsd   xmm9,  [rsi + r8*8 + 72]
    movsd   xmm10, [rsi + r8*8 + 80]
    movsd   xmm11, [rsi + r8*8 + 88]
    ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

    ;# xmm12 = epsO

    mulsd   xmm7,  xmm12                ;# Heps
    mulsd   xmm11, xmm12
    mulsd   xmm6,  xmm12                ;# Geps
    mulsd   xmm10, xmm12
    mulsd   xmm7,  xmm12                ;# Heps2
    mulsd   xmm11, xmm12
    ;# NOTE(review): packed add among scalar ops. xmm5 and xmm6 were both
    ;# loaded with movsd from memory (upper lane zeroed) and xmm6 has only
    ;# seen mulsd since, so the low-lane result matches addsd here.
    addpd   xmm5,  xmm6                 ;# F+Geps
    addsd   xmm9,  xmm10
    addsd   xmm5,  xmm7                 ;# F+Geps+Heps2 = Fp
    addsd   xmm9,  xmm11
    addsd   xmm7,  xmm7                 ;# 2*Heps2
    addsd   xmm11, xmm11
    addsd   xmm7,  xmm6                 ;# 2*Heps2+Geps
    addsd   xmm11, xmm10

    addsd   xmm7,  xmm5                 ;# FF = Fp + 2*Heps2 + Geps
    addsd   xmm11, xmm9
    mulsd   xmm5,  xmm12                ;# eps*Fp
    mulsd   xmm9,  xmm12
    movapd  xmm12, [rsp + nb331_c6]
    movapd  xmm13, [rsp + nb331_c12]
    addsd   xmm5,  xmm4                 ;# VV
    addsd   xmm9,  xmm8

    mulsd   xmm5,  xmm12                ;# VV*c6 = vnb6
    mulsd   xmm9,  xmm13                ;# VV*c12 = vnb12
    addsd   xmm5,  xmm9
    addsd   xmm5,  [rsp + nb331_Vvdwtot]
    movsd   [rsp + nb331_Vvdwtot], xmm5

    mulsd   xmm7,  xmm12                ;# FF*c6 = fnb6
    mulsd   xmm11, xmm13                ;# FF*c12  = fnb12
    addsd   xmm7,  xmm11

    addsd   xmm3,  xmm7                 ;# O: coulomb + LJ table force
    movapd  xmm10, [rsp + nb331_tsc]
    mulsd   xmm3,  xmm10                ;# fscal
    mulsd   xmm2,  xmm10
    mulsd   xmm1,  xmm10

    ;# move j forces to xmm11-xmm13
    mov     rdi, [rbp + nb331_faction]
    movsd   xmm11, [rdi + rax*8]
    movsd   xmm12, [rdi + rax*8 + 8]
    movsd   xmm13, [rdi + rax*8 + 16]

    ;# negate: xmm0/xmm4/xmm8 = 0 - fscal(O/H1/H2)
    xorpd   xmm0, xmm0
    xorpd   xmm4, xmm4
    xorpd   xmm8, xmm8

    subsd   xmm0, xmm3
    subsd   xmm4, xmm2
    subsd   xmm8, xmm1

    mulsd   xmm0, [rsp + nb331_rinvO]
    mulsd   xmm4, [rsp + nb331_rinvH1]
    mulsd   xmm8, [rsp + nb331_rinvH2]

    ;# replicate each scalar so it can be scaled by dx, dy and dz
    movapd  xmm1, xmm0
    movapd  xmm2, xmm0
    movapd  xmm3, xmm4
    movapd  xmm5, xmm4
    movapd  xmm6, xmm8
    movapd  xmm7, xmm8

    mulsd   xmm0, [rsp + nb331_dxO]
    mulsd   xmm1, [rsp + nb331_dyO]
    mulsd   xmm2, [rsp + nb331_dzO]
    mulsd   xmm3, [rsp + nb331_dxH1]
    mulsd   xmm4, [rsp + nb331_dyH1]
    mulsd   xmm5, [rsp + nb331_dzH1]
    mulsd   xmm6, [rsp + nb331_dxH2]
    mulsd   xmm7, [rsp + nb331_dyH2]
    mulsd   xmm8, [rsp + nb331_dzH2]

    ;# each component is added both to the j force (xmm11-xmm13) and to
    ;# the matching i-atom accumulator slot on the stack
    addsd   xmm11, xmm0
    addsd   xmm12, xmm1
    addsd   xmm13, xmm2
    addsd   xmm0,  [rsp + nb331_fixO]
    addsd   xmm1,  [rsp + nb331_fiyO]
    addsd   xmm2,  [rsp + nb331_fizO]

    addsd   xmm11, xmm3
    addsd   xmm12, xmm4
    addsd   xmm13, xmm5
    addsd   xmm3,  [rsp + nb331_fixH1]
    addsd   xmm4,  [rsp + nb331_fiyH1]
    addsd   xmm5,  [rsp + nb331_fizH1]

    addsd   xmm11, xmm6
    addsd   xmm12, xmm7
    addsd   xmm13, xmm8
    addsd   xmm6,  [rsp + nb331_fixH2]
    addsd   xmm7,  [rsp + nb331_fiyH2]
    addsd   xmm8,  [rsp + nb331_fizH2]

    movsd   [rsp + nb331_fixO],  xmm0
    movsd   [rsp + nb331_fiyO],  xmm1
    movsd   [rsp + nb331_fizO],  xmm2
    movsd   [rsp + nb331_fixH1], xmm3
    movsd   [rsp + nb331_fiyH1], xmm4
    movsd   [rsp + nb331_fizH1], xmm5
    movsd   [rsp + nb331_fixH2], xmm6
    movsd   [rsp + nb331_fiyH2], xmm7
    movsd   [rsp + nb331_fizH2], xmm8

    ;# store back j forces from xmm11-xmm13
    movsd   [rdi + rax*8],      xmm11
    movsd   [rdi + rax*8 + 8],  xmm12
    movsd   [rdi + rax*8 + 16], xmm13

.nb331_updateouterdata:
    ;# Fold the i-atom accumulators into faction[] and fshift[], and the
    ;# energy accumulators into Vc[]/Vvdw[], then decide whether to loop.
    mov     ecx, [rsp + nb331_ii3]
    mov     rdi, [rbp + nb331_faction]
    mov     rsi, [rbp + nb331_fshift]
    mov     edx, [rsp + nb331_is3]

    ;# accumulate  Oi forces in xmm0, xmm1, xmm2
    movapd  xmm0, [rsp + nb331_fixO]
    movapd  xmm1, [rsp + nb331_fiyO]
    movapd  xmm2, [rsp + nb331_fizO]

    ;# horizontal add: high lane + low lane
    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd   xmm0, xmm3
    addsd   xmm1, xmm4
    addsd   xmm2, xmm5                  ;# sum is in low xmm0-xmm2

    ;# NOTE(review): these three copies are overwritten by the movsd
    ;# loads from memory just below (which replace the whole register).
    movapd  xmm3, xmm0
    movapd  xmm4, xmm1
    movapd  xmm5, xmm2

    ;# increment i force (memory value minus accumulated sum)
    movsd   xmm3, [rdi + rcx*8]
    movsd   xmm4, [rdi + rcx*8 + 8]
    movsd   xmm5, [rdi + rcx*8 + 16]
    subsd   xmm3, xmm0
    subsd   xmm4, xmm1
    subsd   xmm5, xmm2
    movsd   [rdi + rcx*8],      xmm3
    movsd   [rdi + rcx*8 + 8],  xmm4
    movsd   [rdi + rcx*8 + 16], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    ;# (xmm6 = {x, y} packed, low xmm7 = z)
    movapd  xmm6, xmm0
    movsd   xmm7, xmm2
    unpcklpd xmm6, xmm1

    ;# accumulate H1i forces in xmm0, xmm1, xmm2
    movapd  xmm0, [rsp + nb331_fixH1]
    movapd  xmm1, [rsp + nb331_fiyH1]
    movapd  xmm2, [rsp + nb331_fizH1]

    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd   xmm0, xmm3
    addsd   xmm1, xmm4
    addsd   xmm2, xmm5                  ;# sum is in low xmm0-xmm2

    ;# increment i force
    movsd   xmm3, [rdi + rcx*8 + 24]
    movsd   xmm4, [rdi + rcx*8 + 32]
    movsd   xmm5, [rdi + rcx*8 + 40]
    subsd   xmm3, xmm0
    subsd   xmm4, xmm1
    subsd   xmm5, xmm2
    movsd   [rdi + rcx*8 + 24], xmm3
    movsd   [rdi + rcx*8 + 32], xmm4
    movsd   [rdi + rcx*8 + 40], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    addsd   xmm7, xmm2
    unpcklpd xmm0, xmm1
    addpd   xmm6, xmm0

    ;# accumulate H2i forces in xmm0, xmm1, xmm2
    movapd  xmm0, [rsp + nb331_fixH2]
    movapd  xmm1, [rsp + nb331_fiyH2]
    movapd  xmm2, [rsp + nb331_fizH2]

    movhlps xmm3, xmm0
    movhlps xmm4, xmm1
    movhlps xmm5, xmm2
    addsd   xmm0, xmm3
    addsd   xmm1, xmm4
    addsd   xmm2, xmm5                  ;# sum is in low xmm0-xmm2

    ;# NOTE(review): as for the O atom, these copies are overwritten by
    ;# the movsd loads that follow.
    movapd  xmm3, xmm0
    movapd  xmm4, xmm1
    movapd  xmm5, xmm2

    ;# increment i force
    movsd   xmm3, [rdi + rcx*8 + 48]
    movsd   xmm4, [rdi + rcx*8 + 56]
    movsd   xmm5, [rdi + rcx*8 + 64]
    subsd   xmm3, xmm0
    subsd   xmm4, xmm1
    subsd   xmm5, xmm2
    movsd   [rdi + rcx*8 + 48], xmm3
    movsd   [rdi + rcx*8 + 56], xmm4
    movsd   [rdi + rcx*8 + 64], xmm5

    ;# accumulate force in xmm6/xmm7 for fshift
    addsd   xmm7, xmm2
    unpcklpd xmm0, xmm1
    addpd   xmm6, xmm0

    ;# increment fshift force (x,y packed in xmm3, z in xmm4)
    movlpd  xmm3, [rsi + rdx*8]
    movhpd  xmm3, [rsi + rdx*8 + 8]
    movsd   xmm4, [rsi + rdx*8 + 16]
    subpd   xmm3, xmm6
    subsd   xmm4, xmm7
    movlpd  [rsi + rdx*8],      xmm3
    movhpd  [rsi + rdx*8 + 8],  xmm3
    movsd   [rsi + rdx*8 + 16], xmm4

    ;# get n from stack
    mov     esi, [rsp + nb331_n]
    ;# get group index for i particle
    mov     rdx, [rbp + nb331_gid]      ;# base of gid[]
    mov     edx, [rdx + rsi*4]          ;# ggid=gid[n]

    ;# accumulate total potential energy and update it
    movapd  xmm7, [rsp + nb331_vctot]
    ;# accumulate
    movhlps xmm6, xmm7
    addsd   xmm7, xmm6                  ;# low xmm7 has the sum now

    ;# add earlier value from mem
    mov     rax, [rbp + nb331_Vc]
    addsd   xmm7, [rax + rdx*8]
    ;# move back to mem
    movsd   [rax + rdx*8], xmm7

    ;# accumulate total lj energy and update it
    movapd  xmm7, [rsp + nb331_Vvdwtot]
    ;# accumulate
    movhlps xmm6, xmm7
    addsd   xmm7, xmm6                  ;# low xmm7 has the sum now

    ;# add earlier value from mem
    mov     rax, [rbp + nb331_Vvdw]
    addsd   xmm7, [rax + rdx*8]
    ;# move back to mem
    movsd   [rax + rdx*8], xmm7

    ;# finish if last
    mov     ecx, [rsp + nb331_nn1]
    ;# esi already loaded with n
    inc     esi
    sub     ecx, esi                    ;# zero when n+1 == nn1
    jz      .nb331_outerend

    ;# not last, iterate outer loop once more!
    mov     [rsp + nb331_n], esi
    jmp     .nb331_outer                ;# target is outside this view

.nb331_outerend:
    ;# check if more outer neighborlists remain
    mov     ecx, [rsp + nb331_nri]
    ;# esi already loaded with n above
    sub     ecx, esi
    jz      .nb331_end
    ;# non-zero, do one more workunit
    jmp     .nb331_threadloop           ;# target is outside this view

.nb331_end:
    ;# report the iteration counters through the caller's pointers
    mov     eax, [rsp + nb331_nouter]
    mov     ebx, [rsp + nb331_ninner]
    mov     rcx, [rbp + nb331_outeriter]
    mov     rdx, [rbp + nb331_inneriter]
    mov     [rcx], eax
    mov     [rdx], ebx

    ;# release the local area; presumably mirrors a "sub rsp, 872" and
    ;# the matching pushes in the prologue, which is not visible here
    add     rsp, 872
    ;# NOTE(review): no MMX instructions appear in this fragment; emms
    ;# is kept exactly as in the original.
    emms

    pop     r15
    pop     r14
    pop     r13
    pop     r12

    pop     rbx
    pop     rbp
    ret


.globl nb_kernel331nf_x86_64_sse2
.globl _nb_kernel331nf_x86_64_sse2
nb_kernel331nf_x86_64_sse2:
_nb_kernel331nf_x86_64_sse2:
;# Argument offsets (used rbp-relative in the nb331 routine above).
;# Room for return address and rbp (16 bytes)
.equiv          nb331nf_fshift,         16
.equiv          nb331nf_gid,            24
.equiv          nb331nf_pos,            32
.equiv          nb331nf_faction,        40
.equiv          nb331nf_charge,         48
.equiv          nb331nf_p_facel,        56
.equiv          nb331nf_argkrf,         64
.equiv          nb331nf_argcrf,         72
.equiv          nb331nf_Vc,             80
.equiv          nb331nf_type,           88
.equiv          nb331nf_p_ntype,        96
.equiv          nb331nf_vdwparam,       104
.equiv          nb331nf_Vvdw,           112
.equiv          nb331nf_p_tabscale,     120
.equiv          nb331nf_VFtab,          128
.equiv          nb331nf_invsqrta,       136
.equiv          nb331nf_dvda,           144
.equiv          nb331nf_p_gbtabscale,   152
.equiv          nb331nf_GBtab,          160
.equiv          nb331nf_p_nthreads,     168
.equiv          nb331nf_count,          176
.equiv          nb331nf_mtx,            184
.equiv          nb331nf_outeriter,      192
.equiv          nb331nf_inneriter,      200
.equiv          nb331nf_work,           208
    ;# stack offsets for local variables
    ;# bottom of stack is cache-aligned for sse use
    ;# (the list continues beyond this view)
.equiv          nb331nf_ixO,            0
.equiv          nb331nf_iyO,            16
.equiv          nb331nf_izO,            32
.equiv          nb331nf_ixH1,           48
.equiv          nb331nf_iyH1,           64
.equiv          nb331nf_izH1,           80
.equiv          nb331nf_ixH2,           96
.equiv          nb331nf_iyH2,           112
.equiv          nb331nf_izH2,           128
.equiv          nb331nf_iqO,            144
.equiv          nb331nf_iqH,            160
.equiv          nb331nf_qqO,            176
.equiv          nb331nf_qqH,            192

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?