nb_kernel211_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,949 行 · 第 1/4 页

S
1,949
字号
	;# NOTE(review): fragment of the nb211 kernel body (labels and stack slots carry
	;# the nb211 prefix); the enclosing routine starts before and continues after
	;# this span. The text appears to have lost its original line breaks: several
	;# instructions and ';#' comments share each physical line, so restore one
	;# instruction per line before assembling.
	;# Visible flow:
	;#  - packed part (two j entries, indexed by rax and rbx): a refinement step
	;#    lu*(3-rsq*lu*lu)*half for the O/H1/H2 rinv values, vcoul=qq*(rinv+krsq-crf)
	;#    summed into vctot, c12*rinv12-c6*rinv6 (O term only) summed into Vvdwtot,
	;#    then the fix*/fiy*/fiz* accumulators and the faction entries are updated.
	;#  - innerk is decremented by 2 per pass; .nb211_checksingle tests bit 0 of
	;#    innerk to choose between .nb211_dosingle and .nb211_updateouterdata.
	;#  - .nb211_dosingle: scalar (sd) form of the same O/H1/H2 computation for one
	;#    j entry; its j-force write-back continues past the end of this span.
	movapd  xmm2, xmm9	movapd  xmm5, xmm10    movapd  xmm8, xmm11    	mulpd   xmm2, xmm2 ;# lu*lu	mulpd   xmm5, xmm5 ;# lu*lu    mulpd   xmm8, xmm8 ;# lu*lu			movapd  xmm1, [rsp + nb211_three]	movapd  xmm4, xmm1    movapd  xmm7, xmm1	mulpd   xmm2, xmm0 ;# rsq*lu*lu	mulpd   xmm5, xmm3 ;# rsq*lu*lu     mulpd   xmm8, xmm6 ;# rsq*lu*lu		subpd   xmm1, xmm2	subpd   xmm4, xmm5    subpd   xmm7, xmm8 ;# 3-rsq*lu*lu	mulpd   xmm9, xmm1	mulpd   xmm10, xmm4    mulpd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb211_half]	mulpd   xmm9, xmm15  ;#  rinvO	mulpd   xmm10, xmm15 ;#   rinvH1    mulpd   xmm11, xmm15 ;#   rinvH2		;# interactions     ;# rsq in xmm0,xmm3,xmm6      ;# rinv in xmm9, xmm10, xmm11    movapd xmm1, xmm9 ;# copy of rinv    movapd xmm4, xmm10    movapd xmm7, xmm11    movapd xmm2, [rsp + nb211_krf]        mulpd  xmm9, xmm9   ;# rinvsq    mulpd  xmm10, xmm10    mulpd  xmm11, xmm11    mulpd  xmm0, xmm2  ;# k*rsq    mulpd  xmm3, xmm2    mulpd  xmm6, xmm2    movapd xmm2, xmm0 ;# copy of k*rsq    movapd xmm5, xmm3    movapd xmm8, xmm6    addpd  xmm2, xmm1  ;# rinv+krsq    addpd  xmm5, xmm4    addpd  xmm8, xmm7    movapd xmm12, xmm9    mulpd  xmm12, xmm12 ;# rinv4    mulpd  xmm12, xmm9  ;# rinv6    subpd  xmm2, [rsp + nb211_crf]   ;# rinv+krsq-crf    subpd  xmm5, [rsp + nb211_crf]    subpd  xmm8, [rsp + nb211_crf]       mulpd  xmm2, [rsp + nb211_qqO] ;# vcoul=qq*(rinv+ krsq-crf)    mulpd  xmm5, [rsp + nb211_qqH] ;# vcoul=qq*(rinv+ krsq-crf)    mulpd  xmm8, [rsp + nb211_qqH] ;# vcoul=qq*(rinv+ krsq-crf)    addpd  xmm0, xmm0 ;# 2*krsq    addpd  xmm3, xmm3     addpd  xmm6, xmm6     subpd  xmm1, xmm0 ;# rinv-2*krsq    subpd  xmm4, xmm3    subpd  xmm7, xmm6    movapd xmm13, xmm12 ;# rinv6    mulpd xmm12, xmm12 ;# rinv12	mulpd  xmm13, [rsp + nb211_c6]	mulpd  xmm12, [rsp + nb211_c12]    movapd xmm14, xmm12    subpd  xmm14, xmm13    mulpd  xmm1, [rsp + nb211_qqO]   ;# (rinv-2*krsq)*qq    mulpd  xmm4, [rsp + nb211_qqH]     mulpd  xmm7, [rsp + nb211_qqH]     addpd  xmm2, 
[rsp + nb211_vctot]    addpd  xmm5, xmm8    addpd  xmm2, xmm5    movapd [rsp + nb211_vctot], xmm2    	addpd  xmm14, [rsp + nb211_Vvdwtot]	mulpd  xmm13, [rsp + nb211_six]	mulpd  xmm12, [rsp + nb211_twelve]	movapd [rsp + nb211_Vvdwtot], xmm14    subpd  xmm12, xmm13 ;# LJ fscal            addpd xmm1, xmm12        mulpd  xmm9, xmm1   ;# fscal    mulpd  xmm10, xmm4    mulpd  xmm11, xmm7    ;# move j forces to xmm0-xmm2    mov rdi, [rbp + nb211_faction]	movlpd xmm0, [rdi + rax*8]	movlpd xmm1, [rdi + rax*8 + 8]	movlpd xmm2, [rdi + rax*8 + 16]	movhpd xmm0, [rdi + rbx*8]	movhpd xmm1, [rdi + rbx*8 + 8]	movhpd xmm2, [rdi + rbx*8 + 16]    movapd xmm7, xmm9    movapd xmm8, xmm9    movapd xmm13, xmm11    movapd xmm14, xmm11    movapd xmm15, xmm11    movapd xmm11, xmm10    movapd xmm12, xmm10	mulpd xmm7, [rsp + nb211_dxO]	mulpd xmm8, [rsp + nb211_dyO]	mulpd xmm9, [rsp + nb211_dzO]	mulpd xmm10, [rsp + nb211_dxH1]	mulpd xmm11, [rsp + nb211_dyH1]	mulpd xmm12, [rsp + nb211_dzH1]	mulpd xmm13, [rsp + nb211_dxH2]	mulpd xmm14, [rsp + nb211_dyH2]	mulpd xmm15, [rsp + nb211_dzH2]    addpd xmm0, xmm7    addpd xmm1, xmm8    addpd xmm2, xmm9    addpd xmm7, [rsp + nb211_fixO]    addpd xmm8, [rsp + nb211_fiyO]    addpd xmm9, [rsp + nb211_fizO]    addpd xmm0, xmm10    addpd xmm1, xmm11    addpd xmm2, xmm12    addpd xmm10, [rsp + nb211_fixH1]    addpd xmm11, [rsp + nb211_fiyH1]    addpd xmm12, [rsp + nb211_fizH1]    addpd xmm0, xmm13    addpd xmm1, xmm14    addpd xmm2, xmm15    addpd xmm13, [rsp + nb211_fixH2]    addpd xmm14, [rsp + nb211_fiyH2]    addpd xmm15, [rsp + nb211_fizH2]    movapd [rsp + nb211_fixO], xmm7    movapd [rsp + nb211_fiyO], xmm8    movapd [rsp + nb211_fizO], xmm9    movapd [rsp + nb211_fixH1], xmm10    movapd [rsp + nb211_fiyH1], xmm11    movapd [rsp + nb211_fizH1], xmm12    movapd [rsp + nb211_fixH2], xmm13    movapd [rsp + nb211_fiyH2], xmm14    movapd [rsp + nb211_fizH2], xmm15       ;# store back j forces from xmm0-xmm2	movlpd [rdi + rax*8],      xmm0	movlpd [rdi + rax*8 + 
8],  xmm1	movlpd [rdi + rax*8 + 16], xmm2	movhpd [rdi + rbx*8],      xmm0	movhpd [rdi + rbx*8 + 8],  xmm1	movhpd [rdi + rbx*8 + 16], xmm2	;# should we do one more iteration? 	sub dword ptr [rsp + nb211_innerk],  2	jl    .nb211_checksingle	jmp   .nb211_unroll_loop.nb211_checksingle:		mov   edx, [rsp + nb211_innerk]	and   edx, 1	jnz   .nb211_dosingle	jmp   .nb211_updateouterdata.nb211_dosingle:	mov   rdx, [rsp + nb211_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [rdx]		add qword ptr [rsp + nb211_innerjjnr],  4		mov rsi, [rbp + nb211_charge]    ;# base of charge[] 	xorpd xmm3, xmm3	movlpd xmm3, [rsi + rax*8]	movapd xmm4, xmm3	mulpd  xmm3, [rsp + nb211_iqO]	mulpd  xmm4, [rsp + nb211_iqH]	movapd  [rsp + nb211_qqO], xmm3	movapd  [rsp + nb211_qqH], xmm4		mov rsi, [rbp + nb211_type]	mov r8d, [rsi + rax*4]	mov rsi, [rbp + nb211_vdwparam]	shl r8d, 1		mov edi, [rsp + nb211_ntia]	add r8d, edi	movlpd xmm6, [rsi + r8*8]	;# c6a	movhpd xmm6, [rsi + r8*8 + 8]	;# c6a c12a 		xorpd xmm7, xmm7	movapd xmm4, xmm6	unpcklpd xmm4, xmm7	unpckhpd xmm6, xmm7		movapd [rsp + nb211_c6], xmm4	movapd [rsp + nb211_c12], xmm6		mov rsi, [rbp + nb211_pos]       ;# base of pos[] 	lea   rax, [rax + rax*2]     ;# replace jnr with j3 	;# move coordinates to xmm0-xmm2 		movlpd xmm4, [rsi + rax*8]	movlpd xmm5, [rsi + rax*8 + 8]	movlpd xmm6, [rsi + rax*8 + 16]    movapd xmm0, xmm4    movapd xmm1, xmm5    movapd xmm2, xmm6	;# calc dr 	subsd xmm4, [rsp + nb211_ixO]	subsd xmm5, [rsp + nb211_iyO]	subsd xmm6, [rsp + nb211_izO]	;# store dr 	movapd [rsp + nb211_dxO], xmm4	movapd [rsp + nb211_dyO], xmm5	movapd [rsp + nb211_dzO], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm4, xmm5	addsd xmm4, xmm6	movapd xmm7, xmm4	;# rsqO in xmm7 	;# move j coords to xmm4-xmm6 	movapd xmm4, xmm0	movapd xmm5, xmm1	movapd xmm6, xmm2	;# calc dr 	subsd xmm4, [rsp + nb211_ixH1]	subsd xmm5, [rsp + nb211_iyH1]	subsd xmm6, [rsp + nb211_izH1]	;# store dr 	movapd [rsp + nb211_dxH1], xmm4	movapd [rsp + 
nb211_dyH1], xmm5	movapd [rsp + nb211_dzH1], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm6, xmm5	addsd xmm6, xmm4	;# rsqH1 in xmm6 	;# move j coords to xmm3-xmm5	movapd xmm3, xmm0	movapd xmm4, xmm1	movapd xmm5, xmm2	;# calc dr 	subsd xmm3, [rsp + nb211_ixH2]	subsd xmm4, [rsp + nb211_iyH2]	subsd xmm5, [rsp + nb211_izH2]	;# store dr 	movapd [rsp + nb211_dxH2], xmm3	movapd [rsp + nb211_dyH2], xmm4	movapd [rsp + nb211_dzH2], xmm5	;# square it 	mulsd xmm3,xmm3	mulsd xmm4,xmm4	mulsd xmm5,xmm5	addsd xmm5, xmm4	addsd xmm5, xmm3	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 	movapd xmm0, xmm5	movapd xmm1, xmm6	movapd xmm2, xmm7	mulsd  xmm0, [rsp + nb211_krf]		mulsd  xmm1, [rsp + nb211_krf]		mulsd  xmm2, [rsp + nb211_krf]		movapd [rsp + nb211_krsqH2], xmm0	movapd [rsp + nb211_krsqH1], xmm1	movapd [rsp + nb211_krsqO], xmm2	;# start with rsqO - put seed in xmm2 	cvtsd2ss xmm2, xmm7		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb211_three]	mulsd   xmm2, xmm7	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 3-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb211_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm7, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb211_three]	subsd xmm4, xmm7	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb211_half] ;# rinv 	movapd  xmm7, xmm4	;# rinvO in xmm7 		;# rsqH1 - seed in xmm2 	cvtsd2ss xmm2, xmm6		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb211_three]	mulsd   xmm2, xmm6	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 3-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb211_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm6, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb211_three]	subsd xmm4, xmm6	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb211_half] ;# rinv 	movapd  
xmm6, xmm4	;# rinvH1 in xmm6 		;# rsqH2 - seed in xmm2 	cvtsd2ss xmm2, xmm5		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb211_three]	mulsd   xmm2, xmm5	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 3-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb211_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm5, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb211_three]	subsd xmm4, xmm5	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb211_half] ;# rinv 	movapd  xmm5, xmm4	;# rinvH2 in xmm5 	;# do O interactions 	movapd  xmm4, xmm7		mulsd   xmm4, xmm4	;# xmm7=rinv, xmm4=rinvsq 	movapd xmm1, xmm4	mulsd  xmm1, xmm4	mulsd  xmm1, xmm4	;# xmm1=rinvsix 	movapd xmm2, xmm1	mulsd  xmm2, xmm2	;# xmm2=rinvtwelve 	mulsd  xmm1, [rsp + nb211_c6]	mulsd  xmm2, [rsp + nb211_c12]	movapd xmm3, xmm2	subsd  xmm3, xmm1	;# Vvdw=Vvdw12-Vvdw6 			addsd  xmm3, [rsp + nb211_Vvdwtot]	mulsd  xmm1, [rsp + nb211_six]	mulsd  xmm2, [rsp + nb211_twelve]	subsd  xmm2, xmm1	;# nb part of fs  	movapd xmm0, xmm7	movapd xmm1, [rsp + nb211_krsqO]	addsd  xmm0, xmm1	mulsd  xmm1, [rsp + nb211_two]	subsd  xmm0, [rsp + nb211_crf] ;# xmm0=rinv+ krsq-crf 	subsd  xmm7, xmm1	mulsd  xmm0, [rsp + nb211_qqO]	mulsd  xmm7, [rsp + nb211_qqO]	addsd  xmm2, xmm7	mulsd  xmm4, xmm2	;# total fsO in xmm4 	addsd  xmm0, [rsp + nb211_vctot]	movlpd [rsp + nb211_Vvdwtot], xmm3	movlpd [rsp + nb211_vctot], xmm0	movapd xmm0, [rsp + nb211_dxO]	movapd xmm1, [rsp + nb211_dyO]	movapd xmm2, [rsp + nb211_dzO]	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update O forces 	movapd xmm3, [rsp + nb211_fixO]	movapd xmm4, [rsp + nb211_fiyO]	movapd xmm7, [rsp + nb211_fizO]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb211_fixO], xmm3	movlpd [rsp + nb211_fiyO], xmm4	movlpd [rsp + nb211_fizO], xmm7	;# update j forces with water O 	movlpd [rsp + nb211_fjx], xmm0	movlpd [rsp + nb211_fjy], xmm1	movlpd [rsp + 
nb211_fjz], xmm2	;# H1 interactions 	movapd  xmm4, xmm6		mulsd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm7, xmm6	movapd  xmm0, [rsp + nb211_krsqH1]	addsd   xmm6, xmm0	;# xmm6=rinv+ krsq 	mulsd   xmm0, [rsp + nb211_two]	subsd   xmm6, [rsp + nb211_crf]	subsd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm6, [rsp + nb211_qqH] ;# vcoul 	mulsd   xmm7, [rsp + nb211_qqH]	mulsd  xmm4, xmm7		;# total fsH1 in xmm4 		addsd  xmm6, [rsp + nb211_vctot]	movapd xmm0, [rsp + nb211_dxH1]	movapd xmm1, [rsp + nb211_dyH1]	movapd xmm2, [rsp + nb211_dzH1]	movlpd [rsp + nb211_vctot], xmm6	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H1 forces 	movapd xmm3, [rsp + nb211_fixH1]	movapd xmm4, [rsp + nb211_fiyH1]	movapd xmm7, [rsp + nb211_fizH1]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb211_fixH1], xmm3	movlpd [rsp + nb211_fiyH1], xmm4	movlpd [rsp + nb211_fizH1], xmm7	;# update j forces with water H1 	addsd  xmm0, [rsp + nb211_fjx]	addsd  xmm1, [rsp + nb211_fjy]	addsd  xmm2, [rsp + nb211_fjz]	movlpd [rsp + nb211_fjx], xmm0	movlpd [rsp + nb211_fjy], xmm1	movlpd [rsp + nb211_fjz], xmm2	;# H2 interactions 	movapd  xmm4, xmm5		mulsd   xmm4, xmm4	;# xmm5=rinv, xmm4=rinvsq 	movapd  xmm7, xmm5	movapd  xmm0, [rsp + nb211_krsqH2]	addsd   xmm5, xmm0	;# xmm5=rinv+ krsq 	mulsd   xmm0, [rsp + nb211_two]	subsd   xmm5, [rsp + nb211_crf]	subsd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm5, [rsp + nb211_qqH] ;# vcoul 	mulsd   xmm7, [rsp + nb211_qqH]	mulsd  xmm4, xmm7		;# total fsH2 in xmm4 		addsd  xmm5, [rsp + nb211_vctot]	movapd xmm0, [rsp + nb211_dxH2]	movapd xmm1, [rsp + nb211_dyH2]	movapd xmm2, [rsp + nb211_dzH2]	movlpd [rsp + nb211_vctot], xmm5	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H2 forces 	movapd xmm3, [rsp + nb211_fixH2]	movapd xmm4, [rsp + nb211_fiyH2]	movapd xmm7, [rsp + nb211_fizH2]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb211_fixH2], xmm3	movlpd [rsp + nb211_fiyH2], xmm4	movlpd [rsp + 
nb211_fizH2], xmm7	mov rdi, [rbp + nb211_faction]	;# update j forces 	addsd  xmm0, [rsp + nb211_fjx]	addsd  xmm1, [rsp + nb211_fjy]	addsd  xmm2, [rsp + nb211_fjz]	movlpd xmm3, [rdi + rax*8]	movlpd xmm4, [rdi + rax*8 + 8]	movlpd xmm5, [rdi + rax*8 + 16]

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?