nb_kernel111_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,816 行 · 第 1/4 页

S
1,816
字号
	;# ---------------------------------------------------------------
	;# Tail of the unrolled inner loop (two j atoms per pass: one in the
	;# low lane, addressed through rax, one in the high lane, through rbx).
	;#
	;# NOTE(review): this excerpt starts mid-loop. Going by the inline
	;# comments, on entry xmm0/xmm3/xmm6 hold rsq for O/H1/H2,
	;# xmm2/xmm5/xmm8 the rsqrt seed lu, xmm1/xmm4/xmm7 lu*lu, and
	;# xmm9-xmm11 the constant three. rax/rbx are the scaled j offsets.
	;# All of that is set up above the visible code - confirm there.
	;#
	;# Each refinement step computes lu' = half * lu * (three - rsq*lu*lu)
	;# (nb111_three / nb111_half presumably hold 3.0 / 0.5).
	;# ---------------------------------------------------------------

	;# first iteration step (finish)
	mulpd   xmm1, xmm0              ;# rsq*lu*lu
	mulpd   xmm4, xmm3              ;# rsq*lu*lu
	mulpd   xmm7, xmm6              ;# rsq*lu*lu

	subpd   xmm9, xmm1
	subpd   xmm10, xmm4
	subpd   xmm11, xmm7             ;# 3-rsq*lu*lu
	mulpd   xmm9, xmm2
	mulpd   xmm10, xmm5
	mulpd   xmm11, xmm8             ;# lu*(3-rsq*lu*lu)
	movapd  xmm15, [rsp + nb111_half]
	mulpd   xmm9, xmm15             ;# first iteration for rinvO
	mulpd   xmm10, xmm15            ;# first iteration for rinvH1
	mulpd   xmm11, xmm15            ;# first iteration for rinvH2

	;# second iteration step
	movapd  xmm2, xmm9
	movapd  xmm5, xmm10
	movapd  xmm8, xmm11

	mulpd   xmm2, xmm2              ;# lu*lu
	mulpd   xmm5, xmm5              ;# lu*lu
	mulpd   xmm8, xmm8              ;# lu*lu

	movapd  xmm1, [rsp + nb111_three]
	movapd  xmm4, xmm1
	movapd  xmm7, xmm1

	mulpd   xmm2, xmm0              ;# rsq*lu*lu
	mulpd   xmm5, xmm3              ;# rsq*lu*lu
	mulpd   xmm8, xmm6              ;# rsq*lu*lu

	subpd   xmm1, xmm2
	subpd   xmm4, xmm5
	subpd   xmm7, xmm8              ;# 3-rsq*lu*lu
	mulpd   xmm9, xmm1
	mulpd   xmm10, xmm4
	mulpd   xmm11, xmm7             ;# lu*(3-rsq*lu*lu)
	movapd  xmm15, [rsp + nb111_half]
	mulpd   xmm9, xmm15             ;# rinvO
	mulpd   xmm10, xmm15            ;# rinvH1
	mulpd   xmm11, xmm15            ;# rinvH2

	;# interactions
	;# keep rinv in xmm0-xmm2 (becomes vcoul), square xmm9-xmm11 in place
	movapd  xmm0, xmm9
	movapd  xmm1, xmm10
	movapd  xmm2, xmm11
	mulpd   xmm9, xmm9              ;# rinvsq
	mulpd   xmm10, xmm10
	mulpd   xmm11, xmm11
	movapd  xmm12, xmm9
	mulpd   xmm12, xmm12            ;# rinv4
	mulpd   xmm12, xmm9             ;# rinv6
	mulpd   xmm0, [rsp + nb111_qqO] ;# vcoul O  = rinvO  * qqO
	mulpd   xmm1, [rsp + nb111_qqH] ;# vcoul H1 = rinvH1 * qqH
	mulpd   xmm2, [rsp + nb111_qqH] ;# vcoul H2 = rinvH2 * qqH
	movapd  xmm13, xmm12            ;# rinv6
	mulpd   xmm12, xmm12            ;# rinv12
	mulpd   xmm13, [rsp + nb111_c6] ;# Vvdw6  = c6  * rinv6
	mulpd   xmm12, [rsp + nb111_c12] ;# Vvdw12 = c12 * rinv12
	movapd  xmm14, xmm12
	subpd   xmm14, xmm13            ;# Vvdw = Vvdw12 - Vvdw6

	addpd   xmm14, [rsp + nb111_Vvdwtot]
	mulpd   xmm13, [rsp + nb111_six]
	mulpd   xmm12, [rsp + nb111_twelve]
	movapd  [rsp + nb111_Vvdwtot], xmm14
	subpd   xmm12, xmm13            ;# LJ fscal

	addpd   xmm12, xmm0             ;# LJ term + vcoul O (only O carries LJ here)

	mulpd   xmm9, xmm12             ;# fsO  = rinvsqO  * (LJ term + vcoul O)
	mulpd   xmm10, xmm1             ;# fsH1 = rinvsqH1 * vcoul H1
	mulpd   xmm11, xmm2             ;# fsH2 = rinvsqH2 * vcoul H2

	;# accumulate the three Coulomb energies into vctot
	addpd   xmm0, [rsp + nb111_vctot]
	addpd   xmm1, xmm2
	addpd   xmm0, xmm1
	movapd  [rsp + nb111_vctot], xmm0

	;# move j forces to xmm0-xmm2
	;# (low lane from the rax atom, high lane from the rbx atom)
	mov     rdi, [rbp + nb111_faction]
	movlpd  xmm0, [rdi + rax*8]
	movlpd  xmm1, [rdi + rax*8 + 8]
	movlpd  xmm2, [rdi + rax*8 + 16]
	movhpd  xmm0, [rdi + rbx*8]
	movhpd  xmm1, [rdi + rbx*8 + 8]
	movhpd  xmm2, [rdi + rbx*8 + 16]

	;# replicate each scalar force so it can be multiplied by dx, dy, dz:
	;# xmm7-xmm9 = fsO, xmm10-xmm12 = fsH1, xmm13-xmm15 = fsH2.
	;# xmm11 is copied out to xmm13-xmm15 before it is overwritten with fsH1.
	movapd  xmm7, xmm9
	movapd  xmm8, xmm9
	movapd  xmm13, xmm11
	movapd  xmm14, xmm11
	movapd  xmm15, xmm11
	movapd  xmm11, xmm10
	movapd  xmm12, xmm10

	mulpd   xmm7, [rsp + nb111_dxO]
	mulpd   xmm8, [rsp + nb111_dyO]
	mulpd   xmm9, [rsp + nb111_dzO]
	mulpd   xmm10, [rsp + nb111_dxH1]
	mulpd   xmm11, [rsp + nb111_dyH1]
	mulpd   xmm12, [rsp + nb111_dzH1]
	mulpd   xmm13, [rsp + nb111_dxH2]
	mulpd   xmm14, [rsp + nb111_dyH2]
	mulpd   xmm15, [rsp + nb111_dzH2]

	;# each force vector is added to the j force first, then the same
	;# register is reused to accumulate the matching i-atom force
	addpd   xmm0, xmm7
	addpd   xmm1, xmm8
	addpd   xmm2, xmm9
	addpd   xmm7, [rsp + nb111_fixO]
	addpd   xmm8, [rsp + nb111_fiyO]
	addpd   xmm9, [rsp + nb111_fizO]

	addpd   xmm0, xmm10
	addpd   xmm1, xmm11
	addpd   xmm2, xmm12
	addpd   xmm10, [rsp + nb111_fixH1]
	addpd   xmm11, [rsp + nb111_fiyH1]
	addpd   xmm12, [rsp + nb111_fizH1]

	addpd   xmm0, xmm13
	addpd   xmm1, xmm14
	addpd   xmm2, xmm15
	addpd   xmm13, [rsp + nb111_fixH2]
	addpd   xmm14, [rsp + nb111_fiyH2]
	addpd   xmm15, [rsp + nb111_fizH2]

	movapd  [rsp + nb111_fixO], xmm7
	movapd  [rsp + nb111_fiyO], xmm8
	movapd  [rsp + nb111_fizO], xmm9
	movapd  [rsp + nb111_fixH1], xmm10
	movapd  [rsp + nb111_fiyH1], xmm11
	movapd  [rsp + nb111_fizH1], xmm12
	movapd  [rsp + nb111_fixH2], xmm13
	movapd  [rsp + nb111_fiyH2], xmm14
	movapd  [rsp + nb111_fizH2], xmm15

	;# store back j forces from xmm0-xmm2
	movlpd  [rdi + rax*8],      xmm0
	movlpd  [rdi + rax*8 + 8],  xmm1
	movlpd  [rdi + rax*8 + 16], xmm2
	movhpd  [rdi + rbx*8],      xmm0
	movhpd  [rdi + rbx*8 + 8],  xmm1
	movhpd  [rdi + rbx*8 + 16], xmm2

	;# should we do one more iteration?
;# ---------------------------------------------------------------
;# Loop control for the unrolled loop, followed by the single-j-atom
;# remainder path. The remainder path uses scalar (..sd) arithmetic, so
;# only the low lane of each xmm register is meaningful there.
;# ---------------------------------------------------------------
	sub     dword ptr [rsp + nb111_innerk], 2   ;# innerk -= 2
	jl      .nb111_checksingle      ;# went negative: no full pair left
	jmp     .nb111_unroll_loop

.nb111_checksingle:
	;# bit 0 of innerk set => one j atom is still left to process
	mov     edx, [rsp + nb111_innerk]
	and     edx, 1
	jnz     .nb111_dosingle
	jmp     .nb111_updateouterdata

.nb111_dosingle:
	mov     rdx, [rsp + nb111_innerjjnr]    ;# pointer to jjnr[k]
	mov     eax, [rdx]                      ;# eax = jnr (32-bit load zero-extends into rax)
	add     qword ptr [rsp + nb111_innerjjnr], 4    ;# advance past the 4-byte index

	;# qqO = charge[jnr]*iqO, qqH = charge[jnr]*iqH (high lane zeroed by xorpd)
	mov     rsi, [rbp + nb111_charge]       ;# base of charge[]
	xorpd   xmm3, xmm3
	movlpd  xmm3, [rsi + rax*8]
	movapd  xmm4, xmm3
	mulpd   xmm3, [rsp + nb111_iqO]
	mulpd   xmm4, [rsp + nb111_iqH]
	movapd  [rsp + nb111_qqO], xmm3
	movapd  [rsp + nb111_qqH], xmm4

	;# LJ parameters: index = 2*type[jnr] + ntia; c6 at [index], c12 at [index+1]
	mov     rsi, [rbp + nb111_type]
	mov     r8d, [rsi + rax*4]
	mov     rsi, [rbp + nb111_vdwparam]
	shl     r8d, 1
	mov     edi, [rsp + nb111_ntia]
	add     r8d, edi

	movlpd  xmm6, [rsi + r8*8]              ;# c6a
	movhpd  xmm6, [rsi + r8*8 + 8]          ;# c6a c12a

	;# split the (c6, c12) pair into two registers, each padded with 0.0
	xorpd   xmm7, xmm7
	movapd  xmm4, xmm6
	unpcklpd xmm4, xmm7                     ;# xmm4 = (c6, 0)
	unpckhpd xmm6, xmm7                     ;# xmm6 = (c12, 0)

	movapd  [rsp + nb111_c6], xmm4
	movapd  [rsp + nb111_c12], xmm6

	mov     rsi, [rbp + nb111_pos]          ;# base of pos[]
	lea     rax, [rax + rax*2]              ;# replace jnr with j3

	;# move coordinates to xmm0-xmm2
	;# (loaded into xmm4-xmm6, then copied so xmm0-xmm2 keep the j position)
	movlpd  xmm4, [rsi + rax*8]
	movlpd  xmm5, [rsi + rax*8 + 8]
	movlpd  xmm6, [rsi + rax*8 + 16]
	movapd  xmm0, xmm4
	movapd  xmm1, xmm5
	movapd  xmm2, xmm6

	;# calc dr  (j coordinate minus i coordinate)
	subsd   xmm4, [rsp + nb111_ixO]
	subsd   xmm5, [rsp + nb111_iyO]
	subsd   xmm6, [rsp + nb111_izO]

	;# store dr
	movapd  [rsp + nb111_dxO], xmm4
	movapd  [rsp + nb111_dyO], xmm5
	movapd  [rsp + nb111_dzO], xmm6

	;# square it
	mulsd   xmm4, xmm4
	mulsd   xmm5, xmm5
	mulsd   xmm6, xmm6
	addsd   xmm4, xmm5
	addsd   xmm4, xmm6
	movapd  xmm7, xmm4
	;# rsqO in xmm7

	;# move j coords to xmm4-xmm6
	movapd  xmm4, xmm0
	movapd  xmm5, xmm1
	movapd  xmm6, xmm2

	;# calc dr
	subsd   xmm4, [rsp + nb111_ixH1]
	subsd   xmm5, [rsp + nb111_iyH1]
	subsd   xmm6, [rsp + nb111_izH1]

	;# store dr
	movapd  [rsp + nb111_dxH1], xmm4
	movapd  [rsp + nb111_dyH1], xmm5
	movapd  [rsp + nb111_dzH1], xmm6

	;# square it
	mulsd   xmm4, xmm4
	mulsd   xmm5, xmm5
	mulsd   xmm6, xmm6
	addsd   xmm6, xmm5
	addsd   xmm6, xmm4
	;# rsqH1 in xmm6

	;# move j coords to xmm3-xmm5
	movapd  xmm3, xmm0
	movapd  xmm4, xmm1
	movapd  xmm5, xmm2

	;# calc dr
	subsd   xmm3, [rsp + nb111_ixH2]
	subsd   xmm4, [rsp + nb111_iyH2]
	subsd   xmm5, [rsp + nb111_izH2]

	;# store dr
	movapd  [rsp + nb111_dxH2], xmm3
	movapd  [rsp + nb111_dyH2], xmm4
	movapd  [rsp + nb111_dzH2], xmm5

	;# square it
	mulsd   xmm3, xmm3
	mulsd   xmm4, xmm4
	mulsd   xmm5, xmm5
	addsd   xmm5, xmm4
	addsd   xmm5, xmm3
	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7

	;# 1/sqrt(rsq) for each site: single-precision rsqrtss seed, then two
	;# refinement steps lu' = half * lu * (three - rsq*lu*lu)

	;# start with rsqO - put seed in xmm2
	cvtsd2ss xmm2, xmm7
	rsqrtss xmm2, xmm2
	cvtss2sd xmm2, xmm2

	movapd  xmm3, xmm2
	mulsd   xmm2, xmm2
	movapd  xmm4, [rsp + nb111_three]
	mulsd   xmm2, xmm7                      ;# rsq*lu*lu
	subsd   xmm4, xmm2                      ;# 3-rsq*lu*lu
	mulsd   xmm4, xmm3                      ;# lu*(3-rsq*lu*lu)
	mulsd   xmm4, [rsp + nb111_half]        ;# iter1 ( new lu)

	movapd  xmm3, xmm4
	mulsd   xmm4, xmm4                      ;# lu*lu
	mulsd   xmm7, xmm4                      ;# rsq*lu*lu
	movapd  xmm4, [rsp + nb111_three]
	subsd   xmm4, xmm7                      ;# 3-rsq*lu*lu
	mulsd   xmm4, xmm3                      ;# lu*(3-rsq*lu*lu)
	mulsd   xmm4, [rsp + nb111_half]        ;# rinv
	movapd  xmm7, xmm4                      ;# rinvO in xmm7

	;# rsqH1 - seed in xmm2
	cvtsd2ss xmm2, xmm6
	rsqrtss xmm2, xmm2
	cvtss2sd xmm2, xmm2

	movapd  xmm3, xmm2
	mulsd   xmm2, xmm2
	movapd  xmm4, [rsp + nb111_three]
	mulsd   xmm2, xmm6                      ;# rsq*lu*lu
	subsd   xmm4, xmm2                      ;# 3-rsq*lu*lu
	mulsd   xmm4, xmm3                      ;# lu*(3-rsq*lu*lu)
	mulsd   xmm4, [rsp + nb111_half]        ;# iter1 ( new lu)

	movapd  xmm3, xmm4
	mulsd   xmm4, xmm4                      ;# lu*lu
	mulsd   xmm6, xmm4                      ;# rsq*lu*lu
	movapd  xmm4, [rsp + nb111_three]
	subsd   xmm4, xmm6                      ;# 3-rsq*lu*lu
	mulsd   xmm4, xmm3                      ;# lu*(3-rsq*lu*lu)
	mulsd   xmm4, [rsp + nb111_half]        ;# rinv
	movapd  xmm6, xmm4                      ;# rinvH1 in xmm6

	;# rsqH2 - seed in xmm2
	cvtsd2ss xmm2, xmm5
	rsqrtss xmm2, xmm2
	cvtss2sd xmm2, xmm2

	movapd  xmm3, xmm2
	mulsd   xmm2, xmm2
	movapd  xmm4, [rsp + nb111_three]
	mulsd   xmm2, xmm5                      ;# rsq*lu*lu
	subsd   xmm4, xmm2                      ;# 3-rsq*lu*lu
	mulsd   xmm4, xmm3                      ;# lu*(3-rsq*lu*lu)
	mulsd   xmm4, [rsp + nb111_half]        ;# iter1 ( new lu)

	movapd  xmm3, xmm4
	mulsd   xmm4, xmm4                      ;# lu*lu
	mulsd   xmm5, xmm4                      ;# rsq*lu*lu
	movapd  xmm4, [rsp + nb111_three]
	subsd   xmm4, xmm5                      ;# 3-rsq*lu*lu
	mulsd   xmm4, xmm3                      ;# lu*(3-rsq*lu*lu)
	mulsd   xmm4, [rsp + nb111_half]        ;# rinv
	movapd  xmm5, xmm4                      ;# rinvH2 in xmm5

	;# do O interactions
	movapd  xmm4, xmm7
	mulsd   xmm4, xmm4                      ;# xmm7=rinv, xmm4=rinvsq
	movapd  xmm1, xmm4
	mulsd   xmm1, xmm4
	mulsd   xmm1, xmm4                      ;# xmm1=rinvsix
	movapd  xmm2, xmm1
	mulsd   xmm2, xmm2                      ;# xmm2=rinvtwelve
	mulsd   xmm7, [rsp + nb111_qqO]         ;# xmm7=vcoul

	mulsd   xmm1, [rsp + nb111_c6]
	mulsd   xmm2, [rsp + nb111_c12]
	movapd  xmm3, xmm2
	subsd   xmm3, xmm1                      ;# Vvdw=Vvdw12-Vvdw6
	addsd   xmm3, [rsp + nb111_Vvdwtot]
	mulsd   xmm1, [rsp + nb111_six]
	mulsd   xmm2, [rsp + nb111_twelve]
	subsd   xmm2, xmm1
	addsd   xmm2, xmm7
	mulsd   xmm4, xmm2                      ;# total fsO in xmm4

	addsd   xmm7, [rsp + nb111_vctot]

	;# movsd stores touch only the low 8 bytes of each stack slot
	movsd   [rsp + nb111_Vvdwtot], xmm3
	movsd   [rsp + nb111_vctot], xmm7

	movapd  xmm0, [rsp + nb111_dxO]
	movapd  xmm1, [rsp + nb111_dyO]
	movapd  xmm2, [rsp + nb111_dzO]
	mulsd   xmm0, xmm4
	mulsd   xmm1, xmm4
	mulsd   xmm2, xmm4

	;# update O forces
	movapd  xmm3, [rsp + nb111_fixO]
	movapd  xmm4, [rsp + nb111_fiyO]
	movapd  xmm7, [rsp + nb111_fizO]
	addsd   xmm3, xmm0
	addsd   xmm4, xmm1
	addsd   xmm7, xmm2
	movsd   [rsp + nb111_fixO], xmm3
	movsd   [rsp + nb111_fiyO], xmm4
	movsd   [rsp + nb111_fizO], xmm7

	;# update j forces with water O
	movsd   [rsp + nb111_fjx], xmm0
	movsd   [rsp + nb111_fjy], xmm1
	movsd   [rsp + nb111_fjz], xmm2

	;# H1 interactions
	movapd  xmm4, xmm6
	mulsd   xmm4, xmm4                      ;# xmm6=rinv, xmm4=rinvsq
	mulsd   xmm6, [rsp + nb111_qqH]         ;# xmm6=vcoul
	mulsd   xmm4, xmm6                      ;# total fsH1 in xmm4

	addsd   xmm6, [rsp + nb111_vctot]

	movapd  xmm0, [rsp + nb111_dxH1]
	movapd  xmm1, [rsp + nb111_dyH1]
	movapd  xmm2, [rsp + nb111_dzH1]
	movsd   [rsp + nb111_vctot], xmm6
	mulsd   xmm0, xmm4
	mulsd   xmm1, xmm4
	mulsd   xmm2, xmm4

	;# update H1 forces
	movapd  xmm3, [rsp + nb111_fixH1]
	movapd  xmm4, [rsp + nb111_fiyH1]
	movapd  xmm7, [rsp + nb111_fizH1]
	addsd   xmm3, xmm0
	addsd   xmm4, xmm1
	addsd   xmm7, xmm2
	movsd   [rsp + nb111_fixH1], xmm3
	movsd   [rsp + nb111_fiyH1], xmm4
	movsd   [rsp + nb111_fizH1], xmm7

	;# update j forces with water H1
	addsd   xmm0, [rsp + nb111_fjx]
	addsd   xmm1, [rsp + nb111_fjy]
	addsd   xmm2, [rsp + nb111_fjz]
	movsd   [rsp + nb111_fjx], xmm0
	movsd   [rsp + nb111_fjy], xmm1
	movsd   [rsp + nb111_fjz], xmm2

	;# H2 interactions
	movapd  xmm4, xmm5
	mulsd   xmm4, xmm4                      ;# xmm5=rinv, xmm4=rinvsq
	mulsd   xmm5, [rsp + nb111_qqH]         ;# xmm5=vcoul
	mulsd   xmm4, xmm5                      ;# total fsH2 in xmm4

	addsd   xmm5, [rsp + nb111_vctot]

	movapd  xmm0, [rsp + nb111_dxH2]
	movapd  xmm1, [rsp + nb111_dyH2]
	movapd  xmm2, [rsp + nb111_dzH2]
	movsd   [rsp + nb111_vctot], xmm5
	mulsd   xmm0, xmm4
	mulsd   xmm1, xmm4
	mulsd   xmm2, xmm4

	;# update H2 forces
	movapd  xmm3, [rsp + nb111_fixH2]
	movapd  xmm4, [rsp + nb111_fiyH2]
	movapd  xmm7, [rsp + nb111_fizH2]
	addsd   xmm3, xmm0
	addsd   xmm4, xmm1
	addsd   xmm7, xmm2
	movsd   [rsp + nb111_fixH2], xmm3
	movsd   [rsp + nb111_fiyH2], xmm4
	movsd   [rsp + nb111_fizH2], xmm7

	mov     rdi, [rbp + nb111_faction]
	;# update j forces
	;# (xmm0-xmm2 = O + H1 + H2 contributions; xmm3-xmm5 = current faction
	;# entry plus that sum. NOTE(review): the write-back of xmm3-xmm5
	;# presumably follows beyond this excerpt - confirm.)
	addsd   xmm0, [rsp + nb111_fjx]
	addsd   xmm1, [rsp + nb111_fjy]
	addsd   xmm2, [rsp + nb111_fjz]
	movlpd  xmm3, [rdi + rax*8]
	movlpd  xmm4, [rdi + rax*8 + 8]
	movlpd  xmm5, [rdi + rax*8 + 16]
	addsd   xmm3, xmm0
	addsd   xmm4, xmm1
	addsd   xmm5, xmm2

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?