nb_kernel213_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,195 行 · 第 1/4 页

S
2,195
字号
	;# ---------------------------------------------------------------------
	;# Interior excerpt of the nb213 kernel (GAS .intel_syntax, ';#' comments).
	;# The function entry/exit and the definitions of the nb213_* offsets are
	;# outside this excerpt.
	;#
	;# Visible here:
	;#   1. Tail of the unrolled iteration that handles two j atoms at once
	;#      (low lane indexed by rax, high lane by rbx): 1/sqrt for the
	;#      H1, H2 and M distances, Coulomb term qq*(rinv + k*rsq - crf),
	;#      force accumulation and j-force write-back.
	;#   2. Loop-continuation test (.nb213_checksingle).
	;#   3. Start of the single-j remainder path (.nb213_dosingle).
	;#
	;# Layout: one instruction per line, labels flush-left, so that a ';#'
	;# comment never swallows a following instruction.
	;# ---------------------------------------------------------------------

	;# start doing invsqrt for j atoms
	;# on entry: rsq for H1, H2, M in xmm0, xmm3, xmm6 (two j atoms per register)
	;# single-precision rsqrt estimate is used as the seed (lu)
	cvtpd2ps xmm1, xmm0
	cvtpd2ps xmm4, xmm3
	cvtpd2ps xmm7, xmm6
	rsqrtps  xmm1, xmm1
	rsqrtps  xmm4, xmm4
	rsqrtps  xmm7, xmm7
	cvtps2pd xmm1, xmm1
	cvtps2pd xmm4, xmm4
	cvtps2pd xmm7, xmm7

	movapd   xmm2, xmm1
	movapd   xmm5, xmm4
	movapd   xmm8, xmm7

	mulpd    xmm1, xmm1                     ;# lu*lu
	mulpd    xmm4, xmm4                     ;# lu*lu
	mulpd    xmm7, xmm7                     ;# lu*lu

	movapd   xmm9, [rsp + nb213_three]
	movapd   xmm10, xmm9
	movapd   xmm11, xmm9

	mulpd    xmm1, xmm0                     ;# rsq*lu*lu
	mulpd    xmm4, xmm3                     ;# rsq*lu*lu
	mulpd    xmm7, xmm6                     ;# rsq*lu*lu

	subpd    xmm9, xmm1
	subpd    xmm10, xmm4
	subpd    xmm11, xmm7                    ;# 3-rsq*lu*lu

	mulpd    xmm9, xmm2
	mulpd    xmm10, xmm5
	mulpd    xmm11, xmm8                    ;# lu*(3-rsq*lu*lu)

	movapd   xmm15, [rsp + nb213_half]
	mulpd    xmm9, xmm15                    ;# first iteration for rinvH1
	mulpd    xmm10, xmm15                   ;# first iteration for rinvH2
	mulpd    xmm11, xmm15                   ;# first iteration for rinvM

	;# second iteration step (same update, starting from the refined lu)
	movapd   xmm2, xmm9
	movapd   xmm5, xmm10
	movapd   xmm8, xmm11

	mulpd    xmm2, xmm2                     ;# lu*lu
	mulpd    xmm5, xmm5                     ;# lu*lu
	mulpd    xmm8, xmm8                     ;# lu*lu

	movapd   xmm1, [rsp + nb213_three]
	movapd   xmm4, xmm1
	movapd   xmm7, xmm1

	mulpd    xmm2, xmm0                     ;# rsq*lu*lu
	mulpd    xmm5, xmm3                     ;# rsq*lu*lu
	mulpd    xmm8, xmm6                     ;# rsq*lu*lu

	subpd    xmm1, xmm2
	subpd    xmm4, xmm5
	subpd    xmm7, xmm8                     ;# 3-rsq*lu*lu

	mulpd    xmm9, xmm1
	mulpd    xmm10, xmm4
	mulpd    xmm11, xmm7                    ;# lu*(3-rsq*lu*lu)

	movapd   xmm15, [rsp + nb213_half]
	mulpd    xmm9, xmm15                    ;# rinvH1
	mulpd    xmm10, xmm15                   ;# rinvH2
	mulpd    xmm11, xmm15                   ;# rinvM

	;# interactions
	;# rsq in xmm0,xmm3,xmm6
	;# rinv in xmm9, xmm10, xmm11

	movapd   xmm1, xmm9                     ;# copy of rinv
	movapd   xmm4, xmm10
	movapd   xmm7, xmm11
	movapd   xmm2, [rsp + nb213_krf]
	mulpd    xmm9, xmm9                     ;# rinvsq
	mulpd    xmm10, xmm10
	mulpd    xmm11, xmm11
	mulpd    xmm0, xmm2                     ;# k*rsq
	mulpd    xmm3, xmm2
	mulpd    xmm6, xmm2
	movapd   xmm2, xmm0                     ;# copy of k*rsq
	movapd   xmm5, xmm3
	movapd   xmm8, xmm6
	addpd    xmm2, xmm1                     ;# rinv+krsq
	addpd    xmm5, xmm4
	addpd    xmm8, xmm7
	movapd   xmm14, [rsp + nb213_crf]
	subpd    xmm2, xmm14                    ;# rinv+krsq-crf
	subpd    xmm5, xmm14
	subpd    xmm8, xmm14
	movapd   xmm12, [rsp + nb213_qqH]
	movapd   xmm13, [rsp + nb213_qqM]
	mulpd    xmm2, xmm12                    ;# vcoul=qq*(rinv+ krsq-crf)
	mulpd    xmm5, xmm12                    ;# vcoul=qq*(rinv+ krsq-crf)
	mulpd    xmm8, xmm13                    ;# vcoul=qq*(rinv+ krsq-crf)
	addpd    xmm0, xmm0                     ;# 2*krsq
	addpd    xmm3, xmm3
	addpd    xmm6, xmm6
	subpd    xmm1, xmm0                     ;# rinv-2*krsq
	subpd    xmm4, xmm3
	subpd    xmm7, xmm6
	mulpd    xmm1, xmm12                    ;# (rinv-2*krsq)*qq
	mulpd    xmm4, xmm12
	mulpd    xmm7, xmm13
	;# accumulate the three vcoul terms into the running vctot
	addpd    xmm2, [rsp + nb213_vctot]
	addpd    xmm5, xmm8
	addpd    xmm2, xmm5
	movapd   [rsp + nb213_vctot], xmm2

	mulpd    xmm9, xmm1                     ;# fscal
	mulpd    xmm10, xmm4
	mulpd    xmm11, xmm7

	;# move j forces to xmm0-xmm2
	;# low lane: j atom at index rax, high lane: j atom at index rbx
	mov      rdi, [rbp + nb213_faction]
	movlpd   xmm0, [rdi + rax*8]
	movlpd   xmm1, [rdi + rax*8 + 8]
	movlpd   xmm2, [rdi + rax*8 + 16]
	movhpd   xmm0, [rdi + rbx*8]
	movhpd   xmm1, [rdi + rbx*8 + 8]
	movhpd   xmm2, [rdi + rbx*8 + 16]

	;# replicate fscal: H1 -> xmm7-9, H2 -> xmm10-12, M -> xmm13-15
	;# (xmm11 must be copied out for M before it is overwritten with H2)
	movapd   xmm7, xmm9
	movapd   xmm8, xmm9
	movapd   xmm13, xmm11
	movapd   xmm14, xmm11
	movapd   xmm15, xmm11
	movapd   xmm11, xmm10
	movapd   xmm12, xmm10

	;# add forces from O interaction
	addpd    xmm0, [rsp + nb213_fjx]
	addpd    xmm1, [rsp + nb213_fjy]
	addpd    xmm2, [rsp + nb213_fjz]

	;# force vectors = fscal * dr
	mulpd    xmm7, [rsp + nb213_dxH1]
	mulpd    xmm8, [rsp + nb213_dyH1]
	mulpd    xmm9, [rsp + nb213_dzH1]
	mulpd    xmm10, [rsp + nb213_dxH2]
	mulpd    xmm11, [rsp + nb213_dyH2]
	mulpd    xmm12, [rsp + nb213_dzH2]
	mulpd    xmm13, [rsp + nb213_dxM]
	mulpd    xmm14, [rsp + nb213_dyM]
	mulpd    xmm15, [rsp + nb213_dzM]

	;# each force is added to the j total (xmm0-2) and to the i-site accumulator
	addpd    xmm0, xmm7
	addpd    xmm1, xmm8
	addpd    xmm2, xmm9
	addpd    xmm7, [rsp + nb213_fixH1]
	addpd    xmm8, [rsp + nb213_fiyH1]
	addpd    xmm9, [rsp + nb213_fizH1]

	addpd    xmm0, xmm10
	addpd    xmm1, xmm11
	addpd    xmm2, xmm12
	addpd    xmm10, [rsp + nb213_fixH2]
	addpd    xmm11, [rsp + nb213_fiyH2]
	addpd    xmm12, [rsp + nb213_fizH2]

	addpd    xmm0, xmm13
	addpd    xmm1, xmm14
	addpd    xmm2, xmm15
	addpd    xmm13, [rsp + nb213_fixM]
	addpd    xmm14, [rsp + nb213_fiyM]
	addpd    xmm15, [rsp + nb213_fizM]

	movapd   [rsp + nb213_fixH1], xmm7
	movapd   [rsp + nb213_fiyH1], xmm8
	movapd   [rsp + nb213_fizH1], xmm9
	movapd   [rsp + nb213_fixH2], xmm10
	movapd   [rsp + nb213_fiyH2], xmm11
	movapd   [rsp + nb213_fizH2], xmm12
	movapd   [rsp + nb213_fixM], xmm13
	movapd   [rsp + nb213_fiyM], xmm14
	movapd   [rsp + nb213_fizM], xmm15

	;# store back j forces from xmm0-xmm2
	movlpd   [rdi + rax*8], xmm0
	movlpd   [rdi + rax*8 + 8], xmm1
	movlpd   [rdi + rax*8 + 16], xmm2
	movhpd   [rdi + rbx*8], xmm0
	movhpd   [rdi + rbx*8 + 8], xmm1
	movhpd   [rdi + rbx*8 + 16], xmm2

	;# should we do one more iteration?
	;# innerk is decremented by 2 per unrolled pass; signed test (jl) on the result
	sub      dword ptr [rsp + nb213_innerk], 2
	jl       .nb213_checksingle
	jmp      .nb213_unroll_loop
.nb213_checksingle:
	;# odd innerk -> handle one more j atom in the single path
	mov      edx, [rsp + nb213_innerk]
	and      edx, 1
	jnz      .nb213_dosingle
	jmp      .nb213_updateouterdata
.nb213_dosingle:
	mov      rdx, [rsp + nb213_innerjjnr]   ;# pointer to jjnr[k]
	mov      eax, [rdx]
	add      qword ptr [rsp + nb213_innerjjnr], 4

	mov      rsi, [rbp + nb213_charge]      ;# base of charge[]
	xorpd    xmm3, xmm3
	movlpd   xmm3, [rsi + rax*8]
	movapd   xmm4, xmm3
	mulsd    xmm3, [rsp + nb213_iqM]
	mulsd    xmm4, [rsp + nb213_iqH]

	movapd   [rsp + nb213_qqM], xmm3
	movapd   [rsp + nb213_qqH], xmm4

	;# parameter index: r8d = 2*type[jnr] + ntia
	mov      rsi, [rbp + nb213_type]
	mov      r8d, [rsi + rax*4]
	mov      rsi, [rbp + nb213_vdwparam]
	shl      r8d, 1
	mov      edi, [rsp + nb213_ntia]
	add      r8d, edi

	movlpd   xmm6, [rsi + r8*8]             ;# c6a
	movhpd   xmm6, [rsi + r8*8 + 8]         ;# c6a c12a

	;# split the pair: xmm4 = (c6, 0), xmm6 = (c12, 0)
	xorpd    xmm7, xmm7
	movapd   xmm4, xmm6
	unpcklpd xmm4, xmm7
	unpckhpd xmm6, xmm7

	movapd   [rsp + nb213_c6], xmm4
	movapd   [rsp + nb213_c12], xmm6

	mov      rsi, [rbp + nb213_pos]         ;# base of pos[]
	lea      rax, [rax + rax*2]             ;# replace jnr with j3

	;# move coordinates to xmm0-xmm2  and xmm4-xmm6
	movlpd   xmm4, [rsi + rax*8]
	movlpd   xmm5, [rsi + rax*8 + 8]
	movlpd   xmm6, [rsi + rax*8 + 16]
	movapd   xmm0, xmm4
	movapd   xmm1, xmm5
	movapd   xmm2, xmm6

	;# calc dr
	subsd    xmm4, [rsp + nb213_ixO]
	subsd    xmm5, [rsp + nb213_iyO]
	subsd    xmm6, [rsp + nb213_izO]

	;# store dr
	movapd   [rsp + nb213_dxO], xmm4
	movapd   [rsp + nb213_dyO], xmm5
	movapd   [rsp + nb213_dzO], xmm6
	;# square it
	mulsd    xmm4, xmm4
	mulsd    xmm5, xmm5
	mulsd    xmm6, xmm6
	addsd    xmm4, xmm5
	addsd    xmm4, xmm6
	movapd   xmm7, xmm4                     ;# rsqO in xmm7

	;# move j coords to xmm4-xmm6
	movapd   xmm4, xmm0
	movapd   xmm5, xmm1
	movapd   xmm6, xmm2
	;# calc dr
	subsd    xmm4, [rsp + nb213_ixH1]
	subsd    xmm5, [rsp + nb213_iyH1]
	subsd    xmm6, [rsp + nb213_izH1]
	;# store dr
	movapd   [rsp + nb213_dxH1], xmm4
	movapd   [rsp + nb213_dyH1], xmm5
	movapd   [rsp + nb213_dzH1], xmm6
	;# square it
	mulsd    xmm4, xmm4
	mulsd    xmm5, xmm5
	mulsd    xmm6, xmm6
	addsd    xmm6, xmm5
	addsd    xmm6, xmm4                     ;# rsqH1 in xmm6

	;# move j coords to xmm3-xmm5
	movapd   xmm3, xmm0
	movapd   xmm4, xmm1
	movapd   xmm5, xmm2
	;# calc dr
	subsd    xmm3, [rsp + nb213_ixH2]
	subsd    xmm4, [rsp + nb213_iyH2]
	subsd    xmm5, [rsp + nb213_izH2]
	;# store dr
	movapd   [rsp + nb213_dxH2], xmm3
	movapd   [rsp + nb213_dyH2], xmm4
	movapd   [rsp + nb213_dzH2], xmm5
	;# square it
	mulsd    xmm3, xmm3
	mulsd    xmm4, xmm4
	mulsd    xmm5, xmm5
	addsd    xmm5, xmm4
	addsd    xmm5, xmm3                     ;# rsqH2 in xmm5

	;# move j coords to xmm4-xmm2
	movapd   xmm4, xmm0
	movapd   xmm3, xmm1
	;# xmm2 already contains z
	;# calc dr
	subsd    xmm4, [rsp + nb213_ixM]
	subsd    xmm3, [rsp + nb213_iyM]
	subsd    xmm2, [rsp + nb213_izM]
	;# store dr
	movapd   [rsp + nb213_dxM], xmm4
	movapd   [rsp + nb213_dyM], xmm3
	movapd   [rsp + nb213_dzM], xmm2
	;# square it
	;# NOTE(review): packed mulpd/addpd here, unlike the scalar mulsd/addsd used
	;# for O/H1/H2 above; within this excerpt only the low lane of xmm4 is
	;# consumed afterwards (mulsd / cvtsd2ss).
	mulpd    xmm2, xmm2
	mulpd    xmm3, xmm3
	mulpd    xmm4, xmm4
	addpd    xmm4, xmm3
	addpd    xmm4, xmm2
	;# rsqM in xmm4, rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7

	;# calculate krsq
	movsd    xmm0, [rsp + nb213_krf]
	movsd    xmm1, xmm0
	movsd    xmm2, xmm0
	mulsd    xmm0, xmm4
	mulsd    xmm1, xmm5
	mulsd    xmm2, xmm6
	movsd    [rsp + nb213_krsqM], xmm0
	movsd    [rsp + nb213_krsqH2], xmm1
	movsd    [rsp + nb213_krsqH1], xmm2

	;# start with rsqH1 - put seed in xmm2
	cvtsd2ss xmm2, xmm6
	rsqrtss  xmm2, xmm2
	cvtss2sd xmm2, xmm2

	movapd   xmm3, xmm2
	mulsd    xmm2, xmm2
	movapd   xmm1, [rsp + nb213_three]
	mulsd    xmm2, xmm6                     ;# rsq*lu*lu
	subsd    xmm1, xmm2                     ;# 3-rsq*lu*lu
	mulsd    xmm1, xmm3                     ;# lu*(3-rsq*lu*lu)
	mulsd    xmm1, [rsp + nb213_half]       ;# iter1 ( new lu)

	movapd   xmm3, xmm1
	mulsd    xmm1, xmm1                     ;# lu*lu
	mulsd    xmm6, xmm1                     ;# rsq*lu*lu
	movapd   xmm1, [rsp + nb213_three]
	subsd    xmm1, xmm6                     ;# 3-rsq*lu*lu
	mulsd    xmm1, xmm3                     ;# lu*(3-rsq*lu*lu)
	mulsd    xmm1, [rsp + nb213_half]       ;# rinv
	movapd   [rsp + nb213_rinvH1], xmm1

	;# rsqH2 - seed in xmm2
	cvtsd2ss xmm2, xmm5
	rsqrtss  xmm2, xmm2
	cvtss2sd xmm2, xmm2

	movapd   xmm3, xmm2
	mulsd    xmm2, xmm2
	movapd   xmm1, [rsp + nb213_three]
	mulsd    xmm2, xmm5                     ;# rsq*lu*lu
	subsd    xmm1, xmm2                     ;# 3-rsq*lu*lu
	mulsd    xmm1, xmm3                     ;# lu*(3-rsq*lu*lu)
	mulsd    xmm1, [rsp + nb213_half]       ;# iter1 ( new lu)

	movapd   xmm3, xmm1
	mulsd    xmm1, xmm1                     ;# lu*lu
	mulsd    xmm5, xmm1                     ;# rsq*lu*lu
	movapd   xmm1, [rsp + nb213_three]
	subsd    xmm1, xmm5                     ;# 3-rsq*lu*lu
	mulsd    xmm1, xmm3                     ;# lu*(3-rsq*lu*lu)
	mulsd    xmm1, [rsp + nb213_half]       ;# rinv
	movapd   [rsp + nb213_rinvH2], xmm1

	;# rsqM - seed in xmm2
	cvtsd2ss xmm2, xmm4
	rsqrtss  xmm2, xmm2
	cvtss2sd xmm2, xmm2

	movapd   xmm3, xmm2
	mulsd    xmm2, xmm2
	movapd   xmm1, [rsp + nb213_three]
	mulsd    xmm2, xmm4                     ;# rsq*lu*lu
	subsd    xmm1, xmm2                     ;# 3-rsq*lu*lu
	mulsd    xmm1, xmm3                     ;# lu*(3-rsq*lu*lu)
	mulsd    xmm1, [rsp + nb213_half]       ;# iter1 ( new lu)

	movapd   xmm3, xmm1
	mulsd    xmm1, xmm1                     ;# lu*lu
	mulsd    xmm4, xmm1                     ;# rsq*lu*lu
	movapd   xmm1, [rsp + nb213_three]
	subsd    xmm1, xmm4                     ;# 3-rsq*lu*lu
	mulsd    xmm1, xmm3                     ;# lu*(3-rsq*lu*lu)
	mulsd    xmm1, [rsp + nb213_half]       ;# rinv
	movapd   [rsp + nb213_rinvM], xmm1

	;# do O interactions directly. xmm7=rsq
	;# rinvsq is obtained from a reciprocal estimate refined twice with
	;# x' = x*(2 - rsq*x)
	;# NOTE(review): rcpps is the packed form; only the low lane is consumed
	;# (cvtss2sd follows), so the upper lanes are don't-care here.
	cvtsd2ss xmm2, xmm7
	movapd   xmm6, xmm7
	rcpps    xmm2, xmm2
	cvtss2sd xmm2, xmm2
	movapd   xmm1, [rsp + nb213_two]
	movapd   xmm0, xmm1
	mulsd    xmm7, xmm2
	subsd    xmm1, xmm7
	mulsd    xmm2, xmm1                     ;# iter1
	mulsd    xmm6, xmm2
	subsd    xmm0, xmm6
	mulsd    xmm0, xmm2                     ;# xmm0=rinvsq
	movapd   xmm1, xmm0
	mulsd    xmm1, xmm1                     ;# rinv4
	mulsd    xmm1, xmm0                     ;# rinvsix
	movapd   xmm2, xmm1
	mulsd    xmm2, xmm2                     ;# rinvtwelve
	mulsd    xmm1, [rsp + nb213_c6]
	mulsd    xmm2, [rsp + nb213_c12]
	movapd   xmm3, xmm2
	subsd    xmm3, xmm1                     ;# Vvdw=Vvdw12-Vvdw6
	addsd    xmm3, [rsp + nb213_Vvdwtot]
	mulsd    xmm1, [rsp + nb213_six]
	mulsd    xmm2, [rsp + nb213_twelve]
	subsd    xmm2, xmm1
	mulsd    xmm2, xmm0
	movapd   xmm4, xmm2                     ;# total fsO
	movsd    [rsp + nb213_Vvdwtot], xmm3

	movapd   xmm0, [rsp + nb213_dxO]
	movapd   xmm1, [rsp + nb213_dyO]
	movapd   xmm2, [rsp + nb213_dzO]

	mulsd    xmm0, xmm4
	mulsd    xmm1, xmm4
	mulsd    xmm2, xmm4

	;# update O forces
	movapd   xmm3, [rsp + nb213_fixO]
	movapd   xmm4, [rsp + nb213_fiyO]
	movapd   xmm7, [rsp + nb213_fizO]
	addsd    xmm3, xmm0
	addsd    xmm4, xmm1
	addsd    xmm7, xmm2
	movsd    [rsp + nb213_fixO], xmm3
	movsd    [rsp + nb213_fiyO], xmm4
	movsd    [rsp + nb213_fizO], xmm7
	;# update j forces with water O
	movsd    [rsp + nb213_fjx], xmm0
	movsd    [rsp + nb213_fjy], xmm1
	movsd    [rsp + nb213_fjz], xmm2

	;# H1 interactions
	movsd    xmm6, [rsp + nb213_rinvH1]
	movsd    xmm4, xmm6
	mulsd    xmm4, xmm4                     ;# xmm6=rinv, xmm4=rinvsq
	movsd    xmm7, xmm6
	movsd    xmm0, [rsp + nb213_krsqH1]
	addsd    xmm6, xmm0                     ;# xmm6=rinv+ krsq
	mulsd    xmm0, [rsp + nb213_two]
	subsd    xmm6, [rsp + nb213_crf]
	subsd    xmm7, xmm0                     ;# xmm7=rinv-2*krsq
	mulsd    xmm6, [rsp + nb213_qqH]        ;# vcoul
	mulsd    xmm7, [rsp + nb213_qqH]
	mulsd    xmm4, xmm7                     ;# total fsH1 in xmm4

	addsd    xmm6, [rsp + nb213_vctot]

	movapd   xmm0, [rsp + nb213_dxH1]
	movapd   xmm1, [rsp + nb213_dyH1]
	movapd   xmm2, [rsp + nb213_dzH1]
	movsd    [rsp + nb213_vctot], xmm6
	mulsd    xmm0, xmm4
	mulsd    xmm1, xmm4
	mulsd    xmm2, xmm4

	;# update H1 forces
	movapd   xmm3, [rsp + nb213_fixH1]
	movapd   xmm4, [rsp + nb213_fiyH1]
	movapd   xmm7, [rsp + nb213_fizH1]
	addsd    xmm3, xmm0
	addsd    xmm4, xmm1
	addsd    xmm7, xmm2
	movsd    [rsp + nb213_fixH1], xmm3
	movsd    [rsp + nb213_fiyH1], xmm4
	movsd    [rsp + nb213_fizH1], xmm7
	;# update j forces with water H1
	addsd    xmm0, [rsp + nb213_fjx]
	addsd    xmm1, [rsp + nb213_fjy]
	addsd    xmm2, [rsp + nb213_fjz]
	movsd    [rsp + nb213_fjx], xmm0
	movsd    [rsp + nb213_fjy], xmm1
	movsd    [rsp + nb213_fjz], xmm2

	;# H2 interactions
	movsd    xmm5, [rsp + nb213_rinvH2]
	movsd    xmm4, xmm5
	mulsd    xmm4, xmm4                     ;# xmm5=rinv, xmm4=rinvsq
	movsd    xmm7, xmm5
	movsd    xmm0, [rsp + nb213_krsqH2]
	addsd    xmm5, xmm0                     ;# xmm5=rinv+ krsq
	mulsd    xmm0, [rsp + nb213_two]
	subsd    xmm5, [rsp + nb213_crf]
	subsd    xmm7, xmm0                     ;# xmm7=rinv-2*krsq
	mulsd    xmm5, [rsp + nb213_qqH]        ;# vcoul
	mulsd    xmm7, [rsp + nb213_qqH]
	mulsd    xmm4, xmm7                     ;# total fsH2 in xmm4

	addsd    xmm5, [rsp + nb213_vctot]

	movapd   xmm0, [rsp + nb213_dxH2]
	movapd   xmm1, [rsp + nb213_dyH2]
	movapd   xmm2, [rsp + nb213_dzH2]
	movsd    [rsp + nb213_vctot], xmm5
	mulsd    xmm0, xmm4
	mulsd    xmm1, xmm4
	mulsd    xmm2, xmm4

	;# update H2 forces
	movapd   xmm3, [rsp + nb213_fixH2]
	movapd   xmm4, [rsp + nb213_fiyH2]
	movapd   xmm7, [rsp + nb213_fizH2]
	addsd    xmm3, xmm0
	addsd    xmm4, xmm1
	addsd    xmm7, xmm2
	movsd    [rsp + nb213_fixH2], xmm3
	movsd    [rsp + nb213_fiyH2], xmm4
	movsd    [rsp + nb213_fizH2], xmm7
	;# update j forces with water H2
	addsd    xmm0, [rsp + nb213_fjx]
	addsd    xmm1, [rsp + nb213_fjy]
	addsd    xmm2, [rsp + nb213_fjz]
	movsd    [rsp + nb213_fjx], xmm0
	movsd    [rsp + nb213_fjy], xmm1
	movsd    [rsp + nb213_fjz], xmm2

	;# M interactions
	movsd    xmm5, [rsp + nb213_rinvM]
	movsd    xmm4, xmm5
	mulsd    xmm4, xmm4                     ;# xmm5=rinv, xmm4=rinvsq
	movsd    xmm7, xmm5
	movsd    xmm0, [rsp + nb213_krsqM]
	addsd    xmm5, xmm0                     ;# xmm5=rinv+ krsq
	mulsd    xmm0, [rsp + nb213_two]
	subsd    xmm5, [rsp + nb213_crf]
	subsd    xmm7, xmm0                     ;# xmm7=rinv-2*krsq

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?