nb_kernel203_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,732 行 · 第 1/4 页

S
1,732
字号
		subpd   xmm9, xmm1	subpd   xmm10, xmm4    subpd   xmm11, xmm7 ;# 3-rsq*lu*lu	mulpd   xmm9, xmm2	mulpd   xmm10, xmm5    mulpd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb203_half]	mulpd   xmm9, xmm15  ;# first iteration for rinvH1	mulpd   xmm10, xmm15 ;# first iteration for rinvH2    mulpd   xmm11, xmm15 ;# first iteration for rinvM    ;# second iteration step    	movapd  xmm2, xmm9	movapd  xmm5, xmm10    movapd  xmm8, xmm11    	mulpd   xmm2, xmm2 ;# lu*lu	mulpd   xmm5, xmm5 ;# lu*lu    mulpd   xmm8, xmm8 ;# lu*lu			movapd  xmm1, [rsp + nb203_three]	movapd  xmm4, xmm1    movapd  xmm7, xmm1	mulpd   xmm2, xmm0 ;# rsq*lu*lu	mulpd   xmm5, xmm3 ;# rsq*lu*lu     mulpd   xmm8, xmm6 ;# rsq*lu*lu		subpd   xmm1, xmm2	subpd   xmm4, xmm5    subpd   xmm7, xmm8 ;# 3-rsq*lu*lu	mulpd   xmm9, xmm1	mulpd   xmm10, xmm4    mulpd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb203_half]	mulpd   xmm9, xmm15  ;#  rinvH1	mulpd   xmm10, xmm15 ;#   rinvH2    mulpd   xmm11, xmm15 ;#   rinvM		;# interactions     ;# rsq in xmm0,xmm3,xmm6      ;# rinv in xmm9, xmm10, xmm11    movapd xmm1, xmm9 ;# copy of rinv    movapd xmm4, xmm10    movapd xmm7, xmm11    movapd xmm2, [rsp + nb203_krf]        mulpd  xmm9, xmm9   ;# rinvsq    mulpd  xmm10, xmm10    mulpd  xmm11, xmm11    mulpd  xmm0, xmm2  ;# k*rsq    mulpd  xmm3, xmm2    mulpd  xmm6, xmm2    movapd xmm2, xmm0 ;# copy of k*rsq    movapd xmm5, xmm3    movapd xmm8, xmm6    addpd  xmm2, xmm1  ;# rinv+krsq    addpd  xmm5, xmm4    addpd  xmm8, xmm7    movapd xmm14, [rsp + nb203_crf]    subpd  xmm2, xmm14   ;# rinv+krsq-crf    subpd  xmm5, xmm14    subpd  xmm8, xmm14    movapd xmm12, [rsp + nb203_qqH]    movapd xmm13, [rsp + nb203_qqM]        mulpd  xmm2, xmm12 ;# voul=qq*(rinv+ krsq-crf)    mulpd  xmm5, xmm12 ;# voul=qq*(rinv+ krsq-crf)    mulpd  xmm8, xmm13 ;# voul=qq*(rinv+ krsq-crf)    addpd  xmm0, xmm0 ;# 2*krsq    addpd  xmm3, xmm3     addpd  xmm6, xmm6     subpd  xmm1, xmm0 ;# rinv-2*krsq    subpd  xmm4, xmm3    subpd  
xmm7, xmm6    mulpd  xmm1, xmm12   ;# (rinv-2*krsq)*qq    mulpd  xmm4, xmm12    mulpd  xmm7, xmm13    addpd  xmm2, [rsp + nb203_vctot]    addpd  xmm5, xmm8    addpd  xmm2, xmm5    movapd [rsp + nb203_vctot], xmm2        mulpd  xmm9, xmm1   ;# fscal    mulpd  xmm10, xmm4    mulpd  xmm11, xmm7    ;# move j forces to xmm0-xmm2	movlpd xmm0, [rdi + rax*8]	movlpd xmm1, [rdi + rax*8 + 8]	movlpd xmm2, [rdi + rax*8 + 16]	movhpd xmm0, [rdi + rbx*8]	movhpd xmm1, [rdi + rbx*8 + 8]	movhpd xmm2, [rdi + rbx*8 + 16]    movapd xmm7, xmm9    movapd xmm8, xmm9    movapd xmm13, xmm11    movapd xmm14, xmm11    movapd xmm15, xmm11    movapd xmm11, xmm10    movapd xmm12, xmm10	mulpd xmm7, [rsp + nb203_dxH1]	mulpd xmm8, [rsp + nb203_dyH1]	mulpd xmm9, [rsp + nb203_dzH1]	mulpd xmm10, [rsp + nb203_dxH2]	mulpd xmm11, [rsp + nb203_dyH2]	mulpd xmm12, [rsp + nb203_dzH2]	mulpd xmm13, [rsp + nb203_dxM]	mulpd xmm14, [rsp + nb203_dyM]	mulpd xmm15, [rsp + nb203_dzM]    addpd xmm0, xmm7    addpd xmm1, xmm8    addpd xmm2, xmm9    addpd xmm7, [rsp + nb203_fixH1]    addpd xmm8, [rsp + nb203_fiyH1]    addpd xmm9, [rsp + nb203_fizH1]    addpd xmm0, xmm10    addpd xmm1, xmm11    addpd xmm2, xmm12    addpd xmm10, [rsp + nb203_fixH2]    addpd xmm11, [rsp + nb203_fiyH2]    addpd xmm12, [rsp + nb203_fizH2]    addpd xmm0, xmm13    addpd xmm1, xmm14    addpd xmm2, xmm15    addpd xmm13, [rsp + nb203_fixM]    addpd xmm14, [rsp + nb203_fiyM]    addpd xmm15, [rsp + nb203_fizM]    movapd [rsp + nb203_fixH1], xmm7    movapd [rsp + nb203_fiyH1], xmm8    movapd [rsp + nb203_fizH1], xmm9    movapd [rsp + nb203_fixH2], xmm10    movapd [rsp + nb203_fiyH2], xmm11    movapd [rsp + nb203_fizH2], xmm12    movapd [rsp + nb203_fixM], xmm13    movapd [rsp + nb203_fiyM], xmm14    movapd [rsp + nb203_fizM], xmm15       ;# store back j forces from xmm0-xmm2	movlpd [rdi + rax*8], xmm0	movlpd [rdi + rax*8 + 8], xmm1	movlpd [rdi + rax*8 + 16], xmm2	movhpd [rdi + rbx*8], xmm0	movhpd [rdi + rbx*8 + 8], xmm1	movhpd [rdi + rbx*8 + 16], xmm2	
;# should we do one more iteration? 	sub dword ptr [rsp + nb203_innerk],  2	jl    .nb203_checksingle	jmp   .nb203_unroll_loop.nb203_checksingle:		mov   edx, [rsp + nb203_innerk]	and   edx, 1	jnz   .nb203_dosingle	jmp   .nb203_updateouterdata.nb203_dosingle:	mov   rdx, [rsp + nb203_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [rdx]		add qword ptr [rsp + nb203_innerjjnr],  4		mov rsi, [rbp + nb203_charge]    ;# base of charge[] 	xorpd xmm3, xmm3	movlpd xmm3, [rsi + rax*8]	movapd xmm4, xmm3	mulpd  xmm3, [rsp + nb203_iqM]	mulpd  xmm4, [rsp + nb203_iqH]	movapd  [rsp + nb203_qqM], xmm3	movapd  [rsp + nb203_qqH], xmm4		mov rsi, [rbp + nb203_pos]       ;# base of pos[] 	lea   rax, [rax + rax*2]     ;# replace jnr with j3 	;# move coordinates to xmm4-xmm6 & xmm0-xmm2 		movlpd xmm4, [rsi + rax*8]	movlpd xmm5, [rsi + rax*8 + 8]	movlpd xmm6, [rsi + rax*8 + 16]    movapd xmm0, xmm4    movapd xmm1, xmm5    movapd xmm2, xmm6    	;# calc dr 	subsd xmm4, [rsp + nb203_ixM]	subsd xmm5, [rsp + nb203_iyM]	subsd xmm6, [rsp + nb203_izM]	;# store dr 	movapd [rsp + nb203_dxM], xmm4	movapd [rsp + nb203_dyM], xmm5	movapd [rsp + nb203_dzM], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm4, xmm5	addsd xmm4, xmm6	movapd xmm7, xmm4	;# rsqM in xmm7 	;# move j coords to xmm4-xmm6 	movapd xmm4, xmm0	movapd xmm5, xmm1	movapd xmm6, xmm2	;# calc dr 	subsd xmm4, [rsp + nb203_ixH1]	subsd xmm5, [rsp + nb203_iyH1]	subsd xmm6, [rsp + nb203_izH1]	;# store dr 	movapd [rsp + nb203_dxH1], xmm4	movapd [rsp + nb203_dyH1], xmm5	movapd [rsp + nb203_dzH1], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm6, xmm5	addsd xmm6, xmm4	;# rsqH1 in xmm6 	;# move j coords to xmm3-xmm5 	movapd xmm3, xmm0	movapd xmm4, xmm1	movapd xmm5, xmm2	;# calc dr 	subsd xmm3, [rsp + nb203_ixH2]	subsd xmm4, [rsp + nb203_iyH2]	subsd xmm5, [rsp + nb203_izH2]	;# store dr 	movapd [rsp + nb203_dxH2], xmm3	movapd [rsp + nb203_dyH2], xmm4	movapd [rsp + nb203_dzH2], xmm5	;# square it 	
mulsd xmm3,xmm3	mulsd xmm4,xmm4	mulsd xmm5,xmm5	addsd xmm5, xmm4	addsd xmm5, xmm3	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqM in xmm7 		movapd xmm0, xmm5	movapd xmm1, xmm6	movapd xmm2, xmm7	mulsd  xmm0, [rsp + nb203_krf]		mulsd  xmm1, [rsp + nb203_krf]		mulsd  xmm2, [rsp + nb203_krf]		movapd [rsp + nb203_krsqH2], xmm0	movapd [rsp + nb203_krsqH1], xmm1	movapd [rsp + nb203_krsqM], xmm2		;# start with rsqM - put seed in xmm2 	cvtsd2ss xmm2, xmm7		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb203_three]	mulsd   xmm2, xmm7	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb203_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm7, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb203_three]	subsd xmm4, xmm7	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb203_half] ;# rinv 	movapd  xmm7, xmm4	;# rinvM in xmm7 		;# rsqH1 - seed in xmm2 	cvtsd2ss xmm2, xmm6		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb203_three]	mulsd   xmm2, xmm6	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb203_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm6, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb203_three]	subsd xmm4, xmm6	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb203_half] ;# rinv 	movapd  xmm6, xmm4	;# rinvH1 in xmm6 		;# rsqH2 - seed in xmm2 	cvtsd2ss xmm2, xmm5		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb203_three]	mulsd   xmm2, xmm5	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb203_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm5, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb203_three]	subsd xmm4, xmm5	
;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb203_half] ;# rinv 	movapd  xmm5, xmm4	;# rinvH2 in xmm5 	;# do M interactions 	movapd  xmm4, xmm7		mulsd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm3, xmm7	movapd  xmm0, [rsp + nb203_krsqM]	addsd   xmm7, xmm0	;# xmm6=rinv+ krsq 	mulsd   xmm0, [rsp + nb203_two]	subsd   xmm7, [rsp + nb203_crf]	subsd   xmm3, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm7, [rsp + nb203_qqM] ;# vcoul 	mulsd   xmm3, [rsp + nb203_qqM]	mulsd  xmm4, xmm3	;# total fsH1 in xmm4 		addsd  xmm7, [rsp + nb203_vctot]	movapd xmm0, [rsp + nb203_dxM]	movapd xmm1, [rsp + nb203_dyM]	movapd xmm2, [rsp + nb203_dzM]	movlpd [rsp + nb203_vctot], xmm7	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update M forces 	movapd xmm3, [rsp + nb203_fixM]	movapd xmm4, [rsp + nb203_fiyM]	movapd xmm7, [rsp + nb203_fizM]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb203_fixM], xmm3	movlpd [rsp + nb203_fiyM], xmm4	movlpd [rsp + nb203_fizM], xmm7	;# update j forces with water M 	movlpd [rsp + nb203_fjx], xmm0	movlpd [rsp + nb203_fjy], xmm1	movlpd [rsp + nb203_fjz], xmm2	;# H1 interactions 	movapd  xmm4, xmm6		mulsd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm7, xmm6	movapd  xmm0, [rsp + nb203_krsqH1]	addsd   xmm6, xmm0	;# xmm6=rinv+ krsq 	mulsd   xmm0, [rsp + nb203_two]	subsd   xmm6, [rsp + nb203_crf]	subsd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm6, [rsp + nb203_qqH] ;# vcoul 	mulsd   xmm7, [rsp + nb203_qqH]	mulsd  xmm4, xmm7		;# total fsH1 in xmm4 		addsd  xmm6, [rsp + nb203_vctot]	movapd xmm0, [rsp + nb203_dxH1]	movapd xmm1, [rsp + nb203_dyH1]	movapd xmm2, [rsp + nb203_dzH1]	movlpd [rsp + nb203_vctot], xmm6	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H1 forces 	movapd xmm3, [rsp + nb203_fixH1]	movapd xmm4, [rsp + nb203_fiyH1]	movapd xmm7, [rsp + nb203_fizH1]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb203_fixH1], xmm3	movlpd [rsp + nb203_fiyH1], xmm4	
movlpd [rsp + nb203_fizH1], xmm7	;# update j forces with water H1 	addsd  xmm0, [rsp + nb203_fjx]	addsd  xmm1, [rsp + nb203_fjy]	addsd  xmm2, [rsp + nb203_fjz]	movlpd [rsp + nb203_fjx], xmm0	movlpd [rsp + nb203_fjy], xmm1	movlpd [rsp + nb203_fjz], xmm2	;# H2 interactions 	movapd  xmm4, xmm5		mulsd   xmm4, xmm4	;# xmm5=rinv, xmm4=rinvsq 	movapd  xmm7, xmm5	movapd  xmm0, [rsp + nb203_krsqH2]	addsd   xmm5, xmm0	;# xmm5=rinv+ krsq 	mulsd   xmm0, [rsp + nb203_two]	subsd   xmm5, [rsp + nb203_crf]	subsd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm5, [rsp + nb203_qqH] ;# vcoul 	mulsd   xmm7, [rsp + nb203_qqH]	mulsd  xmm4, xmm7		;# total fsH2 in xmm4 		addsd  xmm5, [rsp + nb203_vctot]	movapd xmm0, [rsp + nb203_dxH2]	movapd xmm1, [rsp + nb203_dyH2]	movapd xmm2, [rsp + nb203_dzH2]	movlpd [rsp + nb203_vctot], xmm5	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H2 forces 

⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?