nb_kernel201_ia32_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,790 行 · 第 1/4 页

S
1,790
字号
	mulpd   xmm2, xmm7	;# rsq*lu*lu 	subpd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulpd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulpd   xmm4, [esp + nb201_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulpd xmm4, xmm4	;# lu*lu 	mulpd xmm7, xmm4	;# rsq*lu*lu 	movapd xmm4, [esp + nb201_three]	subpd xmm4, xmm7	;# 3-rsq*lu*lu 	mulpd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulpd xmm4, [esp + nb201_half] ;# rinv 	movapd  xmm7, xmm4	;# rinvO in xmm7 		;# rsqH1 - seed in xmm2 	cvtpd2ps xmm2, xmm6		rsqrtps xmm2, xmm2	cvtps2pd xmm2, xmm2	movapd  xmm3, xmm2	mulpd   xmm2, xmm2	movapd  xmm4, [esp + nb201_three]	mulpd   xmm2, xmm6	;# rsq*lu*lu 	subpd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulpd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulpd   xmm4, [esp + nb201_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulpd xmm4, xmm4	;# lu*lu 	mulpd xmm6, xmm4	;# rsq*lu*lu 	movapd xmm4, [esp + nb201_three]	subpd xmm4, xmm6	;# 3-rsq*lu*lu 	mulpd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulpd xmm4, [esp + nb201_half] ;# rinv 	movapd  xmm6, xmm4	;# rinvH1 in xmm6 		;# rsqH2 - seed in xmm2 	cvtpd2ps xmm2, xmm5		rsqrtps xmm2, xmm2	cvtps2pd xmm2, xmm2	movapd  xmm3, xmm2	mulpd   xmm2, xmm2	movapd  xmm4, [esp + nb201_three]	mulpd   xmm2, xmm5	;# rsq*lu*lu 	subpd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulpd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulpd   xmm4, [esp + nb201_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulpd xmm4, xmm4	;# lu*lu 	mulpd xmm5, xmm4	;# rsq*lu*lu 	movapd xmm4, [esp + nb201_three]	subpd xmm4, xmm5	;# 3-rsq*lu*lu 	mulpd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulpd xmm4, [esp + nb201_half] ;# rinv 	movapd  xmm5, xmm4	;# rinvH2 in xmm5 	;# do O interactions 	movapd  xmm4, xmm7		mulpd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm3, xmm7	movapd  xmm0, [esp + nb201_krsqO]	addpd   xmm7, xmm0	;# xmm6=rinv+ krsq 	mulpd   xmm0, [esp + nb201_two]	subpd   xmm7, [esp + nb201_crf]	subpd   xmm3, xmm0	;# xmm7=rinv-2*krsq 	mulpd   xmm7, [esp + nb201_qqO] ;# vcoul 	mulpd   xmm3, [esp + nb201_qqO]	mulpd  xmm4, xmm3	;# total fsH1 in xmm4 		addpd  xmm7, [esp + nb201_vctot]	movapd xmm0, [esp + nb201_dxO]	movapd xmm1, [esp + nb201_dyO]	movapd xmm2, [esp + nb201_dzO]	movapd [esp + nb201_vctot], xmm7	mulpd  xmm0, xmm4	mulpd  xmm1, xmm4	mulpd  xmm2, xmm4		;# update O forces 	movapd xmm3, [esp + nb201_fixO]	movapd xmm4, [esp + nb201_fiyO]	movapd xmm7, [esp + nb201_fizO]	addpd  xmm3, xmm0	addpd  xmm4, xmm1	addpd  xmm7, xmm2	movapd [esp + nb201_fixO], xmm3	movapd [esp + nb201_fiyO], xmm4	movapd [esp + nb201_fizO], xmm7	;# update j forces with water O 	movapd [esp + nb201_fjx], xmm0	movapd [esp + nb201_fjy], xmm1	movapd [esp + nb201_fjz], xmm2	;# H1 interactions 	movapd  xmm4, xmm6		mulpd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm7, xmm6	movapd  xmm0, [esp + nb201_krsqH1]	addpd   xmm6, xmm0	;# xmm6=rinv+ krsq 	mulpd   xmm0, [esp + nb201_two]	subpd   xmm6, [esp + nb201_crf]	subpd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulpd   xmm6, [esp + nb201_qqH] ;# vcoul 	mulpd   xmm7, [esp + nb201_qqH]	mulpd  xmm4, xmm7		;# total fsH1 in xmm4 		addpd  xmm6, [esp + nb201_vctot]	movapd xmm0, [esp + nb201_dxH1]	movapd xmm1, [esp + nb201_dyH1]	movapd xmm2, [esp + nb201_dzH1]	movapd [esp + nb201_vctot], xmm6	mulpd  xmm0, xmm4	mulpd  xmm1, xmm4	mulpd  xmm2, xmm4	;# update H1 forces 	movapd xmm3, [esp + nb201_fixH1]	movapd xmm4, [esp + nb201_fiyH1]	movapd xmm7, [esp + nb201_fizH1]	addpd  xmm3, xmm0	addpd  xmm4, xmm1	addpd  xmm7, xmm2	movapd [esp + nb201_fixH1], xmm3	movapd [esp + nb201_fiyH1], xmm4	movapd [esp + nb201_fizH1], xmm7	;# update j forces with water H1 	addpd  xmm0, [esp + nb201_fjx]	addpd  xmm1, [esp + nb201_fjy]	addpd  xmm2, [esp + nb201_fjz]	movapd [esp + nb201_fjx], xmm0	movapd [esp + nb201_fjy], xmm1	movapd [esp + nb201_fjz], xmm2	;# H2 interactions 	movapd  xmm4, xmm5		mulpd   xmm4, xmm4	;# xmm5=rinv, xmm4=rinvsq 	movapd  xmm7, xmm5	movapd  xmm0, [esp + nb201_krsqH2]	addpd   xmm5, xmm0	;# xmm5=rinv+ krsq 	mulpd   xmm0, [esp + nb201_two]	subpd   xmm5, [esp + nb201_crf]	subpd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulpd   xmm5, [esp + nb201_qqH] ;# vcoul 	mulpd   xmm7, [esp + nb201_qqH]	mulpd  xmm4, xmm7		;# total fsH2 in xmm4 		addpd  xmm5, [esp + nb201_vctot]	movapd xmm0, [esp + nb201_dxH2]	movapd xmm1, [esp + nb201_dyH2]	movapd xmm2, [esp + nb201_dzH2]	movapd [esp + nb201_vctot], xmm5	mulpd  xmm0, xmm4	mulpd  xmm1, xmm4	mulpd  xmm2, xmm4	;# update H2 forces 	movapd xmm3, [esp + nb201_fixH2]	movapd xmm4, [esp + nb201_fiyH2]	movapd xmm7, [esp + nb201_fizH2]	addpd  xmm3, xmm0	addpd  xmm4, xmm1	addpd  xmm7, xmm2	movapd [esp + nb201_fixH2], xmm3	movapd [esp + nb201_fiyH2], xmm4	movapd [esp + nb201_fizH2], xmm7	mov edi, [ebp + nb201_faction]	;# update j forces 	addpd  xmm0, [esp + nb201_fjx]	addpd  xmm1, [esp + nb201_fjy]	addpd  xmm2, [esp + nb201_fjz]	movlpd xmm3, [edi + eax*8]	movlpd xmm4, [edi + eax*8 + 8]	movlpd xmm5, [edi + eax*8 + 16]	movhpd xmm3, [edi + ebx*8]	movhpd xmm4, [edi + ebx*8 + 8]	movhpd xmm5, [edi + ebx*8 + 16]	subpd xmm3, xmm0	subpd xmm4, xmm1	subpd xmm5, xmm2	movlpd [edi + eax*8], xmm3	movlpd [edi + eax*8 + 8], xmm4	movlpd [edi + eax*8 + 16], xmm5	movhpd [edi + ebx*8], xmm3	movhpd [edi + ebx*8 + 8], xmm4	movhpd [edi + ebx*8 + 16], xmm5					;# should we do one more iteration? 	sub dword ptr [esp + nb201_innerk],  2	jl    .nb201_checksingle	jmp   .nb201_unroll_loop.nb201_checksingle:		mov   edx, [esp + nb201_innerk]	and   edx, 1	jnz   .nb201_dosingle	jmp   .nb201_updateouterdata.nb201_dosingle:	mov   edx, [esp + nb201_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [edx]		add dword ptr [esp + nb201_innerjjnr],  4		mov esi, [ebp + nb201_charge]    ;# base of charge[] 	xorpd xmm3, xmm3	movlpd xmm3, [esi + eax*8]	movapd xmm4, xmm3	mulpd  xmm3, [esp + nb201_iqO]	mulpd  xmm4, [esp + nb201_iqH]	movapd  [esp + nb201_qqO], xmm3	movapd  [esp + nb201_qqH], xmm4		mov esi, [ebp + nb201_pos]       ;# base of pos[] 	lea   eax, [eax + eax*2]     ;# replace jnr with j3 	;# move coordinates to xmm0-xmm2 	movlpd xmm0, [esi + eax*8]	movlpd xmm1, [esi + eax*8 + 8]	movlpd xmm2, [esi + eax*8 + 16]	;# move ixO-izO to xmm4-xmm6 	movapd xmm4, [esp + nb201_ixO]	movapd xmm5, [esp + nb201_iyO]	movapd xmm6, [esp + nb201_izO]	;# calc dr 	subsd xmm4, xmm0	subsd xmm5, xmm1	subsd xmm6, xmm2	;# store dr 	movapd [esp + nb201_dxO], xmm4	movapd [esp + nb201_dyO], xmm5	movapd [esp + nb201_dzO], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm4, xmm5	addsd xmm4, xmm6	movapd xmm7, xmm4	;# rsqO in xmm7 	;# move ixH1-izH1 to xmm4-xmm6 	movapd xmm4, [esp + nb201_ixH1]	movapd xmm5, [esp + nb201_iyH1]	movapd xmm6, [esp + nb201_izH1]	;# calc dr 	subsd xmm4, xmm0	subsd xmm5, xmm1	subsd xmm6, xmm2	;# store dr 	movapd [esp + nb201_dxH1], xmm4	movapd [esp + nb201_dyH1], xmm5	movapd [esp + nb201_dzH1], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm6, xmm5	addsd xmm6, xmm4	;# rsqH1 in xmm6 	;# move ixH2-izH2 to xmm3-xmm5  	movapd xmm3, [esp + nb201_ixH2]	movapd xmm4, [esp + nb201_iyH2]	movapd xmm5, [esp + nb201_izH2]	;# calc dr 	subsd xmm3, xmm0	subsd xmm4, xmm1	subsd xmm5, xmm2	;# store dr 	movapd [esp + nb201_dxH2], xmm3	movapd [esp + nb201_dyH2], xmm4	movapd [esp + nb201_dzH2], xmm5	;# square it 	mulsd xmm3,xmm3	mulsd xmm4,xmm4	mulsd xmm5,xmm5	addsd xmm5, xmm4	addsd xmm5, xmm3	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 		movapd xmm0, xmm5	movapd xmm1, xmm6	movapd xmm2, xmm7	mulsd  xmm0, [esp + nb201_krf]		mulsd  xmm1, [esp + nb201_krf]		mulsd  xmm2, [esp + nb201_krf]		movapd [esp + nb201_krsqH2], xmm0	movapd [esp + nb201_krsqH1], xmm1	movapd [esp + nb201_krsqO], xmm2		;# start with rsqO - put seed in xmm2 	cvtsd2ss xmm2, xmm7		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [esp + nb201_three]	mulsd   xmm2, xmm7	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [esp + nb201_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm7, xmm4	;# rsq*lu*lu 	movapd xmm4, [esp + nb201_three]	subsd xmm4, xmm7	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [esp + nb201_half] ;# rinv 	movapd  xmm7, xmm4	;# rinvO in xmm7 		;# rsqH1 - seed in xmm2 	cvtsd2ss xmm2, xmm6		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [esp + nb201_three]	mulsd   xmm2, xmm6	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [esp + nb201_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm6, xmm4	;# rsq*lu*lu 	movapd xmm4, [esp + nb201_three]	subsd xmm4, xmm6	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [esp + nb201_half] ;# rinv 	movapd  xmm6, xmm4	;# rinvH1 in xmm6 		;# rsqH2 - seed in xmm2 	cvtsd2ss xmm2, xmm5		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [esp + nb201_three]	mulsd   xmm2, xmm5	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [esp + nb201_half] ;# iter1 ( new lu) 	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm5, xmm4	;# rsq*lu*lu 	movapd xmm4, [esp + nb201_three]	subsd xmm4, xmm5	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [esp + nb201_half] ;# rinv 	movapd  xmm5, xmm4	;# rinvH2 in xmm5 	;# do O interactions 	movapd  xmm4, xmm7		mulsd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm3, xmm7	movapd  xmm0, [esp + nb201_krsqO]	addsd   xmm7, xmm0	;# xmm6=rinv+ krsq 	mulsd   xmm0, [esp + nb201_two]	subsd   xmm7, [esp + nb201_crf]	subsd   xmm3, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm7, [esp + nb201_qqO] ;# vcoul 	mulsd   xmm3, [esp + nb201_qqO]	mulsd  xmm4, xmm3	;# total fsH1 in xmm4 		addsd  xmm7, [esp + nb201_vctot]	movapd xmm0, [esp + nb201_dxO]	movapd xmm1, [esp + nb201_dyO]	movapd xmm2, [esp + nb201_dzO]	movlpd [esp + nb201_vctot], xmm7	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update O forces 	movapd xmm3, [esp + nb201_fixO]	movapd xmm4, [esp + nb201_fiyO]	movapd xmm7, [esp + nb201_fizO]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [esp + nb201_fixO], xmm3	movlpd [esp + nb201_fiyO], xmm4	movlpd [esp + nb201_fizO], xmm7	;# update j forces with water O 	movlpd [esp + nb201_fjx], xmm0	movlpd [esp + nb201_fjy], xmm1	movlpd [esp + nb201_fjz], xmm2	;# H1 interactions 	movapd  xmm4, xmm6		mulsd   xmm4, xmm4	;# xmm6=rinv, xmm4=rinvsq 	movapd  xmm7, xmm6	movapd  xmm0, [esp + nb201_krsqH1]	addsd   xmm6, xmm0	;# xmm6=rinv+ krsq 	mulsd   xmm0, [esp + nb201_two]	subsd   xmm6, [esp + nb201_crf]	subsd   xmm7, xmm0	;# xmm7=rinv-2*krsq 	mulsd   xmm6, [esp + nb201_qqH] ;# vcoul 	mulsd   xmm7, [esp + nb201_qqH]	mulsd  xmm4, xmm7		;# total fsH1 in xmm4 		addsd  xmm6, [esp + nb201_vctot]	movapd xmm0, [esp + nb201_dxH1]	movapd xmm1, [esp + nb201_dyH1]	movapd xmm2, [esp + nb201_dzH1]	movlpd [esp + nb201_vctot], xmm6	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H1 forces 	movapd xmm3, [esp + nb201_fixH1]	movapd xmm4, [esp + nb201_fiyH1]	movapd xmm7, [esp + nb201_fizH1]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [esp + nb201_fixH1], xmm3	movlpd [esp + nb201_fiyH1], xmm4	movlpd [esp + nb201_fizH1], xmm7	;# update j forces with water H1 	addsd  xmm0, [esp + nb201_fjx]	addsd  xmm1, [esp + nb201_fjy]	addsd  xmm2, [esp + nb201_fjz]	movlpd [esp + nb201_fjx], xmm0	movlpd [esp + nb201_fjy], xmm1	movlpd [esp + nb201_fjz], xmm2	;# H2 interactions 

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?