nb_kernel201_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,738 行 · 第 1/4 页

S
1,738
字号
	;# ---------------------------------------------------------------
	;# Tail of the nb201nf routine: inner j-loop, single-j remainder,
	;# per-i energy write-back and function epilogue.
	;# NOTE(review): the routine's entry, prologue and the labels
	;# .nb201nf_outer / .nb201nf_threadloop are defined before this
	;# point; the nb201nf_* symbols are offsets declared elsewhere.
	;#
	;# For every j atom the code accumulates, for the three i sites
	;# (O, H1, H2):  vctot += qq * (rinv + krf*rsq - crf)
	;# No forces are computed in this section.
	;# ---------------------------------------------------------------
	mov	[rsp + nb201nf_innerk], edx	;# number of innerloop atoms
	;# flags come from the instruction preceding the store (mov leaves them intact)
	jge	.nb201nf_unroll_loop
	jmp	.nb201nf_checksingle

.nb201nf_unroll_loop:
	;# twice unrolled innerloop here: two j atoms per pass,
	;# one in the low and one in the high half of each xmm register
	mov	rdx, [rsp + nb201nf_innerjjnr]	;# pointer to jjnr[k]
	mov	eax, [rdx]			;# first j index (zero-extends into rax)
	mov	ebx, [rdx + 4]			;# second j index
	add	qword ptr [rsp + nb201nf_innerjjnr], 8	;# advance pointer (unrolled 2)

	mov	rsi, [rbp + nb201nf_charge]	;# base of charge[]
	movlpd	xmm3, [rsi + rax*8]
	movhpd	xmm3, [rsi + rbx*8]
	movapd	xmm4, xmm3
	mulpd	xmm3, [rsp + nb201nf_iqO]	;# charge[j] * iqO
	mulpd	xmm4, [rsp + nb201nf_iqH]	;# charge[j] * iqH
	movapd	[rsp + nb201nf_qqO], xmm3
	movapd	[rsp + nb201nf_qqH], xmm4

	mov	rsi, [rbp + nb201nf_pos]	;# base of pos[]
	lea	rax, [rax + rax*2]		;# replace jnr with j3
	lea	rbx, [rbx + rbx*2]

	;# move two coordinates to xmm0-xmm2 (x, y, z)
	movlpd	xmm0, [rsi + rax*8]
	movlpd	xmm1, [rsi + rax*8 + 8]
	movlpd	xmm2, [rsi + rax*8 + 16]
	movhpd	xmm0, [rsi + rbx*8]
	movhpd	xmm1, [rsi + rbx*8 + 8]
	movhpd	xmm2, [rsi + rbx*8 + 16]

	;# move ixO-izO to xmm4-xmm6
	movapd	xmm4, [rsp + nb201nf_ixO]
	movapd	xmm5, [rsp + nb201nf_iyO]
	movapd	xmm6, [rsp + nb201nf_izO]
	;# calc dr
	subpd	xmm4, xmm0
	subpd	xmm5, xmm1
	subpd	xmm6, xmm2
	;# square it
	mulpd	xmm4, xmm4
	mulpd	xmm5, xmm5
	mulpd	xmm6, xmm6
	addpd	xmm4, xmm5
	addpd	xmm4, xmm6
	movapd	xmm7, xmm4			;# rsqO in xmm7

	;# move ixH1-izH1 to xmm4-xmm6
	movapd	xmm4, [rsp + nb201nf_ixH1]
	movapd	xmm5, [rsp + nb201nf_iyH1]
	movapd	xmm6, [rsp + nb201nf_izH1]
	;# calc dr
	subpd	xmm4, xmm0
	subpd	xmm5, xmm1
	subpd	xmm6, xmm2
	;# square it
	mulpd	xmm4, xmm4
	mulpd	xmm5, xmm5
	mulpd	xmm6, xmm6
	addpd	xmm6, xmm5
	addpd	xmm6, xmm4			;# rsqH1 in xmm6

	;# move ixH2-izH2 to xmm3-xmm5
	movapd	xmm3, [rsp + nb201nf_ixH2]
	movapd	xmm4, [rsp + nb201nf_iyH2]
	movapd	xmm5, [rsp + nb201nf_izH2]
	;# calc dr
	subpd	xmm3, xmm0
	subpd	xmm4, xmm1
	subpd	xmm5, xmm2
	;# square it
	mulpd	xmm3, xmm3
	mulpd	xmm4, xmm4
	mulpd	xmm5, xmm5
	addpd	xmm5, xmm4
	addpd	xmm5, xmm3
	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7

	;# krsq = krf * rsq for each site, parked on the stack for later
	movapd	xmm0, xmm5
	movapd	xmm1, xmm6
	movapd	xmm2, xmm7
	mulpd	xmm0, [rsp + nb201nf_krf]
	mulpd	xmm1, [rsp + nb201nf_krf]
	mulpd	xmm2, [rsp + nb201nf_krf]
	movapd	[rsp + nb201nf_krsqH2], xmm0
	movapd	[rsp + nb201nf_krsqH1], xmm1
	movapd	[rsp + nb201nf_krsqO], xmm2

	;# Inverse square roots: single-precision rsqrtps estimate widened
	;# to double, then two refinement steps lu = half*lu*(three - rsq*lu*lu).
	;# NOTE(review): nb201nf_three / nb201nf_half presumably hold 3.0 and 0.5.

	;# start with rsqO - put seed in xmm2
	cvtpd2ps xmm2, xmm7
	rsqrtps	xmm2, xmm2
	cvtps2pd xmm2, xmm2
	movapd	xmm3, xmm2
	mulpd	xmm2, xmm2
	movapd	xmm4, [rsp + nb201nf_three]
	mulpd	xmm2, xmm7			;# rsq*lu*lu
	subpd	xmm4, xmm2			;# 3-rsq*lu*lu
	mulpd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulpd	xmm4, [rsp + nb201nf_half]	;# iter1 (new lu)
	movapd	xmm3, xmm4
	mulpd	xmm4, xmm4			;# lu*lu
	mulpd	xmm7, xmm4			;# rsq*lu*lu
	movapd	xmm4, [rsp + nb201nf_three]
	subpd	xmm4, xmm7			;# 3-rsq*lu*lu
	mulpd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulpd	xmm4, [rsp + nb201nf_half]	;# rinv
	movapd	xmm7, xmm4			;# rinvO in xmm7

	;# rsqH1 - seed in xmm2
	cvtpd2ps xmm2, xmm6
	rsqrtps	xmm2, xmm2
	cvtps2pd xmm2, xmm2
	movapd	xmm3, xmm2
	mulpd	xmm2, xmm2
	movapd	xmm4, [rsp + nb201nf_three]
	mulpd	xmm2, xmm6			;# rsq*lu*lu
	subpd	xmm4, xmm2			;# 3-rsq*lu*lu
	mulpd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulpd	xmm4, [rsp + nb201nf_half]	;# iter1 (new lu)
	movapd	xmm3, xmm4
	mulpd	xmm4, xmm4			;# lu*lu
	mulpd	xmm6, xmm4			;# rsq*lu*lu
	movapd	xmm4, [rsp + nb201nf_three]
	subpd	xmm4, xmm6			;# 3-rsq*lu*lu
	mulpd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulpd	xmm4, [rsp + nb201nf_half]	;# rinv
	movapd	xmm6, xmm4			;# rinvH1 in xmm6

	;# rsqH2 - seed in xmm2
	cvtpd2ps xmm2, xmm5
	rsqrtps	xmm2, xmm2
	cvtps2pd xmm2, xmm2
	movapd	xmm3, xmm2
	mulpd	xmm2, xmm2
	movapd	xmm4, [rsp + nb201nf_three]
	mulpd	xmm2, xmm5			;# rsq*lu*lu
	subpd	xmm4, xmm2			;# 3-rsq*lu*lu
	mulpd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulpd	xmm4, [rsp + nb201nf_half]	;# iter1 (new lu)
	movapd	xmm3, xmm4
	mulpd	xmm4, xmm4			;# lu*lu
	mulpd	xmm5, xmm4			;# rsq*lu*lu
	movapd	xmm4, [rsp + nb201nf_three]
	subpd	xmm4, xmm5			;# 3-rsq*lu*lu
	mulpd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulpd	xmm4, [rsp + nb201nf_half]	;# rinv
	movapd	xmm5, xmm4			;# rinvH2 in xmm5

	;# do O interactions
	movapd	xmm0, [rsp + nb201nf_krsqO]
	addpd	xmm7, xmm0			;# xmm7=rinv+ krsq
	subpd	xmm7, [rsp + nb201nf_crf]
	mulpd	xmm7, [rsp + nb201nf_qqO]	;# vcoul
	addpd	xmm7, [rsp + nb201nf_vctot]	;# running total + O term

	;# H1 interactions
	movapd	xmm0, [rsp + nb201nf_krsqH1]
	addpd	xmm6, xmm0			;# xmm6=rinv+ krsq
	subpd	xmm6, [rsp + nb201nf_crf]
	mulpd	xmm6, [rsp + nb201nf_qqH]	;# vcoul
	addpd	xmm6, xmm7

	;# H2 interactions
	movapd	xmm0, [rsp + nb201nf_krsqH2]
	addpd	xmm5, xmm0			;# xmm5=rinv+ krsq
	subpd	xmm5, [rsp + nb201nf_crf]
	mulpd	xmm5, [rsp + nb201nf_qqH]	;# vcoul
	addpd	xmm5, xmm6
	movapd	[rsp + nb201nf_vctot], xmm5	;# store both lanes of the total

	;# should we do one more iteration?
	sub	dword ptr [rsp + nb201nf_innerk], 2
	jl	.nb201nf_checksingle
	jmp	.nb201nf_unroll_loop

.nb201nf_checksingle:
	;# an odd innerk means one j atom is still left to process
	mov	edx, [rsp + nb201nf_innerk]
	and	edx, 1
	jnz	.nb201nf_dosingle
	jmp	.nb201nf_updateouterdata

.nb201nf_dosingle:
	;# same computation as the unrolled loop for one j atom,
	;# using scalar (low-lane) arithmetic
	mov	rdx, [rsp + nb201nf_innerjjnr]	;# pointer to jjnr[k]
	mov	eax, [rdx]
	add	qword ptr [rsp + nb201nf_innerjjnr], 4

	mov	rsi, [rbp + nb201nf_charge]	;# base of charge[]
	xorpd	xmm3, xmm3			;# clear so the high lane is zero
	movlpd	xmm3, [rsi + rax*8]
	movapd	xmm4, xmm3
	mulpd	xmm3, [rsp + nb201nf_iqO]
	mulpd	xmm4, [rsp + nb201nf_iqH]
	movapd	[rsp + nb201nf_qqO], xmm3
	movapd	[rsp + nb201nf_qqH], xmm4

	mov	rsi, [rbp + nb201nf_pos]	;# base of pos[]
	lea	rax, [rax + rax*2]		;# replace jnr with j3

	;# move coordinates to xmm0-xmm2
	movlpd	xmm0, [rsi + rax*8]
	movlpd	xmm1, [rsi + rax*8 + 8]
	movlpd	xmm2, [rsi + rax*8 + 16]

	;# move ixO-izO to xmm4-xmm6
	movapd	xmm4, [rsp + nb201nf_ixO]
	movapd	xmm5, [rsp + nb201nf_iyO]
	movapd	xmm6, [rsp + nb201nf_izO]
	;# calc dr
	subsd	xmm4, xmm0
	subsd	xmm5, xmm1
	subsd	xmm6, xmm2
	;# square it
	mulsd	xmm4, xmm4
	mulsd	xmm5, xmm5
	mulsd	xmm6, xmm6
	addsd	xmm4, xmm5
	addsd	xmm4, xmm6
	movapd	xmm7, xmm4			;# rsqO in xmm7

	;# move ixH1-izH1 to xmm4-xmm6
	movapd	xmm4, [rsp + nb201nf_ixH1]
	movapd	xmm5, [rsp + nb201nf_iyH1]
	movapd	xmm6, [rsp + nb201nf_izH1]
	;# calc dr
	subsd	xmm4, xmm0
	subsd	xmm5, xmm1
	subsd	xmm6, xmm2
	;# square it
	mulsd	xmm4, xmm4
	mulsd	xmm5, xmm5
	mulsd	xmm6, xmm6
	addsd	xmm6, xmm5
	addsd	xmm6, xmm4			;# rsqH1 in xmm6

	;# move ixH2-izH2 to xmm3-xmm5
	movapd	xmm3, [rsp + nb201nf_ixH2]
	movapd	xmm4, [rsp + nb201nf_iyH2]
	movapd	xmm5, [rsp + nb201nf_izH2]
	;# calc dr
	subsd	xmm3, xmm0
	subsd	xmm4, xmm1
	subsd	xmm5, xmm2
	;# square it
	mulsd	xmm3, xmm3
	mulsd	xmm4, xmm4
	mulsd	xmm5, xmm5
	addsd	xmm5, xmm4
	addsd	xmm5, xmm3
	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7

	;# krsq = krf * rsq for each site
	movapd	xmm0, xmm5
	movapd	xmm1, xmm6
	movapd	xmm2, xmm7
	mulsd	xmm0, [rsp + nb201nf_krf]
	mulsd	xmm1, [rsp + nb201nf_krf]
	mulsd	xmm2, [rsp + nb201nf_krf]
	movapd	[rsp + nb201nf_krsqH2], xmm0
	movapd	[rsp + nb201nf_krsqH1], xmm1
	movapd	[rsp + nb201nf_krsqO], xmm2

	;# start with rsqO - put seed in xmm2
	cvtsd2ss xmm2, xmm7
	rsqrtss	xmm2, xmm2
	cvtss2sd xmm2, xmm2
	movapd	xmm3, xmm2
	mulsd	xmm2, xmm2
	movapd	xmm4, [rsp + nb201nf_three]
	mulsd	xmm2, xmm7			;# rsq*lu*lu
	subsd	xmm4, xmm2			;# 3-rsq*lu*lu
	mulsd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulsd	xmm4, [rsp + nb201nf_half]	;# iter1 (new lu)
	movapd	xmm3, xmm4
	mulsd	xmm4, xmm4			;# lu*lu
	mulsd	xmm7, xmm4			;# rsq*lu*lu
	movapd	xmm4, [rsp + nb201nf_three]
	subsd	xmm4, xmm7			;# 3-rsq*lu*lu
	mulsd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulsd	xmm4, [rsp + nb201nf_half]	;# rinv
	movapd	xmm7, xmm4			;# rinvO in xmm7

	;# rsqH1 - seed in xmm2
	cvtsd2ss xmm2, xmm6
	rsqrtss	xmm2, xmm2
	cvtss2sd xmm2, xmm2
	movapd	xmm3, xmm2
	mulsd	xmm2, xmm2
	movapd	xmm4, [rsp + nb201nf_three]
	mulsd	xmm2, xmm6			;# rsq*lu*lu
	subsd	xmm4, xmm2			;# 3-rsq*lu*lu
	mulsd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulsd	xmm4, [rsp + nb201nf_half]	;# iter1 (new lu)
	movapd	xmm3, xmm4
	mulsd	xmm4, xmm4			;# lu*lu
	mulsd	xmm6, xmm4			;# rsq*lu*lu
	movapd	xmm4, [rsp + nb201nf_three]
	subsd	xmm4, xmm6			;# 3-rsq*lu*lu
	mulsd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulsd	xmm4, [rsp + nb201nf_half]	;# rinv
	movapd	xmm6, xmm4			;# rinvH1 in xmm6

	;# rsqH2 - seed in xmm2
	cvtsd2ss xmm2, xmm5
	rsqrtss	xmm2, xmm2
	cvtss2sd xmm2, xmm2
	movapd	xmm3, xmm2
	mulsd	xmm2, xmm2
	movapd	xmm4, [rsp + nb201nf_three]
	mulsd	xmm2, xmm5			;# rsq*lu*lu
	subsd	xmm4, xmm2			;# 3-rsq*lu*lu
	mulsd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulsd	xmm4, [rsp + nb201nf_half]	;# iter1 (new lu)
	movapd	xmm3, xmm4
	mulsd	xmm4, xmm4			;# lu*lu
	mulsd	xmm5, xmm4			;# rsq*lu*lu
	movapd	xmm4, [rsp + nb201nf_three]
	subsd	xmm4, xmm5			;# 3-rsq*lu*lu
	mulsd	xmm4, xmm3			;# lu*(3-rsq*lu*lu)
	mulsd	xmm4, [rsp + nb201nf_half]	;# rinv
	movapd	xmm5, xmm4			;# rinvH2 in xmm5

	;# do O interactions
	movapd	xmm0, [rsp + nb201nf_krsqO]
	addsd	xmm7, xmm0			;# xmm7=rinv+ krsq
	subsd	xmm7, [rsp + nb201nf_crf]
	mulsd	xmm7, [rsp + nb201nf_qqO]	;# vcoul
	addsd	xmm7, [rsp + nb201nf_vctot]

	;# H1 interactions
	movapd	xmm0, [rsp + nb201nf_krsqH1]
	addsd	xmm6, xmm0			;# xmm6=rinv+ krsq
	subsd	xmm6, [rsp + nb201nf_crf]
	mulsd	xmm6, [rsp + nb201nf_qqH]	;# vcoul
	addsd	xmm6, xmm7

	;# H2 interactions
	movapd	xmm0, [rsp + nb201nf_krsqH2]
	addsd	xmm5, xmm0			;# xmm5=rinv+ krsq
	subsd	xmm5, [rsp + nb201nf_crf]
	mulsd	xmm5, [rsp + nb201nf_qqH]	;# vcoul
	addsd	xmm5, xmm6
	;# store only the low lane so the high lane of vctot,
	;# written by the unrolled loop, is left as it was
	movlpd	[rsp + nb201nf_vctot], xmm5

.nb201nf_updateouterdata:
	;# get n from stack
	mov	esi, [rsp + nb201nf_n]

	;# get group index for i particle
	mov	rdx, [rbp + nb201nf_gid]	;# base of gid[]
	mov	edx, [rdx + rsi*4]		;# ggid=gid[n]

	;# accumulate total potential energy and update it
	movapd	xmm7, [rsp + nb201nf_vctot]
	;# accumulate
	movhlps	xmm6, xmm7			;# high lane of vctot -> low lane of xmm6
	addsd	xmm7, xmm6			;# low xmm7 has the sum now

	;# add earlier value from mem
	mov	rax, [rbp + nb201nf_Vc]
	addsd	xmm7, [rax + rdx*8]
	;# move back to mem
	movsd	[rax + rdx*8], xmm7

	;# finish if last
	mov	ecx, [rsp + nb201nf_nn1]
	;# esi already loaded with n
	inc	esi
	sub	ecx, esi
	jz	.nb201nf_outerend

	;# not last, iterate outer loop once more!
	mov	[rsp + nb201nf_n], esi
	jmp	.nb201nf_outer

.nb201nf_outerend:
	;# check if more outer neighborlists remain
	mov	ecx, [rsp + nb201nf_nri]
	;# esi already loaded with n above
	sub	ecx, esi
	jz	.nb201nf_end
	;# non-zero, do one more workunit
	jmp	.nb201nf_threadloop

.nb201nf_end:
	;# report the nouter / ninner counters through the caller's pointers
	mov	eax, [rsp + nb201nf_nouter]
	mov	ebx, [rsp + nb201nf_ninner]
	mov	rcx, [rbp + nb201nf_outeriter]
	mov	rdx, [rbp + nb201nf_inneriter]
	mov	[rcx], eax
	mov	[rdx], ebx

	;# release the local frame and restore saved registers
	;# NOTE(review): must mirror the prologue, which is not visible here
	add	rsp, 440
	emms

	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbx
	pop	rbp
	ret

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?