nb_kernel301_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,119 行 · 第 1/4 页

S
2,119
字号
    ;# Load LOTS of table data    movlpd xmm0,  [rsi + r8*8]    movlpd xmm1,  [rsi + r8*8 + 8]    movlpd xmm2,  [rsi + r8*8 + 16]    movlpd xmm3,  [rsi + r8*8 + 24]    movlpd xmm4,  [rsi + r10*8]    movlpd xmm5,  [rsi + r10*8 + 8]    movlpd xmm6,  [rsi + r10*8 + 16]    movlpd xmm7,  [rsi + r10*8 + 24]    movlpd xmm8,  [rsi + r12*8]    movlpd xmm9,  [rsi + r12*8 + 8]    movlpd xmm10, [rsi + r12*8 + 16]    movlpd xmm11, [rsi + r12*8 + 24]    movhpd xmm0,  [rsi + r9*8]    movhpd xmm1,  [rsi + r9*8 + 8]    movhpd xmm2,  [rsi + r9*8 + 16]    movhpd xmm3,  [rsi + r9*8 + 24]    movhpd xmm4,  [rsi + r11*8]    movhpd xmm5,  [rsi + r11*8 + 8]    movhpd xmm6,  [rsi + r11*8 + 16]    movhpd xmm7,  [rsi + r11*8 + 24]    movhpd xmm8,  [rsi + r13*8]    movhpd xmm9,  [rsi + r13*8 + 8]    movhpd xmm10, [rsi + r13*8 + 16]    movhpd xmm11, [rsi + r13*8 + 24]    ;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11        movapd xmm12, [rsp + nb301_epsO]    movapd xmm13, [rsp + nb301_epsH1]    movapd xmm14, [rsp + nb301_epsH2]        mulpd  xmm3, xmm12   ;# Heps    mulpd  xmm7, xmm13    mulpd  xmm11, xmm14     mulpd  xmm2, xmm12   ;# Geps    mulpd  xmm6, xmm13    mulpd  xmm10, xmm14     mulpd  xmm3, xmm12   ;# Heps2    mulpd  xmm7, xmm13    mulpd  xmm11, xmm14     addpd  xmm1, xmm2   ;# F+Geps    addpd  xmm5, xmm6    addpd  xmm9, xmm10     addpd  xmm1, xmm3   ;# F+Geps+Heps2 = Fp    addpd  xmm5, xmm7    addpd  xmm9, xmm11     addpd  xmm3, xmm3    ;# 2*Heps2    addpd  xmm7, xmm7    addpd  xmm11, xmm11    addpd  xmm3, xmm2    ;# 2*Heps2+Geps    addpd  xmm7, xmm6      addpd  xmm11, xmm10    addpd  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps    addpd  xmm7, xmm5    addpd  xmm11, xmm9    mulpd  xmm1, xmm12   ;# eps*Fp    mulpd  xmm5, xmm13    mulpd  xmm9, xmm14    movapd xmm12, [rsp + nb301_qqO]    movapd xmm13, [rsp + nb301_qqH]    addpd  xmm1, xmm0     ;# VV    addpd  xmm5, xmm4    addpd  xmm9, xmm8    mulpd  xmm1, xmm12   ;# VV*qq = vcoul    mulpd  xmm5, xmm13    mulpd  xmm9, xmm13  
  mulpd  xmm3, xmm12    ;# FF*qq = fij    mulpd  xmm7, xmm13    mulpd  xmm11, xmm13        ;# accumulate vctot    addpd  xmm1, [rsp + nb301_vctot]    addpd  xmm5, xmm9    addpd  xmm1, xmm5    movapd [rsp + nb301_vctot], xmm1        movapd xmm10, [rsp + nb301_tsc]    mulpd  xmm3, xmm10  ;# fscal    mulpd  xmm7, xmm10    mulpd  xmm10, xmm11        xorpd  xmm4, xmm4    xorpd  xmm8, xmm8    xorpd  xmm11, xmm11        mulpd  xmm3, [rsp + nb301_rinvO]    mulpd  xmm7, [rsp + nb301_rinvH1]    mulpd  xmm10,  [rsp + nb301_rinvH2]        subpd  xmm4, xmm3    subpd  xmm8, xmm7    subpd  xmm11, xmm10        ;# move j forces to xmm0-xmm2	movlpd xmm0, [rdi + rax*8]	movlpd xmm1, [rdi + rax*8 + 8]	movlpd xmm2, [rdi + rax*8 + 16]	movhpd xmm0, [rdi + rbx*8]	movhpd xmm1, [rdi + rbx*8 + 8]	movhpd xmm2, [rdi + rbx*8 + 16]    movapd xmm3, xmm4    movapd xmm5, xmm4    movapd xmm7, xmm8    movapd xmm9, xmm8    movapd xmm10, xmm11    movapd xmm12, xmm11	mulpd xmm3, [rsp + nb301_dxO]	mulpd xmm4, [rsp + nb301_dyO]	mulpd xmm5, [rsp + nb301_dzO]	mulpd xmm7, [rsp + nb301_dxH1]	mulpd xmm8, [rsp + nb301_dyH1]	mulpd xmm9, [rsp + nb301_dzH1]	mulpd xmm10, [rsp + nb301_dxH2]	mulpd xmm11, [rsp + nb301_dyH2]	mulpd xmm12, [rsp + nb301_dzH2]    addpd xmm0, xmm3    addpd xmm1, xmm4    addpd xmm2, xmm5    addpd xmm3, [rsp + nb301_fixO]    addpd xmm4, [rsp + nb301_fiyO]    addpd xmm5, [rsp + nb301_fizO]    addpd xmm0, xmm7    addpd xmm1, xmm8    addpd xmm2, xmm9    addpd xmm7, [rsp + nb301_fixH1]    addpd xmm8, [rsp + nb301_fiyH1]    addpd xmm9, [rsp + nb301_fizH1]    addpd xmm0, xmm10    addpd xmm1, xmm11    addpd xmm2, xmm12    addpd xmm10, [rsp + nb301_fixH2]    addpd xmm11, [rsp + nb301_fiyH2]    addpd xmm12, [rsp + nb301_fizH2]    movapd [rsp + nb301_fixO], xmm3    movapd [rsp + nb301_fiyO], xmm4    movapd [rsp + nb301_fizO], xmm5    movapd [rsp + nb301_fixH1], xmm7    movapd [rsp + nb301_fiyH1], xmm8    movapd [rsp + nb301_fizH1], xmm9    movapd [rsp + nb301_fixH2], xmm10    movapd [rsp + nb301_fiyH2], 
xmm11    movapd [rsp + nb301_fizH2], xmm12       ;# store back j forces from xmm0-xmm2	movlpd [rdi + rax*8], xmm0	movlpd [rdi + rax*8 + 8], xmm1	movlpd [rdi + rax*8 + 16], xmm2	movhpd [rdi + rbx*8], xmm0	movhpd [rdi + rbx*8 + 8], xmm1	movhpd [rdi + rbx*8 + 16], xmm2	;# should we do one more iteration? 	sub dword ptr [rsp + nb301_innerk],  2	jl    .nb301_checksingle	jmp   .nb301_unroll_loop.nb301_checksingle:		mov   edx, [rsp + nb301_innerk]	and   edx, 1	jnz   .nb301_dosingle	jmp   .nb301_updateouterdata.nb301_dosingle:	mov   rdx, [rsp + nb301_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [rdx]		mov rsi, [rbp + nb301_charge]    ;# base of charge[] 	xorpd xmm3, xmm3	movlpd xmm3, [rsi + rax*8]	movapd xmm4, xmm3	     	mulpd  xmm3, [rsp + nb301_iqO]	mulpd  xmm4, [rsp + nb301_iqH]	movapd  [rsp + nb301_qqO], xmm3	movapd  [rsp + nb301_qqH], xmm4		mov rsi, [rbp + nb301_pos]       ;# base of pos[] 	lea   rax, [rax + rax*2]     ;# replace jnr with j3 	;# move coordinates to xmm0-xmm2 		movlpd xmm4, [rsi + rax*8]	movlpd xmm5, [rsi + rax*8 + 8]	movlpd xmm6, [rsi + rax*8 + 16]    movapd xmm0, xmm4    movapd xmm1, xmm5    movapd xmm2, xmm6	;# calc dr 	subsd xmm4, [rsp + nb301_ixO]	subsd xmm5, [rsp + nb301_iyO]	subsd xmm6, [rsp + nb301_izO]	;# store dr 	movapd [rsp + nb301_dxO], xmm4	movapd [rsp + nb301_dyO], xmm5	movapd [rsp + nb301_dzO], xmm6    	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm4, xmm5	addsd xmm4, xmm6	movapd xmm7, xmm4	;# rsqO in xmm7 	;# move j coords to xmm4-xmm6 	movapd xmm4, xmm0	movapd xmm5, xmm1	movapd xmm6, xmm2	;# calc dr 	subsd xmm4, [rsp + nb301_ixH1]	subsd xmm5, [rsp + nb301_iyH1]	subsd xmm6, [rsp + nb301_izH1]	;# store dr 	movapd [rsp + nb301_dxH1], xmm4	movapd [rsp + nb301_dyH1], xmm5	movapd [rsp + nb301_dzH1], xmm6	;# square it 	mulsd xmm4,xmm4	mulsd xmm5,xmm5	mulsd xmm6,xmm6	addsd xmm6, xmm5	addsd xmm6, xmm4	;# rsqH1 in xmm6 	;# move j coords to xmm3-xmm5	movapd xmm3, xmm0	movapd xmm4, xmm1	movapd xmm5, xmm2	;# calc dr 	
subsd xmm3, [rsp + nb301_ixH2]	subsd xmm4, [rsp + nb301_iyH2]	subsd xmm5, [rsp + nb301_izH2]	;# store dr 	movapd [rsp + nb301_dxH2], xmm3	movapd [rsp + nb301_dyH2], xmm4	movapd [rsp + nb301_dzH2], xmm5	;# square it 	mulsd xmm3,xmm3	mulsd xmm4,xmm4	mulsd xmm5,xmm5	addsd xmm5, xmm4	addsd xmm5, xmm3	;# rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7 	;# start with rsqO - put seed in xmm2 	cvtsd2ss xmm2, xmm7		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb301_three]	mulsd   xmm2, xmm7	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb301_half] ;# iter1 ( new lu) 	movapd xmm2, xmm7	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm2, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb301_three]	subsd xmm4, xmm2	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb301_half] ;# rinv 	movapd  [rsp + nb301_rinvO], xmm4	;# rinvO in xmm4 	mulsd   xmm7, xmm4	movapd  [rsp + nb301_rO], xmm7	;# r in xmm7 		;# rsqH1 - seed in xmm2 	cvtsd2ss xmm2, xmm6		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb301_three]	mulsd   xmm2, xmm6	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb301_half] ;# iter1 ( new lu) 	movapd xmm2, xmm6	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm2, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb301_three]	subsd xmm4, xmm2	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb301_half] ;# rinv 	movapd [rsp + nb301_rinvH1], xmm4	;# rinvH1 	mulsd  xmm6, xmm4	movapd [rsp + nb301_rH1], xmm6	;# rH1 		;# rsqH2 - seed in xmm2 	cvtsd2ss xmm2, xmm5		rsqrtss xmm2, xmm2	cvtss2sd xmm2, xmm2	movapd  xmm3, xmm2	mulsd   xmm2, xmm2	movapd  xmm4, [rsp + nb301_three]	mulsd   xmm2, xmm5	;# rsq*lu*lu 	subsd   xmm4, xmm2	;# 30-rsq*lu*lu 	mulsd   xmm4, xmm3	;# lu*(3-rsq*lu*lu) 	mulsd   xmm4, [rsp + nb301_half] ;# 
iter1 ( new lu) 	movapd xmm2, xmm5	movapd xmm3, xmm4	mulsd xmm4, xmm4	;# lu*lu 	mulsd xmm2, xmm4	;# rsq*lu*lu 	movapd xmm4, [rsp + nb301_three]	subsd xmm4, xmm2	;# 3-rsq*lu*lu 	mulsd xmm4, xmm3	;# lu*(	3-rsq*lu*lu) 	mulsd xmm4, [rsp + nb301_half] ;# rinv 	movapd [rsp + nb301_rinvH2], xmm4 ;# rinv 	mulsd xmm5, xmm4	movapd [rsp + nb301_rH2], xmm5 ;# r 	;# do O interactions 	;# rO is still in xmm7 	mulsd   xmm7, [rsp + nb301_tsc]	cvttsd2si r8d, xmm7	;# mm6 = lu idx 	cvtsi2sd xmm6, r8d	subsd xmm7, xmm6	movapd xmm1, xmm7	;# xmm1=eps 	movapd xmm2, xmm1		mulsd  xmm2, xmm2	;# xmm2=eps2 		shl r8d, 2		;# idx *= 4 	mov  rsi, [rbp + nb301_VFtab]	movapd xmm4, [rsi + r8*8]	;# Y1 F1 	xorpd xmm3, xmm3		movapd xmm5, xmm4	unpcklpd xmm4, xmm3	;# Y1 	unpckhpd xmm5, xmm3	;# F1  	movapd xmm6, [rsi + r8*8 + 16]	;# G1 H1 	xorpd xmm3, xmm3	movapd xmm7, xmm6	unpcklpd xmm6, xmm3	;# G1 	unpckhpd xmm7, xmm3	;# H1 	;# coulomb table ready, in xmm4-xmm7  			mulsd  xmm6, xmm1	;# xmm6=Geps 	mulsd  xmm7, xmm2	;# xmm7=Heps2 	addsd  xmm5, xmm6	addsd  xmm5, xmm7	;# xmm5=Fp 		mulsd  xmm7, [rsp + nb301_two]	;# two*Heps2 	movapd xmm3, [rsp + nb301_qqO]	addsd  xmm7, xmm6	addsd  xmm7, xmm5 ;# xmm7=FF 	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 	addsd  xmm5, xmm4 ;# xmm5=VV 	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  	mulsd  xmm3, xmm7 ;# fijC=FF*qq     ;# at this point mm5 contains vcoul and xmm3 fijC     ;# increment vcoul - then we can get rid of mm5     addsd  xmm5, [rsp + nb301_vctot]    movlpd [rsp + nb301_vctot], xmm5 	xorpd  xmm4, xmm4	mulsd  xmm3, [rsp + nb301_tsc]	mulsd  xmm3, [rsp + nb301_rinvO]		subsd  xmm4, xmm3	movapd xmm0, [rsp + nb301_dxO]	movapd xmm1, [rsp + nb301_dyO]	movapd xmm2, [rsp + nb301_dzO]	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# tx in xmm0-xmm2 	;# update O forces 	movapd xmm3, [rsp + nb301_fixO]	movapd xmm4, [rsp + nb301_fiyO]	movapd xmm7, [rsp + nb301_fizO]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb301_fixO], xmm3	movlpd [rsp + nb301_fiyO], xmm4	movlpd 
[rsp + nb301_fizO], xmm7	;# update j forces with water O 	movlpd [rsp + nb301_fjx], xmm0	movlpd [rsp + nb301_fjy], xmm1	movlpd [rsp + nb301_fjz], xmm2	;# Done with O interactions - now H1! 	movapd xmm7, [rsp + nb301_rH1]	mulsd xmm7, [rsp + nb301_tsc]	cvttsd2si r8d, xmm7	;# mm6 = lu idx 	cvtsi2sd xmm6, r8d	subsd xmm7, xmm6	movapd xmm1, xmm7	;# xmm1=eps 	movapd xmm2, xmm1		mulsd  xmm2, xmm2	;# xmm2=eps2 		shl r8d, 2		;# idx *= 4 	mov  rsi, [rbp + nb301_VFtab]		movapd xmm4, [rsi + r8*8]	;# Y1 F1 	xorpd xmm3, xmm3	movapd xmm5, xmm4	unpcklpd xmm4, xmm3	;# Y1  	unpckhpd xmm5, xmm3	;# F1  	movapd xmm6, [rsi + r8*8 + 16]	;# G1 H1 	xorpd xmm3, xmm3	movapd xmm7, xmm6	unpcklpd xmm6, xmm3	;# G1 	unpckhpd xmm7, xmm3	;# H1 	;# coulomb table ready, in xmm4-xmm7  			mulsd  xmm6, xmm1	;# xmm6=Geps 	mulsd  xmm7, xmm2	;# xmm7=Heps2 	addsd  xmm5, xmm6	addsd  xmm5, xmm7	;# xmm5=Fp 		mulsd  xmm7, [rsp + nb301_two]	;# two*Heps2 	movapd xmm3, [rsp + nb301_qqH]	addsd  xmm7, xmm6	addsd  xmm7, xmm5 ;# xmm7=FF 	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 	addsd  xmm5, xmm4 ;# xmm5=VV 	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  	mulsd  xmm3, xmm7 ;# fijC=FF*qq     ;# at this point mm5 contains vcoul and xmm3 fijC     ;# increment vcoul 	xorpd  xmm4, xmm4    addsd  xmm5, [rsp + nb301_vctot]	mulsd  xmm3, [rsp + nb301_rinvH1]    movlpd [rsp + nb301_vctot], xmm5 	mulsd  xmm3, [rsp + nb301_tsc]	subsd xmm4, xmm3	movapd xmm0, [rsp + nb301_dxH1]	movapd xmm1, [rsp + nb301_dyH1]	movapd xmm2, [rsp + nb301_dzH1]	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H1 forces 	movapd xmm3, [rsp + nb301_fixH1]	movapd xmm4, [rsp + nb301_fiyH1]	movapd xmm7, [rsp + nb301_fizH1]	addsd  xmm3, xmm0	addsd  xmm4, xmm1	addsd  xmm7, xmm2	movlpd [rsp + nb301_fixH1], xmm3	movlpd [rsp + nb301_fiyH1], xmm4	movlpd [rsp + nb301_fizH1], xmm7	;# update j forces with water H1 	addsd  xmm0, [rsp + nb301_fjx]	addsd  xmm1, [rsp + nb301_fjy]	addsd  xmm2, [rsp + nb301_fjz]	movlpd [rsp + nb301_fjx], xmm0	movlpd [rsp + nb301_fjy], xmm1	movlpd 
[rsp + nb301_fjz], xmm2	;# Done with H1, finally we do H2 interactions 	movapd xmm7, [rsp + nb301_rH2]	mulsd   xmm7, [rsp + nb301_tsc]	cvttsd2si r8d, xmm7	;# mm6 = lu idx 	cvtsi2sd xmm6, r8d	subsd xmm7, xmm6	movapd xmm1, xmm7	;# xmm1=eps 	movapd xmm2, xmm1		mulsd  xmm2, xmm2	;# xmm2=eps2 		shl r8d, 2		;# idx *= 4 	mov  rsi, [rbp + nb301_VFtab]	movapd xmm4, [rsi + r8*8]	;# Y1 F1 	xorpd xmm3, xmm3	movapd xmm5, xmm4	unpcklpd xmm4, xmm3	;# Y1 	unpckhpd xmm5, xmm3	;# F1 	movapd xmm6, [rsi + r8*8 + 16]	;# G1 H1 	xorpd xmm3, xmm3	movapd xmm7, xmm6	unpcklpd xmm6, xmm3	;# G1 	unpckhpd xmm7, xmm3	;# H1 	;# coulomb table ready, in xmm4-xmm7  			mulsd  xmm6, xmm1	;# xmm6=Geps 	mulsd  xmm7, xmm2	;# xmm7=Heps2 	addsd  xmm5, xmm6	addsd  xmm5, xmm7	;# xmm5=Fp 		mulsd  xmm7, [rsp + nb301_two]	;# two*Heps2 	movapd xmm3, [rsp + nb301_qqH]	addsd  xmm7, xmm6	addsd  xmm7, xmm5 ;# xmm7=FF 	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 	addsd  xmm5, xmm4 ;# xmm5=VV 	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  	mulsd  xmm3, xmm7 ;# fijC=FF*qq     ;# at this point mm5 contains vcoul and xmm3 fijC     ;# increment vcoul 	xorpd  xmm4, xmm4    addsd  xmm5, [rsp + nb301_vctot]	mulsd  xmm3, [rsp + nb301_rinvH2]    movlpd [rsp + nb301_vctot], xmm5 	mulsd  xmm3, [rsp + nb301_tsc]	subsd  xmm4, xmm3	movapd xmm0, [rsp + nb301_dxH2]	movapd xmm1, [rsp + nb301_dyH2]	movapd xmm2, [rsp + nb301_dzH2]	mulsd  xmm0, xmm4	mulsd  xmm1, xmm4	mulsd  xmm2, xmm4	;# update H2 forces 	movapd xmm3, [rsp + nb301_fixH2]	movapd xmm4, [rsp + nb301_fiyH2]	movapd xmm7, [rsp + nb301_fizH2]	addsd  xmm3, xmm0

⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?