nb_kernel331_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,353 行 · 第 1/5 页

S
2,353
字号
	;# ------------------------------------------------------------------
	;# Fragment of the nb331 inner loop. This view starts in the middle of
	;# the packed (two j atoms per pass) loop body and ends in the middle
	;# of the scalar (one j atom) tail, so the first and last refinement
	;# steps below are incomplete here.
	;#
	;# Conventions visible in this fragment:
	;#   [rbp + nb331_*]  array base pointers (VFtab, faction, charge,
	;#                    type, vdwparam, pos)
	;#   [rsp + nb331_*]  local scratch values
	;#   packed path:     xmm0/xmm3/xmm6 = rsq for the O/H1/H2 pairs,
	;#                    xmm9/xmm10/xmm11 = running 1/sqrt(rsq) estimate
	;#   rax, rbx         set before this fragment (not visible); used
	;#                    here as element offsets of the two j atoms'
	;#                    x/y/z triples in faction[]
	;# NOTE(review): nb331_three / nb331_half presumably hold 3.0 and 0.5
	;# in both lanes -- confirm where they are initialised.
	;# ------------------------------------------------------------------

	;# tail of the first refinement: lu = half * lu * (three - rsq*lu*lu)
	mulpd   xmm10, xmm5
	mulpd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)

	movapd  xmm15, [rsp + nb331_half]
	mulpd   xmm9, xmm15  ;# first iteration for rinvO
	mulpd   xmm10, xmm15 ;# first iteration for rinvH1
	mulpd   xmm11, xmm15 ;# first iteration for rinvH2

	;# second iteration step
	;# same formula again, starting from the refined estimates
	movapd  xmm2, xmm9
	movapd  xmm5, xmm10
	movapd  xmm8, xmm11

	mulpd   xmm2, xmm2 ;# lu*lu
	mulpd   xmm5, xmm5 ;# lu*lu
	mulpd   xmm8, xmm8 ;# lu*lu

	movapd  xmm1, [rsp + nb331_three]
	movapd  xmm4, xmm1
	movapd  xmm7, xmm1

	mulpd   xmm2, xmm0 ;# rsq*lu*lu
	mulpd   xmm5, xmm3 ;# rsq*lu*lu
	mulpd   xmm8, xmm6 ;# rsq*lu*lu

	subpd   xmm1, xmm2
	subpd   xmm4, xmm5
	subpd   xmm7, xmm8 ;# 3-rsq*lu*lu

	mulpd   xmm9, xmm1
	mulpd   xmm10, xmm4
	mulpd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)

	movapd  xmm15, [rsp + nb331_half]
	mulpd   xmm9, xmm15  ;#  rinvO
	mulpd   xmm10, xmm15 ;#   rinvH1
	mulpd   xmm11, xmm15 ;#   rinvH2

	;# keep the three inverse distances; they are re-read from the stack
	;# when the scalar force is turned into a vector force further down
	movapd  [rsp + nb331_rinvO], xmm9
	movapd  [rsp + nb331_rinvH1], xmm10
	movapd  [rsp + nb331_rinvH2], xmm11

	;# interactions
	;# rsq in xmm0,xmm3,xmm6
	;# rinv in xmm9, xmm10, xmm11
	;# NOTE(review): the next store repeats the nb331_rinvO store just
	;# above with an unchanged xmm9; it looks redundant.
	movapd [rsp + nb331_rinvO], xmm9
	movapd xmm1, [rsp + nb331_tsc]

	mulpd  xmm0, xmm9  ;# r
	mulpd  xmm3, xmm10
	mulpd  xmm6, xmm11
	mulpd  xmm0, xmm1 ;# rtab
	mulpd  xmm3, xmm1
	mulpd  xmm6, xmm1

	;# truncate and convert to integers
	cvttpd2dq xmm1, xmm0
	cvttpd2dq xmm4, xmm3
	cvttpd2dq xmm7, xmm6

	;# convert the truncated integers back to double
	cvtdq2pd  xmm2, xmm1
	cvtdq2pd  xmm5, xmm4
	cvtdq2pd  xmm8, xmm7

	;# multiply by 4
	pslld   xmm1, 2
	pslld   xmm4, 2
	pslld   xmm7, 2

	;# multiply by three (copy, mult. by two, add back)
	;# net effect: index*12; with the *8 scale used in the loads below
	;# each table point is 12 doubles (96 bytes) wide
	movapd  xmm10, xmm1
	movapd  xmm11, xmm4
	movapd  xmm12, xmm7
	pslld   xmm1, 1
	pslld   xmm4, 1
	pslld   xmm7, 1
	paddd   xmm1, xmm10
	paddd   xmm4, xmm11
	paddd   xmm7, xmm12

	;# move to integer registers
	;# pshufd with selector 1 brings dword 1 (second j atom) down to
	;# dword 0 so movd can read it:
	;#   r8/r9 = O indices, r10/r11 = H1 indices, r12/r13 = H2 indices
	pshufd xmm13, xmm1, 1
	pshufd xmm14, xmm4, 1
	pshufd xmm15, xmm7, 1
	movd    r8d, xmm1
	movd    r10d, xmm4
	movd    r12d, xmm7
	movd    r9d, xmm13
	movd    r11d, xmm14
	movd    r13d, xmm15

	mov  rsi, [rbp + nb331_VFtab]

	;# calculate eps
	;# eps = rtab - trunc(rtab), the fractional position in the table
	subpd     xmm0, xmm2
	subpd     xmm3, xmm5
	subpd     xmm6, xmm8

	;# park eps in xmm12-xmm14 (O, H1, H2); xmm0-xmm11 are about to be
	;# overwritten with table data
	movapd    xmm12, xmm0
	movapd    xmm13, xmm3
	movapd    xmm14, xmm6

	;# Load LOTS of table data
	;# four consecutive doubles per interaction (offsets 0, 8, 16, 24);
	;# low lane from the first j atom, high lane from the second
	movlpd xmm0,  [rsi + r8*8]
	movlpd xmm1,  [rsi + r8*8 + 8]
	movlpd xmm2,  [rsi + r8*8 + 16]
	movlpd xmm3,  [rsi + r8*8 + 24]
	movlpd xmm4,  [rsi + r10*8]
	movlpd xmm5,  [rsi + r10*8 + 8]
	movlpd xmm6,  [rsi + r10*8 + 16]
	movlpd xmm7,  [rsi + r10*8 + 24]
	movlpd xmm8,  [rsi + r12*8]
	movlpd xmm9,  [rsi + r12*8 + 8]
	movlpd xmm10, [rsi + r12*8 + 16]
	movlpd xmm11, [rsi + r12*8 + 24]
	movhpd xmm0,  [rsi + r9*8]
	movhpd xmm1,  [rsi + r9*8 + 8]
	movhpd xmm2,  [rsi + r9*8 + 16]
	movhpd xmm3,  [rsi + r9*8 + 24]
	movhpd xmm4,  [rsi + r11*8]
	movhpd xmm5,  [rsi + r11*8 + 8]
	movhpd xmm6,  [rsi + r11*8 + 16]
	movhpd xmm7,  [rsi + r11*8 + 24]
	movhpd xmm8,  [rsi + r13*8]
	movhpd xmm9,  [rsi + r13*8 + 8]
	movhpd xmm10, [rsi + r13*8 + 16]
	movhpd xmm11, [rsi + r13*8 + 24]
	;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

	;# Per interaction, with the four table entries called Y, F, G, H
	;# (offsets 0, 8, 16, 24):
	;#   Fp = F + G*eps + H*eps^2
	;#   VV = Y + eps*Fp
	;#   FF = Fp + G*eps + 2*H*eps^2
	mulpd  xmm3, xmm12   ;# Heps
	mulpd  xmm7, xmm13
	mulpd  xmm11, xmm14
	mulpd  xmm2, xmm12  ;# Geps
	mulpd  xmm6, xmm13
	mulpd  xmm10, xmm14
	mulpd  xmm3, xmm12   ;# Heps2
	mulpd  xmm7, xmm13
	mulpd  xmm11, xmm14

	addpd  xmm1, xmm2   ;# F+Geps
	addpd  xmm5, xmm6
	addpd  xmm9, xmm10
	addpd  xmm1, xmm3   ;# F+Geps+Heps2 = Fp
	addpd  xmm5, xmm7
	addpd  xmm9, xmm11
	addpd  xmm3, xmm3    ;# 2*Heps2
	addpd  xmm7, xmm7
	addpd  xmm11, xmm11
	addpd  xmm3, xmm2    ;# 2*Heps2+Geps
	addpd  xmm7, xmm6
	addpd  xmm11, xmm10
	addpd  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps
	addpd  xmm7, xmm5
	addpd  xmm11, xmm9
	mulpd  xmm1, xmm12   ;# eps*Fp
	mulpd  xmm5, xmm13
	mulpd  xmm9, xmm14
	addpd  xmm1, xmm0     ;# VV
	addpd  xmm5, xmm4
	addpd  xmm9, xmm8
	mulpd  xmm1, [rsp + nb331_qqO]   ;# VV*qq = vcoul
	mulpd  xmm5, [rsp + nb331_qqH]
	mulpd  xmm9, [rsp + nb331_qqH]
	mulpd  xmm3, [rsp + nb331_qqO]    ;# FF*qq = fij
	mulpd  xmm7, [rsp + nb331_qqH]
	mulpd  xmm11, [rsp + nb331_qqH]

	;# accumulate vctot
	;# vctot += vcoul(O) + vcoul(H1) + vcoul(H2)
	addpd  xmm1, [rsp + nb331_vctot]
	addpd  xmm5, xmm9
	addpd  xmm1, xmm5
	movapd [rsp + nb331_vctot], xmm1

	;# move the H1/H2 results out of xmm7/xmm11, which the second table
	;# load below overwrites
	movapd xmm2, xmm7
	movapd xmm1, xmm11

	;# fij coul in xmm3, xmm2, xmm1

	;# calculate LJ table
	;# only the O indices (r8, r9) are used here; entries at offsets
	;# 32-56 and 64-88 of the same table point
	movlpd xmm4,  [rsi + r8*8 + 32]
	movlpd xmm5,  [rsi + r8*8 + 40]
	movlpd xmm6,  [rsi + r8*8 + 48]
	movlpd xmm7,  [rsi + r8*8 + 56]
	movlpd xmm8,  [rsi + r8*8 + 64]
	movlpd xmm9,  [rsi + r8*8 + 72]
	movlpd xmm10, [rsi + r8*8 + 80]
	movlpd xmm11, [rsi + r8*8 + 88]
	movhpd xmm4,  [rsi + r9*8 + 32]
	movhpd xmm5,  [rsi + r9*8 + 40]
	movhpd xmm6,  [rsi + r9*8 + 48]
	movhpd xmm7,  [rsi + r9*8 + 56]
	movhpd xmm8,  [rsi + r9*8 + 64]
	movhpd xmm9,  [rsi + r9*8 + 72]
	movhpd xmm10, [rsi + r9*8 + 80]
	movhpd xmm11, [rsi + r9*8 + 88]
	;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

	;# xmm12 = epsO

	;# same VV / FF evaluation as above, done for both tables at once
	mulpd  xmm7, xmm12    ;# Heps
	mulpd  xmm11, xmm12
	mulpd  xmm6, xmm12   ;# Geps
	mulpd  xmm10, xmm12
	mulpd  xmm7, xmm12   ;# Heps2
	mulpd  xmm11, xmm12
	addpd  xmm5, xmm6  ;# F+Geps
	addpd  xmm9, xmm10
	addpd  xmm5, xmm7   ;# F+Geps+Heps2 = Fp
	addpd  xmm9, xmm11
	addpd  xmm7, xmm7    ;# 2*Heps2
	addpd  xmm11, xmm11
	addpd  xmm7, xmm6   ;# 2*Heps2+Geps
	addpd  xmm11, xmm10

	addpd  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps
	addpd  xmm11, xmm9
	mulpd  xmm5, xmm12  ;# eps*Fp
	mulpd  xmm9, xmm12
	;# eps is no longer needed: xmm12/xmm13 are reused for c6/c12
	movapd xmm12, [rsp + nb331_c6]
	movapd xmm13, [rsp + nb331_c12]
	addpd  xmm5, xmm4 ;# VV
	addpd  xmm9, xmm8

	mulpd  xmm5, xmm12  ;# VV*c6 = vnb6
	mulpd  xmm9, xmm13  ;# VV*c12 = vnb12
	addpd  xmm5, xmm9
	addpd  xmm5, [rsp + nb331_Vvdwtot]
	movapd [rsp + nb331_Vvdwtot], xmm5

	mulpd  xmm7, xmm12   ;# FF*c6 = fnb6
	mulpd  xmm11, xmm13   ;# FF*c12  = fnb12
	addpd  xmm7, xmm11

	;# O pair: add the LJ table force to the coulomb table force
	addpd  xmm3, xmm7
	movapd xmm10, [rsp + nb331_tsc]

	mulpd  xmm3, xmm10  ;# fscal
	mulpd  xmm2, xmm10
	mulpd  xmm1, xmm10

	;# move j forces to xmm11-xmm13
	;# xmm11/12/13 = x/y/z; low lane = atom at rax, high lane = atom at rbx
	mov rdi, [rbp + nb331_faction]
	movlpd xmm11, [rdi + rax*8]
	movlpd xmm12, [rdi + rax*8 + 8]
	movlpd xmm13, [rdi + rax*8 + 16]
	movhpd xmm11, [rdi + rbx*8]
	movhpd xmm12, [rdi + rbx*8 + 8]
	movhpd xmm13, [rdi + rbx*8 + 16]

	;# negate the three scaled table forces (0 - f) ...
	xorpd  xmm0, xmm0
	xorpd  xmm4, xmm4
	xorpd  xmm8, xmm8

	subpd  xmm0, xmm3
	subpd  xmm4, xmm2
	subpd  xmm8, xmm1

	;# ... and multiply by rinv, giving the factor applied to dx/dy/dz
	mulpd  xmm0, [rsp + nb331_rinvO]
	mulpd  xmm4, [rsp + nb331_rinvH1]
	mulpd  xmm8, [rsp + nb331_rinvH2]

	;# replicate each factor into three registers:
	;# xmm0-2 = O, xmm3-5 = H1, xmm6-8 = H2 (x, y, z after the multiplies)
	movapd xmm1, xmm0
	movapd xmm2, xmm0
	movapd xmm3, xmm4
	movapd xmm5, xmm4
	movapd xmm6, xmm8
	movapd xmm7, xmm8

	mulpd xmm0, [rsp + nb331_dxO]
	mulpd xmm1, [rsp + nb331_dyO]
	mulpd xmm2, [rsp + nb331_dzO]
	mulpd xmm3, [rsp + nb331_dxH1]
	mulpd xmm4, [rsp + nb331_dyH1]
	mulpd xmm5, [rsp + nb331_dzH1]
	mulpd xmm6, [rsp + nb331_dxH2]
	mulpd xmm7, [rsp + nb331_dyH2]
	mulpd xmm8, [rsp + nb331_dzH2]

	;# each component is added to the j-force accumulators first, then
	;# the same register is reused to form the updated i-force sum
	addpd xmm11,  xmm0
	addpd xmm12, xmm1
	addpd xmm13, xmm2
	addpd xmm0, [rsp + nb331_fixO]
	addpd xmm1, [rsp + nb331_fiyO]
	addpd xmm2, [rsp + nb331_fizO]

	addpd xmm11,  xmm3
	addpd xmm12, xmm4
	addpd xmm13, xmm5
	addpd xmm3, [rsp + nb331_fixH1]
	addpd xmm4, [rsp + nb331_fiyH1]
	addpd xmm5, [rsp + nb331_fizH1]

	addpd xmm11,  xmm6
	addpd xmm12, xmm7
	addpd xmm13, xmm8
	addpd xmm6, [rsp + nb331_fixH2]
	addpd xmm7, [rsp + nb331_fiyH2]
	addpd xmm8, [rsp + nb331_fizH2]

	;# write the updated i-force accumulators back to the stack
	movapd [rsp + nb331_fixO], xmm0
	movapd [rsp + nb331_fiyO], xmm1
	movapd [rsp + nb331_fizO], xmm2
	movapd [rsp + nb331_fixH1], xmm3
	movapd [rsp + nb331_fiyH1], xmm4
	movapd [rsp + nb331_fizH1], xmm5
	movapd [rsp + nb331_fixH2], xmm6
	movapd [rsp + nb331_fiyH2], xmm7
	movapd [rsp + nb331_fizH2], xmm8

	;# store back j forces from xmm11-xmm13
	movlpd [rdi + rax*8],      xmm11
	movlpd [rdi + rax*8 + 8],  xmm12
	movlpd [rdi + rax*8 + 16], xmm13
	movhpd [rdi + rbx*8],      xmm11
	movhpd [rdi + rbx*8 + 8],  xmm12
	movhpd [rdi + rbx*8 + 16], xmm13

	;# should we do one more iteration?
	;# innerk -= 2; leave the packed loop once the result is negative.
	;# .nb331_unroll_loop is defined before this fragment (not visible).
	sub dword ptr [rsp + nb331_innerk],  2
	jl    .nb331_checksingle
	jmp   .nb331_unroll_loop
.nb331_checksingle:
	;# bit 0 of the (already decremented) counter selects between one
	;# more scalar pass and going straight to .nb331_updateouterdata
	mov   edx, [rsp + nb331_innerk]
	and   edx, 1
	jnz   .nb331_dosingle
	jmp   .nb331_updateouterdata
.nb331_dosingle:
	;# Scalar version of the loop body for a single j atom: the same
	;# steps with sd/ss instructions operating on the low lane only.
	mov   rdx, [rsp + nb331_innerjjnr]     ;# pointer to jjnr[k]
	mov   eax, [rdx]

	mov rsi, [rbp + nb331_charge]    ;# base of charge[]

	;# qqO = charge[j]*iqO, qqH = charge[j]*iqH.
	;# movsd from memory clears the upper lane, so the full-width stores
	;# below write 0 in the high half.
	movsd xmm3, [rsi + rax*8]
	movapd xmm4, xmm3
	mulsd xmm3, [rsp + nb331_iqO]
	mulsd xmm4, [rsp + nb331_iqH]

	movapd  [rsp + nb331_qqO], xmm3
	movapd  [rsp + nb331_qqH], xmm4

	;# parameter index = 2*type[j] + ntia; c6 and c12 are adjacent
	mov rsi, [rbp + nb331_type]
	mov r8d, [rsi + rax*4]
	mov rsi, [rbp + nb331_vdwparam]
	shl r8d, 1
	mov edi, [rsp + nb331_ntia]
	add r8d, edi

	movsd xmm6, [rsi + r8*8]	     ;# c6a
	movsd xmm7, [rsi + r8*8 + 8]	 ;# c12a

	movapd [rsp + nb331_c6], xmm6
	movapd [rsp + nb331_c12], xmm7

	mov rsi, [rbp + nb331_pos]       ;# base of pos[]
	lea   rax, [rax + rax*2]     ;# replace jnr with j3

	;# move j coordinates to local temp variables
	movsd xmm0, [rsi + rax*8]
	movsd xmm1, [rsi + rax*8 + 8]
	movsd xmm2, [rsi + rax*8 + 16]

	;# xmm0 = jx
	;# xmm1 = jy
	;# xmm2 = jz

	;# one copy of the j position per i site (O, H1, H2)
	movapd xmm3, xmm0
	movapd xmm4, xmm1
	movapd xmm5, xmm2
	movapd xmm6, xmm0
	movapd xmm7, xmm1
	movapd xmm8, xmm2

	;# d = j - i for each of the three i sites
	subsd xmm0, [rsp + nb331_ixO]
	subsd xmm1, [rsp + nb331_iyO]
	subsd xmm2, [rsp + nb331_izO]
	subsd xmm3, [rsp + nb331_ixH1]
	subsd xmm4, [rsp + nb331_iyH1]
	subsd xmm5, [rsp + nb331_izH1]
	subsd xmm6, [rsp + nb331_ixH2]
	subsd xmm7, [rsp + nb331_iyH2]
	subsd xmm8, [rsp + nb331_izH2]

	;# save each displacement, then square it in place
	movapd [rsp + nb331_dxO], xmm0
	movapd [rsp + nb331_dyO], xmm1
	movapd [rsp + nb331_dzO], xmm2
	mulsd  xmm0, xmm0
	mulsd  xmm1, xmm1
	mulsd  xmm2, xmm2
	movapd [rsp + nb331_dxH1], xmm3
	movapd [rsp + nb331_dyH1], xmm4
	movapd [rsp + nb331_dzH1], xmm5
	mulsd  xmm3, xmm3
	mulsd  xmm4, xmm4
	mulsd  xmm5, xmm5
	movapd [rsp + nb331_dxH2], xmm6
	movapd [rsp + nb331_dyH2], xmm7
	movapd [rsp + nb331_dzH2], xmm8
	mulsd  xmm6, xmm6
	mulsd  xmm7, xmm7
	mulsd  xmm8, xmm8
	;# rsq = dx*dx + dy*dy + dz*dz -> xmm0 (O), xmm3 (H1), xmm6 (H2)
	addsd  xmm0, xmm1
	addsd  xmm0, xmm2
	addsd  xmm3, xmm4
	addsd  xmm3, xmm5
	addsd  xmm6, xmm7
	addsd  xmm6, xmm8

	;# start doing invsqrt for j atoms
	;# single-precision rsqrtss supplies the seed, which is converted
	;# back to double and refined twice below
	cvtsd2ss xmm1, xmm0
	cvtsd2ss xmm4, xmm3
	cvtsd2ss xmm7, xmm6
	rsqrtss xmm1, xmm1
	rsqrtss xmm4, xmm4
	rsqrtss xmm7, xmm7
	cvtss2sd xmm1, xmm1
	cvtss2sd xmm4, xmm4
	cvtss2sd xmm7, xmm7

	;# keep a copy of the seed lu in xmm2/xmm5/xmm8
	movapd  xmm2, xmm1
	movapd  xmm5, xmm4
	movapd  xmm8, xmm7

	mulsd   xmm1, xmm1 ;# lu*lu
	mulsd   xmm4, xmm4 ;# lu*lu
	mulsd   xmm7, xmm7 ;# lu*lu

	movapd  xmm9, [rsp + nb331_three]
	movapd  xmm10, xmm9
	movapd  xmm11, xmm9

	mulsd   xmm1, xmm0 ;# rsq*lu*lu
	mulsd   xmm4, xmm3 ;# rsq*lu*lu
	mulsd   xmm7, xmm6 ;# rsq*lu*lu

	subsd   xmm9, xmm1
	subsd   xmm10, xmm4
	subsd   xmm11, xmm7 ;# 3-rsq*lu*lu

	mulsd   xmm9, xmm2
	mulsd   xmm10, xmm5
	mulsd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)

	movapd  xmm15, [rsp + nb331_half]
	mulsd   xmm9, xmm15  ;# first iteration for rinvO
	mulsd   xmm10, xmm15 ;# first iteration for rinvH1
	mulsd   xmm11, xmm15 ;# first iteration for rinvH2

	;# second iteration step
	movapd  xmm2, xmm9
	movapd  xmm5, xmm10
	movapd  xmm8, xmm11

	mulsd   xmm2, xmm2 ;# lu*lu
	mulsd   xmm5, xmm5 ;# lu*lu
	mulsd   xmm8, xmm8 ;# lu*lu

	movapd  xmm1, [rsp + nb331_three]
	movapd  xmm4, xmm1
	movapd  xmm7, xmm1

	mulsd   xmm2, xmm0 ;# rsq*lu*lu
	mulsd   xmm5, xmm3 ;# rsq*lu*lu
	mulsd   xmm8, xmm6 ;# rsq*lu*lu

	subsd   xmm1, xmm2
	subsd   xmm4, xmm5
	subsd   xmm7, xmm8 ;# 3-rsq*lu*lu

	;# the fragment ends here, part-way through the second refinement
	mulsd   xmm9, xmm1
	mulsd   xmm10, xmm4

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?