nb_kernel333_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,340 行 · 第 1/5 页

S
2,340
字号
		;# ---------------------------------------------------------------------
		;# nb333 inner loop fragment: one j atom against the four i sites
		;# O, H1, H2 and M, using scalar double-precision SSE2 only.
		;#
		;#   O  site : tabulated dispersion + repulsion, scaled by c6 / c12
		;#   H1, H2  : tabulated Coulomb, scaled by qqH
		;#   M  site : tabulated Coulomb, scaled by qqM
		;#
		;# Visible register roles:
		;#   rax        j atom index on entry; becomes 3*index (j3) after the lea
		;#   rbp + ...  pointer slots (charge, type, vdwparam, pos, VFtab, faction)
		;#   rsp + ...  scratch slots (i coordinates, distances, forces, constants)
		;#   rsi, rdi   base pointers reloaded from the rbp slots
		;#   r8/r10/r12 table indices
		;#   xmm0-xmm15 scratch
		;#
		;# The fragment starts and ends in the middle of a longer routine; the
		;# code that loads rax and the stores that follow the last instruction
		;# are not part of this excerpt.
		;# ---------------------------------------------------------------------

		;# NOTE(review): pointer is advanced by 8 bytes ("unrolled 2") although
		;# only scalar, single-j loads are visible in this fragment - confirm.
		add       qword ptr [rsp + nb333_innerjjnr],  8 ;# advance pointer (unrolled 2)

		;# charge products for this j atom
		mov       rsi, [rbp + nb333_charge]    ;# base of charge[]
		movsd     xmm3, [rsi + rax*8]          ;# charge of j
		movapd    xmm4, xmm3
		mulsd     xmm3, [rsp + nb333_iqM]
		mulsd     xmm4, [rsp + nb333_iqH]
		movapd    [rsp + nb333_qqM], xmm3
		movapd    [rsp + nb333_qqH], xmm4

		;# c6/c12 lookup: index = 2*type[j] + ntia
		mov       rsi, [rbp + nb333_type]
		mov       r8d, [rsi + rax*4]
		mov       rsi, [rbp + nb333_vdwparam]
		shl       r8d, 1
		mov       edi, [rsp + nb333_ntia]
		add       r8d, edi
		movlpd    xmm6, [rsi + r8*8]           ;# c6a
		movlpd    xmm7, [rsi + r8*8 + 8]       ;# c12a
		movapd    [rsp + nb333_c6], xmm6
		movapd    [rsp + nb333_c12], xmm7

		mov       rsi, [rbp + nb333_pos]       ;# base of pos[]
		lea       rax, [rax + rax*2]           ;# replace jnr with j3

		;# move j coordinates to local temp variables
		movsd     xmm0, [rsi + rax*8]          ;# xmm0 = jx
		movsd     xmm1, [rsi + rax*8 + 8]      ;# xmm1 = jy
		movsd     xmm2, [rsi + rax*8 + 16]     ;# xmm2 = jz

		;# ---- O interaction ----
		;# copy to xmm3-xmm5
		movapd    xmm3, xmm0
		movapd    xmm4, xmm1
		movapd    xmm5, xmm2

		subsd     xmm3, [rsp + nb333_ixO]
		subsd     xmm4, [rsp + nb333_iyO]
		subsd     xmm5, [rsp + nb333_izO]

		movapd    [rsp + nb333_dxO], xmm3
		movapd    [rsp + nb333_dyO], xmm4
		movapd    [rsp + nb333_dzO], xmm5

		mulsd     xmm3, xmm3
		mulsd     xmm4, xmm4
		mulsd     xmm5, xmm5
		addsd     xmm3, xmm4
		addsd     xmm3, xmm5                   ;# xmm3 = rsq

		;# single-precision rsqrt seed, refined twice in double precision:
		;# lu_new = half * lu * (three - rsq*lu*lu)
		cvtsd2ss  xmm5, xmm3
		rsqrtss   xmm5, xmm5
		cvtss2sd  xmm15, xmm5                  ;# lookup seed lu in low xmm15

		movapd    xmm5, xmm15                  ;# copy of lu
		mulsd     xmm15, xmm15                 ;# lu*lu
		movapd    xmm7, [rsp + nb333_three]
		mulsd     xmm15, xmm3                  ;# rsq*lu*lu
		movapd    xmm6, [rsp + nb333_half]
		subsd     xmm7, xmm15                  ;# 3-rsq*lu*lu
		mulsd     xmm7, xmm5
		mulsd     xmm7, xmm6                   ;# xmm7 = iter1 of rinv (new lu)

		movapd    xmm5, xmm7                   ;# copy of lu
		mulsd     xmm7, xmm7                   ;# lu*lu
		movapd    xmm15, [rsp + nb333_three]
		mulsd     xmm7, xmm3                   ;# rsq*lu*lu
		movapd    xmm6, [rsp + nb333_half]
		subsd     xmm15, xmm7                  ;# 3-rsq*lu*lu
		mulsd     xmm15, xmm5
		mulsd     xmm15, xmm6                  ;# xmm15 = rinv

		mulsd     xmm3, xmm15                  ;# xmm3 = r
		mulsd     xmm3, [rsp + nb333_tsc]      ;# rtab

		;# truncate and convert to integers
		cvttsd2si r10d, xmm3
		;# convert back to float
		cvtsi2sd  xmm4, r10d
		;# multiply by 4
		shl       r10d, 2
		;# calculate eps
		subsd     xmm3, xmm4                   ;# xmm3 = eps
		;# multiply by 3 (index is now 12*n; the loads below use
		;# offsets 0..88 from each table point, i.e. 12 doubles)
		lea       r10, [r10 + r10*2]

		;# xmm3 = eps, xmm15 = rinv
		mov       rsi, [rbp + nb333_VFtab]
		movsd     xmm4, [rsi + r10*8 + 32]
		movsd     xmm5, [rsi + r10*8 + 40]
		movsd     xmm6, [rsi + r10*8 + 48]
		movsd     xmm7, [rsi + r10*8 + 56]
		movsd     xmm8, [rsi + r10*8 + 64]
		movsd     xmm9, [rsi + r10*8 + 72]
		movsd     xmm10, [rsi + r10*8 + 80]
		movsd     xmm11, [rsi + r10*8 + 88]
		;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

		mulsd     xmm7, xmm3                   ;# Heps
		mulsd     xmm11, xmm3
		mulsd     xmm6, xmm3                   ;# Geps
		mulsd     xmm10, xmm3
		mulsd     xmm7, xmm3                   ;# Heps2
		mulsd     xmm11, xmm3
		addsd     xmm5, xmm6                   ;# F+Geps
		addsd     xmm9, xmm10
		addsd     xmm5, xmm7                   ;# F+Geps+Heps2 = Fp
		addsd     xmm9, xmm11
		addsd     xmm7, xmm7                   ;# 2*Heps2
		addsd     xmm11, xmm11
		addsd     xmm7, xmm6                   ;# 2*Heps2+Geps
		addsd     xmm11, xmm10

		addsd     xmm7, xmm5                   ;# FF = Fp + 2*Heps2 + Geps
		addsd     xmm11, xmm9
		mulsd     xmm5, xmm3                   ;# eps*Fp
		mulsd     xmm9, xmm3
		movapd    xmm12, [rsp + nb333_c6]
		movapd    xmm13, [rsp + nb333_c12]
		addsd     xmm5, xmm4                   ;# VV
		addsd     xmm9, xmm8

		mulsd     xmm5, xmm12                  ;# VV*c6 = vnb6
		mulsd     xmm9, xmm13                  ;# VV*c12 = vnb12
		addsd     xmm5, xmm9
		addsd     xmm5, [rsp + nb333_Vvdwtot]
		movsd     [rsp + nb333_Vvdwtot], xmm5

		mulsd     xmm7, xmm12                  ;# FF*c6 = fnb6
		mulsd     xmm11, xmm13                 ;# FF*c12  = fnb12
		addsd     xmm7, xmm11

		mulsd     xmm7, [rsp + nb333_tsc]
		mulsd     xmm7, xmm15                  ;# -fscal
		xorpd     xmm9, xmm9

		subsd     xmm9, xmm7                   ;# fscal
		movapd    xmm10, xmm9
		movapd    xmm11, xmm9

		mulsd     xmm9,  [rsp + nb333_dxO]     ;# fx/fy/fz
		mulsd     xmm10, [rsp + nb333_dyO]
		mulsd     xmm11, [rsp + nb333_dzO]

		;# save j force temporarily
		movapd    [rsp + nb333_fjx], xmm9
		movapd    [rsp + nb333_fjy], xmm10
		movapd    [rsp + nb333_fjz], xmm11

		;# increment i O force
		addsd     xmm9, [rsp + nb333_fixO]
		addsd     xmm10, [rsp + nb333_fiyO]
		addsd     xmm11, [rsp + nb333_fizO]
		movsd     [rsp + nb333_fixO], xmm9
		movsd     [rsp + nb333_fiyO], xmm10
		movsd     [rsp + nb333_fizO], xmm11
		;# finished O LJ interaction.

		;# ---- H1, H2 and M interactions, done in parallel ----
		;# xmm0-xmm2 still contain j coordinates.
		movapd    xmm3, xmm0
		movapd    xmm4, xmm1
		movapd    xmm5, xmm2
		movapd    xmm6, xmm0
		movapd    xmm7, xmm1
		movapd    xmm8, xmm2

		subsd     xmm0, [rsp + nb333_ixH1]
		subsd     xmm1, [rsp + nb333_iyH1]
		subsd     xmm2, [rsp + nb333_izH1]
		subsd     xmm3, [rsp + nb333_ixH2]
		subsd     xmm4, [rsp + nb333_iyH2]
		subsd     xmm5, [rsp + nb333_izH2]
		subsd     xmm6, [rsp + nb333_ixM]
		subsd     xmm7, [rsp + nb333_iyM]
		subsd     xmm8, [rsp + nb333_izM]

		;# store distance components, then square them in place
		movapd    [rsp + nb333_dxH1], xmm0
		movapd    [rsp + nb333_dyH1], xmm1
		movapd    [rsp + nb333_dzH1], xmm2
		mulsd     xmm0, xmm0
		mulsd     xmm1, xmm1
		mulsd     xmm2, xmm2
		movapd    [rsp + nb333_dxH2], xmm3
		movapd    [rsp + nb333_dyH2], xmm4
		movapd    [rsp + nb333_dzH2], xmm5
		mulsd     xmm3, xmm3
		mulsd     xmm4, xmm4
		mulsd     xmm5, xmm5
		movapd    [rsp + nb333_dxM], xmm6
		movapd    [rsp + nb333_dyM], xmm7
		movapd    [rsp + nb333_dzM], xmm8
		mulsd     xmm6, xmm6
		mulsd     xmm7, xmm7
		mulsd     xmm8, xmm8
		addsd     xmm0, xmm1
		addsd     xmm0, xmm2                   ;# rsqH1
		addsd     xmm3, xmm4
		addsd     xmm3, xmm5                   ;# rsqH2
		addsd     xmm6, xmm7
		addsd     xmm6, xmm8                   ;# rsqM

		;# start doing invsqrt for j atoms
		cvtsd2ss  xmm1, xmm0
		cvtsd2ss  xmm4, xmm3
		cvtsd2ss  xmm7, xmm6
		rsqrtss   xmm1, xmm1
		rsqrtss   xmm4, xmm4
		rsqrtss   xmm7, xmm7
		cvtss2sd  xmm1, xmm1
		cvtss2sd  xmm4, xmm4
		cvtss2sd  xmm7, xmm7

		movapd    xmm2, xmm1
		movapd    xmm5, xmm4
		movapd    xmm8, xmm7

		mulsd     xmm1, xmm1                   ;# lu*lu
		mulsd     xmm4, xmm4                   ;# lu*lu
		mulsd     xmm7, xmm7                   ;# lu*lu

		movapd    xmm9, [rsp + nb333_three]
		movapd    xmm10, xmm9
		movapd    xmm11, xmm9

		mulsd     xmm1, xmm0                   ;# rsq*lu*lu
		mulsd     xmm4, xmm3                   ;# rsq*lu*lu
		mulsd     xmm7, xmm6                   ;# rsq*lu*lu

		subsd     xmm9, xmm1
		subsd     xmm10, xmm4
		subsd     xmm11, xmm7                  ;# 3-rsq*lu*lu

		mulsd     xmm9, xmm2
		mulsd     xmm10, xmm5
		mulsd     xmm11, xmm8                  ;# lu*(3-rsq*lu*lu)

		movapd    xmm15, [rsp + nb333_half]
		mulsd     xmm9, xmm15                  ;# first iteration for rinvH1
		mulsd     xmm10, xmm15                 ;# first iteration for rinvH2
		mulsd     xmm11, xmm15                 ;# first iteration for rinvM

		;# second iteration step
		movapd    xmm2, xmm9
		movapd    xmm5, xmm10
		movapd    xmm8, xmm11

		mulsd     xmm2, xmm2                   ;# lu*lu
		mulsd     xmm5, xmm5                   ;# lu*lu
		mulsd     xmm8, xmm8                   ;# lu*lu

		movapd    xmm1, [rsp + nb333_three]
		movapd    xmm4, xmm1
		movapd    xmm7, xmm1

		mulsd     xmm2, xmm0                   ;# rsq*lu*lu
		mulsd     xmm5, xmm3                   ;# rsq*lu*lu
		mulsd     xmm8, xmm6                   ;# rsq*lu*lu

		subsd     xmm1, xmm2
		subsd     xmm4, xmm5
		subsd     xmm7, xmm8                   ;# 3-rsq*lu*lu

		mulsd     xmm9, xmm1
		mulsd     xmm10, xmm4
		mulsd     xmm11, xmm7                  ;# lu*(3-rsq*lu*lu)

		movapd    xmm15, [rsp + nb333_half]
		mulsd     xmm9, xmm15                  ;# rinvH1
		mulsd     xmm10, xmm15                 ;# rinvH2
		mulsd     xmm11, xmm15                 ;# rinvM

		movapd    [rsp + nb333_rinvH1], xmm9
		movapd    [rsp + nb333_rinvH2], xmm10
		movapd    [rsp + nb333_rinvM], xmm11

		;# interactions
		;# rsq in xmm0,xmm3,xmm6
		;# rinv in xmm9, xmm10, xmm11
		movapd    xmm1, [rsp + nb333_tsc]
		mulsd     xmm0, xmm9                   ;# r
		mulsd     xmm3, xmm10
		mulsd     xmm6, xmm11
		mulsd     xmm0, xmm1                   ;# rtab
		mulsd     xmm3, xmm1
		mulsd     xmm6, xmm1

		;# truncate and convert to integers
		cvttsd2si r8d, xmm0
		cvttsd2si r10d, xmm3
		cvttsd2si r12d, xmm6

		;# convert back to float
		cvtsi2sd  xmm2, r8d
		cvtsi2sd  xmm5, r10d
		cvtsi2sd  xmm8, r12d

		;# multiply by 4
		shl       r8d, 2
		shl       r10d, 2
		shl       r12d, 2

		mov       rsi, [rbp + nb333_VFtab]

		;# multiply by 3 (indices are now 12*n)
		lea       r8, [r8 + r8*2]
		lea       r10, [r10 + r10*2]
		lea       r12, [r12 + r12*2]

		;# calculate eps
		subsd     xmm0, xmm2
		subsd     xmm3, xmm5
		subsd     xmm6, xmm8

		movapd    xmm12, xmm0                  ;# epsH1
		movapd    xmm13, xmm3                  ;# epsH2
		movapd    xmm14, xmm6                  ;# epsM

		;# Load LOTS of table data (first four doubles of each table point)
		movsd     xmm0,  [rsi + r8*8]
		movsd     xmm1,  [rsi + r8*8 + 8]
		movsd     xmm2,  [rsi + r8*8 + 16]
		movsd     xmm3,  [rsi + r8*8 + 24]
		movsd     xmm4,  [rsi + r10*8]
		movsd     xmm5,  [rsi + r10*8 + 8]
		movsd     xmm6,  [rsi + r10*8 + 16]
		movsd     xmm7,  [rsi + r10*8 + 24]
		movsd     xmm8,  [rsi + r12*8]
		movsd     xmm9,  [rsi + r12*8 + 8]
		movsd     xmm10, [rsi + r12*8 + 16]
		movsd     xmm11, [rsi + r12*8 + 24]
		;# table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

		mulsd     xmm3, xmm12                  ;# Heps
		mulsd     xmm7, xmm13
		mulsd     xmm11, xmm14
		mulsd     xmm2, xmm12                  ;# Geps
		mulsd     xmm6, xmm13
		mulsd     xmm10, xmm14
		mulsd     xmm3, xmm12                  ;# Heps2
		mulsd     xmm7, xmm13
		mulsd     xmm11, xmm14

		addsd     xmm1, xmm2                   ;# F+Geps
		addsd     xmm5, xmm6
		addsd     xmm9, xmm10
		addsd     xmm1, xmm3                   ;# F+Geps+Heps2 = Fp
		addsd     xmm5, xmm7
		addsd     xmm9, xmm11
		addsd     xmm3, xmm3                   ;# 2*Heps2
		addsd     xmm7, xmm7
		addsd     xmm11, xmm11
		addsd     xmm3, xmm2                   ;# 2*Heps2+Geps
		addsd     xmm7, xmm6
		addsd     xmm11, xmm10
		addsd     xmm3, xmm1                   ;# FF = Fp + 2*Heps2 + Geps
		addsd     xmm7, xmm5
		addsd     xmm11, xmm9
		mulsd     xmm1, xmm12                  ;# eps*Fp
		mulsd     xmm5, xmm13
		mulsd     xmm9, xmm14
		movapd    xmm12, [rsp + nb333_qqH]
		movapd    xmm13, [rsp + nb333_qqM]
		addsd     xmm1, xmm0                   ;# VV
		addsd     xmm5, xmm4
		addsd     xmm9, xmm8
		mulsd     xmm1, xmm12                  ;# VV*qq = vcoul
		mulsd     xmm5, xmm12
		mulsd     xmm9, xmm13
		mulsd     xmm3, xmm12                  ;# FF*qq = fij
		mulsd     xmm7, xmm12
		mulsd     xmm11, xmm13

		;# accumulate vctot
		addsd     xmm1, [rsp + nb333_vctot]
		addsd     xmm5, xmm9
		addsd     xmm1, xmm5
		movsd     [rsp + nb333_vctot], xmm1

		movapd    xmm10, [rsp + nb333_tsc]
		mulsd     xmm3, xmm10                  ;# fscal
		mulsd     xmm7, xmm10
		mulsd     xmm10, xmm11                 ;# M term ends up in xmm10

		;# negate (0 - x) and scale by rinv
		xorpd     xmm4, xmm4
		xorpd     xmm8, xmm8
		xorpd     xmm11, xmm11

		subsd     xmm4, xmm3
		subsd     xmm8, xmm7
		subsd     xmm11, xmm10

		mulsd     xmm4, [rsp + nb333_rinvH1]
		mulsd     xmm8, [rsp + nb333_rinvH2]
		mulsd     xmm11, [rsp + nb333_rinvM]

		;# move j forces to xmm0-xmm2
		mov       rdi, [rbp + nb333_faction]
		movsd     xmm0, [rdi + rax*8]
		movsd     xmm1, [rdi + rax*8 + 8]
		movsd     xmm2, [rdi + rax*8 + 16]

		;# replicate each scalar force factor into x/y/z registers
		movapd    xmm3, xmm4
		movapd    xmm5, xmm4
		movapd    xmm7, xmm8
		movapd    xmm9, xmm8
		movapd    xmm10, xmm11
		movapd    xmm12, xmm11

		;# add forces from O interaction
		addsd     xmm0, [rsp + nb333_fjx]
		addsd     xmm1, [rsp + nb333_fjy]
		addsd     xmm2, [rsp + nb333_fjz]

		mulsd     xmm3, [rsp + nb333_dxH1]
		mulsd     xmm4, [rsp + nb333_dyH1]
		mulsd     xmm5, [rsp + nb333_dzH1]
		mulsd     xmm7, [rsp + nb333_dxH2]
		mulsd     xmm8, [rsp + nb333_dyH2]
		mulsd     xmm9, [rsp + nb333_dzH2]
		mulsd     xmm10, [rsp + nb333_dxM]
		mulsd     xmm11, [rsp + nb333_dyM]
		mulsd     xmm12, [rsp + nb333_dzM]

		;# each component is added to the j force (xmm0-xmm2) first,
		;# then combined with the matching i-site accumulator
		addsd     xmm0, xmm3
		addsd     xmm1, xmm4
		addsd     xmm2, xmm5
		addsd     xmm3, [rsp + nb333_fixH1]
		addsd     xmm4, [rsp + nb333_fiyH1]
		addsd     xmm5, [rsp + nb333_fizH1]

		addsd     xmm0, xmm7
		addsd     xmm1, xmm8
		addsd     xmm2, xmm9
		addsd     xmm7, [rsp + nb333_fixH2]
		addsd     xmm8, [rsp + nb333_fiyH2]
		addsd     xmm9, [rsp + nb333_fizH2]

		addsd     xmm0, xmm10
		addsd     xmm1, xmm11
		addsd     xmm2, xmm12
		addsd     xmm10, [rsp + nb333_fixM]
		addsd     xmm11, [rsp + nb333_fiyM]
		addsd     xmm12, [rsp + nb333_fizM]

		movsd     [rsp + nb333_fixH1], xmm3

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?