nb_kernel131_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,068 行 · 第 1/4 页

S
2,068
字号
    movapd [rsp + nb131_rinvO], xmm9    movapd [rsp + nb131_rinvH1], xmm10    movapd [rsp + nb131_rinvH2], xmm11    ;# table LJ interaction    mulpd  xmm0, xmm9    mulpd  xmm0, [rsp + nb131_tsc] ;# rtab    ;# truncate and convert to integers    cvttpd2dq xmm1, xmm0    ;# convert back to float    cvtdq2pd  xmm2, xmm1             ;# multiply by 8    pslld   xmm1, 3    ;# move to integer registers    pshufd  xmm13, xmm1, 1    movd    r8d, xmm1    movd    r10d, xmm13        ;# calculate eps    subpd     xmm0, xmm2    mov  rsi, [rbp + nb131_VFtab]                movlpd xmm4, [rsi + r8*8]   	movlpd xmm5, [rsi + r8*8 + 8]	movlpd xmm6, [rsi + r8*8 + 16]	movlpd xmm7, [rsi + r8*8 + 24]    movlpd xmm8, [rsi + r8*8 + 32]   	movlpd xmm9, [rsi + r8*8 + 40]	movlpd xmm10, [rsi + r8*8 + 48]	movlpd xmm11, [rsi + r8*8 + 56]        movhpd xmm4, [rsi + r10*8]   	movhpd xmm5, [rsi + r10*8 + 8]	movhpd xmm6, [rsi + r10*8 + 16]	movhpd xmm7, [rsi + r10*8 + 24]    movhpd xmm8, [rsi + r10*8 + 32]   	movhpd xmm9, [rsi + r10*8 + 40]	movhpd xmm10, [rsi + r10*8 + 48]	movhpd xmm11, [rsi + r10*8 + 56]    ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11    mulpd  xmm7, xmm0    ;# Heps    mulpd  xmm11, xmm0     mulpd  xmm6, xmm0   ;# Geps    mulpd  xmm10, xmm0     mulpd  xmm7, xmm0   ;# Heps2    mulpd  xmm11, xmm0     addpd  xmm5, xmm6  ;# F+Geps    addpd  xmm9, xmm10     addpd  xmm5, xmm7   ;# F+Geps+Heps2 = Fp    addpd  xmm9, xmm11     addpd  xmm7, xmm7    ;# 2*Heps2    addpd  xmm11, xmm11    addpd  xmm7, xmm6   ;# 2*Heps2+Geps    addpd  xmm11, xmm10        addpd  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps    addpd  xmm11, xmm9    mulpd  xmm5, xmm0  ;# eps*Fp    mulpd  xmm9, xmm0    movapd xmm12, [rsp + nb131_c6]    movapd xmm13, [rsp + nb131_c12]    addpd  xmm5, xmm4 ;# VV    addpd  xmm9, xmm8    mulpd  xmm5, xmm12  ;# VV*c6 = vnb6    mulpd  xmm9, xmm13  ;# VV*c12 = vnb12    addpd  xmm5, xmm9    addpd  xmm5, [rsp + nb131_Vvdwtot]    movapd [rsp + nb131_Vvdwtot], xmm5            mulpd  
xmm7, xmm12   ;# FF*c6 = fnb6    mulpd  xmm11, xmm13   ;# FF*c12  = fnb12    addpd  xmm7, xmm11    mulpd  xmm7, [rsp + nb131_tsc]    movapd xmm9, [rsp + nb131_rinvO]    movapd xmm10, [rsp + nb131_rinvH1]    movapd xmm11, [rsp + nb131_rinvH2]    movapd xmm0, xmm9    movapd xmm1, xmm10    movapd xmm2, xmm11        mulpd  xmm10, xmm10    mulpd  xmm11, xmm11        mulpd  xmm0, [rsp + nb131_qqO]     mulpd  xmm1, [rsp + nb131_qqH]     mulpd  xmm2, [rsp + nb131_qqH]         mulpd  xmm9, xmm0    mulpd  xmm10, xmm1    mulpd  xmm11, xmm2        subpd  xmm9, xmm7    mulpd  xmm9, [rsp + nb131_rinvO]        addpd xmm0, [rsp + nb131_vctot]     addpd xmm1, xmm2    addpd xmm0, xmm1    movapd [rsp + nb131_vctot], xmm0        ;# move j forces to xmm0-xmm2    mov rdi, [rbp + nb131_faction]	movlpd xmm0, [rdi + rax*8]	movlpd xmm1, [rdi + rax*8 + 8]	movlpd xmm2, [rdi + rax*8 + 16]	movhpd xmm0, [rdi + rbx*8]	movhpd xmm1, [rdi + rbx*8 + 8]	movhpd xmm2, [rdi + rbx*8 + 16]    movapd xmm7, xmm9    movapd xmm8, xmm9    movapd xmm13, xmm11    movapd xmm14, xmm11    movapd xmm15, xmm11    movapd xmm11, xmm10    movapd xmm12, xmm10	mulpd xmm7, [rsp + nb131_dxO]	mulpd xmm8, [rsp + nb131_dyO]	mulpd xmm9, [rsp + nb131_dzO]	mulpd xmm10, [rsp + nb131_dxH1]	mulpd xmm11, [rsp + nb131_dyH1]	mulpd xmm12, [rsp + nb131_dzH1]	mulpd xmm13, [rsp + nb131_dxH2]	mulpd xmm14, [rsp + nb131_dyH2]	mulpd xmm15, [rsp + nb131_dzH2]    addpd xmm0, xmm7    addpd xmm1, xmm8    addpd xmm2, xmm9    addpd xmm7, [rsp + nb131_fixO]    addpd xmm8, [rsp + nb131_fiyO]    addpd xmm9, [rsp + nb131_fizO]    addpd xmm0, xmm10    addpd xmm1, xmm11    addpd xmm2, xmm12    addpd xmm10, [rsp + nb131_fixH1]    addpd xmm11, [rsp + nb131_fiyH1]    addpd xmm12, [rsp + nb131_fizH1]    addpd xmm0, xmm13    addpd xmm1, xmm14    addpd xmm2, xmm15    addpd xmm13, [rsp + nb131_fixH2]    addpd xmm14, [rsp + nb131_fiyH2]    addpd xmm15, [rsp + nb131_fizH2]    movapd [rsp + nb131_fixO], xmm7    movapd [rsp + nb131_fiyO], xmm8    movapd [rsp + 
nb131_fizO], xmm9    movapd [rsp + nb131_fixH1], xmm10    movapd [rsp + nb131_fiyH1], xmm11    movapd [rsp + nb131_fizH1], xmm12    movapd [rsp + nb131_fixH2], xmm13    movapd [rsp + nb131_fiyH2], xmm14    movapd [rsp + nb131_fizH2], xmm15       ;# store back j forces from xmm0-xmm2	movlpd [rdi + rax*8],      xmm0	movlpd [rdi + rax*8 + 8],  xmm1	movlpd [rdi + rax*8 + 16], xmm2	movhpd [rdi + rbx*8],      xmm0	movhpd [rdi + rbx*8 + 8],  xmm1	movhpd [rdi + rbx*8 + 16], xmm2	;# should we do one more iteration? 	sub dword ptr [rsp + nb131_innerk],  2	jl    .nb131_checksingle	jmp   .nb131_unroll_loop.nb131_checksingle:					mov   edx, [rsp + nb131_innerk]	and   edx, 1	jnz    .nb131_dosingle	jmp    .nb131_updateouterdata.nb131_dosingle:				mov   rdx, [rsp + nb131_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [rdx]		add qword ptr [rsp + nb131_innerjjnr],  8	;# advance pointer (unrolled 2) 	mov rsi, [rbp + nb131_charge]    ;# base of charge[] 		movsd xmm3, [rsi + rax*8]	movapd xmm4, xmm3	mulsd  xmm3, [rsp + nb131_iqO]	mulsd  xmm4, [rsp + nb131_iqH]	movapd  [rsp + nb131_qqO], xmm3	movapd  [rsp + nb131_qqH], xmm4		mov rsi, [rbp + nb131_type]	mov r8d, [rsi + rax*4]	mov rsi, [rbp + nb131_vdwparam]	shl r8d, 1		mov edi, [rsp + nb131_ntia]	add r8d, edi	movsd xmm6, [rsi + r8*8]	;# c6a	movsd xmm7, [rsi + r8*8 + 8]	;# c12a	movapd [rsp + nb131_c6], xmm6	movapd [rsp + nb131_c12], xmm7		mov rsi, [rbp + nb131_pos]       ;# base of pos[] 	lea   rax, [rax + rax*2]     ;# replace jnr with j3 	;# move j coordinates to local temp variables     movsd xmm0, [rsi + rax*8]     movsd xmm1, [rsi + rax*8 + 8]     movsd xmm2, [rsi + rax*8 + 16]     ;# xmm0 = jx    ;# xmm1 = jy    ;# xmm2 = jz            movapd xmm3, xmm0    movapd xmm4, xmm1    movapd xmm5, xmm2    movapd xmm6, xmm0    movapd xmm7, xmm1    movapd xmm8, xmm2        subsd xmm0, [rsp + nb131_ixO]    subsd xmm1, [rsp + nb131_iyO]    subsd xmm2, [rsp + nb131_izO]    subsd xmm3, [rsp + nb131_ixH1]    subsd xmm4, [rsp + nb131_iyH1]    
subsd xmm5, [rsp + nb131_izH1]    subsd xmm6, [rsp + nb131_ixH2]    subsd xmm7, [rsp + nb131_iyH2]    subsd xmm8, [rsp + nb131_izH2]    	movapd [rsp + nb131_dxO], xmm0	movapd [rsp + nb131_dyO], xmm1	movapd [rsp + nb131_dzO], xmm2	mulsd  xmm0, xmm0	mulsd  xmm1, xmm1	mulsd  xmm2, xmm2	movapd [rsp + nb131_dxH1], xmm3	movapd [rsp + nb131_dyH1], xmm4	movapd [rsp + nb131_dzH1], xmm5	mulsd  xmm3, xmm3	mulsd  xmm4, xmm4	mulsd  xmm5, xmm5	movapd [rsp + nb131_dxH2], xmm6	movapd [rsp + nb131_dyH2], xmm7	movapd [rsp + nb131_dzH2], xmm8	mulsd  xmm6, xmm6	mulsd  xmm7, xmm7	mulsd  xmm8, xmm8	addsd  xmm0, xmm1	addsd  xmm0, xmm2	addsd  xmm3, xmm4	addsd  xmm3, xmm5    addsd  xmm6, xmm7    addsd  xmm6, xmm8	;# start doing invsqrt for j atoms    cvtsd2ss xmm1, xmm0    cvtsd2ss xmm4, xmm3    cvtsd2ss xmm7, xmm6	rsqrtss xmm1, xmm1	rsqrtss xmm4, xmm4    rsqrtss xmm7, xmm7    cvtss2sd xmm1, xmm1    cvtss2sd xmm4, xmm4    cvtss2sd xmm7, xmm7		movapd  xmm2, xmm1	movapd  xmm5, xmm4    movapd  xmm8, xmm7    	mulsd   xmm1, xmm1 ;# lu*lu	mulsd   xmm4, xmm4 ;# lu*lu    mulsd   xmm7, xmm7 ;# lu*lu			movapd  xmm9, [rsp + nb131_three]	movapd  xmm10, xmm9    movapd  xmm11, xmm9	mulsd   xmm1, xmm0 ;# rsq*lu*lu	mulsd   xmm4, xmm3 ;# rsq*lu*lu     mulsd   xmm7, xmm6 ;# rsq*lu*lu		subsd   xmm9, xmm1	subsd   xmm10, xmm4    subsd   xmm11, xmm7 ;# 3-rsq*lu*lu	mulsd   xmm9, xmm2	mulsd   xmm10, xmm5    mulsd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb131_half]	mulsd   xmm9, xmm15  ;# first iteration for rinvO	mulsd   xmm10, xmm15 ;# first iteration for rinvH1    mulsd   xmm11, xmm15 ;# first iteration for rinvH2    ;# second iteration step    	movapd  xmm2, xmm9	movapd  xmm5, xmm10    movapd  xmm8, xmm11    	mulsd   xmm2, xmm2 ;# lu*lu	mulsd   xmm5, xmm5 ;# lu*lu    mulsd   xmm8, xmm8 ;# lu*lu			movapd  xmm1, [rsp + nb131_three]	movapd  xmm4, xmm1    movapd  xmm7, xmm1    	mulsd   xmm2, xmm0 ;# rsq*lu*lu	mulsd   xmm5, xmm3 ;# rsq*lu*lu     mulsd   xmm8, xmm6 ;# rsq*lu*lu		subsd   xmm1, xmm2	
subsd   xmm4, xmm5    subsd   xmm7, xmm8 ;# 3-rsq*lu*lu	mulsd   xmm9, xmm1	mulsd   xmm10, xmm4    mulsd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb131_half]	mulsd   xmm9, xmm15  ;#  rinvO 	mulsd   xmm10, xmm15 ;#   rinvH1    mulsd   xmm11, xmm15 ;#   rinvH2		;# O interactions     ;# rsq in xmm0,xmm3,xmm6      ;# rinv in xmm9, xmm10, xmm11    movapd [rsp + nb131_rsqO], xmm0    movapd [rsp + nb131_rsqH1], xmm3    movapd [rsp + nb131_rsqH2], xmm6    movapd [rsp + nb131_rinvO], xmm9    movapd [rsp + nb131_rinvH1], xmm10    movapd [rsp + nb131_rinvH2], xmm11    ;# table LJ interaction    mulsd  xmm0, xmm9    mulsd  xmm0, [rsp + nb131_tsc] ;# rtab    ;# truncate and convert to integers    cvttsd2si r8d, xmm0    ;# convert back to float    cvtsi2sd  xmm2, r8d        ;# mult. by 8    shl r8d, 3        ;# calculate eps    subsd     xmm0, xmm2    mov  rsi, [rbp + nb131_VFtab]                movsd xmm4, [rsi + r8*8]   	movsd xmm5, [rsi + r8*8 + 8]	movsd xmm6, [rsi + r8*8 + 16]	movsd xmm7, [rsi + r8*8 + 24]    movsd xmm8, [rsi + r8*8 + 32]   	movsd xmm9, [rsi + r8*8 + 40]	movsd xmm10, [rsi + r8*8 + 48]	movsd xmm11, [rsi + r8*8 + 56]    ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11    mulsd  xmm7, xmm0    ;# Heps    mulsd  xmm11, xmm0     mulsd  xmm6, xmm0   ;# Geps    mulsd  xmm10, xmm0     mulsd  xmm7, xmm0   ;# Heps2    mulsd  xmm11, xmm0     addsd  xmm5, xmm6  ;# F+Geps    addsd  xmm9, xmm10     addsd  xmm5, xmm7   ;# F+Geps+Heps2 = Fp    addsd  xmm9, xmm11     addsd  xmm7, xmm7    ;# 2*Heps2    addsd  xmm11, xmm11    addsd  xmm7, xmm6   ;# 2*Heps2+Geps    addsd  xmm11, xmm10        addsd  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps    addsd  xmm11, xmm9    mulsd  xmm5, xmm0  ;# eps*Fp    mulsd  xmm9, xmm0    movapd xmm12, [rsp + nb131_c6]    movapd xmm13, [rsp + nb131_c12]    addsd  xmm5, xmm4 ;# VV    addsd  xmm9, xmm8    mulsd  xmm5, xmm12  ;# VV*c6 = vnb6    mulsd  xmm9, xmm13  ;# VV*c12 = vnb12    addsd  xmm5, xmm9    addsd  xmm5, [rsp + 
nb131_Vvdwtot]    movsd [rsp + nb131_Vvdwtot], xmm5            mulsd  xmm7, xmm12   ;# FF*c6 = fnb6    mulsd  xmm11, xmm13   ;# FF*c12  = fnb12    addsd  xmm7, xmm11    mulsd  xmm7, [rsp + nb131_tsc]    movapd xmm9, [rsp + nb131_rinvO]    movapd xmm10, [rsp + nb131_rinvH1]    movapd xmm11, [rsp + nb131_rinvH2]    movapd xmm0, xmm9    movapd xmm1, xmm10    movapd xmm2, xmm11        mulsd  xmm10, xmm10    mulsd  xmm11, xmm11        mulsd  xmm0, [rsp + nb131_qqO]     mulsd  xmm1, [rsp + nb131_qqH]     mulsd  xmm2, [rsp + nb131_qqH]         mulsd  xmm9, xmm0    mulsd  xmm10, xmm1    mulsd  xmm11, xmm2        subsd  xmm9, xmm7    mulsd  xmm9, [rsp + nb131_rinvO]        addsd xmm0, [rsp + nb131_vctot]     addsd xmm1, xmm2    addsd xmm0, xmm1    movsd [rsp + nb131_vctot], xmm0        ;# move j forces to xmm0-xmm2    mov rdi, [rbp + nb131_faction]	movsd xmm0, [rdi + rax*8]	movsd xmm1, [rdi + rax*8 + 8]	movsd xmm2, [rdi + rax*8 + 16]    movapd xmm7, xmm9    movapd xmm8, xmm9    movapd xmm13, xmm11    movapd xmm14, xmm11    movapd xmm15, xmm11    movapd xmm11, xmm10    movapd xmm12, xmm10	mulsd xmm7, [rsp + nb131_dxO]	mulsd xmm8, [rsp + nb131_dyO]	mulsd xmm9, [rsp + nb131_dzO]	mulsd xmm10, [rsp + nb131_dxH1]	mulsd xmm11, [rsp + nb131_dyH1]	mulsd xmm12, [rsp + nb131_dzH1]	mulsd xmm13, [rsp + nb131_dxH2]	mulsd xmm14, [rsp + nb131_dyH2]	mulsd xmm15, [rsp + nb131_dzH2]    addsd xmm0, xmm7    addsd xmm1, xmm8    addsd xmm2, xmm9    addsd xmm7, [rsp + nb131_fixO]    addsd xmm8, [rsp + nb131_fiyO]    addsd xmm9, [rsp + nb131_fizO]    addsd xmm0, xmm10    addsd xmm1, xmm11    addsd xmm2, xmm12    addsd xmm10, [rsp + nb131_fixH1]    addsd xmm11, [rsp + nb131_fiyH1]    addsd xmm12, [rsp + nb131_fizH1]    addsd xmm0, xmm13    addsd xmm1, xmm14    addsd xmm2, xmm15    addsd xmm13, [rsp + nb131_fixH2]    addsd xmm14, [rsp + nb131_fiyH2]    addsd xmm15, [rsp + nb131_fizH2]    movsd [rsp + nb131_fixO], xmm7    movsd [rsp + nb131_fiyO], xmm8    movsd [rsp + nb131_fizO], xmm9    movsd [rsp + 
nb131_fixH1], xmm10    movsd [rsp + nb131_fiyH1], xmm11    movsd [rsp + nb131_fizH1], xmm12    movsd [rsp + nb131_fixH2], xmm13    movsd [rsp + nb131_fiyH2], xmm14    movsd [rsp + nb131_fizH2], xmm15       ;# store back j forces from xmm0-xmm2	movsd [rdi + rax*8],      xmm0	movsd [rdi + rax*8 + 8],  xmm1	movsd [rdi + rax*8 + 16], xmm2.nb131_updateouterdata:	mov   ecx, [rsp + nb131_ii3]	mov   rdi, [rbp + nb131_faction]	mov   rsi, [rbp + nb131_fshift]	mov   edx, [rsp + nb131_is3]	;# accumulate  Oi forces in xmm0, xmm1, xmm2 	movapd xmm0, [rsp + nb131_fixO]	movapd xmm1, [rsp + nb131_fiyO]	movapd xmm2, [rsp + nb131_fizO]	movhlps xmm3, xmm0	movhlps xmm4, xmm1	movhlps xmm5, xmm2	addsd  xmm0, xmm3	addsd  xmm1, xmm4	addsd  xmm2, xmm5 ;# sum is in low xmm0-xmm2 	movapd xmm3, xmm0		movapd xmm4, xmm1		movapd xmm5, xmm2	

⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?