nb_kernel202_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,314 行 · 第 1/5 页

S
2,314
字号
	;# increment i force 	movsd  xmm3, [rdi + rcx*8 + 24]	movsd  xmm4, [rdi + rcx*8 + 32]	movsd  xmm5, [rdi + rcx*8 + 40]	subsd  xmm3, xmm0	subsd  xmm4, xmm1	subsd  xmm5, xmm2	movsd  [rdi + rcx*8 + 24], xmm3	movsd  [rdi + rcx*8 + 32], xmm4	movsd  [rdi + rcx*8 + 40], xmm5	;# accumulate force in xmm6/xmm7 for fshift 	addsd xmm7, xmm2	unpcklpd xmm0, xmm1	addpd xmm6, xmm0	;# accumulate H2i forces in xmm0, xmm1, xmm2 	movapd xmm0, [rsp + nb202_fixH2]	movapd xmm1, [rsp + nb202_fiyH2]	movapd xmm2, [rsp + nb202_fizH2]	movhlps xmm3, xmm0	movhlps xmm4, xmm1	movhlps xmm5, xmm2	addsd  xmm0, xmm3	addsd  xmm1, xmm4	addsd  xmm2, xmm5 ;# sum is in low xmm0-xmm2 	movapd xmm3, xmm0		movapd xmm4, xmm1		movapd xmm5, xmm2		;# increment i force 	movsd  xmm3, [rdi + rcx*8 + 48]	movsd  xmm4, [rdi + rcx*8 + 56]	movsd  xmm5, [rdi + rcx*8 + 64]	subsd  xmm3, xmm0	subsd  xmm4, xmm1	subsd  xmm5, xmm2	movsd  [rdi + rcx*8 + 48], xmm3	movsd  [rdi + rcx*8 + 56], xmm4	movsd  [rdi + rcx*8 + 64], xmm5	;# accumulate force in xmm6/xmm7 for fshift 	addsd xmm7, xmm2	unpcklpd xmm0, xmm1	addpd xmm6, xmm0	;# increment fshift force 	movlpd xmm3, [rsi + rdx*8]	movhpd xmm3, [rsi + rdx*8 + 8]	movsd  xmm4, [rsi + rdx*8 + 16]	subpd  xmm3, xmm6	subsd  xmm4, xmm7	movlpd [rsi + rdx*8],      xmm3	movhpd [rsi + rdx*8 + 8],  xmm3	movsd  [rsi + rdx*8 + 16], xmm4	;# get n from stack	mov esi, [rsp + nb202_n]        ;# get group index for i particle         mov   rdx, [rbp + nb202_gid]      	;# base of gid[]        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]	;# accumulate total potential energy and update it 	movapd xmm7, [rsp + nb202_vctot]	;# accumulate 	movhlps xmm6, xmm7	addsd  xmm7, xmm6	;# low xmm7 has the sum now         	;# add earlier value from mem 	mov   rax, [rbp + nb202_Vc]	addsd xmm7, [rax + rdx*8] 	;# move back to mem 	movsd [rax + rdx*8], xmm7 	        ;# finish if last         mov ecx, [rsp + nb202_nn1]	;# esi already loaded with n	inc esi        sub ecx, esi        jz .nb202_outerend        ;# not last, iterate 
outer loop once more!          mov [rsp + nb202_n], esi        jmp .nb202_outer.nb202_outerend:        ;# check if more outer neighborlists remain        mov   ecx, [rsp + nb202_nri]	;# esi already loaded with n above        sub   ecx, esi        jz .nb202_end        ;# non-zero, do one more workunit        jmp   .nb202_threadloop.nb202_end:	mov eax, [rsp + nb202_nouter]	mov ebx, [rsp + nb202_ninner]	mov rcx, [rbp + nb202_outeriter]	mov rdx, [rbp + nb202_inneriter]	mov [rcx], eax	mov [rdx], ebx	add rsp, 1544	emms        pop r15        pop r14        pop r13        pop r12	pop rbx	pop	rbp	ret		.globl nb_kernel202nf_x86_64_sse2.globl _nb_kernel202nf_x86_64_sse2nb_kernel202nf_x86_64_sse2:	_nb_kernel202nf_x86_64_sse2:	;#	Room for return address and rbp (16 bytes).equiv          nb202nf_fshift,         16.equiv          nb202nf_gid,            24.equiv          nb202nf_pos,            32.equiv          nb202nf_faction,        40.equiv          nb202nf_charge,         48.equiv          nb202nf_p_facel,        56.equiv          nb202nf_argkrf,         64.equiv          nb202nf_argcrf,         72.equiv          nb202nf_Vc,             80.equiv          nb202nf_type,           88.equiv          nb202nf_p_ntype,        96.equiv          nb202nf_vdwparam,       104.equiv          nb202nf_Vvdw,           112.equiv          nb202nf_p_tabscale,     120.equiv          nb202nf_VFtab,          128.equiv          nb202nf_invsqrta,       136.equiv          nb202nf_dvda,           144.equiv          nb202nf_p_gbtabscale,   152.equiv          nb202nf_GBtab,          160.equiv          nb202nf_p_nthreads,     168.equiv          nb202nf_count,          176.equiv          nb202nf_mtx,            184.equiv          nb202nf_outeriter,      192.equiv          nb202nf_inneriter,      200.equiv          nb202nf_work,           208	;# stack offsets for local variables  	;# bottom of stack is cache-aligned for sse use .equiv          nb202nf_ixO,            0.equiv          nb202nf_iyO,          
  16.equiv          nb202nf_izO,            32.equiv          nb202nf_ixH1,           48.equiv          nb202nf_iyH1,           64.equiv          nb202nf_izH1,           80.equiv          nb202nf_ixH2,           96.equiv          nb202nf_iyH2,           112.equiv          nb202nf_izH2,           128.equiv          nb202nf_jxO,            144.equiv          nb202nf_jyO,            160.equiv          nb202nf_jzO,            176.equiv          nb202nf_jxH1,           192.equiv          nb202nf_jyH1,           208.equiv          nb202nf_jzH1,           224.equiv          nb202nf_jxH2,           240.equiv          nb202nf_jyH2,           256.equiv          nb202nf_jzH2,           272.equiv          nb202nf_qqOO,           288.equiv          nb202nf_qqOH,           304.equiv          nb202nf_qqHH,           320.equiv          nb202nf_vctot,          336.equiv          nb202nf_half,           352.equiv          nb202nf_three,          368.equiv          nb202nf_rsqOO,          384.equiv          nb202nf_rsqOH1,         400.equiv          nb202nf_rsqOH2,         416.equiv          nb202nf_rsqH1O,         432.equiv          nb202nf_rsqH1H1,        448.equiv          nb202nf_rsqH1H2,        464.equiv          nb202nf_rsqH2O,         480.equiv          nb202nf_rsqH2H1,        496.equiv          nb202nf_rsqH2H2,        512.equiv          nb202nf_rinvOO,         528.equiv          nb202nf_rinvOH1,        544.equiv          nb202nf_rinvOH2,        560.equiv          nb202nf_rinvH1O,        576.equiv          nb202nf_rinvH1H1,       592.equiv          nb202nf_rinvH1H2,       608.equiv          nb202nf_rinvH2O,        624.equiv          nb202nf_rinvH2H1,       640.equiv          nb202nf_rinvH2H2,       656.equiv          nb202nf_krf,            672.equiv          nb202nf_crf,            688.equiv          nb202nf_is3,            704.equiv          nb202nf_ii3,            708.equiv          nb202nf_nri,            712.equiv          nb202nf_iinr,           720.equiv          
nb202nf_jindex,         728.equiv          nb202nf_jjnr,           736.equiv          nb202nf_shift,          744.equiv          nb202nf_shiftvec,       752.equiv          nb202nf_facel,          760.equiv          nb202nf_innerjjnr,      768.equiv          nb202nf_innerk,         776.equiv          nb202nf_n,              780.equiv          nb202nf_nn1,            784.equiv          nb202nf_nouter,         788.equiv          nb202nf_ninner,         792	push rbp	mov  rbp, rsp	push rbx	emms        push r12        push r13        push r14        push r15	sub rsp, 808		;# local variable stack space (n*16+8)	;# zero 32-bit iteration counters	mov eax, 0	mov [rsp + nb202nf_nouter], eax	mov [rsp + nb202nf_ninner], eax	mov edi, [rdi]	mov [rsp + nb202nf_nri], edi	mov [rsp + nb202nf_iinr], rsi	mov [rsp + nb202nf_jindex], rdx	mov [rsp + nb202nf_jjnr], rcx	mov [rsp + nb202nf_shift], r8	mov [rsp + nb202nf_shiftvec], r9	mov rsi, [rbp + nb202nf_p_facel]	movsd xmm0, [rsi]	movsd [rsp + nb202nf_facel], xmm0	mov rsi, [rbp + nb202nf_argkrf]	mov rdi, [rbp + nb202nf_argcrf]	movsd xmm1, [rsi]	movsd xmm2, [rdi]	shufpd xmm1, xmm1, 0	shufpd xmm2, xmm2, 0	movapd [rsp + nb202nf_krf], xmm1	movapd [rsp + nb202nf_crf], xmm2	;# create constant floating-point factors on stack	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)	mov ebx, 0x3fe00000	mov [rsp + nb202nf_half], eax	mov [rsp + nb202nf_half+4], ebx	movsd xmm1, [rsp + nb202nf_half]	shufpd xmm1, xmm1, 0    ;# splat to all elements	movapd xmm3, xmm1	addpd  xmm3, xmm3       ;# one	movapd xmm2, xmm3	addpd  xmm2, xmm2       ;# two	addpd  xmm3, xmm2	;# three	movapd [rsp + nb202nf_half], xmm1	movapd [rsp + nb202nf_three], xmm3	;# assume we have at least one i particle - start directly 	mov   rcx, [rsp + nb202nf_iinr]       ;# rcx = pointer into iinr[] 		mov   ebx, [rcx]	    ;# ebx =ii 	mov   rdx, [rbp + nb202nf_charge]	movsd xmm3, [rdx + rbx*8]		movsd xmm4, xmm3		movsd xmm5, [rdx + rbx*8 + 8]	movsd xmm6, [rsp + nb202nf_facel]	mulsd  
xmm3, xmm3				;# operands of the mulsd whose mnemonic ends the previous line
	mulsd  xmm4, xmm5
	mulsd  xmm5, xmm5
	;# scale the three products by xmm6 (loaded from nb202nf_facel just above)
	mulsd  xmm3, xmm6
	mulsd  xmm4, xmm6
	mulsd  xmm5, xmm6
	;# copy each low double into both halves of its register
	shufpd xmm3, xmm3, 0
	shufpd xmm4, xmm4, 0
	shufpd xmm5, xmm5, 0
	movapd [rsp + nb202nf_qqOO], xmm3
	movapd [rsp + nb202nf_qqOH], xmm4
	movapd [rsp + nb202nf_qqHH], xmm5

.nb202nf_threadloop:
	;# Claim the next index from the sync counter:
	;#   eax = nn0 (value read from *count), ebx = nn1 = nn0+1.
	;# The claim succeeds only if *count still equals eax at the cmpxchg.
	mov   rsi, [rbp + nb202nf_count]	;# pointer to sync counter
	mov   eax, [rsi]
.nb202nf_spinlock:
	mov   ebx, eax				;# ebx=*count=nn0
	add   ebx, 1				;# ebx=nn1=nn0+1
	lock
	cmpxchg [rsi], ebx			;# write nn1 to *counter if it hasn't changed,
						;# otherwise reread *counter into eax.
						;# Address through the full 64-bit rsi: [esi]
						;# would truncate the pointer to 32 bits.
	pause					;# -> better p4 performance (leaves flags untouched)
	jnz   .nb202nf_spinlock			;# ZF clear: the exchange did not happen, retry
	;# if(nn1>nri) nn1=nri
	mov   ecx, [rsp + nb202nf_nri]
	mov   edx, ecx
	sub   ecx, ebx				;# ecx = nri - nn1, sets flags for cmovle
	cmovle ebx, edx				;# if(nn1>=nri) nn1=nri
	;# Cleared the spinlock if we got here.
	;# eax contains nn0, ebx contains nn1.
mov [rsp + nb202nf_n], eax        mov [rsp + nb202nf_nn1], ebx        sub ebx, eax                            ;# calc number of outer lists	mov esi, eax				;# copy n to esi        jg  .nb202nf_outerstart        jmp .nb202nf_end.nb202nf_outerstart:	;# ebx contains number of outer iterations	add ebx, [rsp + nb202nf_nouter]	mov [rsp + nb202nf_nouter], ebx.nb202nf_outer:	mov   rax, [rsp + nb202nf_shift]      ;# rax = pointer into shift[] 	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 		lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 	mov   rax, [rsp + nb202nf_shiftvec]   ;# rax = base of shiftvec[] 	movsd xmm0, [rax + rbx*8]	movsd xmm1, [rax + rbx*8 + 8]	movsd xmm2, [rax + rbx*8 + 16] 	mov   rcx, [rsp + nb202nf_iinr]       ;# rcx = pointer into iinr[] 		mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 	mov   rax, [rbp + nb202nf_pos]    ;# rax = base of pos[]  	mov   [rsp + nb202nf_ii3], ebx			movapd xmm3, xmm0	movapd xmm4, xmm1	movapd xmm5, xmm2	addsd xmm3, [rax + rbx*8]	addsd xmm4, [rax + rbx*8 + 8]	addsd xmm5, [rax + rbx*8 + 16]			shufpd xmm3, xmm3, 0	shufpd xmm4, xmm4, 0	shufpd xmm5, xmm5, 0	movapd [rsp + nb202nf_ixO], xmm3	movapd [rsp + nb202nf_iyO], xmm4	movapd [rsp + nb202nf_izO], xmm5	movsd xmm3, xmm0	movsd xmm4, xmm1	movsd xmm5, xmm2	addsd xmm0, [rax + rbx*8 + 24]	addsd xmm1, [rax + rbx*8 + 32]	addsd xmm2, [rax + rbx*8 + 40]			addsd xmm3, [rax + rbx*8 + 48]	addsd xmm4, [rax + rbx*8 + 56]	addsd xmm5, [rax + rbx*8 + 64]			shufpd xmm0, xmm0, 0	shufpd xmm1, xmm1, 0	shufpd xmm2, xmm2, 0	shufpd xmm3, xmm3, 0	shufpd xmm4, xmm4, 0	shufpd xmm5, xmm5, 0	movapd [rsp + nb202nf_ixH1], xmm0	movapd [rsp + nb202nf_iyH1], xmm1	movapd [rsp + nb202nf_izH1], xmm2	movapd [rsp + nb202nf_ixH2], xmm3	movapd [rsp + nb202nf_iyH2], xmm4	movapd [rsp + nb202nf_izH2], xmm5	;# clear vctot 	xorpd xmm4, xmm4	movapd [rsp + nb202nf_vctot], xmm4		mov   rax, [rsp + nb202nf_jindex]	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 	sub   
edx, ecx               ;# number of innerloop atoms 	mov   rsi, [rbp + nb202nf_pos]	mov   rax, [rsp + nb202nf_jjnr]	shl   ecx, 2	add   rax, rcx	mov   [rsp + nb202nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 	mov   ecx, edx	sub   edx,  2	add   ecx, [rsp + nb202nf_ninner]	mov   [rsp + nb202nf_ninner], ecx	add   edx, 0	mov   [rsp + nb202nf_innerk], edx    ;# number of innerloop atoms 	jge   .nb202nf_unroll_loop	jmp   .nb202nf_checksingle.nb202nf_unroll_loop:	;# twice unrolled innerloop here 	mov   rdx, [rsp + nb202nf_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [rdx]		mov   ebx, [rdx + 4] 		add qword ptr [rsp + nb202nf_innerjjnr],  8	;# advance pointer (unrolled 2) 	mov rsi, [rbp + nb202nf_pos]       ;# base of pos[] 	lea   rax, [rax + rax*2]     ;# replace jnr with j3 	lea   rbx, [rbx + rbx*2]			;# move j coordinates to local temp variables 	movlpd xmm2, [rsi + rax*8]	movlpd xmm3, [rsi + rax*8 + 8]	movlpd xmm4, [rsi + rax*8 + 16]	movlpd xmm5, [rsi + rax*8 + 24]	movlpd xmm6, [rsi + rax*8 + 32]	movlpd xmm7, [rsi + rax*8 + 40]	movhpd xmm2, [rsi + rbx*8]	movhpd xmm3, [rsi + rbx*8 + 8]	movhpd xmm4, [rsi + rbx*8 + 16]	movhpd xmm5, [rsi + rbx*8 + 24]	movhpd xmm6, [rsi + rbx*8 + 32]	movhpd xmm7, [rsi + rbx*8 + 40]	movapd 	[rsp + nb202nf_jxO], xmm2	movapd 	[rsp + nb202nf_jyO], xmm3	movapd 	[rsp + nb202nf_jzO], xmm4	movapd 	[rsp + nb202nf_jxH1], xmm5	movapd 	[rsp + nb202nf_jyH1], xmm6	movapd 	[rsp + nb202nf_jzH1], xmm7	movlpd xmm2, [rsi + rax*8 + 48]	movlpd xmm3, [rsi + rax*8 + 56]	movlpd xmm4, [rsi + rax*8 + 64]	movhpd xmm2, [rsi + rbx*8 + 48]	movhpd xmm3, [rsi + rbx*8 + 56]	movhpd xmm4, [rsi + rbx*8 + 64]	movapd 	[rsp + nb202nf_jxH2], xmm2	movapd 	[rsp + nb202nf_jyH2], xmm3	movapd 	[rsp + nb202nf_jzH2], xmm4		movapd xmm0, [rsp + nb202nf_ixO]	movapd xmm1, [rsp + nb202nf_iyO]	movapd xmm2, [rsp + nb202nf_izO]	movapd xmm3, [rsp + nb202nf_ixO]	movapd xmm4, [rsp + nb202nf_iyO]	movapd xmm5, [rsp + nb202nf_izO]	subpd  xmm0, [rsp + nb202nf_jxO]	subpd  xmm1, [rsp + nb202nf_jyO]	
subpd  xmm2, [rsp + nb202nf_jzO]	subpd  xmm3, [rsp + nb202nf_jxH1]	subpd  xmm4, [rsp + nb202nf_jyH1]	subpd  xmm5, [rsp + nb202nf_jzH1]	mulpd  xmm0, xmm0	mulpd  xmm1, xmm1	mulpd  xmm2, xmm2	mulpd  xmm3, xmm3	mulpd  xmm4, xmm4	mulpd  xmm5, xmm5	addpd  xmm0, xmm1	addpd  xmm0, xmm2	addpd  xmm3, xmm4	addpd  xmm3, xmm5	movapd [rsp + nb202nf_rsqOO], xmm0	movapd [rsp + nb202nf_rsqOH1], xmm3	movapd xmm0, [rsp + nb202nf_ixO]	movapd xmm1, [rsp + nb202nf_iy

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?