nb_kernel101_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,627 行 · 第 1/3 页

S
1,627
字号
;#
;# $Id: nb_kernel101_x86_64_sse2.intel_syntax.s,v 1.1.2.3 2007/09/13 05:30:45 lindahl Exp $
;#
;# Gromacs 4.0                         Copyright (c) 1991-2003
;# David van der Spoel, Erik Lindahl
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU General Public License
;# as published by the Free Software Foundation; either version 2
;# of the License, or (at your option) any later version.
;#
;# To help us fund GROMACS development, we humbly ask that you cite
;# the research papers on the package. Check out http://www.gromacs.org
;#
;# And Hey:
;# Gnomes, ROck Monsters And Chili Sauce
;#
;# These files require GNU binutils 2.10 or later, since we
;# use intel syntax for portability, or a recent version
;# of NASM that understands Extended 3DNow and SSE2 instructions.
;# (NASM is normally only used with MS Visual C++).
;# Since NASM and gnu as disagree on some definitions and use
;# completely different preprocessing options I have to introduce a
;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
;# reason why all comments need both symbols...
;# The source is written for GNU as, with intel syntax. When you use
;# NASM we redefine a couple of things.
The false if-statement around ;# the following code is seen by GNU as, but NASM doesn't see it, so ;# the code inside is read by NASM but not gcc.; .if 0    # block below only read by NASM%define .section	section%define .long		dd%define .align		align%define .globl		global;# NASM only wants 'dword', not 'dword ptr'.%define ptr.equiv          .equiv                  2   %1 equ %2%endmacro; .endif                   # End of NASM-specific block; .intel_syntax noprefix   # Line only read by gnu as		.globl nb_kernel101_x86_64_sse2.globl _nb_kernel101_x86_64_sse2nb_kernel101_x86_64_sse2:	_nb_kernel101_x86_64_sse2:	;#	Room for return address and rbp (16 bytes).equiv          nb101_fshift,           16.equiv          nb101_gid,              24.equiv          nb101_pos,              32.equiv          nb101_faction,          40.equiv          nb101_charge,           48.equiv          nb101_p_facel,          56.equiv          nb101_argkrf,           64.equiv          nb101_argcrf,           72.equiv          nb101_Vc,               80.equiv          nb101_type,             88.equiv          nb101_p_ntype,          96.equiv          nb101_vdwparam,         104.equiv          nb101_Vvdw,             112.equiv          nb101_p_tabscale,       120.equiv          nb101_VFtab,            128.equiv          nb101_invsqrta,         136.equiv          nb101_dvda,             144.equiv          nb101_p_gbtabscale,     152.equiv          nb101_GBtab,            160.equiv          nb101_p_nthreads,       168.equiv          nb101_count,            176.equiv          nb101_mtx,              184.equiv          nb101_outeriter,        192.equiv          nb101_inneriter,        200.equiv          nb101_work,             208	;# stack offsets for local variables  	;# bottom of stack is cache-aligned for sse2 use .equiv          nb101_ixO,              0.equiv          nb101_iyO,              16.equiv          nb101_izO,              32.equiv          nb101_ixH1,             48.equiv          
nb101_iyH1,             64.equiv          nb101_izH1,             80.equiv          nb101_ixH2,             96.equiv          nb101_iyH2,             112.equiv          nb101_izH2,             128.equiv          nb101_iqO,              144.equiv          nb101_iqH,              160.equiv          nb101_dxO,              176.equiv          nb101_dyO,              192.equiv          nb101_dzO,              208.equiv          nb101_dxH1,             224.equiv          nb101_dyH1,             240.equiv          nb101_dzH1,             256.equiv          nb101_dxH2,             272.equiv          nb101_dyH2,             288.equiv          nb101_dzH2,             304.equiv          nb101_qqO,              320.equiv          nb101_qqH,              336.equiv          nb101_vctot,            352.equiv          nb101_fixO,             368.equiv          nb101_fiyO,             384.equiv          nb101_fizO,             400.equiv          nb101_fixH1,            416.equiv          nb101_fiyH1,            432.equiv          nb101_fizH1,            448.equiv          nb101_fixH2,            464.equiv          nb101_fiyH2,            480.equiv          nb101_fizH2,            496.equiv          nb101_fjx,              512.equiv          nb101_fjy,              528.equiv          nb101_fjz,              544.equiv          nb101_half,             560.equiv          nb101_three,            576.equiv          nb101_nri,              592.equiv          nb101_innerjjnr,        600.equiv          nb101_iinr,             608.equiv          nb101_jindex,           616.equiv          nb101_jjnr,             624.equiv          nb101_shift,            632.equiv          nb101_shiftvec,         640.equiv          nb101_facel,            648.equiv          nb101_is3,              656.equiv          nb101_ii3,              660.equiv          nb101_innerk,           664.equiv          nb101_n,                668.equiv          nb101_nn1,              672.equiv          nb101_nouter,           
676.equiv          nb101_ninner,           680	push rbp	mov  rbp, rsp	push rbx		emms        push r12        push r13        push r14        push r15	sub rsp, 696		;# local variable stack space (n*16+8)	;# zero 32-bit iteration counters	mov eax, 0	mov [rsp + nb101_nouter], eax	mov [rsp + nb101_ninner], eax	mov edi, [rdi]	mov [rsp + nb101_nri], edi	mov [rsp + nb101_iinr], rsi	mov [rsp + nb101_jindex], rdx	mov [rsp + nb101_jjnr], rcx	mov [rsp + nb101_shift], r8	mov [rsp + nb101_shiftvec], r9	mov rsi, [rbp + nb101_p_facel]	movsd xmm0, [rsi]	movsd [rsp + nb101_facel], xmm0	;# create constant floating-point factors on stack	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)	mov ebx, 0x3fe00000	mov [rsp + nb101_half], eax	mov [rsp + nb101_half+4], ebx	movsd xmm1, [rsp + nb101_half]	shufpd xmm1, xmm1, 0    ;# splat to all elements	movapd xmm3, xmm1	addpd  xmm3, xmm3       ;# one	movapd xmm2, xmm3	addpd  xmm2, xmm2       ;# two	addpd  xmm3, xmm2	;# three	movapd [rsp + nb101_half], xmm1	movapd [rsp + nb101_three], xmm3	;# assume we have at least one i particle - start directly 	mov   rcx, [rsp + nb101_iinr]       ;# rcx = pointer into iinr[] 		mov   ebx, [rcx]	    ;# ebx =ii 	mov   rdx, [rbp + nb101_charge]	movsd xmm3, [rdx + rbx*8]		movsd xmm4, [rdx + rbx*8 + 8]		mov rsi, [rbp + nb101_p_facel]	movsd xmm0, [rsi]	movsd xmm5, [rsp + nb101_facel]	mulsd  xmm3, xmm5	mulsd  xmm4, xmm5	shufpd xmm3, xmm3, 0	shufpd xmm4, xmm4, 0	movapd [rsp + nb101_iqO], xmm3	movapd [rsp + nb101_iqH], xmm4	.nb101_threadloop:        mov   rsi, [rbp + nb101_count]          ;# pointer to sync counter        mov   eax, [rsi].nb101_spinlock:        mov   ebx, eax                          ;# ebx=*count=nn0        add   ebx, 1                           ;# ebx=nn1=nn0+10        lock        cmpxchg [esi], ebx                      ;# write nn1 to *counter,                                                ;# if it hasnt changed.                                                
;# or reread *counter to eax.        pause                                   ;# -> better p4 performance        jnz .nb101_spinlock        ;# if(nn1>nri) nn1=nri        mov ecx, [rsp + nb101_nri]        mov edx, ecx        sub ecx, ebx        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri        ;# Cleared the spinlock if we got here.        ;# eax contains nn0, ebx contains nn1.        mov [rsp + nb101_n], eax        mov [rsp + nb101_nn1], ebx        sub ebx, eax                            ;# calc number of outer lists	mov esi, eax				;# copy n to esi        jg  .nb101_outerstart        jmp .nb101_end.nb101_outerstart:	;# ebx contains number of outer iterations	add ebx, [rsp + nb101_nouter]	mov [rsp + nb101_nouter], ebx.nb101_outer:	mov   rax, [rsp + nb101_shift]      ;# rax = pointer into shift[] 	mov   ebx, [rax + rsi*4]		;# rbx=shift[n] 		lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 	mov   [rsp + nb101_is3],ebx    	;# store is3 	mov   rax, [rsp + nb101_shiftvec]   ;# rax = base of shiftvec[] 	movsd xmm0, [rax + rbx*8]	movsd xmm1, [rax + rbx*8 + 8]	movsd xmm2, [rax + rbx*8 + 16] 	mov   rcx, [rsp + nb101_iinr]       ;# rcx = pointer into iinr[] 		mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 	movapd xmm3, xmm0	movapd xmm4, xmm1	movapd xmm5, xmm2	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 	mov   rax, [rbp + nb101_pos]    ;# rax = base of pos[]  	mov   [rsp + nb101_ii3], ebx	addsd xmm3, [rax + rbx*8]	addsd xmm4, [rax + rbx*8 + 8]	addsd xmm5, [rax + rbx*8 + 16]			shufpd xmm3, xmm3, 0	shufpd xmm4, xmm4, 0	shufpd xmm5, xmm5, 0	movapd [rsp + nb101_ixO], xmm3	movapd [rsp + nb101_iyO], xmm4	movapd [rsp + nb101_izO], xmm5	movsd xmm3, xmm0	movsd xmm4, xmm1	movsd xmm5, xmm2	addsd xmm0, [rax + rbx*8 + 24]	addsd xmm1, [rax + rbx*8 + 32]	addsd xmm2, [rax + rbx*8 + 40]			addsd xmm3, [rax + rbx*8 + 48]	addsd xmm4, [rax + rbx*8 + 56]	addsd xmm5, [rax + rbx*8 + 64]			shufpd xmm0, xmm0, 0	shufpd xmm1, xmm1, 0	shufpd xmm2, xmm2, 0	shufpd xmm3, xmm3, 0	shufpd xmm4, xmm4, 0	shufpd 
xmm5, xmm5, 0	movapd [rsp + nb101_ixH1], xmm0	movapd [rsp + nb101_iyH1], xmm1	movapd [rsp + nb101_izH1], xmm2	movapd [rsp + nb101_ixH2], xmm3	movapd [rsp + nb101_iyH2], xmm4	movapd [rsp + nb101_izH2], xmm5		;# clear vctot and i forces 	xorpd xmm4, xmm4	movapd [rsp + nb101_vctot], xmm4	movapd [rsp + nb101_fixO], xmm4	movapd [rsp + nb101_fiyO], xmm4	movapd [rsp + nb101_fizO], xmm4	movapd [rsp + nb101_fixH1], xmm4	movapd [rsp + nb101_fiyH1], xmm4	movapd [rsp + nb101_fizH1], xmm4	movapd [rsp + nb101_fixH2], xmm4	movapd [rsp + nb101_fiyH2], xmm4	movapd [rsp + nb101_fizH2], xmm4		mov   rax, [rsp + nb101_jindex]	mov   ecx, [rax+rsi*4]	     ;# jindex[n] 	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 	sub   edx, ecx               ;# number of innerloop atoms 	mov   rsi, [rbp + nb101_pos]	mov   rdi, [rbp + nb101_faction]		mov   rax, [rsp + nb101_jjnr]	shl   ecx, 2	add   rax, rcx	mov   [rsp + nb101_innerjjnr], rax     ;# pointer to jjnr[nj0] 	mov   ecx, edx	sub   edx,  2	add   ecx, [rsp + nb101_ninner]	mov   [rsp + nb101_ninner], ecx	add   edx, 0	mov   [rsp + nb101_innerk], edx    ;# number of innerloop atoms 	jge   .nb101_unroll_loop	jmp   .nb101_checksingle.nb101_unroll_loop:	;# twice unrolled innerloop here 	mov   rdx, [rsp + nb101_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [rdx]		mov   ebx, [rdx + 4]              	add qword ptr [rsp + nb101_innerjjnr],  8 ;# advance pointer (unrolled 2) 	mov rsi, [rbp + nb101_charge]    ;# base of charge[] 		movlpd xmm6, [rsi + rax*8]	;# jq A 	movhpd xmm6, [rsi + rbx*8]	;# jq B 	movapd xmm3, [rsp + nb101_iqO]	movapd xmm4, [rsp + nb101_iqH]	mulpd xmm3, xmm6		;# qqO 	mulpd xmm4, xmm6		;# qqH 		movapd  [rsp + nb101_qqO], xmm3	movapd  [rsp + nb101_qqH], xmm4		mov rsi, [rbp + nb101_pos]       ;# base of pos[] 	lea   rax, [rax + rax*2]     ;# replace jnr with j3 	lea   rbx, [rbx + rbx*2]		;# move j coordinates to local temp variables     movlpd xmm0, [rsi + rax*8]     movlpd xmm1, [rsi + rax*8 + 8]     movlpd xmm2, [rsi + rax*8 + 16]   
  movhpd xmm0, [rsi + rbx*8]     movhpd xmm1, [rsi + rbx*8 + 8]     movhpd xmm2, [rsi + rbx*8 + 16]     ;# xmm0 = jx    ;# xmm1 = jy    ;# xmm2 = jz            movapd xmm3, xmm0    movapd xmm4, xmm1    movapd xmm5, xmm2    movapd xmm6, xmm0    movapd xmm7, xmm1    movapd xmm8, xmm2        subpd xmm0, [rsp + nb101_ixO]    subpd xmm1, [rsp + nb101_iyO]    subpd xmm2, [rsp + nb101_izO]    subpd xmm3, [rsp + nb101_ixH1]    subpd xmm4, [rsp + nb101_iyH1]    subpd xmm5, [rsp + nb101_izH1]    subpd xmm6, [rsp + nb101_ixH2]    subpd xmm7, [rsp + nb101_iyH2]    subpd xmm8, [rsp + nb101_izH2]    	movapd [rsp + nb101_dxO], xmm0	movapd [rsp + nb101_dyO], xmm1	movapd [rsp + nb101_dzO], xmm2	mulpd  xmm0, xmm0	mulpd  xmm1, xmm1	mulpd  xmm2, xmm2	movapd [rsp + nb101_dxH1], xmm3	movapd [rsp + nb101_dyH1], xmm4	movapd [rsp + nb101_dzH1], xmm5	mulpd  xmm3, xmm3	mulpd  xmm4, xmm4	mulpd  xmm5, xmm5	movapd [rsp + nb101_dxH2], xmm6	movapd [rsp + nb101_dyH2], xmm7	movapd [rsp + nb101_dzH2], xmm8	mulpd  xmm6, xmm6	mulpd  xmm7, xmm7	mulpd  xmm8, xmm8	addpd  xmm0, xmm1	addpd  xmm0, xmm2	addpd  xmm3, xmm4	addpd  xmm3, xmm5    addpd  xmm6, xmm7    addpd  xmm6, xmm8	;# start doing invsqrt for j atoms    cvtpd2ps xmm1, xmm0    cvtpd2ps xmm4, xmm3    cvtpd2ps xmm7, xmm6	rsqrtps xmm1, xmm1	rsqrtps xmm4, xmm4    rsqrtps xmm7, xmm7    cvtps2pd xmm1, xmm1    cvtps2pd xmm4, xmm4    cvtps2pd xmm7, xmm7		movapd  xmm2, xmm1	movapd  xmm5, xmm4    movapd  xmm8, xmm7    	mulpd   xmm1, xmm1 ;# lu*lu	mulpd   xmm4, xmm4 ;# lu*lu    mulpd   xmm7, xmm7 ;# lu*lu			movapd  xmm9, [rsp + nb101_three]	movapd  xmm10, xmm9    movapd  xmm11, xmm9	mulpd   xmm1, xmm0 ;# rsq*lu*lu	mulpd   xmm4, xmm3 ;# rsq*lu*lu     mulpd   xmm7, xmm6 ;# rsq*lu*lu		subpd   xmm9, xmm1	subpd   xmm10, xmm4    subpd   xmm11, xmm7 ;# 3-rsq*lu*lu	mulpd   xmm9, xmm2	mulpd   xmm10, xmm5    mulpd   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb101_half]	mulpd   xmm9, xmm15  ;# first iteration for rinvO	mulpd   xmm10, xmm15 ;# first 
iteration for rinvH1    mulpd   xmm11, xmm15 ;# first iteration for rinvH2    ;# second iteration step    	movapd  xmm2, xmm9	movapd  xmm5, xmm10    movapd  xmm8, xmm11    	mulpd   xmm2, xmm2 ;# lu*lu	mulpd   xmm5, xmm5 ;# lu*lu    mulpd   xmm8, xmm8 ;# lu*lu			movapd  xmm1, [rsp + nb101_three]	movapd  xmm4, xmm1    movapd  xmm7, xmm1	mulpd   xmm2, xmm0 ;# rsq*lu*lu	mulpd   xmm5, xmm3 ;# rsq*lu*lu     mulpd   xmm8, xmm6 ;# rsq*lu*lu		subpd   xmm1, xmm2	subpd   xmm4, xmm5    subpd   xmm7, xmm8 ;# 3-rsq*lu*lu	mulpd   xmm9, xmm1	mulpd   xmm10, xmm4    mulpd   xmm11, xmm7 ;# lu*(3-rsq*lu*lu)	movapd  xmm15, [rsp + nb101_half]	mulpd   xmm9, xmm15  ;#  rinvO 	mulpd   xmm10, xmm15 ;#   rinvH1    mulpd   xmm11, xmm15 ;#   rinvH2		;# interactions     movapd xmm0, xmm9    movapd xmm1, xmm10    movapd xmm2, xmm11    mulpd  xmm9, xmm9    mulpd  xmm10, xmm10    mulpd  xmm11, xmm11    mulpd  xmm0, [rsp + nb101_qqO]     mulpd  xmm1, [rsp + nb101_qqH]     mulpd  xmm2, [rsp + nb101_qqH]     mulpd  xmm9, xmm0    mulpd  xmm10, xmm1    mulpd  xmm11, xmm2        addpd xmm0, [rsp + nb101_vctot]     addpd xmm1, xmm2    addpd xmm0, xmm1    movapd [rsp + nb101_vctot], xmm0        ;# move j forces to xmm0-xmm2	mov   rdi, [rbp + nb101_faction]		movlpd xmm0, [rdi + rax*8]	movlpd xmm1, [rdi + rax*8 + 8]	movlpd xmm2, [rdi + rax*8 + 16]	movhpd xmm0, [rdi + rbx*8]	movhpd xmm1, [rdi + rbx*8 + 8]	movhpd xmm2, [rdi + rbx*8 + 16]    movapd xmm7, xmm9    movapd xmm8, xmm9    movapd xmm13, xmm11    movapd xmm14, xmm11    movapd xmm15, xmm11    movapd xmm11, xmm10    movapd xmm12, xmm10	mulpd xmm7, [rsp + nb101_dxO]	mulpd xmm8, [rsp + nb101_dyO]	mulpd xmm9, [rsp + nb101_dzO]	mulpd xmm10, [rsp + nb101_dxH1]	mulpd xmm11, [rsp + nb101_dyH1]	mulpd xmm12, [rsp + nb101_dzH1]	mulpd xmm13, [rsp + nb101_dxH2]	mulpd xmm14, [rsp + nb101_dyH2]	mulpd xmm15, [rsp + nb101_dzH2]    addpd xmm0, xmm7    addpd xmm1, xmm8    addpd xmm2, xmm9    addpd xmm7, [rsp + nb101_fixO]    addpd xmm8, [rsp + nb101_fiyO]    addpd xmm9, 
[rsp + nb101_fizO]    addpd xmm0, xmm10    addpd xmm1, xmm11    addpd xmm2, xmm12    addpd xmm10, [rsp + nb101_fixH1]    addpd xmm11, [rsp + nb101_fiyH1]    addpd xmm12, [rsp + nb101_fizH1]    addpd xmm0, xmm13    addpd xmm1, xmm14    addpd xmm2, xmm15    addpd xmm13, [rsp + nb101_fixH2]    addpd xmm14, [rsp + nb101_fiyH2]    addpd xmm15, [rsp + nb101_fizH2]    movapd [rsp + nb101_fixO], xmm7    movapd [rsp + nb101_fiyO], xmm8    movapd [rsp + nb101_fizO], xmm9    movapd [rsp + nb101_fixH1], xmm10    movapd [rsp + nb101_fiyH1], xmm11    movapd [rsp + nb101_fizH1], xmm12    movapd [rsp + nb101_fixH2], xmm13    movapd [rsp + nb101_fiyH2], xmm14    movapd [rsp + nb101_fizH2], xmm15       ;# store back j forces from xmm0-xmm2

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?