nb_kernel301_x86_64_sse2.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,119 行 · 第 1/4 页
S
2,119 行
;#
;# $Id: nb_kernel301_x86_64_sse2.intel_syntax.s,v 1.1.2.2 2006/09/22 08:40:36 lindahl Exp $
;#
;# Gromacs 4.0                         Copyright (c) 1991-2003
;# David van der Spoel, Erik Lindahl
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU General Public License
;# as published by the Free Software Foundation; either version 2
;# of the License, or (at your option) any later version.
;#
;# To help us fund GROMACS development, we humbly ask that you cite
;# the research papers on the package. Check out http://www.gromacs.org
;#
;# And Hey:
;# Gnomes, ROck Monsters And Chili Sauce
;#
;# These files require GNU binutils 2.10 or later, since we
;# use intel syntax for portability, or a recent version
;# of NASM that understands Extended 3DNow and SSE2 instructions.
;# (NASM is normally only used with MS Visual C++).
;# Since NASM and gnu as disagree on some definitions and use
;# completely different preprocessing options I have to introduce a
;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
;# reason why all comments need both symbols...
;# The source is written for GNU as, with intel syntax. When you use
;# NASM we redefine a couple of things.
The false if-statement around ;# the following code is seen by GNU as, but NASM doesn't see it, so ;# the code inside is read by NASM but not gcc.; .if 0 # block below only read by NASM%define .section section%define .long dd%define .align align%define .globl global;# NASM only wants 'dword', not 'dword ptr'.%define ptr.equiv .equiv 2 %1 equ %2%endmacro; .endif # End of NASM-specific block; .intel_syntax noprefix # Line only read by gnu as .globl nb_kernel301_x86_64_sse2.globl _nb_kernel301_x86_64_sse2nb_kernel301_x86_64_sse2: _nb_kernel301_x86_64_sse2: ;# Room for return address and rbp (16 bytes).equiv nb301_fshift, 16.equiv nb301_gid, 24.equiv nb301_pos, 32.equiv nb301_faction, 40.equiv nb301_charge, 48.equiv nb301_p_facel, 56.equiv nb301_argkrf, 64.equiv nb301_argcrf, 72.equiv nb301_Vc, 80.equiv nb301_type, 88.equiv nb301_p_ntype, 96.equiv nb301_vdwparam, 104.equiv nb301_Vvdw, 112.equiv nb301_p_tabscale, 120.equiv nb301_VFtab, 128.equiv nb301_invsqrta, 136.equiv nb301_dvda, 144.equiv nb301_p_gbtabscale, 152.equiv nb301_GBtab, 160.equiv nb301_p_nthreads, 168.equiv nb301_count, 176.equiv nb301_mtx, 184.equiv nb301_outeriter, 192.equiv nb301_inneriter, 200.equiv nb301_work, 208 ;# stack offsets for local variables ;# bottom of stack is cache-aligned for sse2 use .equiv nb301_ixO, 0.equiv nb301_iyO, 16.equiv nb301_izO, 32.equiv nb301_ixH1, 48.equiv nb301_iyH1, 64.equiv nb301_izH1, 80.equiv nb301_ixH2, 96.equiv nb301_iyH2, 112.equiv nb301_izH2, 128.equiv nb301_iqO, 144.equiv nb301_iqH, 160.equiv nb301_dxO, 176.equiv nb301_dyO, 192.equiv nb301_dzO, 208.equiv nb301_dxH1, 224.equiv nb301_dyH1, 240.equiv nb301_dzH1, 256.equiv nb301_dxH2, 272.equiv nb301_dyH2, 288.equiv nb301_dzH2, 304.equiv nb301_qqO, 320.equiv nb301_qqH, 336.equiv nb301_rinvO, 352.equiv nb301_rinvH1, 368.equiv nb301_rinvH2, 384.equiv nb301_rO, 400.equiv nb301_rH1, 416.equiv nb301_rH2, 432.equiv nb301_tsc, 448.equiv nb301_two, 464.equiv nb301_vctot, 480.equiv nb301_fixO, 496.equiv nb301_fiyO, 512.equiv 
nb301_fizO, 528.equiv nb301_fixH1, 544.equiv nb301_fiyH1, 560.equiv nb301_fizH1, 576.equiv nb301_fixH2, 592.equiv nb301_fiyH2, 608.equiv nb301_fizH2, 624.equiv nb301_fjx, 640.equiv nb301_fjy, 656.equiv nb301_fjz, 672.equiv nb301_epsO, 688.equiv nb301_epsH1, 704.equiv nb301_epsH2, 720.equiv nb301_half, 736.equiv nb301_three, 752.equiv nb301_is3, 768.equiv nb301_ii3, 772.equiv nb301_nri, 776.equiv nb301_iinr, 780.equiv nb301_jindex, 788.equiv nb301_jjnr, 796.equiv nb301_shift, 804.equiv nb301_shiftvec, 812.equiv nb301_facel, 820.equiv nb301_innerjjnr, 828.equiv nb301_innerk, 836.equiv nb301_n, 840.equiv nb301_nn1, 844.equiv nb301_nouter, 848.equiv nb301_ninner, 852 push rbp mov rbp, rsp push rbx emms push r12 push r13 push r14 push r15 sub rsp, 872 ;# local variable stack space (n*16+8) ;# zero 32-bit iteration counters mov eax, 0 mov [rsp + nb301_nouter], eax mov [rsp + nb301_ninner], eax mov edi, [rdi] mov [rsp + nb301_nri], edi mov [rsp + nb301_iinr], rsi mov [rsp + nb301_jindex], rdx mov [rsp + nb301_jjnr], rcx mov [rsp + nb301_shift], r8 mov [rsp + nb301_shiftvec], r9 mov rsi, [rbp + nb301_p_facel] movsd xmm0, [rsi] movsd [rsp + nb301_facel], xmm0 mov rax, [rbp + nb301_p_tabscale] movsd xmm3, [rax] shufpd xmm3, xmm3, 0 movapd [rsp + nb301_tsc], xmm3 ;# create constant floating-point factors on stack mov eax, 0x00000000 ;# lower half of double half IEEE (hex) mov ebx, 0x3fe00000 mov [rsp + nb301_half], eax mov [rsp + nb301_half+4], ebx movsd xmm1, [rsp + nb301_half] shufpd xmm1, xmm1, 0 ;# splat to all elements movapd xmm3, xmm1 addpd xmm3, xmm3 ;# one movapd xmm2, xmm3 addpd xmm2, xmm2 ;# two addpd xmm3, xmm2 ;# three movapd [rsp + nb301_half], xmm1 movapd [rsp + nb301_two], xmm2 movapd [rsp + nb301_three], xmm3 ;# assume we have at least one i particle - start directly mov rcx, [rsp + nb301_iinr] ;# rcx = pointer into iinr[] mov ebx, [rcx] ;# ebx =ii mov rdx, [rbp + nb301_charge] movsd xmm3, [rdx + rbx*8] movsd xmm4, [rdx + rbx*8 + 8] mov rsi, [rbp + 
nb301_p_facel] movsd xmm0, [rsi] movsd xmm5, [rsp + nb301_facel] mulsd xmm3, xmm5 mulsd xmm4, xmm5 shufpd xmm3, xmm3, 0 shufpd xmm4, xmm4, 0 movapd [rsp + nb301_iqO], xmm3 movapd [rsp + nb301_iqH], xmm4 .nb301_threadloop: mov rsi, [rbp + nb301_count] ;# pointer to sync counter mov eax, [rsi].nb301_spinlock: mov ebx, eax ;# ebx=*count=nn0 add ebx, 1 ;# ebx=nn1=nn0+10 lock cmpxchg [esi], ebx ;# write nn1 to *counter, ;# if it hasnt changed. ;# or reread *counter to eax. pause ;# -> better p4 performance jnz .nb301_spinlock ;# if(nn1>nri) nn1=nri mov ecx, [rsp + nb301_nri] mov edx, ecx sub ecx, ebx cmovle ebx, edx ;# if(nn1>nri) nn1=nri ;# Cleared the spinlock if we got here. ;# eax contains nn0, ebx contains nn1. mov [rsp + nb301_n], eax mov [rsp + nb301_nn1], ebx sub ebx, eax ;# calc number of outer lists mov esi, eax ;# copy n to esi jg .nb301_outerstart jmp .nb301_end.nb301_outerstart: ;# ebx contains number of outer iterations add ebx, [rsp + nb301_nouter] mov [rsp + nb301_nouter], ebx.nb301_outer: mov rax, [rsp + nb301_shift] ;# rax = pointer into shift[] mov ebx, [rax+rsi*4] ;# rbx=shift[n] lea rbx, [rbx + rbx*2] ;# rbx=3*is mov [rsp + nb301_is3],ebx ;# store is3 mov rax, [rsp + nb301_shiftvec] ;# rax = base of shiftvec[] movsd xmm0, [rax + rbx*8] movsd xmm1, [rax + rbx*8 + 8] movsd xmm2, [rax + rbx*8 + 16] mov rcx, [rsp + nb301_iinr] ;# rcx = pointer into iinr[] mov ebx, [rcx+rsi*4] ;# ebx =ii movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 mov rax, [rbp + nb301_pos] ;# rax = base of pos[] mov [rsp + nb301_ii3], ebx addsd xmm3, [rax + rbx*8] addsd xmm4, [rax + rbx*8 + 8] addsd xmm5, [rax + rbx*8 + 16] shufpd xmm3, xmm3, 0 shufpd xmm4, xmm4, 0 shufpd xmm5, xmm5, 0 movapd [rsp + nb301_ixO], xmm3 movapd [rsp + nb301_iyO], xmm4 movapd [rsp + nb301_izO], xmm5 movsd xmm3, xmm0 movsd xmm4, xmm1 movsd xmm5, xmm2 addsd xmm0, [rax + rbx*8 + 24] addsd xmm1, [rax + rbx*8 + 32] addsd xmm2, [rax + rbx*8 + 40] addsd xmm3, [rax + 
rbx*8 + 48] addsd xmm4, [rax + rbx*8 + 56] addsd xmm5, [rax + rbx*8 + 64] shufpd xmm0, xmm0, 0 shufpd xmm1, xmm1, 0 shufpd xmm2, xmm2, 0 shufpd xmm3, xmm3, 0 shufpd xmm4, xmm4, 0 shufpd xmm5, xmm5, 0 movapd [rsp + nb301_ixH1], xmm0 movapd [rsp + nb301_iyH1], xmm1 movapd [rsp + nb301_izH1], xmm2 movapd [rsp + nb301_ixH2], xmm3 movapd [rsp + nb301_iyH2], xmm4 movapd [rsp + nb301_izH2], xmm5 ;# clear vctot and i forces xorpd xmm4, xmm4 movapd [rsp + nb301_vctot], xmm4 movapd [rsp + nb301_fixO], xmm4 movapd [rsp + nb301_fiyO], xmm4 movapd [rsp + nb301_fizO], xmm4 movapd [rsp + nb301_fixH1], xmm4 movapd [rsp + nb301_fiyH1], xmm4 movapd [rsp + nb301_fizH1], xmm4 movapd [rsp + nb301_fixH2], xmm4 movapd [rsp + nb301_fiyH2], xmm4 movapd [rsp + nb301_fizH2], xmm4 mov rax, [rsp + nb301_jindex] mov ecx, [rax + rsi*4] ;# jindex[n] mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] sub edx, ecx ;# number of innerloop atoms mov rsi, [rbp + nb301_pos] mov rdi, [rbp + nb301_faction] mov rax, [rsp + nb301_jjnr] shl ecx, 2 add rax, rcx mov [rsp + nb301_innerjjnr], rax ;# pointer to jjnr[nj0] mov ecx, edx sub edx, 2 add ecx, [rsp + nb301_ninner] mov [rsp + nb301_ninner], ecx add edx, 0 mov [rsp + nb301_innerk], edx ;# number of innerloop atoms jge .nb301_unroll_loop jmp .nb301_checksingle.nb301_unroll_loop: ;# twice unrolled innerloop here mov rdx, [rsp + nb301_innerjjnr] ;# pointer to jjnr[k] mov eax, [rdx] mov ebx, [rdx + 4] add qword ptr [rsp + nb301_innerjjnr], 8 ;# advance pointer (unrolled 2) mov rsi, [rbp + nb301_charge] ;# base of charge[] movlpd xmm3, [rsi + rax*8] movhpd xmm3, [rsi + rbx*8] movapd xmm4, xmm3 mulpd xmm3, [rsp + nb301_iqO] mulpd xmm4, [rsp + nb301_iqH] movapd [rsp + nb301_qqO], xmm3 movapd [rsp + nb301_qqH], xmm4 mov rsi, [rbp + nb301_pos] ;# base of pos[] lea rax, [rax + rax*2] ;# replace jnr with j3 lea rbx, [rbx + rbx*2] ;# move j coordinates to local temp variables movlpd xmm0, [rsi + rax*8] movlpd xmm1, [rsi + rax*8 + 8] movlpd xmm2, [rsi + rax*8 + 16] movhpd 
xmm0, [rsi + rbx*8] movhpd xmm1, [rsi + rbx*8 + 8] movhpd xmm2, [rsi + rbx*8 + 16] ;# xmm0 = jx ;# xmm1 = jy ;# xmm2 = jz movapd xmm3, xmm0 movapd xmm4, xmm1 movapd xmm5, xmm2 movapd xmm6, xmm0 movapd xmm7, xmm1 movapd xmm8, xmm2 subpd xmm0, [rsp + nb301_ixO] subpd xmm1, [rsp + nb301_iyO] subpd xmm2, [rsp + nb301_izO] subpd xmm3, [rsp + nb301_ixH1] subpd xmm4, [rsp + nb301_iyH1] subpd xmm5, [rsp + nb301_izH1] subpd xmm6, [rsp + nb301_ixH2] subpd xmm7, [rsp + nb301_iyH2] subpd xmm8, [rsp + nb301_izH2] movapd [rsp + nb301_dxO], xmm0 movapd [rsp + nb301_dyO], xmm1 movapd [rsp + nb301_dzO], xmm2 mulpd xmm0, xmm0 mulpd xmm1, xmm1 mulpd xmm2, xmm2 movapd [rsp + nb301_dxH1], xmm3 movapd [rsp + nb301_dyH1], xmm4 movapd [rsp + nb301_dzH1], xmm5 mulpd xmm3, xmm3 mulpd xmm4, xmm4 mulpd xmm5, xmm5 movapd [rsp + nb301_dxH2], xmm6 movapd [rsp + nb301_dyH2], xmm7 movapd [rsp + nb301_dzH2], xmm8 mulpd xmm6, xmm6 mulpd xmm7, xmm7 mulpd xmm8, xmm8 addpd xmm0, xmm1 addpd xmm0, xmm2 addpd xmm3, xmm4 addpd xmm3, xmm5 addpd xmm6, xmm7 addpd xmm6, xmm8 ;# start doing invsqrt for j atoms cvtpd2ps xmm1, xmm0 cvtpd2ps xmm4, xmm3 cvtpd2ps xmm7, xmm6 rsqrtps xmm1, xmm1 rsqrtps xmm4, xmm4 rsqrtps xmm7, xmm7 cvtps2pd xmm1, xmm1 cvtps2pd xmm4, xmm4 cvtps2pd xmm7, xmm7 movapd xmm2, xmm1 movapd xmm5, xmm4 movapd xmm8, xmm7 mulpd xmm1, xmm1 ;# lu*lu mulpd xmm4, xmm4 ;# lu*lu mulpd xmm7, xmm7 ;# lu*lu movapd xmm9, [rsp + nb301_three] movapd xmm10, xmm9 movapd xmm11, xmm9 mulpd xmm1, xmm0 ;# rsq*lu*lu mulpd xmm4, xmm3 ;# rsq*lu*lu mulpd xmm7, xmm6 ;# rsq*lu*lu subpd xmm9, xmm1 subpd xmm10, xmm4 subpd xmm11, xmm7 ;# 3-rsq*lu*lu mulpd xmm9, xmm2 mulpd xmm10, xmm5 mulpd xmm11, xmm8 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb301_half] mulpd xmm9, xmm15 ;# first iteration for rinvO mulpd xmm10, xmm15 ;# first iteration for rinvH1 mulpd xmm11, xmm15 ;# first iteration for rinvH2 ;# second iteration step movapd xmm2, xmm9 movapd xmm5, xmm10 movapd xmm8, xmm11 mulpd xmm2, xmm2 ;# lu*lu mulpd xmm5, xmm5 ;# 
lu*lu mulpd xmm8, xmm8 ;# lu*lu movapd xmm1, [rsp + nb301_three] movapd xmm4, xmm1 movapd xmm7, xmm1 mulpd xmm2, xmm0 ;# rsq*lu*lu mulpd xmm5, xmm3 ;# rsq*lu*lu mulpd xmm8, xmm6 ;# rsq*lu*lu subpd xmm1, xmm2 subpd xmm4, xmm5 subpd xmm7, xmm8 ;# 3-rsq*lu*lu mulpd xmm9, xmm1 mulpd xmm10, xmm4 mulpd xmm11, xmm7 ;# lu*(3-rsq*lu*lu) movapd xmm15, [rsp + nb301_half] mulpd xmm9, xmm15 ;# rinvO mulpd xmm10, xmm15 ;# rinvH1 mulpd xmm11, xmm15 ;# rinvH2 movapd [rsp + nb301_rinvO], xmm9 movapd [rsp + nb301_rinvH1], xmm10 movapd [rsp + nb301_rinvH2], xmm11 ;# interactions ;# rsq in xmm0,xmm3,xmm6 ;# rinv in xmm9, xmm10, xmm11 movapd xmm1, [rsp + nb301_tsc] mulpd xmm0, xmm9 ;# r mulpd xmm3, xmm10 mulpd xmm6, xmm11 mulpd xmm0, xmm1 ;# rtab mulpd xmm3, xmm1 mulpd xmm6, xmm1 ;# truncate and convert to integers cvttpd2dq xmm1, xmm0 cvttpd2dq xmm4, xmm3 cvttpd2dq xmm7, xmm6 ;# convert back to float cvtdq2pd xmm2, xmm1 cvtdq2pd xmm5, xmm4 cvtdq2pd xmm8, xmm7 ;# multiply by 4 pslld xmm1, 2 pslld xmm4, 2 pslld xmm7, 2 ;# move to integer registers pshufd xmm13, xmm1, 1 pshufd xmm14, xmm4, 1 pshufd xmm15, xmm7, 1 movd r8d, xmm1 movd r10d, xmm4 movd r12d, xmm7 movd r9d, xmm13 movd r11d, xmm14 movd r13d, xmm15 mov rsi, [rbp + nb301_VFtab] ;# calculate eps subpd xmm0, xmm2 subpd xmm3, xmm5 subpd xmm6, xmm8 movapd [rsp + nb301_epsO], xmm0 movapd [rsp + nb301_epsH1], xmm3 movapd [rsp + nb301_epsH2], xmm6
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?