nb_kernel304_ia32_sse.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,258 行 · 第 1/5 页
S
2,258 行
;#
;# $Id: nb_kernel304_ia32_sse.intel_syntax.s,v 1.1.2.1 2006/03/01 15:18:30 lindahl Exp $
;#
;# Gromacs 4.0                         Copyright (c) 1991-2003
;# David van der Spoel, Erik Lindahl
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU General Public License
;# as published by the Free Software Foundation; either version 2
;# of the License, or (at your option) any later version.
;#
;# To help us fund GROMACS development, we humbly ask that you cite
;# the research papers on the package. Check out http://www.gromacs.org
;#
;# And Hey:
;# Gnomes, ROck Monsters And Chili Sauce
;#
;# These files require GNU binutils 2.10 or later, since we
;# use intel syntax for portability, or a recent version
;# of NASM that understands Extended 3DNow and SSE2 instructions.
;# (NASM is normally only used with MS Visual C++).
;# Since NASM and gnu as disagree on some definitions and use
;# completely different preprocessing options I have to introduce a
;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
;# reason why all comments need both symbols...
;# The source is written for GNU as, with intel syntax. When you use
;# NASM we redefine a couple of things.
The false if-statement around ;# the following code is seen by GNU as, but NASM doesn't see it, so ;# the code inside is read by NASM but not gcc.; .if 0 # block below only read by NASM%define .section section%define .long dd%define .align align%define .globl global;# NASM only wants 'dword', not 'dword ptr'.%define ptr.equiv .equiv 2 %1 equ %2%endmacro; .endif # End of NASM-specific block; .intel_syntax noprefix # Line only read by gnu as .globl nb_kernel304_ia32_sse.globl _nb_kernel304_ia32_ssenb_kernel304_ia32_sse: _nb_kernel304_ia32_sse: .equiv nb304_p_nri, 8.equiv nb304_iinr, 12.equiv nb304_jindex, 16.equiv nb304_jjnr, 20.equiv nb304_shift, 24.equiv nb304_shiftvec, 28.equiv nb304_fshift, 32.equiv nb304_gid, 36.equiv nb304_pos, 40.equiv nb304_faction, 44.equiv nb304_charge, 48.equiv nb304_p_facel, 52.equiv nb304_argkrf, 56.equiv nb304_argcrf, 60.equiv nb304_Vc, 64.equiv nb304_type, 68.equiv nb304_p_ntype, 72.equiv nb304_vdwparam, 76.equiv nb304_Vvdw, 80.equiv nb304_p_tabscale, 84.equiv nb304_VFtab, 88.equiv nb304_invsqrta, 92.equiv nb304_dvda, 96.equiv nb304_p_gbtabscale, 100.equiv nb304_GBtab, 104.equiv nb304_p_nthreads, 108.equiv nb304_count, 112.equiv nb304_mtx, 116.equiv nb304_outeriter, 120.equiv nb304_inneriter, 124.equiv nb304_work, 128 ;# stack offsets for local variables ;# bottom of stack is cache-aligned for sse use .equiv nb304_ixH1, 0.equiv nb304_iyH1, 16.equiv nb304_izH1, 32.equiv nb304_ixH2, 48.equiv nb304_iyH2, 64.equiv nb304_izH2, 80.equiv nb304_ixM, 96.equiv nb304_iyM, 112.equiv nb304_izM, 128.equiv nb304_jxH1, 144.equiv nb304_jyH1, 160.equiv nb304_jzH1, 176.equiv nb304_jxH2, 192.equiv nb304_jyH2, 208.equiv nb304_jzH2, 224.equiv nb304_jxM, 240.equiv nb304_jyM, 256.equiv nb304_jzM, 272.equiv nb304_dxH1H1, 288.equiv nb304_dyH1H1, 304.equiv nb304_dzH1H1, 320.equiv nb304_dxH1H2, 336.equiv nb304_dyH1H2, 352.equiv nb304_dzH1H2, 368.equiv nb304_dxH1M, 384.equiv nb304_dyH1M, 400.equiv nb304_dzH1M, 416.equiv nb304_dxH2H1, 432.equiv nb304_dyH2H1, 
448.equiv nb304_dzH2H1, 464.equiv nb304_dxH2H2, 480.equiv nb304_dyH2H2, 496.equiv nb304_dzH2H2, 512.equiv nb304_dxH2M, 528.equiv nb304_dyH2M, 544.equiv nb304_dzH2M, 560.equiv nb304_dxMH1, 576.equiv nb304_dyMH1, 592.equiv nb304_dzMH1, 608.equiv nb304_dxMH2, 624.equiv nb304_dyMH2, 640.equiv nb304_dzMH2, 656.equiv nb304_dxMM, 672.equiv nb304_dyMM, 688.equiv nb304_dzMM, 704.equiv nb304_qqHH, 720.equiv nb304_qqMH, 736.equiv nb304_qqMM, 752.equiv nb304_two, 768.equiv nb304_tsc, 784.equiv nb304_vctot, 800.equiv nb304_fixH1, 816.equiv nb304_fiyH1, 832.equiv nb304_fizH1, 848.equiv nb304_fixH2, 864.equiv nb304_fiyH2, 880.equiv nb304_fizH2, 896.equiv nb304_fixM, 912.equiv nb304_fiyM, 928.equiv nb304_fizM, 944.equiv nb304_fjxH1, 960.equiv nb304_fjyH1, 976.equiv nb304_fjzH1, 992.equiv nb304_fjxH2, 1008.equiv nb304_fjyH2, 1024.equiv nb304_fjzH2, 1040.equiv nb304_fjxM, 1056.equiv nb304_fjyM, 1072.equiv nb304_fjzM, 1088.equiv nb304_fjzMb, 1092.equiv nb304_fjzMc, 1096.equiv nb304_fjzMd, 1100.equiv nb304_half, 1104.equiv nb304_three, 1120.equiv nb304_rsqH1H1, 1136.equiv nb304_rsqH1H2, 1152.equiv nb304_rsqH1M, 1168.equiv nb304_rsqH2H1, 1184.equiv nb304_rsqH2H2, 1200.equiv nb304_rsqH2M, 1216.equiv nb304_rsqMH1, 1232.equiv nb304_rsqMH2, 1248.equiv nb304_rsqMM, 1264.equiv nb304_rinvH1H1, 1280.equiv nb304_rinvH1H2, 1296.equiv nb304_rinvH1M, 1312.equiv nb304_rinvH2H1, 1328.equiv nb304_rinvH2H2, 1344.equiv nb304_rinvH2M, 1360.equiv nb304_rinvMH1, 1376.equiv nb304_rinvMH2, 1392.equiv nb304_rinvMM, 1408.equiv nb304_is3, 1424.equiv nb304_ii3, 1428.equiv nb304_innerjjnr, 1432.equiv nb304_innerk, 1436.equiv nb304_n, 1440.equiv nb304_nn1, 1444.equiv nb304_nri, 1448.equiv nb304_nouter, 1452.equiv nb304_ninner, 1456.equiv nb304_salign, 1460 push ebp mov ebp,esp push eax push ebx push ecx push edx push esi push edi sub esp, 1464 ;# local stack space mov eax, esp and eax, 0xf sub esp, eax mov [esp + nb304_salign], eax emms ;# Move args passed by reference to stack mov ecx, [ebp + nb304_p_nri] mov 
ecx, [ecx] mov [esp + nb304_nri], ecx ;# zero iteration counters mov eax, 0 mov [esp + nb304_nouter], eax mov [esp + nb304_ninner], eax mov eax, [ebp + nb304_p_tabscale] movss xmm3, [eax] shufps xmm3, xmm3, 0 movaps [esp + nb304_tsc], xmm3 ;# create constant floating-point factors on stack mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) mov [esp + nb304_half], eax movss xmm1, [esp + nb304_half] shufps xmm1, xmm1, 0 ;# splat to all elements movaps xmm2, xmm1 addps xmm2, xmm2 ;# constant 1.0 movaps xmm3, xmm2 addps xmm2, xmm2 ;# constant 2.0 addps xmm3, xmm2 ;# constant 3.0 movaps [esp + nb304_half], xmm1 movaps [esp + nb304_two], xmm2 movaps [esp + nb304_three], xmm3 ;# assume we have at least one i particle - start directly mov ecx, [ebp + nb304_iinr] ;# ecx = pointer into iinr[] mov ebx, [ecx] ;# ebx =ii mov edx, [ebp + nb304_charge] movss xmm3, [edx + ebx*4 + 4] movss xmm4, xmm3 movss xmm5, [edx + ebx*4 + 12] mov esi, [ebp + nb304_p_facel] movss xmm6, [esi] mulss xmm3, xmm3 mulss xmm4, xmm5 mulss xmm5, xmm5 mulss xmm3, xmm6 mulss xmm4, xmm6 mulss xmm5, xmm6 shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [esp + nb304_qqHH], xmm3 movaps [esp + nb304_qqMH], xmm4 movaps [esp + nb304_qqMM], xmm5 .nb304_threadloop: mov esi, [ebp + nb304_count] ;# pointer to sync counter mov eax, [esi].nb304_spinlock: mov ebx, eax ;# ebx=*count=nn0 add ebx, 1 ;# ebx=nn1=nn0+10 lock cmpxchg [esi], ebx ;# write nn1 to *counter, ;# if it hasnt changed. ;# or reread *counter to eax. pause ;# -> better p4 performance jnz .nb304_spinlock ;# if(nn1>nri) nn1=nri mov ecx, [esp + nb304_nri] mov edx, ecx sub ecx, ebx cmovle ebx, edx ;# if(nn1>nri) nn1=nri ;# Cleared the spinlock if we got here. ;# eax contains nn0, ebx contains nn1. 
mov [esp + nb304_n], eax mov [esp + nb304_nn1], ebx sub ebx, eax ;# calc number of outer lists mov esi, eax ;# copy n to esi jg .nb304_outerstart jmp .nb304_end .nb304_outerstart: ;# ebx contains number of outer iterations add ebx, [esp + nb304_nouter] mov [esp + nb304_nouter], ebx.nb304_outer: mov eax, [ebp + nb304_shift] ;# eax = pointer into shift[] mov ebx, [eax + esi*4] ;# ebx=shift[n] lea ebx, [ebx + ebx*2] ;# ebx=3*is mov [esp + nb304_is3],ebx ;# store is3 mov eax, [ebp + nb304_shiftvec] ;# eax = base of shiftvec[] movss xmm0, [eax + ebx*4] movss xmm1, [eax + ebx*4 + 4] movss xmm2, [eax + ebx*4 + 8] mov ecx, [ebp + nb304_iinr] ;# ecx = pointer into iinr[] mov ebx, [ecx + esi*4] ;# ebx =ii lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 mov eax, [ebp + nb304_pos] ;# eax = base of pos[] mov [esp + nb304_ii3], ebx movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 addss xmm3, [eax + ebx*4 + 12] addss xmm4, [eax + ebx*4 + 16] addss xmm5, [eax + ebx*4 + 20] shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [esp + nb304_ixH1], xmm3 movaps [esp + nb304_iyH1], xmm4 movaps [esp + nb304_izH1], xmm5 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 addss xmm0, [eax + ebx*4 + 24] addss xmm1, [eax + ebx*4 + 28] addss xmm2, [eax + ebx*4 + 32] addss xmm3, [eax + ebx*4 + 36] addss xmm4, [eax + ebx*4 + 40] addss xmm5, [eax + ebx*4 + 44] shufps xmm0, xmm0, 0 shufps xmm1, xmm1, 0 shufps xmm2, xmm2, 0 shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [esp + nb304_ixH2], xmm0 movaps [esp + nb304_iyH2], xmm1 movaps [esp + nb304_izH2], xmm2 movaps [esp + nb304_ixM], xmm3 movaps [esp + nb304_iyM], xmm4 movaps [esp + nb304_izM], xmm5 ;# clear vctot and i forces xorps xmm4, xmm4 movaps [esp + nb304_vctot], xmm4 movaps [esp + nb304_fixH1], xmm4 movaps [esp + nb304_fiyH1], xmm4 movaps [esp + nb304_fizH1], xmm4 movaps [esp + nb304_fixH2], xmm4 movaps [esp + nb304_fiyH2], xmm4 movaps [esp + nb304_fizH2], xmm4 movaps [esp + nb304_fixM], xmm4 movaps 
[esp + nb304_fiyM], xmm4 movaps [esp + nb304_fizM], xmm4 mov eax, [ebp + nb304_jindex] mov ecx, [eax + esi*4] ;# jindex[n] mov edx, [eax + esi*4 + 4] ;# jindex[n+1] sub edx, ecx ;# number of innerloop atoms mov esi, [ebp + nb304_pos] mov edi, [ebp + nb304_faction] mov eax, [ebp + nb304_jjnr] shl ecx, 2 add eax, ecx mov [esp + nb304_innerjjnr], eax ;# pointer to jjnr[nj0] mov ecx, edx sub edx, 4 add ecx, [esp + nb304_ninner] mov [esp + nb304_ninner], ecx add edx, 0 mov [esp + nb304_innerk], edx ;# number of innerloop atoms jge .nb304_unroll_loop jmp .nb304_single_check.nb304_unroll_loop: ;# quad-unroll innerloop here mov edx, [esp + nb304_innerjjnr] ;# pointer to jjnr[k] mov eax, [edx] mov ebx, [edx + 4] mov ecx, [edx + 8] mov edx, [edx + 12] ;# eax-edx=jnr1-4 add dword ptr [esp + nb304_innerjjnr], 16 ;# advance pointer (unrolled 4) mov esi, [ebp + nb304_pos] ;# base of pos[] lea eax, [eax + eax*2] ;# replace jnr with j3 lea ebx, [ebx + ebx*2] lea ecx, [ecx + ecx*2] ;# replace jnr with j3 lea edx, [edx + edx*2] ;# move j coordinates to local temp variables movlps xmm2, [esi + eax*4 + 12] movlps xmm3, [esi + eax*4 + 24] movlps xmm4, [esi + eax*4 + 36] movlps xmm5, [esi + ebx*4 + 12] movlps xmm6, [esi + ebx*4 + 24] movlps xmm7, [esi + ebx*4 + 36] movhps xmm2, [esi + ecx*4 + 12] movhps xmm3, [esi + ecx*4 + 24] movhps xmm4, [esi + ecx*4 + 36] movhps xmm5, [esi + edx*4 + 12] movhps xmm6, [esi + edx*4 + 24] movhps xmm7, [esi + edx*4 + 36] movaps xmm0, xmm2 movaps xmm1, xmm3 unpcklps xmm0, xmm5 unpcklps xmm1, xmm6 unpckhps xmm2, xmm5 unpckhps xmm3, xmm6 movaps xmm5, xmm4 movaps xmm6, xmm0 unpcklps xmm4, xmm7 unpckhps xmm5, xmm7 movaps xmm7, xmm1 movlhps xmm0, xmm2 movaps [esp + nb304_jxH1], xmm0 movhlps xmm2, xmm6 movaps [esp + nb304_jyH1], xmm2 movlhps xmm1, xmm3 movaps [esp + nb304_jxH2], xmm1 movhlps xmm3, xmm7 movaps xmm6, xmm4 movaps [esp + nb304_jyH2], xmm3 movlhps xmm4, xmm5 movaps [esp + nb304_jxM], xmm4 movhlps xmm5, xmm6 movaps [esp + nb304_jyM], xmm5 movss xmm0, 
[esi + eax*4 + 20] movss xmm1, [esi + eax*4 + 32] movss xmm2, [esi + eax*4 + 44] movss xmm3, [esi + ecx*4 + 20] movss xmm4, [esi + ecx*4 + 32] movss xmm5, [esi + ecx*4 + 44] movhps xmm0, [esi + ebx*4 + 16] movhps xmm1, [esi + ebx*4 + 28] movhps xmm2, [esi + ebx*4 + 40] movhps xmm3, [esi + edx*4 + 16] movhps xmm4, [esi + edx*4 + 28] movhps xmm5, [esi + edx*4 + 40]
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?