nb_kernel204_ia32_sse.intel_syntax.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,173 行 · 第 1/5 页
S
2,173 行
;#
;# $Id: nb_kernel204_ia32_sse.intel_syntax.s,v 1.1.2.1 2006/03/01 15:18:29 lindahl Exp $
;#
;# Gromacs 4.0                         Copyright (c) 1991-2003
;# David van der Spoel, Erik Lindahl
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU General Public License
;# as published by the Free Software Foundation; either version 2
;# of the License, or (at your option) any later version.
;#
;# To help us fund GROMACS development, we humbly ask that you cite
;# the research papers on the package. Check out http://www.gromacs.org
;#
;# And Hey:
;# Gnomes, ROck Monsters And Chili Sauce
;#
;# These files require GNU binutils 2.10 or later, since we
;# use intel syntax for portability, or a recent version
;# of NASM that understands Extended 3DNow and SSE2 instructions.
;# (NASM is normally only used with MS Visual C++).
;# Since NASM and gnu as disagree on some definitions and use
;# completely different preprocessing options I have to introduce a
;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
;# reason why all comments need both symbols...
;# The source is written for GNU as, with intel syntax. When you use
;# NASM we redefine a couple of things.
The false if-statement around ;# the following code is seen by GNU as, but NASM doesn't see it, so ;# the code inside is read by NASM but not gcc.; .if 0 # block below only read by NASM%define .section section%define .long dd%define .align align%define .globl global;# NASM only wants 'dword', not 'dword ptr'.%define ptr.equiv .equiv 2 %1 equ %2%endmacro; .endif # End of NASM-specific block; .intel_syntax noprefix # Line only read by gnu as .globl nb_kernel204_ia32_sse.globl _nb_kernel204_ia32_ssenb_kernel204_ia32_sse: _nb_kernel204_ia32_sse: .equiv nb204_p_nri, 8.equiv nb204_iinr, 12.equiv nb204_jindex, 16.equiv nb204_jjnr, 20.equiv nb204_shift, 24.equiv nb204_shiftvec, 28.equiv nb204_fshift, 32.equiv nb204_gid, 36.equiv nb204_pos, 40.equiv nb204_faction, 44.equiv nb204_charge, 48.equiv nb204_p_facel, 52.equiv nb204_argkrf, 56.equiv nb204_argcrf, 60.equiv nb204_Vc, 64.equiv nb204_type, 68.equiv nb204_p_ntype, 72.equiv nb204_vdwparam, 76.equiv nb204_Vvdw, 80.equiv nb204_p_tabscale, 84.equiv nb204_VFtab, 88.equiv nb204_invsqrta, 92.equiv nb204_dvda, 96.equiv nb204_p_gbtabscale, 100.equiv nb204_GBtab, 104.equiv nb204_p_nthreads, 108.equiv nb204_count, 112.equiv nb204_mtx, 116.equiv nb204_outeriter, 120.equiv nb204_inneriter, 124.equiv nb204_work, 128 ;# stack offsets for local variables ;# bottom of stack is cache-aligned for sse use .equiv nb204_ixH1, 0.equiv nb204_iyH1, 16.equiv nb204_izH1, 32.equiv nb204_ixH2, 48.equiv nb204_iyH2, 64.equiv nb204_izH2, 80.equiv nb204_ixM, 96.equiv nb204_iyM, 112.equiv nb204_izM, 128.equiv nb204_jxH1, 144.equiv nb204_jyH1, 160.equiv nb204_jzH1, 176.equiv nb204_jxH2, 192.equiv nb204_jyH2, 208.equiv nb204_jzH2, 224.equiv nb204_jxM, 240.equiv nb204_jyM, 256.equiv nb204_jzM, 272.equiv nb204_dxH1H1, 288.equiv nb204_dyH1H1, 304.equiv nb204_dzH1H1, 320.equiv nb204_dxH1H2, 336.equiv nb204_dyH1H2, 352.equiv nb204_dzH1H2, 368.equiv nb204_dxH1M, 384.equiv nb204_dyH1M, 400.equiv nb204_dzH1M, 416.equiv nb204_dxH2H1, 432.equiv nb204_dyH2H1, 
448.equiv nb204_dzH2H1, 464.equiv nb204_dxH2H2, 480.equiv nb204_dyH2H2, 496.equiv nb204_dzH2H2, 512.equiv nb204_dxH2M, 528.equiv nb204_dyH2M, 544.equiv nb204_dzH2M, 560.equiv nb204_dxMH1, 576.equiv nb204_dyMH1, 592.equiv nb204_dzMH1, 608.equiv nb204_dxMH2, 624.equiv nb204_dyMH2, 640.equiv nb204_dzMH2, 656.equiv nb204_dxMM, 672.equiv nb204_dyMM, 688.equiv nb204_dzMM, 704.equiv nb204_qqHH, 720.equiv nb204_qqMH, 736.equiv nb204_qqMM, 752.equiv nb204_vctot, 768.equiv nb204_fixH1, 784.equiv nb204_fiyH1, 800.equiv nb204_fizH1, 816.equiv nb204_fixH2, 832.equiv nb204_fiyH2, 848.equiv nb204_fizH2, 864.equiv nb204_fixM, 880.equiv nb204_fiyM, 896.equiv nb204_fizM, 912.equiv nb204_fjxH1, 928.equiv nb204_fjyH1, 944.equiv nb204_fjzH1, 960.equiv nb204_fjxH2, 976.equiv nb204_fjyH2, 992.equiv nb204_fjzH2, 1008.equiv nb204_fjxM, 1024.equiv nb204_fjyM, 1040.equiv nb204_fjzM, 1056.equiv nb204_fjzMb, 1060.equiv nb204_fjzMc, 1064.equiv nb204_fjzMd, 1068.equiv nb204_half, 1072.equiv nb204_three, 1088.equiv nb204_rsqH1H1, 1104.equiv nb204_rsqH1H2, 1120.equiv nb204_rsqH1M, 1136.equiv nb204_rsqH2H1, 1152.equiv nb204_rsqH2H2, 1168.equiv nb204_rsqH2M, 1184.equiv nb204_rsqMH1, 1200.equiv nb204_rsqMH2, 1216.equiv nb204_rsqMM, 1232.equiv nb204_rinvH1H1, 1248.equiv nb204_rinvH1H2, 1264.equiv nb204_rinvH1M, 1280.equiv nb204_rinvH2H1, 1296.equiv nb204_rinvH2H2, 1312.equiv nb204_rinvH2M, 1328.equiv nb204_rinvMH1, 1344.equiv nb204_rinvMH2, 1360.equiv nb204_rinvMM, 1376.equiv nb204_two, 1392.equiv nb204_krf, 1408.equiv nb204_crf, 1424.equiv nb204_is3, 1440.equiv nb204_ii3, 1444.equiv nb204_innerjjnr, 1448.equiv nb204_innerk, 1452.equiv nb204_n, 1456.equiv nb204_nn1, 1460.equiv nb204_nri, 1464.equiv nb204_nouter, 1468.equiv nb204_ninner, 1472.equiv nb204_salign, 1476 push ebp mov ebp,esp push eax push ebx push ecx push edx push esi push edi sub esp, 1480 ;# local stack space mov eax, esp and eax, 0xf sub esp, eax mov [esp + nb204_salign], eax emms ;# Move args passed by reference to stack mov ecx, [ebp 
+ nb204_p_nri] mov ecx, [ecx] mov [esp + nb204_nri], ecx ;# zero iteration counters mov eax, 0 mov [esp + nb204_nouter], eax mov [esp + nb204_ninner], eax mov esi, [ebp + nb204_argkrf] mov edi, [ebp + nb204_argcrf] movss xmm5, [esi] movss xmm6, [edi] shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 movaps [esp + nb204_krf], xmm5 movaps [esp + nb204_crf], xmm6 ;# create constant floating-point factors on stack mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) mov [esp + nb204_half], eax movss xmm1, [esp + nb204_half] shufps xmm1, xmm1, 0 ;# splat to all elements movaps xmm2, xmm1 addps xmm2, xmm2 ;# constant 1.0 movaps xmm3, xmm2 addps xmm2, xmm2 ;# constant 2.0 addps xmm3, xmm2 ;# constant 3.0 movaps [esp + nb204_half], xmm1 movaps [esp + nb204_two], xmm2 movaps [esp + nb204_three], xmm3 ;# assume we have at least one i particle - start directly mov ecx, [ebp + nb204_iinr] ;# ecx = pointer into iinr[] mov ebx, [ecx] ;# ebx =ii mov edx, [ebp + nb204_charge] movss xmm3, [edx + ebx*4 + 4] movss xmm4, xmm3 movss xmm5, [edx + ebx*4 + 12] mov esi, [ebp + nb204_p_facel] movss xmm6, [esi] mulss xmm3, xmm3 mulss xmm4, xmm5 mulss xmm5, xmm5 mulss xmm3, xmm6 mulss xmm4, xmm6 mulss xmm5, xmm6 shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [esp + nb204_qqHH], xmm3 movaps [esp + nb204_qqMH], xmm4 movaps [esp + nb204_qqMM], xmm5 .nb204_threadloop: mov esi, [ebp + nb204_count] ;# pointer to sync counter mov eax, [esi].nb204_spinlock: mov ebx, eax ;# ebx=*count=nn0 add ebx, 1 ;# ebx=nn1=nn0+10 lock cmpxchg [esi], ebx ;# write nn1 to *counter, ;# if it hasnt changed. ;# or reread *counter to eax. pause ;# -> better p4 performance jnz .nb204_spinlock ;# if(nn1>nri) nn1=nri mov ecx, [esp + nb204_nri] mov edx, ecx sub ecx, ebx cmovle ebx, edx ;# if(nn1>nri) nn1=nri ;# Cleared the spinlock if we got here. ;# eax contains nn0, ebx contains nn1. 
mov [esp + nb204_n], eax mov [esp + nb204_nn1], ebx sub ebx, eax ;# calc number of outer lists mov esi, eax ;# copy n to esi jg .nb204_outerstart jmp .nb204_end .nb204_outerstart: ;# ebx contains number of outer iterations add ebx, [esp + nb204_nouter] mov [esp + nb204_nouter], ebx.nb204_outer: mov eax, [ebp + nb204_shift] ;# eax = pointer into shift[] mov ebx, [eax + esi*4] ;# ebx=shift[n] lea ebx, [ebx + ebx*2] ;# ebx=3*is mov [esp + nb204_is3],ebx ;# store is3 mov eax, [ebp + nb204_shiftvec] ;# eax = base of shiftvec[] movss xmm0, [eax + ebx*4] movss xmm1, [eax + ebx*4 + 4] movss xmm2, [eax + ebx*4 + 8] mov ecx, [ebp + nb204_iinr] ;# ecx = pointer into iinr[] mov ebx, [ecx + esi*4] ;# ebx =ii lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 mov eax, [ebp + nb204_pos] ;# eax = base of pos[] mov [esp + nb204_ii3], ebx movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 addss xmm3, [eax + ebx*4 + 12] addss xmm4, [eax + ebx*4 + 16] addss xmm5, [eax + ebx*4 + 20] shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [esp + nb204_ixH1], xmm3 movaps [esp + nb204_iyH1], xmm4 movaps [esp + nb204_izH1], xmm5 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 addss xmm0, [eax + ebx*4 + 24] addss xmm1, [eax + ebx*4 + 28] addss xmm2, [eax + ebx*4 + 32] addss xmm3, [eax + ebx*4 + 36] addss xmm4, [eax + ebx*4 + 40] addss xmm5, [eax + ebx*4 + 44] shufps xmm0, xmm0, 0 shufps xmm1, xmm1, 0 shufps xmm2, xmm2, 0 shufps xmm3, xmm3, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 movaps [esp + nb204_ixH2], xmm0 movaps [esp + nb204_iyH2], xmm1 movaps [esp + nb204_izH2], xmm2 movaps [esp + nb204_ixM], xmm3 movaps [esp + nb204_iyM], xmm4 movaps [esp + nb204_izM], xmm5 ;# clear vctot and i forces xorps xmm4, xmm4 movaps [esp + nb204_vctot], xmm4 movaps [esp + nb204_fixH1], xmm4 movaps [esp + nb204_fiyH1], xmm4 movaps [esp + nb204_fizH1], xmm4 movaps [esp + nb204_fixH2], xmm4 movaps [esp + nb204_fiyH2], xmm4 movaps [esp + nb204_fizH2], xmm4 movaps [esp + nb204_fixM], xmm4 movaps 
[esp + nb204_fiyM], xmm4 movaps [esp + nb204_fizM], xmm4 mov eax, [ebp + nb204_jindex] mov ecx, [eax + esi*4] ;# jindex[n] mov edx, [eax + esi*4 + 4] ;# jindex[n+1] sub edx, ecx ;# number of innerloop atoms mov esi, [ebp + nb204_pos] mov edi, [ebp + nb204_faction] mov eax, [ebp + nb204_jjnr] shl ecx, 2 add eax, ecx mov [esp + nb204_innerjjnr], eax ;# pointer to jjnr[nj0] mov ecx, edx sub edx, 4 add ecx, [esp + nb204_ninner] mov [esp + nb204_ninner], ecx add edx, 0 mov [esp + nb204_innerk], edx ;# number of innerloop atoms jge .nb204_unroll_loop jmp .nb204_single_check.nb204_unroll_loop: ;# quad-unroll innerloop here mov edx, [esp + nb204_innerjjnr] ;# pointer to jjnr[k] mov eax, [edx] mov ebx, [edx + 4] mov ecx, [edx + 8] mov edx, [edx + 12] ;# eax-edx=jnr1-4 add dword ptr [esp + nb204_innerjjnr], 16 ;# advance pointer (unrolled 4) mov esi, [ebp + nb204_pos] ;# base of pos[] lea eax, [eax + eax*2] ;# replace jnr with j3 lea ebx, [ebx + ebx*2] lea ecx, [ecx + ecx*2] ;# replace jnr with j3 lea edx, [edx + edx*2] ;# move j coordinates to local temp variables movlps xmm2, [esi + eax*4 + 12] movlps xmm3, [esi + eax*4 + 24] movlps xmm4, [esi + eax*4 + 36] movlps xmm5, [esi + ebx*4 + 12] movlps xmm6, [esi + ebx*4 + 24] movlps xmm7, [esi + ebx*4 + 36] movhps xmm2, [esi + ecx*4 + 12] movhps xmm3, [esi + ecx*4 + 24] movhps xmm4, [esi + ecx*4 + 36] movhps xmm5, [esi + edx*4 + 12] movhps xmm6, [esi + edx*4 + 24] movhps xmm7, [esi + edx*4 + 36] movaps xmm0, xmm2 movaps xmm1, xmm3 unpcklps xmm0, xmm5 unpcklps xmm1, xmm6 unpckhps xmm2, xmm5 unpckhps xmm3, xmm6 movaps xmm5, xmm4 movaps xmm6, xmm0 unpcklps xmm4, xmm7 unpckhps xmm5, xmm7 movaps xmm7, xmm1 movlhps xmm0, xmm2 movaps [esp + nb204_jxH1], xmm0 movhlps xmm2, xmm6 movaps [esp + nb204_jyH1], xmm2 movlhps xmm1, xmm3 movaps [esp + nb204_jxH2], xmm1 movhlps xmm3, xmm7 movaps xmm6, xmm4
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?