nb_kernel304_ia32_sse.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,258 行 · 第 1/5 页

S
2,258
字号
;#;# $Id: nb_kernel304_ia32_sse.intel_syntax.s,v 1.1.2.1 2006/03/01 15:18:30 lindahl Exp $;#;# Gromacs 4.0                         Copyright (c) 1991-2003 ;# David van der Spoel, Erik Lindahl;#;# This program is free software; you can redistribute it and/or;# modify it under the terms of the GNU General Public License;# as published by the Free Software Foundation; either version 2;# of the License, or (at your option) any later version.;#;# To help us fund GROMACS development, we humbly ask that you cite;# the research papers on the package. Check out http://www.gromacs.org;# ;# And Hey:;# Gnomes, ROck Monsters And Chili Sauce;#;# These files require GNU binutils 2.10 or later, since we;# use intel syntax for portability, or a recent version ;# of NASM that understands Extended 3DNow and SSE2 instructions.;# (NASM is normally only used with MS Visual C++).;# Since NASM and gnu as disagree on some definitions and use ;# completely different preprocessing options I have to introduce a;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.;# Gnu as treats ';' as a line break, i.e. ignores it. This is the;# reason why all comments need both symbols...;# The source is written for GNU as, with intel syntax. When you use;# NASM we redefine a couple of things. The false if-statement around ;# the following code is seen by GNU as, but NASM doesn't see it, so ;# the code inside is read by NASM but not gcc.; .if 0    # block below only read by NASM%define .section	section%define .long		dd%define .align		align%define .globl		global;# NASM only wants 'dword', not 'dword ptr'.%define ptr.equiv          .equiv                  2   %1 equ %2%endmacro; .endif                   # End of NASM-specific block; .intel_syntax noprefix   # Line only read by gnu as		.globl nb_kernel304_ia32_sse.globl _nb_kernel304_ia32_ssenb_kernel304_ia32_sse:	_nb_kernel304_ia32_sse:	.equiv          nb304_p_nri,            8.equiv          nb304_iinr,             12.equiv          nb304_jindex,           16.equiv          nb304_jjnr,             20.equiv          nb304_shift,            24.equiv          nb304_shiftvec,         28.equiv          nb304_fshift,           32.equiv          nb304_gid,              36.equiv          nb304_pos,              40.equiv          nb304_faction,          44.equiv          nb304_charge,           48.equiv          nb304_p_facel,          52.equiv          nb304_argkrf,           56.equiv          nb304_argcrf,           60.equiv          nb304_Vc,               64.equiv          nb304_type,             68.equiv          nb304_p_ntype,          72.equiv          nb304_vdwparam,         76.equiv          nb304_Vvdw,             80.equiv          nb304_p_tabscale,       84.equiv          nb304_VFtab,            88.equiv          nb304_invsqrta,         92.equiv          nb304_dvda,             96.equiv          nb304_p_gbtabscale,     100.equiv          nb304_GBtab,            104.equiv          nb304_p_nthreads,       108.equiv          nb304_count,            112.equiv          nb304_mtx,              116.equiv          nb304_outeriter,        120.equiv          nb304_inneriter,        124.equiv          nb304_work,             128	;# stack offsets for local variables  	;# bottom of stack is cache-aligned for sse use .equiv          nb304_ixH1,             0.equiv          nb304_iyH1,             16.equiv          nb304_izH1,             32.equiv          nb304_ixH2,             48.equiv          nb304_iyH2,             64.equiv          nb304_izH2,             80.equiv          nb304_ixM,              96.equiv          nb304_iyM,              112.equiv          nb304_izM,              128.equiv          nb304_jxH1,             144.equiv          nb304_jyH1,             160.equiv          nb304_jzH1,             176.equiv          nb304_jxH2,             192.equiv          nb304_jyH2,             208.equiv          nb304_jzH2,             224.equiv          nb304_jxM,              240.equiv          nb304_jyM,              256.equiv          nb304_jzM,              272.equiv          nb304_dxH1H1,           288.equiv          nb304_dyH1H1,           304.equiv          nb304_dzH1H1,           320.equiv          nb304_dxH1H2,           336.equiv          nb304_dyH1H2,           352.equiv          nb304_dzH1H2,           368.equiv          nb304_dxH1M,            384.equiv          nb304_dyH1M,            400.equiv          nb304_dzH1M,            416.equiv          nb304_dxH2H1,           432.equiv          nb304_dyH2H1,           448.equiv          nb304_dzH2H1,           464.equiv          nb304_dxH2H2,           480.equiv          nb304_dyH2H2,           496.equiv          nb304_dzH2H2,           512.equiv          nb304_dxH2M,            528.equiv          nb304_dyH2M,            544.equiv          nb304_dzH2M,            560.equiv          nb304_dxMH1,            576.equiv          nb304_dyMH1,            592.equiv          nb304_dzMH1,            608.equiv          nb304_dxMH2,            624.equiv          nb304_dyMH2,            640.equiv          nb304_dzMH2,            656.equiv          nb304_dxMM,             672.equiv          nb304_dyMM,             688.equiv          nb304_dzMM,             704.equiv          nb304_qqHH,             720.equiv          nb304_qqMH,             736.equiv          nb304_qqMM,             752.equiv          nb304_two,              768.equiv          nb304_tsc,              784.equiv          nb304_vctot,            800.equiv          nb304_fixH1,            816.equiv          nb304_fiyH1,            832.equiv          nb304_fizH1,            848.equiv          nb304_fixH2,            864.equiv          nb304_fiyH2,            880.equiv          nb304_fizH2,            896.equiv          nb304_fixM,             912.equiv          nb304_fiyM,             928.equiv          nb304_fizM,             944.equiv          nb304_fjxH1,            960.equiv          nb304_fjyH1,            976.equiv          nb304_fjzH1,            992.equiv          nb304_fjxH2,            1008.equiv          nb304_fjyH2,            1024.equiv          nb304_fjzH2,            1040.equiv          nb304_fjxM,             1056.equiv          nb304_fjyM,             1072.equiv          nb304_fjzM,             1088.equiv          nb304_fjzMb,            1092.equiv          nb304_fjzMc,            1096.equiv          nb304_fjzMd,            1100.equiv          nb304_half,             1104.equiv          nb304_three,            1120.equiv          nb304_rsqH1H1,          1136.equiv          nb304_rsqH1H2,          1152.equiv          nb304_rsqH1M,           1168.equiv          nb304_rsqH2H1,          1184.equiv          nb304_rsqH2H2,          1200.equiv          nb304_rsqH2M,           1216.equiv          nb304_rsqMH1,           1232.equiv          nb304_rsqMH2,           1248.equiv          nb304_rsqMM,            1264.equiv          nb304_rinvH1H1,         1280.equiv          nb304_rinvH1H2,         1296.equiv          nb304_rinvH1M,          1312.equiv          nb304_rinvH2H1,         1328.equiv          nb304_rinvH2H2,         1344.equiv          nb304_rinvH2M,          1360.equiv          nb304_rinvMH1,          1376.equiv          nb304_rinvMH2,          1392.equiv          nb304_rinvMM,           1408.equiv          nb304_is3,              1424.equiv          nb304_ii3,              1428.equiv          nb304_innerjjnr,        1432.equiv          nb304_innerk,           1436.equiv          nb304_n,                1440.equiv          nb304_nn1,              1444.equiv          nb304_nri,              1448.equiv          nb304_nouter,           1452.equiv          nb304_ninner,           1456.equiv          nb304_salign,           1460	push ebp	mov ebp,esp		push eax	push ebx	push ecx	push edx	push esi	push edi	sub esp, 1464		;# local stack space 	mov  eax, esp	and  eax, 0xf	sub esp, eax	mov [esp + nb304_salign], eax	emms	;# Move args passed by reference to stack	mov ecx, [ebp + nb304_p_nri]	mov ecx, [ecx]	mov [esp + nb304_nri], ecx	;# zero iteration counters	mov eax, 0	mov [esp + nb304_nouter], eax	mov [esp + nb304_ninner], eax	mov eax, [ebp + nb304_p_tabscale]	movss xmm3, [eax]	shufps xmm3, xmm3, 0	movaps [esp + nb304_tsc],  xmm3	;# create constant floating-point factors on stack	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)	mov [esp + nb304_half], eax	movss xmm1, [esp + nb304_half]	shufps xmm1, xmm1, 0    ;# splat to all elements	movaps xmm2, xmm1       	addps  xmm2, xmm2	;# constant 1.0	movaps xmm3, xmm2	addps  xmm2, xmm2	;# constant 2.0	addps  xmm3, xmm2	;# constant 3.0	movaps [esp + nb304_half],  xmm1	movaps [esp + nb304_two],  xmm2	movaps [esp + nb304_three],  xmm3	;# assume we have at least one i particle - start directly 	mov   ecx, [ebp + nb304_iinr]   	;# ecx = pointer into iinr[] 		mov   ebx, [ecx]		;# ebx =ii 	mov   edx, [ebp + nb304_charge]	movss xmm3, [edx + ebx*4 + 4]		movss xmm4, xmm3		movss xmm5, [edx + ebx*4 + 12]		mov esi, [ebp + nb304_p_facel]	movss xmm6, [esi]	mulss  xmm3, xmm3	mulss  xmm4, xmm5	mulss  xmm5, xmm5	mulss  xmm3, xmm6	mulss  xmm4, xmm6	mulss  xmm5, xmm6	shufps xmm3, xmm3, 0	shufps xmm4, xmm4, 0	shufps xmm5, xmm5, 0	movaps [esp + nb304_qqHH], xmm3	movaps [esp + nb304_qqMH], xmm4	movaps [esp + nb304_qqMM], xmm5		.nb304_threadloop:        mov   esi, [ebp + nb304_count]          ;# pointer to sync counter        mov   eax, [esi].nb304_spinlock:        mov   ebx, eax                          ;# ebx=*count=nn0        add   ebx, 1                           ;# ebx=nn1=nn0+10        lock        cmpxchg [esi], ebx                      ;# write nn1 to *counter,                                                ;# if it hasnt changed.                                                ;# or reread *counter to eax.        pause                                   ;# -> better p4 performance        jnz .nb304_spinlock        ;# if(nn1>nri) nn1=nri        mov ecx, [esp + nb304_nri]        mov edx, ecx        sub ecx, ebx        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri        ;# Cleared the spinlock if we got here.        ;# eax contains nn0, ebx contains nn1.        mov [esp + nb304_n], eax        mov [esp + nb304_nn1], ebx        sub ebx, eax                            ;# calc number of outer lists	mov esi, eax				;# copy n to esi        jg  .nb304_outerstart        jmp .nb304_end	.nb304_outerstart:	;# ebx contains number of outer iterations	add ebx, [esp + nb304_nouter]	mov [esp + nb304_nouter], ebx.nb304_outer:	mov   eax, [ebp + nb304_shift]  	;# eax = pointer into shift[] 	mov   ebx, [eax + esi*4]		;# ebx=shift[n] 		lea   ebx, [ebx + ebx*2]	;# ebx=3*is 	mov   [esp + nb304_is3],ebx    	;# store is3 	mov   eax, [ebp + nb304_shiftvec]   ;# eax = base of shiftvec[] 	movss xmm0, [eax + ebx*4]	movss xmm1, [eax + ebx*4 + 4]	movss xmm2, [eax + ebx*4 + 8] 	mov   ecx, [ebp + nb304_iinr]   	;# ecx = pointer into iinr[] 		mov   ebx, [ecx + esi*4]		;# ebx =ii 	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 	mov   eax, [ebp + nb304_pos]	;# eax = base of pos[]  	mov   [esp + nb304_ii3], ebx			movaps xmm3, xmm0	movaps xmm4, xmm1	movaps xmm5, xmm2	addss xmm3, [eax + ebx*4 + 12]	addss xmm4, [eax + ebx*4 + 16]	addss xmm5, [eax + ebx*4 + 20]			shufps xmm3, xmm3, 0	shufps xmm4, xmm4, 0	shufps xmm5, xmm5, 0	movaps [esp + nb304_ixH1], xmm3	movaps [esp + nb304_iyH1], xmm4	movaps [esp + nb304_izH1], xmm5	movss xmm3, xmm0	movss xmm4, xmm1	movss xmm5, xmm2	addss xmm0, [eax + ebx*4 + 24]	addss xmm1, [eax + ebx*4 + 28]	addss xmm2, [eax + ebx*4 + 32]			addss xmm3, [eax + ebx*4 + 36]	addss xmm4, [eax + ebx*4 + 40]	addss xmm5, [eax + ebx*4 + 44]			shufps xmm0, xmm0, 0	shufps xmm1, xmm1, 0	shufps xmm2, xmm2, 0	shufps xmm3, xmm3, 0	shufps xmm4, xmm4, 0	shufps xmm5, xmm5, 0	movaps [esp + nb304_ixH2], xmm0	movaps [esp + nb304_iyH2], xmm1	movaps [esp + nb304_izH2], xmm2	movaps [esp + nb304_ixM], xmm3	movaps [esp + nb304_iyM], xmm4	movaps [esp + nb304_izM], xmm5	;# clear vctot and i forces 	xorps xmm4, xmm4	movaps [esp + nb304_vctot], xmm4	movaps [esp + nb304_fixH1], xmm4	movaps [esp + nb304_fiyH1], xmm4	movaps [esp + nb304_fizH1], xmm4	movaps [esp + nb304_fixH2], xmm4	movaps [esp + nb304_fiyH2], xmm4	movaps [esp + nb304_fizH2], xmm4	movaps [esp + nb304_fixM], xmm4	movaps [esp + nb304_fiyM], xmm4	movaps [esp + nb304_fizM], xmm4		mov   eax, [ebp + nb304_jindex]	mov   ecx, [eax + esi*4]	 	;# jindex[n] 	mov   edx, [eax + esi*4 + 4]	 	;# jindex[n+1] 	sub   edx, ecx           	;# number of innerloop atoms 	mov   esi, [ebp + nb304_pos]	mov   edi, [ebp + nb304_faction]		mov   eax, [ebp + nb304_jjnr]	shl   ecx, 2	add   eax, ecx	mov   [esp + nb304_innerjjnr], eax 	;# pointer to jjnr[nj0] 	mov   ecx, edx	sub   edx,  4	add   ecx, [esp + nb304_ninner]	mov   [esp + nb304_ninner], ecx	add   edx, 0	mov   [esp + nb304_innerk], edx	;# number of innerloop atoms 	jge   .nb304_unroll_loop	jmp   .nb304_single_check.nb304_unroll_loop:		;# quad-unroll innerloop here 	mov   edx, [esp + nb304_innerjjnr] 	;# pointer to jjnr[k] 	mov   eax, [edx]		mov   ebx, [edx + 4] 	mov   ecx, [edx + 8]	mov   edx, [edx + 12]     	;# eax-edx=jnr1-4 		add dword ptr [esp + nb304_innerjjnr],  16 ;# advance pointer (unrolled 4) 	mov esi, [ebp + nb304_pos]   	;# base of pos[] 	lea   eax, [eax + eax*2] 	;# replace jnr with j3 	lea   ebx, [ebx + ebx*2]		lea   ecx, [ecx + ecx*2] 	;# replace jnr with j3 	lea   edx, [edx + edx*2]			;# move j coordinates to local temp variables 	movlps xmm2, [esi + eax*4 + 12]	movlps xmm3, [esi + eax*4 + 24]	movlps xmm4, [esi + eax*4 + 36]	movlps xmm5, [esi + ebx*4 + 12]	movlps xmm6, [esi + ebx*4 + 24]	movlps xmm7, [esi + ebx*4 + 36]	movhps xmm2, [esi + ecx*4 + 12]	movhps xmm3, [esi + ecx*4 + 24]	movhps xmm4, [esi + ecx*4 + 36]	movhps xmm5, [esi + edx*4 + 12]	movhps xmm6, [esi + edx*4 + 24]	movhps xmm7, [esi + edx*4 + 36]	movaps xmm0, xmm2	movaps xmm1, xmm3	unpcklps xmm0, xmm5	unpcklps xmm1, xmm6	unpckhps xmm2, xmm5	unpckhps xmm3, xmm6	movaps xmm5, xmm4	movaps   xmm6, xmm0	unpcklps xmm4, xmm7	unpckhps xmm5, xmm7	movaps   xmm7, xmm1	movlhps  xmm0, xmm2	movaps [esp + nb304_jxH1], xmm0	movhlps  xmm2, xmm6	movaps [esp + nb304_jyH1], xmm2	movlhps  xmm1, xmm3	movaps [esp + nb304_jxH2], xmm1	movhlps  xmm3, xmm7	movaps   xmm6, xmm4	movaps [esp + nb304_jyH2], xmm3	movlhps  xmm4, xmm5	movaps [esp + nb304_jxM], xmm4	movhlps  xmm5, xmm6	movaps [esp + nb304_jyM], xmm5	movss  xmm0, [esi + eax*4 + 20]	movss  xmm1, [esi + eax*4 + 32]	movss  xmm2, [esi + eax*4 + 44]	movss  xmm3, [esi + ecx*4 + 20]	movss  xmm4, [esi + ecx*4 + 32]	movss  xmm5, [esi + ecx*4 + 44]	movhps xmm0, [esi + ebx*4 + 16]	movhps xmm1, [esi + ebx*4 + 28]	movhps xmm2, [esi + ebx*4 + 40]		movhps xmm3, [esi + edx*4 + 16]	movhps xmm4, [esi + edx*4 + 28]	movhps xmm5, [esi + edx*4 + 40]	

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?