nb_kernel410_ia32_sse.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,051 行 · 第 1/4 页

S
2,051
字号
;#
;# $Id: nb_kernel410_ia32_sse.intel_syntax.s,v 1.1.2.1 2006/03/01 15:18:31 lindahl Exp $
;#
;# Gromacs 4.0                         Copyright (c) 1991-2003
;# David van der Spoel, Erik Lindahl
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU General Public License
;# as published by the Free Software Foundation; either version 2
;# of the License, or (at your option) any later version.
;#
;# To help us fund GROMACS development, we humbly ask that you cite
;# the research papers on the package. Check out http://www.gromacs.org
;#
;# And Hey:
;# Gnomes, ROck Monsters And Chili Sauce
;#
;# These files require GNU binutils 2.10 or later, since we
;# use intel syntax for portability, or a recent version
;# of NASM that understands Extended 3DNow and SSE2 instructions.
;# (NASM is normally only used with MS Visual C++).
;# Since NASM and gnu as disagree on some definitions and use
;# completely different preprocessing options I have to introduce a
;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
;# reason why all comments need both symbols...
;# The source is written for GNU as, with intel syntax. When you use
;# NASM we redefine a couple of things. 
The false if-statement around ;# the following code is seen by GNU as, but NASM doesn't see it, so ;# the code inside is read by NASM but not gcc.; .if 0    # block below only read by NASM%define .section	section%define .long		dd%define .align		align%define .globl		global;# NASM only wants 'dword', not 'dword ptr'.%define ptr.equiv          .equiv                  2   %1 equ %2%endmacro; .endif                   # End of NASM-specific block; .intel_syntax noprefix   # Line only read by gnu as.globl nb_kernel410_ia32_sse.globl _nb_kernel410_ia32_ssenb_kernel410_ia32_sse:	_nb_kernel410_ia32_sse:	.equiv          nb410_p_nri,            8.equiv          nb410_iinr,             12.equiv          nb410_jindex,           16.equiv          nb410_jjnr,             20.equiv          nb410_shift,            24.equiv          nb410_shiftvec,         28.equiv          nb410_fshift,           32.equiv          nb410_gid,              36.equiv          nb410_pos,              40.equiv          nb410_faction,          44.equiv          nb410_charge,           48.equiv          nb410_p_facel,          52.equiv          nb410_argkrf,           56.equiv          nb410_argcrf,           60.equiv          nb410_Vc,               64.equiv          nb410_type,             68.equiv          nb410_p_ntype,          72.equiv          nb410_vdwparam,         76.equiv          nb410_Vvdw,             80.equiv          nb410_p_tabscale,       84.equiv          nb410_VFtab,            88.equiv          nb410_invsqrta,         92.equiv          nb410_dvda,             96.equiv          nb410_p_gbtabscale,     100.equiv          nb410_GBtab,            104.equiv          nb410_p_nthreads,       108.equiv          nb410_count,            112.equiv          nb410_mtx,              116.equiv          nb410_outeriter,        120.equiv          nb410_inneriter,        124.equiv          nb410_work,             128	;# stack offsets for local variables  	;# bottom of stack is cache-aligned for sse use 
.equiv          nb410_ix,               0.equiv          nb410_iy,               16.equiv          nb410_iz,               32.equiv          nb410_iq,               48.equiv          nb410_dx,               64.equiv          nb410_dy,               80.equiv          nb410_dz,               96.equiv          nb410_two,              112.equiv          nb410_six,              128.equiv          nb410_twelve,           144.equiv          nb410_gbtsc,            160.equiv          nb410_qq,               176.equiv          nb410_c6,               192.equiv          nb410_c12,              208.equiv          nb410_fscal,            224.equiv          nb410_vctot,            240.equiv          nb410_Vvdwtot,          256.equiv          nb410_fix,              272.equiv          nb410_fiy,              288.equiv          nb410_fiz,              304.equiv          nb410_half,             320.equiv          nb410_three,            336.equiv          nb410_r,                352.equiv          nb410_isai,             368.equiv          nb410_isaprod,          384.equiv          nb410_dvdasum,          400.equiv          nb410_gbscale,          416.equiv          nb410_is3,              432.equiv          nb410_ii3,              436.equiv          nb410_ii,               440.equiv          nb410_ntia,             444.equiv          nb410_innerjjnr,        448.equiv          nb410_innerk,           452.equiv          nb410_n,                456.equiv          nb410_nn1,              460.equiv          nb410_jnra,             464.equiv          nb410_jnrb,             468.equiv          nb410_jnrc,             472.equiv          nb410_jnrd,             476.equiv          nb410_nri,              480.equiv          nb410_facel,            484.equiv          nb410_ntype,            488.equiv          nb410_nouter,           492.equiv          nb410_ninner,           496.equiv          nb410_salign,           500	push ebp	mov ebp,esp	    	push eax    	push ebx    	push ecx    	push 
edx	push esi	push edi	sub esp, 504		;# local stack space 	mov  eax, esp	and  eax, 0xf	sub esp, eax	mov [esp + nb410_salign], eax	emms	;# Move args passed by reference to stack	mov ecx, [ebp + nb410_p_nri]	mov esi, [ebp + nb410_p_facel]	mov edi, [ebp + nb410_p_ntype]	mov ecx, [ecx]	mov esi, [esi]	mov edi, [edi]	mov [esp + nb410_nri], ecx	mov [esp + nb410_facel], esi	mov [esp + nb410_ntype], edi	;# zero iteration counters	mov eax, 0	mov [esp + nb410_nouter], eax	mov [esp + nb410_ninner], eax	mov eax, [ebp + nb410_p_gbtabscale]	movss xmm5, [eax]		shufps xmm5, xmm5, 0	movaps [esp + nb410_gbtsc], xmm5	;# create constant floating-point factors on stack	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)	mov [esp + nb410_half], eax	movss xmm1, [esp + nb410_half]	shufps xmm1, xmm1, 0    ;# splat to all elements	movaps xmm2, xmm1       	addps  xmm2, xmm2	;# constant 1.0	movaps xmm3, xmm2	addps  xmm2, xmm2	;# constant 2.0	addps  xmm3, xmm2	;# constant 3.0	movaps xmm4, xmm3	addps  xmm4, xmm4	;# 6.0	movaps xmm5, xmm4	addps  xmm5, xmm5	;# constant 12.0	movaps [esp + nb410_half],  xmm1	movaps [esp + nb410_two],  xmm2	movaps [esp + nb410_three],  xmm3	movaps [esp + nb410_six],  xmm4	movaps [esp + nb410_twelve],  xmm5.nb410_threadloop:        mov   esi, [ebp + nb410_count]          ;# pointer to sync counter        mov   eax, [esi].nb410_spinlock:        mov   ebx, eax                          ;# ebx=*count=nn0        add   ebx, 1                           ;# ebx=nn1=nn0+10        lock        cmpxchg [esi], ebx                      ;# write nn1 to *counter,                                                ;# if it hasnt changed.                                                ;# or reread *counter to eax.        
pause                                   ;# -> better p4 performance        jnz .nb410_spinlock        ;# if(nn1>nri) nn1=nri        mov ecx, [esp + nb410_nri]        mov edx, ecx        sub ecx, ebx        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri        ;# Cleared the spinlock if we got here.        ;# eax contains nn0, ebx contains nn1.        mov [esp + nb410_n], eax        mov [esp + nb410_nn1], ebx        sub ebx, eax                            ;# calc number of outer lists	mov esi, eax				;# copy n to esi        jg  .nb410_outerstart        jmp .nb410_end.nb410_outerstart:	;# ebx contains number of outer iterations	add ebx, [esp + nb410_nouter]	mov [esp + nb410_nouter], ebx.nb410_outer:	mov   eax, [ebp + nb410_shift]      ;# eax = pointer into shift[] 	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 		lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 	mov   [esp + nb410_is3],ebx    	;# store is3 	mov   eax, [ebp + nb410_shiftvec]   ;# eax = base of shiftvec[] 	movss xmm0, [eax + ebx*4]	movss xmm1, [eax + ebx*4 + 4]	movss xmm2, [eax + ebx*4 + 8] 	mov   ecx, [ebp + nb410_iinr]       ;# ecx = pointer into iinr[] 		mov   ebx, [ecx + esi*4]	    ;# ebx =ii 	mov   [esp + nb410_ii], ebx	mov   edx, [ebp + nb410_charge]	movss xmm3, [edx + ebx*4]		mulss xmm3, [esp + nb410_facel]	shufps xmm3, xmm3, 0	mov   edx, [ebp + nb410_invsqrta]	;# load invsqrta[ii]	movss xmm4, [edx + ebx*4]	shufps xmm4, xmm4, 0    	mov   edx, [ebp + nb410_type]     	mov   edx, [edx + ebx*4]    	imul  edx, [esp + nb410_ntype]    	shl   edx, 1    	mov   [esp + nb410_ntia], edx			lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 	mov   eax, [ebp + nb410_pos]    ;# eax = base of pos[]  	addss xmm0, [eax + ebx*4]	addss xmm1, [eax + ebx*4 + 4]	addss xmm2, [eax + ebx*4 + 8]	movaps [esp + nb410_iq], xmm3	movaps [esp + nb410_isai], xmm4		shufps xmm0, xmm0, 0	shufps xmm1, xmm1, 0	shufps xmm2, xmm2, 0	movaps [esp + nb410_ix], xmm0	movaps [esp + nb410_iy], xmm1	movaps [esp + nb410_iz], xmm2	mov   [esp + 
nb410_ii3], ebx		;# clear vctot and i forces 	xorps xmm4, xmm4	movaps [esp + nb410_vctot], xmm4	movaps [esp + nb410_Vvdwtot], xmm4	movaps [esp + nb410_dvdasum], xmm4	movaps [esp + nb410_fix], xmm4	movaps [esp + nb410_fiy], xmm4	movaps [esp + nb410_fiz], xmm4		mov   eax, [ebp + nb410_jindex]	mov   ecx, [eax + esi*4]	     ;# jindex[n] 	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 	sub   edx, ecx               ;# number of innerloop atoms 	mov   esi, [ebp + nb410_pos]	mov   edi, [ebp + nb410_faction]		mov   eax, [ebp + nb410_jjnr]	shl   ecx, 2	add   eax, ecx	mov   [esp + nb410_innerjjnr], eax     ;# pointer to jjnr[nj0] 	mov   ecx, edx	sub   edx,  4	add   ecx, [esp + nb410_ninner]	mov   [esp + nb410_ninner], ecx	add   edx, 0	mov   [esp + nb410_innerk], edx    ;# number of innerloop atoms 	jge   .nb410_unroll_loop	jmp   .nb410_finish_inner.nb410_unroll_loop:		;# quad-unroll innerloop here 	mov   edx, [esp + nb410_innerjjnr]     ;# pointer to jjnr[k] 	mov   eax, [edx]		mov   ebx, [edx + 4]              	mov   ecx, [edx + 8]            	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 	add dword ptr [esp + nb410_innerjjnr],  16 ;# advance pointer (unrolled 4) 	;# load isaj	mov esi, [ebp + nb410_invsqrta]	movss xmm3, [esi + eax*4]	movss xmm4, [esi + ecx*4]	movss xmm6, [esi + ebx*4]	movss xmm7, [esi + edx*4]	movaps xmm2, [esp + nb410_isai]	shufps xmm3, xmm6, 0 	shufps xmm4, xmm7, 0 	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all isaj in xmm3	mulps  xmm2, xmm3			movaps [esp + nb410_isaprod], xmm2	movaps xmm1, xmm2	mulps xmm1, [esp + nb410_gbtsc]	movaps [esp + nb410_gbscale], xmm1		mov esi, [ebp + nb410_charge]    ;# base of charge[] 		movss xmm3, [esi + eax*4]	movss xmm4, [esi + ecx*4]	movss xmm6, [esi + ebx*4]	movss xmm7, [esi + edx*4]	mulps xmm2, [esp + nb410_iq]	shufps xmm3, xmm6, 0 	shufps xmm4, xmm7, 0 	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  	mulps  xmm3, xmm2	movaps [esp + nb410_qq], xmm3		movd mm0, eax	movd mm1, ebx	movd mm2, ecx	
movd mm3, edx		mov esi, [ebp + nb410_type]	mov eax, [esi + eax*4]	mov ebx, [esi + ebx*4]	mov ecx, [esi + ecx*4]	mov edx, [esi + edx*4]	mov esi, [ebp + nb410_vdwparam]	shl eax, 1		shl ebx, 1		shl ecx, 1		shl edx, 1		mov edi, [esp + nb410_ntia]	add eax, edi	add ebx, edi	add ecx, edi	add edx, edi	movlps xmm6, [esi + eax*4]	movlps xmm7, [esi + ecx*4]	movhps xmm6, [esi + ebx*4]	movhps xmm7, [esi + edx*4]	movaps xmm4, xmm6	shufps xmm4, xmm7, 136  ;# constant 10001000	shufps xmm6, xmm7, 221  ;# constant 11011101		movd  eax, mm0			movd  ebx, mm1	movd  ecx, mm2	movd  edx, mm3	movaps [esp + nb410_c6], xmm4	movaps [esp + nb410_c12], xmm6		mov esi, [ebp + nb410_pos]       ;# base of pos[] 	mov [esp + nb410_jnra], eax	mov [esp + nb410_jnrb], ebx	mov [esp + nb410_jnrc], ecx	mov [esp + nb410_jnrd], edx	lea   eax, [eax + eax*2]     ;# replace jnr with j3 	lea   ebx, [ebx + ebx*2]		lea   ecx, [ecx + ecx*2]     ;# replace jnr with j3 	lea   edx, [edx + edx*2]		;# move four coordinates to xmm0-xmm2 		movlps xmm4, [esi + eax*4]	movlps xmm5, [esi + ecx*4]	movss xmm2, [esi + eax*4 + 8]	movss xmm6, [esi + ecx*4 + 8]	movhps xmm4, [esi + ebx*4]	movhps xmm5, [esi + edx*4]	movss xmm0, [esi + ebx*4 + 8]	movss xmm1, [esi + edx*4 + 8]	shufps xmm2, xmm0, 0	shufps xmm6, xmm1, 0		movaps xmm0, xmm4	movaps xmm1, xmm4	shufps xmm2, xmm6, 136  ;# constant 10001000		shufps xmm0, xmm5, 136  ;# constant 10001000	shufps xmm1, xmm5, 221  ;# constant 11011101			;# move ix-iz to xmm4-xmm6 	movaps xmm4, [esp + nb410_ix]	movaps xmm5, [esp + nb410_iy]	movaps xmm6, [esp + nb410_iz]	;# calc dr 	subps xmm4, xmm0	subps xmm5, xmm1	subps xmm6, xmm2	;# store dr 	movaps [esp + nb410_dx], xmm4	movaps [esp + nb410_dy], xmm5	movaps [esp + nb410_dz], xmm6	;# square it 	mulps xmm4,xmm4	mulps xmm5,xmm5	mulps xmm6,xmm6	addps xmm4, xmm5	addps xmm4, xmm6	;# rsq in xmm4 	rsqrtps xmm5, xmm4	;# lookup seed in xmm5 	movaps xmm2, xmm5	mulps xmm5, xmm5	movaps xmm1, [esp + nb410_three]	mulps xmm5, xmm4	;# rsq*lu*lu 				movaps xmm0, 
[esp + nb410_half]	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 	mulps xmm1, xmm2	mulps xmm0, xmm1	;# xmm0=rinv 	mulps xmm4, xmm0	;# xmm4=r 	movaps [esp + nb410_r], xmm4	mulps xmm4, [esp + nb410_gbscale]	movhlps xmm5, xmm4	cvttps2pi mm6, xmm4	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 	cvtpi2ps xmm6, mm6	cvtpi2ps xmm5, mm7	movlhps xmm6, xmm5	subps xmm4, xmm6		movaps xmm1, xmm4	;# xmm1=eps 	movaps xmm2, xmm1		mulps  xmm2, xmm2	;# xmm2=eps2 	pslld mm6, 2	pslld mm7, 2	movd mm0, eax		movd mm1, ebx	movd mm2, ecx	movd mm3, edx	mov  esi, [ebp + nb410_GBtab]	movd eax, mm6	psrlq mm6, 32	movd ecx, mm7	psrlq mm7, 32	movd ebx, mm6	movd edx, mm7	;# load coulomb table	movaps xmm4, [esi + eax*4]	movaps xmm5, [esi + ebx*4]	movaps xmm6, [esi + ecx*4]	movaps xmm7, [esi + edx*4]	;# transpose, using xmm3 for scratch	movaps xmm3, xmm6	shufps xmm3, xmm7, 0xEE 	shufps xmm6, xmm7, 0x44	movaps xmm7, xmm4	shufps xmm7, xmm5, 0xEE	shufps xmm4, xmm5, 0x44	movaps xmm5, xmm4	shufps xmm5, xmm6, 0xDD	shufps xmm4, xmm6, 0x88	movaps xmm6, xmm7	shufps xmm6, xmm3, 0x88	shufps xmm7, xmm3, 0xDD	;# coulomb table ready, in xmm4-xmm7  			mulps  xmm6, xmm1	;# xmm6=Geps 	mulps  xmm7, xmm2	;# xmm7=Heps2 	addps  xmm5, xmm6	addps  xmm5, xmm7	;# xmm5=Fp 		mulps  xmm7, [esp + nb410_two]	;# two*Heps2 	movaps xmm3, [esp + nb410_qq]	addps  xmm7, xmm6	addps  xmm7, xmm5 ;# xmm7=FF 	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 	addps  xmm5, xmm4 ;# xmm5=VV 	mulps  xmm5, xmm3 ;# vcoul=qq*VV  	mulps  xmm3, xmm7 ;# fijC=FF*qq	;# get jnr from stack	mov eax, [esp + nb410_jnra]	mov ebx, [esp + nb410_jnrb]	mov ecx, [esp + nb410_jnrc]

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?