nb_kernel112_ppc_altivec.c

来自「最著名最快的分子模拟软件」· C语言 代码 · 共 1,557 行 · 第 1/5 页

C
1,557
字号
/* -*- mode: c; tab-width: 4; indent-tabs-mode: n; c-basic-offset: 4 -*-
 *
 * $Id: nb_kernel112_ppc_altivec.c,v 1.2 2005/03/15 10:45:25 hess Exp $
 *
 * This file is part of Gromacs        Copyright (c) 1991-2004
 * David van der Spoel, Erik Lindahl, University of Groningen.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* Must come directly after config.h */
#include <gmx_thread.h>

#include "ppc_altivec_util.h"
#include "nb_kernel112_ppc_altivec.h"

/* NB: This is one of the most common nonbonded functions called, so I tried
 * to optimize it by doing all register saving/restoring manually. It only gave
 * 5-10% better performance, so I have not implemented it in the other loops.
 * (it makes the code horrible to read). However, since it is slightly faster
 * there is no reason not to keep the optimized code...
*/void nb_kernel112_ppc_altivec  (int *             p_nri,                       int               iinr[],                       int               jindex[],                       int               jjnr[],                       int               shift[],                       float             shiftvec[],                       float             fshift[],                       int               gid[],                       float             pos[],                       float             faction[],                       float             charge[],                       float *           p_facel,                       float *           p_krf,                       float *           p_crf,                       float             Vc[],                       int               type[],                       int *             p_ntype,                       float             vdwparam[],                       float             Vvdw[],                       float *           p_tabscale,                       float             VFtab[],                       float             invsqrta[],                       float             dvda[],                       float *           p_gbtabscale,                       float             GBtab[],                       int *             p_nthreads,                       int *             count,                       void *            mtx,                       int *             outeriter,                       int *             inneriter,					   float *           work){	register vector float v0;	register vector float v1;	register vector float v2;	register vector float v3;	register vector float v4;	register vector float v5;	register vector float v6;	register vector float v7;	register vector float v8;	register vector float v9;	register vector float v10;	register vector float v11;	register vector float v12;	register vector float v13;	register vector float v14;	register vector float v15;	register vector float v16;	register vector float v17;	
register vector float v18;	register vector float v19;	register vector float v20;	register vector float v21;	register vector float v22;	register vector float v23;	register vector float v24;	register vector float v25;	register vector float v26;	register vector float v27;	register vector float v28;	register vector float v29;	register vector float v30;	register vector float v31;	union vfloat {		float f[4];		vector float v;	} stackdata[52];  	int n,k,ii,is3,ii3,nj0,nj1;	int jnra,jnrb,jnrc,jnrd;	int j3a,j3b,j3c,j3d;	int nri, ntype, nouter, ninner;#ifdef GMX_THREADS	int nn0, nn1;#endif    nouter   = 0;    ninner   = 0;    nri      = *p_nri;    ntype    = *p_ntype;	/* set non java mode */	v10       = (vector float)vec_mfvscr();	v11       = (vector float)vec_sl(vec_splat_u32(1),vec_splat_u32(8));	v12       = (vector float)vec_sl((vector unsigned int)v11,									 vec_splat_u32(8));	v10       = (vector float)vec_or((vector unsigned short)v10,									 (vector unsigned short)v12);	vec_mtvscr((vector unsigned short)v10);	v0        = (vector float)vec_splat_u32(0);	v0        = vec_ctf((vector unsigned int)v0,0);     /* load 0 to v0 */	v1        = vec_lde(0,p_facel); /* load facel float to a vector */	v2        = (vector float) vec_lvsl(0,p_facel); 	v1        = vec_perm(v1,v1,(vector unsigned char) v2); /* move to elem 0 */	v1        = vec_splat(v1,0); /* splat it to all elem */  		ii        = iinr[0];  	v3        = vec_lde(0,charge+ii); /* load qO float to a vector */	v4        = (vector float) vec_lvsl(0,charge+ii); 	v3        = vec_perm(v3,v3,(vector unsigned char) v4); /* move to elem 0 */	v3        = vec_splat(v3,0); /* splat it to all elem */	v5        = vec_lde(0,charge+ii+1); /* load qH float to a vector */	v6        = (vector float) vec_lvsl(0,charge+ii+1); 	v5        = vec_perm(v5,v5,(vector unsigned char) v6); /* move to elem 0 */	v5        = vec_splat(v5,0); /* splat it to all elem */	v4        = vec_madd(v3,v5,v0); /* qqOH */	v3        = vec_madd(v3,v3,v0); /* qqOO 
*/	v5        = vec_madd(v5,v5,v0); /* qqHH */	v4        = vec_madd(v4,v1,v0); /* qqOH * facel */	v3        = vec_madd(v3,v1,v0); /* qqOO * facel */	v5        = vec_madd(v5,v1,v0); /* qqHH * facel */	n         = 2*type[ii];	n        = (ntype+1)*n;  	v1        = vec_ld( 0,vdwparam+n);  /* c6a c12a - the vdwparam array is at least									 * 8-byte aligned and n is even here.									 */	v2        = (vector float) vec_lvsl(0,vdwparam+n);	v1        = vec_perm(v1,v1,(vector unsigned char)v2); /* c6 c12 in 0,1 */	v2        = vec_splat(v1,1);  /* c12 in all elements */	v1        = vec_splat(v1,0);  /* c6 in all elements */	/* store things to stack before starting outer loop */	vec_st(v3,  0, (float *) stackdata); /* qqOO*facel is in stack pos 0 */	vec_st(v4, 16, (float *) stackdata); /* qqOH*facel is in stack pos 1 */	vec_st(v5, 32, (float *) stackdata); /* qqHH*facel is in stack pos 2 */	vec_st(v1, 48, (float *) stackdata); /* c6 is in stack pos 3  */	vec_st(v2, 64, (float *) stackdata); /* c12 is in stack pos 4 */  #ifdef GMX_THREADS    nthreads = *p_nthreads;	do {		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);		nn0              = *count;		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;		*count           = nn1;		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);		if(nn1>nri) nn1=nri;		for(n=nn0; (n<nn1); n++) {#if 0		} /* maintain correct indentation even with conditional left braces */#endif#else /* without gmx_threads */		for(n=0;n<nri;n++) {#endif  			is3        = 3*shift[n];			ii         = iinr[n];			ii3        = 3*ii;			/* load shift */			/* load three consecutive shiftvector floats.              * We never access the fourth element,			 * so this is safe even at the end of an array. 			 
*/			v4         = (vector float)vec_lvsl(0, shiftvec+is3);			v1         = vec_lde(0, shiftvec+is3);			v2         = vec_lde(4, shiftvec+is3);			v3         = vec_lde(8, shiftvec+is3);			/* Load shX,shY,shZ to elem 0 of v1,v2,v3 */			v1         = vec_perm(v1,v1,(vector unsigned char)v4); 			v2         = vec_perm(v2,v2,(vector unsigned char)v4); 			v3         = vec_perm(v3,v3,(vector unsigned char)v4);			v2         = vec_sld(v2,v2,4);			v3         = vec_sld(v3,v3,8);			v1         = vec_mergeh(v1,v3);			v1         = vec_mergeh(v1,v2);  /* [ shX shY shZ - ] */			/* load i coordinates */			v2         = (vector float)vec_lvsl(0, pos+ii3);			/* load 3atoms coords into three vectors.              * We do not yet know how it is aligned.              */			v3         = vec_ld(0, pos+ii3); 			v4         = vec_ld(16, pos+ii3);			v5         = vec_ld(32, pos+ii3);			v6         = vec_sld(v1,v1,12); /*  - shX shY shZ   */			v7         = vec_sld(v6,v1,4);  /*  shX shY shZ shX */			v8         = vec_sld(v6,v1,8);  /*  shY shZ shX shY */			v9         = vec_sld(v6,v1,12); /*  shZ shX shY shZ */			/* v3 = Ox  Oy  Oz H1x */			v3         = vec_perm(v3,v4,(vector unsigned char)v2);			/* v4 = H1y H1z H2x H2y */			v4         = vec_perm(v4,v5,(vector unsigned char)v2); 			/* v5 = H2z   -   -   - */			v5         = vec_perm(v5,v5,(vector unsigned char)v2); 			v3         = vec_add(v3,v7);			v4         = vec_add(v4,v8);			v5         = vec_add(v5,v9);			v6         = vec_splat(v3,0);  /* Ox Ox Ox Ox */			v7         = vec_splat(v3,1);  /* Oy Oy Oy Oy */			v8         = vec_splat(v3,2);  /* Oz Oz Oz Oz */			v9         = vec_splat(v3,3);  /* H1x H1x H1x H1x */			v10        = vec_splat(v4,0);  /* H1y H1y H1y H1y */			v11        = vec_splat(v4,1);  /* H1z H1z H1z H1z */			v12        = vec_splat(v4,2);  /* H2x H2x H2x H2x */			v13        = vec_splat(v4,3);  /* H2y H2y H2y H2y */			v14        = vec_splat(v5,0);  /* H2z H2z H2z H2z */			/* Store i 3atoms coordinates to stack */			vec_st(v6,  80, (float 
*)stackdata); /* i Ox is in stack pos 5 */			vec_st(v7,  96, (float *)stackdata); /* i Oy is in stack pos 6 */			vec_st(v8, 112, (float *)stackdata); /* i Oz is in stack pos 7 */			vec_st(v9, 128, (float *)stackdata); /* i H1x is in stack pos 8 */			vec_st(v10,144, (float *)stackdata); /* i H1y is in stack pos 9 */			vec_st(v11,160, (float *)stackdata); /* i H1z is in stack pos 10 */			vec_st(v12,176, (float *)stackdata); /* i H2x is in stack pos 11 */			vec_st(v13,192, (float *)stackdata); /* i H2y is in stack pos 12 */			vec_st(v14,208, (float *)stackdata); /* i H2z is in stack pos 13 */			nj0        = jindex[n];			nj1        = jindex[n+1];			/*			vec_dst( jjnr + nj1, 0x10010100, 0 ); */			/* zero vctot, in stack pos 14 */			vec_st(v0, 224, (float *)stackdata); 			/* zero vctot, in stack pos 15 */			vec_st(v0, 240, (float *)stackdata); 			/* zero fiOx, in stack pos 16 */			vec_st(v0, 256, (float *)stackdata); 			/* zero fiOy, in stack pos 17 */			vec_st(v0, 272, (float *)stackdata); 			/* zero fiOz, in stack pos 18 */			vec_st(v0, 288, (float *)stackdata); 			/* zero fiH1x, in stack pos 19 */			vec_st(v0, 304, (float *)stackdata); 			/* zero fiH1y, in stack pos 20 */			vec_st(v0, 320, (float *)stackdata); 			/* zero fiH1z, in stack pos 21 */			vec_st(v0, 336, (float *)stackdata); 			/* zero fiH2x, in stack pos 22 */			vec_st(v0, 352, (float *)stackdata); 			/* zero fiH2y, in stack pos 23 */			vec_st(v0, 368, (float *)stackdata); 			/* zero fiH2z, in stack pos 24 */			vec_st(v0, 384, (float *)stackdata); 			for(k=nj0; k<(nj1-3); k+=4) { 				jnra            = jjnr[k];				jnrb            = jjnr[k+1];				jnrc            = jjnr[k+2];				jnrd            = jjnr[k+3];				/*				vec_dst( jjnr + k + 4, 0x02020020, 0 ); */				j3a             = 3*jnra;				j3b             = 3*jnrb;				j3c             = 3*jnrc;				j3d             = 3*jnrd;				/*				vec_dst( pos+j3a, 0x10010100, 1 ); */				v1              = (vector float)vec_lvsl(0, pos+j3a);				v8              = (vector 
float)vec_lvsl(0, pos+j3b);				v15             = (vector float)vec_lvsl(0, pos+j3c);				v22             = (vector float)vec_lvsl(0, pos+j3d);				v2              = vec_ld(0, pos+j3a);				v9              = vec_ld(0, pos+j3b);				v16             = vec_ld(0, pos+j3c);				v23             = vec_ld(0, pos+j3d);				v3              = vec_ld(16, pos+j3a);				v10             = vec_ld(16, pos+j3b);				v17             = vec_ld(16, pos+j3c);				v24             = vec_ld(16, pos+j3d);				v4              = vec_ld(32, pos+j3a);				v11             = vec_ld(32, pos+j3b);				v18             = vec_ld(32, pos+j3c);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?