nb_kernel303_ppc_altivec.c
来自「最著名最快的分子模拟软件」· C语言 代码 · 共 738 行 · 第 1/2 页
C
738 行
/* -*- mode: c; tab-width: 4; indent-tabs-mode: n; c-basic-offset: 4 -*- * * $Id: nb_kernel303_ppc_altivec.c,v 1.1 2004/12/26 19:26:00 lindahl Exp $ * * This file is part of Gromacs Copyright (c) 1991-2004 * David van der Spoel, Erik Lindahl, University of Groningen. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org * * And Hey: * Gnomes, ROck Monsters And Chili Sauce */#ifdef HAVE_CONFIG_H#include <config.h>#endif/* Must come directly after config.h */#include <gmx_thread.h>#include "ppc_altivec_util.h"#include "nb_kernel303_ppc_altivec.h"void nb_kernel303_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work){ vector float iMx,iMy,iMz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z; vector float dMx,dMy,dMz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z; vector float vfacel,nul; vector float fsM,fsH1,fsH2,tsc,VVcM,FFcM,VVcH1,FFcH1,VVcH2,FFcH2; vector float vctot,qqM,qqH,iqM,iqH,jq; vector float fiMx,fiMy,fiMz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z; vector float tmp1,tmp2,tmp3,tmp4; vector float rinvM,rinvH1,rinvH2,rM,rH1,rH2,rsqM,rsqH1,rsqH2; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner;#ifdef GMX_THREADS int nn0, nn1;#endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); tsc=load_float_and_splat(p_tabscale); iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul); iqM = vec_madd(load_float_and_splat(charge+iinr[0]+3),vfacel,nul); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) {#if 0 } /* maintain correct indentation even with conditional left braces */#endif#else /* without gmx_threads */ for(n=0;n<nri;n++) {#endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_3atoms_shift_and_splat(pos+ii3+3,shiftvec+is3, &iH1x,&iH1y,&iH1z, &iH2x,&iH2y,&iH2z,&iMx,&iMy,&iMz); vctot = nul; fiMx = nul; fiMy = nul; fiMz = nul; fiH1x = nul; fiH1y = nul; fiH1z = nul; fiH2x = nul; fiH2y = nul; fiH2z = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z); dMx = vec_sub(iMx,dH2x); dMy = vec_sub(iMy,dH2y); dMz = vec_sub(iMz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqM = vec_madd(dMx,dMx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMy,dMy,rsqM); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); rM = vec_madd(rsqM,rinvM,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); do_4_ctable_coul(VFtab,vec_madd(rM,tsc,nul),&VVcM,&FFcM); do_4_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_4_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); qqM = vec_madd(iqM,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqM,VVcM,vctot); fsM = vec_nmsub(qqM,FFcM,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsM = vec_madd(fsM,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsM = vec_madd(fsM,rinvM,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiMx = vec_madd(fsM,dMx,fiMx); /* +=fx */ dMx = vec_nmsub(fsM,dMx,nul); /* -fx */ fiMy = vec_madd(fsM,dMy,fiMy); /* +=fy */ dMy = vec_nmsub(fsM,dMy,nul); /* -fy */ fiMz = vec_madd(fsM,dMz,fiMz); /* +=fz */ dMz = vec_nmsub(fsM,dMz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dMx = vec_nmsub(fsH1,dH1x,dMx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dMy = vec_nmsub(fsH1,dH1y,dMy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dMz = vec_nmsub(fsH1,dH1z,dMz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dMx = vec_nmsub(fsH2,dH2x,dMx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dMy = vec_nmsub(fsH2,dH2y,dMy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dMz = vec_nmsub(fsH2,dH2z,dMz); /* -fz */ transpose_3_to_4(dMx,dMy,dMz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z); dMx = vec_sub(iMx,dH2x); dMy = vec_sub(iMy,dH2y); dMz = vec_sub(iMz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqM = vec_madd(dMx,dMx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMy,dMy,rsqM); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_element_in_3_vectors(&rsqM,&rsqH1,&rsqH2); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_element_in_3_vectors(&rinvM,&rinvH1,&rinvH2); rM = vec_madd(rsqM,rinvM,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 3 j charges and multiply by iq */ jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); do_3_ctable_coul(VFtab,vec_madd(rM,tsc,nul),&VVcM,&FFcM); do_3_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_3_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); qqM = vec_madd(iqM,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqM,VVcM,vctot); fsM = vec_nmsub(qqM,FFcM,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsM = vec_madd(fsM,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsM = vec_madd(fsM,rinvM,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiMx = vec_madd(fsM,dMx,fiMx); /* +=fx */ dMx = vec_nmsub(fsM,dMx,nul); /* -fx */ fiMy = vec_madd(fsM,dMy,fiMy); /* +=fy */ dMy = vec_nmsub(fsM,dMy,nul); /* -fy */ fiMz = vec_madd(fsM,dMz,fiMz); /* +=fz */ dMz = vec_nmsub(fsM,dMz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dMx = vec_nmsub(fsH1,dH1x,dMx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dMy = vec_nmsub(fsH1,dH1y,dMy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dMz = vec_nmsub(fsH1,dH1z,dMz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dMx = vec_nmsub(fsH2,dH2x,dMx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dMy = vec_nmsub(fsH2,dH2y,dMy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dMz = vec_nmsub(fsH2,dH2z,dMz); /* -fz */ transpose_4_to_3(dMx,dMy,dMz,nul,&tmp1,&tmp2,&tmp3); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z); dMx = vec_sub(iMx,dH2x); dMy = vec_sub(iMy,dH2y); dMz = vec_sub(iMz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqM = vec_madd(dMx,dMx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMy,dMy,rsqM); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_2_elements_in_3_vectors(&rsqM,&rsqH1,&rsqH2); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2); rM = vec_madd(rsqM,rinvM,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); do_2_ctable_coul(VFtab,vec_madd(rM,tsc,nul),&VVcM,&FFcM); do_2_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_2_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); qqM = vec_madd(iqM,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqM,VVcM,vctot); fsM = vec_nmsub(qqM,FFcM,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsM = vec_madd(fsM,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsM = vec_madd(fsM,rinvM,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiMx = vec_madd(fsM,dMx,fiMx); /* +=fx */ dMx = vec_nmsub(fsM,dMx,nul); /* -fx */ fiMy = vec_madd(fsM,dMy,fiMy); /* +=fy */ dMy = vec_nmsub(fsM,dMy,nul); /* -fy */ fiMz = vec_madd(fsM,dMz,fiMz); /* +=fz */ dMz = vec_nmsub(fsM,dMz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dMx = vec_nmsub(fsH1,dH1x,dMx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dMy = vec_nmsub(fsH1,dH1y,dMy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dMz = vec_nmsub(fsH1,dH1z,dMz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dMx = vec_nmsub(fsH2,dH2x,dMx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dMy = vec_nmsub(fsH2,dH2y,dMy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dMz = vec_nmsub(fsH2,dH2z,dMz); /* -fz */ transpose_3_to_2(dMx,dMy,dMz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); } else if(k<nj1) { jnra = jjnr[k]; j3a = 3*jnra;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?