nb_kernel112_ppc_altivec.c
来自「最著名最快的分子模拟软件」· C语言 代码 · 共 1,557 行 · 第 1/5 页
C
1,557 行
/* -*- mode: c; tab-width: 4; indent-tabs-mode: n; c-basic-offset: 4 -*- * * $Id: nb_kernel112_ppc_altivec.c,v 1.2 2005/03/15 10:45:25 hess Exp $ * * This file is part of Gromacs Copyright (c) 1991-2004 * David van der Spoel, Erik Lindahl, University of Groningen. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org * * And Hey: * Gnomes, ROck Monsters And Chili Sauce */#ifdef HAVE_CONFIG_H#include <config.h>#endif/* Must come directly after config.h */#include <gmx_thread.h>#include "ppc_altivec_util.h"#include "nb_kernel112_ppc_altivec.h"/* NB: This is one of the most common nonbonded functions called, so I tried * to optimize it by doing all register saving/restoring manually. It only gave * 5-10% better performance, so I have not implemented it in the other loops. * (it makes the code horrible to read). However, since it is slightly faster * there is no reason not to keep the optimized code... */void nb_kernel112_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work){ register vector float v0; register vector float v1; register vector float v2; register vector float v3; register vector float v4; register vector float v5; register vector float v6; register vector float v7; register vector float v8; register vector float v9; register vector float v10; register vector float v11; register vector float v12; register vector float v13; register vector float v14; register vector float v15; register vector float v16; register vector float v17; register vector float v18; register vector float v19; register vector float v20; register vector float v21; register vector float v22; register vector float v23; register vector float v24; register vector float v25; register vector float v26; register vector float v27; register vector float v28; register vector float v29; register vector float v30; register vector float v31; union vfloat { float f[4]; vector float v; } stackdata[52]; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner;#ifdef GMX_THREADS int nn0, nn1;#endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; /* set non java mode */ v10 = (vector float)vec_mfvscr(); v11 = (vector float)vec_sl(vec_splat_u32(1),vec_splat_u32(8)); v12 = (vector float)vec_sl((vector unsigned int)v11, vec_splat_u32(8)); v10 = (vector float)vec_or((vector unsigned short)v10, (vector unsigned short)v12); vec_mtvscr((vector unsigned short)v10); v0 = (vector float)vec_splat_u32(0); v0 = vec_ctf((vector unsigned int)v0,0); /* load 0 to v0 */ v1 = vec_lde(0,p_facel); /* load facel float to a vector */ v2 = (vector float) vec_lvsl(0,p_facel); v1 = vec_perm(v1,v1,(vector unsigned char) v2); /* move to elem 0 */ v1 = vec_splat(v1,0); /* splat it to all elem */ ii = iinr[0]; v3 = vec_lde(0,charge+ii); /* load qO float to a vector */ v4 = (vector float) vec_lvsl(0,charge+ii); v3 = vec_perm(v3,v3,(vector unsigned char) v4); /* move to elem 0 */ v3 = vec_splat(v3,0); /* splat it to all elem */ v5 = vec_lde(0,charge+ii+1); /* load qH float to a vector */ v6 = (vector float) vec_lvsl(0,charge+ii+1); v5 = vec_perm(v5,v5,(vector unsigned char) v6); /* move to elem 0 */ v5 = vec_splat(v5,0); /* splat it to all elem */ v4 = vec_madd(v3,v5,v0); /* qqOH */ v3 = vec_madd(v3,v3,v0); /* qqOO */ v5 = vec_madd(v5,v5,v0); /* qqHH */ v4 = vec_madd(v4,v1,v0); /* qqOH * facel */ v3 = vec_madd(v3,v1,v0); /* qqOO * facel */ v5 = vec_madd(v5,v1,v0); /* qqHH * facel */ n = 2*type[ii]; n = (ntype+1)*n; v1 = vec_ld( 0,vdwparam+n); /* c6a c12a - the vdwparam array is at least * 8-byte aligned and n is even here. */ v2 = (vector float) vec_lvsl(0,vdwparam+n); v1 = vec_perm(v1,v1,(vector unsigned char)v2); /* c6 c12 in 0,1 */ v2 = vec_splat(v1,1); /* c12 in all elements */ v1 = vec_splat(v1,0); /* c6 in all elements */ /* store things to stack before starting outer loop */ vec_st(v3, 0, (float *) stackdata); /* qqOO*facel is in stack pos 0 */ vec_st(v4, 16, (float *) stackdata); /* qqOH*facel is in stack pos 1 */ vec_st(v5, 32, (float *) stackdata); /* qqHH*facel is in stack pos 2 */ vec_st(v1, 48, (float *) stackdata); /* c6 is in stack pos 3 */ vec_st(v2, 64, (float *) stackdata); /* c12 is in stack pos 4 */ #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) {#if 0 } /* maintain correct indentation even with conditional left braces */#endif#else /* without gmx_threads */ for(n=0;n<nri;n++) {#endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; /* load shift */ /* load three consecutive shiftvector floats. * We never access the fourth element, * so this is safe even at the end of an array. */ v4 = (vector float)vec_lvsl(0, shiftvec+is3); v1 = vec_lde(0, shiftvec+is3); v2 = vec_lde(4, shiftvec+is3); v3 = vec_lde(8, shiftvec+is3); /* Load shX,shY,shZ to elem 0 of v1,v2,v3 */ v1 = vec_perm(v1,v1,(vector unsigned char)v4); v2 = vec_perm(v2,v2,(vector unsigned char)v4); v3 = vec_perm(v3,v3,(vector unsigned char)v4); v2 = vec_sld(v2,v2,4); v3 = vec_sld(v3,v3,8); v1 = vec_mergeh(v1,v3); v1 = vec_mergeh(v1,v2); /* [ shX shY shZ - ] */ /* load i coordinates */ v2 = (vector float)vec_lvsl(0, pos+ii3); /* load 3atoms coords into three vectors. * We do not yet know how it is aligned. */ v3 = vec_ld(0, pos+ii3); v4 = vec_ld(16, pos+ii3); v5 = vec_ld(32, pos+ii3); v6 = vec_sld(v1,v1,12); /* - shX shY shZ */ v7 = vec_sld(v6,v1,4); /* shX shY shZ shX */ v8 = vec_sld(v6,v1,8); /* shY shZ shX shY */ v9 = vec_sld(v6,v1,12); /* shZ shX shY shZ */ /* v3 = Ox Oy Oz H1x */ v3 = vec_perm(v3,v4,(vector unsigned char)v2); /* v4 = H1y H1z H2x H2y */ v4 = vec_perm(v4,v5,(vector unsigned char)v2); /* v5 = H2z - - - */ v5 = vec_perm(v5,v5,(vector unsigned char)v2); v3 = vec_add(v3,v7); v4 = vec_add(v4,v8); v5 = vec_add(v5,v9); v6 = vec_splat(v3,0); /* Ox Ox Ox Ox */ v7 = vec_splat(v3,1); /* Oy Oy Oy Oy */ v8 = vec_splat(v3,2); /* Oz Oz Oz Oz */ v9 = vec_splat(v3,3); /* H1x H1x H1x H1x */ v10 = vec_splat(v4,0); /* H1y H1y H1y H1y */ v11 = vec_splat(v4,1); /* H1z H1z H1z H1z */ v12 = vec_splat(v4,2); /* H2x H2x H2x H2x */ v13 = vec_splat(v4,3); /* H2y H2y H2y H2y */ v14 = vec_splat(v5,0); /* H2z H2z H2z H2z */ /* Store i 3atoms coordinates to stack */ vec_st(v6, 80, (float *)stackdata); /* i Ox is in stack pos 5 */ vec_st(v7, 96, (float *)stackdata); /* i Oy is in stack pos 6 */ vec_st(v8, 112, (float *)stackdata); /* i Oz is in stack pos 7 */ vec_st(v9, 128, (float *)stackdata); /* i H1x is in stack pos 8 */ vec_st(v10,144, (float *)stackdata); /* i H1y is in stack pos 9 */ vec_st(v11,160, (float *)stackdata); /* i H1z is in stack pos 10 */ vec_st(v12,176, (float *)stackdata); /* i H2x is in stack pos 11 */ vec_st(v13,192, (float *)stackdata); /* i H2y is in stack pos 12 */ vec_st(v14,208, (float *)stackdata); /* i H2z is in stack pos 13 */ nj0 = jindex[n]; nj1 = jindex[n+1]; /* vec_dst( jjnr + nj1, 0x10010100, 0 ); */ /* zero vctot, in stack pos 14 */ vec_st(v0, 224, (float *)stackdata); /* zero vctot, in stack pos 15 */ vec_st(v0, 240, (float *)stackdata); /* zero fiOx, in stack pos 16 */ vec_st(v0, 256, (float *)stackdata); /* zero fiOy, in stack pos 17 */ vec_st(v0, 272, (float *)stackdata); /* zero fiOz, in stack pos 18 */ vec_st(v0, 288, (float *)stackdata); /* zero fiH1x, in stack pos 19 */ vec_st(v0, 304, (float *)stackdata); /* zero fiH1y, in stack pos 20 */ vec_st(v0, 320, (float *)stackdata); /* zero fiH1z, in stack pos 21 */ vec_st(v0, 336, (float *)stackdata); /* zero fiH2x, in stack pos 22 */ vec_st(v0, 352, (float *)stackdata); /* zero fiH2y, in stack pos 23 */ vec_st(v0, 368, (float *)stackdata); /* zero fiH2z, in stack pos 24 */ vec_st(v0, 384, (float *)stackdata); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; /* vec_dst( jjnr + k + 4, 0x02020020, 0 ); */ j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; /* vec_dst( pos+j3a, 0x10010100, 1 ); */ v1 = (vector float)vec_lvsl(0, pos+j3a); v8 = (vector float)vec_lvsl(0, pos+j3b); v15 = (vector float)vec_lvsl(0, pos+j3c); v22 = (vector float)vec_lvsl(0, pos+j3d); v2 = vec_ld(0, pos+j3a); v9 = vec_ld(0, pos+j3b); v16 = vec_ld(0, pos+j3c); v23 = vec_ld(0, pos+j3d); v3 = vec_ld(16, pos+j3a); v10 = vec_ld(16, pos+j3b); v17 = vec_ld(16, pos+j3c); v24 = vec_ld(16, pos+j3d); v4 = vec_ld(32, pos+j3a); v11 = vec_ld(32, pos+j3b); v18 = vec_ld(32, pos+j3c);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?