tfc.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 496 行

C
496
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 1997 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include <stdio.h>#include <stdlib.h>#include <assert.h>#include <math.h>#include <limits.h>#include "atlas_misc.h"#include "atlas_lvl3.h"#include "atlas_f77.h"#include Mstr(Mjoin(Mjoin(atlas_,PRE),sysinfo.h))#define dumb_seed(iseed_) srand(iseed_)#ifndef RAND_MAX  /* rather dangerous non-ansi workaround */   #define RAND_MAX ((unsigned long)(1<<30))#endif#define dumb_rand() ( 0.5 - ((double)rand())/((double)RAND_MAX) )#define nshape 5enum ATLAS_MATSHAPE {AtlasM_NB=0, AtlasN_NB=1, AtlasMN_NB=2, AtlasK_NB=3,                     Atlas0_NB=4, AtlasMN_REST};___main(){}__main(){}MAIN__(){}_MAIN_(){}double time00();#define DENMAT 175#define MAXALLOC (8*1024*1024*8)#ifdef ATL_DeclareSlensF77_INTEGER ATL_Slen1, ATL_Slen2;#endifvoid Mjoin(PATL,small_mm)   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,    const int M, const int N, const int K, const SCALAR alpha,    const TYPE *A, const int lda, const TYPE *B, const int ldb,    const SCALAR beta, TYPE *C, const int ldc);void Mjoin(PATL,big_mm)   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,    const int M, const int N, const int K, const SCALAR alpha,    const TYPE *A, const int lda, const TYPE *B, const int ldb,    const SCALAR beta, TYPE *C, const int ldc);void Mjoin(PATL,dbig_mm)   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,    const int M, const int N, const int K, const SCALAR alpha,    const TYPE *A, const int lda, const TYPE *B, const int ldb,    const SCALAR beta, TYPE *C, const int ldc);void Mjoin(PATL,sbig_mm)   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,    const int M, const int N, const int K, const SCALAR alpha,    const TYPE *A, const int lda, const TYPE *B, const int ldb,    const SCALAR beta, TYPE *C, const int ldc);typedef void (*GEMMPTR)   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,    const int M, const int N, const int K, const SCALAR alpha,    const TYPE *A, const int lda, const TYPE *B, const int ldb,    const SCALAR beta, TYPE *C, const int ldc);static GEMMPTR big_gemm, small_gemm;void matgen(int M0, int N, TYPE *A, int lda0, int seed){   int i, j;   const int M = M0 SHIFT, lda = lda0 SHIFT;   dumb_seed(seed);   for (j=N; j; j--)   {      for (i=0; i != M; i++) A[i] = dumb_rand();      A += lda;   }}int mmcase(char TA, char TB, int M, int N, int K, SCALAR alpha, SCALAR beta,           double *mf0, double *mf1){#ifdef TREAL   char *form="%4d   %c   %c %4d %4d %4d  %5.1f  %5.1f  %6.2f %6.1f  %4.2f\n";   #define MALPH alpha   #define MBETA beta#else   #define MALPH *alpha, alpha[1]   #define MBETA *beta, beta[1]   char *form="%4d   %c   %c %4d %4d %4d  %5.1f %5.1f  %5.1f %5.1f  %6.2f %6.1f %4.2f\n";#endif   int lda, ldb, ldc, n, reps, i, j, incA, incB, incC;   int la, lb, lc;   double t0, t1, t2, t3, mflop;   TYPE *A, *B, *C, *a, *b, *c, *ast, *bst, *cst;   TYPE maxval, f1, ferr;   static TYPE feps=0.0;   static int itst=1;   enum ATLAS_TRANS TAc, TBc;   extern int ATL_bigmmOutOfMem;   if (TA == 'n' || TA == 'N')   {      lda = Mmax(M,100);      incA = la = lda * K;      TAc = AtlasNoTrans;   }   else   {      lda = Mmax(K,100);      incA = la = lda * M;      TAc = AtlasTrans;   }   if (TB == 'n' || TB == 'N')   {      ldb = Mmax(K,100);      incB = lb = ldb * N;      TBc = AtlasNoTrans;   }   else   {      ldb = Mmax(N,100);      incB = lb = ldb * K;      TBc = AtlasTrans;   }   ldc = Mmax(M,100);   j = ATL_DivBySize(L2SIZE);   i = Mmax(ldc*N, j);   i += Mmax(la, j);   i += Mmax(lb, j);   if (i*ATL_sizeof > MAXALLOC) return(1);   i = la;   j = (ATL_DivBySize(L2SIZE) / i);   if (j < 1) j = 1;   j *= i;   la = j;   a = A = malloc(j*ATL_sizeof);   if (a == NULL) return(1);   ast = A + (j SHIFT);   matgen(j, 1, A, j, K*1112);   i = lb;   j = (ATL_DivBySize(L2SIZE) / i);   if (j < 1) j = 1;   j *= i;   lb = j;   b = B = malloc(j*ATL_sizeof);   if (b == NULL)   {      free(A);      return(2);   }   matgen(j, 1, B, j, N*1287);   bst = B + (j SHIFT);   incC = i = ldc * N;   lc = j = ((ATL_DivBySize(L2SIZE) + i-1) / i)*i;   c = C = malloc(j*ATL_sizeof);   if (c == NULL)   {      free(A);      free(B);      return(3);   }   matgen(j, 1, C, j, M*N);   cst = C + (j SHIFT);   #ifdef TCPLX      incA *= 2; incB *= 2; incC *= 2;   #endif/* * preload instructions from disk */   small_gemm(TAc, TBc, 80, 80, 80, alpha, a, 80, b, 80, beta, c, 80);   big_gemm(TAc, TBc, 80, 80, 80, alpha, a, 80, b, 80, beta, c, 80);   matgen(la, 1, A, la, K*1112);   matgen(lb, 1, B, lb, N*1287);   matgen(lc, 1, C, lc, M*N);/* * Insist we have at least as many flops as doing a matmul of order DENMAT */   t1 = (((double) M) * N) * K;   #ifdef ATL_nkflop      t0 = 1000.0 * ATL_nkflop;      t1 *= 2.0;   #else      t0 = DENMAT;      t0 *= t0*t0;   #endif   t0 /= t1;   reps = t0 + 0.9;   if (!reps) reps = 1;   i = reps;   t0 = time00();   do   {      small_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, beta, c, ldc);      a += incA;      b += incB;      c += incC;      if (a == ast) a = A;      if (b == bst) b = B;      if (c == cst) c = C;   }   while (--i);   t1 = time00() - t0;   if (t1 <= 0.0) mflop = t1 = 0.0;   else mflop = ( ((2.0*M)*N)*K*reps ) / (t1*1000000.0);   #ifdef TCPLX      mflop *= 4.0;   #endif   *mf0 = mflop;   printf(form, itst, TA, TB, M, N, K, MALPH, MBETA, t1, mflop, 1.0);   a = A;  b = B;  c = C;   matgen(la, 1, A, la, K*1112);   matgen(lb, 1, B, lb, N*1287);   matgen(lc, 1, C, lc, M*N);   i = reps;   t0 = time00();   do   {      big_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, beta, c, ldc);      if (ATL_bigmmOutOfMem)      {         free(A);         free(B);         free(C);         ATL_bigmmOutOfMem = 0;         return(4);      }      a += incA;      b += incB;      c += incC;      if (a == ast) a = A;      if (b == bst) b = B;      if (c == cst) c = C;   }   while (--i);   t2 = time00() - t0;   if (t2 <= 0.0) t2 = mflop = 0.0;   else mflop = ( ((2.0*M)*N)*K*reps ) / (t2*1000000.0);   #ifdef TCPLX      mflop *= 4.0;   #endif   *mf1 = mflop;   if (t1 == t2) t3 = 1.0;   else if (t2 != 0.0) t3 = t1/t2;   else t3 = 0.0;   printf(form, itst++, TA, TB, M, N, K, MALPH, MBETA, t2, mflop, t3);   free(A);   free(B);   free(C);   return(0);}void GetDims(enum ATLAS_MATSHAPE shape, int n, int nb, int *M, int *N, int *K){   *M = *N = *K = n;   switch(shape)   {   case AtlasM_NB:      *M = nb;      break;   case AtlasN_NB:      *N = nb;      break;   case AtlasMN_NB:      *M = *N = nb;      break;   case AtlasK_NB:      *K = nb;      break;   case AtlasMN_REST:  /* restricted M & N, but basically square */      if (n > 6*nb)         *M = *N = 6*nb;      break;   case Atlas0_NB:     break;   default:      fprintf(stderr, "SHAPE=%d\n", shape);      exit(-1);   }}void SortPoints(int N, int *Ns, double *mfs0, double *mfs1)/* * simple selection sort for data points, sorting on Ns */{   int i, j, itmp;   double tmp;   for (i=0; i != N; i++)   {      for (j=i+1; j < N; j++)      {         if (Ns[j] < Ns[i])         {            itmp = Ns[i];            Ns[i] = Ns[j];            Ns[j] = itmp;            tmp = mfs0[i];            mfs0[i] = mfs0[j];            mfs0[j] = tmp;            tmp = mfs1[i];            mfs1[i] = mfs1[j];            mfs1[j] = tmp;         }      }   }}unsigned long PredictNcross(enum ATLAS_MATSHAPE shape, int nb, int N, int *Ns,                            double *mfs0, double *mfs1){   double slope0, slope1, tmp;   int m, n, k;   unsigned long NN;   SortPoints(N, Ns, mfs0, mfs1);   slope0 = mfs0[N-1] - mfs0[0] / (double) (Ns[N-1] - Ns[0]);   slope1 = mfs1[N-1] - mfs1[0] / (double) (Ns[N-1] - Ns[0]);   if (slope1 < slope0) return(LONG_MAX);   tmp = (mfs1[N-1] - mfs0[N-1]) / (slope0 - slope1);   NN = Ns[N-1] + tmp;   GetDims(shape, NN, nb, &m, &n, &k);   return(m*n*k);}unsigned long tloop(enum ATLAS_MATSHAPE shape, char TA, char TB,                    int nb, SCALAR alpha, SCALAR beta){   int smallN = 10, bigN = 750, stepN=10, MinDist=5;   int i, n, n0, nn, M, N, K, Ncross=0;   long MNK=0;   double mfs0[64], mfs1[64];   int Ns[64];   double mf0, mf1, tmp;#ifdef TREAL   printf("\nTEST  TA  TB    M    N    K  alpha   beta    Time  Mflop  SpUp\n");   printf("====  ==  ==  ===  ===  ===  =====  =====  ======  =====  ==== \n\n");#else   printf("\nTEST  TA  TB    M    N    K        alpha         beta    Time  Mflop  SpUp\n");   printf("====  ==  ==  ===  ===  ===  ===== =====  ===== =====  ======  =====  ====\n\n");#endif   GetDims(shape, smallN, nb, &M, &N, &K);   assert(mmcase(TA, TB, M, N, K, alpha, beta, &mf0, &mf1) == 0);   Ns[0] = smallN; mfs0[0] = mf0; mfs1[0] = mf1;   if (mf0 < mf1) return(M*N*K);/* * Find crossover point */   n = bigN;   i = 1;   do   {      GetDims(shape, n, nb, &M, &N, &K);      if (mmcase(TA, TB, M, N, K, alpha, beta, &mf0, &mf1))      {         n = n - stepN;         bigN -= stepN;         if (bigN < stepN) break;      }      else      {         Ns[i] = n; mfs0[i] = mf0; mfs1[i] = mf1;         i++;         if (mf0 < mf1) Ncross = n;         else n += bigN;      }   }   while (!Ncross);   if (!Ncross) return(PredictNcross(shape, nb, i, Ns, mfs0, mfs1));/* * Refine Xover point using recursive halving */   nn = Ns[i-1];   n0 = Ns[i-2];   n = n0 + (nn - n0) / 2;   while (n - n0 > MinDist)   {      GetDims(shape, n, nb, &M, &N, &K);      assert(mmcase(TA, TB, M, N, K, alpha, beta, &mf0, &mf1) == 0);      if (mf0 < mf1) nn = n;      else n0 = n;      n = n0 + (nn - n0) / 2;   }   GetDims(shape, n, nb, &M, &N, &K);   return(shape == AtlasMN_REST ? K : M*N*K);}void DoShapes(FILE *fpout, char TA, char TB, int nb, int N0, int NN, int incN,              SCALAR alpha, SCALAR beta){   enum ATLAS_MATSHAPE shape;   unsigned long n, nn;   int M, N, K;   static char *nm[nshape] = {"M", "N", "MN", "K", "GE"};   big_gemm    = Mjoin(PATL,big_mm);   small_gemm  = Mjoin(PATL,small_mm);   for (shape=AtlasM_NB; shape <= Atlas0_NB; shape++)   {      n = tloop(shape, TA, TB, nb, alpha, beta);      fprintf(fpout, "#define %c%c_MNK_%s %ld\n", TA, TB, nm[shape], n);   }}main(int nargs, char *args[]){   char TA='n', TB='n';   int nb, N0, NN, incN, jstop=0;   long icross;   #ifdef TREAL      TYPE alpha=1.0, beta=1.0;   #else      TYPE alpha[2] = {-1.0, 0.0}, beta[2] = {1.0, 0.0};   #endif   FILE *fp;   big_gemm    = Mjoin(PATL,big_mm);   small_gemm  = Mjoin(PATL,small_mm);#ifndef SM_FOUT   if (nargs < 5)   {      fprintf(stderr, "usage: %s <NB> <N0> <NN> <incN> [TA TB alpha beta]\n", args[0]);      exit(-1);   }   nb = atoi(args[1]);   N0 = atoi(args[2]);   NN = atoi(args[3]);   incN = atoi(args[4]);   if (nargs > 5) TA = *args[5];   if (nargs > 6) TA = *args[6];   if (nargs > 7) alpha = atof(args[7]);   if (nargs > 8) beta  = atof(args[8]);   tloop(jstop, TA, TB, nb, N0, NN, incN, alpha, beta);#else   fprintf(stderr, "\n\nFinding crossover point for small case algorithms:\n");   fp = fopen(Mstr(SM_FOUT), "w");   nb = NB;   N0 = 10;   NN = 100;   incN = 5;   fprintf(fp, "#ifndef %sXOVER_H\n#define %sXOVER_H\n\n",           Mstr(PREU), Mstr(PREU));   fprintf(fp, "#define ATL_3NB %d\n", 3*NB);   DoShapes(fp, 'N', 'N', nb, N0, NN, incN, alpha, beta);   DoShapes(fp, 'N', 'T', nb, N0, NN, incN, alpha, beta);   DoShapes(fp, 'T', 'N', nb, N0, NN, incN, alpha, beta);   DoShapes(fp, 'T', 'T', nb, N0, NN, incN, alpha, beta);#ifdef TCPLX   small_gemm  = Mjoin(PATL,big_mm);   big_gemm  = Mjoin(PATL,Mjoin(UPR,big_mm));   icross = tloop(AtlasMN_REST, 'N', 'T', nb, alpha, beta);   fprintf(fp, "#define C2R_K %ld\n", icross);#endif   fprintf(fp, "\n#endif\n");   fclose(fp);#endif   exit(0);}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?