matrix.c

C++编写的高性能矩阵乘法的Stranssen算法
字号:
#include "matrix.h"#if STRAS_TIME_PARTS || PRISM_MEM_CHK#include "matrix_test.h"#endifdouble one=1.0,m_one=-1.0;   /* Needed as FORTRAN parameter */void s_matrix_alloc(int rows, int cols, double **a, int *lda, char file[], int lineno)     /* Allocates a matrix A of size rows x cols and returns the leading dimension       *      * NOTE:  Call this routine by invoking matrix_alloc() where      * the CPP definitions in matrix.h do the name shift and       * add the last two arguments automatically.      *      */{    if (rows == 0) {    *a = NULL;    return;  }  else {    #if STRAS_TIME_PARTS    CLOCK_START(gt0);#endif        /* Allocate memory for array */    *a = (double *) MALLOC((unsigned) (rows*cols)*sizeof(double));    if (*a == NULL)       s_generror("allocation failure in s_matrix_alloc()", file, lineno);    #if STRAS_TIME_PARTS    CLOCK_UPDATE(time_matrix,gt0,d_correct1,i_sum_clock);#endif        *lda = rows; }   }void submatrix(double *a, int lda, int rl, int cl, double **b)     /* Returns a pointer to submatrix B which points to A[c1][r1]. */{#if STRAS_TIME_PARTS  CLOCK_START(gt0);#endif  /* Set pointer to starting location of submatrix */  *b = a+(cl-1)*lda + rl - 1;#if STRAS_TIME_PARTS  CLOCK_UPDATE(time_submatrix,gt0,d_correct1,i_sum_clock);#endif}void free_matrix(double *a)     /* Frees a double matrix of size mem_size allocated by matrix_alloc().         Parameter mem_size used for profiling only. */     {#if STRAS_TIME_PARTS  CLOCK_START(gt0);#endif  free(a);#if STRAS_TIME_PARTS  CLOCK_UPDATE(time_free_matrix,gt0,d_correct1,i_sum_clock);#endif}void matrix_add(int m, int n, double alpha, double *a, int lda, double *b, 		int ldb, double *c, int ldc)     /*  Matrix Add.  A, B, & C are m x n matrices.      *      *  C = alpha * (A + B)      *      *  NOTE: We pass lda, ldb, & ldc so that when we call Fortran routine we have       *        these values.  Not needed by others but passed for consistency.      *      *  NOTE: Cases that are optimized are used by our fmm routines.      */{  int i,j,inc=1;  double *p_a,*p_b,*p_c;  #if STRAS_TIME_PARTS  CLOCK_START(gt0);#endif#if STRAS_BLAS_ADD  if (b != c) {                   /* If aliasing was not used */    if (alpha == 1.0) {           /* Compute C = A + B        */#pragma _CRI ivdep       for (j = 0; j < n; j++) {	dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);	daxpy_(&m,&one,b+j*ldb,&inc,c+j*ldc,&inc);      }    }    else {                        /* Compute C = alpha * (A + B) */#pragma _CRI ivdep       for (j = 0; j < n; j++) { 	dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);	dscal_(&m,&alpha,c+j*ldc,&inc);	daxpy_(&m,&alpha,b+j*ldb,&inc,c+j*ldc,&inc);      }    }  }  else {                          /* If aliasing was used */    if (alpha == 1.0) {           /* Compute C = A + C    */#pragma _CRI ivdep       for (j = 0; j < n; j++) {	daxpy_(&m,&one,a+j*lda,&inc,c+j*ldc,&inc);      }    }    else {                        /* Compute C = alpha * (A + C) */#pragma _CRI ivdep       for (j = 0; j < n; j++) { 	daxpy_(&m,&one,a+j*lda,&inc,c+j*ldc,&inc);	dscal_(&m,&alpha,c+j*ldc,&inc);      }    }  }  #elif STRAS_FORTRAN_ADD  matrix_fadd_(&m,&n,&alpha,a,&lda,b,&ldb,c,&ldc);#elif STRAS_ESSL_ADD  if (alpha == 1.0)               /* Compute C = A + B */    dgeadd_(a,&lda,"N",b,&ldb,"N",c,&ldc,&m,&n);   else                            /* Compute C = alpha * (A + B) + C */    matrix_fadd_(&m,&n,&alpha,a,&lda,b,&ldb,c,&ldc);  #else/* Use pointer addition by default */  for (j = 0; j < n; j++) {    p_a = a+j*lda;    p_b = b+j*ldb;    p_c = c+j*ldc;#pragma _CRI ivdep     for (i = 0; i < m; i++) {      *p_c++ = alpha * (*p_a++ + *p_b++);    }  }#endif  #if STRAS_TIME_PARTS  CLOCK_UPDATE(time_matrix_add,gt0,d_correct1,i_sum_clock);#endif}void matrix_sub(int m, int n, double alpha, double *a, int lda, double *b, 		int ldb, double *c, int ldc)     /*  Matrix subtraction.  A, B & C are m x n matrices.      *      *  C = alpha * (A - B)      */{  int i,j,inc=1;  double *p_a,*p_b,*p_c,m_alpha;#if STRAS_TIME_PARTS  CLOCK_START(gt0);#endif#if STRAS_BLAS_SUB  if (b != c) {                   /* If aliasing was not used */    if (alpha == 1.0)             /* Compute C = A - B        */#pragma _CRI ivdep       for (j = 0; j < n; j++) {	dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);	daxpy_(&m,&m_one,b+j*ldb,&inc,c+j*ldc,&inc);      }    else {                        /* Compute C = alpha * (A - B) */      m_alpha = -alpha;#pragma _CRI ivdep       for (j = 0; j < n; j++) { 	dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);	dscal_(&m,&alpha,c+j*ldc,&inc);	daxpy_(&m,&m_alpha,b+j*ldb,&inc,c+j*ldc,&inc);      }    }  }  else {                          /* If aliasing was used */    if (alpha == 1.0)             /* Compute C = A - C    */#pragma _CRI ivdep       for (j = 0; j < n; j++) {	dscal_(&m,&m_one,c+j*ldc,&inc);	daxpy_(&m,&one,a+j*lda,&inc,c+j*ldc,&inc);      }    else {                        /* Compute C = alpha * (A - C) */      m_alpha = -alpha;#pragma _CRI ivdep       for (j = 0; j < n; j++) { 	dscal_(&m,&m_alpha,c+j*ldc,&inc);	daxpy_(&m,&alpha,a+j*lda,&inc,c+j*ldc,&inc);      }    }  }    #elif STRAS_FORTRAN_SUB  matrix_fsub_(&m,&n,&alpha,a,&lda,b,&ldb,c,&ldc);#elif STRAS_ESSL_SUB  if (alpha == 1.0)               /* Compute C = A - B */    dgesub_(a,&lda,"N",b,&ldb,"N",c,&ldc,&m,&n);   else                            /* Compute C = alpha * (A - B) */    matrix_fsub_(&m,&n,&alpha,a,&lda,b,&ldb,c,&ldc);#else/* Use pointer subtraction by default */  for (j = 0; j < n; j++) {    p_a = a+j*lda;    p_b = b+j*ldb;    p_c = c+j*ldc;#pragma _CRI ivdep     for (i = 0; i < m; i++) {      *p_c++ = alpha * (*p_a++ - *p_b++);    }  }#endif#if STRAS_TIME_PARTS  CLOCK_UPDATE(time_matrix_sub,gt0,d_correct1,i_sum_clock);#endif}void matrix_lin_update(int m, int n, double alpha, double *a, int lda, double beta, 		       double *c, int ldc)     /*  Matrix Linear Combination.  A & C are m x n matrices.      *      *  C = alpha * A + beta * C      *      *  NOTE: Cases that are optimized are used by our fmm routines.      */{  int i,j,inc=1;  double *p_a,*p_c;  #if STRAS_TIME_PARTS  CLOCK_START(gt0);#endif#if STRAS_BLAS_ADD  if (beta == 0.0) {    if (alpha == 1.0) {#pragma _CRI ivdep       for (j = 0; j < n; j++) {	dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);      }    }    else {#pragma _CRI ivdep       for (j = 0; j < n; j++) {        dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);        dscal_(&m,&alpha,c+j*ldc,&inc);      }    }  }  else if (beta == 1.0)#pragma _CRI ivdep     for (j = 0; j < n; j++) {      daxpy_(&m,&alpha,a+j*lda,&inc,c+j*ldc,&inc);    }  else#pragma _CRI ivdep     for (j = 0; j < n; j++) {      dscal_(&m,&beta,c+j*ldc,&inc);      daxpy_(&m,&alpha,a+j*lda,&inc,c+j*ldc,&inc);    }#elif STRAS_FORTRAN_ADD || STRAS_ESSL_ADD  if (alpha == 1.0) {    if (beta == 0.0) {#pragma _CRI ivdep       for (j = 0; j < n; j++) {	dcopy_(&m,a+j*lda,&inc,c+j*ldc,&inc);      }    }    else if (beta == -1.0) {      matrix_fsub_(&m,&n,&one,a,&lda,c,&ldc,c,&ldc);    }    else {      matrix_flinear_(&m,&n,&one,a,&lda,&beta,c,&ldc);    }  }  else if (beta == 1.0) {    if (alpha == -1.0) {      matrix_fsub_(&m,&n,&one,c,&ldc,a,&lda,c,&ldc);    }    else {#pragma _CRI ivdep       for (j = 0; j < n; j++) {	daxpy_(&m,&alpha,a+j*lda,&inc,c+j*ldc,&inc);      }    }  }  else if (beta == -1.0) {    matrix_flinear_(&m,&n,&alpha,a,&lda,&m_one,c,&ldc);  }  else {    matrix_flinear_(&m,&n,&alpha,a,&lda,&beta,c,&ldc);  }#else/* Use pointer update by default */  for (j = 0; j < n; j++) {    p_a = a+j*lda;    p_c = c+j*ldc;#pragma _CRI ivdep     for (i = 0; i < m; i++) {      /* Some compilers (Cray mostly) handle *pc++ on RHS differently and         give the wrong answer; split into 2 separate statements.*/       *p_c = beta * *p_c + alpha * *p_a++;      p_c++;    }  }#endif#if STRAS_TIME_PARTS  CLOCK_UPDATE(time_matrix_linear,gt0,d_correct1,i_sum_clock);#endif}void matrix_prod(char c_transa, char c_transb, int m, int n, int k, double alpha, 		 double *a, int lda, double *b, int ldb, double beta, double *c, int ldc)     /*        * Matrix product.  A is an m x k matrix and B is a k x n matrix.      *      * C = alpha * (A * B) + beta * C      */{#if STRAS_TIME_PARTS  CLOCK_START(gt0);#endif  dgemm_(&c_transa, &c_transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);#if STRAS_TIME_PARTS  CLOCK_UPDATE(time_matrix_prod,gt0,d_correct1,i_sum_clock);#endif}int tmp_fmm(int m, int n, int k, double alpha, double beta)     /* Compute amount of memory needed for temporary workspace by       * Strassen/Winograd.       */{  int i_tmp,j,i_min,m2,n2,k2,i_max;  double tmp1;    /* Compute j = number of levels of recursion performed */  j = 0;  m2 = m;  n2 = n;  k2 = k;  i_min = MIN(MIN(m,n),k);  i_max = MAX(MAX(m,n),k);  /* Since the routine no_recursion can be called with varying values of alpha &   * beta, this routine must calculate the max possible memeory needed, which   * requires the smallest values for the rectangular cutoffs.   *   * If rectangular cutoff values were not given, use the default values.   */  if (in_cutoff_m <= 0) {    cutoff_m = MIN(STRAS_CUTOFF_M, STRAS_CUTOFF_M1);    cutoff_n = MIN(STRAS_CUTOFF_N, STRAS_CUTOFF_N1);    cutoff_k = MIN(STRAS_CUTOFF_K, STRAS_CUTOFF_K1);  }  while (((((double)m2 * (double)n2 * (double)k2) > 	   cutoff_m*(double)n2*(double)k2+cutoff_n*(double)m2*(double)k2+	   cutoff_k*(double)n2*(double)m2) || (i_min > cutoff)) 	 && (i_max > cutoff))    {      m2 = m2 >> 1;      n2 = n2 >> 1;      k2 = k2 >> 1;      i_max = MAX(MAX(m2,n2),k2);      i_min = MIN(MIN(m2,n2),k2);      j++;    }    /* Compute memory needed based on number of levels of recursion */  tmp1 = (1./3.) * (1 - pow(.25, (double) j));  if (beta != 0.0)     i_tmp = m*k + k*n + m*n;  else    i_tmp = m*MAX(k,n) + k*n;  i_tmp = ceil((tmp1 * i_tmp));  return i_tmp;}int TMP_DGEFMM_MEM(int *m, int *n, int *k, double *alpha, double *beta)     /* Compute amount of memory needed for temporary workspace by       * Strassen/Winograd. This routine has a Fortran style interface.       */{  int i_tmp;  /* Call the C routine */  i_tmp = tmp_fmm(*m, *n, *k, *alpha, *beta);  return i_tmp;}int no_recursion(int m, int n, int k, double alpha, double beta)     /* Determine if recursion should continue based on matrix size(s) */{  int stop_recursion = 0,i_min,i_max;  double r_mnk;  i_min = MIN(MIN(m,n),k);  i_max = MAX(MAX(m,n),k);  /* If rectangular cutoff values were not given, use the default values. */  if (in_cutoff_m <= 0) {    if (alpha == 1.0 && beta == 0.0) {      cutoff_m = STRAS_CUTOFF_M1;      cutoff_n = STRAS_CUTOFF_N1;      cutoff_k = STRAS_CUTOFF_K1;    }    else {      cutoff_m = STRAS_CUTOFF_M;      cutoff_n = STRAS_CUTOFF_N;      cutoff_k = STRAS_CUTOFF_K;    }  }  r_mnk = (double) m * (double) n * (double) k;  if (((r_mnk <= cutoff_m*(double)n*(double)k+cutoff_n*(double)m*(double)k+	cutoff_k*(double)n*(double)m) && (i_min <= cutoff)) || (i_max <= cutoff))     stop_recursion = 1;    return stop_recursion;}void s_generror(char c_error_text[], char file[], int lineno)     /* Routine to print error message.      *      * NOTE: Call this routine by invoking generror() where      * the CPP definitions in matrix.h do the name shift       * and add the last two arguments automatically.      *      */{  char    buf[150]      ;  sprintf(buf, "\n*** Run-time error in file %s on line %d ***\n", file, lineno);  strcat(buf, c_error_text);#if PRISM_SP  printf("%s\n\n", buf);#else  fprintf(stderr, "%s***\n\n", buf);#endif  exit(99);}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -