⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 4.txt

📁 performance lab in C
💻 TXT
字号:
chihoy:3:kernels.c [chihoy, perflab] [Fri Oct 12 19:21:08 2007]
   1  /*
   2   * Kernels to be optimized for the CS:APP Performance Lab
   3   */
   4  
   5  /* 
   6     Name (Andrew ID)
   7     Chi Ho Yoon (chihoy)
   8  
   9     My Solution code is designed to compute the optimized result(CPE) for sum(sum(C[i][j]*x^i*y^j)).
  10     This code, firstly, compute the sum(C[i][j]*y^j) where i is fixed and j ranges from 0 to N-1.
  11     In here, the code applies Horner's equation to optimize the result.
  12     After then, the result is multiplied with the corresponding x^i.
  13     Adding those results for i ranges from 0 to M-1 eventually computes sum(sum(C[i][j]*x^i*y^j)).
  14     However, in order to maximize the computational speed, 8 Horner equations are computed simultaneously.
  15     If the remaining Horner equations are less than 8, then 4 Horner equations are computed simultaneously.
  16     If the remaining Horner equations are less than 4, then 2 Horner equations are computed simultaneously.
  17     If the remaining Horner equations are less than 2, then each Horner equation is computed seperately.
  18     This algorithm significantly reduces CPE, since it computes many equations at one time.
  19     Its effect is much more important than that of the Horner equation itself.
  20  
  21  */
  22  
  23  
  24  #include <stdio.h>
  25  #include <stdlib.h>
  26  #include "defs.h"
  27  
  28  /* Data structures defined in driver.c */
  29  extern val_t C[M][N];
  30  extern val_t *x_p;
  31  extern val_t *y_p;
  32  
  33  /*******************************************************
  34   * Part I: Define functions
  35   *
  36   * Your different versions of the poly2d kernel go here.
  37   *******************************************************/
  38  
  39  /* 
  40   * poly2d_baseline - The simple baseline version of poly2d
  41   */
  42  char poly2d_baseline_descr[] = "poly2d_baseline: Baseline implementation";
  43  val_t poly2d_baseline(val_t C[M][N]) {
  44  
  45      int i, j;
  46      val_t sol = 0.0;
  47      val_t xpwr = 1.0;
  48      val_t ypwr = 1.0;
  49      val_t x = *x_p;
  50      val_t y = *y_p;
  51  
  52      for (i = 0; i < M; i++) {
  53          ypwr = 1.0;
  54          for (j = 0; j < N; j++) {
  55              sol += C[i][j] * (xpwr * ypwr);
  56              ypwr *= y;
  57          }
  58          xpwr *= x;
  59      }
  60      return sol;
  61  }
  62  
  63  char poly2d_precompy_descr[] = "poly2d_precomp: Precompute y powers";
  64  val_t poly2d_precompy(val_t C[M][N]) {
  65    
  66      int i, j;
  67      val_t sol = 0.0;
  68      val_t xpwr = 1.0;
  69      val_t ypwr[N];
  70      val_t x = *x_p;
  71      val_t y = *y_p;
  72  
  73      ypwr[0] = 1.0;
  74      for (j = 1; j < N; j++) {
  75          ypwr[j] = ypwr[j-1]*y;
  76      }
  77  
  78      for (i = 0; i < M; i++) {
  79          for (j = 0; j < N; j++) {
  80              sol += C[i][j] * (xpwr * ypwr[j]);
  81          }
  82          xpwr *= x;
  83      }
  84      return sol;  
  85  
  86  }
  87  
  88  // This is my solution code for this lab.
  89  char poly2d_mine_descr[] = "polyd2_mine: My solution";
  90  val_t poly2d_mine(val_t C[M][N]) {
  91  
  92    int i,j,j_max,i_max1,i_max2,i_max3;
  93    val_t sol = 0.0;
  94    val_t x = *x_p;
  95    val_t y = *y_p;
  96    val_t xpwr[M];
  97    val_t temp0 = 0.0;
  98    val_t temp1 = 0.0;
  99    val_t temp2 = 0.0;
 100    val_t temp3 = 0.0;
 101    val_t temp4 = 0.0;
 102    val_t temp5 = 0.0;
 103    val_t temp6 = 0.0;
 104    val_t temp7 = 0.0;
 105    
 106    j_max = N-1;
 107    i_max1 = M-7;
 108    i_max2 = M-3;
 109    i_max3 = M-1;
 110    
 111    // First of all, we compute x-powers and store them into the array.
 112    xpwr[0] = 1.0;
 113    for (i = 1; i < M; i++) {
 114      xpwr[i] = xpwr[i-1]*x;
 115    }
 116  
 117    // We compute the Horner sums for y-variables where i ranges from  i to i+7 (total of 8 Horner sums) at the same time 
 118    // so that we can reduce the calculation time.
 119    for (i = 0; i < i_max1; i=i+8) {
 120      // Clear the previous data in 8 temporary data storages.
 121      temp0 = 0;
 122      temp1 = 0;
 123      temp2 = 0;
 124      temp3 = 0;
 125      temp4 = 0;
 126      temp5 = 0;
 127      temp6 = 0;
 128      temp7 = 0;
 129  
 130      // Compute the Horner equations for y-variables from C[i][j] to C[i+7][j] (j from 0 to N-1) simultaneously 
 131      // (There are total of 8 Horner sums)
 132      for (j = j_max; j >= 0; j--) {
 133        temp0 *= y;
 134        temp0 += C[i][j];
 135        temp1 *= y;
 136        temp1 += C[i+1][j];
 137        temp2 *= y;
 138        temp2 += C[i+2][j];
 139        temp3 *= y;
 140        temp3 += C[i+3][j];
 141        temp4 *= y;
 142        temp4 += C[i+4][j];
 143        temp5 *= y;
 144        temp5 += C[i+5][j];
 145        temp6 *= y;
 146        temp6 += C[i+6][j];
 147        temp7 *= y;
 148        temp7 += C[i+7][j];
 149      }
 150      
 151      // After then, we multiply the corresponding x-powers to the Horner sums.
 152      temp0 *= xpwr[i];
 153      temp1 *= xpwr[i+1];
 154      temp2 *= xpwr[i+2];
 155      temp3 *= xpwr[i+3];
 156      temp4 *= xpwr[i+4];
 157      temp5 *= xpwr[i+5];
 158      temp6 *= xpwr[i+6];
 159      temp7 *= xpwr[i+7];
 160  
 161      // Add the results altogether and store them in solution.
 162      sol += temp0 + temp1 + temp2 + temp3 + temp4 + temp5 + temp6 + temp7;
 163  
 164    }
 165  
 166  
 167    // When there are less than 8 Horner equations left for the computation,
 168    // we compute 4 Horner equations (j from 0 to N-1) simultaneously instead of 8 equations above.
 169    // Basic algorithm is identical to the one used above with 8 equations.
 170    for (; i < i_max2; i=i+4) {
 171      temp0 = 0;
 172      temp1 = 0;
 173      temp2 = 0;
 174      temp3 = 0;
 175  
 176      for (j = j_max; j >= 0; j--) {
 177        temp0 *= y;
 178        temp0 += C[i][j];
 179        temp1 *= y;
 180        temp1 += C[i+1][j];
 181        temp2 *= y;
 182        temp2 += C[i+2][j];
 183        temp3 *= y;
 184        temp3 += C[i+3][j];
 185      }
 186  
 187      temp0 *= xpwr[i];
 188      temp1 *= xpwr[i+1];
 189      temp2 *= xpwr[i+2];
 190      temp3 *= xpwr[i+3];
 191  
 192      sol += temp0 + temp1 + temp2 + temp3;
 193    }
 194  
 195  
 196    // When there are less than 4 Horner equations left for the computation,
 197    // we compute 2 Horner sums (j from 0 to N-1) simultaneously instead of 8 or 4 equations above.
 198    // Basic algorithm is identical to the ones used above.
 199    for(; i < i_max3; i=i+2) {
 200      temp0 = 0;
 201      temp1 = 0;
 202  
 203      for (j = j_max; j >= 0; j--) {
 204        temp0 *= y;
 205        temp0 += C[i][j];
 206        temp1 *= y;
 207        temp1 += C[i+1][j];
 208      }
 209  
 210      temp0 *= xpwr[i];
 211      temp1 *= xpwr[i+1];
 212  
 213      sol += temp0 + temp1;
 214    }
 215  
 216  
 217    // Compute one Horner equation at a time.
 218    for(; i < M; i++) {
 219      temp0 = 0;
 220  
 221      for (j = j_max; j >= 0; j--) {
 222        temp0 *= y;
 223        temp0 += C[i][j];
 224      }
 225  
 226      temp0 *= xpwr[i];
 227  
 228      sol += temp0;
 229    }
 230    
 231    return sol;
 232  }
 233  
 234  /****************************************************************** 
 235   * Part II: Register the functions defined in Part I
 236   *
 237   * Here is where you register all of your different versions of the
 238   * poly2d kernel with the driver by calling add_function() for each
 239   * test function. When you run the driver program, it will test and
 240   * report the performance of each registered test function.
 241   ******************************************************************/
 242  
 243  void register_functions() 
 244  {
 245      add_function(&poly2d_baseline, poly2d_baseline_descr);
 246      add_function(&poly2d_precompy, poly2d_precompy_descr);
 247      add_function(&poly2d_mine, poly2d_mine_descr);
 248  
 249      /* register additional versions here */
 250  
 251  }
 252  
 253  
 254  
 255  

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -