📄 4.txt
字号:
chihoy:3:kernels.c [chihoy, perflab] [Fri Oct 12 19:21:08 2007]
1 /*
2 * Kernels to be optimized for the CS:APP Performance Lab
3 */
4
5 /*
6 Name (Andrew ID)
7 Chi Ho Yoon (chihoy)
8
9 My Solution code is designed to compute the optimized result(CPE) for sum(sum(C[i][j]*x^i*y^j)).
10 This code, firstly, compute the sum(C[i][j]*y^j) where i is fixed and j ranges from 0 to N-1.
11 In here, the code applies Horner's equation to optimize the result.
12 After then, the result is multiplied with the corresponding x^i.
13 Adding those results for i ranges from 0 to M-1 eventually computes sum(sum(C[i][j]*x^i*y^j)).
14 However, in order to maximize the computational speed, 8 Horner equations are computed simultaneously.
15 If the remaining Horner equations are less than 8, then 4 Horner equations are computed simultaneously.
16 If the remaining Horner equations are less than 4, then 2 Horner equations are computed simultaneously.
17 If the remaining Horner equations are less than 2, then each Horner equation is computed seperately.
18 This algorithm significantly reduces CPE, since it computes many equations at one time.
19 Its effect is much more important than that of the Horner equation itself.
20
21 */
22
23
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include "defs.h"
27
28 /* Data structures defined in driver.c */
29 extern val_t C[M][N];
30 extern val_t *x_p;
31 extern val_t *y_p;
32
33 /*******************************************************
34 * Part I: Define functions
35 *
36 * Your different versions of the poly2d kernel go here.
37 *******************************************************/
38
39 /*
40 * poly2d_baseline - The simple baseline version of poly2d
41 */
42 char poly2d_baseline_descr[] = "poly2d_baseline: Baseline implementation";
43 val_t poly2d_baseline(val_t C[M][N]) {
44
45 int i, j;
46 val_t sol = 0.0;
47 val_t xpwr = 1.0;
48 val_t ypwr = 1.0;
49 val_t x = *x_p;
50 val_t y = *y_p;
51
52 for (i = 0; i < M; i++) {
53 ypwr = 1.0;
54 for (j = 0; j < N; j++) {
55 sol += C[i][j] * (xpwr * ypwr);
56 ypwr *= y;
57 }
58 xpwr *= x;
59 }
60 return sol;
61 }
62
63 char poly2d_precompy_descr[] = "poly2d_precomp: Precompute y powers";
64 val_t poly2d_precompy(val_t C[M][N]) {
65
66 int i, j;
67 val_t sol = 0.0;
68 val_t xpwr = 1.0;
69 val_t ypwr[N];
70 val_t x = *x_p;
71 val_t y = *y_p;
72
73 ypwr[0] = 1.0;
74 for (j = 1; j < N; j++) {
75 ypwr[j] = ypwr[j-1]*y;
76 }
77
78 for (i = 0; i < M; i++) {
79 for (j = 0; j < N; j++) {
80 sol += C[i][j] * (xpwr * ypwr[j]);
81 }
82 xpwr *= x;
83 }
84 return sol;
85
86 }
87
88 // This is my solution code for this lab.
89 char poly2d_mine_descr[] = "polyd2_mine: My solution";
90 val_t poly2d_mine(val_t C[M][N]) {
91
92 int i,j,j_max,i_max1,i_max2,i_max3;
93 val_t sol = 0.0;
94 val_t x = *x_p;
95 val_t y = *y_p;
96 val_t xpwr[M];
97 val_t temp0 = 0.0;
98 val_t temp1 = 0.0;
99 val_t temp2 = 0.0;
100 val_t temp3 = 0.0;
101 val_t temp4 = 0.0;
102 val_t temp5 = 0.0;
103 val_t temp6 = 0.0;
104 val_t temp7 = 0.0;
105
106 j_max = N-1;
107 i_max1 = M-7;
108 i_max2 = M-3;
109 i_max3 = M-1;
110
111 // First of all, we compute x-powers and store them into the array.
112 xpwr[0] = 1.0;
113 for (i = 1; i < M; i++) {
114 xpwr[i] = xpwr[i-1]*x;
115 }
116
117 // We compute the Horner sums for y-variables where i ranges from i to i+7 (total of 8 Horner sums) at the same time
118 // so that we can reduce the calculation time.
119 for (i = 0; i < i_max1; i=i+8) {
120 // Clear the previous data in 8 temporary data storages.
121 temp0 = 0;
122 temp1 = 0;
123 temp2 = 0;
124 temp3 = 0;
125 temp4 = 0;
126 temp5 = 0;
127 temp6 = 0;
128 temp7 = 0;
129
130 // Compute the Horner equations for y-variables from C[i][j] to C[i+7][j] (j from 0 to N-1) simultaneously
131 // (There are total of 8 Horner sums)
132 for (j = j_max; j >= 0; j--) {
133 temp0 *= y;
134 temp0 += C[i][j];
135 temp1 *= y;
136 temp1 += C[i+1][j];
137 temp2 *= y;
138 temp2 += C[i+2][j];
139 temp3 *= y;
140 temp3 += C[i+3][j];
141 temp4 *= y;
142 temp4 += C[i+4][j];
143 temp5 *= y;
144 temp5 += C[i+5][j];
145 temp6 *= y;
146 temp6 += C[i+6][j];
147 temp7 *= y;
148 temp7 += C[i+7][j];
149 }
150
151 // After then, we multiply the corresponding x-powers to the Horner sums.
152 temp0 *= xpwr[i];
153 temp1 *= xpwr[i+1];
154 temp2 *= xpwr[i+2];
155 temp3 *= xpwr[i+3];
156 temp4 *= xpwr[i+4];
157 temp5 *= xpwr[i+5];
158 temp6 *= xpwr[i+6];
159 temp7 *= xpwr[i+7];
160
161 // Add the results altogether and store them in solution.
162 sol += temp0 + temp1 + temp2 + temp3 + temp4 + temp5 + temp6 + temp7;
163
164 }
165
166
167 // When there are less than 8 Horner equations left for the computation,
168 // we compute 4 Horner equations (j from 0 to N-1) simultaneously instead of 8 equations above.
169 // Basic algorithm is identical to the one used above with 8 equations.
170 for (; i < i_max2; i=i+4) {
171 temp0 = 0;
172 temp1 = 0;
173 temp2 = 0;
174 temp3 = 0;
175
176 for (j = j_max; j >= 0; j--) {
177 temp0 *= y;
178 temp0 += C[i][j];
179 temp1 *= y;
180 temp1 += C[i+1][j];
181 temp2 *= y;
182 temp2 += C[i+2][j];
183 temp3 *= y;
184 temp3 += C[i+3][j];
185 }
186
187 temp0 *= xpwr[i];
188 temp1 *= xpwr[i+1];
189 temp2 *= xpwr[i+2];
190 temp3 *= xpwr[i+3];
191
192 sol += temp0 + temp1 + temp2 + temp3;
193 }
194
195
196 // When there are less than 4 Horner equations left for the computation,
197 // we compute 2 Horner sums (j from 0 to N-1) simultaneously instead of 8 or 4 equations above.
198 // Basic algorithm is identical to the ones used above.
199 for(; i < i_max3; i=i+2) {
200 temp0 = 0;
201 temp1 = 0;
202
203 for (j = j_max; j >= 0; j--) {
204 temp0 *= y;
205 temp0 += C[i][j];
206 temp1 *= y;
207 temp1 += C[i+1][j];
208 }
209
210 temp0 *= xpwr[i];
211 temp1 *= xpwr[i+1];
212
213 sol += temp0 + temp1;
214 }
215
216
217 // Compute one Horner equation at a time.
218 for(; i < M; i++) {
219 temp0 = 0;
220
221 for (j = j_max; j >= 0; j--) {
222 temp0 *= y;
223 temp0 += C[i][j];
224 }
225
226 temp0 *= xpwr[i];
227
228 sol += temp0;
229 }
230
231 return sol;
232 }
233
234 /******************************************************************
235 * Part II: Register the functions defined in Part I
236 *
237 * Here is where you register all of your different versions of the
238 * poly2d kernel with the driver by calling add_function() for each
239 * test function. When you run the driver program, it will test and
240 * report the performance of each registered test function.
241 ******************************************************************/
242
243 void register_functions()
244 {
245 add_function(&poly2d_baseline, poly2d_baseline_descr);
246 add_function(&poly2d_precompy, poly2d_precompy_descr);
247 add_function(&poly2d_mine, poly2d_mine_descr);
248
249 /* register additional versions here */
250
251 }
252
253
254
255
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -