📄 mm.c
字号:
/* matrix multiply tests -- C language, version 1.0, May 1993
*
* compile with -DN=<size>
*
* I usually run a script file
* time.script 500 >500.times
* where the script file contains
* cc -O -DN=$1 mm.c
* a.out -n (I suggest at least two runs per method to
* a.out -n alert you to variations. Five or ten runs
* a.out -t each, giving avg. and std dev. of times is
* a.out -t best.)
* ...
*
* Contact Mark Smotherman (mark@cs.clemson.edu) for questions, comments,
* and to report results showing wide variations. E.g., a wide variation
* appeared on an IBM RS/6000 Model 320 with "cc -O -DN=500 mm.c" (xlc
* compiler):
* 500x500 mm - normal algorithm utime 230.81 secs
* 500x500 mm - normal algorithm utime 230.72 secs
* 500x500 mm - temporary variable in loop utime 231.00 secs
* 500x500 mm - temporary variable in loop utime 230.79 secs
* 500x500 mm - unrolled inner loop, factor of 8 utime 232.09 secs
* 500x500 mm - unrolled inner loop, factor of 8 utime 231.84 secs
* 500x500 mm - pointers used to access matrices utime 230.74 secs
* 500x500 mm - pointers used to access matrices utime 230.45 secs
* 500x500 mm - blocking, factor of 32 utime 60.40 secs
* 500x500 mm - blocking, factor of 32 utime 60.57 secs
* 500x500 mm - interchanged inner loops utime 27.36 secs
* 500x500 mm - interchanged inner loops utime 27.40 secs
* 500x500 mm - 20x20 subarray (from D. Warner) utime 9.49 secs
* 500x500 mm - 20x20 subarray (from D. Warner) utime 9.50 secs
* 500x500 mm - 20x20 subarray (from T. Maeno) utime 9.10 secs
* 500x500 mm - 20x20 subarray (from T. Maeno) utime 9.05 secs
*
* The algorithms can also be sensitive to TLB thrashing. On a 600x600
* test an IBM RS/6000 Model 30 showed variations depending on relative
* location of the matrices. (The model 30 has 64 TLB entries organized
* as 2-way set associative.)
*
* 600x600 mm - 20x20 subarray (from T. Maeno) utime 19.12 secs
* 600x600 mm - 20x20 subarray (from T. Maeno) utime 19.23 secs
* 600x600 mm - 20x20 subarray (from D. Warner) utime 18.87 secs
* 600x600 mm - 20x20 subarray (from D. Warner) utime 18.64 secs
* 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.70 secs
* 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.76 secs
*
* Changing the declaration to include 10000 dummy entries between the
* b and c matrices (suggested by T. Maeno), i.e.,
*
* double a[N][N],b[N][N],dummy[10000],c[N][N],d[N][N],bt[N][N];
*
* 600x600 mm - 20x20 subarray (from T. Maeno) utime 16.41 secs
* 600x600 mm - 20x20 subarray (from T. Maeno) utime 16.40 secs
* 600x600 mm - 20x20 subarray (from D. Warner) utime 16.68 secs
* 600x600 mm - 20x20 subarray (from D. Warner) utime 16.67 secs
* 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.97 secs
* 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.98 secs
*
* I hope to add other algorithms (e.g., Strassen-Winograd) in the near
* future.
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include "report.h"
#include "timer.h"
/* #include <sys/time.h> */
/* #include <sys/resource.h> */
void msg();
void normal();
void tnsq();
void unroll4();
void unroll8();
void unroll16();
void pnsq();
void transpose();
void reg_loops();
void tiling();
void maeno();
void warner();
void robert();
void exit();
double a[N][N],b[N][N],c[N][N],bt[N][N],d[N][N];
void main(argc,argv)
int argc;
char *argv[];
{
char alg_choice;
int parm;
int i,j;
double sum,row_sum;
/* double t_start=0.0,t_end=0.0,dtime(); */
/* defaults */
alg_choice = 'x';
parm = 4;
i = 1;
while(argc>1){
switch(argv[i][0]){
case '-':
alg_choice = argv[i][1];
break;
default:
sscanf(argv[i],"%d",&parm);
}
argc--;
i++;
}
switch(alg_choice){
case 'n':
printf("%3dx%3d mm - normal algorithm ",N,N);
break;
case 'v':
printf("%3dx%3d mm - temporary variable in loop ",N,N);
break;
case 'p':
printf("%3dx%3d mm - pointers used to access matrices ",N,N);
break;
case 't':
printf("%3dx%3d mm - transposed b matrix ",N,N);
break;
case 'i':
printf("%3dx%3d mm - interchanged inner loops ",N,N);
break;
case 'u':
if((parm!=4)&&(parm!=8)&&(parm!=16)){
printf("%s: unrolling factor of %d not implemented\n",argv[0],parm);
printf(" usage: %s -<option> [<n>]\n",argv[0]);
printf(" option u - innermost loop unrolled by factor of n=4,8,16\n");
exit(1);
}
printf("%3dx%3d mm - unrolled inner loop, factor of %2d ",N,N,parm);
break;
case 'b':
if((parm<4)||(parm>N)){
printf("%s: blocking step size of %d is unreasonable\n",argv[0],parm);
exit(1);
}
printf("%3dx%3d mm - blocking, factor of %3d ",N,N,parm);
break;
case 'm':
if((N%parm)||(N%4)){
printf("%s: matrix size for Maeno method must be\n",argv[0]);
printf(" evenly divisible both by 4 and by block size %d\n",parm);
exit(1);
}
if(parm%4){
printf("%s: block size for Maeno method must be",argv[0]);
printf(" evenly divisible by 4\n");
exit(1);
}
printf("%3dx%3d mm - %3dx%3d subarray (from T. Maeno) ",N,N,
parm,parm);
break;
case 'w':
if ((N%parm)||(N%2))
{
printf("%s: matrix size for Warner method must be\n",argv[0]);
printf(" evenly divisible both by 2 and by block size %d\n",parm);
exit(1);
}
if (parm%2)
{
printf("%s: block size for Warner method must be",argv[0]);
printf(" evenly divisible by 2\n");
exit(1);
}
printf("%3dx%3d mm - %3dx%3d subarray (from D. Warner) ",N,N,parm,parm);
break;
case 'r':
printf("%3dx%3d mm - Robert's algorithm ",N,N);
break;
case 'x':
printf("%s: unspecified algorithm\n",argv[0]);
msg(argv[0]);
exit(1);
default:
printf("%s: unknown algorithm choice %c\n",argv[0],alg_choice);
msg(argv[0]);
exit(1);
}
/* set coefficients so that result matrix should have row entries
* equal to (1/2)*n*(n-1)*i in row i
*/
for (i=0;i<N;i++){
for (j=0;j<N;j++){
a[i][j] = b[i][j] = (double) i;
}
}
/* try to flush cache */
for(i=0;i<N;i++){
for (j=0;j<N;j++){
d[i][j] = 0.0;
}
}
switch(alg_choice){
case 'n':
TimerOn();
normal();
TimerOff();
Report( "mm(n)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'v':
TimerOn();
tnsq();
TimerOff();
Report( "mm(v)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'u':
switch(parm){
case 4:
TimerOn();
unroll4();
TimerOff();
Report( "mm(u4)", TimerElapsed() );
break;
case 8:
TimerOn();
unroll8();
TimerOff();
Report( "mm(u8)", TimerElapsed() );
break;
case 16:
TimerOn();
unroll16();
TimerOff();
Report( "mm(u16)", TimerElapsed() );
break;
default:
printf("program logic bug in call to unrolled versions\n");
exit(1);
}
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'p':
TimerOn();
pnsq();
TimerOff();
Report( "mm(p)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 't':
TimerOn();
transpose();
TimerOff();
Report( "mm(t)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'i':
TimerOn();
reg_loops();
TimerOff();
Report( "mm(i)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'b':
TimerOn();
tiling(parm);
TimerOff();
Report( "mm(b)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'm':
TimerOn();
maeno(parm);
TimerOff();
Report( "mm(m)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'w':
TimerOn();
warner(parm);
TimerOff();
Report( "mm(w)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
case 'r':
TimerOn();
robert();
TimerOff();
Report( "mm(r)", TimerElapsed() );
printf(" utime %10.2f secs\n", TimerElapsed() );
break;
default:
printf("unknown algorithm!\n");
exit(1);
}
/* check result */
sum = 0.5*((double)(N*(N-1)));
for (i=0;i<N;i++){
row_sum = ((double)i)*sum;
for (j=0;j<N;j++){
if (c[i][j]!=row_sum){
printf("error in result entry c[%d][%d]: %e != %e\n",
i,j,c[i][j],row_sum);
exit(1);
}
if (a[i][j]!=((double)i)){
printf("error in entry a[%d][%d]: %e != %d\n",
i,j,a[i][j],i);
exit(1);
}
if (b[i][j]!=((double)i)){
printf("error in entry b[%d][%d]: %e != %d\n",
i,j,b[i][j],i);
exit(1);
}
}
}
exit( EXIT_SUCCESS );
}
void msg(cmd)
char *cmd;
{
printf(" usage: %s -<option> [<n>]\n",cmd);
printf(" option n - normal matrix multiply\n");
printf(" option v - normal matrix multiply using temp variable\n");
printf(" option u - innermost loop unrolled by factor of n=4,8,16\n");
printf(" option p - matrix multiply using pointers\n");
printf(" option t - matrix multiply using transpose of b matrix\n");
printf(" option i - matrix multiply with interchanged loops\n");
printf(" option b - matrix multiply using blocking with step=n\n");
printf(" option m - matrix multiply using Maeno method of blocking,\n");
printf(" n specifies size of subarray (divisible by 4)\n");
printf(" option w - matrix multiply using Warner method of blocking,\n");
printf(" n specifies size of subarray (divisible by 2)\n");
printf(" option r - matrix multiply using the Robert method with:\n");
printf(" b transpose, pointers, and temporary variables\n");
}
void normal(){
int i,j,k;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
c[i][j] = 0.0;
for (k=0;k<N;k++){
c[i][j] += a[i][k]*b[k][j];
}
}
}
}
void tnsq(){
int i,j,k;
double temp;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
temp = a[i][0]*b[0][j];
for (k=1;k<N;k++){
temp += a[i][k]*b[k][j];
}
c[i][j] = temp;
}
}
}
void unroll4(){
int i,j,k;
double temp;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
temp = 0.0;
for (k=0;k<(N-3);k+=4){
temp += a[i][k]*b[k][j];
temp += a[i][k+1]*b[k+1][j];
temp += a[i][k+2]*b[k+2][j];
temp += a[i][k+3]*b[k+3][j];
}
for (;k<N;k++){
temp += a[i][k]*b[k][j];
}
c[i][j] = temp;
}
}
}
void unroll8(){
int i,j,k;
double temp;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
temp = 0.0;
for (k=0;k<(N-7);k+=8){
temp += a[i][k]*b[k][j];
temp += a[i][k+1]*b[k+1][j];
temp += a[i][k+2]*b[k+2][j];
temp += a[i][k+3]*b[k+3][j];
temp += a[i][k+4]*b[k+4][j];
temp += a[i][k+5]*b[k+5][j];
temp += a[i][k+6]*b[k+6][j];
temp += a[i][k+7]*b[k+7][j];
}
for (;k<N;k++){
temp += a[i][k]*b[k][j];
}
c[i][j] = temp;
}
}
}
void unroll16(){
int i,j,k;
double temp;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
temp = 0.0;
for (k=0;k<(N-15);k+=16){
temp += a[i][k]*b[k][j];
temp += a[i][k+1]*b[k+1][j];
temp += a[i][k+2]*b[k+2][j];
temp += a[i][k+3]*b[k+3][j];
temp += a[i][k+4]*b[k+4][j];
temp += a[i][k+5]*b[k+5][j];
temp += a[i][k+6]*b[k+6][j];
temp += a[i][k+7]*b[k+7][j];
temp += a[i][k+8]*b[k+8][j];
temp += a[i][k+9]*b[k+9][j];
temp += a[i][k+10]*b[k+10][j];
temp += a[i][k+11]*b[k+11][j];
temp += a[i][k+12]*b[k+12][j];
temp += a[i][k+13]*b[k+13][j];
temp += a[i][k+14]*b[k+14][j];
temp += a[i][k+15]*b[k+15][j];
}
for (;k<N;k++){
temp += a[i][k]*b[k][j];
}
c[i][j] = temp;
}
}
}
void pnsq(){
int i,j,k;
double temp,*pa,*pb;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
pa = a[i];
pb = &b[0][j];
temp = (*pa++)*(*pb);
for (k=1;k<N;k++){
pb = pb + N;
temp += (*pa++)*(*pb);
}
c[i][j] = temp;
}
}
}
void transpose(){
int i,j,k;
double temp;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
bt[j][i] = b[i][j];
}
}
for (i=0;i<N;i++){
for (j=0;j<N;j++){
temp = a[i][0]*bt[j][0];
for (k=1;k<N;k++){
temp += a[i][k]*bt[j][k];
}
c[i][j] = temp;
}
}
}
/* from Monica Lam ASPLOS-IV paper */
void reg_loops(){
int i,j,k;
register double a_entry;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
c[i][j] = 0.0;
}
}
for (i=0;i<N;i++){
for (k=0;k<N;k++){
a_entry = a[i][k];
for (j=0;j<N;j++){
c[i][j] += a_entry*b[k][j];
}
}
}
}
/* from Monica Lam ASPLOS-IV paper */
void tiling(step)
int step;
{
int i,j,k,jj,kk;
register double a_entry;
for (i=0;i<N;i++){
for (j=0;j<N;j++){
c[i][j] = 0.0;
}
}
for(kk=0;kk<N;kk+=step){
for(jj=0;jj<N;jj+=step){
for (i=0;i<N;i++){
for(k=kk;k<min(kk+step,N);k++){
a_entry = a[i][k];
for(j=jj;j<min(jj+step,N);j++){
c[i][j] += a_entry*b[k][j];
}
}
}
}
}
}
/* Matrix Multiply tuned for SS-10/30;
* Maeno Toshinori
* Tokyo Institute of Technology
*
* Using gcc-2.4.1 (-O2), this program ends in 12 seconds on SS-10/30.
*
* in original algorithm - sub-area for cache tiling
* #define L 20
* #define L2 20
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -