/* File: c_dotp_flt.c */
/* (Web code-viewer "font size" control label; not part of the source.) */
/****************************************************************************/
/* Copyright (C) 1996-2000 Texas Instruments Incorporated */
/* All Rights Reserved */
/* */
/* C_DOTP_FLT.C - Floating point dot product example. */
/* Example code from Programmer's Guide on optimizing C code.*/
/* */
/****************************************************************************/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* FHI(a) / FLO(a): pass one double through _hi / _lo and then _itof.       */
/* _itof, _hi and _lo are compiler intrinsics that are not declared in this */
/* file. Per the TI C6000 compiler documentation, _hi/_lo yield the upper/  */
/* lower 32-bit word of a double and _itof reinterprets a 32-bit integer    */
/* bit pattern as a float -- TODO confirm against the toolchain in use.     */
#define FHI(a) _itof(_hi(a))
#define FLO(a) _itof(_lo(a))
/* Largest absolute difference main() tolerates between two dot-product     */
/* results before it reports a failure.                                     */
#define THRESHOLD 0.001
/* Dot-product routines defined later in this file. dotp2 and dotp3 receive */
/* their data through double pointers; main() passes float arrays cast to   */
/* double * together with the array length in floats.                       */
float dotp1(float * restrict, float * restrict, int);
float dotp2(double * restrict, double * restrict, int);
float dotp3(double * restrict, double * restrict, int);
/* a1/b1 are the inputs given to dotp1(); a2/b2 are the inputs given to     */
/* dotp2() as double *. The DATA_ALIGN pragmas ask for an alignment of 8    */
/* for a2 and b2 (bytes, per the TI compiler documentation).                */
#pragma DATA_ALIGN(a2, 8)
#pragma DATA_ALIGN(b2, 8)
float a1[512], b1[512], a2[512], b2[512];
/* Values returned by dotp1(), dotp2() and dotp3() respectively in main().  */
float ret1, ret2, ret3;
/****************************************************************************/
/* TOP LEVEL DRIVER FOR THE TEST.                                           */
/*                                                                          */
/* Fills three copies of the same pseudo-random input vectors, times        */
/* dotp1(), dotp2() and dotp3() with clock(), prints each elapsed clock()   */
/* count (labelled "cycles") and reports whether successive results agree   */
/* to within THRESHOLD. Always returns 0.                                   */
/****************************************************************************/
int main(void)
{
/* Bank placement request for the local arrays handed to dotp3() as         */
/* double *. NOTE(review): presumably bank 0 also provides the alignment    */
/* needed for double-word access -- confirm in the compiler manual.         */
#pragma DATA_MEM_BANK(a3, 0);
#pragma DATA_MEM_BANK(b3, 0);
    float a3[512], b3[512];
    clock_t t_overhead, t_start, t_stop;
    int i;

    /************************************************************************/
    /* INITIALIZE INPUT ARRAYS                                              */
    /* The same value is stored in all three copies so that the three       */
    /* routines operate on identical data.                                  */
    /************************************************************************/
    for (i = 0; i < 512; i++)
    {
        a1[i] = a2[i] = a3[i] = rand() * 0.000123;
        b1[i] = b2[i] = b3[i] = rand() * 0.000123;
    }

    /************************************************************************/
    /* COMPUTE THE OVERHEAD OF CALLING CLOCK TWICE TO GET TIMING INFO.      */
    /************************************************************************/
    t_start = clock();
    t_stop = clock();
    t_overhead = t_stop - t_start;

    /************************************************************************/
    /* TIME DOTP1                                                           */
    /* clock_t has an implementation-defined type, so the difference is     */
    /* converted to long to match the %ld conversion.                       */
    /************************************************************************/
    t_start = clock();
    ret1 = dotp1(a1, b1, 512);
    t_stop = clock();
    printf("DOTP1: %ld cycles\n", (long)(t_stop - t_start - t_overhead));

    /************************************************************************/
    /* TIME DOTP2                                                           */
    /************************************************************************/
    t_start = clock();
    ret2 = dotp2((double *)a2, (double *)b2, 512);
    t_stop = clock();
    printf("DOTP2: %ld cycles\n", (long)(t_stop - t_start - t_overhead));
    if (fabs(ret1 - ret2) > THRESHOLD)
    {
        printf("Result failure dotp2()\n");
    }
    else
    {
        printf("Correct result dotp2()\n");
    }

    /************************************************************************/
    /* TIME DOTP3                                                           */
    /************************************************************************/
    t_start = clock();
    ret3 = dotp3((double *)a3, (double *)b3, 512);
    t_stop = clock();
    printf("DOTP3: %ld cycles\n", (long)(t_stop - t_start - t_overhead));
    if (fabs(ret2 - ret3) > THRESHOLD)
    {
        printf("Result failure dotp3()\n");
    }
    else
    {
        printf("Correct result dotp3()\n");
    }

    return 0;
}
/****************************************************************************/
/* DOTP1 - BASIC FORM.                                                      */
/*                                                                          */
/* Returns the dot product of the first N elements of a and b, accumulated  */
/* in a single float in index order. Returns 0 when N <= 0.                 */
/****************************************************************************/
float dotp1(float a[restrict], float b[restrict], int N)
{
    float acc = 0;
    int   k   = 0;

    /* Each product is added as it is formed, lowest index first. */
    while (k < N)
    {
        acc += a[k] * b[k];
        k++;
    }

    return acc;
}
/****************************************************************************/
/* DOTP2 - USING INTRINSICS                                                 */
/*                                                                          */
/* a and b are walked as doubles; each double is split with _hi/_lo and     */
/* _itof into two float operands (intrinsic semantics per the TI compiler   */
/* documentation). The loop runs N/2 times, so with an odd N the final      */
/* float is not included. The "hi" and "lo" products are accumulated in     */
/* separate floats that are added once at the end.                          */
/****************************************************************************/
float dotp2(double a[restrict], double b[restrict], int N)
{
    const int pairs = N / 2;    /* number of double elements to visit */
    float hi_acc = 0;
    float lo_acc = 0;
    int   k;

    for (k = 0; k < pairs; k++)
    {
        hi_acc += _itof(_hi(a[k])) * _itof(_hi(b[k]));
        lo_acc += _itof(_lo(a[k])) * _itof(_lo(b[k]));
    }

    return hi_acc + lo_acc;
}
/****************************************************************************/
/* DOTP3 - PEAK PERFORMANCE                                                 */
/*                                                                          */
/* Same computation as dotp2(), unrolled four doubles (eight floats) per    */
/* iteration with eight separate accumulators that are combined pairwise    */
/* after the loop. N/2 doubles are consumed in total.                       */
/*                                                                          */
/* The unrolled loop only covers whole groups of four doubles; any          */
/* remaining 1-3 doubles are handled by a tail loop so that nothing beyond  */
/* index N/2 - 1 is read. When N is a multiple of 8 the tail loop does not  */
/* execute and the result is the same as the fully unrolled computation.    */
/* Returns 0 when N < 2.                                                    */
/****************************************************************************/
float dotp3(double a[restrict], double b[restrict], int N)
{
    int i;
    int pairs = N / 2;                   /* double elements to process      */
    int unrolled = pairs - (pairs % 4);  /* pairs rounded toward 0 to a     */
                                         /* multiple of 4                   */
    float sum0 = 0;
    float sum1 = 0;
    float sum2 = 0;
    float sum3 = 0;
    float sum4 = 0;
    float sum5 = 0;
    float sum6 = 0;
    float sum7 = 0;

    /* Main body: four doubles per trip, each feeding its own pair of sums. */
    for (i = 0; i < unrolled; i += 4)
    {
        sum0 += FHI(a[i]) * FHI(b[i]);
        sum1 += FLO(a[i]) * FLO(b[i]);
        sum2 += FHI(a[i+1]) * FHI(b[i+1]);
        sum3 += FLO(a[i+1]) * FLO(b[i+1]);
        sum4 += FHI(a[i+2]) * FHI(b[i+2]);
        sum5 += FLO(a[i+2]) * FLO(b[i+2]);
        sum6 += FHI(a[i+3]) * FHI(b[i+3]);
        sum7 += FLO(a[i+3]) * FLO(b[i+3]);
    }

    /* Tail: leftover doubles when N/2 is not a multiple of 4. */
    for (; i < pairs; i++)
    {
        sum0 += FHI(a[i]) * FHI(b[i]);
        sum1 += FLO(a[i]) * FLO(b[i]);
    }

    /* Pairwise reduction of the eight partial sums. */
    sum0 += sum1;
    sum2 += sum3;
    sum4 += sum5;
    sum6 += sum7;
    sum0 += sum2;
    sum4 += sum6;
    return sum0 + sum4;
}
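/*
 * Web code-viewer keyboard shortcut help (not part of the program):
 *   Copy code:              Ctrl + C
 *   Search code:            Ctrl + F
 *   Full-screen mode:       F11
 *   Toggle theme:           Ctrl + Shift + D
 *   Show keyboard shortcuts: ?
 *   Increase font size:     Ctrl + =
 *   Decrease font size:     Ctrl + -
 */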