⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 symv_k.c

📁 Optimized GotoBLAS libraries
💻 C
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#include <stdio.h>#include <ctype.h>#include "common.h"#undef SYMV_P#define SYMV_P 4int CNAME(long m, FLOAT alpha, FLOAT *a, long lda,	  FLOAT *x, long incx, FLOAT *y, long incy, void *buffer){  long i, is;  FLOAT *X = buffer;  FLOAT *Y = y;  FLOAT *bufferY = (FLOAT *)(((long)buffer + m * sizeof(FLOAT) + 4095) & ~4095);  FLOAT xsum1, xsum2, xsum3, xsum4;  FLOAT atemp1, atemp2, atemp3, atemp4;  FLOAT xtemp1, xtemp2, xtemp3, xtemp4;  FLOAT *xx, *yy;  FLOAT *A1, *A2, *A3, *A4;  COPY_K(m, x, incx, X, 1);  if (alpha != ONE) SCAL_K(m, 0, 0, alpha, X, 1, NULL, 0, NULL, 0);  if (incy != 1) {    Y = bufferY;    COPY_K(m, y, incy, Y, 1);  }#ifndef LOWER  is = 0;  if (is + 4 <= m) {    do {      A1 = a + 0 * lda;      A2 = a + 1 * lda;      A3 = a + 2 * lda;      A4 = a + 3 * lda;      a += 4 * lda;      atemp1 = X[is + 0];      atemp2 = X[is + 1];      atemp3 = X[is + 2];      atemp4 = X[is + 3];            xsum1 = ZERO; xsum2 = ZERO;      xsum3 = ZERO; xsum4 = ZERO;	      xx = X;      yy = Y;      for (i = 0; i < is; i+= 4) {	xtemp1 = xx[0];	xtemp2 = xx[1];	xtemp3 = xx[2];	xtemp4 = xx[3];	xx += 4;		xsum1 += (xtemp1 * A1[0] + xtemp2 * A1[1]) + (xtemp3 * A1[2] + xtemp4 * A1[3]);	xsum2 += (xtemp1 * A2[0] + xtemp2 * A2[1]) + (xtemp3 * A2[2] + xtemp4 * A2[3]);	xsum3 += (xtemp1 * A3[0] + xtemp2 * A3[1]) + (xtemp3 * A3[2] + xtemp4 * A3[3]);	xsum4 += (xtemp1 * A4[0] + xtemp2 * A4[1]) + (xtemp3 * A4[2] + xtemp4 * A4[3]);		yy[0] += atemp1 * A1[0] + atemp2 * A2[0] + atemp3 * A3[0] + atemp4 * A4[0];	yy[1] += atemp1 * A1[1] + atemp2 * A2[1] + atemp3 * A3[1] + atemp4 * A4[1];	yy[2] += atemp1 * A1[2] + atemp2 * A2[2] + atemp3 * A3[2] + atemp4 * A4[2];	yy[3] += atemp1 * A1[3] + atemp2 * A2[3] + atemp3 * A3[3] + atemp4 * A4[3];	yy += 4;		A1 += 4;	A2 += 4;	A3 += 4;	A4 += 4;      }      xsum1 += atemp1 * A1[0] + atemp2 * A2[0] + atemp3 * A3[0] + atemp4 * A4[0];      xsum2 += atemp1 * A2[0] + atemp2 * A2[1] + atemp3 * A3[1] + atemp4 * A4[1];      xsum3 += atemp1 * A3[0] + atemp2 * A3[1] + atemp3 * A3[2] + atemp4 * A4[2];      xsum4 += atemp1 * A4[0] + atemp2 * A4[1] + atemp3 * A4[2] + atemp4 * A4[3];      Y[is + 0] += xsum1;      Y[is + 1] += xsum2;      Y[is + 2] += xsum3;      Y[is + 3] += xsum4;      is += 4;    } while (is + 4 <= m);  }  if (m & 2) {    A1 = a + 0 * lda;    A2 = a + 1 * lda;    a += 2 * lda;    atemp1 = X[is + 0];    atemp2 = X[is + 1];        xsum1 = ZERO; xsum2 = ZERO;    xx = X;    yy = Y;    for (i = 0; i < is; i+= 2) {      xtemp1 = xx[0];      xtemp2 = xx[1];      xx += 2;            xsum1 += (xtemp1 * A1[0] + xtemp2 * A1[1]);      xsum2 += (xtemp1 * A2[0] + xtemp2 * A2[1]);            yy[0] += atemp1 * A1[0] + atemp2 * A2[0];      yy[1] += atemp1 * A1[1] + atemp2 * A2[1];      yy += 2;            A1 += 2;      A2 += 2;    }        xsum1 += atemp1 * A1[0] + atemp2 * A2[0];    xsum2 += atemp1 * A2[0] + atemp2 * A2[1];        Y[is + 0] += xsum1;    Y[is + 1] += xsum2;    is += 2;  }  if (m & 1) {    A1 = a + 0 * lda;    a += lda;    atemp1 = X[is + 0];        xsum1 = ZERO;    xx = X;    yy = Y;    for (i = 0; i < is; i++) {      xtemp1 = xx[0];      xx += 1;            xsum1 += xtemp1 * A1[0];            yy[0] += atemp1 * A1[0];      yy += 1;            A1 ++;    }        xsum1 += atemp1 * A1[0];        Y[is + 0] += xsum1;  }#else  is = 0;  if (is + 4 <= m) {    do {      A1 = a + 0 * lda;      A2 = a + 1 * lda;      A3 = a + 2 * lda;      A4 = a + 3 * lda;      a += 4 + 4 * lda;      xx = X + is;      yy = Y + is + 4;      atemp1 = xx[0];      atemp2 = xx[1];      atemp3 = xx[2];      atemp4 = xx[3];      xx += 4;            xsum1 = atemp1 * A1[0] + atemp2 * A1[1] + atemp3 * A1[2] + atemp4 * A1[3];      xsum2 = atemp1 * A1[1] + atemp2 * A2[1] + atemp3 * A2[2] + atemp4 * A2[3];      xsum3 = atemp1 * A1[2] + atemp2 * A2[2] + atemp3 * A3[2] + atemp4 * A3[3];      xsum4 = atemp1 * A1[3] + atemp2 * A2[3] + atemp3 * A3[3] + atemp4 * A4[3];            A1 += 4;      A2 += 4;      A3 += 4;      A4 += 4;      i = ((m - is - 4) >> 2);      if (i > 0) {	do {	  xtemp1 = xx[0];	  xtemp2 = xx[1];	  xtemp3 = xx[2];	  xtemp4 = xx[3];	  xx += 4;	  	  xsum1 += (xtemp1 * A1[0] + xtemp2 * A1[1]) + (xtemp3 * A1[2] + xtemp4 * A1[3]);	  xsum2 += (xtemp1 * A2[0] + xtemp2 * A2[1]) + (xtemp3 * A2[2] + xtemp4 * A2[3]);	  xsum3 += (xtemp1 * A3[0] + xtemp2 * A3[1]) + (xtemp3 * A3[2] + xtemp4 * A3[3]);	  xsum4 += (xtemp1 * A4[0] + xtemp2 * A4[1]) + (xtemp3 * A4[2] + xtemp4 * A4[3]);	  	  yy[0] += atemp1 * A1[0] + atemp2 * A2[0] + atemp3 * A3[0] + atemp4 * A4[0];	  yy[1] += atemp1 * A1[1] + atemp2 * A2[1] + atemp3 * A3[1] + atemp4 * A4[1];	  yy[2] += atemp1 * A1[2] + atemp2 * A2[2] + atemp3 * A3[2] + atemp4 * A4[2];	  yy[3] += atemp1 * A1[3] + atemp2 * A2[3] + atemp3 * A3[3] + atemp4 * A4[3];	  yy += 4;	  	  A1 += 4;	  A2 += 4;	  A3 += 4;	  A4 += 4;	  i --;	} while (i > 0);      }      if  (m & 2) {	xtemp1 = xx[0];	xtemp2 = xx[1];	xx += 2;		xsum1 += (xtemp1 * A1[0] + xtemp2 * A1[1]);	xsum2 += (xtemp1 * A2[0] + xtemp2 * A2[1]);	xsum3 += (xtemp1 * A3[0] + xtemp2 * A3[1]);	xsum4 += (xtemp1 * A4[0] + xtemp2 * A4[1]);		yy[0] += atemp1 * A1[0] + atemp2 * A2[0] + atemp3 * A3[0] + atemp4 * A4[0];	yy[1] += atemp1 * A1[1] + atemp2 * A2[1] + atemp3 * A3[1] + atemp4 * A4[1];		yy += 2;		A1 += 2;	A2 += 2;	A3 += 2;	A4 += 2;      }      if  (m & 1) {	xtemp1 = xx[0];	xx ++;		xsum1 += xtemp1 * A1[0];	xsum2 += xtemp1 * A2[0];	xsum3 += xtemp1 * A3[0];	xsum4 += xtemp1 * A4[0];		yy[0] += atemp1 * A1[0] + atemp2 * A2[0] + atemp3 * A3[0] + atemp4 * A4[0];	yy += 1;		A1 += 1;	A2 += 1;	A3 += 1;	A4 += 1;      }      Y[is + 0] += xsum1;      Y[is + 1] += xsum2;      Y[is + 2] += xsum3;      Y[is + 3] += xsum4;      is += 4;    } while (is + 4 <= m);  }  if (m & 2) {    A1 = a + 0 * lda;    A2 = a + 1 * lda;    a += 2 + 2 * lda;    xx = X + is;    yy = Y + is + 2;    atemp1 = xx[0];    atemp2 = xx[1];    xx += 2;          xsum1 = atemp1 * A1[0] + atemp2 * A1[1];    xsum2 = atemp1 * A1[1] + atemp2 * A2[1];          A1 += 2;    A2 += 2;    if  (m & 1) {      xtemp1 = xx[0];      xx ++;            xsum1 += xtemp1 * A1[0];      xsum2 += xtemp1 * A2[0];            yy[0] += atemp1 * A1[0] + atemp2 * A2[0];      yy += 1;            A1 += 1;      A2 += 1;    }        Y[is + 0] += xsum1;    Y[is + 1] += xsum2;        is += 2;  }  if (m & 1) {    A1 = a + 0 * lda;    a += 1 + 1 * lda;    xx = X + is;    yy = Y + is + 1;    atemp1 = xx[0];    xx += 1;          xsum1 = atemp1 * A1[0];          A1 ++;    Y[is + 0] += xsum1;        is += 1;  }#endif  if (incy != 1) {    COPY_K(m, Y, 1, y, incy);  }  return 0;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -