⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zsymv_k.c

📁 Optimized GotoBLAS libraries
💻 C
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#include <stdio.h>
#include <ctype.h>
#include "common.h"

/*
 * Symmetric matrix-vector kernel on interleaved (real, imaginary) FLOAT
 * pairs: accumulates A * (alpha * x) into y, reading only one triangle
 * of A and using each off-diagonal entry for both of its symmetric
 * positions.
 *
 * Which triangle is read is fixed at compile time:
 *   - LOWER not defined: for each column, rows 0 .. diagonal are read.
 *   - LOWER defined:     for each column, rows diagonal .. m-1 are read.
 * ("column" here means one lda-strided vector of a; presumably the
 * matrix is column-major -- confirm against the callers.)
 *
 * Parameters:
 *   m                 number of elements in x and y / order of A.
 *   alpha_r, alpha_i  real and imaginary part of the scalar alpha.
 *   a, lda            matrix data; lda is doubled on entry to the
 *                     compute section, so it is presumably given in
 *                     (real, imaginary) pairs rather than FLOATs.
 *   x, incx           input vector; consecutive elements are incx * 2
 *                     FLOATs apart.
 *   y, incy           vector that is updated in place.  When incy != 1
 *                     the update is done on a unit-stride copy inside
 *                     buffer and written back at the end.
 *   buffer            scratch memory.  The first 4 * m FLOATs receive the
 *                     expanded alpha * x.  When incy != 1, a further
 *                     2 * m FLOATs are used starting at the next
 *                     4096-byte boundary after that region.
 *
 * Returns 0 unconditionally.
 */
int CNAME(long m, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, long lda,
          FLOAT *x, long incx, FLOAT *y, long incy, void *buffer){

  long i, is;

  FLOAT *X = buffer;
  FLOAT *Y = y;
  /* Address just past the 4 * m FLOATs of X, rounded up to a multiple of 4096. */
  /* NOTE(review): the (long) cast assumes long is wide enough to hold a
     pointer; that does not hold on LLP64 targets -- confirm the supported
     platforms. */
  FLOAT *bufferY = (FLOAT *)(((long)buffer + 4 * m * sizeof(FLOAT) + 4095) & ~4095);

  FLOAT xsum1, xsum2, xsum3, xsum4;
  FLOAT atemp1, atemp2, atemp3, atemp4;
  FLOAT atemp5, atemp6, atemp7, atemp8;
  FLOAT xtemp1, xtemp2, xtemp3, xtemp4;
  FLOAT xtemp5, xtemp6, xtemp7, xtemp8;
  FLOAT *xx, *yy;
  FLOAT *A1, *A2;

  /* Expand x into X.  With t = alpha * x[i] (complex product), element i
     is stored as the four FLOATs (t_r, t_i, -t_i, t_r).  The product of t
     with a matrix entry (a_r, a_i) is then
         real = X[0] * a_r + X[2] * a_i
         imag = X[1] * a_r + X[3] * a_i
     so the compute loops below need only multiply-adds. */
  xx = X;

  for (i = 0; i < m; i++) {
    xtemp1 = x[0];
    xtemp2 = x[1];
    x += incx * 2;

    xtemp3 = alpha_r * xtemp1 - alpha_i * xtemp2;
    xtemp4 = alpha_i * xtemp1 + alpha_r * xtemp2;

    xx[0] =  xtemp3;
    xx[1] =  xtemp4;
    xx[2] = -xtemp4;
    xx[3] =  xtemp3;
    xx += 4;
  }

  /* Work on a unit-stride copy of y when y itself is strided.
     ZCOPY_K is defined elsewhere; presumably it copies m complex elements
     from (y, incy) to (Y, 1). */
  if (incy != 1) {
    Y = bufferY;
    ZCOPY_K(m, y, incy, Y, 1);
  }

  /* From here on lda is used as a stride in FLOATs. */
  lda *= 2;

#ifndef LOWER
  /* ---- Entries at and above the diagonal; two columns per pass. ---- */
  is = 0;
  if (is + 2 <= m) {
    do {

      /* A1 / A2 start at row 0 of columns is and is + 1. */
      A1 = a + 0 * lda;
      A2 = a + 1 * lda;
      a += 2 * lda;

      /* Expanded alpha * x for elements is (atemp1..4) and is + 1 (atemp5..8). */
      atemp1 = X[is * 4 + 0];
      atemp2 = X[is * 4 + 1];
      atemp3 = X[is * 4 + 2];
      atemp4 = X[is * 4 + 3];
      atemp5 = X[is * 4 + 4];
      atemp6 = X[is * 4 + 5];
      atemp7 = X[is * 4 + 6];
      atemp8 = X[is * 4 + 7];

      /* xsum1/2: contribution to y[is]; xsum3/4: contribution to y[is + 1]. */
      xsum1 = ZERO; xsum2 = ZERO;
      xsum3 = ZERO; xsum4 = ZERO;

      xx = X;
      yy = Y;

      /* Rows 0 .. is - 1, two rows per iteration.  Each matrix entry read
         here is used twice: once for the dot product feeding y[is] and
         y[is + 1], and once to update y[i] and y[i + 1]. */
      for (i = 0; i < is; i+= 2) {
        xtemp1 = xx[0];
        xtemp2 = xx[1];
        xtemp3 = xx[2];
        xtemp4 = xx[3];
        xtemp5 = xx[4];
        xtemp6 = xx[5];
        xtemp7 = xx[6];
        xtemp8 = xx[7];
        xx += 8;

        xsum1 += xtemp1 * A1[0] + xtemp3 * A1[1] + xtemp5 * A1[2] + xtemp7 * A1[3];
        xsum2 += xtemp2 * A1[0] + xtemp4 * A1[1] + xtemp6 * A1[2] + xtemp8 * A1[3];
        xsum3 += xtemp1 * A2[0] + xtemp3 * A2[1] + xtemp5 * A2[2] + xtemp7 * A2[3];
        xsum4 += xtemp2 * A2[0] + xtemp4 * A2[1] + xtemp6 * A2[2] + xtemp8 * A2[3];

        yy[0] += atemp1 * A1[0] + atemp3 * A1[1] + atemp5 * A2[0] + atemp7 * A2[1];
        yy[1] += atemp4 * A1[1] + atemp2 * A1[0] + atemp8 * A2[1] + atemp6 * A2[0] ;
        yy[2] += atemp1 * A1[2] + atemp3 * A1[3] + atemp5 * A2[2] + atemp7 * A2[3];
        yy[3] += atemp4 * A1[3] + atemp2 * A1[2] + atemp8 * A2[3] + atemp6 * A2[2];
        yy += 4;
        A1 += 4;
        A2 += 4;
      }

      /* 2x2 diagonal block.  A1[0..1] is entry (is, is), A2[0..1] is
         (is, is + 1) and A2[2..3] is (is + 1, is + 1).  A2[0..1] is used
         for both off-diagonal positions; A1[2..3] is never read. */
      xsum1 += atemp1 * A1[0] + atemp3 * A1[1] + atemp5 * A2[0] + atemp7 * A2[1];
      xsum2 += atemp2 * A1[0] + atemp4 * A1[1] + atemp6 * A2[0] + atemp8 * A2[1];
      xsum3 += atemp1 * A2[0] + atemp3 * A2[1] + atemp5 * A2[2] + atemp7 * A2[3];
      xsum4 += atemp2 * A2[0] + atemp4 * A2[1] + atemp6 * A2[2] + atemp8 * A2[3];

      Y[is * 2 + 0] += xsum1;
      Y[is * 2 + 1] += xsum2;
      Y[is * 2 + 2] += xsum3;
      Y[is * 2 + 3] += xsum4;
      is += 2;

    } while (is + 2 <= m);
  }

  /* Odd m: one remaining column (index is), handled one row at a time. */
  if (m & 1) {
    A1 = a + 0 * lda;
    a += lda;

    atemp1 = X[is * 4 + 0];
    atemp2 = X[is * 4 + 1];
    atemp3 = X[is * 4 + 2];
    atemp4 = X[is * 4 + 3];

    xsum1 = ZERO;
    xsum2 = ZERO;

    xx = X;
    yy = Y;

    xtemp1 = ZERO;
    xtemp2 = ZERO;
    xtemp3 = ZERO;
    xtemp4 = ZERO;

    /* Rows 0 .. is - 1 of the last column. */
    for (i = 0; i < is; i++) {
      xtemp1 = xx[0];
      xtemp2 = xx[1];
      xtemp3 = xx[2];
      xtemp4 = xx[3];
      xx += 4;

      xsum1 += xtemp1 * A1[0] + xtemp3 * A1[1];
      xsum2 += xtemp2 * A1[0] + xtemp4 * A1[1];

      yy[0] += atemp1 * A1[0] + atemp3 * A1[1];
      yy[1] += atemp2 * A1[0] + atemp4 * A1[1];
      yy += 2;
      A1 += 2;
    }

    /* Diagonal entry of the last column. */
    xsum1 += atemp1 * A1[0] + atemp3 * A1[1];
    xsum2 += atemp2 * A1[0] + atemp4 * A1[1];

    Y[is * 2 + 0] += xsum1;
    Y[is * 2 + 1] += xsum2;
  }
#else
  /* ---- Entries at and below the diagonal; two columns per pass. ---- */
  is = 0;
  if (is + 2 <= m) {
    do {
      /* a tracks the diagonal: each pass advances it by two columns
         (2 * lda) and two rows (4 FLOATs). */
      A1 = a + 0 * lda;
      A2 = a + 1 * lda;
      a += 4 + 2 * lda;

      /* xx starts at element is; yy starts at the first row below the
         2x2 diagonal block (element is + 2). */
      xx = X + is * 4;
      yy = Y + is * 2 + 4;

      atemp1 = xx[0];
      atemp2 = xx[1];
      atemp3 = xx[2];
      atemp4 = xx[3];
      atemp5 = xx[4];
      atemp6 = xx[5];
      atemp7 = xx[6];
      atemp8 = xx[7];
      xx += 8;

      /* 2x2 diagonal block.  A1[0..1] is entry (is, is), A1[2..3] is
         (is + 1, is) and A2[2..3] is (is + 1, is + 1).  A1[2..3] is used
         for both off-diagonal positions; A2[0..1] is never read. */
      xsum1 = atemp1 * A1[0] + atemp3 * A1[1] + atemp5 * A1[2] + atemp7 * A1[3];
      xsum2 = atemp2 * A1[0] + atemp4 * A1[1] + atemp6 * A1[2] + atemp8 * A1[3];
      xsum3 = atemp1 * A1[2] + atemp3 * A1[3] + atemp5 * A2[2] + atemp7 * A2[3];
      xsum4 = atemp2 * A1[2] + atemp4 * A1[3] + atemp6 * A2[2] + atemp8 * A2[3];

      A1 += 4;
      A2 += 4;

      /* Number of complete row pairs below the diagonal block. */
      i = ((m - is - 2) >> 1);

      if (i > 0) {
        do {
          xtemp1 = xx[0];
          xtemp2 = xx[1];
          xtemp3 = xx[2];
          xtemp4 = xx[3];
          xtemp5 = xx[4];
          xtemp6 = xx[5];
          xtemp7 = xx[6];
          xtemp8 = xx[7];
          xx += 8;

          /* Dot products feeding y[is] and y[is + 1] ... */
          xsum1 += xtemp1 * A1[0] + xtemp3 * A1[1] + xtemp5 * A1[2] + xtemp7 * A1[3];
          xsum2 += xtemp2 * A1[0] + xtemp4 * A1[1] + xtemp6 * A1[2] + xtemp8 * A1[3];
          xsum3 += xtemp1 * A2[0] + xtemp3 * A2[1] + xtemp5 * A2[2] + xtemp7 * A2[3];
          xsum4 += xtemp2 * A2[0] + xtemp4 * A2[1] + xtemp6 * A2[2] + xtemp8 * A2[3];

          /* ... and the same entries applied to the two rows being visited. */
          yy[0] += atemp1 * A1[0] + atemp3 * A1[1] + atemp5 * A2[0] + atemp7 * A2[1];
          yy[1] += atemp2 * A1[0] + atemp4 * A1[1] + atemp6 * A2[0] + atemp8 * A2[1];
          yy[2] += atemp1 * A1[2] + atemp3 * A1[3] + atemp5 * A2[2] + atemp7 * A2[3];
          yy[3] += atemp2 * A1[2] + atemp4 * A1[3] + atemp6 * A2[2] + atemp8 * A2[3];

          yy += 4;

          A1 += 4;
          A2 += 4;
          i --;
        } while (i > 0);
      }

      /* Odd m: a single row is left over below the row pairs. */
      if  (m & 1) {
        xtemp1 = xx[0];
        xtemp2 = xx[1];
        xtemp3 = xx[2];
        xtemp4 = xx[3];
        xx += 4;

        xsum1 += xtemp1 * A1[0] + xtemp3 * A1[1];
        xsum2 += xtemp2 * A1[0] + xtemp4 * A1[1];
        xsum3 += xtemp1 * A2[0] + xtemp3 * A2[1];
        xsum4 += xtemp2 * A2[0] + xtemp4 * A2[1];

        yy[0] += atemp1 * A1[0] + atemp3 * A1[1] + atemp5 * A2[0] + atemp7 * A2[1];
        yy[1] += atemp2 * A1[0] + atemp4 * A1[1] + atemp6 * A2[0] + atemp8 * A2[1];
        yy += 2;

        A1 += 2;
        A2 += 2;
      }

      Y[is * 2 + 0] += xsum1;
      Y[is * 2 + 1] += xsum2;
      Y[is * 2 + 2] += xsum3;
      Y[is * 2 + 3] += xsum4;

      is += 2;
    } while (is + 2 <= m);
  }

  /* Odd m: the last column holds only its diagonal entry, at a. */
  if (m & 1) {
    A1 = a;
    xx = X + is * 4;

    atemp1 = xx[0];
    atemp2 = xx[1];
    atemp3 = xx[2];
    atemp4 = xx[3];

    xsum1 = atemp1 * A1[0] + atemp3 * A1[1];
    xsum2 = atemp2 * A1[0] + atemp4 * A1[1];

    Y[is * 2 + 0] += xsum1;
    Y[is * 2 + 1] += xsum2;
  }
#endif

  /* Write the unit-stride working copy back to the caller's strided y. */
  if (incy != 1) {
    ZCOPY_K(m, Y, 1, y, incy);
  }

  return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -