atl_gemvn_4x2_0.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 646 行 · 第 1/2 页
C
646 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 1999 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_misc.h"#include "atlas_level1.h"#include "atlas_level2.h"#include "atlas_lvl2.h"static void gemvMlt8(const int M, const int N, const TYPE *A, const int lda, const TYPE *X, const SCALAR beta, TYPE *Y);static void gemvNle4(const int M, const int N, const TYPE *A, const int lda, const TYPE *X, const SCALAR beta, TYPE *Y);#ifdef BETA0 #define Yget(y_, yp_, bet_) (y_) = ATL_rzero#elif defined BETAX #define Yget(y_, yp_, bet_) (y_) = (yp_) * (bet_)#else #define Yget(y_, yp_, bet_) (y_) = (yp_)#endifstatic void gemvN32x4(const int M, const int N, const TYPE *A, const int lda, const TYPE *x, const SCALAR beta0, TYPE *y)/* * rank-4 daxpy based NoTrans gemv */{ const int M16 = (M>>4)<<4; TYPE *stY = y + M16 - 32; const TYPE *A0 = A, *A1 = A+lda, *A2 = A1 + lda, *A3 = A2 + lda; register TYPE z0, z1, z2, z3, z4, z5, z6, z7; register TYPE y0, y1, y2, y3, y4, y5, y6, y7; const register TYPE x0 = *x, x1 = x[1], x2 = x[2], x3 = x[3]; #ifdef BETAX const register TYPE beta = beta0; #else #define beta beta0 #endif ATL_assert(N == 4); if (M16 >= 32) { #ifdef BETA0 y0 = y1 = y2 = y3 = y4 = y5 = y6 = y7 = ATL_rzero; #else y0 = *y; y1 = y[1]; y2 = y[2]; y3 = y[3]; y4 = y[4]; y5 = y[5]; y6 = y[6]; y7 = y[7]; #ifdef BETAX y0 *= beta; y1 *= beta; y2 *= beta; y3 *= beta; y4 *= beta; y5 *= beta; y6 *= beta; y7 *= beta; #endif #endif y0 += x0 * *A0; Yget(z0, y[8], beta); y1 += x1 * A1[1]; y2 += x2 * A2[2]; Yget(z1, y[9], beta); y3 += x3 * A3[3]; y4 += x0 * A0[4]; Yget(z2, y[10], beta); y5 += x1 * A1[5]; y6 += x2 * A2[6]; Yget(z3, y[11], beta); y7 += x3 * A3[7]; y0 += x1 * *A1; Yget(z4, y[12], beta); y1 += x2 * A2[1]; y2 += x3 * A3[2]; Yget(z5, y[13], beta); y3 += x0 * A0[3]; y4 += x1 * A1[4]; Yget(z6, y[14], beta); y5 += x2 * A2[5]; y6 += x3 * A3[6]; Yget(z7, y[15], beta); y7 += x0 * A0[7]; y0 += x2 * *A2; y1 += x3 * A3[1]; y2 += x0 * A0[2]; y3 += x1 * A1[3]; y4 += x2 * A2[4]; y5 += x3 * A3[5]; y6 += x0 * A0[6]; y7 += x1 * A1[7]; y0 += x3 * *A3; y1 += x0 * A0[1]; y2 += x1 * A1[2]; y3 += x2 * A2[3]; y4 += x3 * A3[4]; y5 += x0 * A0[5]; y6 += x1 * A1[6]; y7 += x2 * A2[7]; z0 += x0 * A0[8]; *y = y0; z1 += x1 * A1[9]; z2 += x2 * A2[10]; y[1] = y1; z3 += x3 * A3[11]; z4 += x0 * A0[12]; y[2] = y2; z5 += x1 * A1[13]; z6 += x2 * A2[14]; y[3] = y3; z7 += x3 * A3[15]; z0 += x1 * A1[8]; y[4] = y4; z1 += x2 * A2[9]; z2 += x3 * A3[10]; y[5] = y5; z3 += x0 * A0[11]; z4 += x1 * A1[12]; y[6] = y6; z5 += x2 * A2[13]; z6 += x3 * A3[14]; y[7] = y7; z7 += x0 * A0[15]; z0 += x2 * A2[8]; Yget(y0, y[16], beta); z1 += x3 * A3[ 9]; z2 += x0 * A0[10]; Yget(y1, y[17], beta); z3 += x1 * A1[11]; z4 += x2 * A2[12]; Yget(y2, y[18], beta); z5 += x3 * A3[13]; z6 += x0 * A0[14]; Yget(y3, y[19], beta); z7 += x1 * A1[15]; z0 += x3 * A3[8]; Yget(y4, y[20], beta); z1 += x0 * A0[9]; z2 += x1 * A1[10]; Yget(y5, y[21], beta); z3 += x2 * A2[11]; z4 += x3 * A3[12]; Yget(y6, y[22], beta); A3 += 16; z5 += x0 * A0[13]; A0 += 16; z6 += x1 * A1[14]; Yget(y7, y[23], beta); A1 += 16; z7 += x2 * A2[15]; A2 += 16; if (M16 != 32) { do { y0 += x0 * *A0; y[8] = z0; y1 += x1 * A1[1]; y2 += x2 * A2[2]; y[9] = z1; y3 += x3 * A3[3]; y4 += x0 * A0[4]; y[10] = z2; y5 += x1 * A1[5]; y6 += x2 * A2[6]; y[11] = z3; y7 += x3 * A3[7]; y0 += x1 * *A1; y[12] = z4; y1 += x2 * A2[1]; y2 += x3 * A3[2]; y[13] = z5; y3 += x0 * A0[3]; y4 += x1 * A1[4]; y[14] = z6; y5 += x2 * A2[5]; y6 += x3 * A3[6]; y[15] = z7; y += 16; y7 += x0 * A0[7]; y0 += x2 * *A2; Yget(z0, y[8], beta); y1 += x3 * A3[1]; y2 += x0 * A0[2]; Yget(z1, y[9], beta); y3 += x1 * A1[3]; y4 += x2 * A2[4]; Yget(z2, y[10], beta); y5 += x3 * A3[5]; y6 += x0 * A0[6]; Yget(z3, y[11], beta); y7 += x1 * A1[7]; y0 += x3 * *A3; Yget(z4, y[12], beta); y1 += x0 * A0[1]; y2 += x1 * A1[2]; Yget(z5, y[13], beta); y3 += x2 * A2[3]; y4 += x3 * A3[4]; Yget(z6, y[14], beta); y5 += x0 * A0[5]; y6 += x1 * A1[6]; Yget(z7, y[15], beta); y7 += x2 * A2[7]; z0 += x0 * A0[8]; *y = y0; z1 += x1 * A1[9]; z2 += x2 * A2[10]; y[1] = y1; z3 += x3 * A3[11]; z4 += x0 * A0[12]; y[2] = y2; z5 += x1 * A1[13]; z6 += x2 * A2[14]; y[3] = y3; z7 += x3 * A3[15]; z0 += x1 * A1[8]; y[4] = y4; z1 += x2 * A2[9]; z2 += x3 * A3[10]; y[5] = y5; z3 += x0 * A0[11]; z4 += x1 * A1[12]; y[6] = y6; z5 += x2 * A2[13]; z6 += x3 * A3[14]; y[7] = y7; z7 += x0 * A0[15]; z0 += x2 * A2[8]; Yget(y0, y[16], beta); z1 += x3 * A3[ 9]; z2 += x0 * A0[10]; Yget(y1, y[17], beta); z3 += x1 * A1[11]; z4 += x2 * A2[12]; Yget(y2, y[18], beta); z5 += x3 * A3[13]; z6 += x0 * A0[14]; Yget(y3, y[19], beta); z7 += x1 * A1[15]; z0 += x3 * A3[8]; Yget(y4, y[20], beta); z1 += x0 * A0[9]; z2 += x1 * A1[10]; Yget(y5, y[21], beta); z3 += x2 * A2[11]; z4 += x3 * A3[12]; Yget(y6, y[22], beta); A3 += 16; z5 += x0 * A0[13]; A0 += 16; z6 += x1 * A1[14]; Yget(y7, y[23], beta); A1 += 16; z7 += x2 * A2[15]; A2 += 16; } while (y != stY); } y0 += x0 * *A0; y[8] = z0; y1 += x1 * A1[1]; y2 += x2 * A2[2]; y[9] = z1; y3 += x3 * A3[3]; y4 += x0 * A0[4]; y[10] = z2; y5 += x1 * A1[5]; y6 += x2 * A2[6]; y[11] = z3; y7 += x3 * A3[7]; y0 += x1 * *A1; y[12] = z4; y1 += x2 * A2[1]; y2 += x3 * A3[2]; y[13] = z5; y3 += x0 * A0[3]; y4 += x1 * A1[4]; y[14] = z6; y5 += x2 * A2[5]; y6 += x3 * A3[6]; y[15] = z7; y += 16; y7 += x0 * A0[7]; y0 += x2 * *A2; Yget(z0, y[8], beta); y1 += x3 * A3[1]; y2 += x0 * A0[2]; Yget(z1, y[9], beta); y3 += x1 * A1[3]; y4 += x2 * A2[4]; Yget(z2, y[10], beta); y5 += x3 * A3[5]; y6 += x0 * A0[6]; Yget(z3, y[11], beta); y7 += x1 * A1[7]; y0 += x3 * *A3; Yget(z4, y[12], beta); y1 += x0 * A0[1]; y2 += x1 * A1[2]; Yget(z5, y[13], beta); y3 += x2 * A2[3]; y4 += x3 * A3[4]; Yget(z6, y[14], beta); y5 += x0 * A0[5]; y6 += x1 * A1[6]; Yget(z7, y[15], beta); y7 += x2 * A2[7]; z0 += x0 * A0[8]; *y = y0; z1 += x1 * A1[9]; z2 += x2 * A2[10]; y[1] = y1; z3 += x3 * A3[11]; z4 += x0 * A0[12]; y[2] = y2; z5 += x1 * A1[13]; z6 += x2 * A2[14]; y[3] = y3; z7 += x3 * A3[15]; z0 += x1 * A1[8]; y[4] = y4; z1 += x2 * A2[9]; z2 += x3 * A3[10]; y[5] = y5; z3 += x0 * A0[11]; z4 += x1 * A1[12]; y[6] = y6; z5 += x2 * A2[13]; z6 += x3 * A3[14]; y[7] = y7; z7 += x0 * A0[15]; z0 += x2 * A2[8]; z1 += x3 * A3[ 9]; z2 += x0 * A0[10]; z3 += x1 * A1[11]; z4 += x2 * A2[12]; z5 += x3 * A3[13]; z6 += x0 * A0[14]; z7 += x1 * A1[15]; z0 += x3 * A3[8]; z1 += x0 * A0[9]; z2 += x1 * A1[10]; z3 += x2 * A2[11]; z4 += x3 * A3[12]; z5 += x0 * A0[13]; z6 += x1 * A1[14]; z7 += x2 * A2[15]; y[8] = z0; y[9] = z1; y[10] = z2; y[11] = z3; y[12] = z4; y[13] = z5; y[14] = z6; y[15] = z7; if (M-M16) gemvMlt8(M-M16, N, A0+16, lda, x, beta, y+16); } else if (N) gemvMlt8(M, N, A, lda, x, beta, y);}static void gemv32x4(const int M, const int N, const TYPE *A, const int lda, const TYPE *X, const SCALAR beta, TYPE *Y){ #ifdef BETA1 int j; #endif const int incA = lda<<2; if (N >= 4) { if (M >= 32) {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?