atl_gemvn_8x32_2.c
来自「基于BLAS CLAPACK的.用过的人知道是干啥的」· C语言 代码 · 共 448 行 · 第 1/2 页
C
448 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 1999 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include "atlas_misc.h"#include "atlas_level1.h"#include "atlas_level2.h"#include "atlas_lvl2.h"#define Yass(y_) (y_) +=#ifdef BETA1static void gemv8x32(const int M, const int N, const TYPE *A, const int lda, const TYPE *X, const SCALAR beta, TYPE *Y)/* * Try to let compiler do all the work */{ const TYPE *A0=A, *A1=A+lda, *A2=A1+lda, *A3=A2+lda; const TYPE *A4=A3+lda, *A5=A4+lda, *A6=A5+lda, *A7=A6+lda; const TYPE *A8=A7+lda, *A9=A8+lda, *A10=A9+lda, *A11=A10+lda; const TYPE *A12=A11+lda, *A13=A12+lda, *A14=A13+lda, *A15=A14+lda; const TYPE *A16=A15+lda, *A17=A16+lda, *A18=A17+lda, *A19=A18+lda; const TYPE *A20=A19+lda, *A21=A20+lda, *A22=A21+lda, *A23=A22+lda; const TYPE *A24=A23+lda, *A25=A24+lda, *A26=A25+lda, *A27=A26+lda; const TYPE *A28=A27+lda, *A29=A28+lda, *A30=A29+lda, *A31=A30+lda; const TYPE x0=*X, x1=X[1], x2=X[2], x3=X[3]; const TYPE x4=X[4], x5=X[5], x6=X[6], x7=X[7]; const TYPE x8=X[8], x9=X[9], x10=X[10], x11=X[11]; const TYPE x12=X[12], x13=X[13], x14=X[14], x15=X[15]; const TYPE x16=X[16], x17=X[17], x18=X[18], x19=X[19]; const TYPE x20=X[20], x21=X[21], x22=X[22], x23=X[23]; const TYPE x24=X[24], x25=X[25], x26=X[26], x27=X[27]; const TYPE x28=X[28], x29=X[29], x30=X[30], x31=X[31]; TYPE *stY = Y + ((M>>3)<<3); ATL_assert(M >= 8 && (N == 32)); do { Yass(*Y) *A0 * x0 + *A1 * x1 + *A2 * x2 + *A3 * x3 + *A4 * x4 + *A5 * x5 + *A6 * x6 + *A7 * x7 + *A8 * x8 + *A9 * x9 + *A10 * x10 + *A11 * x11 + *A12 * x12 + *A13 * x13 + *A14 * x14 + *A15 * x15 + *A16 * x16 + *A17 * x17 + *A18 * x18 + *A19 * x19 + *A20 * x20 + *A21 * x21 + *A22 * x22 + *A23 * x23 + *A24 * x24 + *A25 * x25 + *A26 * x26 + *A27 * x27 + *A28 * x28 + *A29 * x29 + *A30 * x30 + *A31 * x31; Yass(Y[1]) A0[1]*x0 + A1[1]*x1 + A2[1]*x2 + A3[1]*x3 + A4[1]*x4 + A5[1]*x5 + A6[1]*x6 + A7[1]*x7 + A8[1]*x8 + A9[1]*x9 + A10[1]*x10 + A11[1]*x11 + A12[1]*x12 + A13[1]*x13 + A14[1]*x14 + A15[1]*x15 + A16[1]*x16 + A17[1]*x17 + A18[1]*x18 + A19[1]*x19 + A20[1]*x20 + A21[1]*x21 + A22[1]*x22 + A23[1]*x23 
+ A24[1]*x24 + A25[1]*x25 + A26[1]*x26 + A27[1]*x27 + A28[1]*x28 + A29[1]*x29 + A30[1]*x30 + A31[1]*x31; Yass(Y[2]) A0[2]*x0 + A1[2]*x1 + A2[2]*x2 + A3[2]*x3 + A4[2]*x4 + A5[2]*x5 + A6[2]*x6 + A7[2]*x7 + A8[2]*x8 + A9[2]*x9 + A10[2]*x10 + A11[2]*x11 + A12[2]*x12 + A13[2]*x13 + A14[2]*x14 + A15[2]*x15 + A16[2]*x16 + A17[2]*x17 + A18[2]*x18 + A19[2]*x19 + A20[2]*x20 + A21[2]*x21 + A22[2]*x22 + A23[2]*x23 + A24[2]*x24 + A25[2]*x25 + A26[2]*x26 + A27[2]*x27 + A28[2]*x28 + A29[2]*x29 + A30[2]*x30 + A31[2]*x31; Yass(Y[3]) A0[3]*x0 + A1[3]*x1 + A2[3]*x2 + A3[3]*x3 + A4[3]*x4 + A5[3]*x5 + A6[3]*x6 + A7[3]*x7 + A8[3]*x8 + A9[3]*x9 + A10[3]*x10 + A11[3]*x11 + A12[3]*x12 + A13[3]*x13 + A14[3]*x14 + A15[3]*x15 + A16[3]*x16 + A17[3]*x17 + A18[3]*x18 + A19[3]*x19 + A20[3]*x20 + A21[3]*x21 + A22[3]*x22 + A23[3]*x23 + A24[3]*x24 + A25[3]*x25 + A26[3]*x26 + A27[3]*x27 + A28[3]*x28 + A29[3]*x29 + A30[3]*x30 + A31[3]*x31; Yass(Y[4]) A0[4]*x0 + A1[4]*x1 + A2[4]*x2 + A3[4]*x3 + A4[4]*x4 + A5[4]*x5 + A6[4]*x6 + A7[4]*x7 + A8[4]*x8 + A9[4]*x9 + A10[4]*x10 + A11[4]*x11 + A12[4]*x12 + A13[4]*x13 + A14[4]*x14 + A15[4]*x15 + A16[4]*x16 + A17[4]*x17 + A18[4]*x18 + A19[4]*x19 + A20[4]*x20 + A21[4]*x21 + A22[4]*x22 + A23[4]*x23 + A24[4]*x24 + A25[4]*x25 + A26[4]*x26 + A27[4]*x27 + A28[4]*x28 + A29[4]*x29 + A30[4]*x30 + A31[4]*x31; Yass(Y[5]) A0[5]*x0 + A1[5]*x1 + A2[5]*x2 + A3[5]*x3 + A4[5]*x4 + A5[5]*x5 + A6[5]*x6 + A7[5]*x7 + A8[5]*x8 + A9[5]*x9 + A10[5]*x10 + A11[5]*x11 + A12[5]*x12 + A13[5]*x13 + A14[5]*x14 + A15[5]*x15 + A16[5]*x16 + A17[5]*x17 + A18[5]*x18 + A19[5]*x19 + A20[5]*x20 + A21[5]*x21 + A22[5]*x22 + A23[5]*x23 + A24[5]*x24 + A25[5]*x25 + A26[5]*x26 + A27[5]*x27 + A28[5]*x28 + A29[5]*x29 + A30[5]*x30 + A31[5]*x31; Yass(Y[6]) A0[6]*x0 + A1[6]*x1 + A2[6]*x2 + A3[6]*x3 + A4[6]*x4 + A5[6]*x5 + A6[6]*x6 + A7[6]*x7 + A8[6]*x8 + A9[6]*x9 + A10[6]*x10 + A11[6]*x11 + A12[6]*x12 + A13[6]*x13 + A14[6]*x14 + A15[6]*x15 + A16[6]*x16 + A17[6]*x17 + A18[6]*x18 + A19[6]*x19 + A20[6]*x20 + 
A21[6]*x21 + A22[6]*x22 + A23[6]*x23 + A24[6]*x24 + A25[6]*x25 + A26[6]*x26 + A27[6]*x27 + A28[6]*x28 + A29[6]*x29 + A30[6]*x30 + A31[6]*x31; Yass(Y[7]) A0[7]*x0 + A1[7]*x1 + A2[7]*x2 + A3[7]*x3 + A4[7]*x4 + A5[7]*x5 + A6[7]*x6 + A7[7]*x7 + A8[7]*x8 + A9[7]*x9 + A10[7]*x10 + A11[7]*x11 + A12[7]*x12 + A13[7]*x13 + A14[7]*x14 + A15[7]*x15 + A16[7]*x16 + A17[7]*x17 + A18[7]*x18 + A19[7]*x19 + A20[7]*x20 + A21[7]*x21 + A22[7]*x22 + A23[7]*x23 + A24[7]*x24 + A25[7]*x25 + A26[7]*x26 + A27[7]*x27 + A28[7]*x28 + A29[7]*x29 + A30[7]*x30 + A31[7]*x31; A0 += 8; A1 += 8; A2 += 8; A3 += 8; A4 += 8; A5 += 8; A6 += 8; A7 += 8; A8 += 8; A9 += 8; A10 += 8; A11 += 8; A12 += 8; A13 += 8; A14 += 8; A15 += 8; A16 += 8; A17 += 8; A18 += 8; A19 += 8; A20 += 8; A21 += 8; A22 += 8; A23 += 8; A24 += 8; A25 += 8; A26 += 8; A27 += 8; A28 += 8; A29 += 8; A30 += 8; A31 += 8; Y += 8; } while (Y != stY);}static void gemv1x32(const int M, const int N, const TYPE *A, const int lda, const TYPE *X, const SCALAR beta, TYPE *Y)/* * Try to let compiler do all the work */{ const TYPE *A0=A, *A1=A+lda, *A2=A1+lda, *A3=A2+lda; const TYPE *A4=A3+lda, *A5=A4+lda, *A6=A5+lda, *A7=A6+lda; const TYPE *A8=A7+lda, *A9=A8+lda, *A10=A9+lda, *A11=A10+lda; const TYPE *A12=A11+lda, *A13=A12+lda, *A14=A13+lda, *A15=A14+lda; const TYPE *A16=A15+lda, *A17=A16+lda, *A18=A17+lda, *A19=A18+lda; const TYPE *A20=A19+lda, *A21=A20+lda, *A22=A21+lda, *A23=A22+lda; const TYPE *A24=A23+lda, *A25=A24+lda, *A26=A25+lda, *A27=A26+lda; const TYPE *A28=A27+lda, *A29=A28+lda, *A30=A29+lda, *A31=A30+lda; const TYPE x0=*X, x1=X[1], x2=X[2], x3=X[3]; const TYPE x4=X[4], x5=X[5], x6=X[6], x7=X[7]; const TYPE x8=X[8], x9=X[9], x10=X[10], x11=X[11]; const TYPE x12=X[12], x13=X[13], x14=X[14], x15=X[15]; const TYPE x16=X[16], x17=X[17], x18=X[18], x19=X[19]; const TYPE x20=X[20], x21=X[21], x22=X[22], x23=X[23]; const TYPE x24=X[24], x25=X[25], x26=X[26], x27=X[27]; const TYPE x28=X[28], x29=X[29], x30=X[30], x31=X[31]; TYPE *stY = Y + M; 
ATL_assert(N == 32); do { Yass(*Y) *A0 * x0 + *A1 * x1 + *A2 * x2 + *A3 * x3 + *A4 * x4 + *A5 * x5 + *A6 * x6 + *A7 * x7 + *A8 * x8 + *A9 * x9 + *A10 * x10 + *A11 * x11 + *A12 * x12 + *A13 * x13 + *A14 * x14 + *A15 * x15 + *A16 * x16 + *A17 * x17 + *A18 * x18 + *A19 * x19 + *A20 * x20 + *A21 * x21 + *A22 * x22 + *A23 * x23 + *A24 * x24 + *A25 * x25 + *A26 * x26 + *A27 * x27 + *A28 * x28 + *A29 * x29 + *A30 * x30 + *A31 * x31; Y++; A0++; A1++; A2++; A3++; A4++; A5++; A6++; A7++; A8++; A9++; A10++; A11++; A12++; A13++; A14++; A15++; A16++; A17++; A18++; A19++; A20++; A21++; A22++;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?