atl_dmm4x4x8_us.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,163 行 · 第 1/2 页
C
1,163 行
! Automatically Tuned Linear Algebra Software v3.8.0
! (C) Copyright 2002 R. Clint Whaley
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the ATLAS group or the names of its contributers may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
! KB defaults to 0 when the build does not supply it.  The kernel body
! tests KB against 8 and against 0 to decide whether the K loop is
! emitted and whether its trip count is checked at run time.
!
#if !defined(KB)
   #define KB 0
#endif
#ifndef ATL_GAS_SPARC
   #error "This kernel requires sparc assembler!"
#endif
!
! Prefetch tuning.  PFD and PFC are the byte offsets added to the A and C
! pointers in the prefetch operands.  The read variants issue prefetch
! with function code 0 and the write variant with function code 2.
! Change the "if 1" below to "if 0" to compile every prefetch away.
!
#define PFD 168
#define PFC 64
#if 1
   #define prefR1(mem) prefetch mem, 0
   #define prefR2(mem) prefetch mem, 0
   #define prefW2(mem) prefetch mem, 2
#else
   #define prefW2(mem)
   #define prefR2(mem)
   #define prefR1(mem)
#endif
!
! Floating point registers
!
! rA0-rA3 and ra0-ra3 are two banks of A operands that the unrolled K
! iterations alternate between; rB0-rB3 hold B operands; m0-m3 receive
! the products; rCij accumulates element i of the column addressed by pCj.
!
#define rA0 %f0
#define rA1 %f2
#define rA2 %f4
#define rA3 %f6
#define ra0 %f8
#define ra1 %f10
#define ra2 %f12
#define ra3 %f14
#define rB0 %f16
#define rB1 %f18
#define rB2 %f20
#define rB3 %f22
#define m0 %f24
#define m1 %f26
#define m2 %f28
#define m3 %f30
#define rC00 %f32
#define rC10 %f34
#define rC20 %f36
#define rC30 %f38
#define rC01 %f40
#define rC11 %f42
#define rC21 %f44
#define rC31 %f46
#define rC02 %f48
#define rC12 %f50
#define rC22 %f52
#define rC32 %f54
#define rC03 %f56
#define rC13 %f58
#define rC23 %f60
#define rC33 %f62
!
! Integer registers
!
! The two maps differ only in where the A pointer arrives (see the
! argument layout comment further down) and in the register given to pfB.
! ldab lives in the register that carries the K argument on entry.
!
#ifdef ATL_USE64BITS
   #define M %i0
   #define N %i1
   #define ldab %i2
   #define pA1 %i3
   #define pA2 %i5
   #define pA0 %i4
   #define pB0 %l0
   #define pC0 %l1
   #define incAn %l2
   #define II %l3
   #define KK %l4
   #define incCn %l5
   #define pA3 %l6
   #define pB1 %l7
   #define pB2 %o1
   #define pB3 %o2
   #define pC1 %o3
   #define pC2 %o4
   #define pC3 %o5
   #define pfA %o7
   #define Kstart %g1
   #define incAm %g2
   #define incBm %g3
   #define incBn %g4
   #define pfB %g5
#else
   #define M %i0
   #define N %i1
   #define ldab %i2
   #define pA1 %i3
   #define pA2 %i4
   #define pA0 %i5
   #define pB0 %l0
   #define pC0 %l1
   #define incAn %l2
   #define II %l3
   #define KK %l4
   #define incCn %l5
   #define pA3 %l6
   #define pB1 %l7
   #define pB2 %o1
   #define pB3 %o2
   #define pC1 %o3
   #define pC2 %o4
   #define pC3 %o5
   #define pfA %o7
   #define Kstart %g1
   #define incAm %g2
   #define incBm %g3
   #define incBn %g4
   #define pfB %i2 /* aliased with ldab */
#endif
!
! Element-size dependent constants for C.  CMUL scales a byte offset into
! a column of C, incCm is a byte stride and CSH is the shift count used to
! turn element counts into byte counts.  The DCPLX values are twice the
! default ones.  Both CMUL bodies are parenthesised so that they expand
! safely inside any address expression.
!
#ifdef DCPLX
   #define CMUL(arg_) ((arg_)*2)
   #define incCm 64
   #define CSH 4
#else
   #define CMUL(arg_) (arg_)
   #define incCm 32
   #define CSH 3
#endif
!
! Stack frame layout.
! 32 bits: the prologue stores %g2, %g3 and %g4 at [%sp+80], [%sp+84] and
!          [%sp+88], so the frame has to cover 92 bytes: FSIZE = 96.
! 64 bits: no global register is stored by the prologue; for BETAX builds
!          beta is spilled into the top 8 bytes of the frame, at
!          BOFF = FSIZE-8 = 136: FSIZE = 128 + 16 = 144.
! BIAS is added to every %sp and %fp relative address of the 64-bit code
! paths and is 0 for 32 bits.
!
#ifdef ATL_USE64BITS
   #define FSIZE 144
   #define BIAS 2047
   #define BOFF (FSIZE-8)
#else
   #define FSIZE 96
   #define BIAS 0
#endif
! 32 bits:
!                i0,          i1           i2           i3,i4
!void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
!                i5             [%fp+92]       [%fp+96]       [%fp+100]
!                const TYPE *A, const int lda, const TYPE *B, const int ldb,
!
[%fp+104] [%fp+112] [%fp+116]! const TYPE beta, TYPE *C, const int ldc)! 64 bits:! i0, i1 i2 f0!void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,! i4 i5 [%fp+176] [%fp+184]! const TYPE *A, const int lda, const TYPE *B, const int ldb,! f16, [%fp+192] [%fp+200] [%fp+208]! const TYPE beta, TYPE *C, const int ldc)#ifdef ATL_USE64BITS .register %g2, #scratch .register %g3, #scratch#endif .section ".text" .align 8 .global ATL_USERMMATL_USERMM: save %sp, -FSIZE, %sp!! Save non-scratch registers, save BETA to local frame if needed!#ifdef ATL_USE64BITS #ifdef BETAX std %f16, [%sp+BIAS+BOFF] #endif#else st %g2, [%sp+80] st %g3, [%sp+84] st %g4, [%sp+88]#endif!! Load args and start operations!#ifdef ATL_USE64BITS ldx [%fp+BIAS+176], pB0 ldx [%fp+BIAS+200], pC0 ldsw [%fp+BIAS+212], incCn#else ld [%fp+96], pB0 ld [%fp+112], pC0 ld [%fp+116], incCn#endif srl ldab, 3, Kstart sub Kstart, 1, Kstart sll ldab, 3, ldab sll ldab, 2, incBn ! incBn = ldab * 4 sll incCn, CSH, incCn ! incCn = ldc*size sll ldab, 1, incAm add incAm, ldab, incAm add incAm, 64, incAm ! incAm = (ldab*3+8)*size mov 64, incBm sub incBm, ldab, incBm ! incBm = (8-ldab)*size add pA0, ldab, pA1 add pA1, ldab, pA2 add pA2, ldab, pA3 add pB0, ldab, pB1 add pB1, ldab, pB2 add pB2, ldab, pB3 add pC0, incCn, pC1 add pC1, incCn, pC2 add pC2, incCn, pC3 smul ldab, M, incAn add pA0, incAn, pfA smul ldab, N, pfB add pfB, pB0, pfB sub %g0, incAn, incAn sll incCn, 2, incCn sll M, CSH, M sub incCn, M, incCn srl M, CSH, MNLOOP: mov M, IIMLOOP:!! Load C[0..3][0..3] and apply beta if necessary!#ifdef BETA0!! NOTE: can zero double fp reg using VIS instruction: fzero %fX! 
prefW2([pC0+PFC]) fzero rC00 fzero rC10 fzero rC20 fzero rC30 prefW2([pC1+PFC]) fzero rC01 fzero rC11 fzero rC21 fzero rC31 prefW2([pC2+PFC]) fzero rC02 fzero rC12 fzero rC22 fzero rC32 prefW2([pC3+PFC]) fzero rC03 fzero rC13 fzero rC23 fzero rC33#else #ifdef BETAX #ifdef ATL_USE64BITS ldd [%sp+BIAS+BOFF], ra3 #else ldd [%fp+104], ra3 #endif #endif prefW2([pC0+PFC]) ldd [pC0], rC00 ldd [pC0+CMUL(8)], rC10 ldd [pC0+CMUL(16)], rC20 ldd [pC0+CMUL(24)], rC30 prefW2([pC1+PFC]) ldd [pC1], rC01 ldd [pC1+CMUL(8)], rC11 ldd [pC1+CMUL(16)], rC21 ldd [pC1+CMUL(24)], rC31 prefW2([pC2+PFC]) ldd [pC2], rC02 ldd [pC2+CMUL(8)], rC12 ldd [pC2+CMUL(16)], rC22 ldd [pC2+CMUL(24)], rC32 prefW2([pC3+PFC]) ldd [pC3], rC03 ldd [pC3+CMUL(8)], rC13 ldd [pC3+CMUL(16)], rC23 ldd [pC3+CMUL(24)], rC33 #ifdef BETAX fmuld rC00, ra3, rC00 fmuld rC10, ra3, rC10 fmuld rC20, ra3, rC20 fmuld rC30, ra3, rC30 fmuld rC01, ra3, rC01 fmuld rC11, ra3, rC11 fmuld rC21, ra3, rC21 fmuld rC31, ra3, rC31 fmuld rC02, ra3, rC02 fmuld rC12, ra3, rC12 fmuld rC22, ra3, rC22 fmuld rC32, ra3, rC32 fmuld rC03, ra3, rC03 fmuld rC13, ra3, rC13 fmuld rC23, ra3, rC23 fmuld rC33, ra3, rC33 #endif#endif!! Load A & B registers and fill multiply pipeline! ldd [pB0], rB0 ldd [pA0], rA0 ldd [pA1], rA1 ldd [pA2], rA2 ldd [pA3], rA3 ldd [pB1], rB1 ldd [pB2], rB2 ldd [pB3], rB3 fmuld rA0, rB0, m0 ldd [pA0+8], ra0 fmuld rA1, rB0, m1 ldd [pA1+8], ra1 fmuld rA2, rB0, m2 ldd [pA2+8], ra2 fmuld rA3, rB0, m3 ldd [pB0+8], rB0 ldd [pA3+8], ra3!! For K == 1, we never enter the loop at all!#if (KB != 8) #if KB == 0 subcc Kstart, %g0, %g0 bz KDRAIN nop #endif mov Kstart, KK .align 4KLOOP:!! K=0 iteration! 
faddd rC00, m0, rC00 prefR1([pA0+PFD]) fmuld rA0, rB1, m0 faddd rC10, m1, rC10 prefR1([pA1+PFD]) fmuld rA1, rB1, m1 faddd rC20, m2, rC20 prefR1([pA2+PFD]) fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+8], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+8], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+16], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+16], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+16], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+8], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+16], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+16], rB0!! K=1 iteration! faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+16], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+16], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+24], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+24], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+24], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+16], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+24], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+24], rB0!! K=2 iteration! 
faddd rC00, m0, rC00 fmuld rA0, rB1, m0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+24], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 prefR1([pA3+PFD]) faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+24], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+32], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+32], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+32], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+24], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+32], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+32], rB0!! K=3 iteration! faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+32], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+32], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+40], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+40], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+40], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+32], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+40], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+40], rB0!! K=4 iteration! 
faddd rC00, m0, rC00 fmuld rA0, rB1, m0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+40], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+40], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+48], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+48], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+48], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+40], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+48], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+48], rB0!! K=5 iteration! faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?