atl_dmm4x4x2_us.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 531 行 · 第 1/2 页

C
531
字号
!              Automatically Tuned Linear Algebra Software v3.8.0!                     (C) Copyright 2002 R. Clint Whaley!!  Redistribution and use in source and binary forms, with or without!  modification, are permitted provided that the following conditions!  are met:!    1. Redistributions of source code must retain the above copyright!       notice, this list of conditions and the following disclaimer.!    2. Redistributions in binary form must reproduce the above copyright!       notice, this list of conditions, and the following disclaimer in the!       documentation and/or other materials provided with the distribution.!    3. The name of the ATLAS group or the names of its contributers may!       not be used to endorse or promote products derived from this!       software without specific written permission.!!  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS!  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED!  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR!  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS!  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR!  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF!  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS!  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN!  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)!  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE!  POSSIBILITY OF SUCH DAMAGE.!#if !defined(KB)   #define KB 0#endif#ifndef ATL_GAS_SPARC   #error "This kernel requires sparc assembler!"#endif!! Floating point registers!#define rA0     %f0#define rA1     %f2#define rA2     %f4#define rA3     %f6#define ra0     %f8#define ra1     %f10#define ra2     %f12#define ra3     %f14#define rB0     %f16#define rB1     %f18#define rB2     %f20#define rB3     %f22#define m0      %f24#define m1      %f26#define m2      %f28#define m3      %f30#define rC00    %f32#define rC10    %f34#define rC20    %f36#define rC30    %f38#define rC01    %f40#define rC11    %f42#define rC21    %f44#define rC31    %f46#define rC02    %f48#define rC12    %f50#define rC22    %f52#define rC32    %f54#define rC03    %f56#define rC13    %f58#define rC23    %f60#define rC33    %f62!! Integer registers!#define M       %i0#define N       %i1#define ldab    %i2#define pA1     %i3#define pA2     %i4#define pA0     %i5#define pB0     %l0#define pC0     %l1#define incAn   %l2#define II      %l3#define KK      %l4#define incCn   %l5#define pA3     %l6#define pB1     %l7#define pB2     %o1#define pB3     %o2#define pC1     %o3#define pC2     %o4#define pC3     %o5#define pfA     %o7#define Kstart  %g1#define incAm   %g2#define incBm   %g3#define incBn   %g4#ifdef DCPLX   #define incCm 64   #define CSH 4   #define CMUL(arg_) ((arg_)*2)#else   #define incCm 32   #define CSH 3   #define CMUL(arg_) arg_#endif#if 1   #define prefR1(mem) prefetch mem, 0   #define prefR2(mem) prefetch mem, 0   #define prefW2(mem) prefetch mem, 2#else   #define prefW2(mem)   #define prefR2(mem)   #define prefR1(mem)#endif!! Saving registers: i[1,3,4,5] g[2,3,4] --> FSIZE = 7*4 + 64 = 92 ~ 96#define FSIZE   96!!                         i0,          i1           i2             i3,i4!void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,!                           i5        [%fp+92]      [%fp+96]      [%fp+100]!                const TYPE *A, const int lda, const TYPE *B, const int ldb,!                      [%fp+104] [%fp+112]     [%fp+116]!                const TYPE beta, TYPE *C, const int ldc)        .section        ".text"        .align  8        .global ATL_USERMMATL_USERMM:        save %sp, -FSIZE, %sp!!       Save non-scratch registers!        st      %i1, [%sp+64]        st      %i3, [%sp+68]        st      %i4, [%sp+72]        st      %i5, [%sp+76]        st      %g2, [%sp+80]        st      %g3, [%sp+84]        st      %g4, [%sp+88]!!!        ld      [%fp+96], pB0        ld      [%fp+112], pC0        ld      [%fp+116], incCn        srl     ldab, 1, Kstart        sub     Kstart, 1, Kstart        sll     ldab, 3, ldab	sll	ldab, 2, incBn      ! incBn = ldab * 4        sll     incCn, CSH, incCn   ! incCn = ldc*size        sll     ldab, 1, incAm        add     incAm, ldab, incAm        add     incAm, 16, incAm    !   incAm = (ldab*3+2)*size        mov     16, incBm        sub     incBm, ldab, incBm  !   incBm = (2-ldab)*size        add     pA0, ldab, pA1        add     pA1, ldab, pA2        add     pA2, ldab, pA3        add     pB0, ldab, pB1        add     pB1, ldab, pB2        add     pB2, ldab, pB3        add     pC0, incCn, pC1        add     pC1, incCn, pC2        add     pC2, incCn, pC3        smul    ldab, M, incAn        add     pA0, incAn, pfA        sub     %g0, incAn, incAn	sll	incCn, 2, incCn	sll	M, CSH, M	sub	incCn, M, incCn	srl	M, CSH, MNLOOP:        mov     M, IIMLOOP:!!       Load C[0..3][0..3] and apply beta if necessary!#ifdef BETA0!! NOTE: can zero double fp reg using VIS instruction: fzero %fX!                                                        prefW2([pC0+64])        fzero   rC00        fzero   rC10        fzero   rC20        fzero   rC30                                                        prefW2([pC1+64])        fzero   rC01        fzero   rC11        fzero   rC21        fzero   rC31                                                        prefW2([pC2+64])        fzero   rC02        fzero   rC12        fzero   rC22        fzero   rC32                                                        prefW2([pC3+64])        fzero   rC03        fzero   rC13        fzero   rC23        fzero   rC33#else   #ifdef BETAX        ldd     [%fp+104], ra3   #endif                                                        prefW2([pC0+64])        ldd     [pC0], rC00        ldd     [pC0+CMUL(8)], rC10        ldd     [pC0+CMUL(16)], rC20        ldd     [pC0+CMUL(24)], rC30                                                        prefW2([pC1+64])        ldd     [pC1], rC01        ldd     [pC1+CMUL(8)], rC11        ldd     [pC1+CMUL(16)], rC21        ldd     [pC1+CMUL(24)], rC31                                                        prefW2([pC2+64])        ldd     [pC2], rC02        ldd     [pC2+CMUL(8)], rC12        ldd     [pC2+CMUL(16)], rC22        ldd     [pC2+CMUL(24)], rC32                                                        prefW2([pC3+64])        ldd     [pC3], rC03        ldd     [pC3+CMUL(8)], rC13        ldd     [pC3+CMUL(16)], rC23        ldd     [pC3+CMUL(24)], rC33   #ifdef BETAX        fmuld   rC00, ra3, rC00        fmuld   rC10, ra3, rC10        fmuld   rC20, ra3, rC20        fmuld   rC30, ra3, rC30        fmuld   rC01, ra3, rC01        fmuld   rC11, ra3, rC11        fmuld   rC21, ra3, rC21        fmuld   rC31, ra3, rC31        fmuld   rC02, ra3, rC02        fmuld   rC12, ra3, rC12        fmuld   rC22, ra3, rC22        fmuld   rC32, ra3, rC32        fmuld   rC03, ra3, rC03        fmuld   rC13, ra3, rC13        fmuld   rC23, ra3, rC23        fmuld   rC33, ra3, rC33   #endif#endif!!       Load A & B registers and fill multiply pipeline!        ldd     [pB0], rB0        ldd     [pA0], rA0        ldd     [pA1], rA1        ldd     [pA2], rA2        ldd     [pA3], rA3        ldd     [pB1], rB1        ldd     [pB2], rB2        ldd     [pB3], rB3        fmuld   rA0, rB0, m0        ldd     [pA0+8], ra0        fmuld   rA1, rB0, m1        ldd     [pA1+8], ra1        fmuld   rA2, rB0, m2        ldd     [pA2+8], ra2        fmuld   rA3, rB0, m3        ldd     [pB0+8], rB0        ldd     [pA3+8], ra3!

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?