atl_dmm4x4x2pf_av.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 811 行 · 第 1/2 页
C
811 行
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2001 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * This file is PowerPC assembly that is first run through the C preprocessor;
 * everything in this section is preprocessor configuration for the
 * ATL_USERMM routine whose code begins after the disabled notes at the end
 * of this section.
 */

/*
 * Two-level token pasting: Mjoin forwards to my_join so that its arguments
 * are macro-expanded before ## glues them together.  The entry-point code
 * uses it to build the '.'- and '_'-prefixed forms of the ATL_USERMM symbol.
 */
#define Mjoin(pre, nam) my_join(pre, nam)
#define my_join(pre, nam) pre ## nam

/*
 * Refuse to build unless one of the two supported PPC assembler dialects
 * has been selected.
 */
#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC)
   #error "This kernel requires PPC assembler"
#endif

/*
 * Blocking-factor sanity checks; each one is applied only when the
 * corresponding macro is defined.  MB and NB must be multiples of 4,
 * KB a multiple of 2.
 */
#ifdef MB
   #if (MB/4)*4 != MB
      #error "MB must be multiple of 4!"
   #endif
#endif
#ifdef NB
   #if (NB/4)*4 != NB
      #error "NB must be multiple of 4!"
   #endif
#endif
#ifdef KB
   #if (KB/2)*2 != KB
      #error "KB must be multiple of 2!"
   #endif
#endif

/*
 * For the Linux gas dialect, map the symbolic names r0-r31 and f0-f31 to
 * bare register numbers, so the rest of the file can use the rN / fN
 * spelling under both dialects.
 * NOTE(review): presumably needed because this assembler is fed plain
 * numeric register operands -- confirm against the build flags.
 */
#ifdef ATL_GAS_LINUX_PPC
   #define r0 0
   #define r1 1
   #define r2 2
   #define r3 3
   #define r4 4
   #define r5 5
   #define r6 6
   #define r7 7
   #define r8 8
   #define r9 9
   #define r10 10
   #define r11 11
   #define r12 12
   #define r13 13
   #define r14 14
   #define r15 15
   #define r16 16
   #define r17 17
   #define r18 18
   #define r19 19
   #define r20 20
   #define r21 21
   #define r22 22
   #define r23 23
   #define r24 24
   #define r25 25
   #define r26 26
   #define r27 27
   #define r28 28
   #define r29 29
   #define r30 30
   #define r31 31
   #define f0 0
   #define f1 1
   #define f2 2
   #define f3 3
   #define f4 4
   #define f5 5
   #define f6 6
   #define f7 7
   #define f8 8
   #define f9 9
   #define f10 10
   #define f11 11
   #define f12 12
   #define f13 13
   #define f14 14
   #define f15 15
   #define f16 16
   #define f17 17
   #define f18 18
   #define f19 19
   #define f20 20
   #define f21 21
   #define f22 22
   #define f23 23
   #define f24 24
   #define f25 25
   #define f26 26
   #define f27 27
   #define f28 28
   #define f29 29
   #define f30 30
   #define f31 31
#endif

/*
 * Integer register usage shown by these defines
 *
 * Three alternative tables follow, one per target: 64-bit Linux, 32-bit
 * Linux, and the non-Linux dialect (ATL_AS_OSX_PPC is the only other one
 * accepted by the check at the top of the file).  Every table defines the
 * same set of role names; only the register assigned to each role differs.
 * NOTE(review): by their names, pC0-3/pA0-3/pB0-3 look like four pointers
 * each into C, A and B, inc* like pointer increments, st* like loop bounds
 * and ctlB/ctlC like prefetch control words -- confirm against the loop body.
 */
#ifdef ATL_GAS_LINUX_PPC
   #ifdef ATL_USE64BITS
      /* Linux, 64-bit */
      #define pC0     r6
      #define pC1     r15
      #define pC2     r22
      #define pC3     r11
      #define pA0     r7
      #define pA1     r16
      #define pA2     r17
      #define pA3     r18
      #define pB0     r9
      #define pB1     r19
      #define pB2     r20
      #define pB3     r21
      #define incAm   r8
      #define incAn   r23
      #define incBm   r24
      #define incBn   r10
      #define incCn   r5
      #define stK     r0
      #define stM     r14
      #define stN     r4
      #define M       r3
      #define ctlB    r12
      #define ctlC    r25
   #else
      /* Linux, 32-bit */
      #define pC0     r10
      #define pC1     r15
      #define pC2     r22
      #define pC3     r11
      #define pA0     r6
      #define pA1     r16
      #define pA2     r17
      #define pA3     r18
      #define pB0     r8
      #define pB1     r19
      #define pB2     r20
      #define pB3     r21
      #define incAm   r7
      #define incAn   r23
      #define incBm   r24
      #define incBn   r9
      #define incCn   r5
      #define stK     r0
      #define stM     r14
      #define stN     r4
      #define M       r3
      #define ctlB    r12
      #define ctlC    r25
   #endif
#else
   /* non-Linux dialect */
   #define pC0     r6
   #define pC1     r25
   #define pC2     r15
   #define pC3     r14
   #define pA0     r8
   #define pA1     r16
   #define pA2     r17
   #define pA3     r18
   #define pB0     r10
   #define pB1     r19
   #define pB2     r20
   #define pB3     r21
   #define incAm   r9
   #define incAn   r22
   #define incBm   r23
   #define incBn   r24
   #define incCn   r5
   #define stK     r0
   #define stM     r7
   #define stN     r4
   #define M       r3
   #define ctlB    r11
   #define ctlC    r12
#endif

/*
 * incCm is an immediate constant, not a register: 64 when DCPLX is defined,
 * 32 otherwise.
 * NOTE(review): presumably the byte step across four elements of C, doubled
 * for the complex case -- confirm where incCm is used.
 */
#ifdef DCPLX
   #define incCm 64
#else
   #define incCm 32
#endif

/*
 * fp register usage shown by these defines
 *
 * beta lives in f2; rC00..rC33 name sixteen registers taken from f0-f16
 * (skipping f2); ra0-ra3, rA0-rA3 and rB0-rB3 occupy f20-f31.
 * NOTE(review): the rCij names suggest a 4x4 block of C and the two a/A
 * sets suggest alternating operand copies -- confirm against the loop body.
 */
#define beta    f2
#define rC00    f0
#define rC10    f1
#define rC20    f3
#define rC30    f4
#define rC01    f5
#define rC11    f6
#define rC21    f7
#define rC31    f8
#define rC02    f9
#define rC12    f10
#define rC22    f11
#define rC32    f12
#define rC03    f13
#define rC13    f14
#define rC23    f15
#define rC33    f16
/* ZERO exists only in the BETA0 build; the prologue loads 0.0 into it. */
#ifdef BETA0
#define ZERO    f17
#endif
#define ra0     f20
#define ra1     f21
#define ra2     f22
#define ra3     f23
#define rA0     f24
#define rA1     f25
#define rA2     f26
#define rA3     f27
#define rB0     f28
#define rB1     f29
#define rB2     f30
#define rB3     f31

/*
 * Offsets from stack pointer for integer register save area, fp reg area,
 *
 * Only the non-Linux and the 64-bit Linux cases are defined here; for
 * 32-bit Linux, IROFF, FROFF and FSIZE are defined next to the entry point.
 * IROFF expands to the unparenthesized text FROFF+144, so it is only
 * suitable for the additive displacement expressions it is used in.
 */
#ifndef ATL_GAS_LINUX_PPC
   #define IROFF -220
   #define FROFF -144
#elif defined(ATL_USE64BITS)
   #define FROFF -288
   #define IROFF FROFF+144
#endif

/*
 * In the 64-bit build, the word-sized shift, multiply and compare-immediate
 * mnemonics used in the code are replaced by their doubleword forms.
 */
#ifdef ATL_USE64BITS
   #define slwi sldi
   #define srwi srdi
   #define mullw mulld
   #define cmpwi cmpdi
#endif

/*
 * The "#if 0" region below is never assembled.  It holds notes annotating
 * the ATL_USERMM prototype with a register or stack slot for each argument;
 * the region continues past this point and is closed by a later #endif.
 */
#if 0
 r3 r4 r5 r6-r7,f2void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
 (r6) r8 (r7) r9 (r8) r10 (r9) 56(r1)
 const TYPE *A, const int lda, const TYPE *B, const int ldb,
 f2 68(r1) 72(r1)
 const TYPE beta, TYPE *C, const int ldc)
 (r10) 8(r1)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
NOTE: 64 bit Linux ABI wastes para-passing iregs and stack space like OS X:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
r3 r4 r5 r6/f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, r7 r8 r9 r10 const TYPE *A, const int lda, const TYPE *B, const int ldb, f2 120(r1) 128(r1) const TYPE beta, TYPE *C, const int ldc)#endif.text#ifdef ATL_GAS_LINUX_PPC #if defined(ATL_USE64BITS)/* * No idea what this does, but seg fault without it (I think it is * partially resp for making code callable from both static & PIC code) */ .align 2 .globl ATL_USERMM .section ".opd","aw" .align 3ATL_USERMM: .quad Mjoin(.,ATL_USERMM),.TOC.@tocbase,0 .previous .size Mjoin(.,ATL_USERMM),24 .type Mjoin(.,ATL_USERMM),@function .globl Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM): #else.globl ATL_USERMMATL_USERMM: #define IROFF FROFF+144 #define FROFF 8 #define FSIZE 224 mflr r0 stw r0, 4(r1) stwu r1, -FSIZE(r1) #endif#else.globl Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM):/* * Save iregs */ mflr r0 stw r0, 8(r1) mfcr r0 stw r0, 4(r1)#endif#ifdef ATL_USE64BITS std r14, IROFF(r1) std r15, 8+IROFF(r1) std r16, 16+IROFF(r1) std r17, 32+IROFF(r1) std r18, 40+IROFF(r1) std r19, 48+IROFF(r1) std r20, 56+IROFF(r1) std r21, 64+IROFF(r1) std r22, 72+IROFF(r1) std r23, 80+IROFF(r1) std r24, 88+IROFF(r1) std r25, 96+IROFF(r1)#else stw r14, IROFF(r1) stw r15, 4+IROFF(r1) stw r16, 8+IROFF(r1) stw r17, 12+IROFF(r1) stw r18, 16+IROFF(r1) stw r19, 20+IROFF(r1) stw r20, 24+IROFF(r1) stw r21, 28+IROFF(r1) stw r22, 32+IROFF(r1) stw r23, 36+IROFF(r1) stw r24, 40+IROFF(r1) stw r25, 44+IROFF(r1)#endif mr stK, r5/* * Setup ctrl reg for prefetch of A & B */ slwi incAm, incAm, 3 srwi ctlB, stK, 1 slwi ctlB, ctlB, 8 add ctlB, ctlB, M slwi ctlB, ctlB, 16 or ctlB, ctlB, incAm dst pA0, ctlB, 1#ifndef ATL_GAS_LINUX_PPC lwz incBn, 56(r1)#endif slwi incBn, incBn, 3 srwi ctlB, stK, 1 slwi ctlB, ctlB, 8 addi ctlB, ctlB, 4 slwi ctlB, ctlB, 16 or ctlB, ctlB, incBn dst pB0, ctlB, 2/* * Save fregs */ stfd f14, FROFF(r1) stfd f15, FROFF+8(r1) stfd f16, FROFF+16(r1) stfd f17, FROFF+24(r1) stfd f18, FROFF+32(r1) stfd f19, FROFF+40(r1) stfd 
f20, FROFF+48(r1) stfd f21, FROFF+56(r1) stfd f22, FROFF+64(r1) stfd f23, FROFF+72(r1) stfd f24, FROFF+80(r1) stfd f25, FROFF+88(r1) stfd f26, FROFF+96(r1) stfd f27, FROFF+104(r1) stfd f28, FROFF+112(r1) stfd f29, FROFF+120(r1) stfd f30, FROFF+128(r1)/* * Store zero in freg for future use, and save last freg */#ifdef BETA0 xor pC1, pC1, pC1 #ifdef ATL_USE64BITS std pC1, FROFF+136(r1) #else stw pC1, FROFF+136(r1) stw pC1, FROFF+140(r1) #endif lfd ZERO, FROFF+136(r1)#endif stfd f31, FROFF+136(r1)/* * Setup C pointers and so on, setup C prefetch * incCn = (ldc*4 - MB)*sizeof */#ifdef ATL_GAS_LINUX_PPC #ifdef ATL_USE64BITS ld pC0, 120(r1)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?