atl_dmm4x4x2pf_av.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 811 行 · 第 1/2 页

C
811
字号
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2001 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Preprocessor prelude for a PowerPC assembly matrix-multiply kernel.
 * Everything in this section is macro definitions and build-time checks;
 * no instructions are emitted until the .text section further down.
 */

/*
 * Two-level token paste: Mjoin forwards to my_join so that macro arguments
 * (e.g. ATL_USERMM) are expanded before ## glues them to the prefix.
 * Used below to build the assembler-specific entry symbol names.
 */
#define Mjoin(pre, nam) my_join(pre, nam)
#define my_join(pre, nam) pre ## nam
/*
 * Refuse to build unless one of the two supported PPC assembler dialects
 * has been selected by the build system.
 */
#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC)
   #error "This kernel requires PPC assembler"
#endif
/*
 * Blocking-factor sanity checks, applied only when the corresponding macro
 * is defined at build time: MB and NB must be multiples of 4, KB a multiple
 * of 2.
 * NOTE(review): presumably this matches the 4x4x2 unrolling advertised in
 * the file name -- confirm against the loop body (not visible in this part).
 */
#ifdef MB
   #if (MB/4)*4 != MB
      #error "MB must be multiple of 4!"
   #endif
#endif
#ifdef NB
   #if (NB/4)*4 != NB
      #error "NB must be multiple of 4!"
   #endif
#endif
#ifdef KB
   #if (KB/2)*2 != KB
      #error "KB must be multiple of 2!"
   #endif
#endif
/*
 * For the Linux GAS dialect, map the symbolic names r0-r31 and f0-f31 onto
 * bare register numbers.
 * NOTE(review): presumably needed because that assembler takes numeric
 * register operands rather than rN/fN names -- confirm with the toolchain.
 */
#ifdef ATL_GAS_LINUX_PPC
   #define r0 0
   #define r1 1
   #define r2 2
   #define r3 3
   #define r4 4
   #define r5 5
   #define r6 6
   #define r7 7
   #define r8 8
   #define r9 9
   #define r10 10
   #define r11 11
   #define r12 12
   #define r13 13
   #define r14 14
   #define r15 15
   #define r16 16
   #define r17 17
   #define r18 18
   #define r19 19
   #define r20 20
   #define r21 21
   #define r22 22
   #define r23 23
   #define r24 24
   #define r25 25
   #define r26 26
   #define r27 27
   #define r28 28
   #define r29 29
   #define r30 30
   #define r31 31
   #define f0 0
   #define f1 1
   #define f2 2
   #define f3 3
   #define f4 4
   #define f5 5
   #define f6 6
   #define f7 7
   #define f8 8
   #define f9 9
   #define f10 10
   #define f11 11
   #define f12 12
   #define f13 13
   #define f14 14
   #define f15 15
   #define f16 16
   #define f17 17
   #define f18 18
   #define f19 19
   #define f20 20
   #define f21 21
   #define f22 22
   #define f23 23
   #define f24 24
   #define f25 25
   #define f26 26
   #define f27 27
   #define f28 28
   #define f29 29
   #define f30 30
   #define f31 31
#endif
/*
 * Integer register usage shown by these defines
 *
 * Three layouts are provided: 64-bit Linux, 32-bit Linux, and (the final
 * #else) the non-Linux-GAS dialect.  In every layout M, stN (N) and incCn
 * sit in r3, r4 and r5, and pA0 / incAm / pB0 are bound to the registers in
 * which A / lda / B arrive according to the calling-convention notes in the
 * "#if 0" block that follows this prelude.  incBn is bound to ldb's incoming
 * register on the two Linux layouts; the routine prologue loads it from the
 * stack when ATL_GAS_LINUX_PPC is not defined.
 * NOTE(review): the names suggest pA*/pB*/pC* are row/column pointers,
 * inc* are byte strides, st* are loop bounds and ctl* are prefetch control
 * words -- confirm against the loop body, which is outside this part.
 */
#ifdef ATL_GAS_LINUX_PPC
   #ifdef ATL_USE64BITS
      #define pC0     r6
      #define pC1     r15
      #define pC2     r22
      #define pC3     r11
      #define pA0     r7
      #define pA1     r16
      #define pA2     r17
      #define pA3     r18
      #define pB0     r9
      #define pB1     r19
      #define pB2     r20
      #define pB3     r21
      #define incAm   r8
      #define incAn   r23
      #define incBm   r24
      #define incBn   r10
      #define incCn   r5
      #define stK     r0
      #define stM     r14
      #define stN     r4
      #define M       r3
      #define ctlB    r12
      #define ctlC    r25
   #else
      #define pC0     r10
      #define pC1     r15
      #define pC2     r22
      #define pC3     r11
      #define pA0     r6
      #define pA1     r16
      #define pA2     r17
      #define pA3     r18
      #define pB0     r8
      #define pB1     r19
      #define pB2     r20
      #define pB3     r21
      #define incAm   r7
      #define incAn   r23
      #define incBm   r24
      #define incBn   r9
      #define incCn   r5
      #define stK     r0
      #define stM     r14
      #define stN     r4
      #define M       r3
      #define ctlB    r12
      #define ctlC    r25
   #endif
#else
   #define pC0     r6
   #define pC1     r25
   #define pC2     r15
   #define pC3     r14
   #define pA0     r8
   #define pA1     r16
   #define pA2     r17
   #define pA3     r18
   #define pB0     r10
   #define pB1     r19
   #define pB2     r20
   #define pB3     r21
   #define incAm   r9
   #define incAn   r22
   #define incBm   r23
   #define incBn   r24
   #define incCn   r5
   #define stK     r0
   #define stM     r7
   #define stN     r4
   #define M       r3
   #define ctlB    r11
   #define ctlC    r12
#endif
/*
 * incCm is a compile-time constant: 64 when DCPLX is defined, 32 otherwise.
 * NOTE(review): presumably the byte step across one 4-row block of C
 * (4 doubles = 32 bytes, doubled for interleaved complex) -- confirm.
 */
#ifdef DCPLX
   #define incCm 64
#else
   #define incCm 32
#endif
/*
 * fp register usage shown by these defines
 *
 * beta is f2, the register in which the calling-convention notes say beta
 * is passed.  rC00..rC33 name sixteen registers (f0, f1, f3-f16), ra0-ra3 /
 * rA0-rA3 name f20-f27 and rB0-rB3 name f28-f31.  ZERO (f17) exists only in
 * BETA0 builds.
 * NOTE(review): rCij presumably hold the 4x4 block of C being accumulated,
 * with rA/ra and rB holding operands from A and B -- confirm in loop body.
 */
#define	beta	f2
#define rC00	f0
#define rC10	f1
#define rC20	f3
#define rC30	f4
#define rC01	f5
#define rC11	f6
#define rC21	f7
#define rC31	f8
#define rC02	f9
#define rC12	f10
#define rC22	f11
#define rC32	f12
#define rC03	f13
#define rC13	f14
#define rC23	f15
#define rC33	f16
#ifdef BETA0
#define ZERO	f17
#endif
#define ra0	f20
#define ra1	f21
#define ra2	f22
#define ra3	f23
#define rA0	f24
#define rA1	f25
#define rA2	f26
#define rA3	f27
#define rB0	f28
#define rB1	f29
#define rB2	f30
#define rB3	f31
/*
 * Offsets from stack pointer for integer register save area, fp reg area,
 *
 * Only two of the three configurations are covered here: the non-Linux-GAS
 * dialect and 64-bit Linux.  32-bit Linux defines IROFF/FROFF (and FSIZE)
 * later, next to its entry label.  IROFF and FROFF expand unparenthesized
 * and are used in operands of the form "8+IROFF(r1)".
 */
#ifndef ATL_GAS_LINUX_PPC
   #define IROFF -220
   #define FROFF -144
#elif defined(ATL_USE64BITS)
   #define FROFF -288
   #define IROFF FROFF+144
#endif
/*
 * In 64-bit builds, redirect the word-sized shift, multiply and compare
 * mnemonics used in the code to their doubleword counterparts.
 */
#ifdef ATL_USE64BITS
   #define slwi         sldi
   #define srwi         srdi
   #define mullw        mulld
   #define cmpwi        cmpdi
#endif
/*
 * The "#if 0" region opened here is never compiled; it is an annotated
 * prototype showing where each argument lives on entry.
 */
#if 0
                         r3           r4           r5          r6-r7,f2
void ATL_USERMM(const int M, const 
int N, const int K, const TYPE alpha,                (r6)       r8  (r7)       r9  (r8)      r10  (r9)   56(r1)                const TYPE *A, const int lda, const TYPE *B, const int ldb,                             f2   68(r1)          72(r1)                const TYPE beta, TYPE *C, const int ldc)		                  (r10)    8(r1)!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!NOTE: 64 bit Linux ABI wastes para-passing iregs and stack space like OS X:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!                         r3           r4           r5             r6/f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,                           r7             r8             r9            r10                const TYPE *A, const int lda, const TYPE *B, const int ldb,                             f2   120(r1)        128(r1)                const TYPE beta, TYPE *C, const int ldc)#endif.text#ifdef ATL_GAS_LINUX_PPC   #if defined(ATL_USE64BITS)/* *       No idea what this does, but seg fault without it (I think it is *       partially resp for making code callable from both static & PIC code) */        .align 2        .globl  ATL_USERMM        .section        ".opd","aw"        .align  3ATL_USERMM:        .quad   Mjoin(.,ATL_USERMM),.TOC.@tocbase,0        .previous        .size   Mjoin(.,ATL_USERMM),24        .type   Mjoin(.,ATL_USERMM),@function        .globl  Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM):   #else.globl	ATL_USERMMATL_USERMM:      #define IROFF FROFF+144      #define FROFF 8      #define FSIZE 224	mflr	r0	stw	r0, 4(r1)	stwu	r1, -FSIZE(r1)   #endif#else.globl	Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM):/* *	Save iregs */	mflr	r0	stw	r0, 8(r1)	mfcr	r0	stw	r0, 4(r1)#endif#ifdef ATL_USE64BITS	std	r14, IROFF(r1)	std	r15, 8+IROFF(r1)	std	r16, 16+IROFF(r1)	std	r17, 32+IROFF(r1)	std	r18, 40+IROFF(r1)	std	r19, 48+IROFF(r1)	std	r20, 56+IROFF(r1)	std	r21, 64+IROFF(r1)	std	r22, 72+IROFF(r1)	std	
r23, 80+IROFF(r1)	std	r24, 88+IROFF(r1)	std	r25, 96+IROFF(r1)#else	stw	r14, IROFF(r1)	stw	r15, 4+IROFF(r1)	stw	r16, 8+IROFF(r1)	stw	r17, 12+IROFF(r1)	stw	r18, 16+IROFF(r1)	stw	r19, 20+IROFF(r1)	stw	r20, 24+IROFF(r1)	stw	r21, 28+IROFF(r1)	stw	r22, 32+IROFF(r1)	stw	r23, 36+IROFF(r1)	stw	r24, 40+IROFF(r1)	stw	r25, 44+IROFF(r1)#endif	mr	stK, r5/* *      Setup ctrl reg for prefetch of A & B */ 	slwi	incAm, incAm, 3	srwi	ctlB, stK, 1	slwi	ctlB, ctlB, 8	add	ctlB, ctlB, M	slwi	ctlB, ctlB, 16	or 	ctlB, ctlB, incAm	dst	pA0, ctlB, 1#ifndef ATL_GAS_LINUX_PPC        lwz     incBn, 56(r1)#endif	slwi	incBn, incBn, 3	srwi	ctlB, stK, 1	slwi	ctlB, ctlB, 8	addi	ctlB, ctlB, 4	slwi	ctlB, ctlB, 16	or	ctlB, ctlB, incBn	dst	pB0, ctlB, 2/* *	Save fregs */	stfd	f14, FROFF(r1)	stfd	f15, FROFF+8(r1)	stfd	f16, FROFF+16(r1)	stfd	f17, FROFF+24(r1)	stfd	f18, FROFF+32(r1)	stfd	f19, FROFF+40(r1)	stfd	f20, FROFF+48(r1)	stfd	f21, FROFF+56(r1)	stfd	f22, FROFF+64(r1)	stfd	f23, FROFF+72(r1)	stfd	f24, FROFF+80(r1)	stfd	f25, FROFF+88(r1)	stfd	f26, FROFF+96(r1)	stfd	f27, FROFF+104(r1)	stfd	f28, FROFF+112(r1)	stfd	f29, FROFF+120(r1)	stfd	f30, FROFF+128(r1)/* *      Store zero in freg for future use, and save last freg */#ifdef BETA0	xor	pC1, pC1, pC1   #ifdef ATL_USE64BITS	std	pC1, FROFF+136(r1)   #else	stw	pC1, FROFF+136(r1)	stw	pC1, FROFF+140(r1)   #endif	lfd	ZERO, FROFF+136(r1)#endif	stfd	f31, FROFF+136(r1)/* *      Setup C pointers and so on, setup C prefetch *      incCn = (ldc*4 - MB)*sizeof */#ifdef ATL_GAS_LINUX_PPC   #ifdef ATL_USE64BITS	ld 	pC0, 120(r1)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?