atl_dmm4x4x16_hppa.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,251 行 · 第 1/3 页
C
1,251 行
/*
 * Build-time configuration checks.
 * The file is fed through the C preprocessor before being assembled, so
 * every directive below has to start on its own line.
 */
#if !defined(ATL_LINUX_PARISC) && !defined(ATL_HPUX_PARISC)
   #error "This kernel requires gas PA-RISC assembler!"
#endif
#if !defined(KB) || KB == 0
   #error "KB must be compile time constant"
#endif
#if (KB/16)*16 != KB
   #error "KB must be a multiple of 16!"
#endif
/*
 * MB and NB default to 0 when the build does not supply them; whatever
 * value they have must be a multiple of 4.
 */
#ifndef MB
   #define MB 0
#endif
#ifndef NB
   #define NB 0
#endif
#if (((NB/4)*4) != NB) || (((MB/4)*4) != MB)
   #error "MB & NB must be a multiple of 4!"
#endif
/*
 * Two-level token pasting, so that macro arguments are expanded before
 * they are joined.
 */
#ifndef Mjoin
   #define Mjoin(pre, nam) my_join(pre, nam)
   #define my_join(pre, nam) pre ## nam
#endif
/*
 * CMUL scales a displacement into C: doubled when DCPLX is defined,
 * passed through unchanged otherwise.
 */
#ifdef DCPLX
   #define CMUL(i_) (2*(i_))
#else
   #define CMUL(i_) i_
#endif
/*
 * Integer register usage
 *   incCn, incAn : byte increments for C and A
 *   rM, rN, rK   : %r26, %r25, %r24
 *   pA0-pA3      : pointers into A
 *   pfA, pfB     : %r6, %r7
 *   pB0-pB3      : pointers into B
 *   rMM          : copy of rM
 *   pC0-pC3      : pointers into C
 */
#define incCn %r20
#define incAn %r21
#define rM    %r26
#define rN    %r25
#define rK    %r24
#define pA0   %r1
#define pA1   %r3
#define pA2   %r4
#define pA3   %r5
#define pfA   %r6
#define pfB   %r7
#define pB0   %r28
#define pB1   %r31
#define pB2   %r22
#define pB3   %r23
#define rMM   %r10
#define pC0   %r29
#define pC1   %r11
#define pC2   %r12
#define pC3   %r13
/*
 * fp reg usage
 *   rCij         : accumulator for the i-th element addressed through pCj
 *   rA0-3, ra0-3 : two sets of values loaded through pA0-pA3
 *   rB0-3        : values loaded through pB0-pB3
 */
#define rC00 %fr31
#define rC10 %fr30
#define rC20 %fr29
#define rC30 %fr28
#define rC01 %fr27
#define rC11 %fr26
#define rC21 %fr25
#define rC31 %fr24
#define rC02 %fr23
#define rC12 %fr22
#define rC22 %fr21
#define rC32 %fr20
#define rC03 %fr19
#define rC13 %fr18
#define rC23 %fr17
#define rC33 %fr16
#define rA0  %fr15
#define rA1  %fr14
#define rA2  %fr13
#define rA3  %fr12
#define ra0  %fr11
#define ra1  %fr10
#define ra2  %fr9
#define ra3  %fr8
#define rB0  %fr7
#define rB1  %fr6
#define rB2  %fr5
#define rB3  %fr4

/* Stack frame size in bytes; passed to .CALLINFO as FRAME= below. */
#define FSIZE 128

	.LEVEL 2.0
#ifdef ATL_HPUX_PARISC
/* Space/subspace setup and symbol export used for the ATL_HPUX_PARISC build. */
	.SPACE $PRIVATE$
	.SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31
	.SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82
	.SPACE $TEXT$
	.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44
	.SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY
	.IMPORT $global$,DATA
	.IMPORT $$dyncall,MILLICODE
	.SPACE $TEXT$
	.SUBSPA $CODE$
	.align 4
	.NSUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY
	.EXPORT ATL_USERMM,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=FR
ATL_USERMM
#else
/* Any other build: plain .text section with a global label. */
	.text
	.align 4
	.globl ATL_USERMM
ATL_USERMM:
#endif
	.PROC
	.CALLINFO FRAME=FSIZE,NO_CALLS
.ENTRY/*void ATL_USERMM; 36 40 44 56 60(const int M, const int N, const int K, const double alpha, const double *A,; 64 68 72 80 84const int lda, const double *B, const int ldb, const double beta, double *C,; 88const int ldc)*/ ldw -60(%r30), pA0 ldw -68(%r30), pB0 ldw -84(%r30), pC0 ldw -88(%r30), incCn/* * Move frame pointer, and save registers */ ldo FSIZE(%r30), %r30 fstd %fr12, -8(%r30) fstd %fr13, -16(%r30) fstd %fr14, -24(%r30) fstd %fr15, -32(%r30) fstd %fr16, -40(%r30) fstd %fr17, -48(%r30) fstd %fr18, -56(%r30) fstd %fr19, -64(%r30) fstd %fr20, -72(%r30) fstd %fr21, -80(%r30) stw %r3, -84(%r30) stw %r4, -88(%r30) stw %r5, -92(%r30); stw %r6, -96(%r30); stw %r7, -100(%r30); stw %r8, -104(%r30); stw %r9, -108(%r30) stw %r10, -112(%r30) stw %r11, -116(%r30) stw %r12, -120(%r30) stw %r13, -124(%r30) copy rM, rMM;; incCn = ldc*sizeof;#ifdef DCPLX depw,z incCn,27,28,incCn#else depw,z incCn,28,29,incCn#endif;; incAn = M*K*sizeof; ldo -FSIZE-36(%r30), pA1 ldi 8*KB, incAn stw rM, -4(pA1) stw incAn, 0(pA1) fldw 0(pA1), %fr15L fldw -4(pA1), %fr15R xmpyu %fr15R, %fr15L, %fr14 fstw %fr14R, 0(pA1) ldw 0(pA1), incAn; extrs incAn, 30, 31, pfB; add pA0, incAn, pfA; add pB0, incAn, pfB; add pB0, pfB, pfB;; Init pC[1-3] & set incCn = 4*ldc - MB; add incCn, pC0, pC1 add incCn, pC1, pC2 add incCn, pC2, pC3 depw,z incCn,29,30, incCn ; incCn = 4*ldc*size#ifdef DCPLX depw,z rMM, 27,28,pA1 ; pA1 = MB*size#else depw,z rMM, 28,29,pA1 ; pA1 = MB*size#endif sub incCn, pA1, incCn ; incCn = 4*ldc*size - MB*size;; Init pA[1-5] & pB[1-3]; ldo KB*8(pA0), pA1 ldo 2*KB*8(pA0), pA2 ldo 3*KB*8(pA0), pA3 ldo KB*8(pB0), pB1 ldo 2*KB*8(pB0), pB2 ldo 3*KB*8(pB0), pB3NLOOP: copy rMM, rMMLOOP:#ifdef BETA0 fcpy,dbl %fr0, rC00 fcpy,dbl %fr0, rC10 fcpy,dbl %fr0, rC20 fcpy,dbl %fr0, rC30 fcpy,dbl %fr0, rC01 fcpy,dbl %fr0, rC11 fcpy,dbl %fr0, rC21 fcpy,dbl %fr0, rC31 fcpy,dbl %fr0, rC02 fcpy,dbl %fr0, rC12 fcpy,dbl %fr0, rC22 fcpy,dbl %fr0, rC32 fcpy,dbl %fr0, rC03 fcpy,dbl %fr0, rC13 fcpy,dbl %fr0, 
rC23 fcpy,dbl %fr0, rC33#else fldd 0(pC0), rC00 fldd CMUL(8)(pC0), rC10 fldd CMUL(16)(pC0), rC20 fldd CMUL(24)(pC0), rC30 fldd 0(pC1), rC01 fldd CMUL(8)(pC1), rC11 fldd CMUL(16)(pC1), rC21 fldd CMUL(24)(pC1), rC31 fldd 0(pC2), rC02 fldd CMUL(8)(pC2), rC12 fldd CMUL(16)(pC2), rC22 fldd CMUL(24)(pC2), rC32 fldd 0(pC3), rC03 fldd CMUL(8)(pC3), rC13 fldd CMUL(16)(pC3), rC23 fldd CMUL(24)(pC3), rC33 #ifdef BETAX fldd -FSIZE-80(%r30), ra0 fmpy,dbl ra0, rC00, rC00 fmpy,dbl ra0, rC10, rC10 fmpy,dbl ra0, rC20, rC20 fmpy,dbl ra0, rC30, rC30 fmpy,dbl ra0, rC01, rC01 fmpy,dbl ra0, rC11, rC11 fmpy,dbl ra0, rC21, rC21 fmpy,dbl ra0, rC31, rC31 fmpy,dbl ra0, rC02, rC02 fmpy,dbl ra0, rC12, rC12 fmpy,dbl ra0, rC22, rC22 fmpy,dbl ra0, rC32, rC32 fmpy,dbl ra0, rC03, rC03 fmpy,dbl ra0, rC13, rC13 fmpy,dbl ra0, rC23, rC23 fmpy,dbl ra0, rC33, rC33 #endif#endif fldd 0(pB0), rB0 fldd 0(pA0), rA0 fldd 0(pA1), rA1 fldd 0(pA2), rA2 fldd 0(pA3), rA3 fldd 0(pB1), rB1 fldd 0(pB2), rB2 fldd 0(pB3), rB3#if KB > 16 ldi KB-16, rKKLOOP: fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 8(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 8(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 8(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 8(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 8(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 8(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 8(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 8(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 16(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 16(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 16(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 
fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 16(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 16(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 16(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 16(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 16(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 24(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 24(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 24(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 24(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 24(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 24(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 24(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 24(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 32(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 32(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 32(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 32(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 32(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 32(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 32(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 32(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 40(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 40(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, 
rB1, rC11, rC11 fldd 40(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 40(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 40(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 40(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 40(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 40(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 48(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 48(pB0), rB0
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?