📄 atl_cmmjitcp.c
字号:
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2007 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_misc.h"#include "atlas_level3.h"#ifdef SCPLX #include "smm.h"#else #include "dmm.h"#endif#ifndef ATL_MaxMalloc /* temp, defined in atlas_lvl3.h */ #define ATL_MaxMalloc 16777216#endiftypedef void (*MAT2BLK3)(const int, const int, const SCALAR, const TYPE*, const int, TYPE*, const int, TYPE*, const int);void Mjoin(PATLU,pNBmm_bX)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pMBmm_bX)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pKBmm_bX)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void NBmm_bX(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pNBmm_b1)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pMBmm_b1)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pKBmm_b1)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void NBmm_b1(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pNBmm_b0)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pMBmm_b0)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pKBmm_b0)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void NBmm_b0(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATLU,pKBmm)(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc);void Mjoin(PATL,row2blkT_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);void Mjoin(PATL,col2blk_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);void Mjoin(PATL,gereal2cplx) (const int M, const int N, const TYPE *alpha, const TYPE *R, const int ldr, const TYPE *I, const int ldi, const TYPE *beta, TYPE *C, const int ldc);static void ATL_gecplx2real_a1 (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda, TYPE *pR, const int ldr, TYPE *pI, const int ldi)/* * Splits real & imag components of A into separate real arrays R/I. */{ const int incA = (lda-M)<<1; int i, j;/* * Copy backwards so 1st part of matrix is LRU */ A += ((N-1)*lda+M-1)<<1; pR += (N-1)*ldr; pI += (N-1)*ldi; for (j=N; j; j--, A -= incA, pR -= ldr, pI -= ldi) { for (i=M-1; i >= 0; i--, A -= 2) { pR[i] = *A; pI[i] = A[1]; } }}static void ATL_gecplx2realT_a1 (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda, TYPE *pR, const int ldr, TYPE *pI, const int ldi)/* * Splits real & imag components of A' into separate real arrays R/I. * Output matrix is MxN, so A must be NxM */{ const int lda2 = (lda-N)<<1, incR = 1-N*ldr, incI = 1-N*ldi; int i, j;/* * Loop over M cols of A */ for (i=M; i; i--, A += lda2, pR += incR, pI += incI) { for (j=N; j; j--, A += 2, pR += ldr, pI += ldi) { *pR = *A; *pI = A[1]; } }}static void ATL_gecplx2realConj_a1 (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda, TYPE *pR, const int ldr, TYPE *pI, const int ldi)/* * Splits real & imag components of A into separate real arrays R/I. */{ const int incA = (lda-M)<<1; int i, j;/* * Copy backwards so 1st part of matrix is LRU */ A += ((N-1)*lda+M-1)<<1; pR += (N-1)*ldr; pI += (N-1)*ldi; for (j=N; j; j--, A -= incA, pR -= ldr, pI -= ldi) { for (i=M-1; i >= 0; i--, A -= 2) { pR[i] = *A; pI[i] = -A[1]; } }}static void ATL_gecplx2realC_a1 (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda, TYPE *pR, const int ldr, TYPE *pI, const int ldi)/* * Splits real & imag components of A' into separate real arrays R/I. * Output matrix is MxN, so A must be NxM */{ const int lda2 = (lda-N)<<1, incR = 1-N*ldr, incI = 1-N*ldi; int i, j;/* * Loop over M cols of A */ for (i=M; i; i--, A += lda2, pR += incR, pI += incI) { for (j=N; j; j--, A += 2, pR += ldr, pI += ldi) { *pR = *A; *pI = -A[1]; } }}static void Mjoin(PATL,mmK) (int M, /* true # of rows in row-panel, M <= MB */ int N, /* true # of cols in col-panel, N < = NB */ int nblk, /* # of blocks in K dimension */ int KR, /* KR = K - nKb*KB; */ const TYPE *A, /* array to copy from, NULL if already cp */ const int lda, /* leading dimension of A */ const int incA, /* inc to next blk in A */ const TYPE *alpha, TYPE *pA, /* wrkspace to copy A to */ const int incAW, /* 0 : keep using same KBxMB space */ const TYPE *B, /* array to copy from, NULL if already cp */ const int ldb, /* leading dimension of B */ const int incB, /* inc to next blk in B */ TYPE *pB, /* wrkspace to copy B to */ const int incBW, /* 0 : keep using same KBxNB space */ const TYPE *beta, TYPE *C, /* output matrix */ const int ldc, TYPE *pC, /* ldpc x NB workspace */ const int ldpc, MAT2BLK3 A2blk, /* rout to copy A */ MAT2BLK3 B2blk) /* rout to copy B *//* * Performs a K-inner-loop matmul, while copying A & B if necessary. * If M > m, we are doing extra flops so we don't call cleanup (same for N) * This just-in-time copy is better alg when K dim dominates M & N. */{ int m, n, kr; /* # of row/cols to operate on, m >= M, n >= N */ int k; const TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero};/* * Indexes to next blk (i.e. real to imag) of matrices; always uses full * block stride, even when we have a partial block */ int ipb = NB*KB, ipa = MB*KB, ipc = ldpc*NB; void (*NBmm0)(const int, const int, const int, const TYPE, const TYPE*, const int, const TYPE*, const int, const TYPE, TYPE*, const int); void (*NBmm1)(const int, const int, const int, const TYPE, const TYPE*, const int, const TYPE*, const int, const TYPE, TYPE*, const int); void (*NBmmX)(const int, const int, const int, const TYPE, const TYPE*, const int, const TYPE*, const int, const TYPE, TYPE*, const int); m = (M < MB && M+ATL_mmMU >= MB) ? MB : M; n = (N < NB && N+ATL_mmNU >= NB) ? NB : N; if (m == MB && n == NB) { NBmm0 = NBmm_b0; NBmm1 = NBmm_b1; NBmmX = NBmm_bX; } else if (m == MB) /* N cleanup needed */ { NBmm0 = Mjoin(PATLU,pNBmm_b0); NBmm1 = Mjoin(PATLU,pNBmm_b1); NBmmX = Mjoin(PATLU,pNBmm_bX); } else if (n == NB) /* M cleanup needed */ { NBmm0 = Mjoin(PATLU,pMBmm_b0); NBmm1 = Mjoin(PATLU,pMBmm_b1); NBmmX = Mjoin(PATLU,pMBmm_bX); } else /* two or more dim < NB, requires generated cleanup */ NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm); if (nblk) { if (B) { if (n > N) { Mjoin(PATLU,gezero)(KB, n-N, pB+KB*N, KB); Mjoin(PATLU,gezero)(KB, n-N, pB+ipb+KB*N, KB); } B2blk(KB, N, one, B, ldb, pB+ipb, KB, pB, KB); B += incB; } if (A) { if (m > M) { Mjoin(PATLU,gezero)(KB, m-M, pA+KB*M, KB); Mjoin(PATLU,gezero)(KB, m-M, pA+ipa+KB*M, KB); } A2blk(KB, M, one, A, lda, pA+ipa, KB, pA, KB); A += incA; } NBmm0(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rzero, pC, ldpc); NBmm0(m, n, KB, ATL_rone, pA, KB, pB+ipb, KB, ATL_rzero, pC+ipc, ldpc); NBmmX(m, n, KB, ATL_rone, pA+ipa, KB, pB+ipb, KB, ATL_rnone, pC, ldpc); NBmm1(m, n, KB, ATL_rone, pA+ipa, KB, pB, KB, ATL_rone, pC+ipc, ldpc); pA += incAW; pB += incBW; for (k=nblk-1; k; k--) { if (B) { if (n > N) { Mjoin(PATLU,gezero)(KB, n-N, pB+KB*N, KB); Mjoin(PATLU,gezero)(KB, n-N, pB+ipb+KB*N, KB); } B2blk(KB, N, one, B, ldb, pB+ipb, KB, pB, KB); B += incB; } if (A) { if (m > M) { Mjoin(PATLU,gezero)(KB, m-M, pA+KB*M, KB); Mjoin(PATLU,gezero)(KB, m-M, pA+ipa+KB*M, KB); } A2blk(KB, M, one, A, lda, pA+ipa, KB, pA, KB); A += incA; } NBmmX(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rnone, pC, ldpc); NBmm1(m, n, KB, ATL_rone, pA, KB, pB+ipb, KB, ATL_rone, pC+ipc, ldpc); NBmmX(m, n, KB, ATL_rone, pA+ipa, KB, pB+ipb, KB, ATL_rnone, pC, ldpc); NBmm1(m, n, KB, ATL_rone, pA+ipa, KB, pB, KB, ATL_rone, pC+ipc, ldpc); pA += incAW; pB += incBW; } } if (KR) /* need to cleanup K-loop */ { if (KR+4 >= KB) /* do extra flops to avoid cleanup loop */ kr = KB; else /* must use K cleanup */ { kr = KR; if (m < MB || n < NB) /* use general K cleanup */ { n = N; m = M; if (!nblk) { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); } NBmm1 = NBmmX = NBmm0 = Mjoin(PATLU,pKBmm); } else /* use K-only cleanup */ { NBmm0 = Mjoin(PATLU,pKBmm_b0); NBmm1 = Mjoin(PATLU,pKBmm_b1); NBmmX = Mjoin(PATLU,pKBmm_bX);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -