⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trsm_kernel_lt.c

📁 Optimized GotoBLAS libraries
💻 C
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#include "common.h"static FLOAT dm1 = -1.;#ifdef CONJ#define GEMM_KERNEL   GEMM_KERNEL_L#else#define GEMM_KERNEL   GEMM_KERNEL_N#endif#ifndef COMPLEXstatic inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {  FLOAT aa, bb;  int i, j, k;  for (i = 0; i < m; i++) {    aa = *(a + i);        for (j = 0; j < n; j ++) {      bb = *(c + i + j * ldc);      bb *= aa;      *b             = bb;      *(c + i + j * ldc) = bb;      b ++;      for (k = i + 1; k < m; k ++){	*(c + k + j * ldc) -= bb * *(a + k);      }    }    a += m;  }}#elsestatic inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {  FLOAT aa1, aa2;  FLOAT bb1, bb2;  FLOAT cc1, cc2;  int i, j, k;  ldc *= 2;  for (i = 0; i < m; i++) {    aa1 = *(a + i * 2 + 0);    aa2 = *(a + i * 2 + 1);        for (j = 0; j < n; j ++) {      bb1 = *(c + i * 2 + 0 + j * ldc);      bb2 = *(c + i * 2 + 1 + j * ldc);#ifndef CONJ      cc1 = aa1 * bb1 - aa2 * bb2;      cc2 = aa1 * bb2 + aa2 * bb1;#else      cc1 = aa1 * bb1 + aa2 * bb2;      cc2 = aa1 * bb2 - aa2 * bb1;#endif      *(b + 0) = cc1;      *(b + 1) = cc2;      *(c + i * 2 + 0 + j * ldc) = cc1;      *(c + i * 2 + 1 + j * ldc) = cc2;      b += 2;      for (k = i + 1; k < m; k ++){#ifndef CONJ	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);	*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);#else	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);	*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);#endif      }    }    a += m * 2;  }}#endifvoid CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,#ifdef COMPLEX	   FLOAT dummy2,#endif	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){  FLOAT *aa, *cc;  BLASLONG  kk;  BLASLONG i, j, jj;#if 0  fprintf(stderr, "TRSM KERNEL LT : m = %3ld  n = %3ld  k = %3ld offset = %3ld\n",	  m, n, k, offset);#endif  jj = 0;  j = (n >> GEMM_UNROLL_N_SHIFT);  while (j > 0) {        kk = offset;    aa = a;    cc = c;        i = (m >> GEMM_UNROLL_M_SHIFT);        while (i > 0) {	if (kk > 0) {	  GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX		      ZERO,#endif		      aa, b, cc, ldc); 	}	solve(GEMM_UNROLL_M, GEMM_UNROLL_N, 	      aa + kk * GEMM_UNROLL_M * COMPSIZE,	      b  + kk * GEMM_UNROLL_N * COMPSIZE,	      cc, ldc);      aa += GEMM_UNROLL_M * k * COMPSIZE;      cc += GEMM_UNROLL_M     * COMPSIZE;      kk += GEMM_UNROLL_M;      i --;    }        if (m & (GEMM_UNROLL_M - 1)) {      i = (GEMM_UNROLL_M >> 1);      while (i > 0) {	if (m & i) {	    if (kk > 0) {	      GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX			  ZERO,#endif			  aa, b, cc, ldc); 	    }	  solve(i, GEMM_UNROLL_N, 		aa + kk * i             * COMPSIZE,		b  + kk * GEMM_UNROLL_N * COMPSIZE,		cc, ldc);	  aa += i * k * COMPSIZE;	  cc += i     * COMPSIZE;	  kk += i;	}	i >>= 1;      }    }        b += GEMM_UNROLL_N * k   * COMPSIZE;    c += GEMM_UNROLL_N * ldc * COMPSIZE;    j --;    jj += GEMM_UNROLL_M;  }    if (n & (GEMM_UNROLL_N - 1)) {    j = (GEMM_UNROLL_N >> 1);    while (j > 0) {      if (n & j) {		kk = offset;	aa = a;	cc = c;		i = (m >> GEMM_UNROLL_M_SHIFT);		while (i > 0) {	  if (kk > 0) {	    GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX			ZERO,#endif			aa,			b, 			cc,			ldc); 	  }	  solve(GEMM_UNROLL_M, j, 		aa + kk * GEMM_UNROLL_M * COMPSIZE, 		b  + kk * j             * COMPSIZE, cc, ldc);	  aa += GEMM_UNROLL_M * k * COMPSIZE;	  cc += GEMM_UNROLL_M     * COMPSIZE;	  kk += GEMM_UNROLL_M;	  i --;	}		if (m & (GEMM_UNROLL_M - 1)) {	  i = (GEMM_UNROLL_M >> 1);	  while (i > 0) {	    if (m & i) {	      if (kk > 0) {		GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX			    ZERO,#endif			    aa,			    b, 			    cc,			    ldc); 	      }	      solve(i, j, 		    aa + kk * i * COMPSIZE, 		    b  + kk * j * COMPSIZE, cc, ldc);	      aa += i * k * COMPSIZE;	      cc += i     * COMPSIZE;	      kk += i;	      }	    i >>= 1;	  }	}		b += j * k   * COMPSIZE;	c += j * ldc * COMPSIZE;      }      j >>= 1;    }  }    return;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -