📄 scal.s
字号:
/*********************************************************************/
/* */
/* Optimized BLAS libraries */
/* By Kazushige Goto <kgoto@tacc.utexas.edu> */
/* */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of */
/* profits, interruption of business, or related expenses which may */
/* arise from use of Software or Documentation, including but not */
/* limited to those resulting from defects in Software and/or */
/* Documentation, or loss or inaccuracy of data of any kind. */
/*********************************************************************/

/*
 * In-place vector scaling kernel for PowerPC:  x[i] = ALPHA * x[i].
 *
 * Inputs (register aliases are defined below):
 *   N      element count; tested and shifted with 32-bit word
 *          instructions (cmpwi / srawi.), returns immediately if N <= 0
 *   ALPHA  scale factor, in f1
 *   X      pointer to the first element
 *   INCX   stride in elements on entry; converted to a byte stride
 *          by shifting left by BASE_SHIFT
 *
 * Four code paths are selected from (ALPHA == 0?) x (unit stride?):
 *   A0I1  ALPHA == 0, unit stride    : store zeros, 16 per iteration
 *   A0IN  ALPHA == 0, general stride : store zeros,  8 per iteration
 *   A1I1  ALPHA != 0, unit stride    : load/multiply/store, 8 per iteration
 *   A1IN  ALPHA != 0, general stride : load/multiply/store, 8 per iteration
 * Each path is followed by a one-element-at-a-time remainder loop.
 *
 * Note: the ALPHA == 0 paths never read x; they overwrite every element
 * with the zero held in FZERO, so NaN/Inf values in x are replaced by 0.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), SP, CTR, SIZE, BASE_SHIFT, LFD,
 * STFD and FMUL are not defined in this file; they presumably come from
 * common.h (LFD/STFD/FMUL presumably select the single or double
 * precision forms) -- confirm there.
 */

#define ASSEMBLER
#include "common.h"

/* Integer register aliases. */
#define N	r3
#define XX	r4	/* read pointer on the strided multiply path */
#define PREA	r5	/* byte offset used by the dcbtst prefetch hints */

/*
 * X and INCX arrive in different registers depending on OS, word size
 * and precision.  Presumably this follows how each ABI assigns argument
 * slots to the floating-point ALPHA argument -- confirm against the
 * platform ABI documents.
 */
#ifdef linux
#ifndef __64BIT__
#define X	r6
#define INCX	r7
#else
#define X	r7
#define INCX	r8
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define X	r8
#define INCX	r9
#else
#define X	r7
#define INCX	r8
#endif
#endif

/* Floating-point register aliases. */
#define FZERO	f0
#define ALPHA	f1

	PROLOGUE
	PROFCODE

	/* Build 0.0 in FZERO: push an integer zero word, reload it as a float. */
	addi	SP, SP, -8
	li	r0, 0

	stw	r0, 0(SP)
	lfs	FZERO, 0(SP)
	addi	SP, SP, 8

	slwi	INCX, INCX, BASE_SHIFT	/* element stride -> byte stride */
	li	PREA, 80 * SIZE		/* prefetch distance in bytes */

	cmpwi	cr0, N, 0
	blelr-	cr0			/* nothing to do when N <= 0 */

	/* Anything that does not compare equal to zero (including an
	   unordered/NaN ALPHA) takes the multiply path. */
	fcmpu	cr0, FZERO, ALPHA
	bne-	cr0, LL(A1I1)

	/* ---- ALPHA == 0 ------------------------------------------------ */
	cmpwi	cr0, INCX, SIZE		/* byte stride == element size? */
	bne-	cr0, LL(A0IN)

	/* ALPHA == 0, unit stride: CTR = N / 16 blocks of 16 stores. */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(A0I1_Remain)
	.align 4

LL(A0I1_kernel):
	STFD	FZERO,  0 * SIZE(X)
	STFD	FZERO,  1 * SIZE(X)
	STFD	FZERO,  2 * SIZE(X)
	STFD	FZERO,  3 * SIZE(X)
	STFD	FZERO,  4 * SIZE(X)
	STFD	FZERO,  5 * SIZE(X)
	STFD	FZERO,  6 * SIZE(X)
	STFD	FZERO,  7 * SIZE(X)
	STFD	FZERO,  8 * SIZE(X)
	STFD	FZERO,  9 * SIZE(X)
	STFD	FZERO, 10 * SIZE(X)
	STFD	FZERO, 11 * SIZE(X)
	STFD	FZERO, 12 * SIZE(X)
	STFD	FZERO, 13 * SIZE(X)
	STFD	FZERO, 14 * SIZE(X)
	STFD	FZERO, 15 * SIZE(X)

	addi	X, X, 16 * SIZE
	bdnz	LL(A0I1_kernel)
	.align 4

LL(A0I1_Remain):
	andi.	r0, N, 15		/* leftover elements: N mod 16 */
	mtspr	CTR, r0
	beqlr+				/* return when there is no remainder */
	.align 4

LL(A0I1_RemainKernel):
	STFD	FZERO, 0 * SIZE(X)
	addi	X, X, 1 * SIZE
	bdnz	LL(A0I1_RemainKernel)
	blr
	.align 4

	/* ALPHA == 0, general stride: CTR = N / 8 blocks of 8 stores. */
LL(A0IN):
	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	LL(A0IN_Remain)
	.align 4

LL(A0IN_Kernel):
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	dcbtst	X, PREA			/* store-prefetch hint at X + PREA */
	bdnz	LL(A0IN_Kernel)
	.align 4

LL(A0IN_Remain):
	andi.	r0, N, 7		/* leftover elements: N mod 8 */
	mtspr	CTR, r0
	beqlr+
	.align 4

LL(A0IN_RemainKernel):
	STFD	FZERO, 0 * SIZE(X)
	add	X, X, INCX
	bdnz	LL(A0IN_RemainKernel)
	blr
	.align 4

	/* ---- ALPHA != 0 ------------------------------------------------ */
LL(A1I1):
	cmpwi	cr0, INCX, SIZE		/* byte stride == element size? */
	bne-	LL(A1IN)

	/* XX is not read anywhere on the unit-stride path below; the copy
	   is kept as in the original code. */
	mr	XX, X

	/* ALPHA != 0, unit stride: CTR = N / 8 blocks of 8 elements. */
	srawi.	r0, N, 3
	mtspr	CTR, r0
	/* NOTE(review): this is the only block-count test hinted "+" (taken,
	   i.e. N < 8 expected); the sibling paths use "-".  Possibly
	   unintended; it only influences branch prediction. */
	beq+	LL(A1I1_Remain)
	.align 4

LL(A1I1_kernel):
	LFD	f2, 0 * SIZE(X)
	LFD	f3, 1 * SIZE(X)
	LFD	f4, 2 * SIZE(X)
	LFD	f5, 3 * SIZE(X)
	LFD	f6, 4 * SIZE(X)
	LFD	f7, 5 * SIZE(X)
	LFD	f8, 6 * SIZE(X)
	LFD	f9, 7 * SIZE(X)

	FMUL	f2, ALPHA, f2
	FMUL	f3, ALPHA, f3
	FMUL	f4, ALPHA, f4
	FMUL	f5, ALPHA, f5
	FMUL	f6, ALPHA, f6
	FMUL	f7, ALPHA, f7
	FMUL	f8, ALPHA, f8
	FMUL	f9, ALPHA, f9

	STFD	f2, 0 * SIZE(X)
	STFD	f3, 1 * SIZE(X)
	STFD	f4, 2 * SIZE(X)
	STFD	f5, 3 * SIZE(X)
	STFD	f6, 4 * SIZE(X)
	STFD	f7, 5 * SIZE(X)
	STFD	f8, 6 * SIZE(X)
	STFD	f9, 7 * SIZE(X)

	addi	X, X, 8 * SIZE
	dcbtst	X, PREA			/* store-prefetch hint at X + PREA */
	bdnz	LL(A1I1_kernel)
	.align 4

LL(A1I1_Remain):
	andi.	r0, N, 7		/* leftover elements: N mod 8 */
	mtspr	CTR, r0
	beqlr+
	.align 4

LL(A1I1_RemainKernel):
	LFD	f2, 0 * SIZE(X)
	FMUL	f2, ALPHA, f2
	STFD	f2, 0 * SIZE(X)
	addi	X, X, 1 * SIZE
	bdnz	LL(A1I1_RemainKernel)
	blr
	.align 4

	/* ALPHA != 0, general stride.  XX walks ahead as the read pointer
	   while X follows as the write pointer over the same elements. */
LL(A1IN):
	mr	XX, X

	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	LL(A1IN_Remain)
	.align 4

LL(A1IN_Kernel):
	/* First four elements: load, then scale. */
	LFD	f2, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	f3, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	f4, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	f5, 0 * SIZE(XX)
	add	XX, XX, INCX

	FMUL	f2, ALPHA, f2
	FMUL	f3, ALPHA, f3
	FMUL	f4, ALPHA, f4
	FMUL	f5, ALPHA, f5

	/* Next four elements: load, then scale. */
	LFD	f6, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	f7, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	f8, 0 * SIZE(XX)
	add	XX, XX, INCX
	LFD	f9, 0 * SIZE(XX)
	add	XX, XX, INCX

	FMUL	f6, ALPHA, f6
	FMUL	f7, ALPHA, f7
	FMUL	f8, ALPHA, f8
	FMUL	f9, ALPHA, f9

	/* Write all eight results back through X. */
	STFD	f2, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f3, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f4, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f5, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f6, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f7, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f8, 0 * SIZE(X)
	add	X, X, INCX
	STFD	f9, 0 * SIZE(X)
	add	X, X, INCX
	bdnz	LL(A1IN_Kernel)
	.align 4

LL(A1IN_Remain):
	andi.	r0, N, 7		/* leftover elements: N mod 8 */
	mtspr	CTR, r0
	beqlr+
	.align 4

LL(A1IN_RemainKernel):
	LFD	f2, 0 * SIZE(XX)
	add	XX, XX, INCX
	FMUL	f2, ALPHA, f2
	STFD	f2, 0 * SIZE(X)
	add	X, X, INCX
	bdnz	LL(A1IN_RemainKernel)
	blr

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -