📄 zsymv_u.s
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind. 
*//*********************************************************************/#define ASSEMBLER#include "common.h"#ifndef NEEDPARAM#ifndef DOUBLE#include "cparam_u.h"#else#include "zparam_u.h"#endif#endif#ifdef linux#ifndef __64BIT__#define M r3#define A r4#define LDA r5#define X r6#define INCX r7#define Y r8#define INCY r9#define BUFFER r10#else#define M r3#define A r6#define LDA r7#define X r8#define INCX r9#define Y r10#define INCY r4#define BUFFER r5#endif#endif#if defined(_AIX) || defined(__APPLE__)#if !defined(__64BIT__) && defined(DOUBLE)#define M r3#define A r8#define LDA r9#define X r10#define INCX r4#define Y r5#define INCY r6#define BUFFER r7#else#define M r3#define A r6#define LDA r7#define X r8#define INCX r9#define Y r10#define INCY r4#define BUFFER r5#endif#endif#define I r11#define J r12#define AO1 r14#define AO2 r15#define AO3 r16#define AO4 r17#define XX r18#define YY r19#define NEW_Y r20#define IS r21#define TEMP r22#define PREA r24#define y01 f0#define y02 f1#define y03 f2#define y04 f3#define y05 f4#define y06 f5#define y07 f6#define y08 f7#define xtemp1 f8#define xtemp2 f9#define xtemp3 f10#define xtemp4 f11#define xtemp5 f12#define xtemp6 f13#define xtemp7 f14#define xtemp8 f15#define atemp1 f16#define atemp2 f17#define atemp3 f18#define atemp4 f19#define xsum1 f20#define xsum2 f21#define xsum3 f22#define xsum4 f23#define a1 f24#define a2 f25#define a3 f26#define a4 f27#define a5 f28#define a6 f29#define a7 f30#define a8 f31#define alpha_r f1#define alpha_i f2#if defined(PPCG4)#define PREFETCHSIZE_A 24#endif#if defined(PPC440) || defined(PPC440FP2)#define PREFETCHSIZE_A 24#endif#ifdef PPC970#define PREFETCHSIZE_A 32#endif#ifdef CELL#define PREFETCHSIZE_A 72#endif#ifdef POWER4#define PREFETCHSIZE_A 16#endif#ifdef POWER5#define PREFETCHSIZE_A 64#endif#if defined(POWER4) || defined(POWER5) || defined(PPC970)#define NOP1#define NOP2#else#define NOP1 mr LDA, LDA#define NOP2 mr INCX, INCX#endif#ifndef NEEDPARAM#ifndef __64BIT__#define STACKSIZE 
224#define ALPHA_R 200(SP)#define ALPHA_I 208(SP)#define FZERO 216(SP)#else#define STACKSIZE 280#define ALPHA_R 256(SP)#define ALPHA_I 264(SP)#define FZERO 272(SP)#endif#ifndef HEMV#define FMADD1 FNMSUB#define FMADD2 FMADD#else#define FMADD1 FMADD#define FMADD2 FNMSUB#endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP)#ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP)#else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP)#endif#ifdef linux#ifdef __64BIT__ ld INCY, 112 + STACKSIZE(SP) ld BUFFER, 120 + STACKSIZE(SP)#endif#endif#if defined(_AIX) || defined(__APPLE__)#ifndef __64BIT__#ifdef DOUBLE lwz INCX, 56 + STACKSIZE(SP) lwz Y, 60 + STACKSIZE(SP) lwz INCY, 64 + STACKSIZE(SP) lwz BUFFER, 68 + STACKSIZE(SP)#else lwz INCY, 56 + STACKSIZE(SP) lwz BUFFER, 60 + STACKSIZE(SP)#endif#else ld INCY, 112 + STACKSIZE(SP) ld BUFFER, 120 + STACKSIZE(SP)#endif#endif STFD alpha_r, ALPHA_R STFD alpha_i, ALPHA_I slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE beq LL(05) mr XX, X mr X, BUFFER srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(03) .align 4LL(01): LFD a1, 0 * SIZE(XX) LFD a2, 1 * SIZE(XX) add XX, XX, INCX LFD a3, 0 * SIZE(XX) LFD a4, 1 * SIZE(XX) add XX, XX, INCX LFD a5, 0 * SIZE(XX) LFD a6, 1 * SIZE(XX) add XX, XX, INCX LFD a7, 0 * SIZE(XX) LFD a8, 1 * SIZE(XX) add XX, XX, INCX dcbt XX, PREA dcbtst BUFFER, PREA STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) STFD a3, 2 * SIZE(BUFFER) STFD a4, 3 * SIZE(BUFFER) STFD a5, 4 * SIZE(BUFFER) STFD a6, 5 * SIZE(BUFFER) STFD a7, 6 * SIZE(BUFFER) STFD a8, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(01) .align 4LL(03): andi. r0, M, 3 mtspr CTR, r0 ble LL(05) .align 4LL(04): LFD a1, 0 * SIZE(XX) LFD a2, 1 * SIZE(XX) add XX, XX, INCX STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) addi BUFFER, BUFFER, 2 * SIZE bdnz LL(04) .align 4LL(05): mr NEW_Y, Y lfd f0, FZERO cmpwi cr0, INCY, 2 * SIZE beq LL(10) mr NEW_Y, BUFFER addi r0, M, 3 srawi. r0, r0, 2 mtspr CTR, r0 .align 4LL(06): STFD f0, 0 * SIZE(BUFFER) STFD f0, 1 * SIZE(BUFFER) STFD f0, 2 * SIZE(BUFFER) STFD f0, 3 * SIZE(BUFFER) STFD f0, 4 * SIZE(BUFFER) STFD f0, 5 * SIZE(BUFFER) STFD f0, 6 * SIZE(BUFFER) STFD f0, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(06) .align 4LL(10): li IS, 0 cmpwi cr0, M, 2 blt LL(20) .align 4LL(11): mr AO1, A add AO2, A, LDA add A, AO2, LDA slwi TEMP, IS, ZBASE_SHIFT add TEMP, X, TEMP LFD y05, ALPHA_R LFD y06, ALPHA_I LFD xtemp1, 0 * SIZE(TEMP) LFD xtemp2, 1 * SIZE(TEMP) LFD xtemp3, 2 * SIZE(TEMP) LFD xtemp4, 3 * SIZE(TEMP) FMUL atemp1, y05, xtemp1 FMUL atemp2, y06, xtemp1 FMUL atemp3, y05, xtemp3 FMUL atemp4, y06, xtemp3 FNMSUB atemp1, y06, xtemp2, atemp1 FMADD atemp2, y05, xtemp2, atemp2 FNMSUB atemp3, y06, xtemp4, atemp3 FMADD atemp4, y05, xtemp4, atemp4 lfd xsum1, FZERO fmr xsum2, xsum1 fmr xsum3, xsum1 fmr xsum4, xsum1 mr XX, X mr YY, NEW_Y LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * 
SIZE(AO2) LFD xtemp1, 0 * SIZE(XX) LFD xtemp2, 1 * SIZE(XX) LFD xtemp3, 2 * SIZE(XX) LFD xtemp4, 3 * SIZE(XX) LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) LFD y03, 2 * SIZE(YY) LFD y04, 3 * SIZE(YY) srawi. r0, IS, 3 mtspr CTR, r0 ble LL(15) FMADD xsum1, xtemp1, a1, xsum1 PREFETCH_A1 FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 NOP1 FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 NOP1 FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 PREFETCH_X FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 
FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -