zgemv_n_ppc440.s
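/* Complex GEMV kernel, N (non-transposed) variant, tuned for PPC440:     */
/* computes y := alpha * A * x + y.  Columns of A are processed four at   */
/* a time (AO1..AO4) against four complex elements of y per loop pass.    */
/* The CONJ/XCONJ build flags select the conjugation variant through the  */
/* FMADDR/FMSUBR and FMADDX/FMSUBX macros defined below.                  */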
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifdef linux
#ifndef __64BIT__
#define M	r3
#define N	r4
#define A	r6
#define LDA	r7
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#else
#define M	r3
#define N	r4
#define A	r8
#define LDA	r9
#define X	r10
#define INCX	r5
#define Y	r6
#define INCY	r7
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define N	r4
#define A	r10
#define LDA	r5
#define X	r6
#define INCX	r7
#define Y	r8
#define INCY	r9
#else
#define M	r3
#define N	r4
#define A	r8
#define LDA	r9
#define X	r10
#define INCX	r5
#define Y	r6
#define INCY	r7
#endif
#endif

#define I	r11
#define J	r12
#define AO1	r14
#define AO2	r15
#define AO3	r16
#define AO4	r17
#define Y1	r18
#define Y2	r19
#define PREA	r20
#define YY	r21
#define BUFFER	r22

#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3
#define y05	f4
#define y06	f5
#define y07	f6
#define y08	f7
#define y09	f8
#define y10	f9
#define y11	f10
#define y12	f11
#define y13	f12
#define y14	f13
#define y15	f14
#define y16	f15

#define alpha1r	f16
#define alpha1i	f17
#define alpha2r	f18
#define alpha2i	f19
#define alpha3r	f20
#define alpha3i	f21
#define alpha4r	f22
#define alpha4i	f23

#define a1	f24
#define a2	f25
#define a3	f26
#define a4	f27
#define a5	f28
#define a6	f29
#define a7	f30
#define a8	f31

#define alpha_r	f14
#define alpha_i	f15

#if defined(PPCG4)
#define PREFETCHSIZE_A	(3 * 4)
#endif

#ifndef XCONJ
#define FMADDR	FMADD
#define FMSUBR	FNMSUB
#else
#define FMADDR	FNMSUB
#define FMSUBR	FMADD
#endif

#ifndef CONJ
#define FMADDX	FMADD
#define FMSUBX	FNMSUB
#else
#define FMADDX	FNMSUB
#define FMSUBX	FMADD
#endif

#ifndef NEEDPARAM

#ifndef __64BIT__
#define STACKSIZE	232
#define ALPHA_R	208(SP)
#define ALPHA_I	216(SP)
#define FZERO	224(SP)
#else
#define STACKSIZE	280
#define ALPHA_R	256(SP)
#define ALPHA_I	264(SP)
#define FZERO	272(SP)
#endif

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0

	stfd	f14,   0(SP)
	stfd	f15,   8(SP)
	stfd	f16,  16(SP)
	stfd	f17,  24(SP)
	stfd	f18,  32(SP)
	stfd	f19,  40(SP)
	stfd	f20,  48(SP)
	stfd	f21,  56(SP)
	stfd	f22,  64(SP)
	stfd	f23,  72(SP)
	stfd	f24,  80(SP)
	stfd	f25,  88(SP)
	stfd	f26,  96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)
	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

#ifdef __64BIT__
	std	r0,  FZERO
	std	r14, 144(SP)
	std	r15, 152(SP)
	std	r16, 160(SP)
	std	r17, 168(SP)
	std	r18, 176(SP)
	std	r19, 184(SP)
	std	r20, 192(SP)
	std	r21, 200(SP)
	std	r22, 208(SP)
#else
	stw	r0,  0 + FZERO
	stw	r0,  4 + FZERO
	stw	r14, 144(SP)
	stw	r15, 148(SP)
	stw	r16, 152(SP)
	stw	r17, 156(SP)
	stw	r18, 160(SP)
	stw	r19, 164(SP)
	stw	r20, 168(SP)
	stw	r21, 172(SP)
	stw	r22, 176(SP)
#endif
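/* Arguments beyond the register-passed ones arrive on the stack; the */
/* offsets depend on the OS ABI and on 32- vs 64-bit mode.            */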
#ifdef linux
#ifndef __64BIT__
	lwz	INCY,     8 + STACKSIZE(SP)
	lwz	BUFFER,  12 + STACKSIZE(SP)
#else
	ld	INCX,   112 + STACKSIZE(SP)
	ld	Y,      120 + STACKSIZE(SP)
	ld	INCY,   128 + STACKSIZE(SP)
	ld	BUFFER, 136 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	LDA,     56 + STACKSIZE(SP)
	lwz	X,       60 + STACKSIZE(SP)
	lwz	INCX,    64 + STACKSIZE(SP)
	lwz	Y,       68 + STACKSIZE(SP)
	lwz	INCY,    72 + STACKSIZE(SP)
	lwz	BUFFER,  76 + STACKSIZE(SP)
#else
	lwz	INCX,    56 + STACKSIZE(SP)
	lwz	Y,       60 + STACKSIZE(SP)
	lwz	INCY,    64 + STACKSIZE(SP)
	lwz	BUFFER,  68 + STACKSIZE(SP)
#endif
#else
	ld	INCX,   112 + STACKSIZE(SP)
	ld	Y,      120 + STACKSIZE(SP)
	ld	INCY,   128 + STACKSIZE(SP)
	ld	BUFFER, 136 + STACKSIZE(SP)
#endif
#endif

	stfd	f1, ALPHA_R
	stfd	f2, ALPHA_I

	slwi	LDA,  LDA,  ZBASE_SHIFT
	slwi	INCX, INCX, ZBASE_SHIFT
	slwi	INCY, INCY, ZBASE_SHIFT

	addi	INCX, INCX, -SIZE
	addi	INCY, INCY, -SIZE
	addi	A,    A,    -SIZE

	cmpwi	cr0, M, 0
	ble-	LL(999)
	sub	X, X, INCX
	cmpwi	cr0, N, 0
	sub	Y, Y, INCY
	ble-	LL(999)

	li	PREA, PREFETCHSIZE_A * SIZE

	mr	YY, Y
	lfd	f0, FZERO
	cmpi	cr0, 0, INCY, SIZE
	beq	LL(10)

	addi	YY, BUFFER, -SIZE
	addi	Y1, BUFFER, -SIZE

	addi	r0, M, 3
	srawi.	r0, r0, 2
	mtspr	CTR, r0
	.align 4

LL(02):
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	STFDU	f0, 1 * SIZE(Y1)
	bdnz	LL(02)
	.align 4

LL(10):
	srawi.	J, N, 2
	ble	LL(20)
	.align 4

LL(11):
	lfd	alpha_r, ALPHA_R
	lfd	alpha_i, ALPHA_I

	LFDUX	a1, X, INCX
	LFDU	a2, 1 * SIZE(X)
	LFDUX	a3, X, INCX
	LFDU	a4, 1 * SIZE(X)
	LFDUX	a5, X, INCX
	LFDU	a6, 1 * SIZE(X)
	LFDUX	a7, X, INCX
	LFDU	a8, 1 * SIZE(X)

	FMUL	alpha1r, alpha_r, a1
	FMUL	alpha1i, alpha_i, a1
	FMUL	alpha2r, alpha_r, a3
	FMUL	alpha2i, alpha_i, a3

	FMUL	alpha3r, alpha_r, a5
	mr	Y1, YY
	FMUL	alpha3i, alpha_i, a5
	mr	Y2, YY
	FMUL	alpha4r, alpha_r, a7
	mr	AO1, A
	FMUL	alpha4i, alpha_i, a7
	add	AO2, A, LDA

	FMSUBR	alpha1r, alpha_i, a2, alpha1r
	add	AO3, AO2, LDA
	FMADDR	alpha1i, alpha_r, a2, alpha1i
	add	AO4, AO3, LDA
	FMSUBR	alpha2r, alpha_i, a4, alpha2r
	add	A, AO4, LDA
	FMADDR	alpha2i, alpha_r, a4, alpha2i
	FMSUBR	alpha3r, alpha_i, a6, alpha3r
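/* alpha[1-4]{r,i} = alpha * x[j..j+3] (complex products); CTR = M / 4 */
/* for the pipelined loop at LL(12), which handles four complex       */
/* elements of y per iteration across all four columns.               */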
	srawi.	r0, M, 2
	FMADDR	alpha3i, alpha_r, a6, alpha3i
	FMSUBR	alpha4r, alpha_i, a8, alpha4r
	mtspr	CTR, r0
	FMADDR	alpha4i, alpha_r, a8, alpha4i
	ble	LL(15)
	.align 4

	LFDU	a1,  1 * SIZE(AO1)
	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a2,  1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a3,  1 * SIZE(AO1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	a4,  1 * SIZE(AO1)
	LFDU	y04, 1 * SIZE(Y1)

	LFDU	a5,  1 * SIZE(AO1)
	LFDU	y05, 1 * SIZE(Y1)
	LFDU	a6,  1 * SIZE(AO1)
	LFDU	y06, 1 * SIZE(Y1)
	LFDU	a7,  1 * SIZE(AO1)
	LFDU	y07, 1 * SIZE(Y1)
	LFDU	a8,  1 * SIZE(AO1)
	LFDU	y08, 1 * SIZE(Y1)

	FMADD	y09, alpha1r, a1, y01
	FMADD	y10, alpha1i, a1, y02
	FMADD	y11, alpha1r, a3, y03
	FMADD	y12, alpha1i, a3, y04

	FMADD	y13, alpha1r, a5, y05
	FMADD	y14, alpha1i, a5, y06
	FMADD	y15, alpha1r, a7, y07
	FMADD	y16, alpha1i, a7, y08
	bdz	LL(13)
	.align 4

LL(12):
	FMSUBX	y09, alpha1i, a2, y09
	LFDU	a1, 1 * SIZE(AO2)
	FMADDX	y10, alpha1r, a2, y10
	LFDU	a2, 1 * SIZE(AO2)
	FMSUBX	y11, alpha1i, a4, y11
	LFDU	a3, 1 * SIZE(AO2)
	FMADDX	y12, alpha1r, a4, y12
	LFDU	a4, 1 * SIZE(AO2)

#ifdef PPCG4
	dcbt	AO2, PREA
#endif

	FMSUBX	y13, alpha1i, a6, y13
	LFDU	a5, 1 * SIZE(AO2)
	FMADDX	y14, alpha1r, a6, y14
	LFDU	a6, 1 * SIZE(AO2)
	FMSUBX	y15, alpha1i, a8, y15
	LFDU	a7, 1 * SIZE(AO2)
	FMADDX	y16, alpha1r, a8, y16
	LFDU	a8, 1 * SIZE(AO2)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO2, PREA
#endif

	FMADD	y09, alpha2r, a1, y09
	LFDU	y01, 1 * SIZE(Y1)
	FMADD	y10, alpha2i, a1, y10
	LFDU	y02, 1 * SIZE(Y1)
	FMADD	y11, alpha2r, a3, y11
	LFDU	y03, 1 * SIZE(Y1)
	FMADD	y12, alpha2i, a3, y12
	LFDU	y04, 1 * SIZE(Y1)

#ifdef PPCG4
	dcbtst	Y1, PREA
#endif

	FMADD	y13, alpha2r, a5, y13
	FMADD	y14, alpha2i, a5, y14
	FMADD	y15, alpha2r, a7, y15
	FMADD	y16, alpha2i, a7, y16

	FMSUBX	y09, alpha2i, a2, y09
	LFDU	a1, 1 * SIZE(AO3)
	FMADDX	y10, alpha2r, a2, y10
	LFDU	a2, 1 * SIZE(AO3)
	FMSUBX	y11, alpha2i, a4, y11
	LFDU	a3, 1 * SIZE(AO3)
	FMADDX	y12, alpha2r, a4, y12
	LFDU	a4, 1 * SIZE(AO3)

#ifdef PPCG4
	dcbt	AO3, PREA
#endif

	FMSUBX	y13, alpha2i, a6, y13
	LFDU	a5, 1 * SIZE(AO3)
	FMADDX	y14, alpha2r, a6, y14
	LFDU	a6, 1 * SIZE(AO3)
	FMSUBX	y15, alpha2i, a8, y15
	LFDU	a7, 1 * SIZE(AO3)
	FMADDX	y16, alpha2r, a8, y16
	LFDU	a8, 1 * SIZE(AO3)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO3, PREA
#endif

	FMADD	y09, alpha3r, a1, y09
	LFDU	y05, 1 * SIZE(Y1)
	FMADD	y10, alpha3i, a1, y10
	LFDU	y06, 1 * SIZE(Y1)
	FMADD	y11, alpha3r, a3, y11
	LFDU	y07, 1 * SIZE(Y1)
	FMADD	y12, alpha3i, a3, y12
	LFDU	y08, 1 * SIZE(Y1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbtst	Y1, PREA
#endif

	FMADD	y13, alpha3r, a5, y13
	FMADD	y14, alpha3i, a5, y14
	FMADD	y15, alpha3r, a7, y15
	FMADD	y16, alpha3i, a7, y16

	FMSUBX	y09, alpha3i, a2, y09
	LFDU	a1, 1 * SIZE(AO4)
	FMADDX	y10, alpha3r, a2, y10
	LFDU	a2, 1 * SIZE(AO4)
	FMSUBX	y11, alpha3i, a4, y11
	LFDU	a3, 1 * SIZE(AO4)
	FMADDX	y12, alpha3r, a4, y12
	LFDU	a4, 1 * SIZE(AO4)

#ifdef PPCG4
	dcbt	AO4, PREA
#endif

	FMSUBX	y13, alpha3i, a6, y13
	LFDU	a5, 1 * SIZE(AO4)
	FMADDX	y14, alpha3r, a6, y14
	LFDU	a6, 1 * SIZE(AO4)
	FMSUBX	y15, alpha3i, a8, y15
	LFDU	a7, 1 * SIZE(AO4)
	FMADDX	y16, alpha3r, a8, y16
	LFDU	a8, 1 * SIZE(AO4)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO4, PREA
#endif

	FMADD	y09, alpha4r, a1, y09
	FMADD	y10, alpha4i, a1, y10
	FMADD	y11, alpha4r, a3, y11
	FMADD	y12, alpha4i, a3, y12

	FMADD	y13, alpha4r, a5, y13
	FMADD	y14, alpha4i, a5, y14
	FMADD	y15, alpha4r, a7, y15
	FMADD	y16, alpha4i, a7, y16

	FMSUBX	y09, alpha4i, a2, y09
	LFDU	a1, 1 * SIZE(AO1)
	FMADDX	y10, alpha4r, a2, y10
	LFDU	a2, 1 * SIZE(AO1)
	FMSUBX	y11, alpha4i, a4, y11
	LFDU	a3, 1 * SIZE(AO1)
	FMADDX	y12, alpha4r, a4, y12
	LFDU	a4, 1 * SIZE(AO1)

#ifdef PPCG4
	dcbt	AO1, PREA
#endif
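/* a1..a8 are being reloaded from AO1 for the next iteration (software */
/* pipelining): the FMADDs interleaved with the stores below already   */
/* start the next pass over column 1.                                  */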
	FMSUBX	y13, alpha4i, a6, y13
	LFDU	a5, 1 * SIZE(AO1)
	FMADDX	y14, alpha4r, a6, y14
	LFDU	a6, 1 * SIZE(AO1)
	FMSUBX	y15, alpha4i, a8, y15
	LFDU	a7, 1 * SIZE(AO1)
	FMADDX	y16, alpha4r, a8, y16
	LFDU	a8, 1 * SIZE(AO1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO1, PREA
#endif

	STFDU	y09, 1 * SIZE(Y2)
	FMADD	y09, alpha1r, a1, y01
	STFDU	y10, 1 * SIZE(Y2)
	FMADD	y10, alpha1i, a1, y02
	STFDU	y11, 1 * SIZE(Y2)
	FMADD	y11, alpha1r, a3, y03
	STFDU	y12, 1 * SIZE(Y2)
	FMADD	y12, alpha1i, a3, y04

	STFDU	y13, 1 * SIZE(Y2)
	FMADD	y13, alpha1r, a5, y05
	STFDU	y14, 1 * SIZE(Y2)
	FMADD	y14, alpha1i, a5, y06
	STFDU	y15, 1 * SIZE(Y2)
	FMADD	y15, alpha1r, a7, y07
	STFDU	y16, 1 * SIZE(Y2)
	FMADD	y16, alpha1i, a7, y08
	bdnz	LL(12)
	.align 4

LL(13):
	FMSUBX	y09, alpha1i, a2, y09
	LFDU	a1, 1 * SIZE(AO2)
	FMADDX	y10, alpha1r, a2, y10
	LFDU	a2, 1 * SIZE(AO2)
	FMSUBX	y11, alpha1i, a4, y11
	LFDU	a3, 1 * SIZE(AO2)
	FMADDX	y12, alpha1r, a4, y12
	LFDU	a4, 1 * SIZE(AO2)

	FMSUBX	y13, alpha1i, a6, y13
	LFDU	a5, 1 * SIZE(AO2)
	FMADDX	y14, alpha1r, a6, y14
	LFDU	a6, 1 * SIZE(AO2)
	FMSUBX	y15, alpha1i, a8, y15
	LFDU	a7, 1 * SIZE(AO2)
	FMADDX	y16, alpha1r, a8, y16
	LFDU	a8, 1 * SIZE(AO2)

	FMADD	y09, alpha2r, a1, y09
	FMADD	y10, alpha2i, a1, y10
	FMADD	y11, alpha2r, a3, y11
	FMADD	y12, alpha2i, a3, y12

	FMADD	y13, alpha2r, a5, y13
	FMADD	y14, alpha2i, a5, y14
	FMADD	y15, alpha2r, a7, y15
	FMADD	y16, alpha2i, a7, y16

	FMSUBX	y09, alpha2i, a2, y09
	LFDU	a1, 1 * SIZE(AO3)
	FMADDX	y10, alpha2r, a2, y10
	LFDU	a2, 1 * SIZE(AO3)
	FMSUBX	y11, alpha2i, a4, y11
	LFDU	a3, 1 * SIZE(AO3)
	FMADDX	y12, alpha2r, a4, y12
	LFDU	a4, 1 * SIZE(AO3)

	FMSUBX	y13, alpha2i, a6, y13
	LFDU	a5, 1 * SIZE(AO3)
	FMADDX	y14, alpha2r, a6, y14
	LFDU	a6, 1 * SIZE(AO3)
	FMSUBX	y15, alpha2i, a8, y15
	LFDU	a7, 1 * SIZE(AO3)
	FMADDX	y16, alpha2r, a8, y16
	LFDU	a8, 1 * SIZE(AO3)

	FMADD	y09, alpha3r, a1, y09
	FMADD	y10, alpha3i, a1, y10
	FMADD	y11, alpha3r, a3, y11
	FMADD	y12, alpha3i, a3, y12

	FMADD	y13, alpha3r, a5, y13
	FMADD	y14, alpha3i, a5, y14
	FMADD	y15, alpha3r, a7, y15
	FMADD	y16, alpha3i, a7, y16

	FMSUBX	y09, alpha3i, a2, y09
	LFDU	a1, 1 * SIZE(AO4)
	FMADDX	y10, alpha3r, a2, y10
	LFDU	a2, 1 * SIZE(AO4)
	FMSUBX	y11, alpha3i, a4, y11
	LFDU	a3, 1 * SIZE(AO4)
	FMADDX	y12, alpha3r, a4, y12
	LFDU	a4, 1 * SIZE(AO4)

	FMSUBX	y13, alpha3i, a6, y13
	LFDU	a5, 1 * SIZE(AO4)
	FMADDX	y14, alpha3r, a6, y14
	LFDU	a6, 1 * SIZE(AO4)
	FMSUBX	y15, alpha3i, a8, y15
	LFDU	a7, 1 * SIZE(AO4)
	FMADDX	y16, alpha3r, a8, y16
	LFDU	a8, 1 * SIZE(AO4)

	FMADD	y09, alpha4r, a1, y09
	FMADD	y10, alpha4i, a1, y10
	FMADD	y11, alpha4r, a3, y11
	FMADD	y12, alpha4i, a3, y12

	FMADD	y13, alpha4r, a5, y13
	FMADD	y14, alpha4i, a5, y14
	FMADD	y15, alpha4r, a7, y15
	FMADD	y16, alpha4i, a7, y16

	FMSUBX	y09, alpha4i, a2, y09
	FMADDX	y10, alpha4r, a2, y10
	FMSUBX	y11, alpha4i, a4, y11
	FMADDX	y12, alpha4r, a4, y12

	FMSUBX	y13, alpha4i, a6, y13
	STFDU	y09, 1 * SIZE(Y2)
	FMADDX	y14, alpha4r, a6, y14
	STFDU	y10, 1 * SIZE(Y2)
	FMSUBX	y15, alpha4i, a8, y15
	STFDU	y11, 1 * SIZE(Y2)
	FMADDX	y16, alpha4r, a8, y16
	STFDU	y12, 1 * SIZE(Y2)

	STFDU	y13, 1 * SIZE(Y2)
	STFDU	y14, 1 * SIZE(Y2)
	STFDU	y15, 1 * SIZE(Y2)
	STFDU	y16, 1 * SIZE(Y2)
	.align 4

LL(15):
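/* Remainder over M: LL(15) handles the M & 2 case (two complex */
/* elements of y against all four columns).                     */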
	andi.	r0, M, 2
	ble	LL(17)

	LFDU	a1,  1 * SIZE(AO1)
	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a2,  1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a3,  1 * SIZE(AO1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	a4,  1 * SIZE(AO1)
	LFDU	y04, 1 * SIZE(Y1)

	FMADD	y01, alpha1r, a1, y01
	LFDU	a5, 1 * SIZE(AO2)
	FMADD	y02, alpha1i, a1, y02
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y03, alpha1r, a3, y03
	LFDU	a7, 1 * SIZE(AO2)
	FMADD	y04, alpha1i, a3, y04
	LFDU	a8, 1 * SIZE(AO2)

	FMSUBX	y01, alpha1i, a2, y01
	LFDU	a1, 1 * SIZE(AO3)
	FMADDX	y02, alpha1r, a2, y02
	LFDU	a2, 1 * SIZE(AO3)
	FMSUBX	y03, alpha1i, a4, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADDX	y04, alpha1r, a4, y04
	LFDU	a4, 1 * SIZE(AO3)

	FMADD	y01, alpha2r, a5, y01
	FMADD	y02, alpha2i, a5, y02
	FMADD	y03, alpha2r, a7, y03
	FMADD	y04, alpha2i, a7, y04

	FMSUBX	y01, alpha2i, a6, y01
	LFDU	a5, 1 * SIZE(AO4)
	FMADDX	y02, alpha2r, a6, y02
	LFDU	a6, 1 * SIZE(AO4)
	FMSUBX	y03, alpha2i, a8, y03
	LFDU	a7, 1 * SIZE(AO4)
	FMADDX	y04, alpha2r, a8, y04