📄 zgemv_t_ppc440.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Overview of the visible code:
 *   - builds a stack frame and spills f14-f31 and r14-r22 into it;
 *   - optionally copies the strided vector at X into BUFFER so that it
 *     can be read contiguously (LL(02) / LL(06));
 *   - for each group of four columns of A (AO1..AO4, LDA apart) keeps
 *     eight accumulators y1..y8 (two per column) and streams the column
 *     data against the vector read through X1 (LL(11) / LL(12) / LL(13)).
 *
 * NOTE(review): the page title (zgemv_t_ppc440.s) suggests this is the
 * complex transposed GEMV kernel; confirm against the caller.
 * NOTE(review): this listing is truncated. Labels LL(15), LL(20) and
 * LL(999) are branched to below but are not part of the excerpt, and
 * the "#ifndef NEEDPARAM" opened below is not closed in the excerpt.
 */

#define ASSEMBLER
#include "common.h"

/* Not referenced anywhere in the visible excerpt. */
#define P 1024

/* Size in bytes of the frame allocated in the prologue. */
#ifndef __64BIT__
#define STACKSIZE 224
#else
#define STACKSIZE 304
#endif

/*
 * Register aliases for the arguments. The mapping depends on the OS and
 * on the word size; arguments that have no register here are loaded
 * from the caller frame right after the register spills below.
 */
#ifdef linux
#ifndef __64BIT__
#define M r3
#define N r4
#define A r6
#define LDA r7
#define X r8
#define INCX r9
#define Y r10
#define INCY r5
#else
#define M r3
#define N r4
#define A r8
#define LDA r9
#define X r10
#define INCX r5
#define Y r6
#define INCY r7
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M r3
#define N r4
#define A r10
#define LDA r5
#define X r6
#define INCX r7
#define Y r8
#define INCY r9
#else
#define M r3
#define N r4
#define A r8
#define LDA r9
#define X r10
#define INCX r5
#define Y r6
#define INCY r7
#endif
#endif

/*
 * Working registers:
 *   BUFFER   scratch area that receives the copy of x when a copy is made
 *   XP       base pointer used to read x in the compute loops
 *   X1       running pointer (store cursor in the copy, read cursor later)
 *   J        column-group counter (N >> 2)
 *   AO1..4   running pointers into four consecutive columns of A
 *   PREA     byte offset used by the dcbt hints
 *   PREC     byte offset used by the dcbtst hint
 *   YY       saved copy of the biased Y pointer
 */
#define BUFFER r11
#define XP r12
#define X1 r14
#define J r15
#define AO1 r16
#define AO2 r17
#define AO3 r18
#define AO4 r19
#define PREA r20
#define PREC r21
#define YY r22

/*
 * NOTE(review): these two constants are defined here only when PPCG4 is
 * set, but "li PREA" / "li PREC" below use them unconditionally.
 * Presumably common.h or the build provides them for other targets;
 * confirm.
 */
#if defined(PPCG4)
#define PREFETCHSIZE_A (3 * 8)
#define PREFETCHSIZE_C 7
#endif

/* FMADDR / FMSUBR are not referenced in the visible excerpt. */
#if !(defined(CONJ) && defined(XCONJ))
#define FMADDR FMADD
#define FMSUBR FNMSUB
#else
#define FMADDR FNMSUB
#define FMSUBR FMADD
#endif

#ifndef NEEDPARAM

/* Frame slot that is filled with zero bits and later read back as 0.0. */
#ifndef __64BIT__
#define FZERO 200(SP)
#else
#define FZERO 256(SP)
#endif

	PROLOGUE
	PROFCODE

	/* Allocate the frame; r0 = 0 is the value stored into FZERO. */
	addi SP, SP, -STACKSIZE
	li r0, 0

	/* Spill f14-f31 into the frame, 8 bytes each. */
	stfd f14, 0(SP)
	stfd f15, 8(SP)
	stfd f16, 16(SP)
	stfd f17, 24(SP)
	stfd f18, 32(SP)
	stfd f19, 40(SP)
	stfd f20, 48(SP)
	stfd f21, 56(SP)
	stfd f22, 64(SP)
	stfd f23, 72(SP)
	stfd f24, 80(SP)
	stfd f25, 88(SP)
	stfd f26, 96(SP)
	stfd f27, 104(SP)
	stfd f28, 112(SP)
	stfd f29, 120(SP)
	stfd f30, 128(SP)
	stfd f31, 136(SP)

	/* Spill r14-r22 and write the zero constant into FZERO. */
#ifdef __64BIT__
	std r14, 144(SP)
	std r15, 152(SP)
	std r16, 160(SP)
	std r17, 168(SP)
	std r18, 176(SP)
	std r19, 184(SP)
	std r20, 192(SP)
	std r21, 200(SP)
	std r22, 208(SP)
	std r0, FZERO
#else
	stw r14, 144(SP)
	stw r15, 148(SP)
	stw r16, 152(SP)
	stw r17, 156(SP)
	stw r18, 160(SP)
	stw r19, 164(SP)
	stw r20, 168(SP)
	stw r21, 172(SP)
	stw r22, 176(SP)
	/* Two word stores clear the full 8-byte slot in the 32-bit build. */
	stw r0, FZERO
	stw r0, 4 + FZERO
#endif

	/*
	 * Fetch the arguments that were not passed in registers. The
	 * offsets are relative to the caller frame, hence "+ STACKSIZE".
	 */
#ifdef linux
#ifndef __64BIT__
	lwz INCY, 8 + STACKSIZE(SP)
	lwz BUFFER, 12 + STACKSIZE(SP)
#else
	ld INCX, 112 + STACKSIZE(SP)
	ld Y, 120 + STACKSIZE(SP)
	ld INCY, 128 + STACKSIZE(SP)
	ld BUFFER, 136 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz LDA, 56 + STACKSIZE(SP)
	lwz X, 60 + STACKSIZE(SP)
	lwz INCX, 64 + STACKSIZE(SP)
	lwz Y, 68 + STACKSIZE(SP)
	lwz INCY, 72 + STACKSIZE(SP)
	lwz BUFFER, 76 + STACKSIZE(SP)
#else
	lwz INCX, 56 + STACKSIZE(SP)
	lwz Y, 60 + STACKSIZE(SP)
	lwz INCY, 64 + STACKSIZE(SP)
	lwz BUFFER, 68 + STACKSIZE(SP)
#endif
#else
	ld INCX, 112 + STACKSIZE(SP)
	ld Y, 120 + STACKSIZE(SP)
	ld INCY, 128 + STACKSIZE(SP)
	ld BUFFER, 136 + STACKSIZE(SP)
#endif
#endif

/*
 * FMADD1..FMADD4 select add or negated-subtract forms for the four
 * partial products of each accumulation, depending on CONJ / XCONJ.
 * In the compute loops:
 *   FMADD1: yR += aR * bR      FMADD3: yR (+/-)= aI * bI
 *   FMADD2: yI += aR * bI      FMADD4: yI (+/-)= aI * bR
 * where (aR, aI) and (bR, bI) are consecutive values loaded from a
 * column pointer and from X1.
 * NOTE(review): this assumes FMADD / FNMSUB from common.h expand to
 * the usual fused multiply-add / negative multiply-subtract; confirm.
 */
#ifndef XCONJ
#ifndef CONJ
#define FMADD1 FMADD
#define FMADD2 FMADD
#define FMADD3 FNMSUB
#define FMADD4 FMADD
#else
#define FMADD1 FMADD
#define FMADD2 FMADD
#define FMADD3 FMADD
#define FMADD4 FNMSUB
#endif
#else
#ifndef CONJ
#define FMADD1 FMADD
#define FMADD2 FNMSUB
#define FMADD3 FMADD
#define FMADD4 FMADD
#else
#define FMADD1 FMADD
#define FMADD2 FMADD
#define FMADD3 FNMSUB
#define FMADD4 FMADD
#endif
#endif

/* Accumulators: (y1,y2) AO1, (y3,y4) AO2, (y5,y6) AO3, (y7,y8) AO4. */
#define y1 f0
#define y2 f1
#define y3 f2
#define y4 f3
#define y5 f4
#define y6 f5
#define y7 f6
#define y8 f7

/* Values loaded from A: (a1,a2) AO1, (a3,a4) AO2, (a5,a6) AO3, (a7,a8) AO4. */
#define a1 f8
#define a2 f9
#define a3 f10
#define a4 f11
#define a5 f12
#define a6 f13
#define a7 f14
#define a8 f15

/* Values loaded from X1; only b1..b4 are used in the visible excerpt. */
#define b1 f16
#define b2 f17
#define b3 f18
#define b4 f19
#define b5 f20
#define b6 f21
#define b7 f22
#define b8 f23

#define alpha_r f24
#define alpha_i f25

	/*
	 * Move f1 / f2 out of the way: f1 and f2 are also y2 and y3 and
	 * are overwritten when the accumulators are cleared in LL(11).
	 */
	fmr alpha_r, f1
	fmr alpha_i, f2

	/*
	 * Scale LDA, INCX and INCY by 2^ZBASE_SHIFT.
	 * NOTE(review): ZBASE_SHIFT comes from common.h; presumably it
	 * converts element counts to byte offsets - confirm. These are
	 * 32-bit shifts (slwi) even in the __64BIT__ configuration.
	 */
	slwi LDA, LDA, ZBASE_SHIFT
	slwi INCX, INCX, ZBASE_SHIFT
	slwi INCY, INCY, ZBASE_SHIFT

	li PREA, PREFETCHSIZE_A * SIZE
	li PREC, PREFETCHSIZE_C * SIZE

	/*
	 * Bias pointers and strides for the update-form loads used below:
	 *   - every load from A uses displacement 1 * SIZE with update, so
	 *     A starts one SIZE before the first value;
	 *   - x is read as "LFDUX ..., X, INCX" followed by
	 *     "LFDU ..., 1 * SIZE(X)", so INCX is reduced by SIZE and X is
	 *     moved back by the reduced INCX; Y / INCY get the same bias.
	 */
	addi A, A, -SIZE
	addi INCX, INCX, -SIZE
	addi INCY, INCY, -SIZE

	sub X, X, INCX
	sub Y, Y, INCY

	/* Keep a copy of the biased Y (not used in the visible excerpt). */
	mr YY, Y

	/* Nothing to do when M <= 0 or N <= 0. */
	cmpwi cr0, M, 0
	ble LL(999)
	cmpwi cr0, N, 0
	ble LL(999)

	/*
	 * If the biased INCX equals SIZE (scaled stride was 2 * SIZE) x is
	 * read in place through XP = X and the copy below is skipped.
	 */
	mr XP, X
	cmpwi cr0, INCX, SIZE
	beq LL(10)

	/* Otherwise gather x into BUFFER; both cursors are biased by -SIZE. */
	addi XP, BUFFER, -SIZE
	addi X1, BUFFER, -SIZE

	/* CTR = M >> 2: copy four value pairs per iteration. */
	srawi. r0, M, 2
	mtspr CTR, r0
	ble LL(05)
	.align 4

LL(02):
	LFDUX f0, X, INCX
	LFDU f1, 1 * SIZE(X)
	LFDUX f2, X, INCX
	LFDU f3, 1 * SIZE(X)
	LFDUX f4, X, INCX
	LFDU f5, 1 * SIZE(X)
	LFDUX f6, X, INCX
	LFDU f7, 1 * SIZE(X)

	STFDU f0, 1 * SIZE(X1)
	STFDU f1, 1 * SIZE(X1)
	STFDU f2, 1 * SIZE(X1)
	STFDU f3, 1 * SIZE(X1)
	STFDU f4, 1 * SIZE(X1)
	STFDU f5, 1 * SIZE(X1)
	STFDU f6, 1 * SIZE(X1)
	STFDU f7, 1 * SIZE(X1)
	bdnz LL(02)
	.align 4

LL(05):
	/* CTR = M & 3: copy the remaining value pairs one at a time. */
	andi. r0, M, 3
	mtspr CTR, r0
	ble LL(10)
	.align 4

LL(06):
	LFDUX f0, X, INCX
	LFDU f1, 1 * SIZE(X)
	STFDU f0, 1 * SIZE(X1)
	STFDU f1, 1 * SIZE(X1)
	bdnz LL(06)
	.align 4

LL(10):
	/* J = N >> 2 column groups; none means go straight to LL(20). */
	srawi. J, N, 2
	ble LL(20)
	.align 4

LL(11):
	/*
	 * Set up one group of four columns: clear y1..y8 from the zero
	 * slot, point AO1..AO4 at four columns LDA apart, advance A past
	 * the group and rewind the x cursor to XP. Pointer setup is
	 * interleaved with the register clears.
	 */
	lfd y1, FZERO
	mr AO1, A
	fmr y2, y1
	mr X1, XP
	fmr y3, y1
	add AO2, A, LDA
	fmr y4, y1
	add AO3, AO2, LDA
	fmr y5, y1
	add AO4, AO3, LDA
	fmr y6, y1
	add A, AO4, LDA
	fmr y7, y1

	/* Cache hint for a later store near Y + PREC. */
	dcbtst PREC, Y
	fmr y8, y1

	/* CTR = M >> 2; fewer than four rows are handled at LL(15). */
	srawi. r0, M, 2
	mtspr CTR, r0
	ble LL(15)

	/*
	 * Preload the first pair of every column (a8 excepted, it is
	 * loaded at the top of LL(12) / LL(13)) and the first x pair.
	 */
	LFDU a1, 1 * SIZE(AO1)
	LFDU b1, 1 * SIZE(X1)
	LFDU a2, 1 * SIZE(AO1)
	LFDU b2, 1 * SIZE(X1)
	LFDU a3, 1 * SIZE(AO2)
	LFDU a4, 1 * SIZE(AO2)
	LFDU a5, 1 * SIZE(AO3)
	LFDU a6, 1 * SIZE(AO3)
	LFDU a7, 1 * SIZE(AO4)
	/* Exactly one block of four rows: only the drain at LL(13) runs. */
	bdz LL(13)
	.align 5

LL(12):
	/*
	 * Steady-state loop, four rows per iteration. The x pair
	 * alternates between (b1,b2) and (b3,b4) so that the next pair
	 * can be loaded while the current one is in use; loads for the
	 * next row are interleaved with the multiply-adds of this row.
	 */

	/* Row 1 of 4: x pair in (b1,b2). */
	FMADD1 y1, a1, b1, y1
	LFDU a8, 1 * SIZE(AO4)
	FMADD2 y2, a1, b2, y2
	LFDU b3, 1 * SIZE(X1)
	FMADD1 y3, a3, b1, y3
	LFDU b4, 1 * SIZE(X1)
	FMADD2 y4, a3, b2, y4

#ifdef PPCG4
	dcbt AO1, PREA
#endif

	FMADD3 y1, a2, b2, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b1, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b2, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b1, y4
	LFDU a4, 1 * SIZE(AO2)

#ifdef PPCG4
	dcbt X1, PREA
#endif

	FMADD1 y5, a5, b1, y5
	FMADD2 y6, a5, b2, y6
	FMADD1 y7, a7, b1, y7
	FMADD2 y8, a7, b2, y8

#ifdef PPCG4
	dcbt AO2, PREA
#endif

	FMADD3 y5, a6, b2, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b1, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b2, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b1, y8
	LFDU a8, 1 * SIZE(AO4)

	/* Row 2 of 4: x pair in (b3,b4). */
	FMADD1 y1, a1, b3, y1
	LFDU b1, 1 * SIZE(X1)
	FMADD2 y2, a1, b4, y2
	LFDU b2, 1 * SIZE(X1)
	FMADD1 y3, a3, b3, y3
	FMADD2 y4, a3, b4, y4

#ifdef PPCG4
	dcbt AO3, PREA
#endif

	FMADD3 y1, a2, b4, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b3, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b4, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b3, y4
	LFDU a4, 1 * SIZE(AO2)

	FMADD1 y5, a5, b3, y5
	FMADD2 y6, a5, b4, y6
	FMADD1 y7, a7, b3, y7
	FMADD2 y8, a7, b4, y8

#ifdef PPCG4
	dcbt AO4, PREA
#endif

	FMADD3 y5, a6, b4, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b3, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b4, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b3, y8
	LFDU a8, 1 * SIZE(AO4)

	/*
	 * Row 3 of 4: x pair in (b1,b2). From here on the dcbt hints are
	 * emitted only when DOUBLE is defined as well as PPCG4.
	 */
	FMADD1 y1, a1, b1, y1
	LFDU b3, 1 * SIZE(X1)
	FMADD2 y2, a1, b2, y2
	LFDU b4, 1 * SIZE(X1)
	FMADD1 y3, a3, b1, y3
	FMADD2 y4, a3, b2, y4

#if defined(PPCG4) && defined(DOUBLE)
	dcbt AO1, PREA
#endif

	FMADD3 y1, a2, b2, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b1, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b2, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b1, y4
	LFDU a4, 1 * SIZE(AO2)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt X1, PREA
#endif

	FMADD1 y5, a5, b1, y5
	FMADD2 y6, a5, b2, y6
	FMADD1 y7, a7, b1, y7
	FMADD2 y8, a7, b2, y8

#if defined(PPCG4) && defined(DOUBLE)
	dcbt AO2, PREA
#endif

	FMADD3 y5, a6, b2, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b1, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b2, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b1, y8
	LFDU a8, 1 * SIZE(AO4)

	/* Row 4 of 4: x pair in (b3,b4); (b1,b2) is reloaded for the next pass. */
	FMADD1 y1, a1, b3, y1
	FMADD2 y2, a1, b4, y2
	FMADD1 y3, a3, b3, y3
	FMADD2 y4, a3, b4, y4

#if defined(PPCG4) && defined(DOUBLE)
	dcbt AO3, PREA
#endif

	FMADD3 y1, a2, b4, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b3, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b4, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b3, y4
	LFDU a4, 1 * SIZE(AO2)

	FMADD1 y5, a5, b3, y5
	LFDU b1, 1 * SIZE(X1)
	FMADD2 y6, a5, b4, y6
	LFDU b2, 1 * SIZE(X1)
	FMADD1 y7, a7, b3, y7
	FMADD2 y8, a7, b4, y8

#if defined(PPCG4) && defined(DOUBLE)
	dcbt AO4, PREA
#endif

	/* a8 is deliberately not reloaded here; the loop top does it. */
	FMADD3 y5, a6, b4, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b3, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b4, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b3, y8
	bdnz LL(12)
	.align 4

LL(13):
	/*
	 * Drain: the final block of four rows. Same arithmetic as LL(12)
	 * without the dcbt hints, and the last row issues no further
	 * loads from A or X1.
	 */

	/* Row 1 of 4: x pair in (b1,b2). */
	FMADD1 y1, a1, b1, y1
	LFDU a8, 1 * SIZE(AO4)
	FMADD2 y2, a1, b2, y2
	LFDU b3, 1 * SIZE(X1)
	FMADD1 y3, a3, b1, y3
	LFDU b4, 1 * SIZE(X1)
	FMADD2 y4, a3, b2, y4

	FMADD3 y1, a2, b2, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b1, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b2, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b1, y4
	LFDU a4, 1 * SIZE(AO2)

	FMADD1 y5, a5, b1, y5
	FMADD2 y6, a5, b2, y6
	FMADD1 y7, a7, b1, y7
	FMADD2 y8, a7, b2, y8

	FMADD3 y5, a6, b2, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b1, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b2, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b1, y8
	LFDU a8, 1 * SIZE(AO4)

	/* Row 2 of 4: x pair in (b3,b4). */
	FMADD1 y1, a1, b3, y1
	LFDU b1, 1 * SIZE(X1)
	FMADD2 y2, a1, b4, y2
	LFDU b2, 1 * SIZE(X1)
	FMADD1 y3, a3, b3, y3
	FMADD2 y4, a3, b4, y4

	FMADD3 y1, a2, b4, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b3, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b4, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b3, y4
	LFDU a4, 1 * SIZE(AO2)

	FMADD1 y5, a5, b3, y5
	FMADD2 y6, a5, b4, y6
	FMADD1 y7, a7, b3, y7
	FMADD2 y8, a7, b4, y8

	FMADD3 y5, a6, b4, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b3, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b4, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b3, y8
	LFDU a8, 1 * SIZE(AO4)

	/* Row 3 of 4: x pair in (b1,b2). */
	FMADD1 y1, a1, b1, y1
	LFDU b3, 1 * SIZE(X1)
	FMADD2 y2, a1, b2, y2
	LFDU b4, 1 * SIZE(X1)
	FMADD1 y3, a3, b1, y3
	FMADD2 y4, a3, b2, y4

	FMADD3 y1, a2, b2, y1
	LFDU a1, 1 * SIZE(AO1)
	FMADD4 y2, a2, b1, y2
	LFDU a2, 1 * SIZE(AO1)
	FMADD3 y3, a4, b2, y3
	LFDU a3, 1 * SIZE(AO2)
	FMADD4 y4, a4, b1, y4
	LFDU a4, 1 * SIZE(AO2)

	FMADD1 y5, a5, b1, y5
	FMADD2 y6, a5, b2, y6
	FMADD1 y7, a7, b1, y7
	FMADD2 y8, a7, b2, y8

	FMADD3 y5, a6, b2, y5
	LFDU a5, 1 * SIZE(AO3)
	FMADD4 y6, a6, b1, y6
	LFDU a6, 1 * SIZE(AO3)
	FMADD3 y7, a8, b2, y7
	LFDU a7, 1 * SIZE(AO4)
	FMADD4 y8, a8, b1, y8
	LFDU a8, 1 * SIZE(AO4)

	/* Row 4 of 4: x pair in (b3,b4); arithmetic only, no more loads. */
	FMADD1 y1, a1, b3, y1
	FMADD2 y2, a1, b4, y2
	FMADD1 y3, a3, b3, y3
	FMADD2 y4, a3, b4, y4

	FMADD3 y1, a2, b4, y1
	FMADD4 y2, a2, b3, y2
	FMADD3 y3, a4, b4, y3
	FMADD4 y4, a4, b3, y4

	FMADD1 y5, a5, b3, y5
	FMADD2 y6, a5, b4, y6
	FMADD1 y7, a7, b3, y7
	FMADD2 y8, a7, b4, y8

	FMADD3 y5, a6, b4, y5
	FMADD4 y6, a6, b3, y6
	FMADD3 y7, a8, b4, y7
	FMADD4 y8, a8, b3, y8
	.align 4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -