/* gemv_t.s  (web-viewer page-title / font-size chrome removed) */
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* gemv_t: transposed GEMV kernel for PowerPC (GAS syntax, via the   */
/* C preprocessor).  The main loop accumulates dot products of eight */
/* consecutive columns of A (pointers AO1..AO8, stride LDA) against  */
/* the vector panel at BO.                                           */
/*                                                                   */
/* NOTE(review): the captured source had all newlines stripped; they */
/* are restored here (one directive/instruction per line, as the     */
/* preprocessor and assembler require).  The capture is TRUNCATED    */
/* inside the LL(12) loop below — see the note at the end.           */

#define ASSEMBLER
#include "common.h"

#ifndef NEEDPARAM
#ifndef DOUBLE
#include "sparam_t.h"
#else
#include "dparam_t.h"
#endif
#endif

/* Incoming argument registers — layout differs per OS ABI and per   */
/* 32/64-bit mode (the trailing arguments arrive on the stack and    */
/* are loaded after the prologue).                                   */
#ifdef linux
#ifndef __64BIT__
#define M	r3
#define N	r4
#define A	r6
#define LDA	r7
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#else
#define M	r3
#define N	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define N	r4
#define A	r8
#define LDA	r9
#define X	r10
#define INCX	r5
#define Y	r6
#define INCY	r7
#else
#define M	r3
#define N	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#endif
#endif

/* Scratch integer registers (r14-r29 are callee-saved and are       */
/* spilled in the prologue below).                                   */
#define BUFFER	r11
#define XP	r12
#define AO1	r14
#define AO2	r15
#define AO3	r16
#define AO4	r17
#define AO5	r18
#define AO6	r19
#define AO7	r20
#define AO8	r21
#define MIN_N	r22
#define J	r23
#define CO	r24
#define PREA	r25
#define PREC	r26
#define BO	r27
#define PLDA_M	r28
#define IS	r29

#define Y1	CO

/* Per-core prefetch distances, in elements. */
#if defined(PPCG4)
#define PREFETCHSIZE_A	42
#define PREFETCHSIZE_C	16
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A	42
#define PREFETCHSIZE_C	16
#endif

#ifdef PPC970
#define PREFETCHSIZE_A	42
#define PREFETCHSIZE_C	16
#endif

#ifdef CELL
#define PREFETCHSIZE_A	42
#define PREFETCHSIZE_C	16
#endif

#ifdef POWER4
#define PREFETCHSIZE_A	48
#define PREFETCHSIZE_C	16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A	24
#define PREFETCHSIZE_C	8
#endif

/* FP register roles: y01-y16 = two interleaved sets of partial sums */
/* for the 8 active columns; a1-a8 = matrix elements; b1-b8 = vector */
/* elements.                                                         */
#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3
#define y05	f4
#define y06	f5
#define y07	f6
#define y08	f7
#define y09	f8
#define y10	f9
#define y11	f10
#define y12	f11
#define y13	f12
#define y14	f13
#define y15	f14
#define y16	f15

#define a1	f16
#define a2	f17
#define a3	f18
#define a4	f19
#define a5	f20
#define a6	f21
#define a7	f22
#define a8	f23

#define b1	f24
#define b2	f25
#define b3	f26
#define b4	f27
#define b5	f28
#define b6	f29
#define b7	f30
#define b8	f31

/* NOTE(review): alpha aliases b8 (both f31); presumably alpha is    */
/* reloaded from the ALPHA stack slot only after the last use of b8  */
/* — confirm against the complete file (truncated here).             */
#define alpha	f31

#ifndef NEEDPARAM

#define P	2048

#ifndef __64BIT__
#define STACKSIZE	224
#else
#define STACKSIZE	288
#endif

#define FZERO	144(SP)
#define ALPHA	152(SP)

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0

	/* Spill callee-saved FP registers f14-f31. */
	stfd	f14,   0(SP)
	stfd	f15,   8(SP)
	stfd	f16,  16(SP)
	stfd	f17,  24(SP)
	stfd	f18,  32(SP)
	stfd	f19,  40(SP)
	stfd	f20,  48(SP)
	stfd	f21,  56(SP)
	stfd	f22,  64(SP)
	stfd	f23,  72(SP)
	stfd	f24,  80(SP)
	stfd	f25,  88(SP)
	stfd	f26,  96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)
	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

/* Park 0.0 at FZERO and the alpha argument (f1) at ALPHA, then      */
/* spill callee-saved r14-r29.                                       */
#ifdef __64BIT__
	std	r0,  FZERO
	stfd	f1,  ALPHA
	std	r14, 160(SP)
	std	r15, 168(SP)
	std	r16, 176(SP)
	std	r17, 184(SP)
	std	r18, 192(SP)
	std	r19, 200(SP)
	std	r20, 208(SP)
	std	r21, 216(SP)
	std	r22, 224(SP)
	std	r23, 232(SP)
	std	r24, 240(SP)
	std	r25, 248(SP)
	std	r26, 256(SP)
	std	r27, 264(SP)
	std	r28, 272(SP)
	std	r29, 280(SP)
#else
	stw	r0,  0 + FZERO
	stw	r0,  4 + FZERO
	stfd	f1,  ALPHA
	stw	r14, 160(SP)
	stw	r15, 164(SP)
	stw	r16, 168(SP)
	stw	r17, 172(SP)
	stw	r18, 176(SP)
	stw	r19, 180(SP)
	stw	r20, 184(SP)
	stw	r21, 188(SP)
	stw	r22, 192(SP)
	stw	r23, 196(SP)
	stw	r24, 200(SP)
	stw	r25, 204(SP)
	stw	r26, 208(SP)
	stw	r27, 212(SP)
	stw	r28, 216(SP)
	stw	r29, 220(SP)
#endif

/* Load the arguments that arrived on the caller's stack.            */
#ifdef linux
#ifndef __64BIT__
	lwz	INCY,     8 + STACKSIZE(SP)
	lwz	BUFFER,  12 + STACKSIZE(SP)
#else
	ld	Y,      112 + STACKSIZE(SP)
	ld	INCY,   120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	INCX,    56 + STACKSIZE(SP)
	lwz	Y,       60 + STACKSIZE(SP)
	lwz	INCY,    64 + STACKSIZE(SP)
	lwz	BUFFER,  68 + STACKSIZE(SP)
#else
	lwz	Y,       56 + STACKSIZE(SP)
	lwz	INCY,    60 + STACKSIZE(SP)
	lwz	BUFFER,  64 + STACKSIZE(SP)
#endif
#else
	ld	Y,      112 + STACKSIZE(SP)
	ld	INCY,   120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

	/* PLDA_M = (LDA * N - P) * SIZE: presumably the byte step that  */
	/* rewinds A to the next panel of P rows — confirm in full file. */
	mullw	PLDA_M, LDA, N
	li	XP, P
	subf	PLDA_M, XP, PLDA_M
	slwi	PLDA_M, PLDA_M, BASE_SHIFT

	/* Scale strides from elements to bytes. */
	slwi	LDA,  LDA,  BASE_SHIFT
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	subf	Y, INCY, Y		/* bias Y for pre-increment stores */
	li	IS, 0			/* IS = current panel start row */
	addi	A, A, -SIZE		/* bias A for 1*SIZE-based offsets */

	li	PREA, PREFETCHSIZE_A * SIZE
	li	PREC, PREFETCHSIZE_C * SIZE

	/* Quick exits for empty problems. */
	cmpi	cr0, 0, M, 0
	ble	LL(999)
	cmpi	cr0, 0, N, 0
	ble	LL(999)
	.align 4

/* Outer loop over panels of at most P rows of x / A.                */
LL(ISLoop):
	subf	MIN_N, IS, M		/* MIN_N = min(M - IS, P) */
	slwi	r0, IS, BASE_SHIFT
	cmpi	cr0, 0, MIN_N, P
	ble+	LL(min_nP)
	li	MIN_N, P
LL(min_nP):
	add	XP, X, r0
	cmpi	cr0, 0, INCX, SIZE
	beq	LL(10)			/* unit stride: use x in place */

	/* Non-unit INCX: gather this panel of x into BUFFER so the      */
	/* kernel can stream it contiguously.                            */
	mr	XP, BUFFER
	addi	CO, BUFFER, -SIZE	/* bias for STFDU pre-increment */

	srawi.	r0, MIN_N, 3		/* 8 elements per copy iteration */
	mtspr	CTR, r0
	ble	LL(CopyRemain)
	.align 4

LL(CopyKernel):
	LFD	f0, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f1, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f2, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f3, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f4, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f5, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f6, 0 * SIZE(X)
	add	X, X, INCX
	LFD	f7, 0 * SIZE(X)
	add	X, X, INCX

	STFD	f0, 1 * SIZE(CO)
	STFD	f1, 2 * SIZE(CO)
	STFD	f2, 3 * SIZE(CO)
	STFD	f3, 4 * SIZE(CO)
	STFD	f4, 5 * SIZE(CO)
	STFD	f5, 6 * SIZE(CO)
	STFD	f6, 7 * SIZE(CO)
	STFDU	f7, 8 * SIZE(CO)
	bdnz	LL(CopyKernel)
	.align 4

LL(CopyRemain):
	andi.	r0, MIN_N, 7		/* leftover 0..7 elements */
	mtspr	CTR, r0
	ble	LL(10)
	.align 4

LL(CopySub):
	LFD	f0, 0 * SIZE(X)
	add	X, X, INCX
	STFDU	f0, 1 * SIZE(CO)
	bdnz	LL(CopySub)
	.align 4

/* Column loop: process N in groups of 8 columns.                    */
LL(10):
	mr	CO, Y
	addi	XP, XP, -SIZE		/* bias XP for 1*SIZE-based offsets */
	srawi.	J, N, 3
	ble	LL(20)
	.align 4

LL(11):
	/* AO1..AO8 -> eight consecutive columns of A; advance A past them. */
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	AO5, AO4, LDA
	add	AO6, AO5, LDA
	add	AO7, AO6, LDA
	add	AO8, AO7, LDA
	add	A,   AO8, LDA

	mr	BO, XP

	/* Clear all sixteen accumulators from the FZERO stack slot. */
	lfd	y01, FZERO
	fmr	y02, y01
	fmr	y03, y01
	fmr	y04, y01
	fmr	y05, y01
	fmr	y06, y01
	fmr	y07, y01
	fmr	y08, y01
	fmr	y09, y01
	fmr	y10, y01
	fmr	y11, y01
	fmr	y12, y01
	fmr	y13, y01
	fmr	y14, y01
	fmr	y15, y01
	fmr	y16, y01

	PREFETCH_Y			/* macro from common.h — presumably */
					/* prefetches the y tile; confirm   */

	srawi.	r0, MIN_N, 4		/* 16 rows per unrolled iteration */
	mtspr	CTR, r0
	ble	LL(14)

	/* Software-pipelined preload of the first row and 8 x-elements. */
	LFD	a1, 1 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO2)
	LFD	a3, 1 * SIZE(AO3)
	LFD	a4, 1 * SIZE(AO4)
	LFD	a5, 1 * SIZE(AO5)
	LFD	a6, 1 * SIZE(AO6)
	LFD	a7, 1 * SIZE(AO7)
	LFD	a8, 1 * SIZE(AO8)

	LFD	b1, 1 * SIZE(BO)
	LFD	b2, 2 * SIZE(BO)
	LFD	b3, 3 * SIZE(BO)
	LFD	b4, 4 * SIZE(BO)
	LFD	b5, 5 * SIZE(BO)
	LFD	b6, 6 * SIZE(BO)
	LFD	b7, 7 * SIZE(BO)
	LFD	b8, 8 * SIZE(BO)
	bdz	LL(13)
	.align 4

/* Main kernel: 16 rows x 8 columns per iteration; each FMADD is     */
/* paired with the load of the same a-register for a later row to    */
/* hide load latency.  y01-y08 and y09-y16 alternate as the          */
/* destination set so consecutive FMADDs are independent.            */
LL(12):
	FMADD	y01, a1, b1, y01
	LFD	a1, 2 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 2 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFD	a3, 2 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFD	a4, 2 * SIZE(AO4)
	FMADD	y05, a5, b1, y05
	LFD	a5, 2 * SIZE(AO5)
	FMADD	y06, a6, b1, y06
	LFD	a6, 2 * SIZE(AO6)
	FMADD	y07, a7, b1, y07
	LFD	a7, 2 * SIZE(AO7)
	FMADD	y08, a8, b1, y08
	LFD	a8, 2 * SIZE(AO8)

	FMADD	y09, a1, b2, y09
	LFD	a1, 3 * SIZE(AO1)
	FMADD	y10, a2, b2, y10
	LFD	a2, 3 * SIZE(AO2)
	FMADD	y11, a3, b2, y11
	LFD	a3, 3 * SIZE(AO3)
	FMADD	y12, a4, b2, y12
	LFD	a4, 3 * SIZE(AO4)
	FMADD	y13, a5, b2, y13
	LFD	a5, 3 * SIZE(AO5)
	FMADD	y14, a6, b2, y14
	LFD	a6, 3 * SIZE(AO6)
	FMADD	y15, a7, b2, y15
	LFD	a7, 3 * SIZE(AO7)
	FMADD	y16, a8, b2, y16
	LFD	a8, 3 * SIZE(AO8)

	FMADD	y01, a1, b3, y01
	LFD	a1, 4 * SIZE(AO1)
	FMADD	y02, a2, b3, y02
	LFD	a2, 4 * SIZE(AO2)
	FMADD	y03, a3, b3, y03
	LFD	a3, 4 * SIZE(AO3)
	FMADD	y04, a4, b3, y04
	LFD	a4, 4 * SIZE(AO4)
	FMADD	y05, a5, b3, y05
	LFD	a5, 4 * SIZE(AO5)
	FMADD	y06, a6, b3, y06
	LFD	a6, 4 * SIZE(AO6)
	FMADD	y07, a7, b3, y07
	LFD	a7, 4 * SIZE(AO7)
	FMADD	y08, a8, b3, y08
	LFD	a8, 4 * SIZE(AO8)

	FMADD	y09, a1, b4, y09
	LFD	a1, 5 * SIZE(AO1)
	FMADD	y10, a2, b4, y10
	LFD	a2, 5 * SIZE(AO2)
	FMADD	y11, a3, b4, y11
	LFD	a3, 5 * SIZE(AO3)
	FMADD	y12, a4, b4, y12
	LFD	a4, 5 * SIZE(AO4)
	FMADD	y13, a5, b4, y13
	LFD	a5, 5 * SIZE(AO5)
	FMADD	y14, a6, b4, y14
	LFD	a6, 5 * SIZE(AO6)
	FMADD	y15, a7, b4, y15
	LFD	a7, 5 * SIZE(AO7)
	FMADD	y16, a8, b4, y16
	LFD	a8, 5 * SIZE(AO8)

	/* Refill the first four x-elements for the next half. */
	LFD	b1,  9 * SIZE(BO)
	LFD	b2, 10 * SIZE(BO)
	LFD	b3, 11 * SIZE(BO)
	LFD	b4, 12 * SIZE(BO)

	FMADD	y01, a1, b5, y01
	LFD	a1, 6 * SIZE(AO1)
	FMADD	y02, a2, b5, y02
	LFD	a2, 6 * SIZE(AO2)
	FMADD	y03, a3, b5, y03
	LFD	a3, 6 * SIZE(AO3)
	FMADD	y04, a4, b5, y04
	LFD	a4, 6 * SIZE(AO4)
	FMADD	y05, a5, b5, y05
	LFD	a5, 6 * SIZE(AO5)
	FMADD	y06, a6, b5, y06
	LFD	a6, 6 * SIZE(AO6)
	FMADD	y07, a7, b5, y07
	LFD	a7, 6 * SIZE(AO7)
	FMADD	y08, a8, b5, y08
	LFD	a8, 6 * SIZE(AO8)

	FMADD	y09, a1, b6, y09
	LFD	a1, 7 * SIZE(AO1)
	FMADD	y10, a2, b6, y10
	LFD	a2, 7 * SIZE(AO2)
	FMADD	y11, a3, b6, y11
	LFD	a3, 7 * SIZE(AO3)
	FMADD	y12, a4, b6, y12
	LFD	a4, 7 * SIZE(AO4)
	FMADD	y13, a5, b6, y13
	LFD	a5, 7 * SIZE(AO5)
	FMADD	y14, a6, b6, y14
	LFD	a6, 7 * SIZE(AO6)
	FMADD	y15, a7, b6, y15
	LFD	a7, 7 * SIZE(AO7)
	FMADD	y16, a8, b6, y16
	LFD	a8, 7 * SIZE(AO8)

	FMADD	y01, a1, b7, y01
	LFD	a1, 8 * SIZE(AO1)
	FMADD	y02, a2, b7, y02
	LFD	a2, 8 * SIZE(AO2)
	FMADD	y03, a3, b7, y03
	LFD	a3, 8 * SIZE(AO3)
	FMADD	y04, a4, b7, y04
	LFD	a4, 8 * SIZE(AO4)
	FMADD	y05, a5, b7, y05
	LFD	a5, 8 * SIZE(AO5)
	FMADD	y06, a6, b7, y06
	LFD	a6, 8 * SIZE(AO6)
	FMADD	y07, a7, b7, y07
	LFD	a7, 8 * SIZE(AO7)
	FMADD	y08, a8, b7, y08
	LFD	a8, 8 * SIZE(AO8)

	FMADD	y09, a1, b8, y09
	LFD	a1, 9 * SIZE(AO1)
	FMADD	y10, a2, b8, y10
	LFD	a2, 9 * SIZE(AO2)
	FMADD	y11, a3, b8, y11
	LFD	a3, 9 * SIZE(AO3)
	FMADD	y12, a4, b8, y12
	LFD	a4, 9 * SIZE(AO4)
	FMADD	y13, a5, b8, y13
	LFD	a5, 9 * SIZE(AO5)
	FMADD	y14, a6, b8, y14
	LFD	a6, 9 * SIZE(AO6)
	FMADD	y15, a7, b8, y15
	LFD	a7, 9 * SIZE(AO7)
	FMADD	y16, a8, b8, y16
	LFD	a8, 9 * SIZE(AO8)

	LFD	b5, 13 * SIZE(BO)
	LFD	b6, 14 * SIZE(BO)
	LFD	b7, 15 * SIZE(BO)
	LFD	b8, 16 * SIZE(BO)

	PREFETCH_A1
	PREFETCH_A2
	PREFETCH_A3
	PREFETCH_A4

	FMADD	y01, a1, b1, y01
	LFD	a1, 10 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 10 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFD	a3, 10 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFD	a4, 10 * SIZE(AO4)
	FMADD	y05, a5, b1, y05
	LFD	a5, 10 * SIZE(AO5)
	FMADD	y06, a6, b1, y06
	LFD	a6, 10 * SIZE(AO6)
	FMADD	y07, a7, b1, y07
	LFD	a7, 10 * SIZE(AO7)
	FMADD	y08, a8, b1, y08
	LFD	a8, 10 * SIZE(AO8)

	FMADD	y09, a1, b2, y09
	LFD	a1, 11 * SIZE(AO1)
	FMADD	y10, a2, b2, y10
	LFD	a2, 11 * SIZE(AO2)
	FMADD	y11, a3, b2, y11
	LFD	a3, 11 * SIZE(AO3)
	FMADD	y12, a4, b2, y12
	LFD	a4, 11 * SIZE(AO4)
	FMADD	y13, a5, b2, y13
	LFD	a5, 11 * SIZE(AO5)
	FMADD	y14, a6, b2, y14
	LFD	a6, 11 * SIZE(AO6)
	FMADD	y15, a7, b2, y15
	LFD	a7, 11 * SIZE(AO7)
	FMADD	y16, a8, b2, y16
	LFD	a8, 11 * SIZE(AO8)

	FMADD	y01, a1, b3, y01
	LFD	a1, 12 * SIZE(AO1)
	FMADD	y02, a2, b3, y02
	LFD	a2, 12 * SIZE(AO2)
	FMADD	y03, a3, b3, y03
	LFD	a3, 12 * SIZE(AO3)
	FMADD	y04, a4, b3, y04
	LFD	a4, 12 * SIZE(AO4)
	FMADD	y05, a5, b3, y05
	LFD	a5, 12 * SIZE(AO5)
	FMADD	y06, a6, b3, y06
	LFD	a6, 12 * SIZE(AO6)
	FMADD	y07, a7, b3, y07
	LFD	a7, 12 * SIZE(AO7)
	FMADD	y08, a8, b3, y08
	LFD	a8, 12 * SIZE(AO8)

	FMADD	y09, a1, b4, y09
	LFD	a1, 13 * SIZE(AO1)
	FMADD	y10, a2, b4, y10
	LFD	a2, 13 * SIZE(AO2)
	FMADD	y11, a3, b4, y11
	LFD	a3, 13 * SIZE(AO3)
	FMADD	y12, a4, b4, y12
	LFD	a4, 13 * SIZE(AO4)
	FMADD	y13, a5, b4, y13
	LFD	a5, 13 * SIZE(AO5)
	FMADD	y14, a6, b4, y14
	LFD	a6, 13 * SIZE(AO6)
	FMADD	y15, a7, b4, y15
	LFD	a7, 13 * SIZE(AO7)
	FMADD	y16, a8, b4, y16
	LFD	a8, 13 * SIZE(AO8)

	LFD	b1, 17 * SIZE(BO)
	LFD	b2, 18 * SIZE(BO)
	LFD	b3, 19 * SIZE(BO)
	LFD	b4, 20 * SIZE(BO)

	FMADD	y01, a1, b5, y01
	LFD	a1, 14 * SIZE(AO1)
	FMADD	y02, a2, b5, y02
	LFD	a2, 14 * SIZE(AO2)
	FMADD	y03, a3, b5, y03
	LFD	a3, 14 * SIZE(AO3)
	FMADD	y04, a4, b5, y04
	LFD	a4, 14 * SIZE(AO4)
	FMADD	y05, a5, b5, y05
	LFD	a5, 14 * SIZE(AO5)
	FMADD	y06, a6, b5, y06
	LFD	a6, 14 * SIZE(AO6)
	FMADD	y07, a7, b5, y07
	LFD	a7, 14 * SIZE(AO7)
	FMADD	y08, a8, b5, y08
	LFD	a8, 14 * SIZE(AO8)

	FMADD	y09, a1, b6, y09
	LFD	a1, 15 * SIZE(AO1)
	FMADD	y10, a2, b6, y10
	LFD	a2, 15 * SIZE(AO2)
	FMADD	y11, a3, b6, y11
	LFD	a3, 15 * SIZE(AO3)
	FMADD	y12, a4, b6, y12
	LFD	a4, 15 * SIZE(AO4)
	FMADD	y13, a5, b6, y13
	LFD	a5, 15 * SIZE(AO5)
	FMADD	y14, a6, b6, y14
	LFD	a6, 15 * SIZE(AO6)
	FMADD	y15, a7, b6, y15
	LFD	a7, 15 * SIZE(AO7)
	FMADD	y16, a8, b6, y16
	LFD	a8, 15 * SIZE(AO8)

	FMADD	y01, a1, b7, y01
	LFD	a1, 16 * SIZE(AO1)
	FMADD	y02, a2, b7, y02
	LFD	a2, 16 * SIZE(AO2)
	FMADD	y03, a3, b7, y03
	LFD	a3, 16 * SIZE(AO3)
	FMADD	y04, a4, b7, y04
	LFD	a4, 16 * SIZE(AO4)
	FMADD	y05, a5, b7, y05
	LFD	a5, 16 * SIZE(AO5)
	FMADD	y06, a6, b7, y06
	LFD	a6, 16 * SIZE(AO6)	/* scrape showed "6 * SIZE"; "16" restored
					   to match a1-a5 in this same b7 group */

/* NOTE(review): the source capture ENDS here, mid-iteration.  The   */
/* remainder of LL(12) (y07/y08, the b8 group, pointer bumps, bdnz), */
/* the drain block LL(13), the remainder loops LL(14)/LL(20), the    */
/* alpha scaling and y stores, LL(999) and the register-restore      */
/* epilogue are all missing and must be recovered from the original  */
/* distribution before this file can assemble.                       */
/* [end of capture — web-viewer keyboard-shortcut help removed; the  */
/* assembly above is truncated inside the LL(12) unrolled loop]      */