📄 gemv_t_ppc440.s
/*********************************************************************/
/*                                                                   */
/*  Optimized BLAS libraries                                         */
/*  By Kazushige Goto <kgoto@tacc.utexas.edu>                        */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifdef linux
#ifndef __64BIT__
#define M	r3
#define N	r4
#define A	r6
#define LDA	r7
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#else
#define M	r3
#define N	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define N	r4
#define A	r8
#define LDA	r9
#define X	r10
#define INCX	r5
#define Y	r6
#define INCY	r7
#else
#define M	r3
#define N	r4
#define A	r7
#define LDA	r8
#define X	r9
#define INCX	r10
#define Y	r5
#define INCY	r6
#endif
#endif

#define BUFFER	r11
#define XP	r12
#define AO1	r14
#define AO2	r15
#define AO3	r16
#define AO4	r17
#define J	r18
#define YY	r19
#define PREA	r20
#define PREC	r21
#define X1	r22

#if defined(PPCG4)
#define PREFETCHSIZE_A	42
#define PREFETCHSIZE_C	7
#endif

#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3
#define y05	f4
#define y06	f5
#define y07	f6
#define y08	f7

#define a1	f8
#define a2	f9
#define a3	f10
#define a4	f11
#define a5	f12
#define a6	f13
#define a7	f14
#define a8	f15

#define b1	f16
#define b2	f17
#define b3	f18
#define b4	f19
#define b5	f20
#define b6	f21
#define b7	f22
#define b8	f23

#define alpha	f23

#ifndef NEEDPARAM

#ifndef __64BIT__
#define STACKSIZE 224
#else
#define STACKSIZE 288
#endif

#define FZERO	144(SP)
#define ALPHA	152(SP)

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0, 0

	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)
	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)
	stfd	f22,   64(SP)
	stfd	f23,   72(SP)

#ifdef __64BIT__
	std	r0,  FZERO
	stfd	f1,  ALPHA

	std	r14,  160(SP)
	std	r15,  168(SP)
	std	r16,  176(SP)
	std	r17,  184(SP)
	std	r18,  192(SP)
	std	r19,  200(SP)
	std	r20,  208(SP)
	std	r21,  216(SP)
	std	r22,  224(SP)
#else
	stw	r0,   0 + FZERO
	stw	r0,   4 + FZERO
	stfd	f1,  ALPHA

	stw	r14,  160(SP)
	stw	r15,  164(SP)
	stw	r16,  168(SP)
	stw	r17,  172(SP)
	stw	r18,  176(SP)
	stw	r19,  180(SP)
	stw	r20,  184(SP)
	stw	r21,  188(SP)
	stw	r22,  192(SP)
#endif

#ifdef linux
#ifndef __64BIT__
	lwz	INCY,    8 + STACKSIZE(SP)
	lwz	BUFFER, 12 + STACKSIZE(SP)
#else
	ld	Y,      112 + STACKSIZE(SP)
	ld	INCY,   120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	INCX,   56 + STACKSIZE(SP)
	lwz	Y,      60 + STACKSIZE(SP)
	lwz	INCY,   64 + STACKSIZE(SP)
	lwz	BUFFER, 68 + STACKSIZE(SP)
#else
	lwz	Y,      56 + STACKSIZE(SP)
	lwz	INCY,   60 + STACKSIZE(SP)
	lwz	BUFFER, 64 + STACKSIZE(SP)
#endif
#else
	ld	Y,      112 + STACKSIZE(SP)
	ld	INCY,   120 + STACKSIZE(SP)
	ld	BUFFER, 128 + STACKSIZE(SP)
#endif
#endif

	slwi	LDA,  LDA,  BASE_SHIFT
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	addi	A, A, -SIZE
	sub	X, X, INCX
	sub	Y, Y, INCY

	li	PREA, PREFETCHSIZE_A * SIZE
	li	PREC, PREFETCHSIZE_C * SIZE

	cmpi	cr0, 0, M, 0
	ble	LL(999)
	cmpi	cr0, 0, N, 0
	ble	LL(999)

	mr	XP, X
	cmpi	cr0, 0, INCX, SIZE
	beq	LL(10)

	addi	XP, BUFFER, -SIZE
	addi	X1, BUFFER, -SIZE

	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(CopyRemain)
	.align 4

LL(CopyKernel):
	LFDUX	f0, X, INCX
	LFDUX	f1, X, INCX
	LFDUX	f2, X, INCX
	LFDUX	f3, X, INCX
	LFDUX	f4, X, INCX
	LFDUX	f5, X, INCX
	LFDUX	f6, X, INCX
	LFDUX	f7, X, INCX

	STFDU	f0, 1 * SIZE(X1)
	STFDU	f1, 1 * SIZE(X1)
	STFDU	f2, 1 * SIZE(X1)
	STFDU	f3, 1 * SIZE(X1)
	STFDU	f4, 1 * SIZE(X1)
	STFDU	f5, 1 * SIZE(X1)
	STFDU	f6, 1 * SIZE(X1)
	STFDU	f7, 1 * SIZE(X1)
	bdnz	LL(CopyKernel)
	.align 4

LL(CopyRemain):
	andi.	r0, M, 7
	mtspr	CTR, r0
	ble	LL(10)
	.align 4

LL(CopySub):
	LFDUX	f0, X, INCX
	STFDU	f0, 1 * SIZE(X1)
	bdnz	LL(CopySub)
	.align 4

LL(10):
	mr	YY, Y

	srawi.	J, N, 2
	ble	LL(30)
	.align 4

LL(21):
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	mr	X1, XP

	lfd	y01, FZERO
	fmr	y02, y01
	fmr	y03, y01
	fmr	y04, y01
	fmr	y05, y01
	fmr	y06, y01
	fmr	y07, y01
	fmr	y08, y01

	dcbtst	Y, PREC

	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(24)

	LFDU	a1, 1 * SIZE(AO1)
	LFDU	a2, 1 * SIZE(AO2)
	LFDU	a3, 1 * SIZE(AO3)
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b1, 1 * SIZE(X1)
	LFDU	b2, 1 * SIZE(X1)

	LFDU	a5, 1 * SIZE(AO1)
	LFDU	a6, 1 * SIZE(AO2)
	LFDU	a7, 1 * SIZE(AO3)
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b3, 1 * SIZE(X1)
	LFDU	b4, 1 * SIZE(X1)
	bdz	LL(23)
	.align 4

LL(22):
#ifdef PPCG4
	dcbt	X1, PREA
#endif

	FMADD	y01, a1, b1, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b1, 1 * SIZE(X1)

#ifdef PPCG4
	dcbt	AO1, PREA
#endif

	FMADD	y05, a5, b2, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b2, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b2, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b2, y08
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b2, 1 * SIZE(X1)

#ifdef PPCG4
	dcbt	AO2, PREA
#endif

	FMADD	y01, a1, b3, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b3, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b3, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b3, y04
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b3, 1 * SIZE(X1)

#ifdef PPCG4
	dcbt	AO3, PREA
#endif

	FMADD	y05, a5, b4, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b4, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b4, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b4, y08
	LFDU	a8, 1 * SIZE(AO4)

#ifdef PPCG4
	dcbt	AO4, PREA
#endif

	LFDU	b4, 1 * SIZE(X1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X1, PREA
#endif

	FMADD	y01, a1, b1, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b1, 1 * SIZE(X1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO1, PREA
#endif

	FMADD	y05, a5, b2, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b2, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b2, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b2, y08
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b2, 1 * SIZE(X1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO2, PREA
#endif

	FMADD	y01, a1, b3, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b3, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b3, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b3, y04
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b3, 1 * SIZE(X1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO3, PREA
#endif

	FMADD	y05, a5, b4, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b4, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b4, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b4, y08
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b4, 1 * SIZE(X1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO4, PREA
#endif
	bdnz	LL(22)
	.align 4

LL(23):
	FMADD	y01, a1, b1, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b1, 1 * SIZE(X1)

	FMADD	y05, a5, b2, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b2, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b2, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b2, y08
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b2, 1 * SIZE(X1)

	FMADD	y01, a1, b3, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b3, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b3, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b3, y04
	LFDU	a4, 1 * SIZE(AO4)

	LFDU	b3, 1 * SIZE(X1)

	FMADD	y05, a5, b4, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b4, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b4, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b4, y08
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b4, 1 * SIZE(X1)

	FMADD	y01, a1, b1, y01
	LFDU	a1, 1 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFDU	a2, 1 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFDU	a3, 1 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFDU	a4, 1 * SIZE(AO4)

	FMADD	y05, a5, b2, y05
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y06, a6, b2, y06
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y07, a7, b2, y07
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y08, a8, b2, y08
	LFDU	a8, 1 * SIZE(AO4)

	FMADD	y01, a1, b3, y01
	FMADD	y02, a2, b3, y02
	FMADD	y03, a3, b3, y03
	FMADD	y04, a4, b3, y04

	FMADD	y05, a5, b4, y05
	FMADD	y06, a6, b4, y06
	FMADD	y07, a7, b4, y07
	FMADD	y08, a8, b4, y08
	.align 4

LL(24):
	andi.	r0, M, 7
	ble	LL(28)

	andi.	r0, M, 4
	ble	LL(26)

	LFDU	a1, 1 * SIZE(AO1)
	LFDU	a2, 1 * SIZE(AO2)
	LFDU	b1, 1 * SIZE(X1)
	LFDU	a3, 1 * SIZE(AO3)
	LFDU	a4, 1 * SIZE(AO4)
	LFDU	b2, 1 * SIZE(X1)

	FMADD	y01, a1, b1, y01
	LFDU	a5, 1 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y03, a3, b1, y03
	LFDU	a7, 1 * SIZE(AO3)
	FMADD	y04, a4, b1, y04
	LFDU	a8, 1 * SIZE(AO4)

	LFDU	b3, 1 * SIZE(X1)
	FMADD	y05, a5, b2, y05
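The page's listing breaks off inside the tail-handling code at LL(24); the remainder blocks (LL(26), LL(28)), the N < 4 column path (LL(30)), and the epilogue (LL(999)) are referenced but not shown. For orientation, here is a minimal C sketch of the operation this kernel computes: y := alpha * A^T * x + y with A stored column-major with leading dimension lda. The helper name gemv_t_ref and the restriction to positive strides are assumptions for illustration, not part of the source.

#include <stddef.h>

/* Reference sketch (not from the original file): transposed GEMV,
 * y := alpha * A^T * x + y, A column-major with leading dimension lda.
 * Strides are assumed positive for brevity. */
static void gemv_t_ref(size_t m, size_t n, double alpha,
                       const double *a, size_t lda,
                       const double *x, size_t incx,
                       double *y, size_t incy)
{
    for (size_t j = 0; j < n; j++) {       /* one column of A per y element */
        const double *col = a + j * lda;
        double sum = 0.0;
        for (size_t i = 0; i < m; i++)     /* dot product: column j with x */
            sum += col[i] * x[i * incx];
        y[j * incy] += alpha * sum;
    }
}

The assembly follows the same structure, tuned for the pipeline: four columns of A are processed per outer iteration (LL(21)) so four dot products share each load of x, the inner loop is unrolled eight-deep with software-pipelined loads and dcbt prefetches (LL(22)), and x is first packed into a contiguous BUFFER whenever INCX != 1 (LL(CopyKernel)) so the hot loop can use unit-stride LFDU loads.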