📄 gemm_tcopy_hummer_8.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*//*********************************************************************/#define ASSEMBLER#include "common.h" #define M r3#define N r4#define A r5#define LDA r6#define B r7#define AO1 r8#define AO2 r9#define AO3 r10#define AO4 r11#define J r12#define B1 r21#define B2 r22#define B3 r23#define B4 r24#define M8 r25#define AO5 r26#define AO6 r27#define AO7 r28#define AO8 r29#define INC r30#define INC2 r31 #define c01 f0#define c02 f1#define c03 f2#define c04 f3#define c05 f4#define c06 f5#define c07 f6#define c08 f7#define c09 f8#define c10 f9#define c11 f10#define c12 f11#define c13 f12#define c14 f13#define c15 f14#define c16 f15#define c17 f16#define c18 f17#define c19 f18#define c20 f19#define c21 f20#define c22 f21#define c23 f22#define c24 f23#define c25 f24#define c26 f25#define c27 f26#define c28 f27#define c29 f28#define c30 f29#define c31 f30#define c32 f31#define STACKSIZE 64 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) slwi LDA, LDA, BASE_SHIFT slwi M8, M, 3 + BASE_SHIFT li r8, -8 li r9, -4 li r10, -2 and B2, N, r8 and B3, N, r9 and B4, N, r10 mullw B2, B2, M mullw B3, B3, M mullw B4, B4, M slwi B2, B2, BASE_SHIFT slwi B3, B3, BASE_SHIFT slwi B4, B4, BASE_SHIFT add B2, B2, B add B3, B3, B add B4, B4, B cmpwi cr0, M, 0 ble- .L999 cmpwi cr0, N, 0 ble- .L999 subi B2, B2, 2 * SIZE subi B3, B3, 2 * SIZE subi B4, B4, 2 * SIZE subi M8, M8, 62 * SIZE li INC, 1 * SIZE li INC2, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne .L100 andi. 
r0, LDA, 2 * SIZE - 1 bne .L100 subi A, A, 2 * SIZE srawi. J, M, 3 ble .L20 .align 4.L10: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA sub B1, B, M8 addi B, B, 64 * SIZE srawi. r0, N, 3 mtspr CTR, r0 ble .L15 .align 4.L12: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c10, AO3, INC2 LFPDUX c11, AO3, INC2 LFPDUX c12, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c14, AO4, INC2 LFPDUX c15, AO4, INC2 LFPDUX c16, AO4, INC2 LFPDUX c17, AO5, INC2 LFPDUX c18, AO5, INC2 LFPDUX c19, AO5, INC2 LFPDUX c20, AO5, INC2 LFPDUX c21, AO6, INC2 LFPDUX c22, AO6, INC2 LFPDUX c23, AO6, INC2 LFPDUX c24, AO6, INC2 LFPDUX c25, AO7, INC2 LFPDUX c26, AO7, INC2 LFPDUX c27, AO7, INC2 LFPDUX c28, AO7, INC2 LFPDUX c29, AO8, INC2 LFPDUX c30, AO8, INC2 LFPDUX c31, AO8, INC2 LFPDUX c32, AO8, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 STFPDUX c17, B1, INC2 STFPDUX c18, B1, INC2 STFPDUX c19, B1, INC2 STFPDUX c20, B1, INC2 STFPDUX c21, B1, INC2 STFPDUX c22, B1, INC2 STFPDUX c23, B1, INC2 STFPDUX c24, B1, INC2 STFPDUX c25, B1, INC2 STFPDUX c26, B1, INC2 STFPDUX c27, B1, INC2 STFPDUX c28, B1, INC2 STFPDUX c29, B1, INC2 STFPDUX c30, B1, INC2 STFPDUX c31, B1, INC2 STFPDUX c32, B1, INC2 bdnz .L12 .align 4 .L15: andi. r0, N, 7 ble .L19 andi. 
r0, N, 4 ble .L16 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c06, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c08, AO4, INC2 LFPDUX c09, AO5, INC2 LFPDUX c10, AO5, INC2 LFPDUX c11, AO6, INC2 LFPDUX c12, AO6, INC2 LFPDUX c13, AO7, INC2 LFPDUX c14, AO7, INC2 LFPDUX c15, AO8, INC2 LFPDUX c16, AO8, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 STFPDUX c09, B2, INC2 STFPDUX c10, B2, INC2 STFPDUX c11, B2, INC2 STFPDUX c12, B2, INC2 STFPDUX c13, B2, INC2 STFPDUX c14, B2, INC2 STFPDUX c15, B2, INC2 STFPDUX c16, B2, INC2 .align 4.L16: andi. r0, N, 2 ble .L17 LFPDUX c01, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c09, AO5, INC2 LFPDUX c11, AO6, INC2 LFPDUX c13, AO7, INC2 LFPDUX c15, AO8, INC2 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 STFPDUX c09, B3, INC2 STFPDUX c11, B3, INC2 STFPDUX c13, B3, INC2 STFPDUX c15, B3, INC2 .align 4.L17: andi. r0, N, 1 ble .L19 LFDUX c01, AO1, INC2 LFDUX c02, AO3, INC2 LFDUX c03, AO5, INC2 LFDUX c04, AO7, INC2 LFSDUX c01, AO2, INC2 LFSDUX c02, AO4, INC2 LFSDUX c03, AO6, INC2 LFSDUX c04, AO8, INC2 STFPDUX c01, B4, INC2 STFPDUX c02, B4, INC2 STFPDUX c03, B4, INC2 STFPDUX c04, B4, INC2 .align 4.L19: addic. J, J, -1 bgt .L10 .align 4.L20: andi. J, M, 4 addi M8, M8, 32 * SIZE ble .L30 mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M8 addi B, B, 32 * SIZE srawi. 
r0, N, 3 mtspr CTR, r0 ble .L25 .align 4.L22: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c10, AO3, INC2 LFPDUX c11, AO3, INC2 LFPDUX c12, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c14, AO4, INC2 LFPDUX c15, AO4, INC2 LFPDUX c16, AO4, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 bdnz .L22 .align 4 .L25: andi. r0, N, 7 ble .L30 andi. r0, N, 4 ble .L26 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c06, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c08, AO4, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 .align 4.L26: andi. r0, N, 2 ble .L27 LFPDUX c01, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c07, AO4, INC2 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 .align 4.L27: andi. r0, N, 1 ble .L30 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 LFDUX c03, AO3, INC2 LFDUX c04, AO4, INC2 fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B4, INC2 STFPDUX c03, B4, INC2 .align 4.L30: andi. J, M, 2 addi M8, M8, 16 * SIZE ble .L40 mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M8 addi B, B, 16 * SIZE srawi. 
r0, N, 3 mtspr CTR, r0 ble .L35 .align 4.L32: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 bdnz .L32 .align 4 .L35: andi. r0, N, 7 ble .L40 andi. r0, N, 4 ble .L36 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 .align 4.L36: andi. r0, N, 2 ble .L37 LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 STFPDUX c01, B3, INC2 STFPDUX c02, B3, INC2 .align 4.L37: andi. r0, N, 1 ble .L40 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 fsmfp c01, c02 STFPDUX c01, B4, INC2 .align 4.L40: andi. J, M, 1 addi M8, M8, 8 * SIZE ble .L999 mr AO1, A sub B1, B, M8 srawi. r0, N, 3 mtspr CTR, r0 ble .L45 .align 4.L42: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 bdnz .L42 .align 4 .L45: andi. r0, N, 7 ble .L999 andi. r0, N, 4 ble .L46 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 .align 4.L46: andi. r0, N, 2 ble .L47 LFPDUX c01, AO1, INC2 STFPDUX c01, B3, INC2 .align 4.L47: andi. r0, N, 1 ble .L999 LFDX c01, AO1, INC2 STFDX c01, B4, INC2 b .L999 .align 4.L100: subi A, A, SIZE srawi. J, M, 3 ble .L120 .align 4.L110: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -