📄 gemm_kernel_altivec.s
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * AltiVec GEMM micro-kernel (PowerPC, C-preprocessed GNU assembler source).
 *
 * What the code below shows:
 *   - M, N, K arrive in r3, r4, r5; A, B, C, LDC in platform-dependent
 *     registers (see the A/B/C/LDC defines).  On 32-bit AIX/Darwin with
 *     DOUBLE defined, LDC is read from the caller's stack instead.
 *   - alpha arrives in f1 and is stored four times as a single-precision
 *     value, then reloaded as one vector (register "alpha").
 *   - N is walked in panels of 4 columns (J = N >> 2) and M in blocks of
 *     16 rows (I = M >> 4); c01..c16 hold the 16x4 accumulator tile.
 *   - The K loop is unrolled by 4, followed by K&2 and K&1 tails.
 *   - The write-back computes C := alpha * acc + C with vmaddfp, after
 *     shifting the accumulators with lvsr/vperm.
 *
 * NOTE(review): labels LL(19), LL(20), LL(60), LL(999), the epilogue and
 * the #endif matching "#ifndef NEEDPARAM" are referenced but are not part
 * of the text available here.
 */

#define ASSEMBLER
#include "common.h"

/* Pointer-sized load used for stack-passed arguments. */
#ifndef __64BIT__
#define LOAD    lwz
#else
#define LOAD    ld
#endif

/* Bytes reserved for the register save area. */
#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

/* Offsets from the re-aligned SP of the scratch slots. */
#define ALPHA   0
#define FZERO   16

/* Incoming arguments. */
#define M       r3
#define N       r4
#define K       r5

#ifdef linux
#ifndef __64BIT__
#define A       r6
#define B       r7
#define C       r8
#define LDC     r9
#else
#define A       r7
#define B       r8
#define C       r9
#define LDC     r10
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A       r8
#define B       r9
#define C       r10
#define LDC     r7
#else
#define A       r7
#define B       r8
#define C       r9
#define LDC     r10
#endif
#endif

/* Copy of SP taken right after the frame is allocated. */
#define STACK   r11

/* Loop counters and running pointers. */
#define I       r21
#define J       r22
#define AO      r23
#define BO      r24
#define CO1     r25
#define CO2     r26
#define CO3     r27
#define CO4     r28

/* NOTE(review): PREA and PREB both map to r29 -- confirm this is intended. */
#define PREA    r29
#define PREB    r29
#define PREC    r30
#define VREG    r31

#define LOAD_A  lvx
#define LOAD_B  lvx

/* Index operands for lvx/stvx: 0, then 4..28 elements (set in the prologue). */
#define OFFSET_0        0
#define OFFSET_1        r14
#define OFFSET_2        r15
#define OFFSET_3        r16
#define OFFSET_4        r17
#define OFFSET_5        r18
#define OFFSET_6        r19
#define OFFSET_7        r20

/* Accumulators: c01..c04 -> CO1, c05..c08 -> CO2, c09..c12 -> CO3, c13..c16 -> CO4. */
#define c01     v0
#define c02     v1
#define c03     v2
#define c04     v3
#define c05     v4
#define c06     v5
#define c07     v6
#define c08     v7
#define c09     v8
#define c10     v9
#define c11     v10
#define c12     v11
#define c13     v12
#define c14     v13
#define c15     v14
#define c16     v15

/* Operand registers used inside the K loop. */
#define a1      v16
#define a2      v17
#define a3      v18
#define a4      v19
#define a5      v20
#define a6      v21
#define a7      v22
#define a8      v23
#define b1      v24
#define b2      v25
#define bp1     v26
#define bp2     v27

/* Write-back names; these alias the K-loop operand registers above. */
#define C1      v16
#define C2      v17
#define C3      v18
#define C4      v19
#define C5      v20
#define C6      v21
#define C7      v22
#define C8      v23
#define C9      v24

#define c00     v25

#define PERMRSHIFT1     v26
#define PERMRSHIFT2     v27
#define PERMRSHIFT3     v28
#define PERMRSHIFT4     v29

#define VZERO   v30
#define alpha   v31

#ifndef NEEDPARAM

#ifndef DOUBLE
#include "../sparam.h"
#else
#include "../dparam.h"
#endif

        PROLOGUE
        PROFCODE

        /* Allocate the save area and keep a copy of the new SP in STACK. */
        addi    SP, SP, -STACKSIZE
        mr      STACK, SP

        /* Save vector registers v20..v31 at offsets 0..176. */
        li      r0,  0 * 16
        stvx    v20, SP, r0
        li      r0,  1 * 16
        stvx    v21, SP, r0
        li      r0,  2 * 16
        stvx    v22, SP, r0
        li      r0,  3 * 16
        stvx    v23, SP, r0
        li      r0,  4 * 16
        stvx    v24, SP, r0
        li      r0,  5 * 16
        stvx    v25, SP, r0
        li      r0,  6 * 16
        stvx    v26, SP, r0
        li      r0,  7 * 16
        stvx    v27, SP, r0
        li      r0,  8 * 16
        stvx    v28, SP, r0
        li      r0,  9 * 16
        stvx    v29, SP, r0
        li      r0, 10 * 16
        stvx    v30, SP, r0
        li      r0, 11 * 16
        stvx    v31, SP, r0

        /* Save general registers r14..r31 starting at offset 192. */
#ifdef __64BIT__
        std     r31, 192(SP)
        std     r30, 200(SP)
        std     r29, 208(SP)
        std     r28, 216(SP)
        std     r27, 224(SP)
        std     r26, 232(SP)
        std     r25, 240(SP)
        std     r24, 248(SP)
        std     r23, 256(SP)
        std     r22, 264(SP)
        std     r21, 272(SP)
        std     r20, 280(SP)
        std     r19, 288(SP)
        std     r18, 296(SP)
        std     r17, 304(SP)
        std     r16, 312(SP)
        std     r15, 320(SP)
        std     r14, 328(SP)
#else
        stw     r31, 192(SP)
        stw     r30, 196(SP)
        stw     r29, 200(SP)
        stw     r28, 204(SP)
        stw     r27, 208(SP)
        stw     r26, 212(SP)
        stw     r25, 216(SP)
        stw     r24, 220(SP)
        stw     r23, 224(SP)
        stw     r22, 228(SP)
        stw     r21, 232(SP)
        stw     r20, 236(SP)
        stw     r19, 240(SP)
        stw     r18, 244(SP)
        stw     r17, 248(SP)
        stw     r16, 252(SP)
        stw     r15, 256(SP)
        stw     r14, 260(SP)
#endif

        /* On this configuration LDC is read from the caller's frame. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
        lwz     LDC, 56 + STACKSIZE(SP)
#endif
#endif

        /* Keep the old VRsave in VREG and mark every vector register live. */
        li      r0, -1
        mfspr   VREG, VRsave
        mtspr   VRsave, r0

        /* Carve out a 128-byte-aligned scratch area below the save area. */
        addi    SP, SP, -128
        li      r0, -128
        and     SP, SP, r0

        li      OFFSET_1,  4 * SIZE
        li      OFFSET_2,  8 * SIZE
        li      OFFSET_3, 12 * SIZE
        li      OFFSET_4, 16 * SIZE
        li      OFFSET_5, 20 * SIZE
        li      OFFSET_6, 24 * SIZE
        li      OFFSET_7, 28 * SIZE

        /* Replicate alpha (f1) into four consecutive words at ALPHA(SP). */
        stfs    f1, ALPHA +  0(SP)
        stfs    f1, ALPHA +  4(SP)
        stfs    f1, ALPHA +  8(SP)
        stfs    f1, ALPHA + 12(SP)

        /* Store a zero word at FZERO(SP); r29 is reloaded as PREB below. */
        li      r29, 0
        stw     r29, FZERO(SP)

        /* Scale LDC by the element size. */
        slwi    LDC, LDC, BASE_SHIFT

        li      PREC, (15 * SIZE)
#ifdef CELL
        li      PREB, (3 * 32 * SIZE)
#else
        li      PREB, (5 * 32 * SIZE)
#endif

        /* Leave through LL(999) when any dimension is <= 0. */
        cmpwi   cr0, M, 0
        ble     LL(999)
        cmpwi   cr0, N, 0
        ble     LL(999)
        cmpwi   cr0, K, 0
        ble     LL(999)

        /* J = N / 4: number of 4-column panels. */
        srawi.  J, N, 2
        ble     LL(60)
        .align 4

/* Start of one 4-column panel: set the four column pointers, advance C. */
LL(01):
        mr      CO1, C
        add     CO2, C,   LDC
        add     CO3, CO2, LDC
        add     CO4, CO3, LDC
        add     C,   CO4, LDC

        mr      AO, A
        /* I = M / 16: number of 16-row blocks. */
        srawi.  I, M, 4
        ble     LL(20)
        .align 4

/* Start of one 16x4 tile: clear the accumulators and preload A and B. */
LL(11):
        vxor    c01, c01, c01
        LOAD_B  b1, OFFSET_0, B
        vxor    c02, c02, c02
        LOAD_A  a1, OFFSET_0, AO
        vxor    c03, c03, c03
        LOAD_A  a2, OFFSET_1, AO
        vxor    c04, c04, c04
        LOAD_A  a3, OFFSET_2, AO
        vxor    c05, c05, c05
        LOAD_A  a4, OFFSET_3, AO
        vxor    c06, c06, c06
        LOAD_A  a5, OFFSET_4, AO
        vxor    c07, c07, c07
        nop
        vxor    c08, c08, c08

        vxor    c09, c09, c09
        dcbtst  CO1, PREC
        vxor    c10, c10, c10
        dcbtst  CO2, PREC
        vxor    c11, c11, c11
        dcbtst  CO3, PREC
        vxor    c12, c12, c12
        dcbtst  CO4, PREC
        vxor    c13, c13, c13
        mr      BO, B
        vxor    c14, c14, c14
        /* CTR = K / 4; cr0 from this srawi. feeds the ble below. */
        srawi.  r0, K, 2
        vxor    c15, c15, c15
        mtspr   CTR, r0
        vxor    c16, c16, c16
        vspltw  bp1, b1, 0
        ble     LL(13)
        .align 4

/* Register-to-itself moves used as filler instructions between vector ops. */
#define NOP1    mr r3, r3
#define NOP2    mr r4, r4

/* Main K loop: four k-steps per iteration. */
LL(12):
        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        PREFETCH_A
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        vspltw  bp1, b1, 2

        vmaddfp c05, a1, bp2, c05
        PREFETCH_B
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        vspltw  bp2, b1, 3

        vmaddfp c09, a1, bp1, c09
        NOP1
        vmaddfp c10, a2, bp1, c10
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c11, a3, bp1, c11
        addi    BO, BO, 8 * SIZE
        vmaddfp c12, a4, bp1, c12
        vspltw  bp1, b2, 0

        vmaddfp c13, a1, bp2, c13
        NOP1
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A  a6, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        vspltw  bp2, b2, 1

        vmaddfp c01, a5, bp1, c01
        LOAD_A  a7, OFFSET_6, AO
        vmaddfp c02, a6, bp1, c02
        LOAD_A  a8, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        NOP1
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        addi    AO, AO, 32 * SIZE
        vmaddfp c07, a7, bp2, c07
        LOAD_B  b1, OFFSET_0, BO
        vmaddfp c08, a8, bp2, c08
        vspltw  bp2, b2, 3

        vmaddfp c09, a5, bp1, c09
        NOP1
        vmaddfp c10, a6, bp1, c10
        NOP2
        vmaddfp c11, a7, bp1, c11
        NOP1
        vmaddfp c12, a8, bp1, c12
        vspltw  bp1, b1, 0

        vmaddfp c13, a5, bp2, c13
        PREFETCH_A
        vmaddfp c14, a6, bp2, c14
        LOAD_A  a1, OFFSET_0, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A  a2, OFFSET_1, AO
        vmaddfp c16, a8, bp2, c16
        vspltw  bp2, b1, 1

        vmaddfp c01, a1, bp1, c01
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c02, a2, bp1, c02
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        vspltw  bp1, b1, 2

        vmaddfp c05, a1, bp2, c05
        NOP1
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        vspltw  bp2, b1, 3

        vmaddfp c09, a1, bp1, c09
        LOAD_B  b2, OFFSET_1, BO
        vmaddfp c10, a2, bp1, c10
        NOP2
        vmaddfp c11, a3, bp1, c11
        NOP1
        vmaddfp c12, a4, bp1, c12
        addi    BO, BO, 8 * SIZE

        vmaddfp c13, a1, bp2, c13
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A  a6, OFFSET_5, AO
        vmaddfp c16, a4, bp2, c16
        vspltw  bp2, b2, 1

        vmaddfp c01, a5, bp1, c01
        LOAD_A  a7, OFFSET_6, AO
        vmaddfp c02, a6, bp1, c02
        LOAD_A  a8, OFFSET_7, AO
        vmaddfp c03, a7, bp1, c03
        addi    AO, AO, 32 * SIZE
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        NOP2
        vmaddfp c07, a7, bp2, c07
        NOP1
        vmaddfp c08, a8, bp2, c08
        LOAD_B  b1, OFFSET_0, BO

        vmaddfp c09, a5, bp1, c09
        vspltw  bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_A  a1, OFFSET_0, AO
        /* This c11 update must stay live, as in the K&2 tail below. */
        vmaddfp c11, a7, bp1, c11
        LOAD_A  a2, OFFSET_1, AO
        vmaddfp c12, a8, bp1, c12
        NOP2

        vmaddfp c13, a5, bp2, c13
        vspltw  bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c16, a8, bp2, c16
        bdnz+   LL(12)
        .align 4

/* K & 2 tail: two more k-steps when bit 1 of K is set. */
LL(13):
        andi.   r0, K, 2
        nop
        nop
        ble+    LL(15)
        .align 4

        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        NOP2
        vmaddfp c03, a3, bp1, c03
        NOP1
        vmaddfp c04, a4, bp1, c04
        NOP2

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        NOP2
        vmaddfp c07, a3, bp2, c07
        NOP1
        vmaddfp c08, a4, bp2, c08
        LOAD_B  b2, OFFSET_1, BO

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        LOAD_A  a5, OFFSET_4, AO
        vmaddfp c11, a3, bp1, c11
        LOAD_A  a6, OFFSET_5, AO
        vmaddfp c12, a4, bp1, c12
        addi    BO, BO, 8 * SIZE

        vmaddfp c13, a1, bp2, c13
        vspltw  bp1, b2, 0
        vmaddfp c14, a2, bp2, c14
        LOAD_A  a7, OFFSET_6, AO
        vmaddfp c15, a3, bp2, c15
        LOAD_A  a8, OFFSET_7, AO
        vmaddfp c16, a4, bp2, c16
        addi    AO, AO, 32 * SIZE

        vmaddfp c01, a5, bp1, c01
        vspltw  bp2, b2, 1
        vmaddfp c02, a6, bp1, c02
        NOP2
        vmaddfp c03, a7, bp1, c03
        NOP1
        vmaddfp c04, a8, bp1, c04
        NOP2

        vmaddfp c05, a5, bp2, c05
        vspltw  bp1, b2, 2
        vmaddfp c06, a6, bp2, c06
        NOP2
        vmaddfp c07, a7, bp2, c07
        NOP1
        vmaddfp c08, a8, bp2, c08
        LOAD_B  b1, OFFSET_0, BO

        vmaddfp c09, a5, bp1, c09
        vspltw  bp2, b2, 3
        vmaddfp c10, a6, bp1, c10
        LOAD_A  a1, OFFSET_0, AO
        vmaddfp c11, a7, bp1, c11
        LOAD_A  a2, OFFSET_1, AO
        vmaddfp c12, a8, bp1, c12
        NOP2

        vmaddfp c13, a5, bp2, c13
        vspltw  bp1, b1, 0
        vmaddfp c14, a6, bp2, c14
        LOAD_A  a3, OFFSET_2, AO
        vmaddfp c15, a7, bp2, c15
        LOAD_A  a4, OFFSET_3, AO
        vmaddfp c16, a8, bp2, c16
        .align 4

/* K & 1 tail: one more k-step when bit 0 of K is set.  Also loads the
   alpha vector from the scratch area and clears VZERO for the write-back. */
LL(15):
        andi.   r0, K, 1
        lvx     alpha, OFFSET_0, SP
        vxor    VZERO, VZERO, VZERO
        ble+    LL(18)
        .align 4

        vmaddfp c01, a1, bp1, c01
        vspltw  bp2, b1, 1
        vmaddfp c02, a2, bp1, c02
        nop
        vmaddfp c03, a3, bp1, c03
        nop
        vmaddfp c04, a4, bp1, c04
        nop

        vmaddfp c05, a1, bp2, c05
        vspltw  bp1, b1, 2
        vmaddfp c06, a2, bp2, c06
        nop
        vmaddfp c07, a3, bp2, c07
        nop
        vmaddfp c08, a4, bp2, c08
        nop

        vmaddfp c09, a1, bp1, c09
        vspltw  bp2, b1, 3
        vmaddfp c10, a2, bp1, c10
        addi    AO, AO, 16 * SIZE
        vmaddfp c11, a3, bp1, c11
        addi    BO, BO, 4 * SIZE
        vmaddfp c12, a4, bp1, c12
        nop

        vmaddfp c13, a1, bp2, c13
        vmaddfp c14, a2, bp2, c14
        vmaddfp c15, a3, bp2, c15
        vmaddfp c16, a4, bp2, c16
        .align 4

/* Write-back of the tile.  Each column's accumulators are shifted right by
   the lvsr permute of its COn address, then C := alpha * acc + C over five
   vectors.  Presumably this handles columns of C that are not 16-byte
   aligned -- TODO confirm against LL(19), which is taken when
   LDC <= 32 * SIZE and is not visible here. */
LL(18):
        lvx     C1, OFFSET_0, CO1
        cmpwi   cr0, LDC, 32 * SIZE
        lvx     C2, OFFSET_1, CO1
        lvsr    PERMRSHIFT1, 0, CO1
        lvx     C3, OFFSET_2, CO1
        lvsr    PERMRSHIFT2, 0, CO2
        lvx     C4, OFFSET_3, CO1
        lvsr    PERMRSHIFT3, 0, CO3
        lvx     C5, OFFSET_4, CO1
        lvsr    PERMRSHIFT4, 0, CO4
        ble     LL(19)

        /* Column 1 (CO1): c01..c04. */
        vperm   c00, VZERO, c01,   PERMRSHIFT1
        vperm   c01, c01,   c02,   PERMRSHIFT1
        vperm   c02, c02,   c03,   PERMRSHIFT1
        vperm   c03, c03,   c04,   PERMRSHIFT1
        vperm   c04, c04,   VZERO, PERMRSHIFT1

        vmaddfp c00, alpha, c00, C1
        lvx     C1, OFFSET_0, CO2
        vmaddfp c01, alpha, c01, C2
        lvx     C6, OFFSET_1, CO2
        vmaddfp c02, alpha, c02, C3
        lvx     C7, OFFSET_2, CO2
        vmaddfp c03, alpha, c03, C4
        lvx     C8, OFFSET_3, CO2
        vmaddfp c04, alpha, c04, C5
        lvx     C9, OFFSET_4, CO2

        /* Store column 1 while shifting column 2 (CO2): c05..c08. */
        stvx    c00, OFFSET_0, CO1
        vperm   c00, VZERO, c05,   PERMRSHIFT2
        stvx    c01, OFFSET_1, CO1
        vperm   c05, c05,   c06,   PERMRSHIFT2
        stvx    c02, OFFSET_2, CO1
        vperm   c06, c06,   c07,   PERMRSHIFT2
        stvx    c03, OFFSET_3, CO1
        vperm   c07, c07,   c08,   PERMRSHIFT2
        stvx    c04, OFFSET_4, CO1
        vperm   c08, c08,   VZERO, PERMRSHIFT2

        vmaddfp c00, alpha, c00, C1
        lvx     C1, OFFSET_0, CO3
        vmaddfp c05, alpha, c05, C6
        lvx     C2, OFFSET_1, CO3
        vmaddfp c06, alpha, c06, C7
        lvx     C3, OFFSET_2, CO3
        vmaddfp c07, alpha, c07, C8
        lvx     C4, OFFSET_3, CO3
        vmaddfp c08, alpha, c08, C9
        lvx     C5, OFFSET_4, CO3

        /* Store column 2 while shifting column 3 (CO3): c09..c12. */
        stvx    c00, OFFSET_0, CO2
        vperm   c00, VZERO, c09,   PERMRSHIFT3
        stvx    c05, OFFSET_1, CO2
        vperm   c09, c09,   c10,   PERMRSHIFT3
        stvx    c06, OFFSET_2, CO2
        vperm   c10, c10,   c11,   PERMRSHIFT3
        stvx    c07, OFFSET_3, CO2
        vperm   c11, c11,   c12,   PERMRSHIFT3
        stvx    c08, OFFSET_4, CO2
        vperm   c12, c12,   VZERO, PERMRSHIFT3

        vmaddfp c00, alpha, c00, C1
        lvx     C9, OFFSET_4, CO4
        vmaddfp c09, alpha, c09, C2
        lvx     C1, OFFSET_0, CO4
        vmaddfp c10, alpha, c10, C3
        lvx     C6, OFFSET_1, CO4
        vmaddfp c11, alpha, c11, C4
        lvx     C7, OFFSET_2, CO4
        vmaddfp c12, alpha, c12, C5
        lvx     C8, OFFSET_3, CO4

        /* Store column 3 while shifting column 4 (CO4): c13..c16. */
        stvx    c00, OFFSET_0, CO3
        vperm   c00, VZERO, c13,   PERMRSHIFT4
        stvx    c09, OFFSET_1, CO3
        vperm   c13, c13,   c14,   PERMRSHIFT4
        /* NOTE(review): the available source text ends here, in the middle
           of the write-back; the rest of the routine is missing. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -