
📄 gemm_kernel_altivec.s

📁 Optimized GotoBLAS libraries
💻 Language: S (PowerPC AltiVec assembly source)
📖 Page 1 of 4
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALPHA		  0
#define FZERO		 16

#define	M	r3
#define	N	r4
#define	K	r5

#ifdef linux
#ifndef __64BIT__
#define A	r6
#define	B	r7
#define	C	r8
#define	LDC	r9
#else
#define A	r7
#define	B	r8
#define	C	r9
#define	LDC	r10
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r7
#else
#define A	r7
#define	B	r8
#define	C	r9
#define	LDC	r10
#endif
#endif

#define STACK	r11

#define	I	r21
#define J	r22
#define AO	r23
#define	BO	r24
#define	CO1	r25
#define CO2	r26
#define	CO3	r27
#define	CO4	r28

#define PREA	r29
#define PREB	r29
#define PREC	r30
#define VREG	r31

#define LOAD_A	lvx
#define LOAD_B	lvx

#define OFFSET_0	  0
#define OFFSET_1	r14
#define OFFSET_2	r15
#define OFFSET_3	r16
#define OFFSET_4	r17
#define OFFSET_5	r18
#define OFFSET_6	r19
#define OFFSET_7	r20

#define	c01	v0
#define	c02	v1
#define	c03	v2
#define	c04	v3
#define	c05	v4
#define	c06	v5
#define	c07	v6
#define	c08	v7
#define	c09	v8
#define	c10	v9
#define	c11	v10
#define	c12	v11
#define	c13	v12
#define	c14	v13
#define	c15	v14
#define	c16	v15

#define	a1	v16
#define	a2	v17
#define	a3	v18
#define	a4	v19
#define	a5	v20
#define	a6	v21
#define	a7	v22
#define	a8	v23

#define	b1	v24
#define	b2	v25

#define	bp1	v26
#define	bp2	v27

#define C1	v16
#define C2	v17
#define C3	v18
#define C4	v19
#define C5	v20
#define C6	v21
#define C7	v22
#define C8	v23
#define C9	v24

#define c00	v25

#define PERMRSHIFT1	 v26
#define PERMRSHIFT2	 v27
#define PERMRSHIFT3	 v28
#define PERMRSHIFT4	 v29

#define VZERO	v30
#define alpha	v31

#ifndef NEEDPARAM

#ifndef DOUBLE
#include "../sparam.h"
#else
#include "../dparam.h"
#endif

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	mr	STACK, SP

	li	r0,  0 * 16
	stvx	v20, SP, r0
	li	r0,  1 * 16
	stvx	v21, SP, r0
	li	r0,  2 * 16
	stvx	v22, SP, r0
	li	r0,  3 * 16
	stvx	v23, SP, r0
	li	r0,  4 * 16
	stvx	v24, SP, r0
	li	r0,  5 * 16
	stvx	v25, SP, r0
	li	r0,  6 * 16
	stvx	v26, SP, r0
	li	r0,  7 * 16
	stvx	v27, SP, r0
	li	r0,  8 * 16
	stvx	v28, SP, r0
	li	r0,  9 * 16
	stvx	v29, SP, r0
	li	r0, 10 * 16
	stvx	v30, SP, r0
	li	r0, 11 * 16
	stvx	v31, SP, r0

#ifdef __64BIT__
	std	r31,  192(SP)
	std	r30,  200(SP)
	std	r29,  208(SP)
	std	r28,  216(SP)
	std	r27,  224(SP)
	std	r26,  232(SP)
	std	r25,  240(SP)
	std	r24,  248(SP)
	std	r23,  256(SP)
	std	r22,  264(SP)
	std	r21,  272(SP)
	std	r20,  280(SP)
	std	r19,  288(SP)
	std	r18,  296(SP)
	std	r17,  304(SP)
	std	r16,  312(SP)
	std	r15,  320(SP)
	std	r14,  328(SP)
#else
	stw	r31,  192(SP)
	stw	r30,  196(SP)
	stw	r29,  200(SP)
	stw	r28,  204(SP)
	stw	r27,  208(SP)
	stw	r26,  212(SP)
	stw	r25,  216(SP)
	stw	r24,  220(SP)
	stw	r23,  224(SP)
	stw	r22,  228(SP)
	stw	r21,  232(SP)
	stw	r20,  236(SP)
	stw	r19,  240(SP)
	stw	r18,  244(SP)
	stw	r17,  248(SP)
	stw	r16,  252(SP)
	stw	r15,  256(SP)
	stw	r14,  260(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
	lwz	LDC,    56 + STACKSIZE(SP)
#endif
#endif

	li	r0, -1

	mfspr	VREG, VRsave
	mtspr	VRsave, r0

	addi	SP, SP, -128
	li	r0, -128
	and	SP, SP, r0

	li	OFFSET_1,  4 * SIZE
	li	OFFSET_2,  8 * SIZE
	li	OFFSET_3, 12 * SIZE
	li	OFFSET_4, 16 * SIZE
	li	OFFSET_5, 20 * SIZE
	li	OFFSET_6, 24 * SIZE
	li	OFFSET_7, 28 * SIZE

	stfs	f1,  ALPHA +  0(SP)
	stfs	f1,  ALPHA +  4(SP)
	stfs	f1,  ALPHA +  8(SP)
	stfs	f1,  ALPHA + 12(SP)

	li	r29, 0
	stw	r29, FZERO(SP)

	slwi	LDC, LDC, BASE_SHIFT

	li	PREC,   (15 * SIZE)

#ifdef CELL
	li	PREB,   (3 * 32 * SIZE)
#else
	li	PREB,   (5 * 32 * SIZE)
#endif

	cmpwi	cr0, M, 0
	ble	LL(999)
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, K, 0
	ble	LL(999)

	srawi.	J, N,  2
	ble	LL(60)
	.align 4

LL(01):
	mr	CO1, C
	add	CO2, C,  LDC
	add	CO3, CO2, LDC
	add	CO4, CO3, LDC
	add	C,   CO4, LDC
	mr	AO, A
	srawi.	I, M,  4
	ble	LL(20)
	.align 4

LL(11):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	vxor	c03, c03, c03
	LOAD_A	a2, OFFSET_1, AO
	vxor	c04, c04, c04
	LOAD_A	a3, OFFSET_2, AO
	vxor	c05, c05, c05
	LOAD_A	a4, OFFSET_3, AO
	vxor	c06, c06, c06
	LOAD_A	a5, OFFSET_4, AO
	vxor	c07, c07, c07
	nop
	vxor	c08, c08, c08

	vxor	c09, c09, c09
	dcbtst	CO1, PREC
	vxor	c10, c10, c10
	dcbtst	CO2, PREC
	vxor	c11, c11, c11
	dcbtst	CO3, PREC
	vxor	c12, c12, c12
	dcbtst	CO4, PREC
	vxor	c13, c13, c13
	mr	BO, B
	vxor	c14, c14, c14
	srawi.	r0,  K,  2
	vxor	c15, c15, c15
	mtspr	CTR, r0
	vxor	c16, c16, c16
	vspltw	bp1, b1, 0
	ble	LL(13)
	.align 4

#define NOP1   mr	r3, r3
#define NOP2   mr	r4, r4

LL(12):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	PREFETCH_A
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	vspltw	bp1, b1, 2

	vmaddfp	c05, a1, bp2, c05
	PREFETCH_B
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	vspltw	bp2, b1, 3

	vmaddfp	c09, a1, bp1, c09
	NOP1
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  8 * SIZE
	vmaddfp	c12, a4, bp1, c12
	vspltw	bp1, b2, 0

	vmaddfp	c13, a1, bp2, c13
	NOP1
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	vspltw	bp2, b2, 1

	vmaddfp	c01, a5, bp1, c01
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	NOP1
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	addi	AO, AO, 32 * SIZE
	vmaddfp	c07, a7, bp2, c07
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c08, a8, bp2, c08
	vspltw	bp2, b2, 3

	vmaddfp	c09, a5, bp1, c09
	NOP1
	vmaddfp	c10, a6, bp1, c10
	NOP2
	vmaddfp	c11, a7, bp1, c11
	NOP1
	vmaddfp	c12, a8, bp1, c12
	vspltw	bp1, b1, 0

	vmaddfp	c13, a5, bp2, c13
	PREFETCH_A
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c16, a8, bp2, c16
	vspltw	bp2, b1, 1

	vmaddfp	c01, a1, bp1, c01
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	vspltw	bp1, b1, 2

	vmaddfp	c05, a1, bp2, c05
	NOP1
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	vspltw	bp2, b1, 3

	vmaddfp	c09, a1, bp1, c09
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a2, bp1, c10
	NOP2
	vmaddfp	c11, a3, bp1, c11
	NOP1
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	vspltw	bp2, b2, 1

	vmaddfp	c01, a5, bp1, c01
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	addi	AO, AO, 32 * SIZE
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	NOP2
	vmaddfp	c07, a7, bp2, c07
	NOP1
	vmaddfp	c08, a8, bp2, c08
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_A	a1, OFFSET_0, AO	//
	vmaddfp	c11, a7, bp1, c11
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c12, a8, bp1, c12
	NOP2

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c16, a8, bp2, c16
	bdnz+	LL(12)
	.align 4

LL(13):
	andi.	r0,  K,  2
	nop
	nop
	ble+	LL(15)
	.align 4

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	NOP2
	vmaddfp	c03, a3, bp1, c03
	NOP1
	vmaddfp	c04, a4, bp1, c04
	NOP2

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	NOP2
	vmaddfp	c07, a3, bp2, c07
	NOP1
	vmaddfp	c08, a4, bp2, c08
	LOAD_B	b2, OFFSET_1, BO

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c11, a3, bp1, c11
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a7, OFFSET_6, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a8, OFFSET_7, AO
	vmaddfp	c16, a4, bp2, c16
	addi	AO, AO, 32 * SIZE

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a6, bp1, c02
	NOP2
	vmaddfp	c03, a7, bp1, c03
	NOP1
	vmaddfp	c04, a8, bp1, c04
	NOP2

	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	NOP2
	vmaddfp	c07, a7, bp2, c07
	NOP1
	vmaddfp	c08, a8, bp2, c08
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c11, a7, bp1, c11
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c12, a8, bp1, c12
	NOP2

	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a4, OFFSET_3, AO
	vmaddfp	c16, a8, bp2, c16
	.align 4

LL(15):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(18)
	.align 4

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO, 16 * SIZE
	vmaddfp	c11, a3, bp1, c11
	addi	BO, BO,  4 * SIZE
	vmaddfp	c12, a4, bp1, c12
	nop

	vmaddfp	c13, a1, bp2, c13
	vmaddfp	c14, a2, bp2, c14
	vmaddfp	c15, a3, bp2, c15
	vmaddfp	c16, a4, bp2, c16
	.align 4

LL(18):
	lvx	C1, OFFSET_0, CO1
	cmpwi	cr0, LDC, 32 * SIZE
	lvx	C2, OFFSET_1, CO1
	lvsr	PERMRSHIFT1, 0, CO1
	lvx	C3, OFFSET_2, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvx	C4, OFFSET_3, CO1
	lvsr	PERMRSHIFT3, 0, CO3
	lvx	C5, OFFSET_4, CO1
	lvsr	PERMRSHIFT4, 0, CO4
	ble	LL(19)

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	lvx	C1, OFFSET_0, CO2
	vmaddfp	c01, alpha, c01, C2
	lvx	C6, OFFSET_1, CO2
	vmaddfp	c02, alpha, c02, C3
	lvx	C7, OFFSET_2, CO2
	vmaddfp	c03, alpha, c03, C4
	lvx	C8, OFFSET_3, CO2
	vmaddfp	c04, alpha, c04, C5
	lvx	C9, OFFSET_4, CO2

	stvx	c00, OFFSET_0, CO1
	vperm	c00, VZERO, c05,   PERMRSHIFT2
	stvx	c01, OFFSET_1, CO1
	vperm	c05, c05,   c06,   PERMRSHIFT2
	stvx	c02, OFFSET_2, CO1
	vperm	c06, c06,   c07,   PERMRSHIFT2
	stvx	c03, OFFSET_3, CO1
	vperm	c07, c07,   c08,   PERMRSHIFT2
	stvx	c04, OFFSET_4, CO1
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	lvx	C1, OFFSET_0, CO3
	vmaddfp	c05, alpha, c05, C6
	lvx	C2, OFFSET_1, CO3
	vmaddfp	c06, alpha, c06, C7
	lvx	C3, OFFSET_2, CO3
	vmaddfp	c07, alpha, c07, C8
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c08, alpha, c08, C9
	lvx	C5, OFFSET_4, CO3

	stvx	c00, OFFSET_0, CO2
	vperm	c00, VZERO, c09,   PERMRSHIFT3
	stvx	c05, OFFSET_1, CO2
	vperm	c09, c09,   c10,   PERMRSHIFT3
	stvx	c06, OFFSET_2, CO2
	vperm	c10, c10,   c11,   PERMRSHIFT3
	stvx	c07, OFFSET_3, CO2
	vperm	c11, c11,   c12,   PERMRSHIFT3
	stvx	c08, OFFSET_4, CO2
	vperm	c12, c12,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	lvx	C9, OFFSET_4, CO4
	vmaddfp	c09, alpha, c09, C2
	lvx	C1, OFFSET_0, CO4
	vmaddfp	c10, alpha, c10, C3
	lvx	C6, OFFSET_1, CO4
	vmaddfp	c11, alpha, c11, C4
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c12, alpha, c12, C5
	lvx	C8, OFFSET_3, CO4

	stvx	c00, OFFSET_0, CO3
	vperm	c00, VZERO, c13,   PERMRSHIFT4
	stvx	c09, OFFSET_1, CO3
	vperm	c13, c13,   c14,   PERMRSHIFT4
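This first page of the listing already shows the heart of the kernel: the unrolled loop at LL(12) is a 16x4 micro-kernel. Sixteen rows of the packed A panel and four columns of the packed B panel are accumulated in the vector registers c01-c16; vspltw broadcasts one element of B across all four lanes, and each vmaddfp performs four single-precision fused multiply-adds. Below is a minimal scalar sketch in C of the computation this register blocking implements; the function name, panel layout, and parameters are illustrative assumptions for this note, not part of the original source.

#include <stddef.h>

/* Hypothetical scalar model of the 16x4 micro-kernel above:
 * C[0:16, 0:4] = alpha * A_panel * B_panel + C[0:16, 0:4]. */
static void gemm_microkernel_16x4(size_t K, float alpha,
                                  const float *A,  /* packed 16 x K panel */
                                  const float *B,  /* packed K x 4 panel  */
                                  float *C, size_t ldc)
{
    float acc[4][16] = {{0}};   /* models accumulators c01..c16 */

    for (size_t k = 0; k < K; k++) {
        /* vspltw broadcasts B[4*k + j] across a vector; vmaddfp then
         * multiplies it by four A elements and adds into an accumulator. */
        for (size_t j = 0; j < 4; j++)
            for (size_t i = 0; i < 16; i++)
                acc[j][i] += A[16 * k + i] * B[4 * k + j];
    }

    /* Models the LL(18) epilogue: vmaddfp against vectors of C
     * loaded with lvx, i.e. C = alpha * acc + C. */
    for (size_t j = 0; j < 4; j++)
        for (size_t i = 0; i < 16; i++)
            C[j * ldc + i] += alpha * acc[j][i];
}

The lvsr/vperm sequence in LL(18) exists because AltiVec lvx/stvx can only access 16-byte-aligned addresses: lvsr derives a shift-permute mask from each column pointer of C, and vperm realigns the accumulated vectors to that column's actual alignment before the alpha-scaled read-modify-write back to memory. The listing continues on the remaining pages.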
