zgemm_kernel_altivec_g4.s

Optimized GotoBLAS libraries
Page 1 of 3
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

#define ALIGN_SIZE	0xffff
#define SWAP		  0
#define NEG		 16
#define ALPHA_R		 32
#define ALPHA_I		 48
#define FZERO		 64

#define	M	r3
#define	N	r4
#define	K	r5

#ifdef linux
#ifndef __64BIT__
#define A	r6
#define	B	r7
#define	C	r8
#define	LDC	r9
#else
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A	r10
#define	B	r6
#define	C	r7
#define	LDC	r8
#else
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r6
#endif
#endif

#define STACK	r11

#define	I	r21
#define J	r22
#define AO	r23
#define	BO	r24
#define	CO1	r25
#define CO2	r26

#define PREA	r29
#define PREB	r29
#define PREC	r30
#define VREG	r31

#define LOAD_A	lvx
#define LOAD_B	lvx

#define OFFSET_0	  0
#define OFFSET_1	r14
#define OFFSET_2	r15
#define OFFSET_3	r16
#define OFFSET_4	r17
#define OFFSET_5	r18
#define OFFSET_6	r19
#define OFFSET_7	r20

#define	c01	v0
#define	c02	v1
#define	c03	v2
#define	c04	v3
#define	c05	v4
#define	c06	v5
#define	c07	v6
#define	c08	v7
#define	c09	v8
#define	c10	v9
#define	c11	v10
#define	c12	v11
#define	c13	v12
#define	c14	v13
#define	c15	v14
#define	c16	v15

#define	a1	v16
#define	a2	v17
#define	a3	v18
#define	a4	v19
#define	a5	v20
#define	a6	v21
#define	a7	v22
#define	a8	v23

#define	b1	v24
#define	b2	v25

#define	bp1	v26
#define	bp2	v27

#define C1	v16
#define C2	v17
#define C3	v18
#define C4	v19
#define C5	v20

#define c00	v24

#define VZERO		 v25
#define PERMRSHIFT1	 v26
#define PERMRSHIFT2	 v27
#define swap		 v28
#define neg		 v29
#define alpha_r		 v30
#define alpha_i		 v31

#ifndef NEEDPARAM

#ifndef DOUBLE
#include "../cparam.h"
#else
#include "../zparam.h"
#endif

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	mr	STACK, SP

	li	r0,  0 * 16
	stvx	v20, SP, r0
	li	r0,  1 * 16
	stvx	v21, SP, r0
	li	r0,  2 * 16
	stvx	v22, SP, r0
	li	r0,  3 * 16
	stvx	v23, SP, r0
	li	r0,  4 * 16
	stvx	v24, SP, r0
	li	r0,  5 * 16
	stvx	v25, SP, r0
	li	r0,  6 * 16
	stvx	v26, SP, r0
	li	r0,  7 * 16
	stvx	v27, SP, r0
	li	r0,  8 * 16
	stvx	v28, SP, r0
	li	r0,  9 * 16
	stvx	v29, SP, r0
	li	r0, 10 * 16
	stvx	v30, SP, r0
	li	r0, 11 * 16
	stvx	v31, SP, r0

#ifdef __64BIT__
	std	r31,  192(SP)
	std	r30,  200(SP)
	std	r29,  208(SP)
	std	r28,  216(SP)
	std	r27,  224(SP)
	std	r26,  232(SP)
	std	r25,  240(SP)
	std	r24,  248(SP)
	std	r23,  256(SP)
	std	r22,  264(SP)
	std	r21,  272(SP)
	std	r20,  280(SP)
	std	r19,  288(SP)
	std	r18,  296(SP)
	std	r17,  304(SP)
	std	r16,  312(SP)
	std	r15,  320(SP)
	std	r14,  328(SP)
#else
	stw	r31,  192(SP)
	stw	r30,  196(SP)
	stw	r29,  200(SP)
	stw	r28,  204(SP)
	stw	r27,  208(SP)
	stw	r26,  212(SP)
	stw	r25,  216(SP)
	stw	r24,  220(SP)
	stw	r23,  224(SP)
	stw	r22,  228(SP)
	stw	r21,  232(SP)
	stw	r20,  236(SP)
	stw	r19,  240(SP)
	stw	r18,  244(SP)
	stw	r17,  248(SP)
	stw	r16,  252(SP)
	stw	r15,  256(SP)
	stw	r14,  260(SP)
#endif

#ifdef linux
#ifdef __64BIT__
	ld	LDC,    112 + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	LDC,    112 + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	B,       56 + STACKSIZE(SP)
	lwz	C,       60 + STACKSIZE(SP)
	lwz	LDC,     64 + STACKSIZE(SP)
#else
	lwz	LDC,     56 + STACKSIZE(SP)
#endif
#endif
#endif

	li	r0, -1
	mfspr	VREG, VRsave
	mtspr	VRsave, r0

	addi	SP, SP, -128
	li	r0, -8192
	and	SP, SP, r0

	fneg	f3, f1
	fneg	f4, f2

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(NC) || defined(TC) || defined(NR) || defined(TR)
	stfs	f1,  ALPHA_R +  0(SP)
	stfs	f1,  ALPHA_R +  4(SP)
	stfs	f1,  ALPHA_R +  8(SP)
	stfs	f1,  ALPHA_R + 12(SP)

	stfs	f4,  ALPHA_I +  0(SP)
	stfs	f2,  ALPHA_I +  4(SP)
	stfs	f4,  ALPHA_I +  8(SP)
	stfs	f2,  ALPHA_I + 12(SP)
#else
	stfs	f1,  ALPHA_R +  0(SP)
	stfs	f3,  ALPHA_R +  4(SP)
	stfs	f1,  ALPHA_R +  8(SP)
	stfs	f3,  ALPHA_R + 12(SP)

	stfs	f2,  ALPHA_I +  0(SP)
	stfs	f2,  ALPHA_I +  4(SP)
	stfs	f2,  ALPHA_I +  8(SP)
	stfs	f2,  ALPHA_I + 12(SP)
#endif

	li	I,    Address_L(0x04050607)
	addis	I, I, Address_H(0x04050607)
	stw	I, SWAP +  0(SP)
	li	I,    Address_L(0x00010203)
	addis	I, I, Address_H(0x00010203)
	stw	I, SWAP +  4(SP)
	li	I,    Address_L(0x0c0d0e0f)
	addis	I, I, Address_H(0x0c0d0e0f)
	stw	I, SWAP +  8(SP)
	li	I,    Address_L(0x08090a0b)
	addis	I, I, Address_H(0x08090a0b)
	stw	I, SWAP + 12(SP)

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
	lis	I, 0x8000
	stw	I, NEG +  0(SP)
	stw	I, NEG +  8(SP)
	li	I, 0
	stw	I, NEG +  4(SP)
	stw	I, NEG + 12(SP)
#else
	li	I, 0
	stw	I, NEG +  0(SP)
	stw	I, NEG +  8(SP)
	lis	I, 0x8000
	stw	I, NEG +  4(SP)
	stw	I, NEG + 12(SP)
#endif

	li	r0, 0
	stw	r0, FZERO(SP)

	slwi	LDC, LDC, ZBASE_SHIFT

	li	PREC,   (15 * SIZE)
	li	PREB,   (25 * 8 * SIZE)

	li	OFFSET_1,  4 * SIZE
	li	OFFSET_2,  8 * SIZE
	li	OFFSET_3, 12 * SIZE
	li	OFFSET_4, 16 * SIZE
	li	OFFSET_5, 20 * SIZE
	li	OFFSET_6, 24 * SIZE
	li	OFFSET_7, 28 * SIZE

	cmpwi	cr0, M, 0
	ble	LL(999)
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, K, 0
	ble	LL(999)

	srawi.	J, N,  1
	ble	LL(50)
	.align 4

LL(01):
	mr	CO1, C
	add	CO2, C,  LDC
	add	C,   CO2, LDC
	mr	AO, A
	srawi.	I, M,  3
	ble	LL(20)
	.align 4

LL(11):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	vxor	c03, c03, c03
	LOAD_A	a2, OFFSET_1, AO
	vxor	c04, c04, c04
	LOAD_A	a3, OFFSET_2, AO
	vxor	c05, c05, c05
	LOAD_A	a4, OFFSET_3, AO
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_2, B
	vxor	c07, c07, c07
	LOAD_A	a5, OFFSET_4, AO
	vxor	c08, c08, c08
	LOAD_A	a6, OFFSET_5, AO
	vxor	c09, c09, c09
	dcbtst	CO1, PREC
	vxor	c10, c10, c10
	dcbtst	CO2, PREC
	vxor	c11, c11, c11
	vxor	c12, c12, c12
	vxor	c13, c13, c13
	mr	BO, B
	vxor	c14, c14, c14
	srawi.	r0,  K,  2
	vxor	c15, c15, c15
	mtspr	CTR, r0
	vxor	c16, c16, c16
	vspltw	bp1, b1, 0
	ble	LL(15)
	.align 4

LL(12):
/* 1 */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c03, a3, bp1, c03
	LOAD_A	a7, OFFSET_4, AO
	vmaddfp	c04, a4, bp1, c04
	LOAD_A	a8, OFFSET_5, AO

/* 2 */
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	dcbt	BO, PREB
	vmaddfp	c07, a3, bp2, c07
	dcbt	AO, PREB
	vmaddfp	c08, a4, bp2, c08
	addi	AO, AO,  8 * SIZE

/* 3 */
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b1, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	dcbt	AO, PREB
	vmaddfp	c12, a4, bp1, c12
	addi	AO, AO, 8 * SIZE

/* 4 */
	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a1, OFFSET_2, AO
	vmaddfp	c15, a3, bp2, c15
	dcbt	AO, PREB
	vmaddfp	c16, a4, bp2, c16
	addi	AO, AO,  8 * SIZE

/* 5 */
	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c03, a7, bp1, c03
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c04, a8, bp1, c04
	LOAD_A	a4, OFFSET_3, AO

/* 6 */
	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a6, bp2, c06
	nop
	vmaddfp	c07, a7, bp2, c07
	dcbt	AO, PREA
	vmaddfp	c08, a8, bp2, c08
	addi	AO, AO,  8 * SIZE

/* 7 */
	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_B	b1, OFFSET_4, BO
	vmaddfp	c11, a7, bp1, c11
	nop
	vmaddfp	c12, a8, bp1, c12
	nop

/* 8 */
	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a5, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a6, OFFSET_3, AO
	vmaddfp	c16, a8, bp2, c16
	LOAD_A	a7, OFFSET_4, AO

/* 9 */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a8, OFFSET_5, AO
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  8 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

/* 10 */
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

/* 11 */
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	dcbt	AO, PREA
	vmaddfp	c12, a4, bp1, c12
	addi	AO, AO,  8 * SIZE

/* 12 */
	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a1, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a2, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	LOAD_A	a3, OFFSET_6, AO

/* 13 */
	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a4, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	dcbt	AO, PREA
	vmaddfp	c04, a8, bp1, c04
	addi	AO, AO,  8 * SIZE

/* 14 */
	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	nop
	vmaddfp	c07, a7, bp2, c07
	dcbt	AO, PREA
	vmaddfp	c08, a8, bp2, c08
	addi	AO, AO,  8 * SIZE

/* 15 */
	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_B	b2, OFFSET_4, BO
	vmaddfp	c11, a7, bp1, c11
	dcbt	AO, PREA
	vmaddfp	c12, a8, bp1, c12
	addi	BO, BO,  8 * SIZE

/* 16 */
	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a8, bp2, c16
	bdnz+	LL(12)
	.align 4

LL(15):
	lvx	swap,    OFFSET_0, SP
	lvx	neg,     OFFSET_1, SP
	lvx	alpha_r, OFFSET_2, SP
	lvx	alpha_i, OFFSET_3, SP

	andi.	r0,  K,  3
	mtspr	CTR, r0
	ble+	LL(18)
	.align 4

LL(16):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b1, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
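The listing above ends partway through the LL(16) tail loop (page 1 of 3). The unrolled vmaddfp/vspltw groups form the multiply-accumulate inner loop of a single-precision complex GEMM microkernel: packed panels of A and B are combined into a block of C, with the alpha scaling and the real/imaginary sign handling prepared in the ALPHA_R/ALPHA_I and NEG words on the stack. As a minimal scalar sketch of the operation being vectorized (plain C, illustrative only; the function name, packing, and calling convention here are assumptions, not this kernel's actual interface):

#include <complex.h>

/* Illustrative scalar reference: C(m x n) += alpha * A(m x k) * B(k x n),
 * with A and B treated as packed column-major panels and C stored with
 * leading dimension ldc, as a plain-C stand-in for what the AltiVec
 * kernel computes. */
static void zgemm_kernel_ref(int m, int n, int k, float complex alpha,
                             const float complex *a, const float complex *b,
                             float complex *c, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            float complex s = 0.0f;
            for (int l = 0; l < k; l++)
                s += a[i + l * m] * b[l + j * k];  /* complex multiply-accumulate */
            c[i + j * ldc] += alpha * s;
        }
}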
