⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemv_t_ppc440.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#define P 1024#ifndef __64BIT__#define STACKSIZE 224#else#define STACKSIZE 304#endif#ifdef linux#ifndef __64BIT__#define M	r3#define	N	r4#define A	r6#define LDA	r7#define X	r8#define	INCX	r9#define	Y	r10#define	INCY	r5#else#define M	r3#define	N	r4#define A	r8#define LDA	r9#define X	r10#define	INCX	r5#define	Y	r6#define	INCY	r7#endif#endif#if defined(_AIX) || defined(__APPLE__)#if !defined(__64BIT__) && defined(DOUBLE)#define M	r3#define	N	r4#define A	r10#define LDA	r5#define X	r6#define	INCX	r7#define	Y	r8#define	INCY	r9#else#define M	r3#define	N	r4#define A	r8#define LDA	r9#define X	r10#define	INCX	r5#define	Y	r6#define	INCY	r7#endif#endif#define	BUFFER	r11#define	XP	r12#define	X1	r14#define	J	r15#define	AO1	r16#define	AO2	r17#define	AO3	r18#define	AO4	r19#define	PREA	r20#define	PREC	r21#define	YY	r22#if defined(PPCG4)#define PREFETCHSIZE_A  (3 * 8)#define PREFETCHSIZE_C   7#endif#if !(defined(CONJ) && defined(XCONJ))#define FMADDR FMADD#define FMSUBR FNMSUB#else#define FMADDR FNMSUB#define FMSUBR FMADD#endif#ifndef NEEDPARAM#ifndef __64BIT__#define FZERO	200(SP)#else#define FZERO	256(SP)#endif	PROLOGUE	PROFCODE	addi	SP, SP,  -STACKSIZE	li	r0,   0	stfd	f14,     0(SP)	stfd	f15,     8(SP)	stfd	f16,    16(SP)	stfd	f17,    24(SP)	stfd	f18,    32(SP)	stfd	f19,    40(SP)	stfd	f20,    48(SP)	stfd	f21,    56(SP)	stfd	f22,    64(SP)	stfd	f23,    72(SP)	stfd	f24,    80(SP)	stfd	f25,    88(SP)	stfd	f26,    96(SP)	stfd	f27,   104(SP)	stfd	f28,   112(SP)	stfd	f29,   120(SP)	stfd	f30,   128(SP)	stfd	f31,   136(SP)#ifdef __64BIT__	std	r14,   144(SP)	std	r15,   152(SP)	std	r16,   160(SP)	std	r17,   168(SP)	std	r18,   176(SP)	std	r19,   184(SP)	std	r20,   192(SP)	std	r21,   200(SP)	std	r22,   208(SP)	std	r0,    FZERO#else	stw	r14,   144(SP)	stw	r15,   148(SP)	stw	r16,   152(SP)	stw	r17,   156(SP)	stw	r18,   160(SP)	stw	r19,   164(SP)	stw	r20,   168(SP)	stw	r21,   172(SP)	stw	r22,   176(SP)	stw	r0,    FZERO	stw	r0,    4 + FZERO#endif#ifdef linux#ifndef __64BIT__	lwz	INCY,	  8 + STACKSIZE(SP)	lwz	BUFFER,  12 + STACKSIZE(SP)#else	ld	INCX,    112 + STACKSIZE(SP)	ld	Y,       120 + STACKSIZE(SP)	ld	INCY,    128 + STACKSIZE(SP)	ld	BUFFER,  136 + STACKSIZE(SP)#endif#endif#if defined(_AIX) || defined(__APPLE__)#ifndef __64BIT__#ifdef DOUBLE	lwz	LDA,     56 + STACKSIZE(SP)	lwz	X,       60 + STACKSIZE(SP)	lwz	INCX,    64 + STACKSIZE(SP)	lwz	Y,       68 + STACKSIZE(SP)	lwz	INCY,    72 + STACKSIZE(SP)	lwz	BUFFER,  76 + STACKSIZE(SP)#else	lwz	INCX,    56 + STACKSIZE(SP)	lwz	Y,       60 + STACKSIZE(SP)	lwz	INCY,    64 + STACKSIZE(SP)	lwz	BUFFER,  68 + STACKSIZE(SP)#endif#else	ld	INCX,    112 + STACKSIZE(SP)	ld	Y,       120 + STACKSIZE(SP)	ld	INCY,    128 + STACKSIZE(SP)	ld	BUFFER,  136 + STACKSIZE(SP)#endif#endif#ifndef XCONJ#ifndef CONJ#define FMADD1	FMADD#define FMADD2	FMADD#define FMADD3	FNMSUB#define FMADD4	FMADD#else#define FMADD1	FMADD#define FMADD2	FMADD#define FMADD3	FMADD#define FMADD4	FNMSUB#endif#else#ifndef CONJ#define FMADD1	FMADD#define FMADD2	FNMSUB#define FMADD3	FMADD#define FMADD4	FMADD#else#define FMADD1	FMADD#define FMADD2	FMADD#define FMADD3	FNMSUB#define FMADD4	FMADD#endif#endif#define y1 f0#define y2 f1#define y3 f2#define y4 f3#define y5 f4#define y6 f5#define y7 f6#define y8 f7#define a1	f8#define a2	f9#define a3	f10#define a4	f11#define a5	f12#define a6	f13#define a7	f14#define a8	f15#define b1	f16#define b2	f17#define b3	f18#define b4	f19#define b5	f20#define b6	f21#define b7	f22#define b8	f23#define alpha_r	f24#define alpha_i	f25	fmr	alpha_r, f1	fmr	alpha_i, f2	slwi	LDA,  LDA,  ZBASE_SHIFT	slwi	INCX, INCX, ZBASE_SHIFT	slwi	INCY, INCY, ZBASE_SHIFT	li	PREA, PREFETCHSIZE_A * SIZE	li	PREC, PREFETCHSIZE_C * SIZE	addi	A, A, -SIZE	addi	INCX, INCX, -SIZE	addi	INCY, INCY, -SIZE	sub	X, X, INCX	sub	Y, Y, INCY	mr	YY, Y	cmpwi	cr0, M, 0	ble	LL(999)	cmpwi	cr0, N, 0	ble	LL(999)	mr	XP, X	cmpwi	cr0, INCX, SIZE	beq	LL(10)	addi	XP, BUFFER, -SIZE	addi	X1, BUFFER, -SIZE	srawi.	r0, M, 2	mtspr	CTR, r0	ble	LL(05)	.align 4LL(02):	LFDUX	f0, X, INCX	LFDU	f1, 1 * SIZE(X)	LFDUX	f2, X, INCX	LFDU	f3, 1 * SIZE(X)	LFDUX	f4, X, INCX	LFDU	f5, 1 * SIZE(X)	LFDUX	f6, X, INCX	LFDU	f7, 1 * SIZE(X)	STFDU	f0,  1 * SIZE(X1)	STFDU	f1,  1 * SIZE(X1)	STFDU	f2,  1 * SIZE(X1)	STFDU	f3,  1 * SIZE(X1)	STFDU	f4,  1 * SIZE(X1)	STFDU	f5,  1 * SIZE(X1)	STFDU	f6,  1 * SIZE(X1)	STFDU	f7,  1 * SIZE(X1)	bdnz	LL(02)	.align 4LL(05):	andi.	r0, M, 3	mtspr	CTR, r0	ble	LL(10)	.align 4LL(06):	LFDUX	f0, X, INCX	LFDU	f1,  1 * SIZE(X)	STFDU	f0,  1 * SIZE(X1)	STFDU	f1,  1 * SIZE(X1)	bdnz	LL(06)	.align 4LL(10):	srawi.	J, N, 2	ble	LL(20)	.align 4LL(11):	lfd	 y1,  FZERO	mr     AO1, A	fmr	 y2,  y1	mr     X1, XP	fmr	 y3,  y1	add    AO2, A,   LDA	fmr	 y4,  y1	add    AO3, AO2, LDA	fmr	 y5,  y1	add    AO4, AO3, LDA	fmr	 y6,  y1	add    A,   AO4, LDA	fmr	 y7,  y1	dcbtst	 PREC, Y	fmr	 y8,  y1	srawi.	r0,  M, 2	mtspr	CTR, r0	ble	LL(15)	LFDU	a1, 1 * SIZE(AO1)	LFDU	b1, 1 * SIZE(X1)	LFDU	a2, 1 * SIZE(AO1)	LFDU	b2, 1 * SIZE(X1)	LFDU	a3, 1 * SIZE(AO2)	LFDU	a4, 1 * SIZE(AO2)	LFDU	a5, 1 * SIZE(AO3)	LFDU	a6, 1 * SIZE(AO3)	LFDU	a7, 1 * SIZE(AO4)	bdz	LL(13)	.align 5LL(12):	FMADD1	y1,  a1,  b1, y1	LFDU	a8, 1 * SIZE(AO4)	FMADD2	y2,  a1,  b2, y2	LFDU	b3, 1 * SIZE(X1)	FMADD1	y3,  a3,  b1, y3	LFDU	b4, 1 * SIZE(X1)	FMADD2	y4,  a3,  b2, y4#ifdef PPCG4	dcbt	AO1, PREA#endif	FMADD3	y1,  a2,  b2, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b1, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b2, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b1, y4	LFDU	a4, 1 * SIZE(AO2)#ifdef PPCG4	dcbt	X1, PREA#endif	FMADD1	y5,  a5,  b1, y5	FMADD2	y6,  a5,  b2, y6	FMADD1	y7,  a7,  b1, y7	FMADD2	y8,  a7,  b2, y8#ifdef PPCG4	dcbt	AO2, PREA#endif	FMADD3	y5,  a6,  b2, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b1, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b2, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b1, y8	LFDU	a8, 1 * SIZE(AO4)	FMADD1	y1,  a1,  b3, y1	LFDU	b1, 1 * SIZE(X1)	FMADD2	y2,  a1,  b4, y2	LFDU	b2, 1 * SIZE(X1)	FMADD1	y3,  a3,  b3, y3	FMADD2	y4,  a3,  b4, y4#ifdef PPCG4	dcbt	AO3, PREA#endif	FMADD3	y1,  a2,  b4, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b3, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b4, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b3, y4	LFDU	a4, 1 * SIZE(AO2)	FMADD1	y5,  a5,  b3, y5	FMADD2	y6,  a5,  b4, y6	FMADD1	y7,  a7,  b3, y7	FMADD2	y8,  a7,  b4, y8#ifdef PPCG4	dcbt	AO4, PREA#endif	FMADD3	y5,  a6,  b4, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b3, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b4, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b3, y8	LFDU	a8, 1 * SIZE(AO4)	FMADD1	y1,  a1,  b1, y1	LFDU	b3, 1 * SIZE(X1)	FMADD2	y2,  a1,  b2, y2	LFDU	b4, 1 * SIZE(X1)	FMADD1	y3,  a3,  b1, y3	FMADD2	y4,  a3,  b2, y4#if defined(PPCG4) && defined(DOUBLE)	dcbt	AO1, PREA#endif	FMADD3	y1,  a2,  b2, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b1, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b2, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b1, y4	LFDU	a4, 1 * SIZE(AO2)#if defined(PPCG4) && defined(DOUBLE)	dcbt	X1, PREA#endif	FMADD1	y5,  a5,  b1, y5	FMADD2	y6,  a5,  b2, y6	FMADD1	y7,  a7,  b1, y7	FMADD2	y8,  a7,  b2, y8#if defined(PPCG4) && defined(DOUBLE)	dcbt	AO2, PREA#endif	FMADD3	y5,  a6,  b2, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b1, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b2, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b1, y8	LFDU	a8, 1 * SIZE(AO4)	FMADD1	y1,  a1,  b3, y1	FMADD2	y2,  a1,  b4, y2	FMADD1	y3,  a3,  b3, y3	FMADD2	y4,  a3,  b4, y4#if defined(PPCG4) && defined(DOUBLE)	dcbt	AO3, PREA#endif	FMADD3	y1,  a2,  b4, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b3, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b4, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b3, y4	LFDU	a4, 1 * SIZE(AO2)	FMADD1	y5,  a5,  b3, y5	LFDU	b1, 1 * SIZE(X1)	FMADD2	y6,  a5,  b4, y6	LFDU	b2, 1 * SIZE(X1)	FMADD1	y7,  a7,  b3, y7	FMADD2	y8,  a7,  b4, y8#if defined(PPCG4) && defined(DOUBLE)	dcbt	AO4, PREA#endif	FMADD3	y5,  a6,  b4, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b3, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b4, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b3, y8 	bdnz	LL(12)	.align 4	LL(13):	FMADD1	y1,  a1,  b1, y1	LFDU	a8, 1 * SIZE(AO4)	FMADD2	y2,  a1,  b2, y2	LFDU	b3, 1 * SIZE(X1)	FMADD1	y3,  a3,  b1, y3	LFDU	b4, 1 * SIZE(X1)	FMADD2	y4,  a3,  b2, y4	FMADD3	y1,  a2,  b2, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b1, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b2, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b1, y4	LFDU	a4, 1 * SIZE(AO2)	FMADD1	y5,  a5,  b1, y5	FMADD2	y6,  a5,  b2, y6	FMADD1	y7,  a7,  b1, y7	FMADD2	y8,  a7,  b2, y8	FMADD3	y5,  a6,  b2, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b1, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b2, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b1, y8	LFDU	a8, 1 * SIZE(AO4)	FMADD1	y1,  a1,  b3, y1	LFDU	b1, 1 * SIZE(X1)	FMADD2	y2,  a1,  b4, y2	LFDU	b2, 1 * SIZE(X1)	FMADD1	y3,  a3,  b3, y3	FMADD2	y4,  a3,  b4, y4	FMADD3	y1,  a2,  b4, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b3, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b4, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b3, y4	LFDU	a4, 1 * SIZE(AO2)	FMADD1	y5,  a5,  b3, y5	FMADD2	y6,  a5,  b4, y6	FMADD1	y7,  a7,  b3, y7	FMADD2	y8,  a7,  b4, y8	FMADD3	y5,  a6,  b4, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b3, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b4, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b3, y8	LFDU	a8, 1 * SIZE(AO4)	FMADD1	y1,  a1,  b1, y1	LFDU	b3, 1 * SIZE(X1)	FMADD2	y2,  a1,  b2, y2	LFDU	b4, 1 * SIZE(X1)	FMADD1	y3,  a3,  b1, y3	FMADD2	y4,  a3,  b2, y4	FMADD3	y1,  a2,  b2, y1	LFDU	a1, 1 * SIZE(AO1)	FMADD4	y2,  a2,  b1, y2	LFDU	a2, 1 * SIZE(AO1)	FMADD3	y3,  a4,  b2, y3	LFDU	a3, 1 * SIZE(AO2)	FMADD4	y4,  a4,  b1, y4	LFDU	a4, 1 * SIZE(AO2)	FMADD1	y5,  a5,  b1, y5	FMADD2	y6,  a5,  b2, y6	FMADD1	y7,  a7,  b1, y7	FMADD2	y8,  a7,  b2, y8	FMADD3	y5,  a6,  b2, y5	LFDU	a5, 1 * SIZE(AO3)	FMADD4	y6,  a6,  b1, y6	LFDU	a6, 1 * SIZE(AO3)	FMADD3	y7,  a8,  b2, y7	LFDU	a7, 1 * SIZE(AO4)	FMADD4	y8,  a8,  b1, y8	LFDU	a8, 1 * SIZE(AO4)	FMADD1	y1,  a1,  b3, y1	FMADD2	y2,  a1,  b4, y2	FMADD1	y3,  a3,  b3, y3	FMADD2	y4,  a3,  b4, y4	FMADD3	y1,  a2,  b4, y1	FMADD4	y2,  a2,  b3, y2	FMADD3	y3,  a4,  b4, y3	FMADD4	y4,  a4,  b3, y4	FMADD1	y5,  a5,  b3, y5	FMADD2	y6,  a5,  b4, y6	FMADD1	y7,  a7,  b3, y7	FMADD2	y8,  a7,  b4, y8	FMADD3	y5,  a6,  b4, y5	FMADD4	y6,  a6,  b3, y6	FMADD3	y7,  a8,  b4, y7	FMADD4	y8,  a8,  b3, y8	.align 4

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -