⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ztrsm_kernel_hummer_ln.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h"		#undef ZERO#define ALPHA    0#define FZERO	16#define	M	r3#define	N	r4#define	K	r5#ifdef linux#define A	r6#define	B	r7#define	C	r8#define	LDC	r9#define OFFSET	r10#endif#define TEMP	r11#define AORIG	r12#define KK	r14#define INCM1	r15#define INCM3	r16#define INCM5	r17#define INCM7	r18#define INC2	r19#define INC	r20#define INC4	r21#define	I	r22#define J	r23#define AO	r24#define BO	r25#define AO2	r26#define	BO2	r27	#define	CO1	r28#define CO2	r29#define	ZERO	r31#ifndef NEEDPARAM#define A1	f16#define A2	f17#define A3	f18#define A4	f19#define A5	f20#define A6	f21#define A7	f22#define A8	f23#define A9	f24#define A10	f25#define B1	f26#define B2	f27#define B3	f28#define B4	f29#define B5	f30#define B6	f31#define AP	B6#ifndef CONJ#define FXCPMADD	fxcpmadd#define FXCSMADD	fxcxnpma#else#if defined(LN) || defined(LT)#define FXCPMADD	fxcpnsma#define FXCSMADD	fxcxma#else#define FXCPMADD	fxcpmadd#define FXCSMADD	fxcxnsma#endif#endif#ifndef CONJ#define FXCXNPMA	fxcxnpma#define FXCXNSMA	fxcxnsma#else#define FXCXNPMA	fxcxnsma#define FXCXNSMA	fxcxnpma#endif	PROLOGUE	PROFCODE	li	r0, -16	stfpdux	f14, SP, r0	stfpdux	f15, SP, r0	stfpdux	f16, SP, r0	stfpdux	f17, SP, r0	stfpdux	f18, SP, r0	stfpdux	f19, SP, r0	stfpdux	f20, SP, r0	stfpdux	f21, SP, r0	stfpdux	f22, SP, r0	stfpdux	f23, SP, r0	stfpdux	f24, SP, r0	stfpdux	f25, SP, r0	stfpdux	f26, SP, r0	stfpdux	f27, SP, r0	stfpdux	f28, SP, r0	stfpdux	f29, SP, r0	stfpdux	f30, SP, r0	stfpdux	f31, SP, r0		stwu	r31,  -4(SP)	stwu	r30,  -4(SP)	stwu	r29,  -4(SP)	stwu	r28,  -4(SP)	stwu	r27,  -4(SP)	stwu	r26,  -4(SP)	stwu	r25,  -4(SP)	stwu	r24,  -4(SP)	stwu	r23,  -4(SP)	stwu	r22,  -4(SP)	stwu	r21,  -4(SP)	stwu	r20,  -4(SP)	stwu	r19,  -4(SP)	stwu	r18,  -4(SP)	stwu	r17,  -4(SP)	stwu	r16,  -4(SP)	stwu	r15,  -4(SP)	stwu	r14,  -4(SP)	li	r0,   0	stwu	r0,   -4(SP)	stwu	r0,   -4(SP)	stfdu	f2,   -8(SP)	stfdu	f1,   -8(SP)	slwi	LDC, LDC, ZBASE_SHIFT	cmpwi	cr0, 
M, 0	ble	.L999	cmpwi	cr0, N, 0	ble	.L999	cmpwi	cr0, K, 0	ble	.L999	li	INC,    1 * SIZE	li	INC2,   2 * SIZE	li	INC4,   4 * SIZE	li	INCM1, -1 * SIZE	li	INCM3, -3 * SIZE	li	INCM5, -5 * SIZE	li	INCM7, -7 * SIZE	addi	C, C, - 1 * SIZE	#ifdef LN	mullw	r0, M, K	slwi	r0, r0, ZBASE_SHIFT	add	A, A, r0	slwi	r0, M, ZBASE_SHIFT	add	C, C, r0#endif#ifdef RN	neg	KK, OFFSET#endif#ifdef RT	mullw	r0, N, K	slwi	r0, r0, ZBASE_SHIFT	add	B, B, r0	mullw	r0, N, LDC	add	C, C, r0	sub	KK, N, OFFSET#endif	srawi.	J, N,  1	ble	.L50	.align 4.L10:#ifdef RT	slwi	r0, K, 1 + ZBASE_SHIFT	sub	B, B, r0	slwi	r0, LDC, 1	sub	C, C, r0#endif	mr	CO1, C	add	CO2, C,   LDC#ifdef LN	add	KK, M, OFFSET#endif#ifdef LT	mr	KK, OFFSET#endif#if defined(LN) || defined(RT)	addi	AORIG, A, -4 * SIZE#else	addi	AO, A, -4 * SIZE#endif#ifndef RT	add	C,  CO2, LDC#endif	li	r0, FZERO	lfpsx	f0, SP, r0		andi.	I, M,  1	beq	.L20#if defined(LT) || defined(RN)	addi	AO2, AO,   2 * SIZE	fpmr	f1,  f0	addi	BO,  B,  - 4 * SIZE	fpmr	f2,  f0	addi	BO2, B,  - 2 * SIZE	fpmr	f3, f0	srawi.	r0,  KK,  2	mtspr	CTR, r0	ble	.L34#else#ifdef LN	slwi	r0,   K,  0 + ZBASE_SHIFT	sub	AORIG, AORIG, r0#endif	slwi	r0  , KK, 0 + ZBASE_SHIFT	slwi	TEMP, KK, 1 + ZBASE_SHIFT	add	AO, AORIG, r0	add	BO, B,     TEMP	sub	TEMP, K, KK	addi	AO2, AO,   2 * SIZE	fpmr	f1,  f0	addi	BO,  BO,  - 4 * SIZE	fpmr	f2,  f0	addi	BO2, BO,    2 * SIZE	fpmr	f3, f0	srawi.	
r0,  TEMP,  2	mtspr	CTR, r0	ble	.L34#endif	LFPDUX	A1,  AO, INC4	LFPDUX	B1,  BO, INC4	LFPDUX	B2, BO2, INC4	LFPDUX	A2, AO2, INC4	LFPDUX	B3,  BO, INC4	LFPDUX	B4, BO2, INC4	LFPDUX	A3,  AO, INC4	LFPDUX	A5,  BO, INC4	LFPDUX	A6, BO2, INC4	LFPDUX	A4, AO2, INC4	LFPDUX	A7,  BO, INC4	LFPDUX	A8, BO2, INC4	bdz-	.L33	.align 4.L32:	FXCPMADD	f0,  B1, A1, f0	FXCSMADD	f1,  B1, A1, f1	LFPDUX	B1,  BO, INC4	FXCPMADD	f2,  B2, A1, f2	FXCSMADD	f3,  B2, A1, f3	LFPDUX	B2, BO2, INC4	LFPDUX	A1,  AO, INC4	FXCPMADD	f0,  B3, A2, f0	FXCSMADD	f1,  B3, A2, f1	LFPDUX	B3,  BO, INC4	FXCPMADD	f2,  B4, A2, f2	FXCSMADD	f3,  B4, A2, f3	LFPDUX	B4, BO2, INC4	LFPDUX	A2, AO2, INC4	FXCPMADD	f0,  A5, A3, f0	FXCSMADD	f1,  A5, A3, f1	LFPDUX	A5,  BO, INC4	FXCPMADD	f2,  A6, A3, f2	FXCSMADD	f3,  A6, A3, f3	LFPDUX	A6, BO2, INC4	LFPDUX	A3,  AO, INC4	FXCPMADD	f0,  A7, A4, f0	FXCSMADD	f1,  A7, A4, f1	LFPDUX	A7,  BO, INC4	FXCPMADD	f2,  A8, A4, f2	FXCSMADD	f3,  A8, A4, f3	LFPDUX	A8, BO2, INC4	LFPDUX	A4, AO2, INC4	bdnz+	.L32	.align 4.L33:	FXCPMADD	f0,  B1, A1, f0	FXCSMADD	f1,  B1, A1, f1	FXCPMADD	f2,  B2, A1, f2	FXCSMADD	f3,  B2, A1, f3	FXCPMADD	f0,  B3, A2, f0	FXCSMADD	f1,  B3, A2, f1	FXCPMADD	f2,  B4, A2, f2	FXCSMADD	f3,  B4, A2, f3	FXCPMADD	f0,  A5, A3, f0	FXCSMADD	f1,  A5, A3, f1	FXCPMADD	f2,  A6, A3, f2	FXCSMADD	f3,  A6, A3, f3	FXCPMADD	f0,  A7, A4, f0	FXCSMADD	f1,  A7, A4, f1	FXCPMADD	f2,  A8, A4, f2	FXCSMADD	f3,  A8, A4, f3	.align 4.L34:#if defined(LT) || defined(RN)	andi.	r0,  KK,  3	mtspr	CTR, r0	ble+	.L38#else	andi.	
r0, TEMP, 3	mtspr	CTR, r0	ble+	.L38#endif	LFPDX	A1,  AO,  INC4	LFPDUX	B1,  BO,  INC4	LFPDUX	B2,  BO2, INC4	add	AO, AO, INC2	bdz-	.L37	.align 4.L36:	FXCPMADD	f0,  B1, A1, f0	FXCSMADD	f1,  B1, A1, f1	LFPDUX	B1,  BO,  INC4	FXCPMADD	f2,  B2, A1, f2	FXCSMADD	f3,  B2, A1, f3	LFPDX	A1,  AO,  INC4	LFPDUX	B2,  BO2, INC4	add	AO, AO, INC2	bdnz+	.L36	.align 4.L37:	FXCPMADD	f0,  B1, A1, f0	FXCSMADD	f1,  B1, A1, f1	FXCPMADD	f2,  B2, A1, f2	FXCSMADD	f3,  B2, A1, f3	.align 4.L38:	fpadd	f0, f0, f1	fpadd	f2, f2, f3#if defined(LN) || defined(RT)#ifdef LN	subi	r0, KK, 1#else	subi	r0, KK, 2#endif	slwi	TEMP, r0, 0 + ZBASE_SHIFT	slwi	r0,   r0, 1 + ZBASE_SHIFT	add	AO, AORIG, TEMP	add	BO, B,     r0	addi	BO,  BO, - 4 * SIZE#endif	addi	AO2, AO,   2 * SIZE	addi	BO2, BO,   2 * SIZE#if defined(LN) || defined(LT)	LFPDX	f16, BO,  INC4	LFPDX	f17, BO2, INC4#else	LFPDX	f16, AO,  INC4	LFPDX	f17, AO2, INC4#endif	fpsub	f0,  f16,  f0	fpsub	f2,  f17,  f2#ifdef LN	LFPDX	A1,  AO,  INC4	fxpmul	  f4,  A1, f0	fxpmul	  f5,  A1, f2	FXCXNPMA  f0,  A1, f0,  f4	FXCXNPMA  f2,  A1, f2,  f5#endif#ifdef LT	LFPDX	A1,  AO,  INC4	fxpmul	  f4,  A1, f0	fxpmul	  f5,  A1, f2	FXCXNPMA  f0,  A1, f0, f4	FXCXNPMA  f2,  A1, f2, f5#endif#ifdef RN	LFPDUX	A1,  BO,  INC4	LFPDUX	A2,  BO2, INC4	add	BO,  BO,  INC4	LFPDUX	A3,  BO2, INC4	subi	BO,  BO,   8 * SIZE	subi	BO2, BO2,  8 * SIZE	fxpmul	  f4,  A1, f0	FXCXNPMA  f0,  A1, f0, f4	fxcpnmsub f2,  A2, f0, f2	FXCXNSMA  f2,  A2, f0, f2	fxpmul	  f4,  A3, f2	FXCXNPMA  f2,  A3, f2,  f4#endif#ifdef RT	LFPDUX	A1,  BO,  INC4	add	BO2, BO2, INC4	LFPDUX	A2,  BO,  INC4	LFPDUX	A3,  BO2, INC4	subi	BO,  BO,   8 * SIZE	subi	BO2, BO2,  8 * SIZE	fxpmul	  f4,  A3, f2	FXCXNPMA  f2,  A3, f2,  f4	fxcpnmsub f0,  A2, f2,  f0	FXCXNSMA  f0,  A2, f2,  f0	fxpmul	  f4,  A1, f0	FXCXNPMA  f0,  A1, f0,  f4#endif#ifdef LN	subi	CO1, CO1, 2 * SIZE	subi	CO2, CO2, 2 * SIZE#endif#if defined(LN) || defined(LT)	STFPDX	f0,  BO,  INC4	STFPDX	f2,  BO2, INC4#else	STFPDX	f0,  AO,  INC4	STFPDX	f2,  AO2, INC4#endif	STFDUX	f0,  CO1, 
INC	STFSDUX	f0,  CO1, INC	STFDUX	f2,  CO2, INC	STFSDUX	f2,  CO2, INC#ifdef LN	subi	CO1, CO1, 2 * SIZE	subi	CO2, CO2, 2 * SIZE#endif#ifdef RT	slwi	r0, K, 0 + ZBASE_SHIFT	add	AORIG, AORIG, r0#endif#if defined(LT) || defined(RN)	sub	TEMP, K, KK	slwi	r0,   TEMP, 0 + ZBASE_SHIFT	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT	add	AO, AO, r0	add	BO, BO, TEMP#endif#ifdef LT	addi	KK, KK, 1#endif#ifdef LN	subi	KK, KK, 1#endif	li	r0, FZERO	lfpsx	f0, SP, r0	.align 4	.L20:	andi.	I, M,  2	beq	.L30#if defined(LT) || defined(RN)	addi	AO2, AO,   2 * SIZE	fpmr	f4,  f0	addi	BO,  B,  - 4 * SIZE	fpmr	f8,  f0	addi	BO2, B,  - 2 * SIZE	fpmr	f12, f0	srawi.	r0,  KK,  2 	fpmr	f1,  f0	fpmr	f5,  f0	fpmr	f9,  f0	mtspr	CTR, r0	fpmr	f13, f0	ble	.L24#else#ifdef LN	slwi	r0,   K,  1 + ZBASE_SHIFT	sub	AORIG, AORIG, r0#endif	slwi	r0  , KK, 1 + ZBASE_SHIFT	add	AO, AORIG, r0	add	BO, B,     r0	sub	TEMP, K, KK	addi	AO2, AO,   2 * SIZE	fpmr	f4,  f0	addi	BO,  BO,  - 4 * SIZE	fpmr	f8,  f0	addi	BO2, BO,    2 * SIZE	fpmr	f12, f0 	fpmr	f1,  f0	fpmr	f5,  f0	fpmr	f9,  f0	fpmr	f13, f0	srawi.	
r0,  TEMP,  2	mtspr	CTR, r0	ble	.L24#endif	LFPDUX	A1,   AO, INC4	LFPDUX	B1,   BO, INC4	LFPDUX	A2,  AO2, INC4	LFPDUX	B2,  BO2, INC4	LFPDUX	A3,   AO, INC4	LFPDUX	B3,   BO, INC4	LFPDUX	A4,  AO2, INC4	LFPDUX	B4,  BO2, INC4	LFPDUX	A5,   AO, INC4	LFPDUX	B5,   BO, INC4	LFPDUX	A6,  AO2, INC4	LFPDUX	B6,  BO2, INC4	LFPDUX	A7,   AO, INC4	LFPDUX	A9,   BO, INC4	LFPDUX	A10, BO2, INC4	bdz-	.L23	.align 4.L22:	FXCPMADD	f0,  B1, A1, f0	nop	FXCSMADD	f4,  B1, A1, f4	LFPDUX	A8,  AO2, INC4	FXCPMADD	f8,  B2, A1, f8	nop	FXCSMADD	f12, B2, A1, f12	LFPDUX	A1,   AO, INC4	FXCPMADD	f1,  B1, A2, f1	nop	FXCSMADD	f5,  B1, A2, f5	LFPDUX	B1,   BO, INC4	FXCPMADD	f9,  B2, A2, f9	nop	FXCSMADD	f13, B2, A2, f13	LFPDUX	B2,  BO2, INC4	FXCPMADD	f0,  B3, A3, f0	nop	FXCSMADD	f4,  B3, A3, f4	LFPDUX	A2,  AO2, INC4	FXCPMADD	f8,  B4, A3, f8	nop	FXCSMADD	f12, B4, A3, f12	LFPDUX	A3,   AO, INC4	FXCPMADD	f1,  B3, A4, f1	nop	FXCSMADD	f5,  B3, A4, f5	LFPDUX	B3,   BO, INC4	FXCPMADD	f9,  B4, A4, f9	nop	FXCSMADD	f13, B4, A4, f13	LFPDUX	B4,  BO2, INC4	FXCPMADD	f0,  B5, A5, f0	nop	FXCSMADD	f4,  B5, A5, f4	LFPDUX	A4,  AO2, INC4	FXCPMADD	f8,  B6, A5, f8	nop	FXCSMADD	f12, B6, A5, f12	LFPDUX	A5,   AO, INC4	FXCPMADD	f1,  B5, A6, f1	nop	FXCSMADD	f5,  B5, A6, f5	LFPDUX	B5,   BO, INC4	FXCPMADD	f9,  B6, A6, f9	nop	FXCSMADD	f13, B6, A6, f13	LFPDUX	B6,  BO2, INC4	FXCPMADD	f0,  A9,  A7, f0	nop	FXCSMADD	f4,  A9,  A7, f4	LFPDUX	A6,  AO2, INC4	FXCPMADD	f8,  A10, A7, f8	nop	FXCSMADD	f12, A10, A7, f12	LFPDUX	A7,   AO, INC4	FXCPMADD	f1,  A9,  A8, f1	nop	FXCSMADD	f5,  A9,  A8, f5	LFPDUX	A9,   BO, INC4	FXCPMADD	f9,  A10, A8, f9	nop	FXCSMADD	f13, A10, A8, f13	LFPDUX	A10, BO2, INC4	bdnz+	.L22	.align 4.L23:	FXCPMADD	f0,  B1, A1, f0	FXCSMADD	f4,  B1, A1, f4	LFPDUX	A8,  AO2, INC4	FXCPMADD	f8,  B2, A1, f8	FXCSMADD	f12, B2, A1, f12	FXCPMADD	f1,  B1, A2, f1	FXCSMADD	f5,  B1, A2, f5	FXCPMADD	f9,  B2, A2, f9	FXCSMADD	f13, B2, A2, f13	FXCPMADD	f0,  B3, A3, f0	FXCSMADD	f4,  B3, A3, f4	FXCPMADD	f8,  B4, A3, f8	FXCSMADD	f12, B4, A3, f12	FXCPMADD	f1,  B3, 
A4, f1	FXCSMADD	f5,  B3, A4, f5	FXCPMADD	f9,  B4, A4, f9	FXCSMADD	f13, B4, A4, f13	FXCPMADD	f0,  B5, A5, f0	FXCSMADD	f4,  B5, A5, f4	FXCPMADD	f8,  B6, A5, f8	FXCSMADD	f12, B6, A5, f12	FXCPMADD	f1,  B5, A6, f1	FXCSMADD	f5,  B5, A6, f5	FXCPMADD	f9,  B6, A6, f9	FXCSMADD	f13, B6, A6, f13	FXCPMADD	f0,  A9, A7, f0	FXCSMADD	f4,  A9, A7, f4	FXCPMADD	f8,  A10, A7, f8	FXCSMADD	f12, A10, A7, f12	FXCPMADD	f1,  A9, A8, f1	FXCSMADD	f5,  A9, A8, f5	FXCPMADD	f9,  A10, A8, f9	FXCSMADD	f13, A10, A8, f13	.align 4.L24:#if defined(LT) || defined(RN)	andi.	r0,  KK,  3	mtspr	CTR, r0	ble+	.L28#else	andi.	r0, TEMP, 3	mtspr	CTR, r0	ble+	.L28#endif	LFPDUX	A1,  AO,  INC4	LFPDUX	A2,  AO2, INC4	LFPDUX	B1,  BO,  INC4	LFPDUX	B2,  BO2, INC4	bdz-	.L27	.align 4.L26:	FXCPMADD	f0,  B1, A1, f0	FXCSMADD	f4,  B1, A1, f4	FXCPMADD	f8,  B2, A1, f8	FXCSMADD	f12, B2, A1, f12	LFPDUX	A1,  AO,  INC4	FXCPMADD	f1,  B1, A2, f1	FXCSMADD	f5,  B1, A2, f5	LFPDUX	B1,  BO,  INC4

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -