⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_ncopy_hummer_8.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h"	#define	M	r3#define	N	r4#define	A	r5#define	LDA	r6#define B	r7#define AO1	r8#define AO2	r9#define AO3	r10#define AO4	r11#define J	r12#define AO5	r26#define AO6	r27#define AO7	r28#define AO8	r29#define INC	r30#define INC2	r31#define c01	f0#define c02	f1#define c03	f2#define c04	f3#define c05	f4#define c06	f5#define c07	f6#define c08	f7#define c09	f8#define c10	f9#define c11	f10#define c12	f11#define c13	f12#define c14	f13#define c15	f14#define c16	f15#define c17	f16#define c18	f17#define c19	f18#define c20	f19#define c21	f20#define c22	f21#define c23	f22#define c24	f23#define c25	f24#define c26	f25#define c27	f26#define c28	f27#define c29	f28#define c30	f29#define c31	f30#define c32	f31#define	sel_p	f30#define	sel_s	f31	PROLOGUE	PROFCODE	li	r0, -16	stfpdux	f14, SP, r0	stfpdux	f15, SP, r0	stfpdux	f16, SP, r0	stfpdux	f17, SP, r0	stfpdux	f18, SP, r0	stfpdux	f19, SP, r0	stfpdux	f20, SP, r0	stfpdux	f21, SP, r0	stfpdux	f22, SP, r0	stfpdux	f23, SP, r0	stfpdux	f24, SP, r0	stfpdux	f25, SP, r0	stfpdux	f26, SP, r0	stfpdux	f27, SP, r0	stfpdux	f28, SP, r0	stfpdux	f29, SP, r0	stfpdux	f30, SP, r0	stfpdux	f31, SP, r0		stwu	r31,  -4(SP)	stwu	r30,  -4(SP)	stwu	r29,  -4(SP)	stwu	r28,  -4(SP)	stwu	r27,  -4(SP)	stwu	r26,  -4(SP)	lis	r9,   0x3f80	lis	r10,  0xbf80	stwu	r9,    -4(SP)	stwu	r10,   -4(SP)	stwu	r10,   -4(SP)	stwu	r9,    -4(SP)	slwi	LDA, LDA, BASE_SHIFT	li	r0, 0	lfpsux	sel_p, SP, r0	li	r0, 8	lfpsux	sel_s, SP, r0	cmpwi	cr0, M, 0	ble-	.L999	cmpwi	cr0, N, 0	ble-	.L999	li	INC,  1 * SIZE	li	INC2, 2 * SIZE	subi	B, B, 2 * SIZE	andi.	r0, A,   2 * SIZE - 1	bne	.L100	andi.	r0, LDA, 2 * SIZE - 1	bne	.L100	subi	A, A, 2 * SIZE	srawi.	J,  N,  3	ble	.L20	.align 4.L11:	mr	AO1, A	add	AO2, A,   LDA	add	AO3, AO2, LDA	add	AO4, AO3, LDA	add	AO5, AO4, LDA	add	AO6, AO5, LDA	add	AO7, AO6, LDA	add	AO8, AO7, LDA	add	A,   AO8, LDA	srawi.	
r0,  M,  2	mtspr	CTR, r0	ble	.L15	.align 4.L12:	LFPDUX	c01,   AO1, INC2	LFXDUX	c02,   AO2, INC2	LFPDUX	c03,   AO3, INC2	LFXDUX	c04,   AO4, INC2	LFPDUX	c05,   AO5, INC2	LFXDUX	c06,   AO6, INC2	LFPDUX	c07,   AO7, INC2	LFXDUX	c08,   AO8, INC2	LFPDUX	c09,   AO1, INC2	LFXDUX	c10,   AO2, INC2	LFPDUX	c11,   AO3, INC2	LFXDUX	c12,   AO4, INC2	fpsel	c17, sel_p, c01, c02	LFPDUX	c13,   AO5, INC2	fpsel	c18, sel_p, c03, c04	LFXDUX	c14,   AO6, INC2	fpsel	c19, sel_p, c05, c06	LFPDUX	c15,   AO7, INC2	fpsel	c20, sel_p, c07, c08	LFXDUX	c16,   AO8, INC2	fpsel	c21, sel_s, c01, c02	fpsel	c22, sel_s, c03, c04	STFPDUX	c17,   B, INC2	fpsel	c23, sel_s, c05, c06	STFPDUX	c18,   B, INC2	fpsel	c24, sel_s, c07, c08	STFPDUX	c19,   B, INC2	fpsel	c01, sel_p, c09, c10	STFPDUX	c20,   B, INC2	fpsel	c02, sel_p, c11, c12	STFXDUX	c21,   B, INC2	fpsel	c03, sel_p, c13, c14	STFXDUX	c22,   B, INC2	fpsel	c04, sel_p, c15, c16	STFXDUX	c23,   B, INC2	fpsel	c05, sel_s, c09, c10	STFXDUX	c24,   B, INC2	fpsel	c06, sel_s, c11, c12	STFPDUX	c01,   B, INC2	fpsel	c07, sel_s, c13, c14	STFPDUX	c02,   B, INC2	fpsel	c08, sel_s, c15, c16	STFPDUX	c03,   B, INC2	STFPDUX	c04,   B, INC2	STFXDUX	c05,   B, INC2	STFXDUX	c06,   B, INC2	STFXDUX	c07,   B, INC2	STFXDUX	c08,   B, INC2	bdnz	.L12	.align 4	.L15:	andi.	r0,  M,  3	ble	.L19	andi.	r0,  M,  2	beq	.L17	LFPDUX	c01,   AO1, INC2	LFXDUX	c02,   AO2, INC2	LFPDUX	c03,   AO3, INC2	LFXDUX	c04,   AO4, INC2	LFPDUX	c05,   AO5, INC2	fpsel	c09, sel_p, c01, c02	LFXDUX	c06,   AO6, INC2	fpsel	c10, sel_p, c03, c04	LFPDUX	c07,   AO7, INC2	fpsel	c11, sel_p, c05, c06	LFXDUX	c08,   AO8, INC2	fpsel	c12, sel_p, c07, c08	fpsel	c13, sel_s, c01, c02	fpsel	c14, sel_s, c03, c04	STFPDUX	c09,   B, INC2	fpsel	c15, sel_s, c05, c06	STFPDUX	c10,   B, INC2	fpsel	c16, sel_s, c07, c08	STFPDUX	c11,   B, INC2	STFPDUX	c12,   B, INC2	STFXDUX	c13,   B, INC2	STFXDUX	c14,   B, INC2	STFXDUX	c15,   B, INC2	STFXDUX	c16,   B, INC2	.align 4.L17:	andi.	
r0,  M,  1	beq	.L19	LFDUX	c01,   AO1, INC2	LFDUX	c02,   AO3, INC2	LFDUX	c03,   AO5, INC2	LFDUX	c04,   AO7, INC2	LFSDUX	c01,   AO2, INC2	LFSDUX	c02,   AO4, INC2	LFSDUX	c03,   AO6, INC2	LFSDUX	c04,   AO8, INC2	STFPDUX	c01,   B, INC2	STFPDUX	c02,   B, INC2	STFPDUX	c03,   B, INC2	STFPDUX	c04,   B, INC2	.align 4.L19:	addic.	J, J, -1	bgt	.L11	.align 4.L20:	andi.	J,  N,  4	ble	.L30	.align 4.L21:	mr	AO1, A	add	AO2, A,   LDA	add	AO3, AO2, LDA	add	AO4, AO3, LDA	add	A,   AO4, LDA	srawi.	r0,  M,  3	mtspr	CTR, r0	ble	.L25	.align 4.L22:	LFPDUX	c01,   AO1, INC2	LFXDUX	c02,   AO2, INC2	LFPDUX	c03,   AO3, INC2	LFXDUX	c04,   AO4, INC2	LFPDUX	c05,   AO1, INC2	LFXDUX	c06,   AO2, INC2	LFPDUX	c07,   AO3, INC2	LFXDUX	c08,   AO4, INC2	LFPDUX	c09,   AO1, INC2	LFXDUX	c10,   AO2, INC2	LFPDUX	c11,   AO3, INC2	LFXDUX	c12,   AO4, INC2	fpsel	c17, sel_p, c01, c02	LFPDUX	c13,   AO1, INC2	fpsel	c18, sel_p, c03, c04	LFXDUX	c14,   AO2, INC2	fpsel	c19, sel_s, c01, c02	LFPDUX	c15,   AO3, INC2	fpsel	c20, sel_s, c03, c04	LFXDUX	c16,   AO4, INC2	fpsel	c21, sel_p, c05, c06	fpsel	c22, sel_p, c07, c08	STFPDUX	c17,   B, INC2	fpsel	c23, sel_s, c05, c06	STFPDUX	c18,   B, INC2	fpsel	c24, sel_s, c07, c08	STFXDUX	c19,   B, INC2	fpsel	c01, sel_p, c09, c10	STFXDUX	c20,   B, INC2	fpsel	c02, sel_p, c11, c12	STFPDUX	c21,   B, INC2	fpsel	c03, sel_s, c09, c10	STFPDUX	c22,   B, INC2	fpsel	c04, sel_s, c11, c12	STFXDUX	c23,   B, INC2	fpsel	c05, sel_p, c13, c14	STFXDUX	c24,   B, INC2	fpsel	c06, sel_p, c15, c16	STFPDUX	c01,   B, INC2	fpsel	c07, sel_s, c13, c14	STFPDUX	c02,   B, INC2	fpsel	c08, sel_s, c15, c16	STFXDUX	c03,   B, INC2	STFXDUX	c04,   B, INC2	STFPDUX	c05,   B, INC2	STFPDUX	c06,   B, INC2	STFXDUX	c07,   B, INC2	STFXDUX	c08,   B, INC2	bdnz	.L22	.align 4	.L25:	andi.	r0,  M,  7	ble	.L30	andi.	
r0,  M,  4	beq	.L26	LFPDUX	c01,   AO1, INC2	LFXDUX	c02,   AO2, INC2	LFPDUX	c03,   AO3, INC2	LFXDUX	c04,   AO4, INC2	LFPDUX	c05,   AO1, INC2	fpsel	c09, sel_p, c01, c02	LFXDUX	c06,   AO2, INC2	fpsel	c10, sel_p, c03, c04	LFPDUX	c07,   AO3, INC2	fpsel	c11, sel_s, c01, c02	LFXDUX	c08,   AO4, INC2	fpsel	c12, sel_s, c03, c04	fpsel	c13, sel_p, c05, c06	fpsel	c14, sel_p, c07, c08	STFPDUX	c09,   B, INC2	fpsel	c15, sel_s, c05, c06	STFPDUX	c10,   B, INC2	fpsel	c16, sel_s, c07, c08	STFXDUX	c11,   B, INC2	STFXDUX	c12,   B, INC2	STFPDUX	c13,   B, INC2	STFPDUX	c14,   B, INC2	STFXDUX	c15,   B, INC2	STFXDUX	c16,   B, INC2	.align 4.L26:	andi.	r0,  M,  2	beq	.L27	LFPDUX	c01,   AO1, INC2	LFXDUX	c02,   AO2, INC2	LFPDUX	c03,   AO3, INC2	LFXDUX	c04,   AO4, INC2	fpsel	c05, sel_p, c01, c02	fpsel	c06, sel_p, c03, c04	fpsel	c07, sel_s, c01, c02	fpsel	c08, sel_s, c03, c04	STFPDUX	c05,   B, INC2	STFPDUX	c06,   B, INC2	STFXDUX	c07,   B, INC2	STFXDUX	c08,   B, INC2	.align 4.L27:	andi.	r0,  M,  1	beq	.L30	LFDUX	c01,   AO1, INC2	LFDUX	c02,   AO2, INC2	LFDUX	c03,   AO3, INC2	LFDUX	c04,   AO4, INC2	fsmfp	c01, c02	fsmfp	c03, c04		STFPDUX	c01,   B, INC2	STFPDUX	c03,   B, INC2	.align 4	.L30:	andi.	J,  N,  2	ble	.L40	mr	AO1, A	add	AO2, A,   LDA	add	A,   AO2, LDA	srawi.	r0,  M,  3	mtspr	CTR, r0	ble	.L35	.align 4.L32:	LFPDUX	c01,   AO1, INC2	LFXDUX	c05,   AO2, INC2	LFPDUX	c02,   AO1, INC2	LFXDUX	c06,   AO2, INC2	LFPDUX	c03,   AO1, INC2	fpsel	c09, sel_p, c01, c05	LFXDUX	c07,   AO2, INC2	fpsel	c10, sel_s, c01, c05	LFPDUX	c04,   AO1, INC2	fpsel	c11, sel_p, c02, c06	LFXDUX	c08,   AO2, INC2	fpsel	c12, sel_s, c02, c06	fpsel	c13, sel_p, c03, c07	fpsel	c14, sel_s, c03, c07	STFPDUX	c09,   B, INC2	fpsel	c15, sel_p, c04, c08	STFXDUX	c10,   B, INC2	fpsel	c16, sel_s, c04, c08	STFPDUX	c11,   B, INC2	STFXDUX	c12,   B, INC2	STFPDUX	c13,   B, INC2	STFXDUX	c14,   B, INC2	STFPDUX	c15,   B, INC2	STFXDUX	c16,   B, INC2	bdnz	.L32	.align 4	.L35:	andi.	r0,  M,  7	ble	.L40	andi.	
r0,  M,  4	beq	.L36	LFPDUX	c01,   AO1, INC2	LFXDUX	c03,   AO2, INC2	LFPDUX	c02,   AO1, INC2	LFXDUX	c04,   AO2, INC2	fpsel	c05, sel_p, c01, c03	fpsel	c06, sel_s, c01, c03	fpsel	c07, sel_p, c02, c04	fpsel	c08, sel_s, c02, c04	STFPDUX	c05,   B, INC2	STFXDUX	c06,   B, INC2	STFPDUX	c07,   B, INC2	STFXDUX	c08,   B, INC2	.align 4.L36:	andi.	r0,  M,  2	beq	.L37	LFPDUX	c01,   AO1, INC2	LFXDUX	c02,   AO2, INC2	fpsel	c03, sel_p, c01, c02	fpsel	c04, sel_s, c01, c02	STFPDUX	c03,   B, INC2	STFXDUX	c04,   B, INC2	.align 4.L37:	andi.	r0,  M,  1	beq	.L40	LFDUX	c01,   AO1, INC2	LFDUX	c02,   AO2, INC2	fsmfp	c01, c02	STFPDUX	c01,   B, INC2	.align 4.L40:	andi.	J,  N,  1	ble	.L999	mr	AO1, A	srawi.	r0,  M,  3	mtspr	CTR, r0	ble	.L45	.align 4.L42:	LFPDUX	c01,   AO1, INC2	LFPDUX	c02,   AO1, INC2	LFPDUX	c03,   AO1, INC2	LFPDUX	c04,   AO1, INC2	STFPDUX	c01,   B, INC2	STFPDUX	c02,   B, INC2	STFPDUX	c03,   B, INC2	STFPDUX	c04,   B, INC2	bdnz	.L42	.align 4	.L45:	andi.	r0,  M,  7	ble	.L999	andi.	r0,  M,  4	beq	.L46	LFPDUX	c01,   AO1, INC2	LFPDUX	c02,   AO1, INC2	STFPDUX	c01,   B, INC2	STFPDUX	c02,   B, INC2	.align 4.L46:	andi.	r0,  M,  2	beq	.L47	LFPDUX	c01,   AO1, INC2	STFPDUX	c01,   B, INC2	.align 4.L47:	andi.	r0,  M,  1	beq	.L999	LFDX	c01,   AO1, INC2	STFDX	c01,   B,  INC2	b	.L999	.align 4.L100:	subi	A, A, 1 * SIZE	srawi.	J,  N,  3	ble	.L120	.align 4.L111:	mr	AO1, A	add	AO2, A,   LDA	add	AO3, AO2, LDA	add	AO4, AO3, LDA	add	AO5, AO4, LDA	add	AO6, AO5, LDA	add	AO7, AO6, LDA	add	AO8, AO7, LDA	add	A,   AO8, LDA	srawi.	r0,  M,  3	mtspr	CTR, r0	ble	.L115	.align 4

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -