⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dot.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * dot.s -- dot product kernel: accumulates X[i] * Y[i] for i = 0 .. N-1.
 *
 * The routine is written against names that are not defined in this
 * file (PROLOGUE, PROFCODE, EPILOGUE, LL, LFD, LFDUX, FMADD, FADD, LDINT,
 * SIZE, BASE_SHIFT, SP, CTR).  They are presumably supplied by common.h;
 * the element type and the exported symbol name are decided there --
 * TODO confirm against common.h.
 *
 * Inputs (register aliases are defined just below):
 *   N    (r3)  element count; when N <= 0 no element is read
 *   X    (r4)  address of the first vector
 *   INCX (r5)  stride of X, scaled to bytes by a shift of BASE_SHIFT
 *   Y    (r6)  address of the second vector
 *   INCY (r7)  stride of Y, scaled to bytes by a shift of BASE_SHIFT
 *   When F_INTERFACE is defined, N, INCX and INCY arrive as addresses and
 *   are loaded through with LDINT before use.
 *
 * Output:
 *   The sum is left in f1 (the floating-point return register in the
 *   PowerPC calling conventions).
 *
 * Register roles:
 *   f0 - f7    eight independent partial sums (f0 doubles as FZERO)
 *   f8 - f15   elements of X
 *   f16 - f23  elements of Y
 *   r0         scratch (zero constant, loop counts, pointer adjustment)
 *   r8 (PREA)  byte offset used for the dcbt prefetch hints
 *   CTR        loop counter
 *   f14 - f23 are spilled to the local frame on entry and reloaded before
 *   returning; X, Y, INCX, INCY, r0, r8, CTR and cr0 are overwritten.
 *
 * NOTE(review): slwi, cmpwi, srawi and mullw are 32-bit operations.
 * Confirm that is intended if this kernel is ever built for a 64-bit
 * target, in particular for negative increments.
 */

#define ASSEMBLER
#include "common.h"

#define N	r3
#define X	r4
#define INCX	r5
#define Y	r6
#define INCY	r7
#define PREA	r8

#define FZERO	f0

/* Local frame: ten 8-byte FPR save slots at 0..72, scratch word at 80. */
#define STACKSIZE 96

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0,   0

	/* Save the FPRs f14-f23 that the unrolled loops use as operands. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)
	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)
	stfd	f22,   64(SP)
	stfd	f23,   72(SP)

	/* Build 0.0 in FZERO by bouncing an all-zero word through memory. */
	stw	r0,    80(SP)
	lfs	FZERO, 80(SP)

#ifdef F_INTERFACE
	/* Scalar arguments are passed by address: load the values. */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
	LDINT	INCY, 0(INCY)
#endif

	/* Scale the strides; they are compared against SIZE below, so     */
	/* BASE_SHIFT is presumably log2(SIZE) -- TODO confirm in common.h. */
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	/* Clear the remaining partial sums (f0 is already zero). */
	fmr	f1,  FZERO
	fmr	f2,  FZERO
	fmr	f3,  FZERO
	fmr	f4,  FZERO
	fmr	f5,  FZERO
	fmr	f6,  FZERO
	fmr	f7,  FZERO

	/* Prefetch offset: three 16-element blocks ahead of X and Y. */
	li	PREA, 3 * 16 * SIZE

	/* N <= 0: skip straight to the reduction, which then sums zeros. */
	cmpwi	cr0, N, 0
	ble-	cr0, LL(999)

	/* Anything other than unit stride on both vectors goes to LL(100). */
	cmpwi	cr0, INCX, SIZE
	bne	cr0, LL(100)
	cmpwi	cr0, INCY, SIZE
	bne	cr0, LL(100)

	/* ---- Unit-stride path ------------------------------------------ */
	/* CTR = N / 16 full blocks; none means only the tail loop runs.    */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(50)
	.align 4

	/* Preload elements 0..7 of the first block. */
	LFD	f8,     0 * SIZE(X)
	LFD	f9,     1 * SIZE(X)
	LFD	f10,    2 * SIZE(X)
	LFD	f11,    3 * SIZE(X)

	LFD	f16,    0 * SIZE(Y)
	LFD	f17,    1 * SIZE(Y)
	LFD	f18,    2 * SIZE(Y)
	LFD	f19,    3 * SIZE(Y)

	LFD	f12,    4 * SIZE(X)
	LFD	f13,    5 * SIZE(X)
	LFD	f14,    6 * SIZE(X)
	LFD	f15,    7 * SIZE(X)

	LFD	f20,    4 * SIZE(Y)
	LFD	f21,    5 * SIZE(Y)
	LFD	f22,    6 * SIZE(Y)
	LFD	f23,    7 * SIZE(Y)

	/* Exactly one block: skip the steady-state loop and drain it. */
	bdz	LL(20)
	.align 4

/* Steady state: consume the 8 preloaded pairs, load and consume        */
/* elements 8..15, and preload elements 16..23 (the first half of the   */
/* next block) before advancing X and Y by 16 elements.                 */
LL(10):
	FMADD	f0, f8,  f16, f0
	FMADD	f1, f9,  f17, f1
	FMADD	f2, f10, f18, f2
	FMADD	f3, f11, f19, f3

	LFD	f8,    8 * SIZE(X)
	LFD	f9,    9 * SIZE(X)
	LFD	f10,  10 * SIZE(X)
	LFD	f11,  11 * SIZE(X)

	LFD	f16,   8 * SIZE(Y)
	LFD	f17,   9 * SIZE(Y)
	LFD	f18,  10 * SIZE(Y)
	LFD	f19,  11 * SIZE(Y)

	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7

	LFD	f12,  12 * SIZE(X)
	LFD	f13,  13 * SIZE(X)
	LFD	f14,  14 * SIZE(X)
	LFD	f15,  15 * SIZE(X)

	LFD	f20,  12 * SIZE(Y)
	LFD	f21,  13 * SIZE(Y)
	LFD	f22,  14 * SIZE(Y)
	LFD	f23,  15 * SIZE(Y)

	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3

	LFD	f8,   16 * SIZE(X)
	LFD	f9,   17 * SIZE(X)
	LFD	f10,  18 * SIZE(X)
	LFD	f11,  19 * SIZE(X)

	LFD	f16,  16 * SIZE(Y)
	LFD	f17,  17 * SIZE(Y)
	LFD	f18,  18 * SIZE(Y)
	LFD	f19,  19 * SIZE(Y)

	FMADD	f4, f12, f20, f4
	FMADD	f5, f13, f21, f5
	FMADD	f6, f14, f22, f6
	FMADD	f7, f15, f23, f7

	LFD	f12,   20 * SIZE(X)
	LFD	f13,   21 * SIZE(X)
	LFD	f14,   22 * SIZE(X)
	LFD	f15,   23 * SIZE(X)

	LFD	f20,   20 * SIZE(Y)
	LFD	f21,   21 * SIZE(Y)
	LFD	f22,   22 * SIZE(Y)
	LFD	f23,   23 * SIZE(Y)

	/* Cache-touch hints at X + PREA and Y + PREA. */
	dcbt	X, PREA
	dcbt	Y, PREA

	addi	X, X, 16 * SIZE
	addi	Y, Y, 16 * SIZE
	bdnz	LL(10)
	.align 4

/* Drain: finish the last full block without preloading past it. */
LL(20):
	FMADD	f0, f8,  f16, f0
	FMADD	f1, f9,  f17, f1
	FMADD	f2, f10, f18, f2
	FMADD	f3, f11, f19, f3

	LFD	f8,    8 * SIZE(X)
	LFD	f9,    9 * SIZE(X)
	LFD	f10,  10 * SIZE(X)
	LFD	f11,  11 * SIZE(X)

	LFD	f16,   8 * SIZE(Y)
	LFD	f17,   9 * SIZE(Y)
	LFD	f18,  10 * SIZE(Y)
	LFD	f19,  11 * SIZE(Y)

	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7

	LFD	f12,  12 * SIZE(X)
	LFD	f13,  13 * SIZE(X)
	LFD	f14,  14 * SIZE(X)
	LFD	f15,  15 * SIZE(X)

	LFD	f20,  12 * SIZE(Y)
	LFD	f21,  13 * SIZE(Y)
	LFD	f22,  14 * SIZE(Y)
	LFD	f23,  15 * SIZE(Y)

	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3

	FMADD	f4, f12, f20, f4
	FMADD	f5, f13, f21, f5
	FMADD	f6, f14, f22, f6
	FMADD	f7, f15, f23, f7

	addi	X, X, 16 * SIZE
	addi	Y, Y, 16 * SIZE
	.align 4

/* Unit-stride tail: the remaining N mod 16 elements, one per pass. */
LL(50):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(60):
	LFD	f8,  0 * SIZE(X)
	LFD	f16, 0 * SIZE(Y)
	addi	X, X,  1 * SIZE
	addi	Y, Y,  1 * SIZE

	FMADD	f0, f8, f16,  f0
	bdnz	LL(60)
	b	LL(999)
	.align 4

/* ---- General-stride path ------------------------------------------- */
LL(100):
#ifdef F_INTERFACE
	/* For a negative stride, move the pointer by (N - 1) * stride so   */
	/* that stepping by the negative stride walks back to the original  */
	/* address.                                                         */
	cmpwi	cr0, INCX, 0
	bge+	LL(102)

	subi	r0, N, 1
	mullw	r0, r0, INCX
	sub	X, X, r0
	.align 4

LL(102):
	cmpwi	cr0, INCY, 0
	bge+	LL(104)

	subi	r0, N, 1
	mullw	r0, r0, INCY
	sub	Y, Y, r0
	.align 4

LL(104):
#endif
	/* Pre-bias both pointers by one stride: every LFDUX below adds the */
	/* stride to its base register as part of the load.                 */
	sub	X, X, INCX
	sub	Y, Y, INCY

	/* CTR = N / 16 full blocks; none means only the tail loop runs. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(150)

	/* Preload the first 8 pairs of the first block. */
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY

	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY

	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

	/* Exactly one block: skip the steady-state loop and drain it. */
	bdz	LL(120)
	.align 4

/* Steady state: 16 products per pass, loads interleaved with the        */
/* multiply-adds; the last 8 pairs loaded belong to the next block.      */
LL(110):
	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3

	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY

	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7

	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY

	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3

	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY

	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7

	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY

	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY
	bdnz	LL(110)
	.align 4

/* Drain: finish the last full block without loading past it. */
LL(120):
	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3

	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY

	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7

	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY

	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3

	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7
	.align 4

/* General-stride tail: the remaining N mod 16 elements, one per pass. */
LL(150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	FMADD	f0,  f8,  f16, f0
	bdnz	LL(160)
	.align 4

/* ---- Common exit ---------------------------------------------------- */
/* Pairwise reduction of the eight partial sums; the total lands in f1.  */
LL(999):
	FADD	f0,  f0,  f1
	FADD	f2,  f2,  f3
	FADD	f4,  f4,  f5
	FADD	f6,  f6,  f7

	FADD	f0,  f0,  f2
	FADD	f4,  f4,  f6
	FADD	f1,  f0,  f4

	/* Reload the saved FPRs and release the local frame. */
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)
	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)
	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -