📄 qgemv_n.s
字号:
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. *//*********************************************************************/#define ASSEMBLER#include "common.h"#define SP r12#define M r32#define N r33#ifndef XDOUBLE#define A r36#define LDA r37#define X r38#define INCX r39#define Y r34#define INCY r35#else#define A r38#define LDA r39#define X r34#define INCX r35#define Y r36#define INCY r37#endif#define BUFFER r11#define I r14#define J r15#define AO1 r16#define AO2 r17#define AO3 r18#define AO4 r19#define AO5 r20#define AO6 r21#define AO7 r22#define AO8 r23#define YLD1 r24#define YLD2 r25#define YST1 r26#define YST2 r27#define II r28#define YY r29#define ARLC r30#define PR r31 #define LDA7M8 r8#define PREA r9#define PREB r10#define ALPHA1 f8#define ALPHA2 f9#define ALPHA3 f10#define ALPHA4 f11#define ALPHA5 f12#define ALPHA6 f13#define ALPHA7 f14#define ALPHA8 f15#define RPREFETCHSIZE ( 8 * 1 + 6)#define WPREFETCHSIZE ( 8 * 1 + 6)#define RPREFETCH lfetch.nt1#define WPREFETCH lfetch.excl.nt1#define ALPHA f6 PROLOGUE .prologue PROFCODE { .mmi mov ARLC = ar.lc } ;; mov PR = pr adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP .body ;; #ifdef XDOUBLE ld8 X = [r14], 16 ld8 INCX = [r15], 16 ;;#endif ld8 Y = [r14], 16 ld8 INCY = [r15], 16 ;; ld8 BUFFER = [r14] ;; mov ALPHA = f8 cmp.ge p7, p0 = 0, M cmp.ge p6, p0 = 0, N ;; shladd INCX = INCX, BASE_SHIFT, r0 shladd LDA = LDA, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 ;; (p7) br.cond.dpnt .L999 (p6) br.cond.dpnt .L999 ;; sub I = A, Y mov YY = Y ;; cmp.eq p10, p0 = SIZE, INCY (p10) br.cond.dptk .L10 ;; shr J = M, 3 mov YY = BUFFER ;; (p8) adds YY = SIZE, BUFFER ;; mov ar.lc = J mov YST1 = YY adds YST2 = 4 * SIZE, YY ;;.L02: STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 5 * SIZE STFD [YST2] = f0, 5 * SIZE br.cloop.sptk.few .L02 ;;.L10: shr J = N, 3 ;; cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 ;; .align 16.L11: shladd LDA7M8 = LDA, 3, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 1 * SIZE, YY adds YST2 = 1 * SIZE, YY ;; LDFD ALPHA1 = [X], INCX ;; LDFD ALPHA2 = [X], INCX ;; LDFD ALPHA3 = [X], INCX ;; LDFD ALPHA4 = [X], INCX ;; LDFD ALPHA5 = [X], INCX ;; LDFD ALPHA6 = [X], INCX ;; LDFD ALPHA7 = [X], INCX ;; LDFD ALPHA8 = [X], INCX ;; FMPY ALPHA1 = ALPHA, ALPHA1 FMPY ALPHA2 = ALPHA, ALPHA2 FMPY ALPHA3 = ALPHA, ALPHA3 FMPY ALPHA4 = ALPHA, ALPHA4 FMPY ALPHA5 = ALPHA, ALPHA5 FMPY ALPHA6 = ALPHA, ALPHA6 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 3, A ;; shr I = M, 3 mov pr.rot= 0 ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I adds J = -1, J ;; adds PREB = (WPREFETCHSIZE) * SIZE, YY ;; cmp.lt p7, p8 = r0, J tbit.nz p13, p11 = M, 2 mov ar.ec= 2 ;; FMPY ALPHA7 = ALPHA, ALPHA7 ;; { .mfi and II = 7, M FMPY ALPHA8 = ALPHA, ALPHA8 mov ar.lc = I } { .mib cmp.eq p6, p0 = -1, I tbit.nz p14, p12 = M, 1 (p6) br.cond.dpnt .L15 } ;; .align 16.L12: { .mmf (p17) LDFD f93 = [AO5], LDA7M8 (p17) LDFD f94 = [AO6], LDA7M8 (p17) FMA f101 = ALPHA1, f33, f101 } { .mmf (p17) LDFD f95 = [AO7], LDA7M8 (p17) LDFD f96 = [AO8], LDA7M8 (p17) FMA f104 = ALPHA1, f34, f104 } ;; { .mmf (p16) LDFD f32 = [AO1] (p16) LDFD f33 = [AO2], LDA (p17) FMA f107 = ALPHA1, f35, f107 } { .mmf (p16) LDFD f34 = [AO3], LDA (p16) LDFD f35 = [AO4], LDA (p17) FMA f110 = ALPHA1, f36, f110 } ;; { .mmf (p16) LDFD f100 = [YLD1], 2 * SIZE (p16) LDFD f103 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA1, f37, f113 } { .mmf (p16) adds PREA = (RPREFETCHSIZE) * SIZE, AO1 (p16) add AO1 = AO1, LDA (p17) FMA f116 = ALPHA1, f38, f116 } ;; { .mmf (p18) STFD [YST1] = f102, 2 * SIZE (p18) STFD [YST2] = f105, 2 * SIZE (p17) FMA f119 = ALPHA1, f39, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA1, f40, f122 } ;; { .mmf (p16) LDFD f36 = [AO5], LDA (p16) LDFD f37 = [AO6], LDA (p17) FMA f101 = ALPHA2, f41, f101 } { .mmf (p16) LDFD f38 = [AO7], LDA (p16) LDFD f39 = [AO8], LDA (p17) FMA f104 = ALPHA2, f42, f104 } ;; { .mmf (p16) LDFD f40 = [AO1], LDA (p16) LDFD f41 = [AO2], LDA (p17) FMA f107 = ALPHA2, f43, f107 } { .mmf (p16) LDFD f42 = [AO3], LDA (p16) LDFD f43 = [AO4], LDA (p17) FMA f110 = ALPHA2, f44, f110 } ;; { .mmf (p16) LDFD f106 = [YLD1], 2 * SIZE (p16) LDFD f109 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA2, f45, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA2, f46, f116 } ;; { .mmf (p18) STFD [YST1] = f108, 2 * SIZE (p18) STFD [YST2] = f111, 2 * SIZE (p17) FMA f119 = ALPHA2, f47, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA2, f48, f122 } ;; { .mmf (p16) LDFD f44 = [AO5], LDA (p16) LDFD f45 = [AO6], LDA (p17) FMA f101 = ALPHA3, f49, f101 } { .mmf (p16) LDFD f46 = [AO7], LDA (p16) LDFD f47 = [AO8], LDA (p17) FMA f104 = ALPHA3, f50, f104 } ;; { .mmf (p16) LDFD f48 = [AO1], LDA (p16) LDFD f49 = [AO2], LDA (p17) FMA f107 = ALPHA3, f51, f107 } { .mmf (p16) LDFD f50 = [AO3], LDA (p16) LDFD f51 = [AO4], LDA (p17) FMA f110 = ALPHA3, f52, f110 } ;; { .mmf (p16) LDFD f112 = [YLD1], 2 * SIZE (p16) LDFD f115 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA3, f53, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA3, f54, f116 } ;; { .mmf (p18) STFD [YST1] = f114, 2 * SIZE (p18) STFD [YST2] = f117, 2 * SIZE (p17) FMA f119 = ALPHA3, f55, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA3, f56, f122 } ;; { .mmf (p16) LDFD f52 = [AO5], LDA (p16) LDFD f53 = [AO6], LDA (p17) FMA f101 = ALPHA4, f57, f101 } { .mmf (p16) LDFD f54 = [AO7], LDA (p16) LDFD f55 = [AO8], LDA (p17) FMA f104 = ALPHA4, f58, f104 } ;; { .mmf (p16) LDFD f56 = [AO1], LDA (p16) LDFD f57 = [AO2], LDA (p17) FMA f107 = ALPHA4, f59, f107 } { .mmf (p16) LDFD f58 = [AO3], LDA (p16) LDFD f59 = [AO4], LDA (p17) FMA f110 = ALPHA4, f60, f110 } ;; { .mmf (p16) LDFD f118 = [YLD1], 2 * SIZE (p16) LDFD f121 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA4, f61, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA4, f62, f116 } ;; { .mmf (p18) STFD [YST1] = f120, 2 * SIZE (p18) STFD [YST2] = f123, 2 * SIZE (p17) FMA f119 = ALPHA4, f63, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA4, f64, f122 } ;; { .mmf (p16) LDFD f60 = [AO5], LDA (p16) LDFD f61 = [AO6], LDA (p17) FMA f101 = ALPHA5, f65, f101 } { .mmf (p16) LDFD f62 = [AO7], LDA (p16) LDFD f63 = [AO8], LDA (p17) FMA f104 = ALPHA5, f66, f104 } ;; { .mmf (p16) LDFD f64 = [AO1], LDA (p16) LDFD f65 = [AO2], LDA (p17) FMA f107 = ALPHA5, f67, f107 } { .mmf (p16) LDFD f66 = [AO3], LDA (p16) LDFD f67 = [AO4], LDA (p17) FMA f110 = ALPHA5, f68, f110 } ;; { .mmf (p16) WPREFETCH [PREB], 8 * SIZE nop __LINE__ (p17) FMA f113 = ALPHA5, f69, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA5, f70, f116 } ;; { .mmf (p16) RPREFETCH [PREA] nop __LINE__ (p17) FMA f119 = ALPHA5, f71, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA5, f72, f122 } ;; { .mmf (p16) LDFD f68 = [AO5], LDA (p16) LDFD f69 = [AO6], LDA (p17) FMA f101 = ALPHA6, f73, f101 } { .mmf (p16) LDFD f70 = [AO7], LDA (p16) LDFD f71 = [AO8], LDA (p17) FMA f104 = ALPHA6, f74, f104 } ;; { .mmf (p16) LDFD f72 = [AO1], LDA (p16) LDFD f73 = [AO2], LDA (p17) FMA f107 = ALPHA6, f75, f107 } { .mmf (p16) LDFD f74 = [AO3], LDA (p16) LDFD f75 = [AO4], LDA (p17) FMA f110 = ALPHA6, f76, f110 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f113 = ALPHA6, f77, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA6, f78, f116 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f119 = ALPHA6, f79, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA6, f80, f122 } ;; { .mmf (p16) LDFD f76 = [AO5], LDA (p16) LDFD f77 = [AO6], LDA (p17) FMA f101 = ALPHA7, f81, f101 } { .mmf (p16) LDFD f78 = [AO7], LDA (p16) LDFD f79 = [AO8], LDA (p17) FMA f104 = ALPHA7, f82, f104 } ;; { .mmf (p16) LDFD f80 = [AO1], LDA (p16) LDFD f81 = [AO2], LDA (p17) FMA f107 = ALPHA7, f83, f107 } { .mmf (p16) LDFD f82 = [AO3], LDA (p16) LDFD f83 = [AO4], LDA (p17) FMA f110 = ALPHA7, f84, f110 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f113 = ALPHA7, f85, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA7, f86, f116 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f119 = ALPHA7, f87, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA7, f88, f122 } ;; { .mmf (p16) LDFD f84 = [AO5], LDA (p16) LDFD f85 = [AO6], LDA (p17) FMA f101 = ALPHA8, f89, f101 } { .mmf (p16) LDFD f86 = [AO7], LDA (p16) LDFD f87 = [AO8], LDA (p17) FMA f104 = ALPHA8, f90, f104 }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -