📄 trsm_kernel_lt.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Preprocessor preamble: symbolic names for the registers used by the
 * kernel below.  SIZE, BASE_SHIFT, LDFPD, FMA, PROLOGUE and PROFCODE are
 * not defined in this file; presumably they come from common.h
 * (TODO confirm).
 */
#define ASSEMBLER
#include "common.h"

/*
 * Prefetch look-ahead, in elements.  The kernel places
 *   PREA = AOFFSET + (PREFETCHSIZE + 8) * SIZE
 *   PREB = BOFFSET + (PREFETCHSIZE - 8) * SIZE
 * The two values below give the same byte distance if SIZE is 8 under
 * DOUBLE and 4 otherwise (assumption; SIZE is not defined here).
 */
#ifdef DOUBLE
#define PREFETCHSIZE (16 * 8)
#else
#define PREFETCHSIZE (32 * 8)
#endif

/* PREC starts at C1 + CPREFETCHSIZE * SIZE and then steps by LDC. */
#define CPREFETCHSIZE 7
/* Prefetch instruction applied to the C columns through PREC. */
#define CPREFETCH lfetch.excl.nt1

/*
 * Stacked input registers.  The alloc in the prologue requests 8 inputs;
 * r35 is given no alias in this file (presumably the slot of an argument
 * that is not read from a general register here; verify against the
 * caller).
 */
#define M r32			/* I = M >> 3 is derived from this */
#define N r33			/* J = N >> 3 is derived from this */
#define K r34			/* not referenced in the visible part of the kernel */
#define A r36			/* copied to AOFFSET */
#define B r37			/* copied to BOFFSET */
#define C r38			/* copied to C1, then advanced by 8 * LDC */
#define LDC r39			/* shifted left by BASE_SHIFT in the prologue */

/* Static scratch registers. */
#define I r15			/* M >> 3 */
#define J r16			/* N >> 3, decremented at .L010 */
#define AOFFSET r17		/* running pointer for post-incrementing loads from A */
#define BOFFSET r18		/* running pointer for post-incrementing loads from B */
#define TEMP r19		/* not referenced in the visible part of the kernel */
#define L r20			/* trip count: built from KK, written to ar.lc */

/* C1..C8 = C + 0..7 * LDC (LDC already scaled). */
#define C1 r21
#define C2 r22
#define C3 r23
#define C4 r24
#define C5 r25
#define C6 r26
#define C7 r27
#define C8 r28

/* C9..C16 = C1..C8 + 4 * SIZE; kept in stacked local registers. */
#define C9 loc0
#define C10 loc1
#define C11 loc2
#define C12 loc3
#define C13 loc4
#define C14 loc5
#define C15 loc6
#define C16 loc7

/*
 * Prefetch pointers.  r8 and r9 are also used under their raw names in
 * the prologue as store addresses for the stf.spill of f16-f21, before
 * they are given their PREA/PREB roles.
 */
#define PREA r8
#define PREB r9
#define PREC r10

#define SP r12			/* lowered by 6 * 16 bytes in the prologue */
#define ARLC r29		/* copy of ar.lc taken at entry */
#define PR r30			/* copy of pr taken at entry */
#define ARPFS r31		/* receives ar.pfs from alloc */

#define ALPHA f8		/* not referenced in the visible part of the kernel */

/* Remaining stacked locals (the alloc requests 16 locals in total). */
#define AORIG loc8		/* not referenced in the visible part of the kernel */
#define KK loc9			/* OFFSET under LT, 0 - OFFSET under RN; copied to L */
#define KK8 loc10		/* not referenced in the visible part of the kernel */
#define OFFSET loc11		/* ld8 from [entry SP + 16] */
#define AOFFSET2 loc12		/* AOFFSET + 4 * SIZE */
#define BOFFSET2 loc13		/* not referenced in the visible part of the kernel */

	PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -6 * 16, SP adds r9 = -5 * 16, SP adds SP = -6 * 16, SP } ;; { .mmi ld8 OFFSET = [r14] mov AOFFSET = A mov PR = pr } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 shr J = N, 3 } ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shladd LDC = LDC, BASE_SHIFT, r0 } ;; .body { .mmi stf.spill [r8] = f20 stf.spill [r9] = f21 cmp.ge p6, p0 = 0, J } { .mib nop __LINE__#ifdef RN sub KK = r0, OFFSET#else nop __LINE__#endif (p6) br.cond.dpnt .L050 } ;; .align 8.L010: { .mfi adds J = -1, J mov f64 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f72 = f0#ifdef LT mov KK = OFFSET#else nop __LINE__#endif } ;; { .mmf cmp.eq p6, p7 = 0, I mov AOFFSET = A mov f80 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f88 = f0 } ;; { .mmf shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc shladd C = LDC, 3, C // coffset += 8 * ldc mov f96 = f0 } { .mmf shladd 
C4 = LDC, 1, C2 shladd C6 = LDC, 2, C2 mov f104 = f0 } ;; { .mfi shladd C7 = LDC, 2, C3 mov f112 = f0 mov L = KK }{ .mfb shladd C8 = LDC, 2, C4 mov f120 = f0 (p6) br.cond.dpnt .L020 } ;; .align 16.L011: { .mmf cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B mov f65 = f0 } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f81 = f0 } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f119 = r0 mov f89 = f0 } { .mmf (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f113 = r0 mov f121 = f0 } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 adds L = 1, L } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds AOFFSET2 = 4 * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 adds L = -1, L } ;; { .mmf CPREFETCH 
[PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfb cmp.eq p6, p0 = -1, L mov f127 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16.L012:/* 1 */ { .mfi lfetch.fault.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;;/* 2 */ { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;;/* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;;/* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb adds C10 = 4 * SIZE, C2 FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;;/* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb adds C11 = 4 * SIZE, C3 FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;;/* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb adds C12 = 4 * SIZE, C4 FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;;/* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb adds C13 = 4 * SIZE, C5 FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;;/* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb adds C14 = 4 * SIZE, C6 FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;;/* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb adds C15 = 4 * SIZE, C7 FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;;/* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * 
SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb adds C16 = 4 * SIZE, C8 FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;;/* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;;/* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;;/* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;;/* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;;/* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;;/* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;;/* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;;/* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;;/* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;;/* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;;/* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;;/* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA 
f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;;/* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;;/* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;;/* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;;/* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;;/* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;;/* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;;/* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;;/* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;;/* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;;/* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;;/* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;;/* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, 
f104 // A1 * B6 nop __LINE__ } ;;/* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;;/* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;;/* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;;/* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -