atl_sjik48x48x48tn48x48x0_a1_b0.asm

来自「基于Blas CLapck的.用过的人知道是干啥的」· 汇编 代码 · 共 1,351 行 · 第 1/2 页

ASM
1,351
字号
;              Automatically Tuned Linear Algebra Software v3.8.0;                       (C) Copyright 2001 Julian Ruhe;;  Redistribution and use in source and binary forms, with or without;  modification, are permitted provided that the following conditions;  are met:;    1. Redistributions of source code must retain the above copyright;       notice, this list of conditions and the following disclaimer.;    2. Redistributions in binary form must reproduce the above copyright;       notice, this list of conditions, and the following disclaimer in the;       documentation and/or other materials provided with the distribution.;    3. The name of the ATLAS group or the names of its contributers may;       not be used to endorse or promote products derived from this;       software without specific written permission.;;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED;  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR;  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS;  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR;  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF;  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS;  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN;  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE);  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE;  POSSIBILITY OF SUCH DAMAGE.;;;	ATL_sJIK48x48x48TN48x48x0_a1_b0.asm;;	ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon;	Code author: Julian Ruhe (ruheejih@linux.zrz.tu-berlin.de | Julian.Ruhe@t-online.de);;	void ATL_sJIK48x48x48TN48x48x0_a1_b0(const int M, const int N, const int K, const float alpha,;						const float *A, const int lda, const float *B, const int ldb,;						const float beta, float *C, const int ldc);;	Compile with "nasmw -f win32 -DWIN32 ATL_sJIK48x48x48TN48x48x0_a1_b0.asm" (Windows);	Compile with "nasm -f elf -DELF ATL_sJIK48x48x48TN48x48x0_a1_b0.asm" (LINUX);;	See config file (ATL_sJIK48x48x48TN48x48x0_a1.cfg) for important macro definitions;%include "ATL_sJIK48x48x48TN48x48x0_a1.cfg"%include "ATL_sJIK48x48x48TN48x48x0_a1.mcr"%ifdef WIN32global _ATL_sJIK48x48x48TN48x48x0_a1_b0section .text_ATL_sJIK48x48x48TN48x48x0_a1_b0:%endif%ifdef ELFglobal ATL_sJIK48x48x48TN48x48x0_a1_b0section .textATL_sJIK48x48x48TN48x48x0_a1_b0:%endif	push ebp	mov ebp,esp	push ebx	push esi	push edi	femms	mov eax,0				;temporary variable t1	push eax				;t1->stack	mov eax,[ebp+24]			;&A->eax	add eax,NB*NB*4				;&(A+1)->eax	mov ebx,[ebp+32]			;&B->ebx	sub eax,ebx				;calculate offset	push eax				;&A+1+offset->stack	mov eax,[ebp+48]			;ldc->eax	lea eax,[4*eax]	push eax				;8*ldc->stack	mov eax,NB	push eax				;loop counter->stack	mov eax,[ebp+24]			;&A->eax	mov ebx,[ebp+32]			;&B->ebx	mov ecx,[ebp+44]			;&C->ecx	add ecx,byte 30*4			;calculate offsets	add ebx,byte 30*4	add eax,5*NB*4	push eax				;&A+offset->stack	push ebp				;ebp->stack	mov edi,-1*NB*4				;calculate offsets for dot products	mov esi,-3*NB*4	mov ebp,-5*NB*4	mov edx,6*NB*4-30*4			;offset for the next 6 dot products						;stack dump						;						;[esp+20]:	t1 (temp)						;[esp+16]:	&(A+1)+offset						;[esp+12]:	ldc*4						;[esp+08]:	loop counter						;[esp+04]:	&A+offset						;[esp+00]:	ebp	align 16loopj_	fld dword [ebx+ELM1]			;01+1	fld dword [eax+DOTP2]	fmul st0,st1	fld dword [eax+DOTP3]	fmul st0,st2	fld dword [eax+DOTP1]	rep	fmul st0,st3	fxch st0,st3	fld dword [eax+DOTP5]	rep	fmul st0,st1	rep	fld dword [eax+DOTP6]	mov edx,edx	fmul st0,st2	fld dword [ebx+ELM2]	fld dword [eax+DOTP4]	rep	fmulp st4,st0	add eax,byte 30*4	mov edx,edx	OPERATION 2,3				;02+1	OPERATION 3,4				;03+1	OPERATION 4,5				;04+1	OPERATION 5,6				;05+1	OPERATION 6,7				;06+1	OPERATION 7,8				;07+1	OPERATION 8,9				;08+1	OPERATION 9,10				;09+1	OPERATION 10,11				;10+1	OPERATION 11,12				;11+1	OPERATION 12,13				;12+1	OPERATION 13,14				;13+1	OPERATION 14,15				;14+1	OPERATION 15,16				;15+1	OPERATION 16,17				;16+1	OPERATION 17,18				;17+1	OPERATION 18,19				;18+1	OPERATION 19,20				;19+1	OPERATION 20,21				;20+1	OPERATION 21,22				;21+1	OPERATION 22,23				;22+1	OPERATION 23,24				;23+1	OPERATION 24,25				;24+1	OPERATION 25,26				;25+1	OPERATION 26,27				;26+1	OPERATION 27,28				;27+1	OPERATION 28,29				;28+1	OPERATION 29,30				;29+1	OPERATION 30,31				;30+1	OPERATION 31,32				;31+1	OPERATION 32,33				;32+1	OPERATION 33,34				;33+1	OPERATION 34,35				;34+1	OPERATION 35,36				;35+1	OPERATION 36,37				;36+1	OPERATION 37,38				;37+1	OPERATION 38,39				;38+1	OPERATION 39,40				;39+1	OPERATION 40,41				;40+1	OPERATION 41,42				;41+1	OPERATION 42,43				;42+1	OPERATION 43,44				;43+1	OPERATION 44,45				;44+1	OPERATION 45,46				;45+1	OPERATION 46,47				;45+1	OPERATION 47,48				;45+1	fld dword [eax+DOTP1+ELM48]		;48+1	fmul st0,st1	faddp st7	fld dword [eax+DOTP2+ELM48]	fmul st0,st1	faddp st6	fld dword [eax+DOTP3+ELM48]	fmul st0,st1	faddp st5	fld dword [eax+DOTP4+ELM48]	fmul st0,st1	faddp st4	fld dword [eax+DOTP5+ELM48]	fmul st0,st1	faddp st3	rep	fmul dword [eax+DOTP6+ELM48]	faddp st1	fxch st5	%ifdef PREC_DST7		mov [esp+20],ecx		add ecx,[esp+12]		prefetchw [ecx-2*64]		prefetchw [ecx-1*64]		prefetchw [ecx+0*64]		prefetchw [ecx+1*64]		nop		prefetchw [ecx+2*64-1]		mov ecx,[esp+20]	%endif	%ifdef PREB_DST7		prefetch [ebx+48*4-2*64]		fnop		mov edx,edx		prefetch [ebx+48*4-1*64]		nop		prefetch [ebx+48*4+0*64]		nop		prefetch [ebx+48*4+1*64]		nop		prefetch [ebx+48*4+2*64]		nop	%endif	fstp dword [ecx+ELM1]	fxch st3	fstp dword [ecx+ELM2]	fxch st1	fstp dword [ecx+ELM3]	fstp dword [ecx+ELM4]	fstp dword [ecx+ELM5]	fstp dword [ecx+ELM6]	add eax,edx	fld dword [ebx+ELM1]			;01+2	fld dword [eax+DOTP2]	fmul st0,st1	fld dword [eax+DOTP3]	fmul st0,st2	fld dword [eax+DOTP1]	rep	fmul st0,st3	fxch st0,st3	fld dword [eax+DOTP5]	rep	fmul st0,st1	rep	fld dword [eax+DOTP6]	mov edx,edx	fmul st0,st2	fld dword [ebx+ELM2]	fld dword [eax+DOTP4]	rep	fmulp st4,st0	add eax,byte 30*4	mov edx,edx	OPERATION 2,3				;02+2	OPERATION 3,4				;03+2	OPERATION 4,5				;04+2	OPERATION 5,6				;05+2	OPERATION 6,7				;06+2	OPERATION 7,8				;07+2	OPERATION 8,9				;08+2	OPERATION 9,10				;09+2	OPERATION 10,11				;10+2	OPERATION 11,12				;11+2	OPERATION 12,13				;12+2	OPERATION 13,14				;13+2	OPERATION 14,15				;14+2	OPERATION 15,16				;15+2	OPERATION 16,17				;16+2	OPERATION 17,18				;17+2	OPERATION 18,19				;18+2	OPERATION 19,20				;19+2	OPERATION 20,21				;20+2	OPERATION 21,22				;21+2	OPERATION 22,23				;22+2	OPERATION 23,24				;23+2	OPERATION 24,25				;24+2	OPERATION 25,26				;25+2	OPERATION 26,27				;26+2	OPERATION 27,28				;27+2	OPERATION 28,29				;28+2	OPERATION 29,30				;29+2	OPERATION 30,31				;30+2	OPERATION 31,32				;31+2	OPERATION 32,33				;32+2	OPERATION 33,34				;33+2	OPERATION 34,35				;34+2	OPERATION 35,36				;35+2	OPERATION 36,37				;36+2	OPERATION 37,38				;37+2	OPERATION 38,39				;38+2	OPERATION 39,40				;39+2	OPERATION 40,41				;40+2	OPERATION 41,42				;41+2	OPERATION 42,43				;42+2	OPERATION 43,44				;43+2	OPERATION 44,45				;44+2	OPERATION 45,46				;45+2	OPERATION 46,47				;45+2	OPERATION 47,48				;45+2	fld dword [eax+DOTP1+ELM48]		;48+2	fmul st0,st1	faddp st7	fld dword [eax+DOTP2+ELM48]	fmul st0,st1	faddp st6	fld dword [eax+DOTP3+ELM48]	fmul st0,st1	faddp st5	fld dword [eax+DOTP4+ELM48]	fmul st0,st1	faddp st4	fld dword [eax+DOTP5+ELM48]	fmul st0,st1	faddp st3	rep	fmul dword [eax+DOTP6+ELM48]	faddp st1	fxch st5	%ifdef PREC_DST6		mov [esp+20],ecx		add ecx,[esp+12]		prefetchw [ecx-2*64]		prefetchw [ecx-1*64]		prefetchw [ecx+0*64]		prefetchw [ecx+1*64]		nop		prefetchw [ecx+2*64-1]		mov ecx,[esp+20]	%endif	%ifdef PREB_DST6		prefetch [ebx+48*4-2*64]		fnop		mov edx,edx		prefetch [ebx+48*4-1*64]		nop		prefetch [ebx+48*4+0*64]		nop		prefetch [ebx+48*4+1*64]		nop		prefetch [ebx+48*4+2*64]		nop	%endif	fstp dword [ecx+ELM7]	fxch st3	fstp dword [ecx+ELM8]	fxch st1	fstp dword [ecx+ELM9]	fstp dword [ecx+ELM10]	fstp dword [ecx+ELM11]	fstp dword [ecx+ELM12]	add eax,edx	fld dword [ebx+ELM1]			;01+3	fld dword [eax+DOTP2]	fmul st0,st1	fld dword [eax+DOTP3]	fmul st0,st2	fld dword [eax+DOTP1]	rep	fmul st0,st3	fxch st0,st3	fld dword [eax+DOTP5]	rep	fmul st0,st1	rep	fld dword [eax+DOTP6]	mov edx,edx	fmul st0,st2	fld dword [ebx+ELM2]	fld dword [eax+DOTP4]	rep	fmulp st4,st0	add eax,byte 30*4	mov edx,edx	OPERATION 2,3				;02+3	OPERATION 3,4				;03+3	OPERATION 4,5				;04+3	OPERATION 5,6				;05+3	OPERATION 6,7				;06+3	OPERATION 7,8				;07+3	OPERATION 8,9				;08+3	OPERATION 9,10				;09+3	OPERATION 10,11				;10+3	OPERATION 11,12				;11+3	OPERATION 12,13				;12+3	OPERATION 13,14				;13+3	OPERATION 14,15				;14+3	OPERATION 15,16				;15+3	OPERATION 16,17				;16+3	OPERATION 17,18				;17+3	OPERATION 18,19				;18+3	OPERATION 19,20				;19+3	OPERATION 20,21				;20+3	OPERATION 21,22				;21+3	OPERATION 22,23				;22+3	OPERATION 23,24				;23+3	OPERATION 24,25				;24+3	OPERATION 25,26				;25+3	OPERATION 26,27				;26+3	OPERATION 27,28				;27+3	OPERATION 28,29				;28+3	OPERATION 29,30				;29+3	OPERATION 30,31				;30+3	OPERATION 31,32				;31+3	OPERATION 32,33				;32+3	OPERATION 33,34				;33+3	OPERATION 34,35				;34+3	OPERATION 35,36				;35+3	OPERATION 36,37				;36+3	OPERATION 37,38				;37+3	OPERATION 38,39				;38+3	OPERATION 39,40				;39+3	OPERATION 40,41				;40+3	OPERATION 41,42				;41+3	OPERATION 42,43				;42+3	OPERATION 43,44				;43+3	OPERATION 44,45				;44+3	OPERATION 45,46				;45+3	OPERATION 46,47				;45+3	OPERATION 47,48				;45+3	fld dword [eax+DOTP1+ELM48]		;48+3	fmul st0,st1	faddp st7	fld dword [eax+DOTP2+ELM48]	fmul st0,st1	faddp st6	fld dword [eax+DOTP3+ELM48]	fmul st0,st1	faddp st5	fld dword [eax+DOTP4+ELM48]	fmul st0,st1	faddp st4	fld dword [eax+DOTP5+ELM48]	fmul st0,st1	faddp st3	rep	fmul dword [eax+DOTP6+ELM48]	faddp st1	fxch st5	%ifdef PREC_DST5		mov [esp+20],ecx		add ecx,[esp+12]		prefetchw [ecx-2*64]		prefetchw [ecx-1*64]		prefetchw [ecx+0*64]		prefetchw [ecx+1*64]		nop		prefetchw [ecx+2*64-1]		mov ecx,[esp+20]	%endif	%ifdef PREB_DST5		prefetch [ebx+48*4-2*64]		fnop		mov edx,edx		prefetch [ebx+48*4-1*64]		nop		prefetch [ebx+48*4+0*64]		nop		prefetch [ebx+48*4+1*64]		nop		prefetch [ebx+48*4+2*64]		nop	%endif	fstp dword [ecx+ELM13]	fxch st3	fstp dword [ecx+ELM14]	fxch st1	fstp dword [ecx+ELM15]	fstp dword [ecx+ELM16]	fstp dword [ecx+ELM17]	fstp dword [ecx+ELM18]	add eax,edx	fld dword [ebx+ELM1]			;01+4	fld dword [eax+DOTP2]	fmul st0,st1	fld dword [eax+DOTP3]	fmul st0,st2	fld dword [eax+DOTP1]	rep	fmul st0,st3	fxch st0,st3	fld dword [eax+DOTP5]	rep	fmul st0,st1	rep	fld dword [eax+DOTP6]	mov edx,edx	fmul st0,st2	fld dword [ebx+ELM2]	fld dword [eax+DOTP4]	rep	fmulp st4,st0	add eax,byte 30*4	mov edx,edx	OPERATION 2,3				;02+4	OPERATION 3,4				;03+4	OPERATION 4,5				;04+4	OPERATION 5,6				;05+4	OPERATION 6,7				;06+4	OPERATION 7,8				;07+4	OPERATION 8,9				;08+4	OPERATION 9,10				;09+4	OPERATION 10,11				;10+4	OPERATION 11,12				;11+4	OPERATION 12,13				;12+4	OPERATION 13,14				;13+4	OPERATION 14,15				;14+4	OPERATION 15,16				;15+4	OPERATION 16,17				;16+4	OPERATION 17,18				;17+4	OPERATION 18,19				;18+4	OPERATION 19,20				;19+4	OPERATION 20,21				;20+4	OPERATION 21,22				;21+4	OPERATION 22,23				;22+4	OPERATION 23,24				;23+4	OPERATION 24,25				;24+4	OPERATION 25,26				;25+4	OPERATION 26,27				;26+4	OPERATION 27,28				;27+4	OPERATION 28,29				;28+4	OPERATION 29,30				;29+4	OPERATION 30,31				;30+4	OPERATION 31,32				;31+4	OPERATION 32,33				;32+4	OPERATION 33,34				;33+4	OPERATION 34,35				;34+4	OPERATION 35,36				;35+4	OPERATION 36,37				;36+4	OPERATION 37,38				;37+4	OPERATION 38,39				;38+4	OPERATION 39,40				;39+4	OPERATION 40,41				;40+4	OPERATION 41,42				;41+4	OPERATION 42,43				;42+4	OPERATION 43,44				;43+4	OPERATION 44,45				;44+4	OPERATION 45,46				;45+4	OPERATION 46,47				;45+4	OPERATION 47,48				;45+4	fld dword [eax+DOTP1+ELM48]		;48+4	fmul st0,st1	faddp st7	fld dword [eax+DOTP2+ELM48]	fmul st0,st1	faddp st6	fld dword [eax+DOTP3+ELM48]	fmul st0,st1	faddp st5	fld dword [eax+DOTP4+ELM48]	fmul st0,st1	faddp st4	fld dword [eax+DOTP5+ELM48]	fmul st0,st1	faddp st3	rep	fmul dword [eax+DOTP6+ELM48]	faddp st1	fxch st5

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?