atl_djik30x30x30tn30x30x0_a1_b0.asm

来自「基于Blas CLapck的.用过的人知道是干啥的」· 汇编 代码 · 共 798 行

ASM
798
字号
;
;              Automatically Tuned Linear Algebra Software v3.8.0
;                       (C) Copyright 2001 Julian Ruhe
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    1. Redistributions of source code must retain the above copyright
;       notice, this list of conditions and the following disclaimer.
;    2. Redistributions in binary form must reproduce the above copyright
;       notice, this list of conditions, and the following disclaimer in the
;       documentation and/or other materials provided with the distribution.
;    3. The name of the ATLAS group or the names of its contributers may
;       not be used to endorse or promote products derived from this
;       software without specific written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
;  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
;  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
;  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
;  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
;  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
;  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;  POSSIBILITY OF SUCH DAMAGE.
;
;
;	ATL_dJIK30x30x30TN30x30x0_a1_b0.asm
;
;	ATLAS "Speed of Light" DGEMM() kernel for AMD Athlon
;	Code author: Julian Ruhe (ruheejih@linux.zrz.tu-berlin.de | Julian.Ruhe@t-online.de)
;
;	void ATL_dJIK30x30x30TN30x30x0_a1_b0(const int M, const int N, const int K, const double alpha,
;						const double *A, const int lda, const double *B, const int ldb,
;						const double beta, double *C, const int ldc)
;
;	Compile with "nasmw -f win32 -DWIN32 ATL_dJIK30x30x30TN30x30x0_a1_b0.asm" (Windows)
;	Compile with "nasm -f elf -DELF ATL_dJIK30x30x30TN30x30x0_a1_b0.asm" (LINUX)
;
;	See config file (ATL_dJIK30x30x30TN30x30x0_a1.cfg) for important macro definitions
;
;-----------------------------------------------------------------------
; Reader's guide
;
; Syntax / target: NASM, 32-bit x86, x87 FPU plus the 3DNow! instructions
;                  femms, prefetch and prefetchw.
; Calling conv.:   all arguments are read from the stack relative to ebp
;                  (C prototype above); the two doubles take 8 bytes each:
;                    [ebp+28] = A    [ebp+36] = B
;                    [ebp+52] = C    [ebp+56] = ldc
;                  M, N, K, alpha, lda, ldb and beta are never read here.
; Preserved:       ebx, esi, edi and ebp are saved on entry and restored
;                  on exit.
; Clobbered:       eax, ecx, edx, flags, the x87 register stack.
;
; Register roles inside the loop:
;   eax  running pointer into A (reloaded from [esp+4] on every pass)
;   ebx  pointer into B, biased by +15*8; advanced by NB*8 per pass
;   ecx  pointer into C, biased by +15*8; advanced by ldc*8 per pass
;   edx  constant 6*NB*8-15*8, added to eax between groups of six
;   edi  constant -1*NB*8
;   esi  constant -3*NB*8
;   ebp  constant -5*NB*8 (the caller's frame pointer is parked on the
;        stack while the loop runs)
;   NOTE(review): edi/esi/ebp are presumably the index registers used by
;   the DOTPn/ELMn/OPERATION macros from the .cfg/.mcr includes - confirm
;   against those files, which are not part of this source.
;
; Loop shape: NB passes; each pass runs five groups, and every group
; keeps six accumulators on the x87 stack and stores six doubles to C
; (ELM1..ELM30 in total).
;
; Scheduling: instruction order is hand-tuned.  The stand-alone "rep"
; prefixes, "mov edx,edx", "mov eax,eax", "nop" and "fnop" do no
; architectural work; they look like code-size/decoder padding.  Do not
; remove or reorder them without re-benchmarking.
;-----------------------------------------------------------------------

%include "ATL_dJIK30x30x30TN30x30x0_a1.cfg"
%include "ATL_dJIK30x30x30TN30x30x0_a1.mcr"

; Win32 object format: exported symbol carries a leading underscore.
%ifdef WIN32
global _ATL_dJIK30x30x30TN30x30x0_a1_b0
section .text
_ATL_dJIK30x30x30TN30x30x0_a1_b0:
%endif

; ELF object format: exported symbol is the plain C name.
%ifdef ELF
global ATL_dJIK30x30x30TN30x30x0_a1_b0
section .text
ATL_dJIK30x30x30TN30x30x0_a1_b0:
%endif

	; ---- prologue: frame pointer, callee-saved registers, FPU state ----
	push ebp
	mov ebp,esp
	push ebx
	push esi
	push edi
	femms

	; ---- build the six local stack slots (see stack dump below) ----
	mov eax,0				;temporary variable t1
	push eax				;t1->stack
	mov eax,[ebp+28]			;&A->eax
	add eax,NB*NB*8				;&(A+1)->eax
	mov ebx,[ebp+36]			;&B->ebx
	sub eax,ebx				;calculate offset
	push eax				;&A+1+offset->stack
	mov eax,[ebp+56]			;ldc->eax
	lea eax,[8*eax]
	push eax				;8*ldc->stack
	mov eax,NB
	push eax				;loop counter->stack
	mov eax,[ebp+28]			;&A->eax
	mov ebx,[ebp+36]			;&B->ebx
	mov ecx,[ebp+52]			;&C->ecx
	add ecx,byte 15*8			;calculate offsets
	add ebx,byte 15*8
	add eax,5*NB*8
	push eax				;&A+offset->stack
	push ebp				;ebp->stack
	mov edi,-1*NB*8				;calculate offsets for dot products
	mov esi,-3*NB*8
	mov ebp,-5*NB*8
	mov edx,6*NB*8-15*8			;offset for the next 6 dot products

						;stack dump
						;[esp+20]:	t1 (temp)
						;[esp+16]:	&(A+1)+offset
						;[esp+12]:	ldc*8
						;[esp+08]:	loop counter
						;[esp+04]:	&A+offset
						;[esp+00]:	ebp

	align 16
loopj_:
	; ==== group 1: six accumulators -> C ELM1..ELM6 ====
	fld qword [ebx+ELM1]			;01+1
	fld qword [eax+DOTP2]
	fmul st0,st1
	fld qword [eax+DOTP3]
	fmul st0,st2
	fld qword [eax+DOTP1]
	rep
	fmul st0,st3
	fxch st0,st3
	fld qword [eax+DOTP5]
	rep
	fmul st0,st1
	rep
	fld qword [eax+DOTP6]
	mov edx,edx
	fmul st0,st2
	fld qword [ebx+ELM2]
	fld qword [eax+DOTP4]
	rep
	fmulp st4,st0
	add eax,byte 15*8
	mov edx,edx
	OPERATION 2,3				;02+1
	OPERATION 3,4				;03+1
	OPERATION 4,5				;04+1
	OPERATION 5,6				;05+1
	OPERATION 6,7				;06+1
	OPERATION 7,8				;07+1
	OPERATION 8,9				;08+1
	OPERATION 9,10				;09+1
	OPERATION 10,11				;10+1
	OPERATION 11,12				;11+1
	OPERATION 12,13				;12+1
	OPERATION 13,14				;13+1
	OPERATION 14,15				;14+1
	OPERATION 15,16				;15+1
	OPERATION 16,17				;16+1
	OPERATION 17,18				;17+1
	OPERATION 18,19				;18+1
	OPERATION 19,20				;19+1
	OPERATION 20,21				;20+1
	OPERATION 21,22				;21+1
	OPERATION 22,23				;22+1
	OPERATION 23,24				;23+1
	OPERATION 24,25				;24+1
	OPERATION 25,26				;25+1
	OPERATION 26,27				;26+1
	OPERATION 27,28				;27+1
	OPERATION 28,29				;28+1
	OPERATION 29,30				;29+1
	fld qword [eax+DOTP1+ELM30]		;30+1
	fmul st0,st1
	faddp st7
	fld qword [eax+DOTP2+ELM30]
	fmul st0,st1
	faddp st6
	fld qword [eax+DOTP3+ELM30]
	fmul st0,st1
	faddp st5
	fld qword [eax+DOTP4+ELM30]
	fmul st0,st1
	faddp st4
	fld qword [eax+DOTP5+ELM30]
	fmul st0,st1
	faddp st3
	rep
	fmul qword [eax+DOTP6+ELM30]
	faddp st1
	fxch st5
	; optional prefetch-for-write around C + ldc*8; ecx is parked in t1
	%ifdef PREC_DST4
		mov [esp+20],ecx
		add ecx,[esp+12]
		prefetchw [ecx-2*64]
		prefetchw [ecx-1*64]
		prefetchw [ecx+0*64]
		prefetchw [ecx+1*64]
		nop
		prefetchw [ecx+2*64-1]
		mov ecx,[esp+20]
	%endif
	; optional prefetch around ebx+30*8
	%ifdef PREB_DST4
		prefetch [ebx+30*8-2*64]
		fnop
		mov edx,edx
		prefetch [ebx+30*8-1*64]
		nop
		prefetch [ebx+30*8+0*64]
		nop
		prefetch [ebx+30*8+1*64]
		nop
		prefetch [ebx+30*8+2*64]
		nop
	%endif
	fstp qword [ecx+ELM1]
	fxch st3
	fstp qword [ecx+ELM2]
	fxch st1
	fstp qword [ecx+ELM3]
	fstp qword [ecx+ELM4]
	fstp qword [ecx+ELM5]
	fstp qword [ecx+ELM6]
	add eax,edx

	; ==== group 2: six accumulators -> C ELM7..ELM12 ====
	fld qword [ebx+ELM1]			;01+2
	fld qword [eax+DOTP2]
	fmul st0,st1
	fld qword [eax+DOTP3]
	fmul st0,st2
	fld qword [eax+DOTP1]
	rep
	fmul st0,st3
	fxch st0,st3
	fld qword [eax+DOTP5]
	rep
	fmul st0,st1
	rep
	fld qword [eax+DOTP6]
	mov edx,edx
	fmul st0,st2
	fld qword [ebx+ELM2]
	fld qword [eax+DOTP4]
	rep
	fmulp st4,st0
	add eax,byte 15*8
	mov edx,edx
	OPERATION 2,3				;02+2
	OPERATION 3,4				;03+2
	OPERATION 4,5				;04+2
	OPERATION 5,6				;05+2
	OPERATION 6,7				;06+2
	OPERATION 7,8				;07+2
	OPERATION 8,9				;08+2
	OPERATION 9,10				;09+2
	OPERATION 10,11				;10+2
	OPERATION 11,12				;11+2
	OPERATION 12,13				;12+2
	OPERATION 13,14				;13+2
	OPERATION 14,15				;14+2
	OPERATION 15,16				;15+2
	OPERATION 16,17				;16+2
	OPERATION 17,18				;17+2
	OPERATION 18,19				;18+2
	OPERATION 19,20				;19+2
	OPERATION 20,21				;20+2
	OPERATION 21,22				;21+2
	OPERATION 22,23				;22+2
	OPERATION 23,24				;23+2
	OPERATION 24,25				;24+2
	OPERATION 25,26				;25+2
	OPERATION 26,27				;26+2
	OPERATION 27,28				;27+2
	OPERATION 28,29				;28+2
	OPERATION 29,30				;29+2
	fld qword [eax+DOTP1+ELM30]		;30+2
	fmul st0,st1
	faddp st7
	fld qword [eax+DOTP2+ELM30]
	fmul st0,st1
	faddp st6
	fld qword [eax+DOTP3+ELM30]
	fmul st0,st1
	faddp st5
	fld qword [eax+DOTP4+ELM30]
	fmul st0,st1
	faddp st4
	fld qword [eax+DOTP5+ELM30]
	fmul st0,st1
	faddp st3
	rep
	fmul qword [eax+DOTP6+ELM30]
	faddp st1
	fxch st5
	%ifdef PREC_DST3
		mov [esp+20],ecx
		add ecx,[esp+12]
		prefetchw [ecx-2*64]
		prefetchw [ecx-1*64]
		prefetchw [ecx+0*64]
		prefetchw [ecx+1*64]
		nop
		prefetchw [ecx+2*64-1]
		mov ecx,[esp+20]
	%endif
	%ifdef PREB_DST3
		prefetch [ebx+30*8-2*64]
		fnop
		mov edx,edx
		prefetch [ebx+30*8-1*64]
		nop
		prefetch [ebx+30*8+0*64]
		nop
		prefetch [ebx+30*8+1*64]
		nop
		prefetch [ebx+30*8+2*64]
		nop
	%endif
	fstp qword [ecx+ELM7]
	fxch st3
	fstp qword [ecx+ELM8]
	fxch st1
	fstp qword [ecx+ELM9]
	fstp qword [ecx+ELM10]
	fstp qword [ecx+ELM11]
	fstp qword [ecx+ELM12]
	add eax,edx

	; ==== group 3: six accumulators -> C ELM13..ELM18 ====
	fld qword [ebx+ELM1]			;01+3
	fld qword [eax+DOTP2]
	fmul st0,st1
	fld qword [eax+DOTP3]
	fmul st0,st2
	fld qword [eax+DOTP1]
	rep
	fmul st0,st3
	fxch st0,st3
	fld qword [eax+DOTP5]
	rep
	fmul st0,st1
	rep
	fld qword [eax+DOTP6]
	mov edx,edx
	fmul st0,st2
	fld qword [ebx+ELM2]
	fld qword [eax+DOTP4]
	rep
	fmulp st4,st0
	add eax,byte 15*8
	mov edx,edx
	OPERATION 2,3				;02+3
	OPERATION 3,4				;03+3
	OPERATION 4,5				;04+3
	OPERATION 5,6				;05+3
	OPERATION 6,7				;06+3
	OPERATION 7,8				;07+3
	OPERATION 8,9				;08+3
	OPERATION 9,10				;09+3
	OPERATION 10,11				;10+3
	OPERATION 11,12				;11+3
	OPERATION 12,13				;12+3
	OPERATION 13,14				;13+3
	OPERATION 14,15				;14+3
	OPERATION 15,16				;15+3
	OPERATION 16,17				;16+3
	OPERATION 17,18				;17+3
	OPERATION 18,19				;18+3
	OPERATION 19,20				;19+3
	OPERATION 20,21				;20+3
	OPERATION 21,22				;21+3
	OPERATION 22,23				;22+3
	OPERATION 23,24				;23+3
	OPERATION 24,25				;24+3
	OPERATION 25,26				;25+3
	OPERATION 26,27				;26+3
	OPERATION 27,28				;27+3
	OPERATION 28,29				;28+3
	OPERATION 29,30				;29+3
	fld qword [eax+DOTP1+ELM30]		;30+3
	fmul st0,st1
	faddp st7
	fld qword [eax+DOTP2+ELM30]
	fmul st0,st1
	faddp st6
	fld qword [eax+DOTP3+ELM30]
	fmul st0,st1
	faddp st5
	fld qword [eax+DOTP4+ELM30]
	fmul st0,st1
	faddp st4
	fld qword [eax+DOTP5+ELM30]
	fmul st0,st1
	faddp st3
	rep
	fmul qword [eax+DOTP6+ELM30]
	faddp st1
	fxch st5
	%ifdef PREC_DST2
		mov [esp+20],ecx
		add ecx,[esp+12]
		prefetchw [ecx-2*64]
		prefetchw [ecx-1*64]
		prefetchw [ecx+0*64]
		prefetchw [ecx+1*64]
		nop
		prefetchw [ecx+2*64-1]
		mov ecx,[esp+20]
	%endif
	%ifdef PREB_DST2
		prefetch [ebx+30*8-2*64]
		fnop
		mov edx,edx
		prefetch [ebx+30*8-1*64]
		nop
		prefetch [ebx+30*8+0*64]
		nop
		prefetch [ebx+30*8+1*64]
		nop
		prefetch [ebx+30*8+2*64]
		nop
	%endif
	fstp qword [ecx+ELM13]
	fxch st3
	fstp qword [ecx+ELM14]
	rep					;extra padding prefix, present in this group only
	fxch st1
	fstp qword [ecx+ELM15]
	fstp qword [ecx+ELM16]
	fstp qword [ecx+ELM17]
	fstp qword [ecx+ELM18]
	add eax,edx

	; ==== group 4: six accumulators -> C ELM19..ELM24 ====
	fld qword [ebx+ELM1]			;01+4
	fld qword [eax+DOTP2]
	fmul st0,st1
	fld qword [eax+DOTP3]
	fmul st0,st2
	fld qword [eax+DOTP1]
	rep
	fmul st0,st3
	fxch st0,st3
	fld qword [eax+DOTP5]
	rep
	fmul st0,st1
	rep
	fld qword [eax+DOTP6]
	mov edx,edx
	fmul st0,st2
	fld qword [ebx+ELM2]
	fld qword [eax+DOTP4]
	rep
	fmulp st4,st0
	add eax,byte 15*8
	mov edx,edx
	OPERATION 2,3				;02+4
	OPERATION 3,4				;03+4
	OPERATION 4,5				;04+4
	OPERATION 5,6				;05+4
	OPERATION 6,7				;06+4
	OPERATION 7,8				;07+4
	OPERATION 8,9				;08+4
	OPERATION 9,10				;09+4
	OPERATION 10,11				;10+4
	OPERATION 11,12				;11+4
	OPERATION 12,13				;12+4
	OPERATION 13,14				;13+4
	OPERATION 14,15				;14+4
	OPERATION 15,16				;15+4
	OPERATION 16,17				;16+4
	OPERATION 17,18				;17+4
	OPERATION 18,19				;18+4
	OPERATION 19,20				;19+4
	OPERATION 20,21				;20+4
	OPERATION 21,22				;21+4
	OPERATION 22,23				;22+4
	OPERATION 23,24				;23+4
	OPERATION 24,25				;24+4
	OPERATION 25,26				;25+4
	OPERATION 26,27				;26+4
	OPERATION 27,28				;27+4
	OPERATION 28,29				;28+4
	OPERATION 29,30				;29+4
	fld qword [eax+DOTP1+ELM30]		;30+4
	fmul st0,st1
	faddp st7
	fld qword [eax+DOTP2+ELM30]
	fmul st0,st1
	faddp st6
	fld qword [eax+DOTP3+ELM30]
	fmul st0,st1
	faddp st5
	fld qword [eax+DOTP4+ELM30]
	fmul st0,st1
	faddp st4
	fld qword [eax+DOTP5+ELM30]
	fmul st0,st1
	faddp st3
	rep
	fmul qword [eax+DOTP6+ELM30]
	faddp st1
	fxch st5
	%ifdef PREC_DST1
		mov [esp+20],ecx
		add ecx,[esp+12]
		prefetchw [ecx-2*64]
		prefetchw [ecx-1*64]
		prefetchw [ecx+0*64]
		prefetchw [ecx+1*64]
		nop
		prefetchw [ecx+2*64-1]
		mov ecx,[esp+20]
	%endif
	%ifdef PREB_DST1
		prefetch [ebx+30*8-2*64]
		fnop
		mov edx,edx
		prefetch [ebx+30*8-1*64]
		nop
		prefetch [ebx+30*8+0*64]
		nop
		prefetch [ebx+30*8+1*64]
		nop
		prefetch [ebx+30*8+2*64]
		nop
	%endif
	fstp qword [ecx+ELM19]
	fxch st3
	fstp qword [ecx+ELM20]
	fxch st1
	fstp qword [ecx+ELM21]
	fstp qword [ecx+ELM22]
	fstp qword [ecx+ELM23]
	fstp qword [ecx+ELM24]
	add eax,edx

	; ==== group 5: six accumulators -> C ELM25..ELM30 ====
	fld qword [ebx+ELM1]			;01+5
	fld qword [eax+DOTP2]
	fmul st0,st1
	fld qword [eax+DOTP3]
	fmul st0,st2
	fld qword [eax+DOTP1]
	rep
	fmul st0,st3
	fxch st0,st3
	fld qword [eax+DOTP5]
	rep
	fmul st0,st1
	rep
	fld qword [eax+DOTP6]
	mov edx,edx
	fmul st0,st2
	fld qword [ebx+ELM2]
	fld qword [eax+DOTP4]
	rep
	fmulp st4,st0
	add eax,byte 15*8
	mov edx,edx
	OPERATION 2,3				;02+5
	OPERATION 3,4				;03+5
	OPERATION 4,5				;04+5
	OPERATION 5,6				;05+5
	OPERATION 6,7				;06+5
	OPERATION 7,8				;07+5
	OPERATION 8,9				;08+5
	OPERATION 9,10				;09+5
	OPERATION 10,11				;10+5
	OPERATION 11,12				;11+5
	OPERATION 12,13				;12+5
	OPERATION 13,14				;13+5
	OPERATION 14,15				;14+5
	OPERATION 15,16				;15+5
	OPERATION 16,17				;16+5
	OPERATION 17,18				;17+5
	OPERATION 18,19				;18+5
	OPERATION 19,20				;19+5
	OPERATION 20,21				;20+5
	OPERATION 21,22				;21+5
	OPERATION 22,23				;22+5
	OPERATION 23,24				;23+5
	OPERATION 24,25				;24+5
	OPERATION 25,26				;25+5
	OPERATION 26,27				;26+5
	OPERATION 27,28				;27+5
	OPERATION 28,29				;28+5
	OPERATION 29,30				;29+5
	fld qword [eax+DOTP1+ELM30]		;30+5
	fmul st0,st1
	faddp st7
	fld qword [eax+DOTP2+ELM30]
	fmul st0,st1
	faddp st6
	fld qword [eax+DOTP3+ELM30]
	fmul st0,st1
	faddp st5
	fld qword [eax+DOTP4+ELM30]
	fmul st0,st1
	faddp st4
	fld qword [eax+DOTP5+ELM30]
	fmul st0,st1
	faddp st3
	rep
	fmul qword [eax+DOTP6+ELM30]
	faddp st1
	fxch st5
	; optional prefetch at ebx + the saved (&A + NB*NB*8 - &B) offset
	%ifdef PREA_EN
		mov [esp+20],edx		;save edx in t1
		mov edx,[esp+16]		;&A+1->edx
		lea edx,[edx+ebx]
		prefetch [edx-2*64]
		nop
		prefetch [edx-1*64]
		prefetch [edx+0*64]
		nop
		prefetch [edx+1*64]
		prefetch [edx+2*64-8]
		mov edx,[esp+20]		;restore edx
		mov eax,eax
		fnop
	%endif
	fstp qword [ecx+ELM25]
	fxch st3
	fstp qword [ecx+ELM26]
	fxch st1
	fstp qword [ecx+ELM27]
	fstp qword [ecx+ELM28]
	fstp qword [ecx+ELM29]
	fstp qword [ecx+ELM30]

	; ---- advance to the next pass ----
	sub ebx,edi				;next column of B
	mov eax,[esp+4]				;reset eax
	add ecx,[esp+12]			;next column of C (+ldc*8)
	dec dword [esp+8]			;dec counter
	jnz near loopj_

end_:
	; ---- epilogue: undo the prologue in reverse order ----
	femms
	pop ebp					;recover the frame pointer parked at [esp+0]
	add esp,byte 5*4			;remove local variables
	pop edi					;restore registers
	pop esi
	pop ebx
	leave					;mov esp,ebp / pop ebp
	ret

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?