atl_sjik48x48x48tn48x48x0_a1_b0.asm
来自「基于Blas CLapck的.用过的人知道是干啥的」· 汇编 代码 · 共 1,351 行 · 第 1/2 页
ASM
1,351 行
; Automatically Tuned Linear Algebra Software v3.8.0; (C) Copyright 2001 Julian Ruhe;; Redistribution and use in source and binary forms, with or without; modification, are permitted provided that the following conditions; are met:; 1. Redistributions of source code must retain the above copyright; notice, this list of conditions and the following disclaimer.; 2. Redistributions in binary form must reproduce the above copyright; notice, this list of conditions, and the following disclaimer in the; documentation and/or other materials provided with the distribution.; 3. The name of the ATLAS group or the names of its contributers may; not be used to endorse or promote products derived from this; software without specific written permission.;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE); ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE; POSSIBILITY OF SUCH DAMAGE.;;; ATL_sJIK48x48x48TN48x48x0_a1_b0.asm;; ATLAS "Speed of Light" SGEMM() kernel for AMD Athlon; Code author: Julian Ruhe (ruheejih@linux.zrz.tu-berlin.de | Julian.Ruhe@t-online.de);; void ATL_sJIK48x48x48TN48x48x0_a1_b0(const int M, const int N, const int K, const float alpha,; const float *A, const int lda, const float *B, const int ldb,; const float beta, float *C, const int ldc);; Compile with "nasmw -f win32 -DWIN32 ATL_sJIK48x48x48TN48x48x0_a1_b0.asm" (Windows); Compile 
with "nasm -f elf -DELF ATL_sJIK48x48x48TN48x48x0_a1_b0.asm" (LINUX);; See config file (ATL_sJIK48x48x48TN48x48x0_a1.cfg) for important macro definitions;%include "ATL_sJIK48x48x48TN48x48x0_a1.cfg"%include "ATL_sJIK48x48x48TN48x48x0_a1.mcr"%ifdef WIN32global _ATL_sJIK48x48x48TN48x48x0_a1_b0section .text_ATL_sJIK48x48x48TN48x48x0_a1_b0:%endif%ifdef ELFglobal ATL_sJIK48x48x48TN48x48x0_a1_b0section .textATL_sJIK48x48x48TN48x48x0_a1_b0:%endif push ebp mov ebp,esp push ebx push esi push edi femms mov eax,0 ;temporary variable t1 push eax ;t1->stack mov eax,[ebp+24] ;&A->eax add eax,NB*NB*4 ;&(A+1)->eax mov ebx,[ebp+32] ;&B->ebx sub eax,ebx ;calculate offset push eax ;&A+1+offset->stack mov eax,[ebp+48] ;ldc->eax lea eax,[4*eax] push eax ;8*ldc->stack mov eax,NB push eax ;loop counter->stack mov eax,[ebp+24] ;&A->eax mov ebx,[ebp+32] ;&B->ebx mov ecx,[ebp+44] ;&C->ecx add ecx,byte 30*4 ;calculate offsets add ebx,byte 30*4 add eax,5*NB*4 push eax ;&A+offset->stack push ebp ;ebp->stack mov edi,-1*NB*4 ;calculate offsets for dot products mov esi,-3*NB*4 mov ebp,-5*NB*4 mov edx,6*NB*4-30*4 ;offset for the next 6 dot products ;stack dump ; ;[esp+20]: t1 (temp) ;[esp+16]: &(A+1)+offset ;[esp+12]: ldc*4 ;[esp+08]: loop counter ;[esp+04]: &A+offset ;[esp+00]: ebp align 16loopj_ fld dword [ebx+ELM1] ;01+1 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+1 OPERATION 3,4 ;03+1 OPERATION 4,5 ;04+1 OPERATION 5,6 ;05+1 OPERATION 6,7 ;06+1 OPERATION 7,8 ;07+1 OPERATION 8,9 ;08+1 OPERATION 9,10 ;09+1 OPERATION 10,11 ;10+1 OPERATION 11,12 ;11+1 OPERATION 12,13 ;12+1 OPERATION 13,14 ;13+1 OPERATION 14,15 ;14+1 OPERATION 15,16 ;15+1 OPERATION 16,17 ;16+1 OPERATION 17,18 ;17+1 OPERATION 18,19 ;18+1 OPERATION 19,20 ;19+1 
OPERATION 20,21 ;20+1 OPERATION 21,22 ;21+1 OPERATION 22,23 ;22+1 OPERATION 23,24 ;23+1 OPERATION 24,25 ;24+1 OPERATION 25,26 ;25+1 OPERATION 26,27 ;26+1 OPERATION 27,28 ;27+1 OPERATION 28,29 ;28+1 OPERATION 29,30 ;29+1 OPERATION 30,31 ;30+1 OPERATION 31,32 ;31+1 OPERATION 32,33 ;32+1 OPERATION 33,34 ;33+1 OPERATION 34,35 ;34+1 OPERATION 35,36 ;35+1 OPERATION 36,37 ;36+1 OPERATION 37,38 ;37+1 OPERATION 38,39 ;38+1 OPERATION 39,40 ;39+1 OPERATION 40,41 ;40+1 OPERATION 41,42 ;41+1 OPERATION 42,43 ;42+1 OPERATION 43,44 ;43+1 OPERATION 44,45 ;44+1 OPERATION 45,46 ;45+1 OPERATION 46,47 ;45+1 OPERATION 47,48 ;45+1 fld dword [eax+DOTP1+ELM48] ;48+1 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREC_DST7 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST7 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif fstp dword [ecx+ELM1] fxch st3 fstp dword [ecx+ELM2] fxch st1 fstp dword [ecx+ELM3] fstp dword [ecx+ELM4] fstp dword [ecx+ELM5] fstp dword [ecx+ELM6] add eax,edx fld dword [ebx+ELM1] ;01+2 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+2 OPERATION 3,4 ;03+2 OPERATION 4,5 ;04+2 OPERATION 5,6 ;05+2 OPERATION 6,7 ;06+2 OPERATION 7,8 ;07+2 OPERATION 8,9 ;08+2 OPERATION 9,10 ;09+2 OPERATION 10,11 ;10+2 OPERATION 11,12 ;11+2 
OPERATION 12,13 ;12+2 OPERATION 13,14 ;13+2 OPERATION 14,15 ;14+2 OPERATION 15,16 ;15+2 OPERATION 16,17 ;16+2 OPERATION 17,18 ;17+2 OPERATION 18,19 ;18+2 OPERATION 19,20 ;19+2 OPERATION 20,21 ;20+2 OPERATION 21,22 ;21+2 OPERATION 22,23 ;22+2 OPERATION 23,24 ;23+2 OPERATION 24,25 ;24+2 OPERATION 25,26 ;25+2 OPERATION 26,27 ;26+2 OPERATION 27,28 ;27+2 OPERATION 28,29 ;28+2 OPERATION 29,30 ;29+2 OPERATION 30,31 ;30+2 OPERATION 31,32 ;31+2 OPERATION 32,33 ;32+2 OPERATION 33,34 ;33+2 OPERATION 34,35 ;34+2 OPERATION 35,36 ;35+2 OPERATION 36,37 ;36+2 OPERATION 37,38 ;37+2 OPERATION 38,39 ;38+2 OPERATION 39,40 ;39+2 OPERATION 40,41 ;40+2 OPERATION 41,42 ;41+2 OPERATION 42,43 ;42+2 OPERATION 43,44 ;43+2 OPERATION 44,45 ;44+2 OPERATION 45,46 ;45+2 OPERATION 46,47 ;45+2 OPERATION 47,48 ;45+2 fld dword [eax+DOTP1+ELM48] ;48+2 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREC_DST6 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST6 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif fstp dword [ecx+ELM7] fxch st3 fstp dword [ecx+ELM8] fxch st1 fstp dword [ecx+ELM9] fstp dword [ecx+ELM10] fstp dword [ecx+ELM11] fstp dword [ecx+ELM12] add eax,edx fld dword [ebx+ELM1] ;01+3 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+3 OPERATION 
3,4 ;03+3 OPERATION 4,5 ;04+3 OPERATION 5,6 ;05+3 OPERATION 6,7 ;06+3 OPERATION 7,8 ;07+3 OPERATION 8,9 ;08+3 OPERATION 9,10 ;09+3 OPERATION 10,11 ;10+3 OPERATION 11,12 ;11+3 OPERATION 12,13 ;12+3 OPERATION 13,14 ;13+3 OPERATION 14,15 ;14+3 OPERATION 15,16 ;15+3 OPERATION 16,17 ;16+3 OPERATION 17,18 ;17+3 OPERATION 18,19 ;18+3 OPERATION 19,20 ;19+3 OPERATION 20,21 ;20+3 OPERATION 21,22 ;21+3 OPERATION 22,23 ;22+3 OPERATION 23,24 ;23+3 OPERATION 24,25 ;24+3 OPERATION 25,26 ;25+3 OPERATION 26,27 ;26+3 OPERATION 27,28 ;27+3 OPERATION 28,29 ;28+3 OPERATION 29,30 ;29+3 OPERATION 30,31 ;30+3 OPERATION 31,32 ;31+3 OPERATION 32,33 ;32+3 OPERATION 33,34 ;33+3 OPERATION 34,35 ;34+3 OPERATION 35,36 ;35+3 OPERATION 36,37 ;36+3 OPERATION 37,38 ;37+3 OPERATION 38,39 ;38+3 OPERATION 39,40 ;39+3 OPERATION 40,41 ;40+3 OPERATION 41,42 ;41+3 OPERATION 42,43 ;42+3 OPERATION 43,44 ;43+3 OPERATION 44,45 ;44+3 OPERATION 45,46 ;45+3 OPERATION 46,47 ;45+3 OPERATION 47,48 ;45+3 fld dword [eax+DOTP1+ELM48] ;48+3 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREC_DST5 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST5 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif fstp dword [ecx+ELM13] fxch st3 fstp dword [ecx+ELM14] fxch st1 fstp dword [ecx+ELM15] fstp dword [ecx+ELM16] fstp dword [ecx+ELM17] fstp dword [ecx+ELM18] add eax,edx fld dword [ebx+ELM1] ;01+4 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul 
st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+4 OPERATION 3,4 ;03+4 OPERATION 4,5 ;04+4 OPERATION 5,6 ;05+4 OPERATION 6,7 ;06+4 OPERATION 7,8 ;07+4 OPERATION 8,9 ;08+4 OPERATION 9,10 ;09+4 OPERATION 10,11 ;10+4 OPERATION 11,12 ;11+4 OPERATION 12,13 ;12+4 OPERATION 13,14 ;13+4 OPERATION 14,15 ;14+4 OPERATION 15,16 ;15+4 OPERATION 16,17 ;16+4 OPERATION 17,18 ;17+4 OPERATION 18,19 ;18+4 OPERATION 19,20 ;19+4 OPERATION 20,21 ;20+4 OPERATION 21,22 ;21+4 OPERATION 22,23 ;22+4 OPERATION 23,24 ;23+4 OPERATION 24,25 ;24+4 OPERATION 25,26 ;25+4 OPERATION 26,27 ;26+4 OPERATION 27,28 ;27+4 OPERATION 28,29 ;28+4 OPERATION 29,30 ;29+4 OPERATION 30,31 ;30+4 OPERATION 31,32 ;31+4 OPERATION 32,33 ;32+4 OPERATION 33,34 ;33+4 OPERATION 34,35 ;34+4 OPERATION 35,36 ;35+4 OPERATION 36,37 ;36+4 OPERATION 37,38 ;37+4 OPERATION 38,39 ;38+4 OPERATION 39,40 ;39+4 OPERATION 40,41 ;40+4 OPERATION 41,42 ;41+4 OPERATION 42,43 ;42+4 OPERATION 43,44 ;43+4 OPERATION 44,45 ;44+4 OPERATION 45,46 ;45+4 OPERATION 46,47 ;45+4 OPERATION 47,48 ;45+4 fld dword [eax+DOTP1+ELM48] ;48+4 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?