atl_sjik48x48x48tn48x48x0_a1_b0.asm
来自「基于 BLAS/CLAPACK 的。用过的人知道是干啥的」· 汇编 代码 · 共 1,351 行 · 第 1/2 页
ASM
1,351 行
%ifdef PREC_DST4 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST4 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif fstp dword [ecx+ELM19] fxch st3 fstp dword [ecx+ELM20] fxch st1 fstp dword [ecx+ELM21] fstp dword [ecx+ELM22] fstp dword [ecx+ELM23] fstp dword [ecx+ELM24] add eax,edx fld dword [ebx+ELM1] ;01+5 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+5 OPERATION 3,4 ;03+5 OPERATION 4,5 ;04+5 OPERATION 5,6 ;05+5 OPERATION 6,7 ;06+5 OPERATION 7,8 ;07+5 OPERATION 8,9 ;08+5 OPERATION 9,10 ;09+5 OPERATION 10,11 ;10+5 OPERATION 11,12 ;11+5 OPERATION 12,13 ;12+5 OPERATION 13,14 ;13+5 OPERATION 14,15 ;14+5 OPERATION 15,16 ;15+5 OPERATION 16,17 ;16+5 OPERATION 17,18 ;17+5 OPERATION 18,19 ;18+5 OPERATION 19,20 ;19+5 OPERATION 20,21 ;20+5 OPERATION 21,22 ;21+5 OPERATION 22,23 ;22+5 OPERATION 23,24 ;23+5 OPERATION 24,25 ;24+5 OPERATION 25,26 ;25+5 OPERATION 26,27 ;26+5 OPERATION 27,28 ;27+5 OPERATION 28,29 ;28+5 OPERATION 29,30 ;29+5 OPERATION 30,31 ;30+5 OPERATION 31,32 ;31+5 OPERATION 32,33 ;32+5 OPERATION 33,34 ;33+5 OPERATION 34,35 ;34+5 OPERATION 35,36 ;35+5 OPERATION 36,37 ;36+5 OPERATION 37,38 ;37+5 OPERATION 38,39 ;38+5 OPERATION 39,40 ;39+5 OPERATION 40,41 ;40+5 OPERATION 41,42 ;41+5 OPERATION 42,43 ;42+5 OPERATION 43,44 ;43+5 OPERATION 44,45 ;44+5 OPERATION 45,46 ;45+5 OPERATION 46,47 ;45+5 OPERATION 47,48 ;45+5 fld dword [eax+DOTP1+ELM48] ;48+5 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] 
fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREC_DST3 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST3 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif fstp dword [ecx+ELM25] fxch st3 fstp dword [ecx+ELM26] fxch st1 fstp dword [ecx+ELM27] fstp dword [ecx+ELM28] fstp dword [ecx+ELM29] fstp dword [ecx+ELM30] add eax,edx fld dword [ebx+ELM1] ;01+6 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+6 OPERATION 3,4 ;03+6 OPERATION 4,5 ;04+6 OPERATION 5,6 ;05+6 OPERATION 6,7 ;06+6 OPERATION 7,8 ;07+6 OPERATION 8,9 ;08+6 OPERATION 9,10 ;09+6 OPERATION 10,11 ;10+6 OPERATION 11,12 ;11+6 OPERATION 12,13 ;12+6 OPERATION 13,14 ;13+6 OPERATION 14,15 ;14+6 OPERATION 15,16 ;15+6 OPERATION 16,17 ;16+6 OPERATION 17,18 ;17+6 OPERATION 18,19 ;18+6 OPERATION 19,20 ;19+6 OPERATION 20,21 ;20+6 OPERATION 21,22 ;21+6 OPERATION 22,23 ;22+6 OPERATION 23,24 ;23+6 OPERATION 24,25 ;24+6 OPERATION 25,26 ;25+6 OPERATION 26,27 ;26+6 OPERATION 27,28 ;27+6 OPERATION 28,29 ;28+6 OPERATION 29,30 ;29+6 OPERATION 30,31 ;30+6 OPERATION 31,32 ;31+6 OPERATION 32,33 ;32+6 OPERATION 33,34 ;33+6 OPERATION 34,35 ;34+6 OPERATION 35,36 ;35+6 OPERATION 36,37 ;36+6 OPERATION 37,38 ;37+6 OPERATION 38,39 ;38+6 OPERATION 39,40 ;39+6 OPERATION 40,41 ;40+6 OPERATION 41,42 ;41+6 OPERATION 42,43 ;42+6 OPERATION 43,44 ;43+6 OPERATION 44,45 ;44+6 OPERATION 45,46 ;45+6 
OPERATION 46,47 ;45+6 OPERATION 47,48 ;45+6 fld dword [eax+DOTP1+ELM48] ;48+6 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREC_DST2 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST2 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif rep fstp dword [ecx+ELM31] fxch st3 fstp dword [ecx+ELM32] fxch st1 fstp dword [ecx+ELM33] fstp dword [ecx+ELM34] fstp dword [ecx+ELM35] fstp dword [ecx+ELM36] add eax,edx fld dword [ebx+ELM1] ;01+7 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+7 OPERATION 3,4 ;03+7 OPERATION 4,5 ;04+7 OPERATION 5,6 ;05+7 OPERATION 6,7 ;06+7 OPERATION 7,8 ;07+7 OPERATION 8,9 ;08+7 OPERATION 9,10 ;09+7 OPERATION 10,11 ;10+7 OPERATION 11,12 ;11+7 OPERATION 12,13 ;12+7 OPERATION 13,14 ;13+7 OPERATION 14,15 ;14+7 OPERATION 15,16 ;15+7 OPERATION 16,17 ;16+7 OPERATION 17,18 ;17+7 OPERATION 18,19 ;18+7 OPERATION 19,20 ;19+7 OPERATION 20,21 ;20+7 OPERATION 21,22 ;21+7 OPERATION 22,23 ;22+7 OPERATION 23,24 ;23+7 OPERATION 24,25 ;24+7 OPERATION 25,26 ;25+7 OPERATION 26,27 ;26+7 OPERATION 27,28 ;27+7 OPERATION 28,29 ;28+7 OPERATION 29,30 ;29+7 OPERATION 30,31 ;30+7 OPERATION 31,32 ;31+7 OPERATION 32,33 ;32+7 OPERATION 33,34 ;33+7 OPERATION 34,35 ;34+7 OPERATION 35,36 ;35+7 OPERATION 36,37 ;36+7 OPERATION 37,38 
;37+7 OPERATION 38,39 ;38+7 OPERATION 39,40 ;39+7 OPERATION 40,41 ;40+7 OPERATION 41,42 ;41+7 OPERATION 42,43 ;42+7 OPERATION 43,44 ;43+7 OPERATION 44,45 ;44+7 OPERATION 45,46 ;45+7 OPERATION 46,47 ;45+7 OPERATION 47,48 ;45+7 fld dword [eax+DOTP1+ELM48] ;48+7 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREC_DST1 mov [esp+20],ecx add ecx,[esp+12] prefetchw [ecx-2*64] prefetchw [ecx-1*64] prefetchw [ecx+0*64] prefetchw [ecx+1*64] nop prefetchw [ecx+2*64-1] mov ecx,[esp+20] %endif %ifdef PREB_DST1 prefetch [ebx+48*4-2*64] fnop mov edx,edx prefetch [ebx+48*4-1*64] nop prefetch [ebx+48*4+0*64] nop prefetch [ebx+48*4+1*64] nop prefetch [ebx+48*4+2*64] nop %endif fstp dword [ecx+ELM37] fxch st3 fstp dword [ecx+ELM38] fxch st1 fstp dword [ecx+ELM39] fstp dword [ecx+ELM40] fstp dword [ecx+ELM41] fstp dword [ecx+ELM42] add eax,edx fld dword [ebx+ELM1] ;01+8 fld dword [eax+DOTP2] fmul st0,st1 fld dword [eax+DOTP3] fmul st0,st2 fld dword [eax+DOTP1] rep fmul st0,st3 fxch st0,st3 fld dword [eax+DOTP5] rep fmul st0,st1 rep fld dword [eax+DOTP6] mov edx,edx fmul st0,st2 fld dword [ebx+ELM2] fld dword [eax+DOTP4] rep fmulp st4,st0 add eax,byte 30*4 mov edx,edx OPERATION 2,3 ;02+8 OPERATION 3,4 ;03+8 OPERATION 4,5 ;04+8 OPERATION 5,6 ;05+8 OPERATION 6,7 ;06+8 OPERATION 7,8 ;07+8 OPERATION 8,9 ;08+8 OPERATION 9,10 ;09+8 OPERATION 10,11 ;10+8 OPERATION 11,12 ;11+8 OPERATION 12,13 ;12+8 OPERATION 13,14 ;13+8 OPERATION 14,15 ;14+8 OPERATION 15,16 ;15+8 OPERATION 16,17 ;16+8 OPERATION 17,18 ;17+8 OPERATION 18,19 ;18+8 OPERATION 19,20 ;19+8 OPERATION 20,21 ;20+8 OPERATION 21,22 ;21+8 OPERATION 22,23 ;22+8 OPERATION 23,24 ;23+8 OPERATION 24,25 ;24+8 OPERATION 25,26 ;25+8 OPERATION 26,27 ;26+8 OPERATION 27,28 ;27+8 OPERATION 28,29 ;28+8 OPERATION 29,30 
;29+8 OPERATION 30,31 ;30+8 OPERATION 31,32 ;31+8 OPERATION 32,33 ;32+8 OPERATION 33,34 ;33+8 OPERATION 34,35 ;34+8 OPERATION 35,36 ;35+8 OPERATION 36,37 ;36+8 OPERATION 37,38 ;37+8 OPERATION 38,39 ;38+8 OPERATION 39,40 ;39+8 OPERATION 40,41 ;40+8 OPERATION 41,42 ;41+8 OPERATION 42,43 ;42+8 OPERATION 43,44 ;43+8 OPERATION 44,45 ;44+8 OPERATION 45,46 ;45+8 OPERATION 46,47 ;45+8 OPERATION 47,48 ;45+8 fld dword [eax+DOTP1+ELM48] ;48+8 fmul st0,st1 faddp st7 fld dword [eax+DOTP2+ELM48] fmul st0,st1 faddp st6 fld dword [eax+DOTP3+ELM48] fmul st0,st1 faddp st5 fld dword [eax+DOTP4+ELM48] fmul st0,st1 faddp st4 fld dword [eax+DOTP5+ELM48] fmul st0,st1 faddp st3 rep fmul dword [eax+DOTP6+ELM48] faddp st1 fxch st5 %ifdef PREA_EN mov [esp+20],edx ;save edx in t1 mov edx,[esp+16] ;&A+1->edx lea edx,[edx+ebx] prefetch [edx-2*64] nop prefetch [edx-1*64] prefetch [edx+0*64] nop prefetch [edx+1*64] prefetch [edx+2*64-8] mov edx,[esp+20] ;restore edx mov eax,eax fnop %endif fstp dword [ecx+ELM43] fxch st3 fstp dword [ecx+ELM44] fxch st1 fstp dword [ecx+ELM45] fstp dword [ecx+ELM46] fstp dword [ecx+ELM47] fstp dword [ecx+ELM48] sub ebx,edi ;next column of B mov eax,[esp+4] ;reset eax add ecx,[esp+12] ;next column of C (+ldc*4) dec dword [esp+8] ;dec counter jnz near loopj_end_ femms pop ebp add esp,byte 5*4 ;remove local variables pop edi ;restore registers pop esi pop ebx leave ;mov esp,ebp / pop ebp ret
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?