gemm_kernel_8x1_sse2.s
# ... tail of the 8-way unrolled K loop (.L11) for the 8x1 micro-tile
	addpd	%xmm3, %xmm4
	movapd	42 * SIZE(AA), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm5
	movapd	44 * SIZE(AA), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	46 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm6
	movapd	56 * SIZE(AA), %xmm3
	movd	(PRE + 48) * SIZE(AA), %mm0
	addpd	%xmm2, %xmm7
	movapd	12 * SIZE(BB), %xmm2
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm4
	movapd	50 * SIZE(AA), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm5
	movapd	52 * SIZE(AA), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	54 * SIZE(AA), %xmm2
	addpd	%xmm1, %xmm6
	movapd	64 * SIZE(AA), %xmm1
	movd	(PRE + 56) * SIZE(AA), %mm0
	addpd	%xmm2, %xmm7
	movapd	14 * SIZE(BB), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm4
	movapd	58 * SIZE(AA), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm5
	movapd	60 * SIZE(AA), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	62 * SIZE(AA), %xmm2
	addpd	%xmm3, %xmm6
	movapd	72 * SIZE(AA), %xmm3
	addpd	%xmm2, %xmm7
	movapd	24 * SIZE(BB), %xmm2
	addl	$64 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L11
#endif

# K remainder (K & 7) for the 8x1 block
.L12:
	movapd	ALPHA, %xmm3
	movl	K, %eax
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L14

.L13:
	movapd	0 * SIZE(BB), %xmm0
	movapd	0 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	2 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm5
	movapd	4 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm6
	mulpd	6 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm7
	addl	$8 * SIZE, AA		# aoffset  += 8
	addl	$2 * SIZE, BB		# boffset1 += 2
	subl	$1, %eax
	jg	.L13
	ALIGN_4

# scale by alpha and accumulate into 8 elements of C
.L14:
	mulpd	%xmm3, %xmm4
	mulpd	%xmm3, %xmm5
	mulpd	%xmm3, %xmm6
	mulpd	%xmm3, %xmm7
	movsd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%esi), %xmm1
	movhpd	3 * SIZE(%esi), %xmm1
	movsd	4 * SIZE(%esi), %xmm2
	movhpd	5 * SIZE(%esi), %xmm2
	movsd	6 * SIZE(%esi), %xmm3
	movhpd	7 * SIZE(%esi), %xmm3
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
	addpd	%xmm2, %xmm6
	addpd	%xmm3, %xmm7
	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	movsd	%xmm5, 2 * SIZE(%esi)
	movhpd	%xmm5, 3 * SIZE(%esi)
	movsd	%xmm6, 4 * SIZE(%esi)
	movhpd	%xmm6, 5 * SIZE(%esi)
	movsd	%xmm7, 6 * SIZE(%esi)
	movhpd	%xmm7, 7 * SIZE(%esi)
	addl	$8 * SIZE, %esi		# coffset += 8
	BRANCH
	decl	%ebx			# i --
	jg	.L10
	ALIGN_2

# M & 4: 4x1 tail block
.L20:
	movl	M, %ebx
	testl	$4, %ebx
	jle	.L30

	leal	BUFFER, %ecx
	movl	K, %eax
	movapd	0 * SIZE + BUFFER, %xmm2
	movapd	0 * SIZE(%edx), %xmm0
	movapd	8 * SIZE + BUFFER, %xmm3
	movapd	8 * SIZE(%edx), %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	sarl	$3, %eax
	je	.L22

.L21:
	movapd	0 * SIZE(BB), %xmm0
	movapd	0 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	2 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	2 * SIZE(BB), %xmm0
	movapd	4 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	6 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	4 * SIZE(BB), %xmm0
	movapd	8 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	10 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	6 * SIZE(BB), %xmm0
	movapd	12 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	14 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	8 * SIZE(BB), %xmm0
	movapd	16 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	18 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	10 * SIZE(BB), %xmm0
	movapd	20 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	22 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	12 * SIZE(BB), %xmm0
	movapd	24 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	26 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	movapd	14 * SIZE(BB), %xmm0
	movapd	28 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	30 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	addl	$32 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L21

.L22:
	movapd	ALPHA, %xmm3
	movl	K, %eax
	andl	$7, %eax
	BRANCH
	je	.L24

.L23:
	movapd	0 * SIZE(BB), %xmm0
	movapd	0 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	mulpd	2 * SIZE(AA), %xmm0
	addpd	%xmm0, %xmm5
	addl	$4 * SIZE, AA		# aoffset  += 4
	addl	$2 * SIZE, BB		# boffset1 += 2
	subl	$1, %eax
	jg	.L23
	ALIGN_4

.L24:
	mulpd	%xmm3, %xmm4
	mulpd	%xmm3, %xmm5
	movsd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	movsd	2 * SIZE(%esi), %xmm1
	movhpd	3 * SIZE(%esi), %xmm1
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	movsd	%xmm5, 2 * SIZE(%esi)
	movhpd	%xmm5, 3 * SIZE(%esi)
	addl	$4 * SIZE, %esi		# coffset += 4
	ALIGN_4

# M & 2: 2x1 tail block
.L30:
	movl	M, %ebx
	testl	$2, %ebx
	jle	.L50

	leal	BUFFER, %ecx
	movl	K, %eax
	movapd	0 * SIZE + BUFFER, %xmm2
	movapd	0 * SIZE(AA), %xmm0
	movapd	8 * SIZE + BUFFER, %xmm3
	movapd	8 * SIZE(AA), %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	sarl	$3, %eax
	je	.L32

.L31:
	movapd	0 * SIZE(BB), %xmm0
	movapd	0 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	2 * SIZE(BB), %xmm0
	movapd	2 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	4 * SIZE(BB), %xmm0
	movapd	4 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	6 * SIZE(BB), %xmm0
	movapd	6 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	8 * SIZE(BB), %xmm0
	movapd	8 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	10 * SIZE(BB), %xmm0
	movapd	10 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	12 * SIZE(BB), %xmm0
	movapd	12 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	movapd	14 * SIZE(BB), %xmm0
	movapd	14 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	addl	$16 * SIZE, AA
	addl	$16 * SIZE, BB
	BRANCH
	decl	%eax
	jne	.L31

.L32:
	movapd	ALPHA, %xmm3
	movl	K, %eax
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L34

.L33:
	movapd	0 * SIZE(BB), %xmm0
	movapd	0 * SIZE(AA), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm4
	addl	$2 * SIZE, AA		# aoffset  += 2
	addl	$2 * SIZE, BB		# boffset1 += 2
	decl	%eax
	BRANCH
	jg	.L33
	ALIGN_4

.L34:
	mulpd	%xmm3, %xmm4
	movsd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	addpd	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	addl	$2 * SIZE, %esi
	ALIGN_2

# M & 1: final single row
.L50:
	movl	M, %ebx
	testl	$1, %ebx
	jle	.L99

	leal	BUFFER, %ecx
	movl	K, %eax
	movsd	0 * SIZE + BUFFER, %xmm2
	movsd	0 * SIZE(AA), %xmm0
	movsd	8 * SIZE + BUFFER, %xmm3
	movsd	4 * SIZE(AA), %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	sarl	$3, %eax
	je	.L52

.L51:
	movsd	0 * SIZE(AA), %xmm0
	mulsd	0 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	1 * SIZE(AA), %xmm0
	mulsd	2 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	2 * SIZE(AA), %xmm0
	mulsd	4 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	3 * SIZE(AA), %xmm0
	mulsd	6 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	4 * SIZE(AA), %xmm0
	mulsd	8 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	5 * SIZE(AA), %xmm0
	mulsd	10 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	6 * SIZE(AA), %xmm0
	mulsd	12 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	movsd	7 * SIZE(AA), %xmm0
	mulsd	14 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	addl	$ 8 * SIZE, AA
	addl	$16 * SIZE, BB
	BRANCH
	decl	%eax
	jne	.L51

.L52:
	movsd	ALPHA, %xmm3
	movl	K, %eax
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L54

.L53:
	movsd	0 * SIZE(AA), %xmm0
	mulsd	0 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm4
	addl	$1 * SIZE, AA		# aoffset  += 1
	addl	$2 * SIZE, BB		# boffset1 += 2
	decl	%eax
	BRANCH
	jg	.L53
	ALIGN_4

.L54:
	movsd	0 * SIZE(%esi), %xmm0
	mulsd	%xmm3, %xmm4
	addsd	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%esi)
	ALIGN_2

# advance C to the next column and continue the outer (N) loop
.L99:
	addl	LDC, C
	decl	J			# j --
	jg	.L01
	ALIGN_2

# restore callee-saved registers and return
.L999:
	movl	OLD_STACK, %esp
	EMMS
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	ALIGN_2

	EPILOGUE
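# For reference, a minimal scalar sketch in C of the computation this 8x1
# kernel performs for one column of B: c[0..m-1] += alpha * A * b, with M
# blocked by 8/4/2/1 in the assembly above. The function name, arguments,
# and the plain row-major layout of `a` are illustrative only; the real
# kernel reads packed A panels and a BUFFER copy of B, not this layout.

#include <stddef.h>

static void gemm_n1_reference(size_t m, size_t k, double alpha,
                              const double *a,  /* m x k panel, row-major here */
                              const double *b,  /* one k x 1 column of B       */
                              double *c)        /* one m x 1 column of C       */
{
    for (size_t i = 0; i < m; i++) {
        double acc = 0.0;
        for (size_t p = 0; p < k; p++)
            acc += a[i * k + p] * b[p];   /* dot product along the K dimension */
        c[i] += alpha * acc;              /* C is accumulated, not overwritten */
    }
}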