/* gemv_n_sse2.s — double-precision GEMV (no-transpose, y += alpha*A*x)
   SSE2 kernel, x86-64, AT&T/GAS syntax, run through the C preprocessor. */
addpd %xmm13, %xmm5 movsd 10 * SIZE(A2), %xmm13 movhpd 11 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd 12 * SIZE(A2), %xmm14 movhpd 13 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd 14 * SIZE(A2), %xmm15 movhpd 15 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 PREFETCH PREFETCHSIZE * SIZE(A2) addpd %xmm8, %xmm4 movapd 16 * SIZE(A1), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd 18 * SIZE(A1), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd 20 * SIZE(A1), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd 22 * SIZE(A1), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd 16 * SIZE(A2), %xmm12 movhpd 17 * SIZE(A2), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movsd 18 * SIZE(A2), %xmm13 movhpd 19 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd 20 * SIZE(A2), %xmm14 movhpd 21 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd 22 * SIZE(A2), %xmm15 movhpd 23 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 8 * SIZE(Y1) movapd 16 * SIZE(Y1), %xmm4 movapd %xmm5, 10 * SIZE(Y1) movapd 18 * SIZE(Y1), %xmm5 movapd %xmm6, 12 * SIZE(Y1) movapd 20 * SIZE(Y1), %xmm6 movapd %xmm7, 14 * SIZE(Y1) movapd 22 * SIZE(Y1), %xmm7 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, Y1 decq I jg .L53 ALIGN_3.L54: addpd %xmm8, %xmm4 movapd 8 * SIZE(A1), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd 10 * SIZE(A1), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd 12 * SIZE(A1), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd 14 * SIZE(A1), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd 8 * SIZE(A2), %xmm12 movhpd 9 * SIZE(A2), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movsd 10 * SIZE(A2), %xmm13 movhpd 11 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd 
12 * SIZE(A2), %xmm14 movhpd 13 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd 14 * SIZE(A2), %xmm15 movhpd 15 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 addpd %xmm8, %xmm4 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 addpd %xmm13, %xmm5 addpd %xmm14, %xmm6 addpd %xmm15, %xmm7 movapd %xmm4, 8 * SIZE(Y1) movapd %xmm5, 10 * SIZE(Y1) movapd %xmm6, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, Y1 ALIGN_3.L55: testq $8, MM je .L56 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd 4 * SIZE(Y1), %xmm6 movapd 6 * SIZE(Y1), %xmm7 movapd 0 * SIZE(A1), %xmm8 movapd 2 * SIZE(A1), %xmm9 movapd 4 * SIZE(A1), %xmm10 movapd 6 * SIZE(A1), %xmm11 movsd 0 * SIZE(A2), %xmm12 movhpd 1 * SIZE(A2), %xmm12 movsd 2 * SIZE(A2), %xmm13 movhpd 3 * SIZE(A2), %xmm13 movsd 4 * SIZE(A2), %xmm14 movhpd 5 * SIZE(A2), %xmm14 movsd 6 * SIZE(A2), %xmm15 movhpd 7 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 addpd %xmm8, %xmm4 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 addpd %xmm13, %xmm5 addpd %xmm14, %xmm6 addpd %xmm15, %xmm7 movapd %xmm4, 0 * SIZE(Y1) movapd %xmm5, 2 * SIZE(Y1) movapd %xmm6, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3.L56: testq $4, MM je .L57 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd 0 * SIZE(A1), %xmm8 movapd 2 * SIZE(A1), %xmm9 movsd 0 * SIZE(A2), %xmm10 movhpd 1 * SIZE(A2), %xmm10 movsd 2 * SIZE(A2), %xmm11 movhpd 3 * SIZE(A2), 
%xmm11 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm10 mulpd %xmm1, %xmm11 addpd %xmm8, %xmm4 addpd %xmm9, %xmm5 addpd %xmm10, %xmm4 addpd %xmm11, %xmm5 movapd %xmm4, 0 * SIZE(Y1) movapd %xmm5, 2 * SIZE(Y1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3.L57: testq $2, MM je .L58 movapd 0 * SIZE(Y1), %xmm4 movapd 0 * SIZE(A1), %xmm8 movsd 0 * SIZE(A2), %xmm10 movhpd 1 * SIZE(A2), %xmm10 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm4 addpd %xmm10, %xmm4 movapd %xmm4, 0 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3.L58: testq $1, MM je .L30 movsd 0 * SIZE(Y1), %xmm4 movsd 0 * SIZE(A1), %xmm8 movsd 0 * SIZE(A2), %xmm9 mulsd %xmm0, %xmm8 mulsd %xmm1, %xmm9 addsd %xmm8, %xmm4 addsd %xmm9, %xmm4 movsd %xmm4, 0 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3.L60: testq $1, N je .L990 movq BUFFER, Y1 movq A, A1 movsd (X), %xmm0 mulsd STACK_ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 testq $SIZE, A je .L62 movsd (Y1), %xmm4 movsd (A1), %xmm8 mulsd %xmm0, %xmm8 addsd %xmm8, %xmm4 movsd %xmm4, (Y1) addq $1 * SIZE, A1 addq $1 * SIZE, Y1 ALIGN_3.L62: movq MM, I sarq $4, I jle .L65 movapd 0 * SIZE(A1), %xmm8 movapd 2 * SIZE(A1), %xmm9 movapd 4 * SIZE(A1), %xmm10 movapd 6 * SIZE(A1), %xmm11 movapd 8 * SIZE(A1), %xmm12 mulpd %xmm0, %xmm8 movapd 10 * SIZE(A1), %xmm13 mulpd %xmm0, %xmm9 movapd 12 * SIZE(A1), %xmm14 mulpd %xmm0, %xmm10 movapd 14 * SIZE(A1), %xmm15 mulpd %xmm0, %xmm11 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd 4 * SIZE(Y1), %xmm6 movapd 6 * SIZE(Y1), %xmm7 decq I jle .L64 ALIGN_3.L63: PREFETCH PREFETCHSIZE * SIZE(A1) addpd %xmm8, %xmm4 movapd 16 * SIZE(A1), %xmm8 mulpd %xmm0, %xmm12 addpd %xmm9, %xmm5 movapd 18 * SIZE(A1), %xmm9 mulpd %xmm0, %xmm13 addpd %xmm10, %xmm6 movapd 20 * SIZE(A1), %xmm10 mulpd %xmm0, %xmm14 addpd %xmm11, %xmm7 movapd 22 * SIZE(A1), %xmm11 mulpd %xmm0, %xmm15 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) 
movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 PREFETCH PREFETCHSIZE * SIZE(Y1) addpd %xmm12, %xmm4 movapd 24 * SIZE(A1), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movapd 26 * SIZE(A1), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movapd 28 * SIZE(A1), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movapd 30 * SIZE(A1), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 8 * SIZE(Y1) movapd 16 * SIZE(Y1), %xmm4 movapd %xmm5, 10 * SIZE(Y1) movapd 18 * SIZE(Y1), %xmm5 movapd %xmm6, 12 * SIZE(Y1) movapd 20 * SIZE(Y1), %xmm6 movapd %xmm7, 14 * SIZE(Y1) movapd 22 * SIZE(Y1), %xmm7 addq $16 * SIZE, A1 addq $16 * SIZE, Y1 decq I jg .L63 ALIGN_3.L64: addpd %xmm8, %xmm4 mulpd %xmm0, %xmm12 addpd %xmm9, %xmm5 mulpd %xmm0, %xmm13 addpd %xmm10, %xmm6 mulpd %xmm0, %xmm14 addpd %xmm11, %xmm7 mulpd %xmm0, %xmm15 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 addpd %xmm12, %xmm4 addpd %xmm13, %xmm5 addpd %xmm14, %xmm6 addpd %xmm15, %xmm7 movapd %xmm4, 8 * SIZE(Y1) movapd %xmm5, 10 * SIZE(Y1) movapd %xmm6, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, Y1 ALIGN_3.L65: testq $8, MM je .L66 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd 4 * SIZE(Y1), %xmm6 movapd 6 * SIZE(Y1), %xmm7 movapd 0 * SIZE(A1), %xmm8 movapd 2 * SIZE(A1), %xmm9 movapd 4 * SIZE(A1), %xmm10 movapd 6 * SIZE(A1), %xmm11 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 addpd %xmm8, %xmm4 addpd %xmm9, %xmm5 addpd %xmm10, %xmm6 addpd %xmm11, %xmm7 movapd %xmm4, 0 * SIZE(Y1) movapd %xmm5, 2 * SIZE(Y1) movapd %xmm6, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3.L66: testq $4, MM je .L67 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), 
%xmm5 movapd 0 * SIZE(A1), %xmm8 movapd 2 * SIZE(A1), %xmm9 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 addpd %xmm8, %xmm4 addpd %xmm9, %xmm5 movapd %xmm4, 0 * SIZE(Y1) movapd %xmm5, 2 * SIZE(Y1) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3.L67: testq $2, MM je .L68 movapd 0 * SIZE(Y1), %xmm4 movapd 0 * SIZE(A1), %xmm8 mulpd %xmm0, %xmm8 addpd %xmm8, %xmm4 movapd %xmm4, 0 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3.L68: testq $1, MM je .L990 movsd 0 * SIZE(Y1), %xmm4 movsd 0 * SIZE(A1), %xmm8 mulsd %xmm0, %xmm8 addsd %xmm8, %xmm4 movsd %xmm4, 0 * SIZE(Y1) ALIGN_3.L990: movq Y, Y1 testq $SIZE, A je .L991 movsd (Y), %xmm4 addsd (BUFFER), %xmm4 movsd %xmm4, (Y1) addq INCY, Y addq INCY, Y1 addq $1 * SIZE, BUFFER ALIGN_3.L991: movq MM, %rax sarq $3, %rax jle .L994 ALIGN_3.L992: movsd (Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y movsd (Y), %xmm5 addq INCY, Y movhpd (Y), %xmm5 addq INCY, Y movsd (Y), %xmm6 addq INCY, Y movhpd (Y), %xmm6 addq INCY, Y movsd (Y), %xmm7 addq INCY, Y movhpd (Y), %xmm7 addq INCY, Y movapd (BUFFER), %xmm0 movapd 2 * SIZE(BUFFER), %xmm1 movapd 4 * SIZE(BUFFER), %xmm2 movapd 6 * SIZE(BUFFER), %xmm3 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 movsd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 movsd %xmm1, (Y1) addq INCY, Y1 movhpd %xmm1, (Y1) addq INCY, Y1 movsd %xmm2, (Y1) addq INCY, Y1 movhpd %xmm2, (Y1) addq INCY, Y1 movsd %xmm3, (Y1) addq INCY, Y1 movhpd %xmm3, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER decq %rax jg .L992 ALIGN_3.L994: testq $7, MM jle .L999 testq $4, MM jle .L995 movsd (Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y movsd (Y), %xmm5 addq INCY, Y movhpd (Y), %xmm5 addq INCY, Y movapd (BUFFER), %xmm0 movapd 2 * SIZE(BUFFER), %xmm1 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 movsd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 movsd %xmm1, (Y1) addq INCY, Y1 movhpd %xmm1, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3.L995: testq $2, MM jle .L996 movsd 
(Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y movapd (BUFFER), %xmm0 addpd %xmm4, %xmm0 movsd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 addq $2 * SIZE, BUFFER ALIGN_3.L996: testq $1, MM jle .L999 movsd (Y), %xmm4 addq INCY, Y movsd (BUFFER), %xmm0 addsd %xmm4, %xmm0 movsd %xmm0, (Y1) ALIGN_3.L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15#ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15#endif addq $STACKSIZE, %rsp ret EPILOGUE