📄 zgemv_n_sse2_barcelona.s
字号:
/* NOTE(review): this chunk is the tail of the software-pipelined 2-column
 * main loop of a double-complex GEMV-N kernel (zgemv_n, SSE2, Barcelona).
 * The loop entry, the function prologue, and the macro definitions
 * (MOVDDUP, MOVDDUP2, ADD, ALIGN_3, SIZE, and the register aliases
 * A1/LDA/Y1/M) live earlier in the file, outside this view.
 * Register roles visible here:
 *   xmm0..xmm3 = alpha-scaled real/imag broadcasts of x[j], x[j+1]
 *   xmm4..xmm7 = y accumulators (2 doubles = 1 complex each)
 *   xmm8..xmm15 = broadcast A elements; even regs = real, odd = imag
 * ADD is presumably addpd or subpd depending on conjugation -- confirm
 * against the file header.  SIZE is the element size in doubles. */
	/* store first-half results y[0..3] while loading y[4..7] (in SIZE units: offsets 8..15) */
	movhpd %xmm4, 1 * SIZE(Y1)
	movupd 8 * SIZE(Y1), %xmm4
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	movupd 10 * SIZE(Y1), %xmm5
	movlpd %xmm6, 4 * SIZE(Y1)
	movhpd %xmm6, 5 * SIZE(Y1)
	movupd 12 * SIZE(Y1), %xmm6
	movlpd %xmm7, 6 * SIZE(Y1)
	movhpd %xmm7, 7 * SIZE(Y1)
	movupd 14 * SIZE(Y1), %xmm7
	/* column 1: accumulate real parts, refill from column 2 (A1 + LDA) */
	addpd %xmm8, %xmm4
	MOVDDUP2( 8 * SIZE, (A1, LDA, 1), %xmm8)
	mulpd %xmm1, %xmm9
	addpd %xmm10, %xmm5
	MOVDDUP2(10 * SIZE, (A1, LDA, 1), %xmm10)
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	MOVDDUP2(12 * SIZE, (A1, LDA, 1), %xmm12)
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	MOVDDUP2(14 * SIZE, (A1, LDA, 1), %xmm14)
	mulpd %xmm1, %xmm15
	/* column 1: accumulate imag contributions, scale column-2 reals by xmm2 */
	ADD %xmm9, %xmm4
	MOVDDUP2( 9 * SIZE, (A1, LDA, 1), %xmm9)
	mulpd %xmm2, %xmm8
	ADD %xmm11, %xmm5
	MOVDDUP2(11 * SIZE, (A1, LDA, 1), %xmm11)
	mulpd %xmm2, %xmm10
	ADD %xmm13, %xmm6
	MOVDDUP2(13 * SIZE, (A1, LDA, 1), %xmm13)
	mulpd %xmm2, %xmm12
	ADD %xmm15, %xmm7
	MOVDDUP2(15 * SIZE, (A1, LDA, 1), %xmm15)
	mulpd %xmm2, %xmm14
	/* column 2: accumulate real (addpd) then imag (ADD) contributions */
	addpd %xmm8, %xmm4
	mulpd %xmm3, %xmm9
	addpd %xmm10, %xmm5
	mulpd %xmm3, %xmm11
	addpd %xmm12, %xmm6
	mulpd %xmm3, %xmm13
	addpd %xmm14, %xmm7
	mulpd %xmm3, %xmm15
	ADD %xmm9, %xmm4
	ADD %xmm11, %xmm5
	ADD %xmm13, %xmm6
	ADD %xmm15, %xmm7
	/* store y[4..7]; movlpd/movhpd pair writes low/high double separately */
	movlpd %xmm4, 8 * SIZE(Y1)
	movhpd %xmm4, 9 * SIZE(Y1)
	movlpd %xmm5, 10 * SIZE(Y1)
	movhpd %xmm5, 11 * SIZE(Y1)
	movlpd %xmm6, 12 * SIZE(Y1)
	movhpd %xmm6, 13 * SIZE(Y1)
	movlpd %xmm7, 14 * SIZE(Y1)
	movhpd %xmm7, 15 * SIZE(Y1)
	/* advance by 8 complex elements; subq $-16 encodes add +16 in a shorter form */
	subq $-16 * SIZE, A1
	subq $-16 * SIZE, Y1
	ALIGN_3

/* .L26: remainder for (M & 4): process 4 complex y elements with the
 * same two columns of A. */
.L26:
	testq $4, M
	je .L27
	/* broadcast 4 complex A elements of column 1 (real in even regs, imag in odd) */
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP( 4 * SIZE, A1, %xmm12)
	MOVDDUP( 5 * SIZE, A1, %xmm13)
	MOVDDUP( 6 * SIZE, A1, %xmm14)
	MOVDDUP( 7 * SIZE, A1, %xmm15)
	/* load 4 complex y values */
	movupd 0 * SIZE(Y1), %xmm4
	movupd 2 * SIZE(Y1), %xmm5
	movupd 4 * SIZE(Y1), %xmm6
	movupd 6 * SIZE(Y1), %xmm7
	/* scale column-1 reals by xmm0 */
	mulpd %xmm0, %xmm8
	mulpd %xmm0, %xmm10
	mulpd %xmm0, %xmm12
	mulpd %xmm0, %xmm14
	/* begin accumulation, refilling regs from column 2 (continues on next chunk) */
	addpd %xmm8, %xmm4
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm8)
	mulpd %xmm1, %xmm9
	addpd %xmm10, %xmm5
/* Continuation of the .L26 (M & 4) remainder of the 2-column pass, then
 * the .L27 (M & 2) and .L28 (M & 1) remainders.  See earlier sections for
 * register roles: xmm0/xmm1 = alpha-scaled real/imag broadcast of x for
 * column 1, xmm2/xmm3 for column 2; ADD is a conjugation-dependent macro
 * (presumably addpd or subpd) defined outside this view. */
	/* finish column-1 accumulation while loading column-2 elements */
	MOVDDUP2( 2 * SIZE, (A1, LDA, 1), %xmm10)
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	MOVDDUP2( 4 * SIZE, (A1, LDA, 1), %xmm12)
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	MOVDDUP2( 6 * SIZE, (A1, LDA, 1), %xmm14)
	mulpd %xmm1, %xmm15
	ADD %xmm9, %xmm4
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm9)
	mulpd %xmm2, %xmm8
	ADD %xmm11, %xmm5
	MOVDDUP2( 3 * SIZE, (A1, LDA, 1), %xmm11)
	mulpd %xmm2, %xmm10
	ADD %xmm13, %xmm6
	MOVDDUP2( 5 * SIZE, (A1, LDA, 1), %xmm13)
	mulpd %xmm2, %xmm12
	ADD %xmm15, %xmm7
	MOVDDUP2( 7 * SIZE, (A1, LDA, 1), %xmm15)
	mulpd %xmm2, %xmm14
	/* column 2: real (addpd) then imag (ADD) contributions */
	addpd %xmm8, %xmm4
	mulpd %xmm3, %xmm9
	addpd %xmm10, %xmm5
	mulpd %xmm3, %xmm11
	addpd %xmm12, %xmm6
	mulpd %xmm3, %xmm13
	addpd %xmm14, %xmm7
	mulpd %xmm3, %xmm15
	ADD %xmm9, %xmm4
	ADD %xmm11, %xmm5
	ADD %xmm13, %xmm6
	ADD %xmm15, %xmm7
	/* store 4 complex results and advance pointers by 4 complex elements */
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	movlpd %xmm6, 4 * SIZE(Y1)
	movhpd %xmm6, 5 * SIZE(Y1)
	movlpd %xmm7, 6 * SIZE(Y1)
	movhpd %xmm7, 7 * SIZE(Y1)
	addq $8 * SIZE, A1
	addq $8 * SIZE, Y1
	ALIGN_3

/* .L27: remainder for (M & 2): 2 complex y elements, both columns at once. */
.L27:
	testq $2, M
	je .L28
	/* column-1 broadcasts in xmm8..11, column-2 in xmm12..15 */
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm12)
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm13)
	MOVDDUP2( 2 * SIZE, (A1, LDA, 1), %xmm14)
	MOVDDUP2( 3 * SIZE, (A1, LDA, 1), %xmm15)
	movupd 0 * SIZE(Y1), %xmm4
	movupd 2 * SIZE(Y1), %xmm5
	/* multiply by the four x broadcasts and fold into y (interleaved for ILP) */
	mulpd %xmm0, %xmm8
	mulpd %xmm1, %xmm9
	mulpd %xmm0, %xmm10
	mulpd %xmm1, %xmm11
	mulpd %xmm2, %xmm12
	addpd %xmm8, %xmm4
	mulpd %xmm3, %xmm13
	addpd %xmm10, %xmm5
	mulpd %xmm2, %xmm14
	ADD %xmm9, %xmm4
	mulpd %xmm3, %xmm15
	ADD %xmm11, %xmm5
	addpd %xmm12, %xmm4
	addpd %xmm14, %xmm5
	ADD %xmm13, %xmm4
	ADD %xmm15, %xmm5
	/* store 2 complex results, advance by 2 complex elements */
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	addq $4 * SIZE, A1
	addq $4 * SIZE, Y1
	ALIGN_3

/* .L28: remainder for (M & 1): one complex y element (continues in next chunk). */
.L28:
	testq $1, M
	je .L30
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm12)
/* End of the .L28 (M & 1) remainder of the 2-column pass, then .L30: the
 * single-column pass taken when N is odd.  ALPHA_R/ALPHA_I/COMP_MASK,
 * ADDX and SHUFPD_1 are macros/stack constants defined earlier in the
 * file (outside this view); COMP_MASK presumably flips a sign bit to form
 * the conjugate-style companion of alpha*x -- confirm in the file header. */
	/* .L28 tail: one complex element from each of the two columns */
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm13)
	movupd 0 * SIZE(Y1), %xmm4
	mulpd %xmm0, %xmm8
	mulpd %xmm1, %xmm9
	mulpd %xmm2, %xmm12
	mulpd %xmm3, %xmm13
	addpd %xmm8, %xmm4
	ADD %xmm9, %xmm4
	addpd %xmm12, %xmm4
	ADD %xmm13, %xmm4
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	ALIGN_3

/* .L30: last column when N is odd.  Build xmm0 = alpha * x[j] and
 * xmm1 = its swapped/sign-flipped companion for the complex product. */
.L30:
	testq $1, N
	je .L995
	movq YY, Y1
	movq A, A1
	/* broadcast real/imag of x[j]; advance X by its stride */
	MOVDDUP(0 * SIZE, X, %xmm0)
	MOVDDUP(1 * SIZE, X, %xmm1)
	addq INCX, X
	movupd 0 + ALPHA_R, %xmm13
	movupd 0 + ALPHA_I, %xmm14
	movupd 0 + COMP_MASK, %xmm15
	/* xmm0 = alpha_r*re(x) (+/-) alpha_i*im(x) per lane (ADDX macro) */
	mulpd %xmm13, %xmm0
	mulpd %xmm14, %xmm1
	ADDX %xmm1, %xmm0
	/* xmm1 = lane-swapped xmm0 with sign mask applied */
	movapd %xmm0, %xmm1
	SHUFPD_1 %xmm1, %xmm1
	xorpd %xmm15, %xmm1
	/* main loop count: 8 complex elements per iteration */
	movq M, I
	sarq $3, I
	jle .L36
	/* preload first 8 A broadcasts and 8 complex y values (pipeline prologue) */
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 4 * SIZE, A1, %xmm12)
	MOVDDUP( 6 * SIZE, A1, %xmm14)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP( 5 * SIZE, A1, %xmm13)
	MOVDDUP( 7 * SIZE, A1, %xmm15)
	movupd 0 * SIZE(Y1), %xmm4
	movupd 2 * SIZE(Y1), %xmm5
	movupd 4 * SIZE(Y1), %xmm6
	movupd 6 * SIZE(Y1), %xmm7
	mulpd %xmm0, %xmm8
	mulpd %xmm0, %xmm10
	mulpd %xmm0, %xmm12
	mulpd %xmm0, %xmm14
	decq I
	jle .L35
	ALIGN_3

/* .L34: pipelined single-column main loop, 8 complex elements per pass,
 * with software prefetch of both the matrix column and y. */
.L34:
	PREFETCH PREFETCHSIZE * SIZE(A1)
	/* accumulate reals for y[0..3] while loading broadcasts for y[4..7] */
	addpd %xmm8, %xmm4
	MOVDDUP( 8 * SIZE, A1, %xmm8)
	mulpd %xmm1, %xmm9
	addpd %xmm10, %xmm5
	MOVDDUP(10 * SIZE, A1, %xmm10)
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	MOVDDUP(12 * SIZE, A1, %xmm12)
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	MOVDDUP(14 * SIZE, A1, %xmm14)
	mulpd %xmm1, %xmm15
	PREFETCHW PREFETCHSIZE * SIZE(Y1)
	/* accumulate imag contributions; scale next batch reals by xmm0 */
	ADD %xmm9, %xmm4
	MOVDDUP( 9 * SIZE, A1, %xmm9)
	mulpd %xmm0, %xmm8
	ADD %xmm11, %xmm5
	MOVDDUP(11 * SIZE, A1, %xmm11)
	mulpd %xmm0, %xmm10
	ADD %xmm13, %xmm6
	MOVDDUP(13 * SIZE, A1, %xmm13)
	mulpd %xmm0, %xmm12
	ADD %xmm15, %xmm7
	MOVDDUP(15 * SIZE, A1, %xmm15)
	mulpd %xmm0, %xmm14
	/* store y[0..3], load y[4..7] */
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	movupd 8 * SIZE(Y1), %xmm4
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	movupd 10 * SIZE(Y1), %xmm5
	movlpd %xmm6, 4 * SIZE(Y1)
	movhpd %xmm6, 5 * SIZE(Y1)
	movupd 12 * SIZE(Y1), %xmm6
	movlpd %xmm7, 6 * SIZE(Y1)
	/* NOTE(review): the scrape split this instruction across a physical
	 * line break; its operands (%xmm7, 7 * SIZE(Y1)) begin the next chunk. */
	movhpd
/* NOTE(review): this chunk opens with the operands of a movhpd whose
 * mnemonic ended the previous physical line (scrape artifact).  It is the
 * second half of the .L34 pipelined loop, followed by the .L35 drain. */
	%xmm7, 7 * SIZE(Y1)
	movupd 14 * SIZE(Y1), %xmm7
	PREFETCH (PREFETCHSIZE + 8) * SIZE(A1)
	/* second half-iteration: y[4..7], loading broadcasts for the next pass (16..23) */
	addpd %xmm8, %xmm4
	MOVDDUP(16 * SIZE, A1, %xmm8)
	mulpd %xmm1, %xmm9
	addpd %xmm10, %xmm5
	MOVDDUP(18 * SIZE, A1, %xmm10)
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	MOVDDUP(20 * SIZE, A1, %xmm12)
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	MOVDDUP(22 * SIZE, A1, %xmm14)
	mulpd %xmm1, %xmm15
	PREFETCHW (PREFETCHSIZE + 8) * SIZE(Y1)
	ADD %xmm9, %xmm4
	MOVDDUP(17 * SIZE, A1, %xmm9)
	mulpd %xmm0, %xmm8
	ADD %xmm11, %xmm5
	MOVDDUP(19 * SIZE, A1, %xmm11)
	mulpd %xmm0, %xmm10
	ADD %xmm13, %xmm6
	MOVDDUP(21 * SIZE, A1, %xmm13)
	mulpd %xmm0, %xmm12
	ADD %xmm15, %xmm7
	MOVDDUP(23 * SIZE, A1, %xmm15)
	mulpd %xmm0, %xmm14
	/* store y[4..7], preload next iteration's y[0..3] (offsets 16..23) */
	movlpd %xmm4, 8 * SIZE(Y1)
	movhpd %xmm4, 9 * SIZE(Y1)
	movupd 16 * SIZE(Y1), %xmm4
	movlpd %xmm5, 10 * SIZE(Y1)
	movhpd %xmm5, 11 * SIZE(Y1)
	movupd 18 * SIZE(Y1), %xmm5
	movlpd %xmm6, 12 * SIZE(Y1)
	movhpd %xmm6, 13 * SIZE(Y1)
	movupd 20 * SIZE(Y1), %xmm6
	movlpd %xmm7, 14 * SIZE(Y1)
	movhpd %xmm7, 15 * SIZE(Y1)
	movupd 22 * SIZE(Y1), %xmm7
	/* advance 8 complex elements; subq $-16 encodes add +16 */
	subq $-16 * SIZE, A1
	subq $-16 * SIZE, Y1
	decq I
	jg .L34
	ALIGN_3

/* .L35: drain the pipeline -- final 8 complex elements already preloaded. */
.L35:
	addpd %xmm8, %xmm4
	MOVDDUP( 8 * SIZE, A1, %xmm8)
	mulpd %xmm1, %xmm9
	addpd %xmm10, %xmm5
	MOVDDUP(10 * SIZE, A1, %xmm10)
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	MOVDDUP(12 * SIZE, A1, %xmm12)
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	MOVDDUP(14 * SIZE, A1, %xmm14)
	mulpd %xmm1, %xmm15
	ADD %xmm9, %xmm4
	MOVDDUP( 9 * SIZE, A1, %xmm9)
	mulpd %xmm0, %xmm8
	ADD %xmm11, %xmm5
	MOVDDUP(11 * SIZE, A1, %xmm11)
	mulpd %xmm0, %xmm10
	ADD %xmm13, %xmm6
	MOVDDUP(13 * SIZE, A1, %xmm13)
	mulpd %xmm0, %xmm12
	ADD %xmm15, %xmm7
	MOVDDUP(15 * SIZE, A1, %xmm15)
	mulpd %xmm0, %xmm14
	/* store first 4, load last 4 y values */
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	movupd 8 * SIZE(Y1), %xmm4
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	movupd 10 * SIZE(Y1), %xmm5
	movlpd %xmm6, 4 * SIZE(Y1)
	movhpd %xmm6, 5 * SIZE(Y1)
	movupd 12 * SIZE(Y1), %xmm6
	movlpd %xmm7, 6 * SIZE(Y1)
	movhpd %xmm7, 7 * SIZE(Y1)
	movupd 14 * SIZE(Y1), %xmm7
	addpd %xmm8, %xmm4
	/* NOTE(review): scrape artifact -- this mulpd's destination (%xmm9)
	 * begins the next physical line. */
	mulpd %xmm1,
/* NOTE(review): opens with the trailing operand of a mulpd split by the
 * scrape.  End of the .L35 drain, then the single-column (M & 4) and
 * (M & 2) remainders at .L36/.L37. */
	%xmm9
	addpd %xmm10, %xmm5
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	mulpd %xmm1, %xmm15
	ADD %xmm9, %xmm4
	mulpd %xmm0, %xmm8
	ADD %xmm11, %xmm5
	mulpd %xmm0, %xmm10
	ADD %xmm13, %xmm6
	mulpd %xmm0, %xmm12
	ADD %xmm15, %xmm7
	mulpd %xmm0, %xmm14
	/* store last 4 complex results of the drained iteration */
	movlpd %xmm4, 8 * SIZE(Y1)
	movhpd %xmm4, 9 * SIZE(Y1)
	movlpd %xmm5, 10 * SIZE(Y1)
	movhpd %xmm5, 11 * SIZE(Y1)
	movlpd %xmm6, 12 * SIZE(Y1)
	movhpd %xmm6, 13 * SIZE(Y1)
	movlpd %xmm7, 14 * SIZE(Y1)
	movhpd %xmm7, 15 * SIZE(Y1)
	subq $-16 * SIZE, A1
	subq $-16 * SIZE, Y1
	ALIGN_3

/* .L36: single-column remainder for (M & 4): 4 complex elements. */
.L36:
	testq $4, M
	je .L37
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP( 4 * SIZE, A1, %xmm12)
	MOVDDUP( 5 * SIZE, A1, %xmm13)
	MOVDDUP( 6 * SIZE, A1, %xmm14)
	MOVDDUP( 7 * SIZE, A1, %xmm15)
	movupd 0 * SIZE(Y1), %xmm4
	movupd 2 * SIZE(Y1), %xmm5
	movupd 4 * SIZE(Y1), %xmm6
	movupd 6 * SIZE(Y1), %xmm7
	/* real contributions via xmm0, imag via xmm1 + ADD macro */
	mulpd %xmm0, %xmm8
	mulpd %xmm0, %xmm10
	mulpd %xmm0, %xmm12
	mulpd %xmm0, %xmm14
	addpd %xmm8, %xmm4
	mulpd %xmm1, %xmm9
	addpd %xmm10, %xmm5
	mulpd %xmm1, %xmm11
	addpd %xmm12, %xmm6
	mulpd %xmm1, %xmm13
	addpd %xmm14, %xmm7
	mulpd %xmm1, %xmm15
	ADD %xmm9, %xmm4
	ADD %xmm11, %xmm5
	ADD %xmm13, %xmm6
	ADD %xmm15, %xmm7
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	movlpd %xmm6, 4 * SIZE(Y1)
	movhpd %xmm6, 5 * SIZE(Y1)
	movlpd %xmm7, 6 * SIZE(Y1)
	movhpd %xmm7, 7 * SIZE(Y1)
	addq $8 * SIZE, A1
	addq $8 * SIZE, Y1
	ALIGN_3

/* .L37: single-column remainder for (M & 2): 2 complex elements. */
.L37:
	testq $2, M
	je .L38
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	movupd 0 * SIZE(Y1), %xmm4
	movupd 2 * SIZE(Y1), %xmm5
	mulpd %xmm0, %xmm8
	mulpd %xmm1, %xmm9
	mulpd %xmm0, %xmm10
	mulpd %xmm1, %xmm11
	addpd %xmm8, %xmm4
	addpd %xmm10, %xmm5
	ADD %xmm9, %xmm4
	ADD %xmm11, %xmm5
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	movlpd %xmm5, 2 * SIZE(Y1)
	movhpd %xmm5, 3 * SIZE(Y1)
	addq $4 * SIZE, A1
	addq $4 * SIZE, Y1
/* .L38: single-column remainder for (M & 1), then the copy-back of the
 * contiguous work buffer YY into the user's strided y (.L995-.L998), and
 * the epilogue (.L999): restore callee-saved registers from the stack
 * frame set up in the (unseen) prologue and return.  STACKSIZE,
 * WINDOWS_ABI, ALIGN_3 and EPILOGUE are defined earlier in the file. */
	ALIGN_3
.L38:
	testq $1, M
	je .L995
	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	movupd 0 * SIZE(Y1), %xmm4
	mulpd %xmm0, %xmm8
	mulpd %xmm1, %xmm9
	addpd %xmm8, %xmm4
	ADD %xmm9, %xmm4
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	jmp .L995
	ALIGN_3

/* .L995: if INCY is the contiguous complex stride (2*SIZE) the kernel
 * wrote y in place and we can return; otherwise add the YY work buffer
 * back into the strided y, 4 complex elements per iteration. */
.L995:
	cmpq $SIZE * 2, INCY
	je .L999
	movq Y, Y1
	movq M, %rax
	sarq $2, %rax
	jle .L997
	ALIGN_3
.L996:
	/* gather 4 strided y values */
	movupd 0 * SIZE(Y), %xmm4
	addq INCY, Y
	movupd 0 * SIZE(Y), %xmm5
	addq INCY, Y
	movupd 0 * SIZE(Y), %xmm6
	addq INCY, Y
	movupd 0 * SIZE(Y), %xmm7
	addq INCY, Y
	/* load 4 contiguous results from the work buffer */
	movupd 0 * SIZE(YY), %xmm0
	movupd 2 * SIZE(YY), %xmm1
	movupd 4 * SIZE(YY), %xmm2
	movupd 6 * SIZE(YY), %xmm3
	addpd %xmm4, %xmm0
	addpd %xmm5, %xmm1
	addpd %xmm6, %xmm2
	addpd %xmm7, %xmm3
	/* scatter the sums back to strided y */
	movlpd %xmm0, 0 * SIZE(Y1)
	movhpd %xmm0, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm1, 0 * SIZE(Y1)
	movhpd %xmm1, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm2, 0 * SIZE(Y1)
	movhpd %xmm2, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm3, 0 * SIZE(Y1)
	movhpd %xmm3, 1 * SIZE(Y1)
	addq INCY, Y1
	addq $8 * SIZE, YY
	decq %rax
	jg .L996
	ALIGN_3

/* .L997/.L998: copy-back remainder, one complex element at a time. */
.L997:
	movq M, %rax
	andq $3, %rax
	jle .L999
	ALIGN_3
.L998:
	movupd 0 * SIZE(YY), %xmm0
	movupd 0 * SIZE(Y), %xmm4
	addpd %xmm4, %xmm0
	movlpd %xmm0, 0 * SIZE(Y1)
	movhpd %xmm0, 1 * SIZE(Y1)
	addq $2 * SIZE, YY
	addq INCY, Y
	addq INCY, Y1
	decq %rax
	jg .L998
	ALIGN_3

/* .L999: restore callee-saved registers (Windows x64 additionally saves
 * rdi/rsi and xmm6-xmm15 per its ABI), pop the frame, return. */
.L999:
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif
	addq $STACKSIZE, %rsp
	ret
	ALIGN_3
	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -