zgemv_n_sse2.s
/* Continuation of the main two-column loop over M: each pass updates
   eight complex elements of Y from two columns of A, with loads for
   the next iteration interleaved into the current computation. */
#ifdef PENTIUM4
	PREFETCHW	PREFETCHSIZE * SIZE(Y1)
#endif

	addpd	%xmm8,  %xmm4
	MOVDDUP( 8 * SIZE, A1, %xmm8)
	mulpd	%xmm3,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP(10 * SIZE, A1, %xmm10)
	mulpd	%xmm3,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP(12 * SIZE, A1, %xmm12)
	mulpd	%xmm3,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP(14 * SIZE, A1, %xmm14)
	mulpd	%xmm3,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP( 9 * SIZE, A1, %xmm9)
	mulpd	%xmm0,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP(11 * SIZE, A1, %xmm11)
	mulpd	%xmm0,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP(13 * SIZE, A1, %xmm13)
	mulpd	%xmm0,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP(15 * SIZE, A1, %xmm15)
	mulpd	%xmm0,  %xmm14

	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	 8 * SIZE(Y1), %xmm4
	movapd	%xmm5,  2 * SIZE(Y1)
	movapd	10 * SIZE(Y1), %xmm5
	movapd	%xmm6,  4 * SIZE(Y1)
	movapd	12 * SIZE(Y1), %xmm6
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	14 * SIZE(Y1), %xmm7

#if defined(OPTERON) || defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(A1)
#endif
#ifdef PENTIUM4
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1, LDA, 1)
#endif

	addpd	%xmm8,  %xmm4
	MOVDDUP2( 8 * SIZE, (A1, LDA, 1), %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP2(10 * SIZE, (A1, LDA, 1), %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP2(12 * SIZE, (A1, LDA, 1), %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP2(14 * SIZE, (A1, LDA, 1), %xmm14)
	mulpd	%xmm1,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP2( 9 * SIZE, (A1, LDA, 1), %xmm9)
	mulpd	%xmm2,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP2(11 * SIZE, (A1, LDA, 1), %xmm11)
	mulpd	%xmm2,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP2(13 * SIZE, (A1, LDA, 1), %xmm13)
	mulpd	%xmm2,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP2(15 * SIZE, (A1, LDA, 1), %xmm15)
	mulpd	%xmm2,  %xmm14

	addpd	%xmm8,  %xmm4
	MOVDDUP(16 * SIZE, A1, %xmm8)
	mulpd	%xmm3,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP(18 * SIZE, A1, %xmm10)
	mulpd	%xmm3,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP(20 * SIZE, A1, %xmm12)
	mulpd	%xmm3,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP(22 * SIZE, A1, %xmm14)
	mulpd	%xmm3,  %xmm15

#if defined(OPTERON) || defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(A1, LDA, 1)
#endif

	ADD	%xmm9,  %xmm4
	MOVDDUP(17 * SIZE, A1, %xmm9)
	mulpd	%xmm0,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP(19 * SIZE, A1, %xmm11)
	mulpd	%xmm0,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP(21 * SIZE, A1, %xmm13)
	mulpd	%xmm0,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP(23 * SIZE, A1, %xmm15)
	mulpd	%xmm0,  %xmm14

	movapd	%xmm4,  8 * SIZE(Y1)
	movapd	16 * SIZE(Y1), %xmm4
	movapd	%xmm5, 10 * SIZE(Y1)
	movapd	18 * SIZE(Y1), %xmm5
	movapd	%xmm6, 12 * SIZE(Y1)
	movapd	20 * SIZE(Y1), %xmm6
	movapd	%xmm7, 14 * SIZE(Y1)
	movapd	22 * SIZE(Y1), %xmm7

	subq	$-16 * SIZE, A1
	subq	$-16 * SIZE, Y1

	decq	I
	jg	.L24
	ALIGN_3

/* Epilogue of the two-column loop: drain the values preloaded by the
   last iteration without loading past the end of the block. */
.L25:
	addpd	%xmm8,  %xmm4
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP2( 2 * SIZE, (A1, LDA, 1), %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP2( 4 * SIZE, (A1, LDA, 1), %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP2( 6 * SIZE, (A1, LDA, 1), %xmm14)
	mulpd	%xmm1,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm9)
	mulpd	%xmm2,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP2( 3 * SIZE, (A1, LDA, 1), %xmm11)
	mulpd	%xmm2,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP2( 5 * SIZE, (A1, LDA, 1), %xmm13)
	mulpd	%xmm2,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP2( 7 * SIZE, (A1, LDA, 1), %xmm15)
	mulpd	%xmm2,  %xmm14

	addpd	%xmm8,  %xmm4
	MOVDDUP( 8 * SIZE, A1, %xmm8)
	mulpd	%xmm3,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP(10 * SIZE, A1, %xmm10)
	mulpd	%xmm3,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP(12 * SIZE, A1, %xmm12)
	mulpd	%xmm3,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP(14 * SIZE, A1, %xmm14)
	mulpd	%xmm3,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP( 9 * SIZE, A1, %xmm9)
	mulpd	%xmm0,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP(11 * SIZE, A1, %xmm11)
	mulpd	%xmm0,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP(13 * SIZE, A1, %xmm13)
	mulpd	%xmm0,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP(15 * SIZE, A1, %xmm15)
	mulpd	%xmm0,  %xmm14

	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	 8 * SIZE(Y1), %xmm4
	movapd	%xmm5,  2 * SIZE(Y1)
	movapd	10 * SIZE(Y1), %xmm5
	movapd	%xmm6,  4 * SIZE(Y1)
	movapd	12 * SIZE(Y1), %xmm6
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	14 * SIZE(Y1), %xmm7

	addpd	%xmm8,  %xmm4
	MOVDDUP2( 8 * SIZE, (A1, LDA, 1), %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP2(10 * SIZE, (A1, LDA, 1), %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP2(12 * SIZE, (A1, LDA, 1), %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP2(14 * SIZE, (A1, LDA, 1), %xmm14)
	mulpd	%xmm1,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP2( 9 * SIZE, (A1, LDA, 1), %xmm9)
	mulpd	%xmm2,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP2(11 * SIZE, (A1, LDA, 1), %xmm11)
	mulpd	%xmm2,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP2(13 * SIZE, (A1, LDA, 1), %xmm13)
	mulpd	%xmm2,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP2(15 * SIZE, (A1, LDA, 1), %xmm15)
	mulpd	%xmm2,  %xmm14

	addpd	%xmm8,  %xmm4
	mulpd	%xmm3,  %xmm9
	addpd	%xmm10, %xmm5
	mulpd	%xmm3,  %xmm11
	addpd	%xmm12, %xmm6
	mulpd	%xmm3,  %xmm13
	addpd	%xmm14, %xmm7
	mulpd	%xmm3,  %xmm15

	ADD	%xmm9,  %xmm4
	ADD	%xmm11, %xmm5
	ADD	%xmm13, %xmm6
	ADD	%xmm15, %xmm7

	movapd	%xmm4,  8 * SIZE(Y1)
	movapd	%xmm5, 10 * SIZE(Y1)
	movapd	%xmm6, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)

	subq	$-16 * SIZE, A1
	subq	$-16 * SIZE, Y1
	ALIGN_3

/* Remainder: M & 4 -- four complex elements from both columns. */
.L26:
	testq	$4, M
	je	.L27

	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP( 4 * SIZE, A1, %xmm12)
	MOVDDUP( 5 * SIZE, A1, %xmm13)
	MOVDDUP( 6 * SIZE, A1, %xmm14)
	MOVDDUP( 7 * SIZE, A1, %xmm15)

	movapd	0 * SIZE(Y1), %xmm4
	movapd	2 * SIZE(Y1), %xmm5
	movapd	4 * SIZE(Y1), %xmm6
	movapd	6 * SIZE(Y1), %xmm7

	mulpd	%xmm0,  %xmm8
	mulpd	%xmm0,  %xmm10
	mulpd	%xmm0,  %xmm12
	mulpd	%xmm0,  %xmm14

	addpd	%xmm8,  %xmm4
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP2( 2 * SIZE, (A1, LDA, 1), %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP2( 4 * SIZE, (A1, LDA, 1), %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP2( 6 * SIZE, (A1, LDA, 1), %xmm14)
	mulpd	%xmm1,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm9)
	mulpd	%xmm2,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP2( 3 * SIZE, (A1, LDA, 1), %xmm11)
	mulpd	%xmm2,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP2( 5 * SIZE, (A1, LDA, 1), %xmm13)
	mulpd	%xmm2,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP2( 7 * SIZE, (A1, LDA, 1), %xmm15)
	mulpd	%xmm2,  %xmm14

	addpd	%xmm8,  %xmm4
	mulpd	%xmm3,  %xmm9
	addpd	%xmm10, %xmm5
	mulpd	%xmm3,  %xmm11
	addpd	%xmm12, %xmm6
	mulpd	%xmm3,  %xmm13
	addpd	%xmm14, %xmm7
	mulpd	%xmm3,  %xmm15

	ADD	%xmm9,  %xmm4
	ADD	%xmm11, %xmm5
	ADD	%xmm13, %xmm6
	ADD	%xmm15, %xmm7

	movapd	%xmm4, 0 * SIZE(Y1)
	movapd	%xmm5, 2 * SIZE(Y1)
	movapd	%xmm6, 4 * SIZE(Y1)
	movapd	%xmm7, 6 * SIZE(Y1)

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, Y1
	ALIGN_3

/* Remainder: M & 2. */
.L27:
	testq	$2, M
	je	.L28

	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm12)
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm13)
	MOVDDUP2( 2 * SIZE, (A1, LDA, 1), %xmm14)
	MOVDDUP2( 3 * SIZE, (A1, LDA, 1), %xmm15)

	movapd	0 * SIZE(Y1), %xmm4
	movapd	2 * SIZE(Y1), %xmm5

	mulpd	%xmm0,  %xmm8
	mulpd	%xmm1,  %xmm9
	mulpd	%xmm0,  %xmm10
	mulpd	%xmm1,  %xmm11

	mulpd	%xmm2,  %xmm12
	addpd	%xmm8,  %xmm4
	mulpd	%xmm3,  %xmm13
	addpd	%xmm10, %xmm5
	mulpd	%xmm2,  %xmm14
	ADD	%xmm9,  %xmm4
	mulpd	%xmm3,  %xmm15
	ADD	%xmm11, %xmm5

	addpd	%xmm12, %xmm4
	addpd	%xmm14, %xmm5
	ADD	%xmm13, %xmm4
	ADD	%xmm15, %xmm5

	movapd	%xmm4, 0 * SIZE(Y1)
	movapd	%xmm5, 2 * SIZE(Y1)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

/* Remainder: M & 1. */
.L28:
	testq	$1, M
	je	.L30

	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm12)
	MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm13)

	movapd	0 * SIZE(Y1), %xmm4

	mulpd	%xmm0,  %xmm8
	mulpd	%xmm1,  %xmm9
	mulpd	%xmm2,  %xmm12
	mulpd	%xmm3,  %xmm13

	addpd	%xmm8,  %xmm4
	ADD	%xmm9,  %xmm4
	addpd	%xmm12, %xmm4
	ADD	%xmm13, %xmm4

	movapd	%xmm4, 0 * SIZE(Y1)
	ALIGN_3

/* N odd: update Y with the final single column.  xmm0 holds alpha * x[j];
   xmm1 is its swapped copy with the sign mask applied for the cross terms. */
.L30:
	testq	$1, N
	je	.L995

	movq	YY, Y1
	movq	A,  A1

	MOVDDUP(0 * SIZE, X, %xmm0)
	MOVDDUP(1 * SIZE, X, %xmm1)
	addq	INCX, X

	movlpd	0 + ALPHA_R, %xmm13
	movhpd	8 + ALPHA_R, %xmm13
	movlpd	0 + ALPHA_I, %xmm14
	movhpd	8 + ALPHA_I, %xmm14
	movlpd	0 + COMP_MASK, %xmm15
	movhpd	8 + COMP_MASK, %xmm15

	mulpd	%xmm13, %xmm0
	mulpd	%xmm14, %xmm1

	ADDX	%xmm1, %xmm0
	movapd	%xmm0, %xmm1
	SHUFPD_1 %xmm1, %xmm1
	xorpd	%xmm15, %xmm1

	movq	M, I
	sarq	$3, I
	jle	.L36

	MOVDDUP( 0 * SIZE, A1, %xmm8)
	MOVDDUP( 2 * SIZE, A1, %xmm10)
	MOVDDUP( 4 * SIZE, A1, %xmm12)
	MOVDDUP( 6 * SIZE, A1, %xmm14)
	MOVDDUP( 1 * SIZE, A1, %xmm9)
	MOVDDUP( 3 * SIZE, A1, %xmm11)
	MOVDDUP( 5 * SIZE, A1, %xmm13)
	MOVDDUP( 7 * SIZE, A1, %xmm15)

	movapd	0 * SIZE(Y1), %xmm4
	movapd	2 * SIZE(Y1), %xmm5
	movapd	4 * SIZE(Y1), %xmm6
	movapd	6 * SIZE(Y1), %xmm7

	mulpd	%xmm0,  %xmm8
	mulpd	%xmm0,  %xmm10
	mulpd	%xmm0,  %xmm12
	mulpd	%xmm0,  %xmm14

	decq	I
	jle	.L35
	ALIGN_3

/* Single-column unrolled loop: eight complex elements of Y per iteration. */
.L34:
#if defined(OPTERON) || defined(CORE2) || defined(PENRYN)
	PREFETCH	PREFETCHSIZE * SIZE(A1)
#endif
#ifdef PENTIUM4
	PREFETCH	PREFETCHSIZE * SIZE(A1)
#endif

	addpd	%xmm8,  %xmm4
	MOVDDUP( 8 * SIZE, A1, %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP(10 * SIZE, A1, %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP(12 * SIZE, A1, %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP(14 * SIZE, A1, %xmm14)
	mulpd	%xmm1,  %xmm15

#ifdef OPTERON
	PREFETCHW	PREFETCHSIZE * SIZE(Y1)
#endif

	ADD	%xmm9,  %xmm4
	MOVDDUP( 9 * SIZE, A1, %xmm9)
	mulpd	%xmm0,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP(11 * SIZE, A1, %xmm11)
	mulpd	%xmm0,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP(13 * SIZE, A1, %xmm13)
	mulpd	%xmm0,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP(15 * SIZE, A1, %xmm15)
	mulpd	%xmm0,  %xmm14

	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	 8 * SIZE(Y1), %xmm4
	movapd	%xmm5,  2 * SIZE(Y1)
	movapd	10 * SIZE(Y1), %xmm5
	movapd	%xmm6,  4 * SIZE(Y1)
	movapd	12 * SIZE(Y1), %xmm6
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	14 * SIZE(Y1), %xmm7

#if defined(OPTERON) || defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(A1)
#endif
#ifdef PENTIUM4
	PREFETCHW	PREFETCHSIZE * SIZE(Y1)
#endif

	addpd	%xmm8,  %xmm4
	MOVDDUP(16 * SIZE, A1, %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP(18 * SIZE, A1, %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP(20 * SIZE, A1, %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP(22 * SIZE, A1, %xmm14)
	mulpd	%xmm1,  %xmm15

#ifdef OPTERON
	PREFETCHW	(PREFETCHSIZE + 8) * SIZE(Y1)
#endif

	ADD	%xmm9,  %xmm4
	MOVDDUP(17 * SIZE, A1, %xmm9)
	mulpd	%xmm0,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP(19 * SIZE, A1, %xmm11)
	mulpd	%xmm0,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP(21 * SIZE, A1, %xmm13)
	mulpd	%xmm0,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP(23 * SIZE, A1, %xmm15)
	mulpd	%xmm0,  %xmm14

	movapd	%xmm4,  8 * SIZE(Y1)
	movapd	16 * SIZE(Y1), %xmm4
	movapd	%xmm5, 10 * SIZE(Y1)
	movapd	18 * SIZE(Y1), %xmm5
	movapd	%xmm6, 12 * SIZE(Y1)
	movapd	20 * SIZE(Y1), %xmm6
	movapd	%xmm7, 14 * SIZE(Y1)
	movapd	22 * SIZE(Y1), %xmm7

	subq	$-16 * SIZE, A1
	subq	$-16 * SIZE, Y1

	decq	I
	jg	.L34
	ALIGN_3

/* Single-column loop epilogue. */
.L35:
	addpd	%xmm8,  %xmm4
	MOVDDUP( 8 * SIZE, A1, %xmm8)
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	MOVDDUP(10 * SIZE, A1, %xmm10)
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	MOVDDUP(12 * SIZE, A1, %xmm12)
	mulpd	%xmm1,  %xmm13
	addpd	%xmm14, %xmm7
	MOVDDUP(14 * SIZE, A1, %xmm14)
	mulpd	%xmm1,  %xmm15

	ADD	%xmm9,  %xmm4
	MOVDDUP( 9 * SIZE, A1, %xmm9)
	mulpd	%xmm0,  %xmm8
	ADD	%xmm11, %xmm5
	MOVDDUP(11 * SIZE, A1, %xmm11)
	mulpd	%xmm0,  %xmm10
	ADD	%xmm13, %xmm6
	MOVDDUP(13 * SIZE, A1, %xmm13)
	mulpd	%xmm0,  %xmm12
	ADD	%xmm15, %xmm7
	MOVDDUP(15 * SIZE, A1, %xmm15)
	mulpd	%xmm0,  %xmm14

	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	 8 * SIZE(Y1), %xmm4
	movapd	%xmm5,  2 * SIZE(Y1)
	movapd	10 * SIZE(Y1), %xmm5
	movapd	%xmm6,  4 * SIZE(Y1)
	movapd	12 * SIZE(Y1), %xmm6
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	14 * SIZE(Y1), %xmm7

	addpd	%xmm8,  %xmm4
	mulpd	%xmm1,  %xmm9
	addpd	%xmm10, %xmm5
	mulpd	%xmm1,  %xmm11
	addpd	%xmm12, %xmm6
	mulpd	%xmm1,  %xmm13
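
For reference, the operation this unrolled kernel implements is the complex GEMV update y := y + alpha * A * x over a column-major A. Below is a minimal scalar C sketch of that semantics. The function name zgemv_n_ref and its argument names are illustrative only (they are not part of this source), and it ignores the conjugation variants that the ADD/ADDX macros above presumably select between (addpd vs. subpd).

    #include <complex.h>
    #include <stddef.h>

    /* Scalar reference for the SSE2 kernel: y += alpha * A * x.
     * m, n, lda play the roles of the assembly's M, N, LDA. */
    static void
    zgemv_n_ref(size_t m, size_t n, double complex alpha,
                const double complex *a, size_t lda,
                const double complex *x, double complex *y)
    {
        for (size_t j = 0; j < n; j++) {
            /* alpha * x[j] is formed once per column, as the kernel
             * does when it sets up xmm0..xmm3 (two columns at a time). */
            double complex t = alpha * x[j];
            for (size_t i = 0; i < m; i++)
                y[i] += t * a[j * lda + i];
        }
    }

Each MOVDDUP pair in the assembly broadcasts the real and the imaginary part of one a[i] into separate registers, so the complex product t * a[i] is formed with two mulpd and one packed add/sub, eight elements per unrolled pass.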