📄 zgemv_n_sse2.s
字号:
/*
 * NOTE(review): this chunk is an interior fragment of a hand-scheduled
 * SSE2 GEMV kernel (AT&T/GAS syntax, run through the C preprocessor).
 * The function prologue, the macro definitions (SIZE, MOVDDUP, MOVDDUP2,
 * ADD, ADDX, SHUFPD_1, ALIGN_3, PREFETCH/PREFETCHW, PREFETCHSIZE,
 * ALPHA_R / ALPHA_I / COMP_MASK) and the register aliases
 * (A, A1, A2, LDA, X, INCX, Y1, YY, M, N, I, J) are defined earlier in
 * the file and are not visible here.  Only comments were added in this
 * review pass; every instruction token is unchanged from the original
 * (the scrape had collapsed the file onto a few lines; line structure
 * was restored).
 *
 * Register roles as used below -- inferred from this fragment alone,
 * TODO confirm against the full file:
 *   xmm0..xmm7  : per-column coefficients broadcast from the x vector
 *                 (scaled by alpha earlier; even/odd pairs per column)
 *   xmm8..xmm11 : y accumulators, two doubles each (Y1 is 16-aligned:
 *                 movapd is used on it throughout)
 *   xmm12..xmm15: matrix-element temporaries loaded via MOVDDUP/MOVDDUP2
 * Columns addressed: A1, A2, (A1,LDA,2), (A2,LDA,2) -- i.e. 4 columns
 * per pass of the J loop.
 */

/* ---- tail of the main unrolled loop (.L14); its head lies before this
 * chunk.  Advances all pointers by 16*SIZE and iterates on I. ---- */
        movapd  22 * SIZE(Y1), %xmm11
        subq    $-16 * SIZE, A1         /* add 16*SIZE (sub of negative) */
        subq    $-16 * SIZE, A2
        subq    $-16 * SIZE, Y1
        decq    I
        jg      .L14
        ALIGN_3

/* ---- .L15: drain the software pipeline -- the final unrolled iteration.
 * Same multiply/accumulate pattern as the loop body, but with no further
 * loads beyond offset 15*SIZE and no branch back. ---- */
.L15:
        /* offsets 0..7: coefficients xmm1..xmm7 against columns A1, A2,
         * (A1,LDA,2), (A2,LDA,2); products were started before the jump */
        addpd   %xmm12, %xmm8
        MOVDDUP( 1 * SIZE, A1, %xmm12)
        mulpd   %xmm1, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP( 3 * SIZE, A1, %xmm13)
        mulpd   %xmm1, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP( 5 * SIZE, A1, %xmm14)
        mulpd   %xmm1, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP( 7 * SIZE, A1, %xmm15)
        mulpd   %xmm1, %xmm15

        /* ADD is a macro (likely addpd or addsub for the conjugate case --
         * TODO confirm): fold odd-offset products, start column A2 */
        ADD     %xmm12, %xmm8
        MOVDDUP( 0 * SIZE, A2, %xmm12)
        mulpd   %xmm2, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP( 2 * SIZE, A2, %xmm13)
        mulpd   %xmm2, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP( 4 * SIZE, A2, %xmm14)
        mulpd   %xmm2, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP( 6 * SIZE, A2, %xmm15)
        mulpd   %xmm2, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP( 1 * SIZE, A2, %xmm12)
        mulpd   %xmm3, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP( 3 * SIZE, A2, %xmm13)
        mulpd   %xmm3, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP( 5 * SIZE, A2, %xmm14)
        mulpd   %xmm3, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP( 7 * SIZE, A2, %xmm15)
        mulpd   %xmm3, %xmm15

        /* columns 3 and 4 live at (A1 + 2*LDA) and (A2 + 2*LDA);
         * MOVDDUP2 takes the full addressing expression */
        ADD     %xmm12, %xmm8
        MOVDDUP2( 0 * SIZE, (A1, LDA, 2), %xmm12)
        mulpd   %xmm4, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP2( 2 * SIZE, (A1, LDA, 2), %xmm13)
        mulpd   %xmm4, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP2( 4 * SIZE, (A1, LDA, 2), %xmm14)
        mulpd   %xmm4, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP2( 6 * SIZE, (A1, LDA, 2), %xmm15)
        mulpd   %xmm4, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP2( 1 * SIZE, (A1, LDA, 2), %xmm12)
        mulpd   %xmm5, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP2( 3 * SIZE, (A1, LDA, 2), %xmm13)
        mulpd   %xmm5, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP2( 5 * SIZE, (A1, LDA, 2), %xmm14)
        mulpd   %xmm5, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP2( 7 * SIZE, (A1, LDA, 2), %xmm15)
        mulpd   %xmm5, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP2( 0 * SIZE, (A2, LDA, 2), %xmm12)
        mulpd   %xmm6, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP2( 2 * SIZE, (A2, LDA, 2), %xmm13)
        mulpd   %xmm6, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP2( 4 * SIZE, (A2, LDA, 2), %xmm14)
        mulpd   %xmm6, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP2( 6 * SIZE, (A2, LDA, 2), %xmm15)
        mulpd   %xmm6, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP2( 1 * SIZE, (A2, LDA, 2), %xmm12)
        mulpd   %xmm7, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP2( 3 * SIZE, (A2, LDA, 2), %xmm13)
        mulpd   %xmm7, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP2( 5 * SIZE, (A2, LDA, 2), %xmm14)
        mulpd   %xmm7, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP2( 7 * SIZE, (A2, LDA, 2), %xmm15)
        mulpd   %xmm7, %xmm15

        /* start the second half (offsets 8..15) with coefficient xmm0 */
        ADD     %xmm12, %xmm8
        MOVDDUP( 8 * SIZE, A1, %xmm12)
        mulpd   %xmm0, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP(10 * SIZE, A1, %xmm13)
        mulpd   %xmm0, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP(12 * SIZE, A1, %xmm14)
        mulpd   %xmm0, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP(14 * SIZE, A1, %xmm15)
        mulpd   %xmm0, %xmm15

        /* store completed y[0..7], reload accumulators for y[8..15]
         * (interleaved store/load to hide latency) */
        movapd  %xmm8, 0 * SIZE(Y1)
        movapd  8 * SIZE(Y1), %xmm8
        movapd  %xmm9, 2 * SIZE(Y1)
        movapd  10 * SIZE(Y1), %xmm9
        movapd  %xmm10, 4 * SIZE(Y1)
        movapd  12 * SIZE(Y1), %xmm10
        movapd  %xmm11, 6 * SIZE(Y1)
        movapd  14 * SIZE(Y1), %xmm11

        /* second half: identical 4-column pattern at offsets 8..15 */
        addpd   %xmm12, %xmm8
        MOVDDUP( 9 * SIZE, A1, %xmm12)
        mulpd   %xmm1, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP(11 * SIZE, A1, %xmm13)
        mulpd   %xmm1, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP(13 * SIZE, A1, %xmm14)
        mulpd   %xmm1, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP(15 * SIZE, A1, %xmm15)
        mulpd   %xmm1, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP( 8 * SIZE, A2, %xmm12)
        mulpd   %xmm2, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP(10 * SIZE, A2, %xmm13)
        mulpd   %xmm2, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP(12 * SIZE, A2, %xmm14)
        mulpd   %xmm2, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP(14 * SIZE, A2, %xmm15)
        mulpd   %xmm2, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP( 9 * SIZE, A2, %xmm12)
        mulpd   %xmm3, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP(11 * SIZE, A2, %xmm13)
        mulpd   %xmm3, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP(13 * SIZE, A2, %xmm14)
        mulpd   %xmm3, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP(15 * SIZE, A2, %xmm15)
        mulpd   %xmm3, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP2( 8 * SIZE, (A1, LDA, 2), %xmm12)
        mulpd   %xmm4, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP2(10 * SIZE, (A1, LDA, 2), %xmm13)
        mulpd   %xmm4, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP2(12 * SIZE, (A1, LDA, 2), %xmm14)
        mulpd   %xmm4, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP2(14 * SIZE, (A1, LDA, 2), %xmm15)
        mulpd   %xmm4, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP2( 9 * SIZE, (A1, LDA, 2), %xmm12)
        mulpd   %xmm5, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP2(11 * SIZE, (A1, LDA, 2), %xmm13)
        mulpd   %xmm5, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP2(13 * SIZE, (A1, LDA, 2), %xmm14)
        mulpd   %xmm5, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP2(15 * SIZE, (A1, LDA, 2), %xmm15)
        mulpd   %xmm5, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP2( 8 * SIZE, (A2, LDA, 2), %xmm12)
        mulpd   %xmm6, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP2(10 * SIZE, (A2, LDA, 2), %xmm13)
        mulpd   %xmm6, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP2(12 * SIZE, (A2, LDA, 2), %xmm14)
        mulpd   %xmm6, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP2(14 * SIZE, (A2, LDA, 2), %xmm15)
        mulpd   %xmm6, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP2( 9 * SIZE, (A2, LDA, 2), %xmm12)
        mulpd   %xmm7, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP2(11 * SIZE, (A2, LDA, 2), %xmm13)
        mulpd   %xmm7, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP2(13 * SIZE, (A2, LDA, 2), %xmm14)
        mulpd   %xmm7, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP2(15 * SIZE, (A2, LDA, 2), %xmm15)
        mulpd   %xmm7, %xmm15

        /* pipeline fully drained: fold the last products and store y[8..15] */
        ADD     %xmm12, %xmm8
        ADD     %xmm13, %xmm9
        ADD     %xmm14, %xmm10
        ADD     %xmm15, %xmm11

        movapd  %xmm8, 8 * SIZE(Y1)
        movapd  %xmm9, 10 * SIZE(Y1)
        movapd  %xmm10, 12 * SIZE(Y1)
        movapd  %xmm11, 14 * SIZE(Y1)

        subq    $-16 * SIZE, A1
        subq    $-16 * SIZE, A2
        subq    $-16 * SIZE, Y1
        ALIGN_3

/* ---- .L16: remainder pass -- handle 4 leftover rows if bit 2 of M set ---- */
.L16:
        testq   $4, M
        je      .L17

        MOVDDUP( 0 * SIZE, A1, %xmm12)
        MOVDDUP( 2 * SIZE, A1, %xmm13)
        MOVDDUP( 4 * SIZE, A1, %xmm14)
        MOVDDUP( 6 * SIZE, A1, %xmm15)

        movapd  0 * SIZE(Y1), %xmm8
        movapd  2 * SIZE(Y1), %xmm9
        movapd  4 * SIZE(Y1), %xmm10
        movapd  6 * SIZE(Y1), %xmm11

        mulpd   %xmm0, %xmm12
        mulpd   %xmm0, %xmm13
        mulpd   %xmm0, %xmm14
        mulpd   %xmm0, %xmm15

        /* same 4-column accumulate pattern as the main loop, one pass only */
        addpd   %xmm12, %xmm8
        MOVDDUP( 1 * SIZE, A1, %xmm12)
        mulpd   %xmm1, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP( 3 * SIZE, A1, %xmm13)
        mulpd   %xmm1, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP( 5 * SIZE, A1, %xmm14)
        mulpd   %xmm1, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP( 7 * SIZE, A1, %xmm15)
        mulpd   %xmm1, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP( 0 * SIZE, A2, %xmm12)
        mulpd   %xmm2, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP( 2 * SIZE, A2, %xmm13)
        mulpd   %xmm2, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP( 4 * SIZE, A2, %xmm14)
        mulpd   %xmm2, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP( 6 * SIZE, A2, %xmm15)
        mulpd   %xmm2, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP( 1 * SIZE, A2, %xmm12)
        mulpd   %xmm3, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP( 3 * SIZE, A2, %xmm13)
        mulpd   %xmm3, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP( 5 * SIZE, A2, %xmm14)
        mulpd   %xmm3, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP( 7 * SIZE, A2, %xmm15)
        mulpd   %xmm3, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP2( 0 * SIZE, (A1, LDA, 2), %xmm12)
        mulpd   %xmm4, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP2( 2 * SIZE, (A1, LDA, 2), %xmm13)
        mulpd   %xmm4, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP2( 4 * SIZE, (A1, LDA, 2), %xmm14)
        mulpd   %xmm4, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP2( 6 * SIZE, (A1, LDA, 2), %xmm15)
        mulpd   %xmm4, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP2( 1 * SIZE, (A1, LDA, 2), %xmm12)
        mulpd   %xmm5, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP2( 3 * SIZE, (A1, LDA, 2), %xmm13)
        mulpd   %xmm5, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP2( 5 * SIZE, (A1, LDA, 2), %xmm14)
        mulpd   %xmm5, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP2( 7 * SIZE, (A1, LDA, 2), %xmm15)
        mulpd   %xmm5, %xmm15

        ADD     %xmm12, %xmm8
        MOVDDUP2( 0 * SIZE, (A2, LDA, 2), %xmm12)
        mulpd   %xmm6, %xmm12
        ADD     %xmm13, %xmm9
        MOVDDUP2( 2 * SIZE, (A2, LDA, 2), %xmm13)
        mulpd   %xmm6, %xmm13
        ADD     %xmm14, %xmm10
        MOVDDUP2( 4 * SIZE, (A2, LDA, 2), %xmm14)
        mulpd   %xmm6, %xmm14
        ADD     %xmm15, %xmm11
        MOVDDUP2( 6 * SIZE, (A2, LDA, 2), %xmm15)
        mulpd   %xmm6, %xmm15

        addpd   %xmm12, %xmm8
        MOVDDUP2( 1 * SIZE, (A2, LDA, 2), %xmm12)
        mulpd   %xmm7, %xmm12
        addpd   %xmm13, %xmm9
        MOVDDUP2( 3 * SIZE, (A2, LDA, 2), %xmm13)
        mulpd   %xmm7, %xmm13
        addpd   %xmm14, %xmm10
        MOVDDUP2( 5 * SIZE, (A2, LDA, 2), %xmm14)
        mulpd   %xmm7, %xmm14
        addpd   %xmm15, %xmm11
        MOVDDUP2( 7 * SIZE, (A2, LDA, 2), %xmm15)
        mulpd   %xmm7, %xmm15

        ADD     %xmm12, %xmm8
        ADD     %xmm13, %xmm9
        ADD     %xmm14, %xmm10
        ADD     %xmm15, %xmm11

        movapd  %xmm8, 0 * SIZE(Y1)
        movapd  %xmm9, 2 * SIZE(Y1)
        movapd  %xmm10, 4 * SIZE(Y1)
        movapd  %xmm11, 6 * SIZE(Y1)

        addq    $8 * SIZE, A1
        addq    $8 * SIZE, A2
        addq    $8 * SIZE, Y1
        ALIGN_3

/* ---- .L17: handle 2 leftover rows if bit 1 of M set ---- */
.L17:
        testq   $2, M
        je      .L18

        movapd  0 * SIZE(Y1), %xmm8
        movapd  2 * SIZE(Y1), %xmm9

        MOVDDUP( 0 * SIZE, A1, %xmm12)
        MOVDDUP( 1 * SIZE, A1, %xmm13)
        MOVDDUP( 2 * SIZE, A1, %xmm14)
        MOVDDUP( 3 * SIZE, A1, %xmm15)

        mulpd   %xmm0, %xmm12
        mulpd   %xmm1, %xmm13
        mulpd   %xmm0, %xmm14
        mulpd   %xmm1, %xmm15

        addpd   %xmm12, %xmm8
        ADD     %xmm13, %xmm8
        addpd   %xmm14, %xmm9
        ADD     %xmm15, %xmm9

        MOVDDUP( 0 * SIZE, A2, %xmm12)
        MOVDDUP( 1 * SIZE, A2, %xmm13)
        MOVDDUP( 2 * SIZE, A2, %xmm14)
        MOVDDUP( 3 * SIZE, A2, %xmm15)

        mulpd   %xmm2, %xmm12
        mulpd   %xmm3, %xmm13
        mulpd   %xmm2, %xmm14
        mulpd   %xmm3, %xmm15

        addpd   %xmm12, %xmm8
        ADD     %xmm13, %xmm8
        addpd   %xmm14, %xmm9
        ADD     %xmm15, %xmm9

        MOVDDUP2( 0 * SIZE, (A1, LDA, 2), %xmm12)
        MOVDDUP2( 1 * SIZE, (A1, LDA, 2), %xmm13)
        MOVDDUP2( 2 * SIZE, (A1, LDA, 2), %xmm14)
        MOVDDUP2( 3 * SIZE, (A1, LDA, 2), %xmm15)

        mulpd   %xmm4, %xmm12
        mulpd   %xmm5, %xmm13
        mulpd   %xmm4, %xmm14
        mulpd   %xmm5, %xmm15

        addpd   %xmm12, %xmm8
        ADD     %xmm13, %xmm8
        addpd   %xmm14, %xmm9
        ADD     %xmm15, %xmm9

        MOVDDUP2( 0 * SIZE, (A2, LDA, 2), %xmm12)
        MOVDDUP2( 1 * SIZE, (A2, LDA, 2), %xmm13)
        MOVDDUP2( 2 * SIZE, (A2, LDA, 2), %xmm14)
        MOVDDUP2( 3 * SIZE, (A2, LDA, 2), %xmm15)

        mulpd   %xmm6, %xmm12
        mulpd   %xmm7, %xmm13
        mulpd   %xmm6, %xmm14
        mulpd   %xmm7, %xmm15

        addpd   %xmm12, %xmm8
        ADD     %xmm13, %xmm8
        addpd   %xmm14, %xmm9
        ADD     %xmm15, %xmm9

        movapd  %xmm8, 0 * SIZE(Y1)
        movapd  %xmm9, 2 * SIZE(Y1)

        addq    $4 * SIZE, A1
        addq    $4 * SIZE, A2
        addq    $4 * SIZE, Y1
        ALIGN_3

/* ---- .L18: handle the last leftover row if bit 0 of M set ---- */
.L18:
        testq   $1, M
        je      .L19

        movapd  0 * SIZE(Y1), %xmm8

        MOVDDUP( 0 * SIZE, A1, %xmm12)
        MOVDDUP( 1 * SIZE, A1, %xmm13)
        MOVDDUP( 0 * SIZE, A2, %xmm14)
        MOVDDUP( 1 * SIZE, A2, %xmm15)

        mulpd   %xmm0, %xmm12
        mulpd   %xmm1, %xmm13
        mulpd   %xmm2, %xmm14
        mulpd   %xmm3, %xmm15

        addpd   %xmm12, %xmm8
        ADD     %xmm13, %xmm8
        addpd   %xmm14, %xmm8
        ADD     %xmm15, %xmm8

        MOVDDUP2( 0 * SIZE, (A1, LDA, 2), %xmm12)
        MOVDDUP2( 1 * SIZE, (A1, LDA, 2), %xmm13)
        MOVDDUP2( 0 * SIZE, (A2, LDA, 2), %xmm14)
        MOVDDUP2( 1 * SIZE, (A2, LDA, 2), %xmm15)

        mulpd   %xmm4, %xmm12
        mulpd   %xmm5, %xmm13
        mulpd   %xmm6, %xmm14
        mulpd   %xmm7, %xmm15

        addpd   %xmm12, %xmm8
        ADD     %xmm13, %xmm8
        addpd   %xmm14, %xmm8
        ADD     %xmm15, %xmm8

        movapd  %xmm8, 0 * SIZE(Y1)
        ALIGN_3

/* ---- .L19: next group of 4 columns (outer J loop back-edge) ---- */
.L19:
        decq    J
        jg      .L11
        ALIGN_3

/* ---- .L20: N-tail -- process 2 remaining columns if bit 1 of N set.
 * Rebuilds the scaled x coefficients: loads x[j], scales by ALPHA_R /
 * ALPHA_I, forms the swapped+sign-flipped partner via SHUFPD_1 and an
 * xor with COMP_MASK (presumably the complex-arith sign mask -- TODO
 * confirm against the macro definitions earlier in the file). ---- */
.L20:
        testq   $2, N
        je      .L30

        movq    YY, Y1
        movq    A, A1
        leaq    (A, LDA, 2), A          /* advance A past the two columns */

        MOVDDUP(0 * SIZE, X, %xmm0)
        MOVDDUP(1 * SIZE, X, %xmm1)
        addq    INCX, X
        MOVDDUP(0 * SIZE, X, %xmm2)
        MOVDDUP(1 * SIZE, X, %xmm3)
        addq    INCX, X

        movlpd  0 + ALPHA_R, %xmm13
        movhpd  8 + ALPHA_R, %xmm13
        movlpd  0 + ALPHA_I, %xmm14
        movhpd  8 + ALPHA_I, %xmm14
        movlpd  0 + COMP_MASK, %xmm15
        movhpd  8 + COMP_MASK, %xmm15

        mulpd   %xmm13, %xmm0
        mulpd   %xmm14, %xmm1
        mulpd   %xmm13, %xmm2
        mulpd   %xmm14, %xmm3

        ADDX    %xmm1, %xmm0
        ADDX    %xmm3, %xmm2

        /* xmm1/xmm3 = element-swapped, sign-masked copies of xmm0/xmm2 */
        movapd  %xmm0, %xmm1
        movapd  %xmm2, %xmm3
        SHUFPD_1 %xmm1, %xmm1
        SHUFPD_1 %xmm3, %xmm3
        xorpd   %xmm15, %xmm1
        xorpd   %xmm15, %xmm3

        movq    M, I
        sarq    $3, I                   /* 8 rows per iteration of .L24 */
        jle     .L26

        /* prime the software pipeline for the first .L24 iteration */
        MOVDDUP( 0 * SIZE, A1, %xmm8)
        MOVDDUP( 2 * SIZE, A1, %xmm10)
        MOVDDUP( 4 * SIZE, A1, %xmm12)
        MOVDDUP( 6 * SIZE, A1, %xmm14)
        MOVDDUP( 1 * SIZE, A1, %xmm9)
        MOVDDUP( 3 * SIZE, A1, %xmm11)
        MOVDDUP( 5 * SIZE, A1, %xmm13)
        MOVDDUP( 7 * SIZE, A1, %xmm15)

        movapd  0 * SIZE(Y1), %xmm4
        movapd  2 * SIZE(Y1), %xmm5
        movapd  4 * SIZE(Y1), %xmm6
        movapd  6 * SIZE(Y1), %xmm7

        mulpd   %xmm0, %xmm8
        mulpd   %xmm0, %xmm10
        mulpd   %xmm0, %xmm12
        mulpd   %xmm0, %xmm14

        decq    I
        jle     .L25
        ALIGN_3

/* ---- .L24: two-column main loop (truncated at the end of this chunk;
 * the remainder of the loop body and the jump back to .L24 continue in
 * the next chunk of the file) ---- */
.L24:
#if defined(OPTERON) || defined(CORE2) || defined(PENRYN)
        PREFETCH PREFETCHSIZE * SIZE(A1)
#endif
#ifdef PENTIUM4
        PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

        addpd   %xmm8, %xmm4
        MOVDDUP2( 0 * SIZE, (A1, LDA, 1), %xmm8)
        mulpd   %xmm1, %xmm9
        addpd   %xmm10, %xmm5
        MOVDDUP2( 2 * SIZE, (A1, LDA, 1), %xmm10)
        mulpd   %xmm1, %xmm11
        addpd   %xmm12, %xmm6
        MOVDDUP2( 4 * SIZE, (A1, LDA, 1), %xmm12)
        mulpd   %xmm1, %xmm13
        addpd   %xmm14, %xmm7
        MOVDDUP2( 6 * SIZE, (A1, LDA, 1), %xmm14)
        mulpd   %xmm1, %xmm15

#ifdef OPTERON
        PREFETCHW PREFETCHSIZE * SIZE(Y1)
#endif

        ADD     %xmm9, %xmm4
        MOVDDUP2( 1 * SIZE, (A1, LDA, 1), %xmm9)
        mulpd   %xmm2, %xmm8
        ADD     %xmm11, %xmm5
        MOVDDUP2( 3 * SIZE, (A1, LDA, 1), %xmm11)
        mulpd   %xmm2, %xmm10
        ADD     %xmm13, %xmm6
        MOVDDUP2( 5 * SIZE, (A1, LDA, 1), %xmm13)
        mulpd   %xmm2, %xmm12
        ADD     %xmm15, %xmm7
        MOVDDUP2( 7 * SIZE, (A1, LDA, 1), %xmm15)
        mulpd   %xmm2, %xmm14

#if defined(OPTERON) || defined(CORE2) || defined(PENRYN)
        PREFETCH PREFETCHSIZE * SIZE(A1, LDA, 1)
#endif
/* (chunk ends here; the .L24 loop body continues past this view) */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -