📄 gemv_n_sse2_core2.s
字号:
ALIGN_3.L43: addpd %xmm8, %xmm4 movapd -16 * SIZE(A1, LDA, 2), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd -14 * SIZE(A1, LDA, 2), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd -12 * SIZE(A1, LDA, 2), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd -10 * SIZE(A1, LDA, 2), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd -16 * SIZE(A2, LDA, 2), %xmm12 movhpd -15 * SIZE(A2, LDA, 2), %xmm12 mulpd %xmm2, %xmm8 addpd %xmm13, %xmm5 movsd -14 * SIZE(A2, LDA, 2), %xmm13 movhpd -13 * SIZE(A2, LDA, 2), %xmm13 mulpd %xmm2, %xmm9 addpd %xmm14, %xmm6 movsd -12 * SIZE(A2, LDA, 2), %xmm14 movhpd -11 * SIZE(A2, LDA, 2), %xmm14 mulpd %xmm2, %xmm10 addpd %xmm15, %xmm7 movsd -10 * SIZE(A2, LDA, 2), %xmm15 movhpd -9 * SIZE(A2, LDA, 2), %xmm15 mulpd %xmm2, %xmm11 addpd %xmm8, %xmm4 movapd -8 * SIZE(A1), %xmm8 mulpd %xmm3, %xmm12 addpd %xmm9, %xmm5 movapd -6 * SIZE(A1), %xmm9 mulpd %xmm3, %xmm13 addpd %xmm10, %xmm6 movapd -4 * SIZE(A1), %xmm10 mulpd %xmm3, %xmm14 addpd %xmm11, %xmm7 movapd -2 * SIZE(A1), %xmm11 mulpd %xmm3, %xmm15 addpd %xmm12, %xmm4 movsd -8 * SIZE(A2), %xmm12 movhpd -7 * SIZE(A2), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movsd -6 * SIZE(A2), %xmm13 movhpd -5 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd -4 * SIZE(A2), %xmm14 movhpd -3 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd -2 * SIZE(A2), %xmm15 movhpd -1 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 addpd %xmm8, %xmm4 movapd -8 * SIZE(A1, LDA, 2), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd -6 * SIZE(A1, LDA, 2), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd -4 * SIZE(A1, LDA, 2), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd -2 * SIZE(A1, LDA, 2), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd -8 * SIZE(A2, LDA, 2), %xmm12 movhpd -7 * SIZE(A2, LDA, 2), %xmm12 mulpd %xmm2, %xmm8 addpd %xmm13, %xmm5 movsd -6 * SIZE(A2, LDA, 2), %xmm13 movhpd -5 * SIZE(A2, LDA, 2), %xmm13 mulpd %xmm2, %xmm9 addpd %xmm14, %xmm6 movsd -4 * SIZE(A2, LDA, 2), %xmm14 movhpd -3 * SIZE(A2, LDA, 2), %xmm14 mulpd %xmm2, %xmm10 addpd %xmm15, %xmm7 movsd -2 * SIZE(A2, LDA, 2), %xmm15 movhpd -1 * SIZE(A2, LDA, 2), %xmm15 mulpd %xmm2, %xmm11 addpd %xmm8, %xmm4 movapd 0 * SIZE(A1), %xmm8 mulpd %xmm3, %xmm12 addpd %xmm9, %xmm5 movapd 2 * SIZE(A1), %xmm9 mulpd %xmm3, %xmm13 addpd %xmm10, %xmm6 movapd 4 * SIZE(A1), %xmm10 mulpd %xmm3, %xmm14 addpd %xmm11, %xmm7 movapd 6 * SIZE(A1), %xmm11 mulpd %xmm3, %xmm15 addpd %xmm12, %xmm4 movsd 0 * SIZE(A2), %xmm12 movhpd 1 * SIZE(A2), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movsd 2 * SIZE(A2), %xmm13 movhpd 3 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd 4 * SIZE(A2), %xmm14 movhpd 5 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd 6 * SIZE(A2), %xmm15 movhpd 7 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 8 * SIZE(Y1) movapd 16 * SIZE(Y1), %xmm4 movapd %xmm5, 10 * SIZE(Y1) movapd 18 * SIZE(Y1), %xmm5 movapd %xmm6, 12 * SIZE(Y1) movapd 20 * SIZE(Y1), %xmm6 movapd %xmm7, 14 * SIZE(Y1) movapd 22 * SIZE(Y1), %xmm7 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 decq I jg .L43 ALIGN_3.L44: addpd %xmm8, %xmm4 movapd -16 * SIZE(A1, LDA, 2), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd -14 * SIZE(A1, LDA, 2), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd -12 * SIZE(A1, LDA, 2), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd -10 * SIZE(A1, LDA, 2), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd -16 * SIZE(A2, LDA, 2), %xmm12 movhpd -15 * SIZE(A2, LDA, 2), %xmm12 mulpd %xmm2, %xmm8 addpd %xmm13, %xmm5 movsd -14 * SIZE(A2, LDA, 2), %xmm13 movhpd -13 * SIZE(A2, LDA, 2), %xmm13 mulpd %xmm2, %xmm9 addpd %xmm14, %xmm6 movsd -12 * SIZE(A2, LDA, 2), %xmm14 movhpd -11 * SIZE(A2, LDA, 2), %xmm14 mulpd %xmm2, %xmm10 addpd %xmm15, %xmm7 movsd -10 * SIZE(A2, LDA, 2), %xmm15 movhpd -9 * SIZE(A2, LDA, 2), %xmm15 mulpd %xmm2, %xmm11 addpd %xmm8, %xmm4 movapd -8 * SIZE(A1), %xmm8 mulpd %xmm3, %xmm12 addpd %xmm9, %xmm5 movapd -6 * SIZE(A1), %xmm9 mulpd %xmm3, %xmm13 addpd %xmm10, %xmm6 movapd -4 * SIZE(A1), %xmm10 mulpd %xmm3, %xmm14 addpd %xmm11, %xmm7 movapd -2 * SIZE(A1), %xmm11 mulpd %xmm3, %xmm15 addpd %xmm12, %xmm4 movsd -8 * SIZE(A2), %xmm12 movhpd -7 * SIZE(A2), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movsd -6 * SIZE(A2), %xmm13 movhpd -5 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd -4 * SIZE(A2), %xmm14 movhpd -3 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd -2 * SIZE(A2), %xmm15 movhpd -1 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 addpd %xmm8, %xmm4 movapd -8 * SIZE(A1, LDA, 2), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd -6 * SIZE(A1, LDA, 2), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd -4 * SIZE(A1, LDA, 2), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd -2 * SIZE(A1, LDA, 2), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd -8 * SIZE(A2, LDA, 2), %xmm12 movhpd -7 * SIZE(A2, LDA, 2), %xmm12 mulpd %xmm2, %xmm8 addpd %xmm13, %xmm5 movsd -6 * SIZE(A2, LDA, 2), %xmm13 movhpd -5 * SIZE(A2, LDA, 2), %xmm13 mulpd %xmm2, %xmm9 addpd %xmm14, %xmm6 movsd -4 * SIZE(A2, LDA, 2), %xmm14 movhpd -3 * SIZE(A2, LDA, 2), %xmm14 mulpd %xmm2, %xmm10 addpd %xmm15, %xmm7 movsd -2 * SIZE(A2, LDA, 2), %xmm15 movhpd -1 * SIZE(A2, LDA, 2), %xmm15 mulpd %xmm2, %xmm11 addpd %xmm8, %xmm4 mulpd %xmm3, %xmm12 addpd %xmm9, %xmm5 mulpd %xmm3, %xmm13 addpd %xmm10, %xmm6 mulpd %xmm3, %xmm14 addpd %xmm11, %xmm7 mulpd %xmm3, %xmm15 addpd %xmm12, %xmm4 addpd %xmm13, %xmm5 addpd %xmm14, %xmm6 addpd %xmm15, %xmm7 movapd %xmm4, 8 * SIZE(Y1) movapd %xmm5, 10 * SIZE(Y1) movapd %xmm6, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3.L45: testq $8, MM je .L46 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd 4 * SIZE(Y1), %xmm6 movapd 6 * SIZE(Y1), %xmm7 movapd -16 * SIZE(A1), %xmm8 movapd -14 * SIZE(A1), %xmm9 movapd -12 * SIZE(A1), %xmm10 movapd -10 * SIZE(A1), %xmm11 movsd -16 * SIZE(A2), %xmm12 movhpd -15 * SIZE(A2), %xmm12 movsd -14 * SIZE(A2), %xmm13 movhpd -13 * SIZE(A2), %xmm13 movsd -12 * SIZE(A2), %xmm14 movhpd -11 * SIZE(A2), %xmm14 movsd -10 * SIZE(A2), %xmm15 movhpd -9 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 addpd %xmm8, %xmm4 movapd -16 * SIZE(A1, LDA, 2), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd -14 * SIZE(A1, LDA, 2), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd -12 * SIZE(A1, LDA, 2), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd -10 * SIZE(A1, LDA, 2), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd -16 * SIZE(A2, LDA, 2), %xmm12 movhpd -15 * SIZE(A2, LDA, 2), %xmm12 mulpd %xmm2, %xmm8 addpd %xmm13, %xmm5 movsd -14 * SIZE(A2, LDA, 2), %xmm13 movhpd -13 * SIZE(A2, LDA, 2), %xmm13 mulpd %xmm2, %xmm9 addpd %xmm14, %xmm6 movsd -12 * SIZE(A2, LDA, 2), %xmm14 movhpd -11 * SIZE(A2, LDA, 2), %xmm14 mulpd %xmm2, %xmm10 addpd %xmm15, %xmm7 movsd -10 * SIZE(A2, LDA, 2), %xmm15 movhpd -9 * SIZE(A2, LDA, 2), %xmm15 mulpd %xmm2, %xmm11 addpd %xmm8, %xmm4 mulpd %xmm3, %xmm12 addpd %xmm9, %xmm5 mulpd %xmm3, %xmm13 addpd %xmm10, %xmm6 mulpd %xmm3, %xmm14 addpd %xmm11, %xmm7 mulpd %xmm3, %xmm15 addpd %xmm12, %xmm4 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 mulpd %xmm0, %xmm11 movapd %xmm4, 0 * SIZE(Y1) movapd %xmm5, 2 * SIZE(Y1) movapd %xmm6, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3.L46: testq $4, MM je .L47 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd -16 * SIZE(A1), %xmm8 movapd -14 * SIZE(A1), %xmm9 movsd -16 * SIZE(A2), %xmm10 movhpd -15 * SIZE(A2), %xmm10 movsd -14 * SIZE(A2), %xmm11 movhpd -13 * SIZE(A2), %xmm11 movapd -16 * SIZE(A1, LDA, 2), %xmm12 movapd -14 * SIZE(A1, LDA, 2), %xmm13 movsd -16 * SIZE(A2, LDA, 2), %xmm14 movhpd -15 * SIZE(A2, LDA, 2), %xmm14 movsd -14 * SIZE(A2, LDA, 2), %xmm15 movhpd -13 * SIZE(A2, LDA, 2), %xmm15 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm10 mulpd %xmm1, %xmm11 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm13 mulpd %xmm3, %xmm14 mulpd %xmm3, %xmm15 addpd %xmm8, %xmm4 addpd %xmm9, %xmm5 addpd %xmm10, %xmm4 addpd %xmm11, %xmm5 addpd %xmm12, %xmm4 addpd %xmm13, %xmm5 addpd %xmm14, %xmm4 addpd %xmm15, %xmm5 movapd %xmm4, 0 * SIZE(Y1) movapd %xmm5, 2 * SIZE(Y1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3.L47: testq $2, MM je .L48 movapd 0 * SIZE(Y1), %xmm4 movapd -16 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm10 movhpd -15 * SIZE(A2), %xmm10 movapd -16 * SIZE(A1, LDA, 2), %xmm12 movsd -16 * SIZE(A2, LDA, 2), %xmm14 movhpd -15 * SIZE(A2, LDA, 2), %xmm14 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm10 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm14 addpd %xmm8, %xmm4 addpd %xmm10, %xmm4 addpd %xmm12, %xmm4 addpd %xmm14, %xmm4 movapd %xmm4, 0 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3.L48: testq $1, MM je .L49 movsd 0 * SIZE(Y1), %xmm4 movsd -16 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm9 movsd -16 * SIZE(A1, LDA, 2), %xmm10 movsd -16 * SIZE(A2, LDA, 2), %xmm11 mulsd %xmm0, %xmm8 mulsd %xmm1, %xmm9 mulsd %xmm2, %xmm10 mulsd %xmm3, %xmm11 addsd %xmm8, %xmm4 addsd %xmm9, %xmm4 addsd %xmm10, %xmm4 addsd %xmm11, %xmm4 movsd %xmm4, 0 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3.L49: decq J jg .L41 ALIGN_3.L50: testq $2, N je .L60 movq BUFFER, Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X mulsd STACK_ALPHA, %xmm0 mulsd STACK_ALPHA, %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 testq $SIZE, A je .L52 movsd (Y1), %xmm4 movsd -16 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm9 mulsd %xmm0, %xmm8 mulsd %xmm1, %xmm9 addsd %xmm8, %xmm4 addsd %xmm9, %xmm4 movsd %xmm4, 0 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3.L52: movq MM, I sarq $4, I jle .L55 movapd -16 * SIZE(A1), %xmm8 movapd -14 * SIZE(A1), %xmm9 movapd -12 * SIZE(A1), %xmm10 movapd -10 * SIZE(A1), %xmm11 movsd -16 * SIZE(A2), %xmm12 movhpd -15 * SIZE(A2), %xmm12 movsd -14 * SIZE(A2), %xmm13 movhpd -13 * SIZE(A2), %xmm13 movsd -12 * SIZE(A2), %xmm14 movhpd -11 * SIZE(A2), %xmm14 movsd -10 * SIZE(A2), %xmm15 movhpd -9 * SIZE(A2), %xmm15 movapd 0 * SIZE(Y1), %xmm4 movapd 2 * SIZE(Y1), %xmm5 movapd 4 * SIZE(Y1), %xmm6 movapd 6 * SIZE(Y1), %xmm7 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 decq I jle .L54 ALIGN_3.L53: addpd %xmm8, %xmm4 movapd -8 * SIZE(A1), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd -6 * SIZE(A1), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd -4 * SIZE(A1), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd -2 * SIZE(A1), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4 movsd -8 * SIZE(A2), %xmm12 movhpd -7 * SIZE(A2), %xmm12 mulpd %xmm0, %xmm8 addpd %xmm13, %xmm5 movsd -6 * SIZE(A2), %xmm13 movhpd -5 * SIZE(A2), %xmm13 mulpd %xmm0, %xmm9 addpd %xmm14, %xmm6 movsd -4 * SIZE(A2), %xmm14 movhpd -3 * SIZE(A2), %xmm14 mulpd %xmm0, %xmm10 addpd %xmm15, %xmm7 movsd -2 * SIZE(A2), %xmm15 movhpd -1 * SIZE(A2), %xmm15 mulpd %xmm0, %xmm11 movapd %xmm4, 0 * SIZE(Y1) movapd 8 * SIZE(Y1), %xmm4 movapd %xmm5, 2 * SIZE(Y1) movapd 10 * SIZE(Y1), %xmm5 movapd %xmm6, 4 * SIZE(Y1) movapd 12 * SIZE(Y1), %xmm6 movapd %xmm7, 6 * SIZE(Y1) movapd 14 * SIZE(Y1), %xmm7 addpd %xmm8, %xmm4 movapd 0 * SIZE(A1), %xmm8 mulpd %xmm1, %xmm12 addpd %xmm9, %xmm5 movapd 2 * SIZE(A1), %xmm9 mulpd %xmm1, %xmm13 addpd %xmm10, %xmm6 movapd 4 * SIZE(A1), %xmm10 mulpd %xmm1, %xmm14 addpd %xmm11, %xmm7 movapd 6 * SIZE(A1), %xmm11 mulpd %xmm1, %xmm15 addpd %xmm12, %xmm4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -