gemv_n_sse2_barcelona.s
        addq $1 * SIZE, A1
        addq $1 * SIZE, A2
        addq $1 * SIZE, Y1
        ALIGN_3

/* loop back for the next column group */
.L19:
        decq J
        jg .L11
        ALIGN_3

/* N & 2: process the two remaining columns */
.L20:
        testq $2, N
        je .L30

        movq YY, Y1
        movq A, A1
        leaq (A, LDA, 1), A2
        leaq (A, LDA, 2), A

        movlpd (X), %xmm0
        addq INCX, X
        movlpd (X), %xmm1
        addq INCX, X

        mulsd STACK_ALPHA, %xmm0
        mulsd STACK_ALPHA, %xmm1
        unpcklpd %xmm0, %xmm0
        unpcklpd %xmm1, %xmm1

        movq M, I
        sarq $4, I
        jle .L25

        movupd 0 * SIZE(A1), %xmm8
        movupd 2 * SIZE(A1), %xmm9
        movupd 4 * SIZE(A1), %xmm10
        movupd 6 * SIZE(A1), %xmm11

        movupd 0 * SIZE(A2), %xmm12
        movupd 2 * SIZE(A2), %xmm13
        movupd 4 * SIZE(A2), %xmm14
        movupd 6 * SIZE(A2), %xmm15

        movupd 0 * SIZE(Y1), %xmm4
        movupd 2 * SIZE(Y1), %xmm5
        movupd 4 * SIZE(Y1), %xmm6
        movupd 6 * SIZE(Y1), %xmm7

        mulpd %xmm0, %xmm8
        mulpd %xmm0, %xmm9
        mulpd %xmm0, %xmm10
        mulpd %xmm0, %xmm11

        decq I
        jle .L22
        ALIGN_3

/* main loop: 16 rows of y per iteration, two columns of A */
.L21:
        PREFETCH PREFETCHSIZE * SIZE(A1)

        addpd %xmm8, %xmm4
        movupd 8 * SIZE(A1), %xmm8
        mulpd %xmm1, %xmm12
        addpd %xmm9, %xmm5
        movupd 10 * SIZE(A1), %xmm9
        mulpd %xmm1, %xmm13
        addpd %xmm10, %xmm6
        movupd 12 * SIZE(A1), %xmm10
        mulpd %xmm1, %xmm14
        addpd %xmm11, %xmm7
        movupd 14 * SIZE(A1), %xmm11
        mulpd %xmm1, %xmm15

        PREFETCH PREFETCHSIZE * SIZE(Y1)

        addpd %xmm12, %xmm4
        movupd 8 * SIZE(A2), %xmm12
        mulpd %xmm0, %xmm8
        addpd %xmm13, %xmm5
        movupd 10 * SIZE(A2), %xmm13
        mulpd %xmm0, %xmm9
        addpd %xmm14, %xmm6
        movupd 12 * SIZE(A2), %xmm14
        mulpd %xmm0, %xmm10
        addpd %xmm15, %xmm7
        movupd 14 * SIZE(A2), %xmm15
        mulpd %xmm0, %xmm11

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movupd 8 * SIZE(Y1), %xmm4
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)
        movupd 10 * SIZE(Y1), %xmm5
        movlpd %xmm6, 4 * SIZE(Y1)
        movhpd %xmm6, 5 * SIZE(Y1)
        movupd 12 * SIZE(Y1), %xmm6
        movlpd %xmm7, 6 * SIZE(Y1)
        movhpd %xmm7, 7 * SIZE(Y1)
        movupd 14 * SIZE(Y1), %xmm7

        PREFETCH PREFETCHSIZE * SIZE(A2)

        addpd %xmm8, %xmm4
        movupd 16 * SIZE(A1), %xmm8
        mulpd %xmm1, %xmm12
        addpd %xmm9, %xmm5
        movupd 18 * SIZE(A1), %xmm9
        mulpd %xmm1, %xmm13
        addpd %xmm10, %xmm6
        movupd 20 * SIZE(A1), %xmm10
        mulpd %xmm1, %xmm14
        addpd %xmm11, %xmm7
        movupd 22 * SIZE(A1), %xmm11
        mulpd %xmm1, %xmm15

        addpd %xmm12, %xmm4
        movupd 16 * SIZE(A2), %xmm12
        mulpd %xmm0, %xmm8
        addpd %xmm13, %xmm5
        movupd 18 * SIZE(A2), %xmm13
        mulpd %xmm0, %xmm9
        addpd %xmm14, %xmm6
        movupd 20 * SIZE(A2), %xmm14
        mulpd %xmm0, %xmm10
        addpd %xmm15, %xmm7
        movupd 22 * SIZE(A2), %xmm15
        mulpd %xmm0, %xmm11

        movlpd %xmm4, 8 * SIZE(Y1)
        movhpd %xmm4, 9 * SIZE(Y1)
        movupd 16 * SIZE(Y1), %xmm4
        movlpd %xmm5, 10 * SIZE(Y1)
        movhpd %xmm5, 11 * SIZE(Y1)
        movupd 18 * SIZE(Y1), %xmm5
        movlpd %xmm6, 12 * SIZE(Y1)
        movhpd %xmm6, 13 * SIZE(Y1)
        movupd 20 * SIZE(Y1), %xmm6
        movlpd %xmm7, 14 * SIZE(Y1)
        movhpd %xmm7, 15 * SIZE(Y1)
        movupd 22 * SIZE(Y1), %xmm7

        subq $-16 * SIZE, A1
        subq $-16 * SIZE, A2
        subq $-16 * SIZE, Y1

        decq I
        jg .L21
        ALIGN_3

/* drain the software pipeline for the last 16-row block */
.L22:
        addpd %xmm8, %xmm4
        movupd 8 * SIZE(A1), %xmm8
        mulpd %xmm1, %xmm12
        addpd %xmm9, %xmm5
        movupd 10 * SIZE(A1), %xmm9
        mulpd %xmm1, %xmm13
        addpd %xmm10, %xmm6
        movupd 12 * SIZE(A1), %xmm10
        mulpd %xmm1, %xmm14
        addpd %xmm11, %xmm7
        movupd 14 * SIZE(A1), %xmm11
        mulpd %xmm1, %xmm15

        addpd %xmm12, %xmm4
        movupd 8 * SIZE(A2), %xmm12
        mulpd %xmm0, %xmm8
        addpd %xmm13, %xmm5
        movupd 10 * SIZE(A2), %xmm13
        mulpd %xmm0, %xmm9
        addpd %xmm14, %xmm6
        movupd 12 * SIZE(A2), %xmm14
        mulpd %xmm0, %xmm10
        addpd %xmm15, %xmm7
        movupd 14 * SIZE(A2), %xmm15
        mulpd %xmm0, %xmm11

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movupd 8 * SIZE(Y1), %xmm4
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)
        movupd 10 * SIZE(Y1), %xmm5
        movlpd %xmm6, 4 * SIZE(Y1)
        movhpd %xmm6, 5 * SIZE(Y1)
        movupd 12 * SIZE(Y1), %xmm6
        movlpd %xmm7, 6 * SIZE(Y1)
        movhpd %xmm7, 7 * SIZE(Y1)
        movupd 14 * SIZE(Y1), %xmm7

        addpd %xmm8, %xmm4
        mulpd %xmm1, %xmm12
        addpd %xmm9, %xmm5
        mulpd %xmm1, %xmm13
        addpd %xmm10, %xmm6
        mulpd %xmm1, %xmm14
        addpd %xmm11, %xmm7
        mulpd %xmm1, %xmm15

        addpd %xmm12, %xmm4
        addpd %xmm13, %xmm5
        addpd %xmm14, %xmm6
        addpd %xmm15, %xmm7

        movlpd %xmm4, 8 * SIZE(Y1)
        movhpd %xmm4, 9 * SIZE(Y1)
        movlpd %xmm5, 10 * SIZE(Y1)
        movhpd %xmm5, 11 * SIZE(Y1)
        movlpd %xmm6, 12 * SIZE(Y1)
        movhpd %xmm6, 13 * SIZE(Y1)
        movlpd %xmm7, 14 * SIZE(Y1)
        movhpd %xmm7, 15 * SIZE(Y1)

        subq $-16 * SIZE, A1
        subq $-16 * SIZE, A2
        subq $-16 * SIZE, Y1
        ALIGN_3

/* remainder rows for the two-column case: M & 8, & 4, & 2, & 1 */
.L25:
        testq $8, M
        je .L26

        movupd 0 * SIZE(A1), %xmm8
        movupd 2 * SIZE(A1), %xmm9
        movupd 4 * SIZE(A1), %xmm10
        movupd 6 * SIZE(A1), %xmm11

        movupd 0 * SIZE(A2), %xmm12
        movupd 2 * SIZE(A2), %xmm13
        movupd 4 * SIZE(A2), %xmm14
        movupd 6 * SIZE(A2), %xmm15

        movupd 0 * SIZE(Y1), %xmm4
        movupd 2 * SIZE(Y1), %xmm5
        movupd 4 * SIZE(Y1), %xmm6
        movupd 6 * SIZE(Y1), %xmm7

        mulpd %xmm0, %xmm8
        mulpd %xmm0, %xmm9
        mulpd %xmm0, %xmm10
        mulpd %xmm0, %xmm11

        addpd %xmm8, %xmm4
        mulpd %xmm1, %xmm12
        addpd %xmm9, %xmm5
        mulpd %xmm1, %xmm13
        addpd %xmm10, %xmm6
        mulpd %xmm1, %xmm14
        addpd %xmm11, %xmm7
        mulpd %xmm1, %xmm15

        addpd %xmm12, %xmm4
        addpd %xmm13, %xmm5
        addpd %xmm14, %xmm6
        addpd %xmm15, %xmm7

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)
        movlpd %xmm6, 4 * SIZE(Y1)
        movhpd %xmm6, 5 * SIZE(Y1)
        movlpd %xmm7, 6 * SIZE(Y1)
        movhpd %xmm7, 7 * SIZE(Y1)

        addq $8 * SIZE, A1
        addq $8 * SIZE, A2
        addq $8 * SIZE, Y1
        ALIGN_3

.L26:
        testq $4, M
        je .L27

        movupd 0 * SIZE(A1), %xmm8
        movupd 2 * SIZE(A1), %xmm9
        movupd 0 * SIZE(A2), %xmm10
        movupd 2 * SIZE(A2), %xmm11

        movupd 0 * SIZE(Y1), %xmm4
        movupd 2 * SIZE(Y1), %xmm5

        mulpd %xmm0, %xmm8
        mulpd %xmm0, %xmm9
        mulpd %xmm1, %xmm10
        mulpd %xmm1, %xmm11

        addpd %xmm8, %xmm4
        addpd %xmm9, %xmm5
        addpd %xmm10, %xmm4
        addpd %xmm11, %xmm5

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)

        addq $4 * SIZE, A1
        addq $4 * SIZE, A2
        addq $4 * SIZE, Y1
        ALIGN_3

.L27:
        testq $2, M
        je .L28

        movupd 0 * SIZE(A1), %xmm8
        movupd 0 * SIZE(A2), %xmm10
        movupd 0 * SIZE(Y1), %xmm4

        mulpd %xmm0, %xmm8
        mulpd %xmm1, %xmm10
        addpd %xmm8, %xmm4
        addpd %xmm10, %xmm4

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)

        addq $2 * SIZE, A1
        addq $2 * SIZE, A2
        addq $2 * SIZE, Y1
        ALIGN_3

.L28:
        testq $1, M
        je .L30

        movlpd 0 * SIZE(A1), %xmm8
        movlpd 0 * SIZE(A2), %xmm9
        movlpd 0 * SIZE(Y1), %xmm4

        mulsd %xmm0, %xmm8
        mulsd %xmm1, %xmm9
        addsd %xmm8, %xmm4
        addsd %xmm9, %xmm4

        movlpd %xmm4, 0 * SIZE(Y1)

        addq $1 * SIZE, A1
        addq $1 * SIZE, A2
        addq $1 * SIZE, Y1
        ALIGN_3

/* N & 1: process the last remaining column */
.L30:
        testq $1, N
        je .L995

        movq YY, Y1
        movq A, A1

        movlpd (X), %xmm0
        mulsd STACK_ALPHA, %xmm0
        unpcklpd %xmm0, %xmm0

        movq M, I
        sarq $4, I
        jle .L35

        movupd 0 * SIZE(A1), %xmm8
        movupd 2 * SIZE(A1), %xmm9
        movupd 4 * SIZE(A1), %xmm10
        movupd 6 * SIZE(A1), %xmm11
        movupd 8 * SIZE(A1), %xmm12
        movupd 10 * SIZE(A1), %xmm13
        movupd 12 * SIZE(A1), %xmm14
        movupd 14 * SIZE(A1), %xmm15

        mulpd %xmm0, %xmm8
        mulpd %xmm0, %xmm9
        mulpd %xmm0, %xmm10
        mulpd %xmm0, %xmm11

        movupd 0 * SIZE(Y1), %xmm4
        movupd 2 * SIZE(Y1), %xmm5
        movupd 4 * SIZE(Y1), %xmm6
        movupd 6 * SIZE(Y1), %xmm7

        decq I
        jle .L32
        ALIGN_3

/* main loop: 16 rows of y per iteration, one column of A */
.L31:
        PREFETCH PREFETCHSIZE * SIZE(A1)

        addpd %xmm8, %xmm4
        movupd 16 * SIZE(A1), %xmm8
        mulpd %xmm0, %xmm12
        addpd %xmm9, %xmm5
        movupd 18 * SIZE(A1), %xmm9
        mulpd %xmm0, %xmm13
        addpd %xmm10, %xmm6
        movupd 20 * SIZE(A1), %xmm10
        mulpd %xmm0, %xmm14
        addpd %xmm11, %xmm7
        movupd 22 * SIZE(A1), %xmm11
        mulpd %xmm0, %xmm15

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movupd 8 * SIZE(Y1), %xmm4
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)
        movupd 10 * SIZE(Y1), %xmm5
        movlpd %xmm6, 4 * SIZE(Y1)
        movhpd %xmm6, 5 * SIZE(Y1)
        movupd 12 * SIZE(Y1), %xmm6
        movlpd %xmm7, 6 * SIZE(Y1)
        movhpd %xmm7, 7 * SIZE(Y1)
        movupd 14 * SIZE(Y1), %xmm7

        PREFETCH PREFETCHSIZE * SIZE(Y1)

        addpd %xmm12, %xmm4
        movupd 24 * SIZE(A1), %xmm12
        mulpd %xmm0, %xmm8
        addpd %xmm13, %xmm5
        movupd 26 * SIZE(A1), %xmm13
        mulpd %xmm0, %xmm9
        addpd %xmm14, %xmm6
        movupd 28 * SIZE(A1), %xmm14
        mulpd %xmm0, %xmm10
        addpd %xmm15, %xmm7
        movupd 30 * SIZE(A1), %xmm15
        mulpd %xmm0, %xmm11

        movlpd %xmm4, 8 * SIZE(Y1)
        movhpd %xmm4, 9 * SIZE(Y1)
        movupd 16 * SIZE(Y1), %xmm4
        movlpd %xmm5, 10 * SIZE(Y1)
        movhpd %xmm5, 11 * SIZE(Y1)
        movupd 18 * SIZE(Y1), %xmm5
        movlpd %xmm6, 12 * SIZE(Y1)
        movhpd %xmm6, 13 * SIZE(Y1)
        movupd 20 * SIZE(Y1), %xmm6
        movlpd %xmm7, 14 * SIZE(Y1)
        movhpd %xmm7, 15 * SIZE(Y1)
        movupd 22 * SIZE(Y1), %xmm7

        subq $-16 * SIZE, A1
        subq $-16 * SIZE, Y1

        decq I
        jg .L31
        ALIGN_3

.L32:
        addpd %xmm8, %xmm4
        mulpd %xmm0, %xmm12
        addpd %xmm9, %xmm5
        mulpd %xmm0, %xmm13
        addpd %xmm10, %xmm6
        mulpd %xmm0, %xmm14
        addpd %xmm11, %xmm7
        mulpd %xmm0, %xmm15

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movupd 8 * SIZE(Y1), %xmm4
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)
        movupd 10 * SIZE(Y1), %xmm5
        movlpd %xmm6, 4 * SIZE(Y1)
        movhpd %xmm6, 5 * SIZE(Y1)
        movupd 12 * SIZE(Y1), %xmm6
        movlpd %xmm7, 6 * SIZE(Y1)
        movhpd %xmm7, 7 * SIZE(Y1)
        movupd 14 * SIZE(Y1), %xmm7

        addpd %xmm12, %xmm4
        addpd %xmm13, %xmm5
        addpd %xmm14, %xmm6
        addpd %xmm15, %xmm7

        movlpd %xmm4, 8 * SIZE(Y1)
        movhpd %xmm4, 9 * SIZE(Y1)
        movlpd %xmm5, 10 * SIZE(Y1)
        movhpd %xmm5, 11 * SIZE(Y1)
        movlpd %xmm6, 12 * SIZE(Y1)
        movhpd %xmm6, 13 * SIZE(Y1)
        movlpd %xmm7, 14 * SIZE(Y1)
        movhpd %xmm7, 15 * SIZE(Y1)

        subq $-16 * SIZE, A1
        subq $-16 * SIZE, A2
        subq $-16 * SIZE, Y1
        ALIGN_3

/* remainder rows for the single-column case: M & 8, & 4, & 2, & 1 */
.L35:
        testq $8, M
        je .L36

        movupd 0 * SIZE(A1), %xmm8
        movupd 2 * SIZE(A1), %xmm9
        movupd 4 * SIZE(A1), %xmm10
        movupd 6 * SIZE(A1), %xmm11

        movupd 0 * SIZE(Y1), %xmm4
        movupd 2 * SIZE(Y1), %xmm5
        movupd 4 * SIZE(Y1), %xmm6
        movupd 6 * SIZE(Y1), %xmm7

        mulpd %xmm0, %xmm8
        mulpd %xmm0, %xmm9
        mulpd %xmm0, %xmm10
        mulpd %xmm0, %xmm11

        addpd %xmm8, %xmm4
        addpd %xmm9, %xmm5
        addpd %xmm10, %xmm6
        addpd %xmm11, %xmm7

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)
        movlpd %xmm6, 4 * SIZE(Y1)
        movhpd %xmm6, 5 * SIZE(Y1)
        movlpd %xmm7, 6 * SIZE(Y1)
        movhpd %xmm7, 7 * SIZE(Y1)

        addq $8 * SIZE, A1
        addq $8 * SIZE, Y1
        ALIGN_3

.L36:
        testq $4, M
        je .L37

        movupd 0 * SIZE(A1), %xmm8
        movupd 2 * SIZE(A1), %xmm9

        movupd 0 * SIZE(Y1), %xmm4
        movupd 2 * SIZE(Y1), %xmm5

        mulpd %xmm0, %xmm8
        mulpd %xmm0, %xmm9
        addpd %xmm8, %xmm4
        addpd %xmm9, %xmm5

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        movlpd %xmm5, 2 * SIZE(Y1)
        movhpd %xmm5, 3 * SIZE(Y1)

        addq $4 * SIZE, A1
        addq $4 * SIZE, Y1
        ALIGN_3

.L37:
        testq $2, M
        je .L38

        movupd 0 * SIZE(A1), %xmm8
        movupd 0 * SIZE(Y1), %xmm4

        mulpd %xmm0, %xmm8
        addpd %xmm8, %xmm4

        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)

        addq $2 * SIZE, A1
        addq $2 * SIZE, Y1
        ALIGN_3

.L38:
        testq $1, M
        je .L995

        movlpd 0 * SIZE(A1), %xmm8
        movlpd 0 * SIZE(Y1), %xmm4

        mulsd %xmm0, %xmm8
        addsd %xmm8, %xmm4

        movlpd %xmm4, 0 * SIZE(Y1)
        ALIGN_3

/* if y is not unit stride, add the contiguous buffer YY back into the strided y */
.L995:
        cmpq $SIZE, INCY
        je .L999

        movq Y, Y1

        movq M, %rax
        sarq $2, %rax
        jle .L997
        ALIGN_3

.L996:
        movlpd 0 * SIZE(Y), %xmm4
        addq INCY, Y
        movhpd 0 * SIZE(Y), %xmm4
        addq INCY, Y
        movlpd 0 * SIZE(Y), %xmm5
        addq INCY, Y
        movhpd 0 * SIZE(Y), %xmm5
        addq INCY, Y

        movapd 0 * SIZE(YY), %xmm0
        movapd 2 * SIZE(YY), %xmm1

        addpd %xmm4, %xmm0
        addpd %xmm5, %xmm1

        movlpd %xmm0, 0 * SIZE(Y1)
        addq INCY, Y1
        movhpd %xmm0, 0 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm1, 0 * SIZE(Y1)
        addq INCY, Y1
        movhpd %xmm1, 0 * SIZE(Y1)
        addq INCY, Y1

        addq $4 * SIZE, YY
        decq %rax
        jg .L996
        ALIGN_3

.L997:
        movq M, %rax
        andq $3, %rax
        jle .L999
        ALIGN_3

.L998:
        movlpd 0 * SIZE(YY), %xmm0
        addsd 0 * SIZE(Y), %xmm0
        movlpd %xmm0, 0 * SIZE(Y1)

        addq $SIZE, YY
        addq INCY, Y
        addq INCY, Y1
        decq %rax
        jg .L998
        ALIGN_3

/* restore callee-saved registers and return */
.L999:
        movq 0(%rsp), %rbx
        movq 8(%rsp), %rbp
        movq 16(%rsp), %r12
        movq 24(%rsp), %r13
        movq 32(%rsp), %r14
        movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
        movq 48(%rsp), %rdi
        movq 56(%rsp), %rsi
        movups 64(%rsp), %xmm6
        movups 80(%rsp), %xmm7
        movups 96(%rsp), %xmm8
        movups 112(%rsp), %xmm9
        movups 128(%rsp), %xmm10
        movups 144(%rsp), %xmm11
        movups 160(%rsp), %xmm12
        movups 176(%rsp), %xmm13
        movups 192(%rsp), %xmm14
        movups 208(%rsp), %xmm15
#endif

        addq $STACKSIZE, %rsp
        ret

        EPILOGUE
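
For orientation, the listing above is the tail of a double-precision GEMV-N kernel: it computes y := y + alpha * A * x for a column-major A, finishing the leftover two columns (.L20) and one column (.L30) after the main column loop, and at .L995 onward folding the contiguous scratch buffer YY back into y when INCY is not the unit stride. Below is a minimal scalar sketch of that operation; it is not part of the original file, and the helper name dgemv_n_ref and its parameter layout (m, n, lda, incx, incy counted in elements) are assumptions that mirror the usual BLAS convention.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical reference routine: y := y + alpha * A * x, column-major A. */
static void dgemv_n_ref(ptrdiff_t m, ptrdiff_t n, double alpha,
                        const double *a, ptrdiff_t lda,
                        const double *x, ptrdiff_t incx,
                        double *y, ptrdiff_t incy)
{
    for (ptrdiff_t j = 0; j < n; j++) {
        /* alpha * x[j] is computed once per column; the assembly broadcasts
           it into an XMM register (unpcklpd) and reuses it for every row. */
        double xj = alpha * x[j * incx];
        for (ptrdiff_t i = 0; i < m; i++)
            y[i * incy] += xj * a[i + j * lda];   /* axpy of column j into y */
    }
}

int main(void)
{
    /* 2x2 example: A = [[1,3],[2,4]] stored column-major, x = [1,1], alpha = 1 */
    double a[] = {1, 2, 3, 4}, x[] = {1, 1}, y[] = {0, 0};
    dgemv_n_ref(2, 2, 1.0, a, 2, x, 1, y, 1);
    printf("%g %g\n", y[0], y[1]);   /* expected: 4 6 */
    return 0;
}

Keeping the hot loops on a unit-stride buffer and touching the strided y only once at the end is the design choice visible in the .L996/.L998 copy-back loops above.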