📄 gemv_n_sse2_core2.s
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-16 * SIZE(A2), %xmm10
	movapd	-14 * SIZE(A2), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm13, %xmm10
	mulpd	%xmm13, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm0
	addpd	%xmm11, %xmm1
	movapd	-16 * SIZE(A1, LDA, 2), %xmm8
	movapd	-14 * SIZE(A1, LDA, 2), %xmm9
	movapd	-16 * SIZE(A2, LDA, 2), %xmm10
	movapd	-14 * SIZE(A2, LDA, 2), %xmm11
	mulpd	%xmm14, %xmm8
	mulpd	%xmm14, %xmm9
	mulpd	%xmm15, %xmm10
	mulpd	%xmm15, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm0
	addpd	%xmm11, %xmm1
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L17:
	testq	$2, MM
	je	.L18
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-16 * SIZE(A2), %xmm10
	mulpd	%xmm12, %xmm8
	mulpd	%xmm13, %xmm10
	addpd	%xmm8, %xmm0
	addpd	%xmm10, %xmm0
	movapd	-16 * SIZE(A1, LDA, 2), %xmm8
	movapd	-16 * SIZE(A2, LDA, 2), %xmm10
	mulpd	%xmm14, %xmm8
	mulpd	%xmm15, %xmm10
	addpd	%xmm8, %xmm0
	addpd	%xmm10, %xmm0
	movapd	%xmm0, -16 * SIZE(Y1)
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L18:
	testq	$1, MM
	je	.L19
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm9
	movsd	-16 * SIZE(A1, LDA, 2), %xmm10
	movsd	-16 * SIZE(A2, LDA, 2), %xmm11
	mulsd	%xmm12, %xmm8
	mulsd	%xmm13, %xmm9
	mulsd	%xmm14, %xmm10
	mulsd	%xmm15, %xmm11
	addsd	%xmm8, %xmm0
	addsd	%xmm9, %xmm0
	addsd	%xmm10, %xmm0
	addsd	%xmm11, %xmm0
	movsd	%xmm0, -16 * SIZE(Y1)
	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, Y1
	ALIGN_3

.L19:
	decq	J
	jg	.L11
	ALIGN_3

.L20:
	testq	$2, N
	je	.L30
	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A
	movsd	(X), %xmm12
	addq	INCX, X
	movsd	(X), %xmm13
	addq	INCX, X
	mulsd	STACK_ALPHA, %xmm12
	mulsd	STACK_ALPHA, %xmm13
	unpcklpd %xmm12, %xmm12
	unpcklpd %xmm13, %xmm13
	testq	$SIZE, A
	je	.L22
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm9
	mulsd	%xmm12, %xmm8
	mulsd	%xmm13, %xmm9
	addsd	%xmm8, %xmm0
	addsd	%xmm9, %xmm0
	movsd	%xmm0, -16 * SIZE(Y1)
	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, Y1
	ALIGN_3

.L22:
	movq	MM, I
	sarq	$4, I
	jle	.L25
	ALIGN_3

.L23:
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	movapd	-12 * SIZE(Y1), %xmm2
	movapd	-10 * SIZE(Y1), %xmm3
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-12 * SIZE(A1), %xmm10
	movapd	-10 * SIZE(A1), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm12, %xmm10
	mulpd	%xmm12, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	-16 * SIZE(A2), %xmm8
	movapd	-14 * SIZE(A2), %xmm9
	movapd	-12 * SIZE(A2), %xmm10
	movapd	-10 * SIZE(A2), %xmm11
	mulpd	%xmm13, %xmm8
	mulpd	%xmm13, %xmm9
	mulpd	%xmm13, %xmm10
	mulpd	%xmm13, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	%xmm2, -12 * SIZE(Y1)
	movapd	%xmm3, -10 * SIZE(Y1)
	movapd	-8 * SIZE(Y1), %xmm0
	movapd	-6 * SIZE(Y1), %xmm1
	movapd	-4 * SIZE(Y1), %xmm2
	movapd	-2 * SIZE(Y1), %xmm3
	movapd	-8 * SIZE(A1), %xmm8
	movapd	-6 * SIZE(A1), %xmm9
	movapd	-4 * SIZE(A1), %xmm10
	movapd	-2 * SIZE(A1), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm12, %xmm10
	mulpd	%xmm12, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	-8 * SIZE(A2), %xmm8
	movapd	-6 * SIZE(A2), %xmm9
	movapd	-4 * SIZE(A2), %xmm10
	movapd	-2 * SIZE(A2), %xmm11
	mulpd	%xmm13, %xmm8
	mulpd	%xmm13, %xmm9
	mulpd	%xmm13, %xmm10
	mulpd	%xmm13, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	%xmm0, -8 * SIZE(Y1)
	movapd	%xmm1, -6 * SIZE(Y1)
	movapd	%xmm2, -4 * SIZE(Y1)
	movapd	%xmm3, -2 * SIZE(Y1)
	subq	$-16 * SIZE, A1
	subq	$-16 * SIZE, A2
	subq	$-16 * SIZE, Y1
	subq	$1, I
	jg	.L23
	ALIGN_3

.L25:
	testq	$8, MM
	je	.L26
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	movapd	-12 * SIZE(Y1), %xmm2
	movapd	-10 * SIZE(Y1), %xmm3
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-12 * SIZE(A1), %xmm10
	movapd	-10 * SIZE(A1), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm12, %xmm10
	mulpd	%xmm12, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	-16 * SIZE(A2), %xmm8
	movapd	-14 * SIZE(A2), %xmm9
	movapd	-12 * SIZE(A2), %xmm10
	movapd	-10 * SIZE(A2), %xmm11
	mulpd	%xmm13, %xmm8
	mulpd	%xmm13, %xmm9
	mulpd	%xmm13, %xmm10
	mulpd	%xmm13, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	%xmm2, -12 * SIZE(Y1)
	movapd	%xmm3, -10 * SIZE(Y1)
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, Y1
	ALIGN_3

.L26:
	testq	$4, MM
	je	.L27
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-16 * SIZE(A2), %xmm10
	movapd	-14 * SIZE(A2), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm13, %xmm10
	mulpd	%xmm13, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm0
	addpd	%xmm11, %xmm1
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3

.L27:
	testq	$2, MM
	je	.L28
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-16 * SIZE(A2), %xmm10
	mulpd	%xmm12, %xmm8
	mulpd	%xmm13, %xmm10
	addpd	%xmm8, %xmm0
	addpd	%xmm10, %xmm0
	movapd	%xmm0, -16 * SIZE(Y1)
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, Y1
	ALIGN_3

.L28:
	testq	$1, MM
	je	.L30
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm9
	mulsd	%xmm12, %xmm8
	mulsd	%xmm13, %xmm9
	addsd	%xmm8, %xmm0
	addsd	%xmm9, %xmm0
	movsd	%xmm0, -16 * SIZE(Y1)
	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, Y1
	ALIGN_3

.L30:
	testq	$1, N
	je	.L990
	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	movsd	(X), %xmm12
	mulsd	STACK_ALPHA, %xmm12
	unpcklpd %xmm12, %xmm12
	testq	$SIZE, A
	je	.L32
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0
	movsd	%xmm0, -16 * SIZE(Y1)
	addq	$1 * SIZE, A1
	addq	$1 * SIZE, Y1
	ALIGN_3

.L32:
	movq	MM, I
	sarq	$4, I
	jle	.L35
	ALIGN_3

.L33:
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	movapd	-12 * SIZE(Y1), %xmm2
	movapd	-10 * SIZE(Y1), %xmm3
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-12 * SIZE(A1), %xmm10
	movapd	-10 * SIZE(A1), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm12, %xmm10
	mulpd	%xmm12, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	%xmm2, -12 * SIZE(Y1)
	movapd	%xmm3, -10 * SIZE(Y1)
	movapd	-8 * SIZE(Y1), %xmm0
	movapd	-6 * SIZE(Y1), %xmm1
	movapd	-4 * SIZE(Y1), %xmm2
	movapd	-2 * SIZE(Y1), %xmm3
	movapd	-8 * SIZE(A1), %xmm8
	movapd	-6 * SIZE(A1), %xmm9
	movapd	-4 * SIZE(A1), %xmm10
	movapd	-2 * SIZE(A1), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm12, %xmm10
	mulpd	%xmm12, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	%xmm0, -8 * SIZE(Y1)
	movapd	%xmm1, -6 * SIZE(Y1)
	movapd	%xmm2, -4 * SIZE(Y1)
	movapd	%xmm3, -2 * SIZE(Y1)
	subq	$-16 * SIZE, A1
	subq	$-16 * SIZE, Y1
	decq	I
	jg	.L33
	ALIGN_3

.L35:
	testq	$8, MM
	je	.L36
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	movapd	-12 * SIZE(Y1), %xmm2
	movapd	-10 * SIZE(Y1), %xmm3
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-12 * SIZE(A1), %xmm10
	movapd	-10 * SIZE(A1), %xmm11
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	mulpd	%xmm12, %xmm10
	mulpd	%xmm12, %xmm11
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	%xmm2, -12 * SIZE(Y1)
	movapd	%xmm3, -10 * SIZE(Y1)
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, Y1
	ALIGN_3

.L36:
	testq	$4, MM
	je	.L37
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	mulpd	%xmm12, %xmm8
	mulpd	%xmm12, %xmm9
	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L37:
	testq	$2, MM
	je	.L38
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-16 * SIZE(A1), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	%xmm0, -16 * SIZE(Y1)
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, Y1
	ALIGN_3

.L38:
	testq	$1, MM
	je	.L990
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-16 * SIZE(A1), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0
	movsd	%xmm0, -16 * SIZE(Y1)
	jmp	.L990
	ALIGN_3

.L40:
	movq	N, J
	sarq	$2, J
	jle	.L50
	ALIGN_3

.L41:
	movq	BUFFER, Y1
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 4), A
	movsd	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movsd	(X), %xmm3
	addq	INCX, X
	mulsd	STACK_ALPHA, %xmm0
	mulsd	STACK_ALPHA, %xmm1
	mulsd	STACK_ALPHA, %xmm2
	mulsd	STACK_ALPHA, %xmm3
	unpcklpd %xmm0, %xmm0
	unpcklpd %xmm1, %xmm1
	unpcklpd %xmm2, %xmm2
	unpcklpd %xmm3, %xmm3
	ALIGN_3

	testq	$SIZE, A
	je	.L42
	movsd	0 * SIZE(Y1), %xmm12
	movsd	-16 * SIZE(A1), %xmm8
	movsd	-16 * SIZE(A2), %xmm9
	movsd	-16 * SIZE(A1, LDA, 2), %xmm10
	movsd	-16 * SIZE(A2, LDA, 2), %xmm11
	mulsd	%xmm0, %xmm8
	mulsd	%xmm1, %xmm9
	mulsd	%xmm2, %xmm10
	mulsd	%xmm3, %xmm11
	addsd	%xmm8, %xmm12
	addsd	%xmm9, %xmm12
	addsd	%xmm10, %xmm12
	addsd	%xmm11, %xmm12
	movsd	%xmm12, 0 * SIZE(Y1)
	addq	$1 * SIZE, A1
	addq	$1 * SIZE, A2
	addq	$1 * SIZE, Y1
	ALIGN_3

.L42:
	movq	MM, I
	sarq	$4, I
	jle	.L45
	movapd	-16 * SIZE(A1), %xmm8
	movapd	-14 * SIZE(A1), %xmm9
	movapd	-12 * SIZE(A1), %xmm10
	movapd	-10 * SIZE(A1), %xmm11
	movsd	-16 * SIZE(A2), %xmm12
	movhpd	-15 * SIZE(A2), %xmm12
	movsd	-14 * SIZE(A2), %xmm13
	movhpd	-13 * SIZE(A2), %xmm13
	movsd	-12 * SIZE(A2), %xmm14
	movhpd	-11 * SIZE(A2), %xmm14
	movsd	-10 * SIZE(A2), %xmm15
	movhpd	-9 * SIZE(A2), %xmm15
	movapd	0 * SIZE(Y1), %xmm4
	movapd	2 * SIZE(Y1), %xmm5
	movapd	4 * SIZE(Y1), %xmm6
	movapd	6 * SIZE(Y1), %xmm7
	mulpd	%xmm0, %xmm8
	mulpd	%xmm0, %xmm9
	mulpd	%xmm0, %xmm10
	mulpd	%xmm0, %xmm11
	decq	I
	jle	.L44
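For orientation: the listing above is part of a GEMV "N" (no-transpose) kernel, i.e. it accumulates y := alpha*A*x + y column by column into an aligned buffer, with the .L2x/.L3x labels handling the column (N mod 4, mod 2, mod 1) and row (MM mod 16, 8, 4, 2, 1) remainders of the unrolled SSE2 loops. As a rough reference only (not part of gemv_n_sse2_core2.s, and with a hypothetical function name), a scalar C sketch of the same update, assuming column-major A with leading dimension lda and a contiguous accumulation buffer y, looks like this:

#include <stddef.h>

/* gemv_n_ref: scalar sketch of y += alpha * A * x (illustration only).
 * Each column j contributes (alpha * x[j]) * A(:,j); the kernel keeps
 * the broadcast scalars in xmm12..xmm15 (unpcklpd) and processes up to
 * 16 rows per iteration with mulpd/addpd on the aligned buffer. */
static void gemv_n_ref(size_t m, size_t n, double alpha,
                       const double *a, size_t lda,
                       const double *x, size_t incx,
                       double *y)
{
    for (size_t j = 0; j < n; j++) {
        double t = alpha * x[j * incx];   /* broadcast coefficient */
        const double *col = a + j * lda;  /* column j of A         */
        for (size_t i = 0; i < m; i++)
            y[i] += t * col[i];           /* unrolled in the kernel */
    }
}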