📄 gemv_t_sse2.s
字号:
/*
 * gemv_t SSE2 kernel, 32-bit x86, AT&T syntax -- tail fragment.
 *
 * This span starts in the middle of the 4-column loop (.L51) and ends at
 * EPILOGUE.  The function prologue, the macro definitions (SIZE, ALIGN_3,
 * A, LDA, NLDA, N, M, J, IS, P, MIN_M, BUFFER, INCY, OLD_STACK) and the
 * labels .L10 and .L51 are outside this view.
 *
 * What the visible code does:
 *   - finishes one pass of the 4-column loop (strided stores through %ebp),
 *   - dispatches on (N & 3) to a 3-, 2- or 1-column remainder path,
 *   - each remainder path forms dot products of the vector at BUFFER with
 *     the matrix columns, scales them by %xmm3 and adds them into the
 *     memory addressed by %ebp,
 *   - .L99 advances A by NLDA and loops back to .L10 while IS + P < M,
 *   - .L999 restores the stack pointer and the saved registers and returns.
 *
 * Register roles inside the remainder paths:
 *   %ebx            first column pointer            (a_offset)
 *   %ecx            second column pointer           (a_offset + lda)
 *   (%ebx,%edx,2)   third column, %edx holds LDA
 *   %esi            read pointer into BUFFER
 *   %eax            loop counter, later INCY
 *   %xmm0, %xmm2    two pre-loaded pairs of BUFFER elements
 *   %xmm4-%xmm6     per-column accumulators
 *   %xmm3           scale factor  -- presumably alpha, TODO confirm
 *   %ebp            destination   -- presumably the y pointer, TODO confirm
 *
 * NOTE(review): the packed loops preload BUFFER up to 10 * SIZE past the
 * current position with movapd, so BUFFER is assumed to be 16-byte aligned
 * and padded past MIN_M elements -- confirm against the code that fills it.
 *
 * NOTE(review): INCY is compared against SIZE, so it looks like it is
 * already scaled to bytes -- confirm against the prologue.
 */

        /* Tail of the 4-column loop body: strided write-back, next group. */
        addl     %eax, %edx
        addsd    %xmm1, %xmm6
        addsd    %xmm2, %xmm7

        movsd    %xmm4, 0 * SIZE(%ebp)
        addl     %eax, %ebp
        movsd    %xmm5, 0 * SIZE(%ebp)
        addl     %eax, %ebp
        movsd    %xmm6, 0 * SIZE(%ebp)
        addl     %eax, %ebp
        movsd    %xmm7, 0 * SIZE(%ebp)
        addl     %eax, %ebp

        decl     J
        jg       .L51

        movl     N, %esi
        andl     $3, %esi
        jne      .L100                  /* 1..3 columns are left over */
        ALIGN_3

/* Advance A by NLDA and loop back to .L10 while IS + P < M. */
.L99:
        movl     A, %ebx
        addl     NLDA, %ebx
        movl     %ebx, A

        movl     IS, %esi
        addl     $P, %esi
        cmpl     M, %esi
        jl       .L10
        ALIGN_3

/* Function exit: restore the stack pointer and the four saved registers. */
.L999:
        movl     OLD_STACK, %esp

        popl     %ebx
        popl     %esi
        popl     %edi
        popl     %ebp
        ret
        ALIGN_3

/* Remainder dispatch on (N & 3): 3 -> .L101, 2 -> .L111, otherwise .L120. */
.L100:
        movl     N, %esi
        andl     $3, %esi
        cmpl     $3, %esi
        jne      .L110
        ALIGN_3

/* ------------------------------------------------------------------ */
/* Three remaining columns                                            */
/* ------------------------------------------------------------------ */
.L101:
        movl     A, %ebx                # a_offset = a
        movl     LDA, %edx

        leal     (%ebx, %edx), %ecx     # a_offset2 = a + lda
        leal     (%ebx, %edx, 2), %eax
        addl     %edx, %eax             /* eax = a + 3 * lda */
        movl     %eax, A

        movl     BUFFER, %esi

        pxor     %xmm4, %xmm4
        pxor     %xmm5, %xmm5
        pxor     %xmm6, %xmm6

        movapd   0 * SIZE(%esi), %xmm0
        movapd   2 * SIZE(%esi), %xmm2

        movl     MIN_M, %eax
        sarl     $3, %eax               /* number of 8-element groups */
        /* OF is undefined after a shift by more than 1, and JLE reads OF;
           TEST gives the branch architecturally defined flags. */
        testl    %eax, %eax
        jle      .L103
        ALIGN_3

/* Packed loop: 8 rows of 3 columns per iteration. */
.L102:
        movsd    0 * SIZE(%ebx), %xmm1
        movhpd   1 * SIZE(%ebx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm4

        movsd    0 * SIZE(%ecx), %xmm1
        movhpd   1 * SIZE(%ecx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm5

        movsd    0 * SIZE(%ebx, %edx, 2), %xmm1
        movhpd   1 * SIZE(%ebx, %edx, 2), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm6
        movapd   4 * SIZE(%esi), %xmm0

        movsd    2 * SIZE(%ebx), %xmm1
        movhpd   3 * SIZE(%ebx), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm4

        movsd    2 * SIZE(%ecx), %xmm1
        movhpd   3 * SIZE(%ecx), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm5

        movsd    2 * SIZE(%ebx, %edx, 2), %xmm1
        movhpd   3 * SIZE(%ebx, %edx, 2), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm6
        movapd   6 * SIZE(%esi), %xmm2

        movsd    4 * SIZE(%ebx), %xmm1
        movhpd   5 * SIZE(%ebx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm4

        movsd    4 * SIZE(%ecx), %xmm1
        movhpd   5 * SIZE(%ecx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm5

        movsd    4 * SIZE(%ebx, %edx, 2), %xmm1
        movhpd   5 * SIZE(%ebx, %edx, 2), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm6
        movapd   8 * SIZE(%esi), %xmm0

        movsd    6 * SIZE(%ebx), %xmm1
        movhpd   7 * SIZE(%ebx), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm4

        movsd    6 * SIZE(%ecx), %xmm1
        movhpd   7 * SIZE(%ecx), %xmm1
        addl     $8 * SIZE, %ecx
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm5

        /* The third column is addressed through ebx, so ebx is bumped
           only after this last pair has been loaded. */
        movsd    6 * SIZE(%ebx, %edx, 2), %xmm1
        movhpd   7 * SIZE(%ebx, %edx, 2), %xmm1
        addl     $8 * SIZE, %ebx
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm6
        movapd   10 * SIZE(%esi), %xmm2

        addl     $8 * SIZE, %esi
        decl     %eax
        jg       .L102
        ALIGN_3

.L103:
        movl     MIN_M, %eax
        andl     $7, %eax               /* leftover rows */
        je       .L105
        ALIGN_3

/* Scalar loop: one row per iteration; only the low half of xmm0 is used. */
.L104:
        movsd    0 * SIZE(%ebx), %xmm1
        mulsd    %xmm0, %xmm1
        addsd    %xmm1, %xmm4

        movsd    0 * SIZE(%ecx), %xmm1
        addl     $SIZE, %ecx
        mulsd    %xmm0, %xmm1
        addsd    %xmm1, %xmm5

        movsd    0 * SIZE(%ebx, %edx, 2), %xmm1
        addl     $SIZE, %ebx
        mulsd    %xmm0, %xmm1
        addsd    %xmm1, %xmm6

        movsd    1 * SIZE(%esi), %xmm0  /* next BUFFER element */
        addl     $SIZE, %esi
        decl     %eax
        jg       .L104
        ALIGN_3

/* Horizontal add of each accumulator, scale by xmm3, add into (%ebp). */
.L105:
        movapd   %xmm4, %xmm0
        unpckhpd %xmm4, %xmm4
        addsd    %xmm0, %xmm4

        movapd   %xmm5, %xmm0
        unpckhpd %xmm5, %xmm5
        addsd    %xmm0, %xmm5

        movapd   %xmm6, %xmm0
        unpckhpd %xmm6, %xmm6
        addsd    %xmm0, %xmm6

        mulsd    %xmm3, %xmm4
        mulsd    %xmm3, %xmm5
        mulsd    %xmm3, %xmm6

        movl     INCY, %eax
        movl     %ebp, %edx
        cmpl     $SIZE, %eax
        jne      .L106                  /* stride differs from one element */

        /* Contiguous destination. */
        movsd    0 * SIZE(%ebp), %xmm1
        movsd    1 * SIZE(%ebp), %xmm2
        addsd    %xmm1, %xmm4
        addsd    %xmm2, %xmm5
        movsd    2 * SIZE(%ebp), %xmm1
        addsd    %xmm1, %xmm6

        movsd    %xmm4, 0 * SIZE(%ebp)
        movsd    %xmm5, 1 * SIZE(%ebp)
        movsd    %xmm6, 2 * SIZE(%ebp)
        jmp      .L99
        ALIGN_3

/* Strided destination: edx walks the loads, ebp walks the stores. */
.L106:
        movsd    0 * SIZE(%edx), %xmm1
        addl     %eax, %edx
        movsd    0 * SIZE(%edx), %xmm2
        addl     %eax, %edx
        addsd    %xmm1, %xmm4
        addsd    %xmm2, %xmm5
        movsd    0 * SIZE(%edx), %xmm1
        addsd    %xmm1, %xmm6

        movsd    %xmm4, 0 * SIZE(%ebp)
        addl     %eax, %ebp
        movsd    %xmm5, 0 * SIZE(%ebp)
        addl     %eax, %ebp
        movsd    %xmm6, 0 * SIZE(%ebp)
        jmp      .L99
        ALIGN_3

/* esi still holds (N & 3) from .L100. */
.L110:
        cmpl     $2, %esi
        jne      .L120
        ALIGN_3

/* ------------------------------------------------------------------ */
/* Two remaining columns                                              */
/* ------------------------------------------------------------------ */
.L111:
        movl     A, %ebx                # a_offset = a
        movl     LDA, %edx

        leal     (%ebx, %edx), %ecx     # a_offset2 = a + lda
        leal     (%ebx, %edx, 2), %eax  /* eax = a + 2 * lda */
        movl     %eax, A

        movl     BUFFER, %esi

        pxor     %xmm4, %xmm4
        pxor     %xmm5, %xmm5

        movapd   0 * SIZE(%esi), %xmm0
        movapd   2 * SIZE(%esi), %xmm2

        movl     MIN_M, %eax
        sarl     $3, %eax               /* number of 8-element groups */
        /* OF is undefined after a shift by more than 1, and JLE reads OF;
           TEST gives the branch architecturally defined flags. */
        testl    %eax, %eax
        jle      .L113
        ALIGN_3

/* Packed loop: 8 rows of 2 columns per iteration. */
.L112:
        movsd    0 * SIZE(%ebx), %xmm1
        movhpd   1 * SIZE(%ebx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm4

        movsd    0 * SIZE(%ecx), %xmm1
        movhpd   1 * SIZE(%ecx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm5
        movapd   4 * SIZE(%esi), %xmm0

        movsd    2 * SIZE(%ebx), %xmm1
        movhpd   3 * SIZE(%ebx), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm4

        movsd    2 * SIZE(%ecx), %xmm1
        movhpd   3 * SIZE(%ecx), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm5
        movapd   6 * SIZE(%esi), %xmm2

        movsd    4 * SIZE(%ebx), %xmm1
        movhpd   5 * SIZE(%ebx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm4

        movsd    4 * SIZE(%ecx), %xmm1
        movhpd   5 * SIZE(%ecx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm5
        movapd   8 * SIZE(%esi), %xmm0

        movsd    6 * SIZE(%ebx), %xmm1
        movhpd   7 * SIZE(%ebx), %xmm1
        addl     $8 * SIZE, %ebx
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm4

        movsd    6 * SIZE(%ecx), %xmm1
        movhpd   7 * SIZE(%ecx), %xmm1
        addl     $8 * SIZE, %ecx
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm5
        movapd   10 * SIZE(%esi), %xmm2

        addl     $8 * SIZE, %esi
        decl     %eax
        jg       .L112
        ALIGN_3

.L113:
        movl     MIN_M, %eax
        andl     $7, %eax               /* leftover rows */
        je       .L115
        ALIGN_3

/* Scalar loop: one row per iteration. */
.L114:
        movsd    0 * SIZE(%ebx), %xmm1
        addl     $SIZE, %ebx
        mulsd    %xmm0, %xmm1
        addsd    %xmm1, %xmm4

        movsd    0 * SIZE(%ecx), %xmm1
        addl     $SIZE, %ecx
        mulsd    %xmm0, %xmm1
        addsd    %xmm1, %xmm5

        movsd    1 * SIZE(%esi), %xmm0  /* next BUFFER element */
        addl     $SIZE, %esi
        decl     %eax
        jg       .L114
        ALIGN_3

/* Horizontal add, scale by xmm3, add into (%ebp). */
.L115:
        movapd   %xmm4, %xmm0
        unpckhpd %xmm4, %xmm4
        addsd    %xmm0, %xmm4

        movapd   %xmm5, %xmm0
        unpckhpd %xmm5, %xmm5
        addsd    %xmm0, %xmm5

        mulsd    %xmm3, %xmm4
        mulsd    %xmm3, %xmm5

        movl     INCY, %eax
        movl     %ebp, %edx
        cmpl     $SIZE, %eax
        jne      .L116                  /* stride differs from one element */

        /* Contiguous destination. */
        movsd    0 * SIZE(%ebp), %xmm1
        movsd    1 * SIZE(%ebp), %xmm2
        addsd    %xmm1, %xmm4
        addsd    %xmm2, %xmm5

        movsd    %xmm4, 0 * SIZE(%ebp)
        movsd    %xmm5, 1 * SIZE(%ebp)
        jmp      .L99
        ALIGN_3

/* Strided destination. */
.L116:
        movsd    0 * SIZE(%edx), %xmm1
        addl     %eax, %edx
        movsd    0 * SIZE(%edx), %xmm2
        addsd    %xmm1, %xmm4
        addsd    %xmm2, %xmm5

        movsd    %xmm4, 0 * SIZE(%ebp)
        addl     %eax, %ebp
        movsd    %xmm5, 0 * SIZE(%ebp)
        jmp      .L99
        ALIGN_3

/* ------------------------------------------------------------------ */
/* One remaining column                                               */
/* ------------------------------------------------------------------ */
.L120:
        movl     A, %ebx                # a_offset = a
        movl     LDA, %edx

        /* ecx is written here but not read anywhere in this path. */
        leal     (%ebx, %edx), %ecx     # a_offset2 = a + lda
        leal     (%ebx, %edx, 1), %eax  /* eax = a + lda */
        movl     %eax, A

        movl     BUFFER, %esi

        pxor     %xmm4, %xmm4

        movapd   0 * SIZE(%esi), %xmm0
        movapd   2 * SIZE(%esi), %xmm2

        movl     MIN_M, %eax
        sarl     $3, %eax               /* number of 8-element groups */
        /* OF is undefined after a shift by more than 1, and JLE reads OF;
           TEST gives the branch architecturally defined flags. */
        testl    %eax, %eax
        jle      .L123
        ALIGN_3

/* Packed loop: 8 rows of 1 column per iteration. */
.L122:
        movsd    0 * SIZE(%ebx), %xmm1
        movhpd   1 * SIZE(%ebx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm4
        movapd   4 * SIZE(%esi), %xmm0

        movsd    2 * SIZE(%ebx), %xmm1
        movhpd   3 * SIZE(%ebx), %xmm1
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm4
        movapd   6 * SIZE(%esi), %xmm2

        movsd    4 * SIZE(%ebx), %xmm1
        movhpd   5 * SIZE(%ebx), %xmm1
        mulpd    %xmm0, %xmm1
        addpd    %xmm1, %xmm4
        movapd   8 * SIZE(%esi), %xmm0

        movsd    6 * SIZE(%ebx), %xmm1
        movhpd   7 * SIZE(%ebx), %xmm1
        addl     $8 * SIZE, %ebx
        mulpd    %xmm2, %xmm1
        addpd    %xmm1, %xmm4
        movapd   10 * SIZE(%esi), %xmm2

        addl     $8 * SIZE, %esi
        decl     %eax
        jg       .L122
        ALIGN_3

.L123:
        movl     MIN_M, %eax
        andl     $7, %eax               /* leftover rows */
        je       .L125
        ALIGN_3

/* Scalar loop: one row per iteration. */
.L124:
        movsd    0 * SIZE(%ebx), %xmm1
        addl     $SIZE, %ebx
        mulsd    %xmm0, %xmm1
        addsd    %xmm1, %xmm4

        movsd    1 * SIZE(%esi), %xmm0  /* next BUFFER element */
        addl     $SIZE, %esi
        decl     %eax
        jg       .L124
        ALIGN_3

/* Horizontal add, scale by xmm3, add into the single element at (%ebp). */
.L125:
        movapd   %xmm4, %xmm0
        unpckhpd %xmm4, %xmm4
        addsd    %xmm0, %xmm4

        mulsd    %xmm3, %xmm4

        movsd    0 * SIZE(%ebp), %xmm1
        addsd    %xmm1, %xmm4
        movsd    %xmm4, 0 * SIZE(%ebp)
        jmp      .L99

        EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -