📄 gemv_t_sse.s
# Continuation of the INCY == 1 store path of the four-column loop (.L51):
# add the four accumulated dot products into y[0..3] and advance y.
        addss   %xmm2, %xmm5
        movss   2 * SIZE(%ebp), %xmm1
        movss   3 * SIZE(%ebp), %xmm2
        addss   %xmm1, %xmm6
        addss   %xmm2, %xmm7

        movss   %xmm4, 0 * SIZE(%ebp)
        movss   %xmm5, 1 * SIZE(%ebp)
        movss   %xmm6, 2 * SIZE(%ebp)
        movss   %xmm7, 3 * SIZE(%ebp)

        addl    $4 * SIZE, %ebp
        decl    J
        jg      .L51

        movl    N, %esi
        andl    $3, %esi
        jne     .L100
        jmp     .L99
        ALIGN_3

.L56:   # strided y (INCY != 1): gather, accumulate and scatter four y elements
        movss   0 * SIZE(%edx), %xmm1
        addl    %eax, %edx
        movss   0 * SIZE(%edx), %xmm2
        addl    %eax, %edx
        addss   %xmm1, %xmm4
        addss   %xmm2, %xmm5
        movss   0 * SIZE(%edx), %xmm1
        addl    %eax, %edx
        movss   0 * SIZE(%edx), %xmm2
        addss   %xmm1, %xmm6
        addss   %xmm2, %xmm7

        movss   %xmm4, 0 * SIZE(%ebp)
        addl    %eax, %ebp
        movss   %xmm5, 0 * SIZE(%ebp)
        addl    %eax, %ebp
        movss   %xmm6, 0 * SIZE(%ebp)
        addl    %eax, %ebp
        movss   %xmm7, 0 * SIZE(%ebp)
        addl    %eax, %ebp

        decl    J
        jg      .L51

        movl    N, %esi
        andl    $3, %esi
        jne     .L100
        ALIGN_3

.L99:   # advance A by NLDA; continue with the next row block while IS + P < M
        movl    A, %ebx
        addl    NLDA, %ebx
        movl    %ebx, A

        movl    IS, %esi
        addl    $P, %esi
        cmpl    M, %esi
        jl      .L10
        ALIGN_3

.L999:  # epilogue: restore callee-saved registers and return
        movl    OLD_STACK, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret
        ALIGN_3

.L100:  # dispatch on the remaining N mod 4 columns
        movl    N, %esi
        andl    $3, %esi
        cmpl    $3, %esi
        jne     .L110
        ALIGN_3

.L101:  # tail case: three columns left
        movl    A, %ebx                 # a_offset = a
        movl    LDA, %edx
        leal    (%ebx, %edx), %ecx      # a_offset2 = a + lda
        leal    (%ebx, %edx, 2), %eax
        addl    %edx, %eax
        movl    %eax, A

        movl    BUFFER, %esi

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5
        pxor    %xmm6, %xmm6

        movaps  0 * SIZE(%esi), %xmm0
        movaps  4 * SIZE(%esi), %xmm2

        movl    MIN_M, %eax
        sarl    $3, %eax
        jle     .L103
        ALIGN_3

.L102:  # unrolled by 8 rows: three column dot products against the vector in BUFFER
        movsd   0 * SIZE(%ebx), %xmm1
        movhps  2 * SIZE(%ebx), %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm1, %xmm4

        movsd   0 * SIZE(%ecx), %xmm1
        movhps  2 * SIZE(%ecx), %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm1, %xmm5

        movsd   0 * SIZE(%ebx, %edx, 2), %xmm1
        movhps  2 * SIZE(%ebx, %edx, 2), %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm1, %xmm6

        movaps  8 * SIZE(%esi), %xmm0

        movsd   4 * SIZE(%ebx), %xmm1
        movhps  6 * SIZE(%ebx), %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm1, %xmm4

        movsd   4 * SIZE(%ecx), %xmm1
        movhps  6 * SIZE(%ecx), %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm1, %xmm5

        movsd   4 * SIZE(%ebx, %edx, 2), %xmm1
        movhps  6 * SIZE(%ebx, %edx, 2), %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm1, %xmm6

        movaps  12 * SIZE(%esi), %xmm2

        addl    $8 * SIZE, %ecx
        addl    $8 * SIZE, %ebx
        addl    $8 * SIZE, %esi

        decl    %eax
        jg      .L102
        ALIGN_3

.L103:
        movl    MIN_M, %eax
        andl    $7, %eax
        je      .L105
        ALIGN_3

.L104:  # scalar remainder loop: one row per iteration
        movss   0 * SIZE(%ebx), %xmm1
        mulss   %xmm0, %xmm1
        addss   %xmm1, %xmm4

        movss   0 * SIZE(%ecx), %xmm1
        addl    $SIZE, %ecx
        mulss   %xmm0, %xmm1
        addss   %xmm1, %xmm5

        movss   0 * SIZE(%ebx, %edx, 2), %xmm1
        addl    $SIZE, %ebx
        mulss   %xmm0, %xmm1
        addss   %xmm1, %xmm6

        movss   1 * SIZE(%esi), %xmm0
        addl    $SIZE, %esi

        decl    %eax
        jg      .L104
        ALIGN_3

.L105:  # horizontal-sum each accumulator, scale by %xmm3, then update y
        movaps  %xmm4, %xmm0
        shufps  $0xe, %xmm4, %xmm4
        addps   %xmm0, %xmm4
        movaps  %xmm5, %xmm0
        shufps  $0xe, %xmm5, %xmm5
        addps   %xmm0, %xmm5
        movaps  %xmm6, %xmm0
        shufps  $0xe, %xmm6, %xmm6
        addps   %xmm0, %xmm6

        movaps  %xmm4, %xmm0
        shufps  $0x39, %xmm4, %xmm4
        addss   %xmm0, %xmm4
        movaps  %xmm5, %xmm0
        shufps  $0x39, %xmm5, %xmm5
        addss   %xmm0, %xmm5
        movaps  %xmm6, %xmm0
        shufps  $0x39, %xmm6, %xmm6
        addss   %xmm0, %xmm6

        mulss   %xmm3, %xmm4
        mulss   %xmm3, %xmm5
        mulss   %xmm3, %xmm6

        movl    INCY, %eax
        movl    %ebp, %edx
        cmpl    $SIZE, %eax
        jne     .L106

        movss   0 * SIZE(%ebp), %xmm1
        movss   1 * SIZE(%ebp), %xmm2
        addss   %xmm1, %xmm4
        addss   %xmm2, %xmm5
        movss   2 * SIZE(%ebp), %xmm1
        addss   %xmm1, %xmm6

        movss   %xmm4, 0 * SIZE(%ebp)
        movss   %xmm5, 1 * SIZE(%ebp)
        movss   %xmm6, 2 * SIZE(%ebp)
        jmp     .L99
        ALIGN_3

.L106:  # strided y (INCY != 1)
        movss   0 * SIZE(%edx), %xmm1
        addl    %eax, %edx
        movss   0 * SIZE(%edx), %xmm2
        addl    %eax, %edx
        addss   %xmm1, %xmm4
        addss   %xmm2, %xmm5
        movss   0 * SIZE(%edx), %xmm1
        addss   %xmm1, %xmm6

        movss   %xmm4, 0 * SIZE(%ebp)
        addl    %eax, %ebp
        movss   %xmm5, 0 * SIZE(%ebp)
        addl    %eax, %ebp
        movss   %xmm6, 0 * SIZE(%ebp)
        jmp     .L99
        ALIGN_3

.L110:
        cmpl    $2, %esi
        jne     .L120
        ALIGN_3

.L111:  # tail case: two columns left
        movl    A, %ebx                 # a_offset = a
        movl    LDA, %edx
        leal    (%ebx, %edx), %ecx      # a_offset2 = a + lda
        leal    (%ebx, %edx, 2), %eax
        movl    %eax, A

        movl    BUFFER, %esi

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5

        movaps  0 * SIZE(%esi), %xmm0
        movaps  4 * SIZE(%esi), %xmm2

        movl    MIN_M, %eax
        sarl    $3, %eax
        jle     .L113
        ALIGN_3

.L112:  # unrolled by 8 rows: two column dot products
        movsd   0 * SIZE(%ebx), %xmm1
        movhps  2 * SIZE(%ebx), %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm1, %xmm4

        movsd   0 * SIZE(%ecx), %xmm1
        movhps  2 * SIZE(%ecx), %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm1, %xmm5

        movaps  8 * SIZE(%esi), %xmm0

        movsd   4 * SIZE(%ebx), %xmm1
        movhps  6 * SIZE(%ebx), %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm1, %xmm4

        movsd   4 * SIZE(%ecx), %xmm1
        movhps  6 * SIZE(%ecx), %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm1, %xmm5

        movaps  12 * SIZE(%esi), %xmm2

        addl    $8 * SIZE, %ebx
        addl    $8 * SIZE, %ecx
        addl    $8 * SIZE, %esi

        decl    %eax
        jg      .L112
        ALIGN_3

.L113:
        movl    MIN_M, %eax
        andl    $7, %eax
        je      .L115
        ALIGN_3

.L114:  # scalar remainder loop
        movss   0 * SIZE(%ebx), %xmm1
        addl    $SIZE, %ebx
        mulss   %xmm0, %xmm1
        addss   %xmm1, %xmm4

        movss   0 * SIZE(%ecx), %xmm1
        addl    $SIZE, %ecx
        mulss   %xmm0, %xmm1
        addss   %xmm1, %xmm5

        movss   1 * SIZE(%esi), %xmm0
        addl    $SIZE, %esi

        decl    %eax
        jg      .L114
        ALIGN_3

.L115:  # horizontal sums, scale by %xmm3, update y
        movaps  %xmm4, %xmm0
        shufps  $0xe, %xmm4, %xmm4
        addps   %xmm0, %xmm4
        movaps  %xmm5, %xmm0
        shufps  $0xe, %xmm5, %xmm5
        addps   %xmm0, %xmm5

        movaps  %xmm4, %xmm0
        shufps  $0x39, %xmm4, %xmm4
        addss   %xmm0, %xmm4
        movaps  %xmm5, %xmm0
        shufps  $0x39, %xmm5, %xmm5
        addss   %xmm0, %xmm5

        mulss   %xmm3, %xmm4
        mulss   %xmm3, %xmm5

        movl    INCY, %eax
        movl    %ebp, %edx
        cmpl    $SIZE, %eax
        jne     .L116

        movss   0 * SIZE(%ebp), %xmm1
        movss   1 * SIZE(%ebp), %xmm2
        addss   %xmm1, %xmm4
        addss   %xmm2, %xmm5

        movss   %xmm4, 0 * SIZE(%ebp)
        movss   %xmm5, 1 * SIZE(%ebp)
        jmp     .L99
        ALIGN_3

.L116:  # strided y (INCY != 1)
        movss   0 * SIZE(%edx), %xmm1
        addl    %eax, %edx
        movss   0 * SIZE(%edx), %xmm2
        addss   %xmm1, %xmm4
        addss   %xmm2, %xmm5

        movss   %xmm4, 0 * SIZE(%ebp)
        addl    %eax, %ebp
        movss   %xmm5, 0 * SIZE(%ebp)
        jmp     .L99
        ALIGN_3

.L120:  # tail case: one column left
        movl    A, %ebx                 # a_offset = a
        movl    LDA, %edx
        leal    (%ebx, %edx), %ecx      # a_offset2 = a + lda
        leal    (%ebx, %edx, 1), %eax
        movl    %eax, A

        movl    BUFFER, %esi

        pxor    %xmm4, %xmm4

        movaps  0 * SIZE(%esi), %xmm0
        movaps  4 * SIZE(%esi), %xmm2

        movl    MIN_M, %eax
        sarl    $3, %eax
        jle     .L123
        ALIGN_3

.L122:  # unrolled by 8 rows: single column dot product
        movsd   0 * SIZE(%ebx), %xmm1
        movhps  2 * SIZE(%ebx), %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm1, %xmm4

        movaps  8 * SIZE(%esi), %xmm0

        movsd   4 * SIZE(%ebx), %xmm1
        movhps  6 * SIZE(%ebx), %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm1, %xmm4

        movaps  12 * SIZE(%esi), %xmm2

        addl    $8 * SIZE, %ebx
        addl    $8 * SIZE, %esi

        decl    %eax
        jg      .L122
        ALIGN_3

.L123:
        movl    MIN_M, %eax
        andl    $7, %eax
        je      .L125
        ALIGN_3

.L124:  # scalar remainder loop
        movss   0 * SIZE(%ebx), %xmm1
        addl    $SIZE, %ebx
        mulss   %xmm0, %xmm1
        addss   %xmm1, %xmm4

        movss   1 * SIZE(%esi), %xmm0
        addl    $SIZE, %esi

        decl    %eax
        jg      .L124
        ALIGN_3

.L125:  # horizontal sum, scale by %xmm3, update y[0]
        movaps  %xmm4, %xmm0
        shufps  $0xe, %xmm4, %xmm4
        addps   %xmm0, %xmm4
        movaps  %xmm4, %xmm0
        shufps  $0x39, %xmm4, %xmm4
        addss   %xmm0, %xmm4

        mulss   %xmm3, %xmm4

        movss   0 * SIZE(%ebp), %xmm1
        addss   %xmm1, %xmm4
        movss   %xmm4, 0 * SIZE(%ebp)
        jmp     .L99

        EPILOGUE
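For reference, each of the tail paths above (.L101, .L111, .L120) computes the remaining one to three column dot products of the matrix panel against the vector staged in BUFFER, scales them, and adds them into y. Below is a minimal C sketch of that computation, not the kernel itself; the names min_m, ncols, alpha, and incy are assumptions standing in for the MIN_M macro, N mod 4, the scale factor held in %xmm3, and INCY.

/* Illustrative sketch only: what one pass of the N mod 4 tail computes
 * for a column-major panel of min_m rows. */
static void sgemv_t_tail(int min_m, int ncols, float alpha,
                         const float *a, int lda,   /* column-major panel   */
                         const float *x,            /* vector from BUFFER   */
                         float *y, int incy)
{
    for (int j = 0; j < ncols; j++) {               /* ncols is N % 4 (1..3) */
        float sum = 0.0f;
        for (int i = 0; i < min_m; i++)             /* .L102/.L104-style loop */
            sum += a[j * lda + i] * x[i];
        y[j * incy] += alpha * sum;                 /* .L105: scale and update */
    }
}

The SSE code keeps four partial sums per column in one xmm register (eight rows per iteration of the unrolled loops), so the shufps $0xe / addps and shufps $0x39 / addss pairs at .L105, .L115 and .L125 are simply the horizontal reduction of those four lanes into the single scalar sum used above.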