📄 gemv_n_sse.s
字号:
/*
 * Tail of the single-precision GEMV-N SSE kernel (x86-32, AT&T syntax,
 * assembled through the C preprocessor).  The function prologue and the
 * loop heads .L01 / .L21 are outside this excerpt.
 *
 * Layout restored to one statement per line: preprocessor directives must
 * start their own line, and a `#` end-of-line comment must not be followed
 * by further instructions on the same line.
 *
 * Roles visible in this excerpt:
 *   %xmm4-%xmm7  accumulators holding the computed partial results
 *   %xmm3        scale factor applied before the update (presumably alpha
 *                broadcast to all lanes -- confirm against the prologue)
 *   %ecx         address of the output elements being updated
 *   %eax         INCY; it is added directly to %ecx, so it acts as a byte
 *                stride, and INCY == 4 selects the contiguous path
 *                (presumably SIZE == 4 -- confirm)
 *   %esi         iteration count of the 16-wide loop, later M (remainder)
 *   %edi         column count of the current panel ("min_n")
 */

/* ---- finish one 16-element iteration: scale, then y += result ---- */
	mulps	%xmm3, %xmm4
	movl	INCY, %eax
	mulps	%xmm3, %xmm5
	movl	%ecx, %edx
	mulps	%xmm3, %xmm6
	mulps	%xmm3, %xmm7

	cmpl	$4, %eax
	jne	.L29			/* non-contiguous output: scalar path */

/* Contiguous output: load/add/store 16 floats in 64-bit halves. */
#ifdef HAVE_SSE2
	movsd	 0 * SIZE(%ecx), %xmm0
	movhps	 2 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm4
	movsd	 4 * SIZE(%ecx), %xmm1
	movhps	 6 * SIZE(%ecx), %xmm1
	addps	%xmm1, %xmm5
	movsd	 8 * SIZE(%ecx), %xmm2
	movhps	10 * SIZE(%ecx), %xmm2
	addps	%xmm2, %xmm6
	movsd	12 * SIZE(%ecx), %xmm0
	movhps	14 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm7

	/* unpckhpd x, x copies the high 64 bits into the low half so the
	   second movsd stores elements 2-3 of each register. */
	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  2 * SIZE(%ecx)
	movsd	%xmm5,  4 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  6 * SIZE(%ecx)
	movsd	%xmm6,  8 * SIZE(%ecx)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6, 10 * SIZE(%ecx)
	movsd	%xmm7, 12 * SIZE(%ecx)
	unpckhpd %xmm7, %xmm7
	movsd	%xmm7, 14 * SIZE(%ecx)
#else
	movlps	 0 * SIZE(%ecx), %xmm0
	movhps	 2 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm4
	movlps	 4 * SIZE(%ecx), %xmm1
	movhps	 6 * SIZE(%ecx), %xmm1
	addps	%xmm1, %xmm5
	movlps	 8 * SIZE(%ecx), %xmm2
	movhps	10 * SIZE(%ecx), %xmm2
	addps	%xmm2, %xmm6
	movlps	12 * SIZE(%ecx), %xmm0
	movhps	14 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm7

	movlps	%xmm4,  0 * SIZE(%ecx)
	movhps	%xmm4,  2 * SIZE(%ecx)
	movlps	%xmm5,  4 * SIZE(%ecx)
	movhps	%xmm5,  6 * SIZE(%ecx)
	movlps	%xmm6,  8 * SIZE(%ecx)
	movhps	%xmm6, 10 * SIZE(%ecx)
	movlps	%xmm7, 12 * SIZE(%ecx)
	movhps	%xmm7, 14 * SIZE(%ecx)
#endif

	addl	$16 * SIZE, %ecx
	decl	%esi
	jg	.L21

	movl	M, %esi
	andl	$15, %esi		/* leftover elements (M mod 16)? */
	jne	.L100
	jmp	.L99

/*
 * Strided output, 16 elements: one float at a time.
 * shufps $0x39, x, x rotates the four lanes down by one, so the next
 * result moves into lane 0 for the following addss/movss.
 */
.L29:
	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm6, %xmm6
	addss	%xmm0, %xmm6
	movss	%xmm6, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm7, %xmm7
	addss	%xmm0, %xmm7
	movss	%xmm7, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	decl	%esi
	jg	.L21

	movl	M, %esi
	andl	$15, %esi
	jne	.L100

/* ---- panel done: advance A by PLDA_M and loop while IS < N ---- */
.L99:
	movl	PLDA_M, %ebx
	addl	%ebx, A
	movl	N, %edi
	movl	IS, %ecx
	cmpl	%edi, %ecx
	jl	.L01

/* ---- function exit: restore the saved stack pointer and registers ---- */
.L999:
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

/*
 * ---- remainder handling: bits 8, 4, 2, 1 of M are tested in turn ----
 * testl clears OF and the masked result is never negative, so each
 * `jle` below is taken exactly when the tested bit is clear.
 */
.L100:
	movl	M, %esi
	testl	$8, %esi
	jle	.L110

/* 8 leftover elements */
.L101:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movl	A, %edx			# aoffset = a
	addl	$8 * SIZE, A		# a += 8
	movl	BUFFER, %ebx		# boffset = buffer
	movaps	0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax		# i = min_n
	sarl	$2, %eax		/* min_n / 4 unrolled passes */
	jle	.L106

.L104:
	KERNELMACRO8UNROLL( 0)
	KERNELMACRO8UNROLL( 8)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L104

.L106:
	movl	%edi, %eax		# i = min_n
	andl	$3, %eax		/* min_n mod 4 single passes */
	jle	.L108

.L107:
	KERNELMACRO8( 0)
	addl	$4 * SIZE, %ebx
	decl	%eax
	jg	.L107

.L108:
	/* fold the four accumulators into two, then scale */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5
	mulps	%xmm3, %xmm4
	movl	INCY, %eax
	mulps	%xmm3, %xmm5
	movl	%ecx, %edx

	cmpl	$4, %eax
	jne	.L109

#ifdef HAVE_SSE2
	movsd	0 * SIZE(%ecx), %xmm0
	movhps	2 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm4
	movsd	4 * SIZE(%ecx), %xmm1
	movhps	6 * SIZE(%ecx), %xmm1
	addps	%xmm1, %xmm5

	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(%ecx)
	movsd	%xmm5, 4 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5, 6 * SIZE(%ecx)
#else
	movlps	0 * SIZE(%ecx), %xmm0
	movhps	2 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm4
	movlps	4 * SIZE(%ecx), %xmm1
	movhps	6 * SIZE(%ecx), %xmm1
	addps	%xmm1, %xmm5

	movlps	%xmm4, 0 * SIZE(%ecx)
	movhps	%xmm4, 2 * SIZE(%ecx)
	movlps	%xmm5, 4 * SIZE(%ecx)
	movhps	%xmm5, 6 * SIZE(%ecx)
#endif

	addl	$8 * SIZE, %ecx
	jmp	.L110

/* strided output, 8 elements */
.L109:
	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm5, %xmm5
	addss	%xmm0, %xmm5
	movss	%xmm5, 0 * SIZE(%ecx)
	addl	%eax, %ecx

.L110:
	testl	$4, %esi
	jle	.L120

/* 4 leftover elements */
.L111:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movl	A, %edx			# aoffset = a
	addl	$4 * SIZE, A		# a += 4
	movl	BUFFER, %ebx		# boffset = buffer
	movaps	0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax		# i = min_n
	sarl	$2, %eax
	jle	.L116

.L114:
	KERNELMACRO4UNROLL( 0)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L114

.L116:
	movl	%edi, %eax		# i = min_n
	andl	$3, %eax
	jle	.L118

.L117:
	KERNELMACRO4( 0)
	addl	$4 * SIZE, %ebx
	decl	%eax
	jg	.L117

.L118:
	/* fold the four accumulators into %xmm4, then scale */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	%xmm3, %xmm4
	movl	INCY, %eax
	movl	%ecx, %edx

	cmpl	$4, %eax
	jne	.L119

#ifdef HAVE_SSE2
	movsd	0 * SIZE(%ecx), %xmm0
	movhps	2 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4, 2 * SIZE(%ecx)
#else
	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)

	movss	1 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 1 * SIZE(%ecx)

	movss	2 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 2 * SIZE(%ecx)

	movss	3 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 3 * SIZE(%ecx)
#endif

	addl	$4 * SIZE, %ecx
	jmp	.L120

/* strided output, 4 elements */
.L119:
	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

.L120:
	testl	$2, %esi
	jle	.L130

/* 2 leftover elements */
.L121:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movl	A, %edx			# aoffset = a
	addl	$2 * SIZE, A		# a += 2
	movl	BUFFER, %ebx		# boffset = buffer
	movaps	0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax		# i = min_n
	sarl	$2, %eax
	jle	.L126

.L124:
	KERNELMACRO2UNROLL( 0)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L124

.L126:
	movl	%edi, %eax		# i = min_n
	andl	$3, %eax
	jle	.L128

.L127:
	KERNELMACRO2( 0)
	addl	$4 * SIZE, %ebx
	decl	%eax
	jg	.L127

.L128:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	%xmm3, %xmm4
	movl	INCY, %eax
	movl	%ecx, %edx

	cmpl	$4, %eax
	jne	.L129

#ifdef HAVE_SSE2
	movsd	0 * SIZE(%ecx), %xmm0
	addps	%xmm0, %xmm4
	movsd	%xmm4, 0 * SIZE(%ecx)
#else
	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)

	movss	1 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 1 * SIZE(%ecx)
#endif

	addl	$2 * SIZE, %ecx
	jmp	.L130

/* strided output, 2 elements */
.L129:
	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

	movss	0 * SIZE(%ecx), %xmm0
	shufps	$0x39, %xmm4, %xmm4
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	addl	%eax, %ecx

.L130:
	testl	$1, %esi
	jle	.L99

/* 1 leftover element */
.L131:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movl	A, %edx			# aoffset = a
	addl	$1 * SIZE, A		# a += 1
	movl	BUFFER, %ebx		# boffset = buffer
	movaps	0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax		# i = min_n
	sarl	$2, %eax
	jle	.L136

.L134:
	KERNELMACRO1UNROLL( 0)
	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L134

.L136:
	movl	%edi, %eax		# i = min_n
	andl	$3, %eax
	jle	.L138

.L137:
	KERNELMACRO1( 0)
	addl	$4 * SIZE, %ebx
	decl	%eax
	jg	.L137

.L138:
	/* scalar reduction; a single element needs no stride handling */
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4
	mulss	%xmm3, %xmm4

	movss	0 * SIZE(%ecx), %xmm0
	addss	%xmm0, %xmm4
	movss	%xmm4, 0 * SIZE(%ecx)
	jmp	.L99

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -