/* File: gemv_n_sse2.s */
/* (code-viewer residue: font-size control) */
/*
 * Tail of the per-count dispatch (32-bit x86, AT&T syntax, SSE2).
 *
 * Each case below handles k results (k = 7 down to 1, selected by %esi):
 *   1. accumulate into %xmm4..%xmm7 with KERNELMACROk while walking the
 *      buffer at %ebx,
 *   2. scale the accumulators by %xmm3,
 *   3. add the k values currently stored at (%ecx) and write the sums
 *      back to the same locations,
 *   4. jump to .L99 (defined outside this excerpt).
 *
 * Register roles visible in this excerpt:
 *   %ebx  boffset, walks BUFFER (2 * SIZE per KERNELMACRO invocation)
 *   %edx  aoffset (copy of A) during accumulation; afterwards reused as
 *         the read pointer of the strided update
 *   %edi  min_n, the trip count of the accumulation loops
 *   %eax  loop counter during accumulation; afterwards INCY
 *   %ecx  base address of the values being updated
 *   %xmm3 scale factor applied to every accumulator
 *
 * NOTE(review): INCY is added directly to pointers, so it is a byte
 * stride; the "cmpl $8" tests select the contiguous case and presumably
 * mean INCY == SIZE with SIZE == 8 - confirm against the macro headers.
 * NOTE(review): the KERNELMACROk bodies are not visible here; which
 * registers they read or clobber must be checked at their definition.
 * NOTE(review): the beginning of the 7-result case (accumulator zeroing
 * and A pointer setup) precedes this excerpt.
 */

/* ---------------- 7 results (continued) ---------------- */
	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L106

/* Unrolled by 8: consumes 16 * SIZE of the buffer per iteration. */
.L104:
	KERNELMACRO7( 0)
	KERNELMACRO7( 2)
	KERNELMACRO7( 4)
	KERNELMACRO7( 6)
	KERNELMACRO7( 8)
	KERNELMACRO7(10)
	KERNELMACRO7(12)
	KERNELMACRO7(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L104

.L106:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L108

/* Remainder: min_n & 7 single steps. */
.L107:
	KERNELMACRO7( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L107

/* Scale the accumulators and pick the contiguous or strided update. */
.L108:
	movsd	 0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	mulpd	%xmm3, %xmm6
	addl	%eax, %edx			/* edx = address of second element */
	mulsd	%xmm3, %xmm7

	cmpl	$8, %eax
	jne	.L109

	/* Contiguous: gather pairs into xmm0, add, store element-wise. */
	movhpd	 1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	 2 * SIZE(%ecx), %xmm0
	movhpd	 3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	 4 * SIZE(%ecx), %xmm0
	movhpd	 5 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm6
	movsd	 6 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm7

	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4			/* move high half down for the store */
	movsd	%xmm4,  1 * SIZE(%ecx)
	movsd	%xmm5,  2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  3 * SIZE(%ecx)
	movsd	%xmm6,  4 * SIZE(%ecx)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6,  5 * SIZE(%ecx)
	movsd	%xmm7,  6 * SIZE(%ecx)
	jmp	.L99

/* Strided: %edx walks the reads, %ecx walks the writes, step = INCY. */
.L109:
	movhpd	 0 * SIZE(%edx), %xmm0
	addl	%eax, %edx
	addpd	%xmm0, %xmm4
	movsd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movhpd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	addpd	%xmm1, %xmm5
	movsd	 0 * SIZE(%edx), %xmm2
	addl	%eax, %edx
	movhpd	 0 * SIZE(%edx), %xmm2
	addl	%eax, %edx
	addpd	%xmm2, %xmm6
	movsd	 0 * SIZE(%edx), %xmm0
	addsd	%xmm0, %xmm7

	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm6,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm7,  0 * SIZE(%ecx)
	jmp	.L99

/* ---------------- 6 results ---------------- */
.L110:
	cmpl	$6, %esi
	jne	.L120

.L111:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6

	movl	A, %edx				# aoffset = a
	addl	$6 * SIZE, A			# a += 6

	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L116

.L114:
	KERNELMACRO6( 0)
	KERNELMACRO6( 2)
	KERNELMACRO6( 4)
	KERNELMACRO6( 6)
	KERNELMACRO6( 8)
	KERNELMACRO6(10)
	KERNELMACRO6(12)
	KERNELMACRO6(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L114

.L116:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L118

.L117:
	KERNELMACRO6( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L117

.L118:
	movsd	 0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	mulpd	%xmm3, %xmm6
	addl	%eax, %edx

	cmpl	$8, %eax
	jne	.L119

	/* Contiguous update of six elements. */
	movhpd	 1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	 2 * SIZE(%ecx), %xmm0
	movhpd	 3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	 4 * SIZE(%ecx), %xmm0
	movhpd	 5 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm6

	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  1 * SIZE(%ecx)
	movsd	%xmm5,  2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  3 * SIZE(%ecx)
	movsd	%xmm6,  4 * SIZE(%ecx)
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6,  5 * SIZE(%ecx)
	jmp	.L99

/* Strided update of six elements. */
.L119:
	movhpd	 0 * SIZE(%edx), %xmm0
	addl	%eax, %edx
	addpd	%xmm0, %xmm4
	movsd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movhpd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	addpd	%xmm1, %xmm5
	movsd	 0 * SIZE(%edx), %xmm2
	addl	%eax, %edx
	movhpd	 0 * SIZE(%edx), %xmm2
	addpd	%xmm2, %xmm6

	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm6,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm6, %xmm6
	movsd	%xmm6,  0 * SIZE(%ecx)
	jmp	.L99

/* ---------------- 5 results ---------------- */
.L120:
	cmpl	$5, %esi
	jne	.L130

.L121:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6

	movl	A, %edx				# aoffset = a
	addl	$5 * SIZE, A			# a += 5

	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L126

.L124:
	KERNELMACRO5( 0)
	KERNELMACRO5( 2)
	KERNELMACRO5( 4)
	KERNELMACRO5( 6)
	KERNELMACRO5( 8)
	KERNELMACRO5(10)
	KERNELMACRO5(12)
	KERNELMACRO5(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L124

.L126:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L128

.L127:
	KERNELMACRO5( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L127

.L128:
	movsd	 0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	mulsd	%xmm3, %xmm6			/* fifth result is scalar */
	addl	%eax, %edx

	cmpl	$8, %eax
	jne	.L129

	/*
	 * Contiguous update of five elements.  The low half of xmm0 still
	 * holds element 0 from the load at .L128, so it is not reloaded.
	 */
	movhpd	 1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	 2 * SIZE(%ecx), %xmm0
	movhpd	 3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5
	movsd	 4 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm6

	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  1 * SIZE(%ecx)
	movsd	%xmm5,  2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  3 * SIZE(%ecx)
	movsd	%xmm6,  4 * SIZE(%ecx)
	jmp	.L99

/* Strided update of five elements. */
.L129:
	movhpd	 0 * SIZE(%edx), %xmm0
	addl	%eax, %edx
	addpd	%xmm0, %xmm4
	movsd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movhpd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	addpd	%xmm1, %xmm5
	movsd	 0 * SIZE(%edx), %xmm2
	addsd	%xmm2, %xmm6

	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm6,  0 * SIZE(%ecx)
	jmp	.L99

/* ---------------- 4 results ---------------- */
.L130:
	cmpl	$4, %esi
	jne	.L140

.L131:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5

	movl	A, %edx				# aoffset = a
	addl	$4 * SIZE, A			# a += 4

	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L136

.L134:
	KERNELMACRO4( 0)
	KERNELMACRO4( 2)
	KERNELMACRO4( 4)
	KERNELMACRO4( 6)
	KERNELMACRO4( 8)
	KERNELMACRO4(10)
	KERNELMACRO4(12)
	KERNELMACRO4(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L134

.L136:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L138

.L137:
	KERNELMACRO4( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L137

.L138:
	movsd	 0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulpd	%xmm3, %xmm5
	movl	%ecx, %edx
	addl	%eax, %edx

	cmpl	$8, %eax
	jne	.L139

	/* Contiguous update of four elements. */
	movhpd	 1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	 2 * SIZE(%ecx), %xmm0
	movhpd	 3 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm5

	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  1 * SIZE(%ecx)
	movsd	%xmm5,  2 * SIZE(%ecx)
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  3 * SIZE(%ecx)
	jmp	.L99

/* Strided update of four elements. */
.L139:
	movhpd	 0 * SIZE(%edx), %xmm0
	addl	%eax, %edx
	addpd	%xmm0, %xmm4
	movsd	 0 * SIZE(%edx), %xmm1
	addl	%eax, %edx
	movhpd	 0 * SIZE(%edx), %xmm1
	addpd	%xmm1, %xmm5

	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm5,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm5, %xmm5
	movsd	%xmm5,  0 * SIZE(%ecx)
	jmp	.L99

/* ---------------- 3 results ---------------- */
.L140:
	cmpl	$3, %esi
	jne	.L150

.L141:
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5

	movl	A, %edx				# aoffset = a
	addl	$3 * SIZE, A			# a += 3

	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L146

.L144:
	KERNELMACRO3( 0)
	KERNELMACRO3( 2)
	KERNELMACRO3( 4)
	KERNELMACRO3( 6)
	KERNELMACRO3( 8)
	KERNELMACRO3(10)
	KERNELMACRO3(12)
	KERNELMACRO3(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L144

.L146:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L148

.L147:
	KERNELMACRO3( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L147

.L148:
	movsd	 0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	mulsd	%xmm3, %xmm5			/* third result is scalar */
	movl	%ecx, %edx
	addl	%eax, %edx

	cmpl	$8, %eax
	jne	.L149

	/*
	 * Contiguous update of three elements.  The low half of xmm0 still
	 * holds element 0 from the load at .L148, so it is not reloaded.
	 */
	movhpd	 1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4
	movsd	 2 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm5

	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  1 * SIZE(%ecx)
	movsd	%xmm5,  2 * SIZE(%ecx)
	jmp	.L99

/* Strided update of three elements. */
.L149:
	movhpd	 0 * SIZE(%edx), %xmm0
	addl	%eax, %edx
	addpd	%xmm0, %xmm4
	movsd	 0 * SIZE(%edx), %xmm1
	addsd	%xmm1, %xmm5

	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	movsd	%xmm5,  0 * SIZE(%ecx)
	jmp	.L99

/* ---------------- 2 results ---------------- */
.L150:
	cmpl	$2, %esi
	jne	.L160

.L151:
	pxor	%xmm4, %xmm4

	movl	A, %edx				# aoffset = a
	addl	$2 * SIZE, A			# a += 2

	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L156

.L154:
	KERNELMACRO2( 0)
	KERNELMACRO2( 2)
	KERNELMACRO2( 4)
	KERNELMACRO2( 6)
	KERNELMACRO2( 8)
	KERNELMACRO2(10)
	KERNELMACRO2(12)
	KERNELMACRO2(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L154

.L156:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L158

.L157:
	KERNELMACRO2( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L157

.L158:
	movsd	 0 * SIZE(%ecx), %xmm0
	mulpd	%xmm3, %xmm4
	movl	INCY, %eax
	movl	%ecx, %edx
	addl	%eax, %edx

	cmpl	$8, %eax
	jne	.L159

	/*
	 * Contiguous update of two elements.  The low half of xmm0 still
	 * holds element 0 from the load at .L158, so it is not reloaded.
	 */
	movhpd	 1 * SIZE(%ecx), %xmm0
	addpd	%xmm0, %xmm4

	movsd	%xmm4,  0 * SIZE(%ecx)
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  1 * SIZE(%ecx)
	jmp	.L99

/* Strided update of two elements. */
.L159:
	movhpd	 0 * SIZE(%edx), %xmm0
	addl	%eax, %edx
	addpd	%xmm0, %xmm4

	movsd	%xmm4,  0 * SIZE(%ecx)
	addl	%eax, %ecx
	unpckhpd %xmm4, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	jmp	.L99

/* ---------------- 1 result (fall-through default) ---------------- */
.L160:
	pxor	%xmm4, %xmm4

	movl	A, %edx				# aoffset = a
	addl	$1 * SIZE, A			# a += 1

	movl	BUFFER, %ebx			# boffset = buffer
	movapd	 0 * SIZE(%ebx), %xmm0

	movl	%edi, %eax			# i = min_n
	sarl	$3,   %eax
	jle	.L166

.L164:
	KERNELMACRO1( 0)
	KERNELMACRO1( 2)
	KERNELMACRO1( 4)
	KERNELMACRO1( 6)
	KERNELMACRO1( 8)
	KERNELMACRO1(10)
	KERNELMACRO1(12)
	KERNELMACRO1(14)

	addl	$16 * SIZE, %ebx
	decl	%eax
	jg	.L164

.L166:
	movl	%edi, %eax			# i = min_n
	andl	$7,   %eax
	jle	.L168

.L167:
	KERNELMACRO1( 0)
	addl	$2 * SIZE, %ebx
	decl	%eax
	jg	.L167

/* Single element: no stride distinction is needed. */
.L168:
	mulsd	%xmm3, %xmm4
	movsd	 0 * SIZE(%ecx), %xmm0
	addsd	%xmm0, %xmm4
	movsd	%xmm4,  0 * SIZE(%ecx)
	jmp	.L99

	EPILOGUE
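/*
 * Code-viewer keyboard shortcuts (page residue, not part of the program):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */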