/* trsm_kernel_rt_8x4_sse.s -- fragment captured from a code-sharing web page.
   The page header (file name and a "font size" control) is preserved here as
   a comment so the file remains valid assembler input. */
mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulss %xmm8, %xmm15#endif#if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9#endif#ifdef LN subq $8 * SIZE, CO1#endif#if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) movss %xmm12, 4 * SIZE(B) movss %xmm13, 5 * SIZE(B) movss %xmm14, 6 * SIZE(B) movss %xmm15, 7 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 
movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 movaps %xmm2, 16 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 movaps %xmm2, 20 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 movaps %xmm2, 24 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 movaps %xmm2, 28 * SIZE(BO)#else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO)#endif#if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 unpcklps %xmm13, %xmm12 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1)#else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1)#endif#ifndef LN addq $8 * SIZE, CO1#endif#if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO#ifdef LT addq $8 * SIZE, B#endif#endif#ifdef LN subq $8, KK movq BORIG, B#endif#ifdef LT addq $8, KK#endif#ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG#endif decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130#ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG#endif#if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO#endif leaq BUFFER, BO#if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3#if defined(LT) || defined(RN) movq KK, %rax#else movq K, %rax subq KK, %rax#endif sarq $3, %rax je .L125 ALIGN_4.L122: mulps %xmm8, %xmm9#if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)#endif movaps 4 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, 
%xmm1 movaps 8 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps 12 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8#if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)#endif mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 24 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps 28 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4.L125:#if defined(LT) || defined(RN) movq KK, %rax#else movq K, %rax subq KK, %rax#endif andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4.L126: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4.L128: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0#if defined(LN) || defined(RT) movq KK, %rax#ifdef LN subq $4, %rax#else subq $1, %rax#endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO#endif#if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, %xmm11#else movaps 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8#endif#ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, 
%xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1#endif#ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11#endif#if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8#endif#ifdef LN subq $4 * SIZE, CO1#endif#if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO)#else movaps %xmm8, 0 * SIZE(AO)#endif#if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1)#else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1)#endif#ifndef 
LN addq $4 * SIZE, CO1#endif#if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO#ifdef LT addq $4 * SIZE, B#endif#endif#ifdef LN subq $4, KK movq BORIG, B#endif#ifdef LT addq $4, KK#endif#ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG#endif ALIGN_4 .L130: testq $2, M je .L140#ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG#endif#if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO#endif leaq BUFFER, BO#if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO#endif movlps 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movlps 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3#if defined(LT) || defined(RN) movq KK, %rax#else movq K, %rax subq KK, %rax#endif sarq $3, %rax je .L135 ALIGN_4.L132: mulps %xmm8, %xmm9#if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)#endif movlps 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movlps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movlps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movlps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movlps 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movlps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movlps 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movlps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movlps 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movlps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movlps 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movlps 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movlps 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movlps 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movlps 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movlps 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4.L135:#if defined(LT) || defined(RN) movq KK, %rax#else 
movq K, %rax subq KK, %rax#endif andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4.L136: mulps %xmm8, %xmm9 movlps 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movlps 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4.L138: addps %xmm1, %xmm0#if defined(LN) || defined(RT) movq KK, %rax#ifdef LN subq $2, %rax#else subq $1, %rax#endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO#endif#if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 subss %xmm0, %xmm1 subss %xmm2, %xmm5#else movsd 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8#endif#ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1#endif#ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5#endif#if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8#endif#ifdef LN subq $2 * SIZE, CO1#endif#if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO)#else movlps %xmm8, 0 * SIZE(AO)#endif#if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1)#else movlps %xmm8, 0 * SIZE(CO1)#endif#ifndef LN addq $2 * SIZE, CO1#endif#if defined(LT) || defined(RN) movq K, %rax subq KK, %rax
/* Residue of the code-sharing page's UI, kept here (translated from
   Chinese) and wrapped as a comment so the file stays assemblable:
     Keyboard shortcuts --
       Copy code:       Ctrl + C
       Search code:     Ctrl + F
       Full screen:     F11
       Toggle theme:    Ctrl + Shift + D
       Show shortcuts:  ?
       Increase font:   Ctrl + =
       Decrease font:   Ctrl + -                                            */