asum_sse_x1.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 345 行
C
345 行
#
# Sum-of-absolute-values kernel (ATL_UASUM) written in gas x86 assembly and
# run through the C preprocessor.
#
# Two variants are selected at preprocessing time:
#   SREAL defined     : single precision, SSE1, elements are 4 bytes wide
#   SREAL not defined : double precision, SSE2, elements are 8 bytes wide
# Both variants support the x86-32 (ATL_GAS_x8632) and x86-64 (ATL_GAS_x8664)
# gas targets.
#
# Arguments, as read by this code:
#   N    : x86-32 from the stack, x86-64 from %edi (sign-extended to 64 bits)
#   X    : x86-32 from the stack, x86-64 from %rsi
#   incX : declared in the prototype comment but never read here; the elements
#          are walked contiguously
# Returns the sum of |X[i]|: on x86-32 on the x87 stack, on x86-64 in %xmm0.
#
# NOTE(review): no guard for N <= 0 is visible in either variant; presumably
# callers guarantee N > 0 -- confirm against the calling wrapper.
#
#include "atlas_asm.h"
#ifdef SREAL
#
# ============================ single precision ============================
#
#ifndef ATL_SSE1
   #error "This kernel requires SSE1"
#endif
#
# On x86-32, map the 64-bit mnemonics and register names used below onto
# their 32-bit counterparts so one instruction stream serves both targets.
#
#ifdef ATL_GAS_x8632
   #define movq movl
   #define addq addl
   #define subq subl
   #define rsp  esp
   #define rax  eax
#elif !defined(ATL_GAS_x8664)
   #error "This kernel requires a gas x86 assembler!"
#endif
#
# Integer register roles:
#   N    = remaining element count
#   X    = current read pointer
#   stX  = scratch: aligned copy of X, later the end pointer of the vector loop
#   stXF = pointer one past the last element of the whole vector
#
#ifdef ATL_GAS_x8632
   #define N     %eax
   #define X     %edx
   #define stX   %ecx
   #define stXF  %ebx
#else
   #define N     %rax
   #define X     %rsi
   #define stX   %rdi
   #define stXF  %rdx
#endif
#
# SSE register roles: absval holds the sign-clearing mask, rX0-rX3 hold loaded
# data, sum0-sum2 are independent accumulators.
#
#define absval  %xmm0
#define rX0     %xmm1
#define rX1     %xmm2
#define rX2     %xmm3
#define rX3     %xmm4
#define sum0    %xmm5
#define sum1    %xmm6
#define sum2    %xmm7
#
# BYTE gives the x86-32 stack offsets of N and X at function entry
# (they are read below at 20 and 24 after %esp has been lowered by 16).
#
# BYTE:                  4               8
# TYPE ATL_UASUM(const int N, const TYPE *X, const int incX)
        .text
.global ATL_asmdecor(ATL_UASUM)
ATL_asmdecor(ATL_UASUM):
#
# Scratch space: x86-32 reserves 16 bytes of stack; x86-64 uses the 16 bytes
# just below %rsp without moving it.
#
#ifdef ATL_GAS_x8632
        subl    $16, %esp
   #define OFF 0
#else
   #define OFF -16
#endif
#
# Temporarily store 1.0 and -1.0 to stack
#
        fld1
        fldz
        fsub    %st(1), %st
        fstps   OFF(%rsp)
        fstps   OFF+4(%rsp)
#
# absval = (-1.0 ^ 1.0) = sign bit
#
        movss   OFF(%rsp), absval
        movss   OFF+4(%rsp), rX1
        xorps   rX1, absval
#
# eax = all bits set
# andnps then yields (NOT sign bit) AND all-ones, i.e. a mask that clears only
# the sign bit; shufps copies that mask into all four lanes.
#
        xorl    %eax, %eax
        notl    %eax
        movl    %eax, OFF(%rsp)
        movss   OFF(%rsp), rX1
        andnps  rX1, absval
        shufps  $0x00, absval, absval
#ifdef ATL_GAS_x8632
#
# Save iregs
#
        movl    %ebx, (%esp)
#
# N = N, X = X, stXF = X + N
#
        movl    20(%esp), N
        movl    24(%esp), X
#else
#
# Copy the 32-bit count out of %edi and sign-extend it into %rax before %rdi
# is reused as stX.
#
        movl    %edi, %eax
        cltq
#endif
#
# End pointer: stXF = X + N * 4 bytes
#
        movq    N, stXF
        shl     $2, stXF
        addq    X, stXF
#
# Get X aligned to 16 byte boundary
# (stX = X rounded down to a multiple of 16; a mismatch means X is unaligned)
#
        xorps   sum0, sum0
        movq    X, stX
        shr     $4, stX
        shl     $4, stX
        cmp     X, stX
        jne     FORCE_ALIGN
#
# X is 16-byte aligned here.  The vector loop handles 16 floats (64 bytes) per
# iteration; with fewer than 16 elements left, go to the scalar path.
# Otherwise stX = X + (N / 16) * 64 marks where the vector loop stops.
#
ALIGNED_START:
        movq    N, stX
        shr     $4, stX
        jz      UNALIGNED
        shl     $6, stX
        addq    X, stX
        xorps   sum1, sum1
        xorps   sum2, sum2
#
# Main loop: four aligned 16-byte loads, sign bits masked off, results spread
# over three accumulators (sum0 receives two of the four vectors).
#
ALIGNED_LOOP:
        movaps  (X), rX0
        movaps  16(X), rX1
        movaps  32(X), rX2
        movaps  48(X), rX3
        andps   absval, rX0
#
# Prefetch distance is chosen per architecture (Hammer versus everything else)
#
   #if defined(ATL_ARCH_HAMMER64) || defined(ATL_ARCH_HAMMER32)
        prefetchnta 396(X)
   #else
        prefetchnta 296(X)
   #endif
        andps   absval, rX1
        addps   rX0, sum0
        andps   absval, rX2
        addps   rX1, sum1
        andps   absval, rX3
        addps   rX2, sum2
        addps   rX3, sum0
        addq    $64, X
        cmp     X, stX
        jne     ALIGNED_LOOP
#
# Combine the three accumulators, then fold the four lanes of sum0 into its
# low lane: high pair onto low pair, then lane 1 onto lane 0.
#
        addps   sum1, sum0
        addps   sum2, sum0
        movhlps sum0, sum1
        addps   sum1, sum0
        movss   sum0, sum1
        shufps  $0x55, sum0, sum0
        addss   sum1, sum0
#
# Any elements left past the vectorised part are handled one at a time
#
        cmp     X, stXF
        jne     UNALIGNED_LOOP
#
# Restore iregs, return value
# (x86-32: the result goes through memory onto the x87 stack and the 16
#  scratch bytes are released; x86-64: the result is moved into %xmm0)
#
DONE:
#ifdef ATL_GAS_x8632
        movl    (%esp), %ebx
        movss   sum0, (%esp)
        flds    (%esp)
        addl    $16, %esp
#else
        movss   sum0, %xmm0
#endif
        ret
#
# Scalar peel: add one element at a time until X reaches a 16-byte boundary
# (then enter the vector path) or the end of the vector (then finish).
#
FORCE_ALIGN:
        movss   (X), rX0
        andps   absval, rX0
        addss   rX0, sum0
        addq    $4, X
        movq    X, stX
        shr     $4, stX
        shl     $4, stX
        dec     N
        cmp     X, stX
        je      ALIGNED_START
        cmp     X, stXF
        jne     FORCE_ALIGN
        jmp     DONE
#
# Scalar path: finish immediately when nothing is left, otherwise add one
# element per iteration until X reaches stXF.
#
UNALIGNED:
        cmp     X, stXF
        je      DONE
UNALIGNED_LOOP:
        movss   (X), rX0
        andps   absval, rX0
        addss   rX0, sum0
        addq    $4, X
        cmp     X, stXF
        jne     UNALIGNED_LOOP
        jmp     DONE
#else
#
# ============================ double precision ============================
#
#ifndef ATL_SSE2
   #error "This kernel requires SSE2"
#endif
#
# On x86-32, map the 64-bit mnemonics and register names used below onto
# their 32-bit counterparts so one instruction stream serves both targets.
#
#ifdef ATL_GAS_x8632
   #define movq movl
   #define addq addl
   #define subq subl
   #define rsp  esp
   #define rax  eax
#elif !defined(ATL_GAS_x8664)
   #error "This kernel requires a gas x86 assembler!"
#endif
#
# Integer register roles:
#   N    = remaining element count
#   X    = current read pointer
#   stX  = scratch: aligned copy of X, later the end pointer of the vector loop
#   stXF = pointer one past the last element of the whole vector
#
#ifdef ATL_GAS_x8632
   #define N     %eax
   #define X     %edx
   #define stX   %ecx
   #define stXF  %ebx
#else
   #define N     %rax
   #define X     %rsi
   #define stX   %rdi
   #define stXF  %rdx
#endif
#
# SSE register roles: absval holds the sign-clearing mask, rX0-rX3 hold loaded
# data, sum0-sum2 are independent accumulators.
#
#define absval  %xmm0
#define rX0     %xmm1
#define rX1     %xmm2
#define rX2     %xmm3
#define rX3     %xmm4
#define sum0    %xmm5
#define sum1    %xmm6
#define sum2    %xmm7
#
# BYTE gives the x86-32 stack offsets of N and X at function entry
# (they are read below at 20 and 24 after %esp has been lowered by 16).
#
# BYTE:                  4               8
# TYPE ATL_UASUM(const int N, const TYPE *X, const int incX)
        .text
.global ATL_asmdecor(ATL_UASUM)
ATL_asmdecor(ATL_UASUM):
#
# Scratch space: x86-32 reserves 16 bytes of stack; x86-64 uses the 16 bytes
# just below %rsp without moving it.
#
#ifdef ATL_GAS_x8632
        subl    $16, %esp
   #define OFF 0
#else
   #define OFF -16
#endif
#
# Temporarily store 1.0 and -1.0 to stack
#
        fld1
        fldz
        fsub    %st(1), %st
        fstpl   OFF(%rsp)
        fstpl   OFF+8(%rsp)
#
# absval = (-1.0 ^ 1.0) = sign bit
#
        movlpd  OFF(%rsp), absval
        movlpd  OFF+8(%rsp), rX1
        xorpd   rX1, absval
#
# eax = all bits set
# Written twice to form 64 set bits; andnpd then yields (NOT sign bit) AND
# all-ones, i.e. a mask that clears only the sign bit; unpcklpd copies that
# mask into both lanes.
#
        xorl    %eax, %eax
        notl    %eax
        movl    %eax, OFF(%rsp)
        movl    %eax, OFF+4(%rsp)
        movlpd  OFF(%rsp), rX1
        andnpd  rX1, absval
        unpcklpd absval, absval
#ifdef ATL_GAS_x8632
#
# Save iregs
#
        movl    %ebx, (%esp)
#
# N = N, X = X, stXF = X + N
#
        movl    20(%esp), N
        movl    24(%esp), X
#else
#
# Copy the 32-bit count out of %edi and sign-extend it into %rax before %rdi
# is reused as stX.
#
        movl    %edi, %eax
        cltq
#endif
#
# End pointer: stXF = X + N * 8 bytes
#
        movq    N, stXF
        shl     $3, stXF
        addq    X, stXF
#
# If X is not aligned to 16 byte boundary, peel 1 iteration
# (the peeled element is loaded straight into the low lane of the zeroed sum0
#  and has its sign bit masked off; finish at once when it was the only one)
#
        xorpd   sum0, sum0
        movq    X, stX
        shr     $4, stX
        shl     $4, stX
        cmp     X, stX
        je      ALIGNED_START
        movlpd  (X), sum0
        andpd   absval, sum0
        addq    $8, X
        dec     N
        jz      DONE
#
# If still not aligned after peeling, go to unaligned loop
#
        movq    X, stX
        shr     $4, stX
        shl     $4, stX
        cmp     X, stX
        jne     UNALIGNED_LOOP
#
# X is 16-byte aligned here.  The vector loop handles 8 doubles (64 bytes) per
# iteration; with fewer than 8 elements left, go to the scalar loop.
# Otherwise stX = X + (N / 8) * 64 marks where the vector loop stops.
#
ALIGNED_START:
        movq    N, stX
        shr     $3, stX
        jz      UNALIGNED_LOOP
        shl     $6, stX
        addq    X, stX
        xorpd   sum1, sum1
        xorpd   sum2, sum2
#
# Main loop: four aligned 16-byte loads, sign bits masked off, results spread
# over three accumulators (sum0 receives two of the four vectors).
#
ALIGNED_LOOP:
        movapd  (X), rX0
        movapd  16(X), rX1
        movapd  32(X), rX2
        movapd  48(X), rX3
        andpd   absval, rX0
#
# Prefetch distance is chosen per architecture (Hammer versus everything else)
#
   #if defined(ATL_ARCH_HAMMER64) || defined(ATL_ARCH_HAMMER32)
        prefetchnta 640(X)
   #else
        prefetchnta 1024(X)
   #endif
        andpd   absval, rX1
        addpd   rX0, sum0
        andpd   absval, rX2
        addpd   rX1, sum1
        andpd   absval, rX3
        addpd   rX2, sum2
        addpd   rX3, sum0
        addq    $64, X
        cmp     X, stX
        jne     ALIGNED_LOOP
#
# Combine the three accumulators, then add the high lane of sum0 onto its
# low lane.
#
        addpd   sum1, sum0
        addpd   sum2, sum0
        movapd  sum0, sum1
        unpckhpd sum1, sum1
        addsd   sum1, sum0
#
# Any elements left past the vectorised part are handled one at a time
#
        cmp     X, stXF
        jne     UNALIGNED_LOOP
#
# Restore iregs, return value
# (x86-32: the result goes through memory onto the x87 stack and the 16
#  scratch bytes are released; x86-64: the result is moved into %xmm0)
#
DONE:
#ifdef ATL_GAS_x8632
        movl    (%esp), %ebx
        movlpd  sum0, (%esp)
        fldl    (%esp)
        addl    $16, %esp
#else
        movsd   sum0, %xmm0
#endif
        ret
#
# Scalar loop: add one element per iteration until X reaches stXF.
# NOTE(review): the loop body runs before the end test, so it is entered with
# at least one element assumed to remain -- confirm callers never pass N = 0.
#
UNALIGNED_LOOP:
        movlpd  (X), rX0
        andpd   absval, rX0
        addsd   rX0, sum0
        addq    $8, X
        cmp     X, stXF
        jne     UNALIGNED_LOOP
        jmp     DONE
#endif
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?