daxpy_sse2.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 138 行
C
138 行
/*
 * SSE2 kernel for double-precision AXPY on contiguous vectors:
 *
 *     Y[i] += alpha * X[i],   i = 0 .. N-1
 *
 * C prototype (32-bit stack offsets on entry: N at 4, alpha at 8,
 * X at 16, incX at 20, Y at 24):
 *
 *     void ATL_UAXPY(const int N, const SCALAR alpha, const TYPE *X,
 *                    const int incX, TYPE *Y, const int incY)
 *
 * incX and incY are never read; elements are always taken 8 bytes apart.
 *
 * Strategy:
 *   1. N <= 4: everything is done by the scalar loop (SLOOP).
 *   2. Otherwise, if Y is not on a 16-byte boundary, one element is
 *      processed with scalar instructions and X/Y advance by 8 bytes.
 *      (Assumes Y is at least 8-byte aligned, so that this single step
 *      reaches a 16-byte boundary -- TODO confirm with callers.)
 *   3. Pairs of elements are then processed with packed instructions:
 *      NLOOP when X is 16-byte aligned as well (aligned loads plus
 *      prefetch), UNLOOP otherwise (unaligned loads of X, no prefetch).
 *   4. A possible odd element left over is handled by SLOOP.
 *
 * Comments are written in C style so that the preprocessor removes them
 * and can never mistake one for a directive.
 */
#include "atlas_asm.h"

#ifndef ATL_SSE2
   #error "This kernel requires SSE2"
#endif

#ifdef ATL_GAS_x8632
   /* 32-bit build: map the 64-bit mnemonics used below to 32-bit ones. */
   #define movq movl
   #define addq addl
   #define subq subl
   #define X    %eax
   #define Y    %edx
   #define N    %ecx
   #define Nr   %ebp
#elif defined(ATL_GAS_x8664)
   #define N    %rdi
   #define X    %rsi
   #define Y    %rcx
   #define Nr   %rdx
#else
   #error "This kernel requires x86 assembly!"
#endif

#define alpha   %xmm0   /* alpha, duplicated into both halves */
#define rY0     %xmm1   /* element(s) of Y being updated      */
#define rX0     %xmm2   /* element(s) of X, then alpha * X    */

/* Prefetch distance in bytes ahead of the current position (overridable). */
#ifndef PFDIST
   #ifdef ATL_ARCH_P4E
      #define PFDIST 384
   #else
      #define PFDIST 416
   #endif
#endif

        .text
.global ATL_asmdecor(ATL_UAXPY)
ATL_asmdecor(ATL_UAXPY):
#ifdef ATL_GAS_x8632
   #define OFF 8
/*
 *      Save the callee-saved registers and fetch the arguments from the
 *      stack.  %ebx is saved and restored although nothing below writes it.
 */
        subl    $OFF, %esp
        movl    %ebp, (%esp)
        movl    %ebx, 4(%esp)
        movl    OFF+4(%esp), N
        movlpd  OFF+8(%esp), alpha
        movl    OFF+16(%esp), X
        movl    OFF+24(%esp), Y
#else
/*
 *      N is a 32-bit int, and the upper half of its 64-bit register is not
 *      defined by the calling convention.  Widen it before it is used in
 *      64-bit compares, address arithmetic and loop counters.
 */
        movslq  %edi, N
#endif
        unpcklpd        alpha, alpha    /* alpha = {alpha, alpha} */
        movq    N, Nr                   /* scalar count if we branch below */
        cmp     $4, N
        jbe     SCALAR_TEST
/*
 *      Clearing the low 4 bits of Y and comparing with Y itself tells
 *      whether Y already sits on a 16-byte boundary.
 */
        movq    Y, Nr
        shr     $4, Nr
        shl     $4, Nr
        cmp     Nr, Y
        je      YALIGNED
/*
 *      Peel one element so that the packed loops can use aligned accesses
 *      on Y.
 */
        movlpd  (X), rX0
        mulsd   alpha, rX0
        movlpd  (Y), rY0
        subq    $1, N
        addsd   rX0, rY0
        addq    $8, X
        movlpd  rY0, (Y)
        addq    $8, Y
YALIGNED:
/*
 *      Same boundary test for X; it selects the aligned or unaligned loop.
 */
        movq    X, Nr
        shr     $4, Nr
        shl     $4, Nr
        cmp     Nr, X
        jne     XUNALIGNED
/*
 *      Split N into an even part (kept in N) and a remainder of 0 or 1
 *      (kept in Nr).  X and Y are moved past the even part and N is
 *      negated, so (X,N,8) / (Y,N,8) walk upwards until N reaches zero.
 */
        movq    N, Nr
        shr     $1, N
        shl     $1, N
        sub     N, Nr
        lea     (X, N, 8), X
        lea     (Y, N, 8), Y
        neg     N
NLOOP:
        movapd  (X,N,8), rX0
        movapd  (Y,N,8), rY0
        mulpd   alpha, rX0
/*
 *      NOTE(review): prefetchw belongs to the 3DNow! family rather than to
 *      SSE2 -- confirm that every targeted CPU accepts it.
 */
        prefetchw       PFDIST(Y,N,8)
        addpd   rX0, rY0
        movapd  rY0, (Y,N,8)
        prefetchnta     PFDIST(X,N,8)
        addq    $2, N
        jnz     NLOOP
/*
 *      This loop assumes num of iterations is in Nr
 */
SCALAR_TEST:
        cmp     $0, Nr
        je      DONE
SLOOP:
        movlpd  (X), rX0
        mulsd   alpha, rX0
        movlpd  (Y), rY0
        addsd   rX0, rY0
        addq    $8, X
        movlpd  rY0, (Y)
        addq    $8, Y
        subq    $1, Nr
        jnz     SLOOP
/*
 *      Epilogue
 */
DONE:
#ifdef ATL_GAS_x8632
        movl    (%esp), %ebp
        movl    4(%esp), %ebx
        addl    $OFF, %esp
#endif
        ret
/*
 *      Y is 16-byte aligned here but X is not: same even/remainder split as
 *      above, with X read through unaligned loads and no prefetching.
 */
XUNALIGNED:
        movq    N, Nr
        shr     $1, N
        shl     $1, N
        sub     N, Nr
        lea     (X, N, 8), X
        lea     (Y, N, 8), Y
        neg     N
UNLOOP:
        movupd  (X,N,8), rX0
        movapd  (Y,N,8), rY0
        mulpd   alpha, rX0
        addpd   rX0, rY0
        movapd  rY0, (Y,N,8)
        addq    $2, N
        jnz     UNLOOP
        jmp     SCALAR_TEST
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?