iamax_sse.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 576 行

C
576
字号
/*
 * ATL_UIAMAX -- index of the first element of X with the largest absolute
 * value.  GNU as (AT&T syntax), x86-32 or x86-64, run through the C
 * preprocessor.
 *
 *    int ATL_UIAMAX(const int N, const TYPE *X, const int incX)
 *
 * Build selection
 *    SREAL defined     : elements are 4 bytes wide (SSE  ss/ps instructions)
 *    SREAL not defined : elements are 8 bytes wide (SSE2 sd/pd instructions)
 *    ATL_GAS_x8632     : arguments are read from the stack
 *    ATL_GAS_x8664     : N is read from %edi and X from %rsi
 *
 * Behaviour
 *    - incX is never read; X is always walked contiguously.
 *    - The result is (address of winning element - X) / element size,
 *      i.e. a zero-based index, returned in %eax / %rax.
 *    - N <= 1 returns 0 without touching X.
 *    - Only a strictly larger magnitude replaces the current maximum, so
 *      ties resolve to the lowest index.
 *
 * Structure (both precisions)
 *    1. Build absval, a mask with every bit set except the sign bit.
 *    2. Peel scalar elements until X is 16-byte aligned.
 *    3. Process 64-byte blocks with aligned loads; when a block contains a
 *       value above maxval, remember the block address in maxX and fold
 *       the block maximum into maxval.
 *    4. After the block loop, locate the exact element inside the
 *       remembered block, then finish any tail elements one at a time.
 *
 * Comments are written C-style on purpose: this file is preprocessed, and a
 * line starting with '#' followed by a directive name or a number would be
 * interpreted by cpp instead of being ignored.
 */
#include "atlas_asm.h"

#ifdef SREAL

/* ===================================================================== */
/*                       single precision kernel                         */
/* ===================================================================== */

/*
 * On x86-32 the 64-bit mnemonics and register names used below are mapped
 * onto their 32-bit counterparts so one instruction stream serves both.
 */
#ifdef ATL_GAS_x8632
   #define movq movl
   #define addq addl
   #define subq subl
   #define rsp  esp
   #define rax  eax
#elif !defined(ATL_GAS_x8664)
   #error "This kernel requires a gas x86 assembler!"
#endif

/*
 * Integer register roles
 *    N     elements still to be handled by the scalar loops
 *    X     current read pointer
 *    maxX  address associated with the current maximum
 *    X0    original value of X (used to turn maxX into an index)
 *    N8    number of 64-byte blocks left in the vector loop
 *    reg1, reg2  scratch for comparison bit masks
 */
#ifdef ATL_GAS_x8632
   #define N	%eax
   #define X	%edx
   #define maxX	%ecx
   #define X0	%edi
   #define N8   %ebp
   #define reg1 %ebx
   #define reg2 %esi
#else
   #define N	%rax
   #define X	%rsi
   #define maxX	%rcx
   #define X0	%rdi
   #define N8   %rdx
   #define reg1	%r8
   #define reg2 %r9
#endif

/* Vector register roles */
#define maxval  %xmm0
#define absval	%xmm1
#define rX0     %xmm2
#define rX1     %xmm3
#define rX2     %xmm4
#define rX3     %xmm5

	.text
.global	ATL_asmdecor(ATL_UIAMAX)
ATL_asmdecor(ATL_UIAMAX):
/*
 * Scratch memory: x86-32 reserves 16 bytes of stack (later reused to save
 * the integer registers); x86-64 uses the 8 bytes just below %rsp.
 */
#ifdef ATL_GAS_x8632
	subl	$16, %esp
   #define SOFF 0
#else
   #define SOFF -8
#endif

/*	Temporarily store 1.0 and -1.0 to the scratch area.  Both x87 */
/*	values are popped again, so the x87 stack is left as it was.  */
	fld1
	fldz
	fsub	%st(1), %st
	fstps	SOFF(%rsp)
	fstps	SOFF+4(%rsp)

/*	eax = all bits 1 */
	xorl	%eax, %eax
	notl	%eax

/*	absval = (-1.0 ^ 1.0) = sign bit only */
	movss	SOFF(%rsp), absval
	movss	SOFF+4(%rsp), rX0
	xorps	rX0, absval

/*	absval = NOT(sign bit) & (all ones) == all bits but the sign bit, */
/*	then replicated into all four lanes                                */
	movl	%eax, SOFF(%rsp)
	movss	SOFF(%rsp), rX0
	andnps	rX0, absval
	shufps	$0x00, absval, absval

#ifdef ATL_GAS_x8632
/*	Save the integer registers this routine overwrites */
	movl	%edi, (%esp)
	movl	%ebp, 4(%esp)
	movl	%ebx, 8(%esp)
	movl	%esi, 12(%esp)

/*	Arguments sit above the 16 reserved bytes and the return address */
	movl	20(%esp), N
	movl	24(%esp), X

	movl	X, X0
	movl	X, maxX
/*	Signed compare: N <= 1 (including N <= 0) has answer 0 */
	cmp	$1, N
	jle	DONE
#else
/*	X is already in its register.  N must be copied out of %edi before */
/*	%rdi is reused as X0.                                               */
	movl	%edi, %eax
	movq	X, X0
	movq	X, maxX
/*	Signed compare: N <= 1 (including N <= 0) has answer 0 */
	cmp	$1, %eax
	jle	DONE
	cltq
#endif
	xorps	maxval, maxval

/*	Get X aligned to a 16 byte boundary */
	test	$15, X
	jnz	FORCEALIGN

ALIGNED_STARTUP:
/*	N8 = N / 16 blocks; N keeps the N % 16 tail elements */
	movq	N, N8
	shr	$4, N8
	jz	LOOP1
	shl	$4, N8
	subq	N8, N
	shr	$4, N8

/*
 * Vector loop: 16 elements per iteration.  Each lane is compared with
 * predicate 6 (not-less-or-equal) against maxval, and the four 4-bit lane
 * masks are packed into reg1 as
 *    bits 15..12 = elements 0-3      bits 11..8 = elements 4-7
 *    bits  7..4  = elements 8-11     bits  3..0 = elements 12-15
 */
LOOP8:
   #if defined(ATL_ARCH_HAMMER64) || defined(ATL_ARCH_HAMMER32)
	prefetchnta	608(X)
   #elif defined(ATL_ARCH_P4)
	prefetchnta	464(X)
   #else
	prefetchnta	128(X)
	prefetchnta	160(X)
   #endif
	movaps	(X), rX0
	movaps	16(X), rX1
	movaps	32(X), rX2
	movaps	48(X), rX3
	andps	absval, rX0
	andps	absval, rX1
	andps	absval, rX2
	andps	absval, rX3
	cmpps	$6, maxval, rX0
	cmpps	$6, maxval, rX1
	cmpps	$6, maxval, rX2
	cmpps	$6, maxval, rX3
	movmskps	rX0, reg1
	movmskps	rX1, reg2
	shl	$4, reg1
	or	reg2, reg1
	movmskps	rX2, reg2
	shl	$4, reg1
	or	reg2, reg1
	movmskps	rX3, reg2
	shl	$4, reg1
	or	reg2, reg1
	test	reg1, reg1
	jnz	LOOP8_NEWMAX
LOOP8INC:
	addq	$64, X
	dec	N8
	jnz	LOOP8

/*
 * Find which of the 16 values starting at maxX produced maxval.  Unaligned
 * loads are used because maxX may still be an address recorded by the
 * scalar alignment loop.  Bit 0 of each movmskps result is the
 * lowest-addressed element, so testing bits in ascending order yields the
 * first match.
 */
FIND:
	movups	(maxX), rX0
	movups	16(maxX), rX1
	andps	absval, rX0
	andps	absval, rX1
	cmpps	$0, maxval, rX0
	cmpps	$0, maxval, rX1
	movmskps	rX0, reg1
	movmskps	rX1, reg2
	test	$15, reg1
	jnz	FIND_0
	test	$15, reg2
	jnz	FIND_4
	movups	32(maxX), rX0
	movups	48(maxX), rX1
	andps	absval, rX0
	andps	absval, rX1
	cmpps	$0, maxval, rX0
	cmpps	$0, maxval, rX1
	movmskps	rX0, reg1
	movmskps	rX1, reg2
	test	$15, reg1
	jnz	FIND_8
/*	Not in elements 0-11, so it is one of elements 12-15 */
	addq	$48, maxX
	test	$1, reg2
	jnz	FIND_CU
	addq	$4, maxX
	test	$2, reg2
	jnz	FIND_CU
	addq	$4, maxX
	test	$4, reg2
	jnz	FIND_CU
	addq	$4, maxX
	jmp	FIND_CU
FIND_0:
/*	Match is among elements 0-3 */
	test	$1, reg1
	jnz	FIND_CU
	addq	$4, maxX
	test	$2, reg1
	jnz	FIND_CU
	addq	$4, maxX
	test	$4, reg1
	jnz	FIND_CU
	addq	$4, maxX
	jmp	FIND_CU
FIND_4:
/*	Match is among elements 4-7 */
	addq	$16, maxX
	test	$1, reg2
	jnz	FIND_CU
	addq	$4, maxX
	test	$2, reg2
	jnz	FIND_CU
	addq	$4, maxX
	test	$4, reg2
	jnz	FIND_CU
	addq	$4, maxX
	jmp	FIND_CU
FIND_8:
/*	Match is among elements 8-11 */
	addq	$32, maxX
	test	$1, reg1
	jnz	FIND_CU
	addq	$4, maxX
	test	$2, reg1
	jnz	FIND_CU
	addq	$4, maxX
	test	$4, reg1
	jnz	FIND_CU
	addq	$4, maxX
FIND_CU:
/*	Handle the N % 16 tail, if there is one */
	test	N, N
	jnz	LOOP1

DONE:
/*	index = (maxX - X0) / 4 */
	movq	maxX, %rax
	subq	X0, %rax
	shr	$2, %rax
#ifdef ATL_GAS_x8632
	movl	(%esp), %edi
	movl	4(%esp), %ebp
	movl	8(%esp), %ebx
	movl	12(%esp), %esi
	addl	$16, %esp
#endif
	ret

/*
 * The current block holds at least one value above maxval.  Record the
 * block address and update maxval (kept identical in all four lanes).
 */
LOOP8_NEWMAX:
	movq	X, maxX
/*	Nothing new in elements 0-7: only the second half needs folding */
	test	$0xFF00, reg1
	jz	L8NM_8
/*	Reduce elements 0-7 to one value and broadcast it into maxval. */
/*	Overwriting is safe: one of these elements exceeded the old    */
/*	maxval.                                                        */
	movaps	(X), rX0
	movaps	16(X), rX1
	andps	absval, rX0
	andps	absval, rX1
	maxps	rX1, rX0
	movhlps	rX0, rX1
	maxps	rX1, rX0
	movaps	rX0, maxval
	shufps	$0x11,	maxval, maxval
	maxps	rX0, maxval
	movlhps	maxval, maxval
/*	Nothing new in elements 8-15: done with this block */
	test	$0x00FF, reg1
	jz	LOOP8INC
L8NM_8:
/*	Reduce elements 8-15, broadcast, and merge with maxval */
	movaps	32(X), rX0
	movaps	48(X), rX1
	andps	absval, rX0
	andps	absval, rX1
	maxps	rX1, rX0
	movhlps	rX0, rX1
	maxps	rX1, rX0
	movaps	rX0, rX1
	shufps	$0x11,	rX1, rX1
	maxps	rX0, rX1
	movlhps	rX1, rX1
	maxps	rX1, maxval
	jmp	LOOP8INC

/*
 * Scalar loop.  Assumes X points at the first element to examine and that
 * N (> 0) is the number of iterations.
 */
LOOP1:
	movss	(X), rX0
	andps	absval, rX0
	comiss	rX0, maxval
	jb	NEWMAX1
LOOPINC1:
	addq	$4, X
	dec	N
	jnz	LOOP1
	shufps	$0x00, maxval, maxval
	jmp	DONE
NEWMAX1:
	movss	rX0, maxval
	movq	X, maxX
	jmp	LOOPINC1

/*
 * Alignment peel: examine one element at a time until X reaches a 16-byte
 * boundary or the elements run out.
 */
FORCEALIGN:
	movss	(X), rX0
	andps	absval, rX0
	comiss	rX0, maxval
	jb	FA_NEWMAX
FA_INC:
	dec	N
	jz	DONE
	addq	$4, X
	test	$15, X
	jnz	FORCEALIGN
/*	The vector loop needs maxval in every lane */
	shufps	$0x00, maxval, maxval
	jmp	ALIGNED_STARTUP
FA_NEWMAX:
	movss	rX0, maxval
	movq	X, maxX
	jmp	FA_INC

#else

/* ===================================================================== */
/*                       double precision kernel                         */
/* ===================================================================== */

/*
 * On x86-32 the 64-bit mnemonics and register names used below are mapped
 * onto their 32-bit counterparts so one instruction stream serves both.
 */
#ifdef ATL_GAS_x8632
   #define movq movl
   #define addq addl
   #define subq subl
   #define rsp  esp
   #define rax  eax
#elif !defined(ATL_GAS_x8664)
   #error "This kernel requires a gas x86 assembler!"
#endif

/*
 * Integer register roles
 *    N     elements still to be handled by the scalar loop
 *    X     current read pointer
 *    maxX  address associated with the current maximum
 *    X0    original value of X (used to turn maxX into an index)
 *    N4    number of 64-byte blocks left in the vector loop
 *    reg1, reg2  scratch for comparison bit masks
 */
#ifdef ATL_GAS_x8632
   #define N	%eax
   #define X	%edx
   #define maxX	%ecx
   #define X0	%edi
   #define N4   %ebp
   #define reg1 %ebx
   #define reg2 %esi
#else
   #define N	%rax
   #define X	%rsi
   #define maxX	%rcx
   #define X0	%rdi
   #define N4   %rdx
   #define reg1	%r8
   #define reg2 %r9
#endif

/* Vector register roles */
#define maxval  %xmm0
#define absval	%xmm1
#define rX0     %xmm2
#define rX1     %xmm3
#define rX2     %xmm4
#define rX3     %xmm5

	.text
.global	ATL_asmdecor(ATL_UIAMAX)
ATL_asmdecor(ATL_UIAMAX):
/*
 * Scratch memory: x86-32 reserves 16 bytes of stack (later reused to save
 * the integer registers); x86-64 uses the 16 bytes just below %rsp.
 */
#ifdef ATL_GAS_x8632
	subl	$16, %esp
   #define SOFF 0
#else
   #define SOFF -16
#endif

/*	Temporarily store 1.0 and -1.0 to the scratch area.  Both x87 */
/*	values are popped again, so the x87 stack is left as it was.  */
	fld1
	fldz
	fsub	%st(1), %st
	fstpl	SOFF(%rsp)
	fstpl	SOFF+8(%rsp)

/*	eax = all bits 1 */
	xorl	%eax, %eax
	notl	%eax

/*	absval = (-1.0 ^ 1.0) = sign bit only (low lane) */
	movlpd	SOFF(%rsp), absval
	movlpd	SOFF+8(%rsp), rX0
	xorpd	rX0, absval

/*	absval = NOT(sign bit) & (all ones) == all bits but the sign bit, */
/*	then replicated into both lanes                                    */
	movl	%eax, SOFF(%rsp)
	movl	%eax, 4+SOFF(%rsp)
	movlpd	SOFF(%rsp), rX0
	andnpd	rX0, absval
	unpcklpd	absval, absval

#ifdef ATL_GAS_x8632
/*	Save the integer registers this routine overwrites */
	movl	%edi, (%esp)
	movl	%ebp, 4(%esp)
	movl	%ebx, 8(%esp)
	movl	%esi, 12(%esp)

/*	Arguments sit above the 16 reserved bytes and the return address */
	movl	20(%esp), N
	movl	24(%esp), X

	movl	X, X0
	movl	X, maxX
/*	Signed compare: N <= 1 (including N <= 0) has answer 0 */
	cmp	$1, N
	jle	DONE
#else
/*	X is already in its register.  N must be copied out of %edi before */
/*	%rdi is reused as X0.                                               */
	movl	%edi, %eax
	movq	X, X0
	movq	X, maxX
/*	Signed compare: N <= 1 (including N <= 0) has answer 0 */
	cmp	$1, %eax
	jle	DONE
	cltq
#endif
	xorpd	maxval, maxval

/*	Get X aligned to a 16 byte boundary */
	test	$15, X
	jnz	FORCEALIGN

ALIGNED_STARTUP:
/*	N4 = N / 8 blocks; N keeps the N % 8 tail elements */
	movq	N, N4
	shr	$3, N4
	jz	LOOP1
	shl	$3, N4
	subq	N4, N
	shr	$3, N4

/*
 * Vector loop: 8 elements per iteration.  Each lane is compared with
 * predicate 6 (not-less-or-equal) against maxval, and the four 2-bit lane
 * masks are packed into reg1 as
 *    bits 7..6 = elements 0-1      bits 5..4 = elements 2-3
 *    bits 3..2 = elements 4-5      bits 1..0 = elements 6-7
 */
LOOP4:
	movapd	(X), rX0
	movapd	16(X), rX1
#if defined(ATL_ARCH_HAMMER64) || defined(ATL_ARCH_HAMMER32)
	prefetchnta	608(X)
#else
	prefetchnta	464(X)
	prefetchnta	496(X)
#endif
	andpd	absval, rX0
	movapd	32(X), rX2
	andpd	absval, rX1
	cmppd	$6, maxval, rX0
	movapd	48(X), rX3
	andpd	absval, rX2
	cmppd	$6, maxval, rX1
	andpd	absval, rX3
	cmppd	$6, maxval, rX2
	movmskpd	rX0, reg1
	cmppd	$6, maxval, rX3
	movmskpd	rX1, reg2
	shl	$2, reg1
	or	reg2, reg1
	movmskpd	rX2, reg2
	shl	$2, reg1
	or	reg2, reg1
	movmskpd	rX3, reg2
	shl	$2, reg1
	or	reg2, reg1
	test	reg1, reg1
	jnz	LOOP4_NEWMAX
LOOP4INC:
	addq	$64, X
	dec	N4
	jnz	LOOP4

/*
 * Find which of the 8 values starting at maxX is maxval.  Unaligned loads
 * are used because maxX may still be the original, unaligned X.  Bit 0 of
 * each movmskpd result is the lower-addressed element of the pair.
 */
	movupd	(maxX), rX0
	movupd	16(maxX), rX1
	andpd	absval, rX0
	andpd	absval, rX1
	cmppd	$0, maxval, rX0
	cmppd	$0, maxval, rX1
	movmskpd	rX0, reg1
	movmskpd	rX1, reg2
	test	$3, reg1
	jnz	DONE_0
	test	$3, reg2
	jnz	DONE_2
	movupd	32(maxX), rX0
	movupd	48(maxX), rX1
	andpd	absval, rX0
	andpd	absval, rX1
	cmppd	$0, maxval, rX0
	cmppd	$0, maxval, rX1
	movmskpd	rX0, reg1
	movmskpd	rX1, reg2
	test	$3, reg1
	jnz	DONE_4
/*	Not in elements 0-5, so it is element 6 or 7 */
	addq	$48, maxX
	test	$1, reg2
	jnz	DONE_CU
	addq	$8, maxX
	jmp	DONE_CU
DONE_0:
/*	Match is element 0 or 1 */
	test	$1, reg1
	jnz	DONE_CU
	add	$8, maxX
	jmp	DONE_CU
DONE_2:
/*	Match is element 2 or 3 */
	addq	$16, maxX
	test	$1, reg2
	jnz	DONE_CU
	add	$8, maxX
	jmp	DONE_CU
DONE_4:
/*	Match is element 4 or 5 */
	addq	$32, maxX
	test	$1, reg1
	jnz	DONE_CU
	add	$8, maxX
DONE_CU:
/*	Handle the N % 8 tail, if there is one */
	test	N, N
	jnz	LOOP1

DONE:
/*
 * index = (maxX - X0) / 8.
 * No x87 re-initialisation is done here: the two values pushed at entry
 * were both popped, and resetting the unit would overwrite the caller's
 * x87 control word.
 */
	movq	maxX, %rax
	subq	X0, %rax
	shr	$3, %rax
#ifdef ATL_GAS_x8632
	movl	(%esp), %edi
	movl	4(%esp), %ebp
	movl	8(%esp), %ebx
	movl	12(%esp), %esi
	addl	$16, %esp
#endif
	ret

/*
 * The current block holds at least one value above maxval.  Record the
 * block address, reduce all 8 magnitudes to one value and place it in both
 * lanes of maxval.  Overwriting is safe: one of the block's elements
 * exceeded the old maxval.
 */
LOOP4_NEWMAX:
	movq	X, maxX
	movapd	(X), rX0
	movapd	16(X), rX1
	andpd	absval, rX0
	movapd	32(X), rX2
	andpd	absval, rX1
	movapd	48(X), rX3
	andpd	absval, rX2
	maxpd	rX1, rX0
	andpd	absval, rX3
	maxpd	rX3, rX2
	maxpd	rX2, rX0
	movapd	rX0, maxval
	unpcklpd rX0, rX0
	unpckhpd maxval, maxval
	maxpd	rX0, maxval
	jmp	LOOP4INC

/*
 * Scalar loop.  Assumes X points at the first element to examine and that
 * N (> 0) is the number of iterations.
 */
LOOP1:
	movlpd	(X), rX0
	andpd	absval, rX0
	comisd	rX0, maxval
	jb	NEWMAX1
LOOPINC1:
	addq	$8, X
	dec	N
	jnz	LOOP1
	jmp	DONE
NEWMAX1:
	movlpd	(X), maxval
	unpcklpd	maxval, maxval
	andpd	absval, maxval
	movq	X, maxX
	jmp	LOOPINC1

/*
 * Alignment peel: take the first element as the initial maximum (maxX
 * already equals X) and step past it.  N is at least 2 on entry, so it
 * cannot reach zero here.  If X is still not 16-byte aligned afterwards,
 * everything is processed by the scalar loop.
 */
FORCEALIGN:
	movlpd	(X), maxval
	dec	N
	unpcklpd	maxval, maxval
	andpd	absval, maxval
	addq	$8, X
	test	$15, X
	jnz	LOOP1
	jmp	ALIGNED_STARTUP

#endif

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?