iamax8_x86.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 485 行

C
485
字号
#include "atlas_asm.h"#ifdef SREAL#ifdef ATL_GAS_x8632   #define movq movl   #define addq addl   #define subq subl   #define rsp  esp   #define rax  eax#elif !defined(ATL_GAS_x8664)   #error "This kernel requires a gas x86 assembler!"#endif#ifdef ATL_GAS_x8632   #define N	%eax   #define X	%edx   #define maxX	%ecx   #define X0	%edi   #define N8   %ebp   #define reg1 %ebx#else   #define N	%rax   #define X	%rsi   #define maxX	%rcx   #define X0	%rdi   #define N8   %rdx   #define reg1	%r8#endif#define maxval  %xmm0#define rX0     %xmm1#define rX1     %xmm2#define absval	%xmm3# IREG                   rdi            rsi# int ATL_UIAMAX(const int N, const TYPE *X, const int incX)        .text.global	ATL_asmdecor(ATL_UIAMAX)ATL_asmdecor(ATL_UIAMAX):#ifdef ATL_GAS_x8632	subl	$16, %esp   #define SOFF 0#else   #define SOFF -8#endif##	Temporarily store 1.0 and -1.0 to stack#	fld1	fldz	fsub	%st(1), %st	fstps	SOFF(%rsp)	fstps	SOFF+4(%rsp)##       eax = all bits 1#	xorl	%eax, %eax	notl	%eax##	absval = (-1.0 ^ 1.0) = sign bit only#	movss	SOFF(%rsp), absval	movss	SOFF+4(%rsp), rX0	xorps	rX0, absval##       absval = NOT(sign bit) & (all ones) == all bits but sign bet set#	movl	%eax, SOFF(%rsp)	movss	SOFF(%rsp), rX0	andnps	rX0, absval	shufps	$0x00, absval, absval#ifdef ATL_GAS_x8632##       Save iregs#	movl	%edi, (%esp)	movl	%ebp, 4(%esp)        movl    %ebx, 8(%esp)#	movl	20(%esp), N	movl	24(%esp), X#	movl	X, X0	movl	X, maxX	cmp	$1, N	jbe	DONE#else##       X already in right register, X0 = X, maxX = X, init N#	movl	%edi, %eax	movq	X, X0	movq	X, maxX	cmp	$1,%eax	jbe	DONE	cltq#endif        xorps   maxval, maxval##       Get X aligned to 16 byte boundary#        movq    X, N8        shr     $4, N8        shl     $4, N8        cmp     X, N8        jne     FORCEALIGNALIGNED_STARTUP:        movq    N, N8        shr     $3, N8        jz      CLEANUP	shl	$3, N8	subq	N8, N	shr	$3, N8LOOP8:        movaps  (X), rX0        movaps  16(X), rX1	andps	absval, rX0	andps	absval, rX1        maxps   maxval, rX0        maxps   maxval, rX1        cmpps   $4, maxval, rX0   #if defined(ATL_ARCH_HAMMER64) || defined(ATL_ARCH_HAMMER32)		prefetchnta	320(X)   #elif defined(ATL_ARCH_P4)                prefetchnta     464(X)   #else		prefetchnta	192(X)   #endif        cmpps   $4, maxval, rX1        movmskps        rX0, reg1        cmp     $0, reg1        jne     LOOP8_1        movmskps        rX1, reg1        cmp     $0, reg1        jne     LOOP8_2LOOP8INC:	addq    $32, X	dec	N8	jnz	LOOP8#	cmp	$0, N	jnz	CLEANUPDONE:	finit	movq	maxX, %rax	subq	X0, %rax	shr	$2, %rax#ifdef ATL_GAS_x8632	movl	(%esp), %edi	movl	4(%esp), %ebp	movl	8(%esp), %ebx        addl    $16, %esp#endif	retLOOP8_1:	flds	(X)	fabs	movq	$-12, reg1	movq	$-16, maxXLOOP8NML:	flds	16(X,reg1)	fabs	fcomi	%st(1), %st	jbe	LOOP8NMLINC	mov	reg1, maxX	fxchLOOP8NMLINC:	fstp	%st	addq	$4, reg1	jnz	LOOP8NML#	fstp	%st	addq	$16, maxX	addq	X, maxX	movss	(maxX), maxval	shufps	$0x00, maxval, maxval	andps	absval, maxval        movmskps        rX1, reg1        cmp     $0, reg1        jz      LOOP8INC	movaps	16(X), rX1	andps	absval, rX1        maxps   maxval, rX1        cmpps   $4, maxval, rX1        movmskps        rX1, reg1        cmp     $0, reg1	je	LOOP8INC	jmp	LOOP8_2LOOP8_2:	flds	16(X)	fabs	movq	$-12, reg1	movq	$-16, maxXLOOP8NML2:	flds	32(X,reg1)	fabs	fcomi	%st(1), %st	jbe	LOOP8NML2INC	mov	reg1, maxX	fxchLOOP8NML2INC:	fstp	%st	addq	$4, reg1	jnz	LOOP8NML2#	fstp	%st	addq	$32, maxX	addq	X, maxX	movss	(maxX), maxval	shufps	$0x00, maxval, maxval	andps	absval, maxval        jmp     LOOP8INCCLEANUP:	flds	(maxX)	fabs##  Assumes X at start, and N # of iterations, %st(0) has max so far#LOOP1:	flds	(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX1LOOPINC1:	addq	$4, X        dec     N	jnz	LOOP1	jmp	DONENEWMAX1:	fstp	%st(0)	flds	(X)	fabs	movq	X, maxX	jmp	LOOPINC1FORCEALIGN:	flds	(X)	fabsLOOPALIGN:	flds	(X)	fabs	fcomip	%st(1), %st	ja	NEWMAXAALIGNINC:	addq	$4, X        movq    X, N8        shr     $4, N8        shl     $4, N8        cmp     X, N8        jz      ALIGNED        dec     N	jnz	LOOPALIGN	jmp	DONENEWMAXA:	fstp	%st(0)	flds	(X)	fabs	movq	X, maxX	jmp	ALIGNINCALIGNED:        dec     N	jz	DONE        movss   (maxX), maxval        shufps  $0x00, maxval, maxval	andps	absval, maxval        jmp     ALIGNED_STARTUP#else#ifdef ATL_GAS_x8632   #define movq movl   #define addq addl   #define subq subl   #define rsp  esp   #define rax  eax#elif !defined(ATL_GAS_x8664)   #error "This kernel requires a gas x86 assembler!"#endif#ifdef ATL_GAS_x8632   #define N	%eax   #define X	%esi   #define stX	%edx   #define stX4	%ebx   #define maxX	%ecx   #define X0	%edi#else   #define N	%rax   #define X	%rsi   #define stX	%rdx   #define stX4	%r8   #define maxX	%rcx   #define X0	%rdi#endif# int ATL_UIAMAX(const int N, const TYPE *X, const int incX)        .text.global	ATL_asmdecor(ATL_UIAMAX)ATL_asmdecor(ATL_UIAMAX):#ifdef ATL_GAS_x8632	subl	$12, %esp	movl	%ebx, (%esp)	movl	%esi, 4(%esp)	movl	%edi, 8(%esp)#	movl	16(%esp), N	movl	20(%esp), X#	movl	X, X0	movl	X, maxX	cmp	$1, N	jbe	DONE#else##       X already in right register, init N, stX = X + N#	movl	%edi, %eax	movq	X, X0	movq	X, maxX	cmp	$1,%eax	jbe	DONE	cltq#endif	movq	N, stX	shl	$3, stX	addq	X, stX	cmp	X, stX	fldl	(X)	fabs	addq	$8, X	movq	N, stX4	subq	$1, stX4	shr	$3, stX4	shl	$6, stX4	addq	X, stX4	cmp	stX4, X	je	LOOP1	ALIGN16LOOP:	fldl	(X)	fabs	fcomip	%st(1), %st   #if defined(ATL_ARCH_P4)	        prefetchnta	768(X)   #else	        prefetchnta	572(X)   #endif	ja	NEWMAX_1LOOP_2:	fldl	8(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_2LOOP_3:	fldl	16(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_3LOOP_4:	fldl	24(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_4LOOP_5:	fldl	32(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_5LOOP_6:	fldl	40(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_6LOOP_7:	fldl	48(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_7LOOP_8:	fldl	56(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX_8LOOP_9:	addq	$64, X	cmp	stX4, X	jne	LOOP	cmp	stX4, stX	jne	LOOP1DONE:	finit	movq	maxX, %rax	subq	X0, %rax	shr	$3, %rax#ifdef ATL_GAS_x8632        movl    (%esp), %ebx        movl    4(%esp), %esi        movl    8(%esp), %edi        addl    $12, %esp#endif	retNEWMAX_1:	fstp	%st(0)	fldl	(X)	fabs	movq	X, maxX	jmp	LOOP_2NEWMAX_2:	fstp	%st(0)	fldl	8(X)	fabs	movq	X, maxX	addq	$8, maxX	jmp	LOOP_3NEWMAX_3:	fstp	%st(0)	fldl	16(X)	fabs	movq	X, maxX	addq	$16, maxX	jmp	LOOP_4NEWMAX_4:	fstp	%st(0)	fldl	24(X)	fabs	movq	X, maxX	addq	$24, maxX	jmp	LOOP_5NEWMAX_5:	fstp	%st(0)	fldl	32(X)	fabs	movq	X, maxX	addq	$32, maxX	jmp	LOOP_6NEWMAX_6:	fstp	%st(0)	fldl	40(X)	fabs	movq	X, maxX	addq	$40, maxX	jmp	LOOP_7NEWMAX_7:	fstp	%st(0)	fldl	48(X)	fabs	movq	X, maxX	addq	$48, maxX	jmp	LOOP_8NEWMAX_8:	fstp	%st(0)	fldl	56(X)	fabs	movq	X, maxX	addq	$56, maxX	jmp	LOOP_9##  Assumes X at start, and stX where to quit, %st(0) has max so far#LOOP1:	fldl	(X)	fabs	fcomip	%st(1), %st	ja	NEWMAX1LOOPINC1:	addq	$8, X	cmp	stX, X	jne	LOOP1	jmp	DONENEWMAX1:	fstp	%st(0)	fldl	(X)	fabs	movq	X, maxX	jmp	LOOPINC1#endif

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?