/* sparcv8plus.s */
	! Tail of bn_add_words — the function begins before this excerpt.
	retl
	movcs	%icc,1,%o0
.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * Computes rp[i] = ap[i] - bp[i] for i = 0..n-1, propagating the borrow
 * through the whole array via the condition-code chain (subccc), and
 * returns the final borrow (0 or 1) in %o0.  SPARC argument registers:
 * %o0 = rp, %o1 = ap, %o2 = bp, %o3 = n.  If n <= 0 it returns 0
 * immediately.  The main loop retires 4 words per iteration; the tail
 * handles the remaining 0-3 words.  Loads are hoisted into branch delay
 * slots (annulled ",a" forms) so the carry chain is never broken.
 */
bn_sub_words:
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4		! delay slot: prefetch ap[0]
	retl
	clr	%o0			! n <= 0: nothing to do, borrow = 0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail	! fewer than 4 words: tail only
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_sub_words_loop:	! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5		! word 0, borrow in/out via icc.C
	stuw	%o5,[%o0]
	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2		! word 1
	stuw	%g2,[%o0+4]
	inc	16,%o2
	subccc	%g3,%g4,%g4		! word 2
	stuw	%g4,[%o0+8]
	inc	16,%o0
	subccc	%o4,%o5,%o5		! word 3
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1		! >= 4 words left?  (doesn't touch C)
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4		! delay slot: prefetch next ap word

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4		! delay slot: prefetch for tail
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0		! return borrow-out as 0/1
	nop

.L_bn_sub_words_tail:	! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5		! tail word 0
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]		! delay slot (always executed)
	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5		! tail word 1
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]		! delay slot (always executed)
	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5		! tail word 2
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0		! return borrow-out as 0/1
.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * Code below depends on the fact that upper parts of the %l0-%l7
 * and %i0-%i7 are zeroed by kernel after context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in less V9
 * instructions:-(" which apparently isn't true thanks to
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from elimination of
 * multicycle none-pairable 'rd %y,%rd' instructions.
 *
 * Andy.
 */

#define FRAME_SIZE	-96

/*
 * Here is register usage map for *all* routines below.
 */
#define t_1	%o0
#define t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define a_0	%l0
#define a_1	%l1
#define a_2	%l2
#define a_3	%l3
#define a_4	%l4
#define a_5	%l5
#define a_6	%l6
#define a_7	%l7

#define b_0	%i3
#define b_1	%i4
#define b_2	%i5
#define b_3	%o4
#define b_4	%o5
#define b_5	%o7
#define b_6	%g1
#define b_7	%g4

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Comba (column-wise) 8x8-word multiply: r[0..15] = a[0..7] * b[0..7].
 * Carry scheme: c_12 is a 64-bit accumulator whose low word is the digit
 * about to be stored and whose high word is the incoming carry; t_2 is
 * preloaded with 1<<32 so that the annulled pair
 *   'bcs,a %xcc,.+8; add c_3,t_2,c_3'
 * records each 64-bit overflow in the upper half of c_3, which is folded
 * back into the accumulator with 'or c_12,c_3,c_12' after each
 * 'srlx t_1,32,c_12'.  Loads of a[] / b[] words are interleaved with the
 * arithmetic to hide latency.
 * NOTE(review): the trailing '!=' comments appear to mark the original
 * instruction-grouping for scheduling — confirm against upstream
 * OpenSSL sparcv8plus.S before re-flowing any of this code.
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2		! t_2 = 1<<32, carry increment for c_3
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	! NOTE(review): 'st' below is the same instruction as the 'stuw'
	! used everywhere else in this file (32-bit store word); consider
	! normalizing to 'stuw' for consistency.
	st	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=
.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32
.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Comba 4x4-word multiply; same carry scheme as bn_mul_comba8 above.
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	! NOTE(review): source excerpt is truncated here — bn_mul_comba4
	! continues beyond this chunk.
/*
 * (removed: web code-viewer UI text captured during scraping —
 * keyboard-shortcut help such as "copy code: Ctrl+C", "search: Ctrl+F",
 * "full screen: F11", "toggle theme", "font size" — not part of the
 * original assembly source)
 */