sparcv8plus.s

来自「mediastreamer2是开源的网络传输媒体流的库」· S 代码 · 共 1,536 行 · 第 1/3 页
1,536 行
.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    or if above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library.
 *    Provided that the library is already configured and built
 *    (in 0.9.2 case with no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
 *    special conditions, namely when kernel doesn't preserve upper
 *    32 bits of otherwise 64-bit registers during a context switch.
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. Original release did target UltraSPARC only. Now SuperSPARC
 *    version is provided along. Both versions share bn_*comba[48]
 *    implementations (see comment later in code for explanation).
 *    But what's so special about this UltraSPARC implementation?
 *    Why didn't I let compiler do the job? Trouble is that most of
 *    available compilers (well, SC5.0 is the only exception) don't
 *    attempt to take advantage of UltraSPARC's 64-bitness under
 *    32-bit kernels even though it's perfectly possible (see next
 *    question).
 *
 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch is
 *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
 *    preserved if you're in a leaf function, i.e. such never calling
 *    any other functions. All functions in this module are leaf and
 *    10 registers is a handful. And as a matter of fact non-"comba"
 *    routines don't require even that much and I could even afford to
 *    not allocate own stack frame for 'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
 *    under evaluation and development...
 *
 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) Code does *not* contain any
 *    code position dependencies and it's safe to include it into
 *    shared library as is.
 *
 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In either case below is what I
 *    experience with crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *    As you can see it's damn hard to beat the new Sun C compiler
 *    and it's in first place GNU C users who will appreciate this
 *    assembler implementation:-)
 */

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally unrolled loop looked like this:
 *	    for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	    }
 *	I unroll according to following:
 *	    while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	    }
 *	    if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	    }
 */

/*
 * Reading notes for the routines below.
 *
 * - Every routine is a leaf: it returns with retl and works only in
 *   %o0-%o5 and %g1-%g4.
 * - Each branch and each retl is followed by a delay-slot instruction.
 *   On branches carrying the ",a" (annul) suffix the delay-slot
 *   instruction takes effect only when the branch is taken.
 * - BN_ULONG words are moved with 32-bit lduw/stuw; products are formed
 *   in 64-bit registers with mulx and split with srlx ...,32.
 */

/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * For i in [0,num): rp[i] = low32(rp[i] + ap[i]*w + carry), where carry
 * is the upper 32 bits of the previous sum. Returns the final carry,
 * or 0 when num <= 0.
 *
 * Registers: %o0=rp  %o1=ap  %o2=num (counts down)  %o3=w
 *            %o4=64-bit sum  %o5=carry  %g1=rp word  %g2,%g3=ap words
 */
bn_mul_add_words:
	brgz,a	%o2,.L_bn_mul_add_words_proceed	! num > 0 ?
	lduw	[%o1],%g2			! (annulled slot) preload ap[0]
	retl					! num <= 0: return 0
	clr	%o0

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0			! at least 4 words left ?
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5				! carry = 0

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0			! another full group of 4 ?
	stuw	%o4,[%o0-4]			! rp already advanced by 16
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2			! (annulled slot) preload next ap[0]

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail	! 1..3 words left ?
	lduw	[%o1],%g2			! (annulled slot) preload next ap[0]
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0				! return carry

.L_bn_mul_add_words_tail:			! handles the last 1..3 words
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0			! return carry

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * For i in [0,num): rp[i] = low32(ap[i]*w + carry), where carry is the
 * upper 32 bits of the previous sum. Returns the final carry, or 0
 * when num <= 0.
 *
 * Registers: %o0=rp  %o1=ap  %o2=num (counts down)  %o3=w
 *            %o4=64-bit sum  %o5=carry  %g2,%g3=ap words
 */
bn_mul_words:
	brgz,a	%o2,.L_bn_mul_words_proceeed	! num > 0 ?
	lduw	[%o1],%g2			! (annulled slot) preload ap[0]
	retl					! num <= 0: return 0
	clr	%o0

.L_bn_mul_words_proceeed:
	srl	%o3,%g0,%o3	! clruw	%o3
	andcc	%o2,-4,%g0			! at least 4 words left ?
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5				! carry = 0

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]			! rp already advanced by 16
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0			! another full group of 4 ?
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2			! (annulled slot) preload next ap[0]
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail	! 1..3 words left ?
	lduw	[%o1],%g2			! (annulled slot) preload next ap[0]
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0				! return carry

.L_bn_mul_words_tail:				! handles the last 1..3 words
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0			! return carry

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align  32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 *
 * For i in [0,n): r[2*i] = low 32 bits and r[2*i+1] = high 32 bits of
 * a[i]*a[i]. Every exit path also clears %o0.
 *
 * Registers: %o0=r  %o1=a  %o2=n (counts down)
 *            %o4=64-bit square  %o5=its high half  %g2,%g3=a words
 */
bn_sqr_words:
	brgz,a	%o2,.L_bn_sqr_words_proceeed	! n > 0 ?
	lduw	[%o1],%g2			! (annulled slot) preload a[0]
	retl					! n <= 0: nothing to do
	clr	%o0

.L_bn_sqr_words_proceeed:
	andcc	%o2,-4,%g0			! at least 4 words left ?
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]			! r already advanced by 32
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2			! sets icc; %g2 is reloaded before its next use
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2			! (annulled slot) preload next a[0]
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail	! 1..3 words left ?
	lduw	[%o1],%g2			! (annulled slot) preload next a[0]
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:				! handles the last 1..3 words
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32

.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 *
 * Returns the low 32 bits of ((h << 32) | l) / d using one 64-bit
 * unsigned divide. d == 0 is not checked here.
 *
 * l and d are zero-extended before use, the same way the multiply
 * routines above zero-extend w, so that stale upper register halves
 * cannot leak into the dividend or the divisor.
 */
bn_div_words:
	srl	%o1,%g0,%o1	! clruw	%o1
	srl	%o2,%g0,%o2	! clruw	%o2
	sllx	%o0,32,%o0			! h into the upper half
	or	%o0,%o1,%o0			! dividend = h:l
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw	%o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_add_words:
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_add_words_loop:		! wow! 32 aligned!
dec	4,%o3	lduw	[%o2],%o5	lduw	[%o1+4],%g1	lduw	[%o2+4],%g2	lduw	[%o1+8],%g3	lduw	[%o2+8],%g4	addccc	%o5,%o4,%o5	stuw	%o5,[%o0]	lduw	[%o1+12],%o4	lduw	[%o2+12],%o5	inc	16,%o1	addccc	%g1,%g2,%g1	stuw	%g1,[%o0+4]		inc	16,%o2	addccc	%g3,%g4,%g3	stuw	%g3,[%o0+8]	inc	16,%o0	addccc	%o5,%o4,%o5	stuw	%o5,[%o0-4]	and	%o3,-4,%g1	brnz,a,pt	%g1,.L_bn_add_words_loop	lduw	[%o1],%o4	brnz,a,pn	%o3,.L_bn_add_words_tail	lduw	[%o1],%o4.L_bn_add_words_return:	clr	%o0	retl	movcs	%icc,1,%o0	nop.L_bn_add_words_tail:	lduw	[%o2],%o5	dec	%o3	addccc	%o5,%o4,%o5	brz,pt	%o3,.L_bn_add_words_return	stuw	%o5,[%o0]	lduw	[%o1+4],%o4	lduw	[%o2+4],%o5	dec	%o3	addccc	%o5,%o4,%o5	brz,pt	%o3,.L_bn_add_words_return	stuw	%o5,[%o0+4]	lduw	[%o1+8],%o4	lduw	[%o2+8],%o5	addccc	%o5,%o4,%o5	stuw	%o5,[%o0+8]	clr	%o0
sparcv8plus.s - 源码说明

本页面展示了「mediastreamer2是开源的网络传输媒体流的库」中的 sparcv8plus.s 源码文件，采用 S 编程语言编写，共 1,536 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与mediastreamer2相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?