📄 ppc.pl

📁 mediastreamer2是开源的网络传输媒体流的库
💻 PL
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%
#
#	If you have comments or suggestions to improve code send
#	me a note at schari@us.ibm.com
#

# Target selection.
#
# The first command-line argument is the name of the assembly file to
# generate.  A name containing "32.s" selects the 32-bit mnemonic set, a
# name containing "64.s" the 64-bit one; any other name is rejected.
#
# The variables assigned below are deliberately package globals: the
# assembly template interpolates them ($LD, $ST, $UMULL, $BNSZ, $ISA, ...)
# so that one template serves both word sizes.
#
#   $BITS  word size in bits
#   $BNSZ  word size in bytes
#   $ISA   operand of the .machine directive, including its double quotes

$opf = shift;
die "usage: $0 <output file name containing 32.s or 64.s> [any-value]\n"
    unless defined $opf;

if ($opf =~ /32\.s/) {
    $BITS  = 32;
    $BNSZ  = $BITS/8;
    $ISA   = "\"ppc\"";

    $LD    = "lwz";       # load
    $LDU   = "lwzu";      # load and update
    $ST    = "stw";       # store
    $STU   = "stwu";      # store and update
    $UMULL = "mullw";     # unsigned multiply low
    $UMULH = "mulhwu";    # unsigned multiply high
    $UDIV  = "divwu";     # unsigned divide
    $UCMPI = "cmplwi";    # unsigned compare with immediate
    $UCMP  = "cmplw";     # unsigned compare
    $CNTLZ = "cntlzw";    # count leading zeros
    $SHL   = "slw";       # shift left
    $SHR   = "srw";       # unsigned shift right
    $SHRI  = "srwi";      # unsigned shift right by immediate
    $SHLI  = "slwi";      # shift left by immediate
    $CLRU  = "clrlwi";    # clear upper bits
    $INSR  = "insrwi";    # insert right
    $ROTL  = "rotlwi";    # rotate left by immediate
    $TR    = "tw";        # conditional trap
}
elsif ($opf =~ /64\.s/) {
    $BITS  = 64;
    $BNSZ  = $BITS/8;
    $ISA   = "\"ppc64\"";

    # same as above, but 64-bit mnemonics...
    $LD    = "ld";        # load
    $LDU   = "ldu";       # load and update
    $ST    = "std";       # store
    $STU   = "stdu";      # store and update
    $UMULL = "mulld";     # unsigned multiply low
    $UMULH = "mulhdu";    # unsigned multiply high
    $UDIV  = "divdu";     # unsigned divide
    $UCMPI = "cmpldi";    # unsigned compare with immediate
    $UCMP  = "cmpld";     # unsigned compare
    $CNTLZ = "cntlzd";    # count leading zeros
    $SHL   = "sld";       # shift left
    $SHR   = "srd";       # unsigned shift right
    $SHRI  = "srdi";      # unsigned shift right by immediate
    $SHLI  = "sldi";      # shift left by immediate
    $CLRU  = "clrldi";    # clear upper bits
    $INSR  = "insrdi";    # insert right
    $ROTL  = "rotldi";    # rotate left by immediate
    $TR    = "td";        # conditional trap
}
else {
    die "nonsense $opf";
}

# Unless a second (defined) command-line argument is present, everything
# printed from here on goes to the requested file.  The three-argument form
# of open keeps characters in the file name from being read as part of the
# open mode.
unless (defined shift) {
    open STDOUT, '>', $opf or die "can't open $opf: $!";
}

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL, i.e.
# we add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'cleverer' ways of doing these substitutions tend to hide data we
# need to be obvious.
#
# Every public routine defined by the assembly template.  The template
# spells each one with a leading dot (".bn_sqr_comba4"); the do_* routines
# below rewrite that spelling for the platform being targeted.
my @items = (
    "bn_sqr_comba4",
    "bn_sqr_comba8",
    "bn_mul_comba4",
    "bn_mul_comba8",
    "bn_sub_words",
    "bn_add_words",
    "bn_div_words",
    "bn_sqr_words",
    "bn_mul_words",
    "bn_mul_add_words",
);

# The platform is picked from the output file name; anything that is not
# linux, aix or osx gets the BSD treatment.
if    ($opf =~ /linux/) { do_linux(); }
elsif ($opf =~ /aix/)   { do_aix();   }
elsif ($opf =~ /osx/)   { do_osx();   }
else                    { do_bsd();   }

# Linux: print the template with symbols rewritten for the GNU toolchain.
#
# 64-bit: every ".name:" label is preceded by an entry for "name" in the
#         ".opd" section (a .quad triple of .name, .TOC.@tocbase and 0),
#         followed by .type/.globl directives for ".name".
# 32-bit: the leading dot is simply dropped from every public symbol.
sub do_linux {
    my $asm = data();

    if ($BITS == 64) {
        foreach my $sym (@items) {
            # Text that replaces the bare ".name:" label.  It starts with a
            # newline and keeps the blank line after ".previous".
            my $descriptor = join("\n",
                "",
                "\t.section\t\".opd\",\"aw\"",
                "\t.align\t3",
                "\t.globl\t${sym}",
                "${sym}:",
                "\t.quad\t.${sym},.TOC.\@tocbase,0",
                "\t.size\t${sym},24",
                "\t.previous\n",
                "\t.type\t.${sym},\@function",
                "\t.globl\t.${sym}",
                ".${sym}:",
            );
            $asm =~ s/\.\Q$sym\E:/$descriptor/g;
        }
    }
    else {
        foreach my $sym (@items) {
            $asm =~ s/\.\Q$sym\E/$sym/g;
        }
    }

    # hide internal labels to avoid pollution of name table...
    $asm =~ s/Lppcasm_/.Lppcasm_/gm;
    print $asm;
}

# AIX: the template is printed unchanged.
sub do_aix {
    # AIX assembler is smart enough to please the linker without
    # making us do something special...
    print data();
}

# MacOSX 32 bit
sub do_osx {
    my $asm = data();

    # Change the bn symbol prefix from '.' to '_'
    foreach my $sym (@items) {
        $asm =~ s/\.\Q$sym\E/_$sym/g;
    }

    # Change .machine to something OS X asm will accept
    $asm =~ s/\.machine.*/.text/g;
    $asm =~ s/\#/;/g; # change comment from '#' to ';'
    print $asm;
}

# BSD (Untested)
sub do_bsd {
    my $asm = data();

    # Change the bn symbol prefix from '.' to '_'
    foreach my $sym (@items) {
        $asm =~ s/\.\Q$sym\E/_$sym/g;
    }
    print $asm;
}

sub data {
	local($data)=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32 bitPowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
#	   cleaned up code. 
Also made a single version which can#	   be used for both the AIX and Linux compilers. See NOTE#	   below.#				12/05/03		Suresh Chari#			(with lots of help from)        Andy Polyakov##	#	1. Initial version	10/20/02		Suresh Chari###	The following file works for the xlc,cc#	and gcc compilers.##	NOTE:	To get the file to link correctly with the gcc compiler#	        you have to change the names of the routines and remove#		the first .(dot) character. This should automatically#		be done in the build process.##	Hand optimized assembly code for the following routines#	#	bn_sqr_comba4#	bn_sqr_comba8#	bn_mul_comba4#	bn_mul_comba8#	bn_sub_words#	bn_add_words#	bn_div_words#	bn_sqr_words#	bn_mul_words#	bn_mul_add_words##	NOTE:	It is possible to optimize this code more for#	specific PowerPC or Power architectures. On the Northstar#	architecture the optimizations in this file do#	 NOT provide much improvement.##	If you have comments or suggestions to improve code send#	me a note at schari\@us.ibm.com##--------------------------------------------------------------------------##	Defines to be used in the assembly code.#	.set r0,0	# we use it as storage for value of 0.set SP,1	# preserved.set RTOC,2	# preserved .set r3,3	# 1st argument/return value.set r4,4	# 2nd argument/volatile register.set r5,5	# 3rd argument/volatile register.set r6,6	# ....set r7,7.set r8,8.set r9,9.set r10,10.set r11,11.set r12,12.set r13,13	# not used, nor any other "below" it....set BO_IF_NOT,4.set BO_IF,12.set BO_dCTR_NZERO,16.set BO_dCTR_ZERO,18.set BO_ALWAYS,20.set CR0_LT,0;.set CR0_GT,1;.set CR0_EQ,2.set CR1_FX,4;.set CR1_FEX,5;.set CR1_VX,6.set LR,8#	Declare function names to be global#	NOTE:	For gcc these names MUST be changed to remove#	        the first . i.e. for example change ".bn_sqr_comba4"#		to "bn_sqr_comba4". This should be automatically done#		in the build.		
.globl	.bn_sqr_comba4	.globl	.bn_sqr_comba8	.globl	.bn_mul_comba4	.globl	.bn_mul_comba8	.globl	.bn_sub_words	.globl	.bn_add_words	.globl	.bn_div_words	.globl	.bn_sqr_words	.globl	.bn_mul_words	.globl	.bn_mul_add_words	# .text section		.machine	$ISA##	NOTE:	The following label name should be changed to#		"bn_sqr_comba4" i.e. remove the first dot#		for the gcc compiler. This should be automatically#		done in the build#.align	4.bn_sqr_comba4:## Optimized version of bn_sqr_comba4.## void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)# r3 contains r# r4 contains a## Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:	# # r5,r6 are the two BN_ULONGs being multiplied.# r7,r8 are the results of the 32x32 giving 64 bit multiply.# r9,r10, r11 are the equivalents of c1,c2, c3.# Here's the assembly##	xor		r0,r0,r0		# set r0 = 0. Used in the addze						# instructions below							#sqr_add_c(a,0,c1,c2,c3)	$LD		r5,`0*$BNSZ`(r4)			$UMULL		r9,r5,r5			$UMULH		r10,r5,r5		#in first iteration. No need						#to add since c1=c2=c3=0.						# Note c3(r11) is NOT set to 0						# but will be.	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;						# sqr_add_c2(a,1,0,c2,c3,c1);	$LD		r6,`1*$BNSZ`(r4)			$UMULL		r7,r5,r6	$UMULH		r8,r5,r6						addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)	adde		r8,r8,r8	addze		r9,r0			# catch carry if any.						# r9= r0(=0) and carry 		addc		r10,r7,r10		# now add to temp result.	
addze		r11,r8                  # r8 added to r11 which is 0 	addze		r9,r9		$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2; 						#sqr_add_c(a,1,c3,c1,c2)	$UMULL		r7,r6,r6	$UMULH		r8,r6,r6	addc		r11,r7,r11	adde		r9,r8,r9	addze		r10,r0						#sqr_add_c2(a,2,0,c3,c1,c2)	$LD		r6,`2*$BNSZ`(r4)	$UMULL		r7,r5,r6	$UMULH		r8,r5,r6		addc		r7,r7,r7	adde		r8,r8,r8	addze		r10,r10		addc		r11,r7,r11	adde		r9,r8,r9	addze		r10,r10	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3 						#sqr_add_c2(a,3,0,c1,c2,c3);	$LD		r6,`3*$BNSZ`(r4)			$UMULL		r7,r5,r6	$UMULH		r8,r5,r6	addc		r7,r7,r7	adde		r8,r8,r8	addze		r11,r0		addc		r9,r7,r9	adde		r10,r8,r10	addze		r11,r11						#sqr_add_c2(a,2,1,c1,c2,c3);	$LD		r5,`1*$BNSZ`(r4)	$LD		r6,`2*$BNSZ`(r4)	$UMULL		r7,r5,r6	$UMULH		r8,r5,r6		addc		r7,r7,r7	adde		r8,r8,r8	addze		r11,r11	addc		r9,r7,r9	adde		r10,r8,r10	addze		r11,r11	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1						#sqr_add_c(a,2,c2,c3,c1);	$UMULL		r7,r6,r6	$UMULH		r8,r6,r6	addc		r10,r7,r10	adde		r11,r8,r11	addze		r9,r0						#sqr_add_c2(a,3,1,c2,c3,c1);	$LD		r6,`3*$BNSZ`(r4)			$UMULL		r7,r5,r6	$UMULH		r8,r5,r6	addc		r7,r7,r7	adde		r8,r8,r8	addze		r9,r9		addc		r10,r7,r10	adde		r11,r8,r11	addze		r9,r9	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2						#sqr_add_c2(a,3,2,c3,c1,c2);	$LD		r5,`2*$BNSZ`(r4)			$UMULL		r7,r5,r6	$UMULH		r8,r5,r6	addc		r7,r7,r7	adde		r8,r8,r8	addze		r10,r0		addc		r11,r7,r11	adde		r9,r8,r9	addze		r10,r10	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3						#sqr_add_c(a,3,c1,c2,c3);	$UMULL		r7,r6,r6			$UMULH		r8,r6,r6	addc		r9,r7,r9	adde		r10,r8,r10	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2	bclr	BO_ALWAYS,CR0_LT	.long	0x00000000##	NOTE:	The following label name should be changed to#		"bn_sqr_comba8" i.e. remove the first dot#		for the gcc compiler. 
This should be automatically#		done in the build#	.align	4.bn_sqr_comba8:## This is an optimized version of the bn_sqr_comba8 routine.# Tightly uses the adde instruction### void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)# r3 contains r# r4 contains a## Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:	# # r5,r6 are the two BN_ULONGs being multiplied.# r7,r8 are the results of the 32x32 giving 64 bit multiply.# r9,r10, r11 are the equivalents of c1,c2, c3.## Possible optimization of loading all 8 longs of a into registers# doesnt provide any speedup# 	xor		r0,r0,r0		#set r0 = 0.Used in addze						#instructions below.						#sqr_add_c(a,0,c1,c2,c3);	$LD		r5,`0*$BNSZ`(r4)	$UMULL		r9,r5,r5		#1st iteration:	no carries.	$UMULH		r10,r5,r5	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;						#sqr_add_c2(a,1,0,c2,c3,c1);
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -