📄 ppc.pl

📁 mediastreamer2是开源的网络传输媒体流的库
💻 PL
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
					#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
					#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
					#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
					#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
#	Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Returns in r3 the final borrow: 1 if the carry bit ended up
#	clear after the last word, otherwise 0.
#	Scratch registers written: r0, r6, r7, r8, ctr.
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ	# pre-bias the pointers: the loop uses
	addi	r3,r3,-$BNSZ	# update-form loads and stores that step
	addi	r5,r5,-$BNSZ	# forward by one word before each access.
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1		# keep only last bit.
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
#	Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
#	Returns in r3 the carry out of the last word added (0 or 1).
#	Scratch registers written: r0, r7, r8, ctr.
#
#       Note:	No loop unrolling done since this is not a performance
#               critical loop.

	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#	(As written, the test keeps the counter-driven loop below
#	from being entered with a count of zero, and the same
#	instruction is what clears the carry bit.)
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	bc	BO_IF,CR0_EQ,Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0			#return carry bit.
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
#	This is a cleaned up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of call to num_bits_word. Since this was compiled
#	only at level -O2 we can possibly squeeze it more?
#
#	r3 = h
#	r4 = l
#	r5 = d
#
#	Returns in r3 the value assembled from the two half-word
#	quotient digits q, or -1 when d is 0.
#	Scratch registers written: r0, r6-r12, ctr.

	$UCMPI	0,r5,0			# compare r5 and 0
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
	li	r3,-1			# d=0 return -1
	bclr	BO_ALWAYS,CR0_LT
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there're, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	bc	BO_IF,CR0_EQ,Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<< i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l <<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h -th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	1,r6,r7			# compare (tl <= r7)
	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	1,r3,r12		#compare h and th
	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9	#if (count==0) break ;
	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
#	Optimized version of bn_sqr_words
#
#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
#	Each input word produces two consecutive output words
#	(r7 is stored first, then r8), so r advances twice as
#	fast as a.
#
#	No unrolling done here. Not performance critical.

	addic.	r5,r5,0			#test r5.
	bc	BO_IF,CR0_EQ,Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
					#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Returns in r3 the carry word left in r12 after the last multiply.
# The main loop handles four words per pass; the 0-3 remaining
# words are handled one at a time after Lppcasm_mw_REM.
#
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
					#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER

					#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER

					#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# Returns in r3 the carry word left in r12 after the last
# multiply-accumulate.
#
# empirical evidence suggests that unrolled version performs best!!
#
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0 . used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
					#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

					#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

					#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

					#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
					#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
					#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
					#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000
	.align	4
EOF
	# Replace every `...` span in the template with the result of
	# evaluating its contents as Perl (e.g. `4*$BNSZ` becomes a number).
	$data =~ s/\`([^\`]*)\`/eval $1/gem;

	# if some assembler chokes on some simplified mnemonic,
	# this is the spot to fix it up, e.g.:
	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
	# assembler X doesn't accept li, load immediate value
	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
	return($data);
}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -