⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 aes-ia64.s

📁 mediastreamer2是开源的网络传输媒体流的库
💻 S
📖 第 1 页 / 共 5 页
字号:
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// What's wrong with compiler generated code? Compiler never uses
// variable 'shr' which is pairable with 'extr'/'dep' instructions.
// Then it uses 'zxt' which is an I-type, but can be replaced with
// 'and' which in turn can be assigned to M-port [there're double as
// much M-ports as there're I-ports on Itanium 2]. By sacrificing few
// registers for small constants (255, 24 and 16) to be used with
// 'shr' and 'and' instructions I can achieve better ILP, Instruction
// Level Parallelism, and performance. This code outperforms GCC 3.3
// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
// HP C - by 40%. Measured best-case scenario, i.e. aligned
// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
// ticks per block, or 9.25 CPU cycles per byte for 128 bit key.

.ident	"aes-ia64.S, version 1.1"
.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
.explicit
.text

// Register roles. rk0/rk1 walk the key schedule (even/odd words),
// maskff/twenty4/sixteen hold the constants 0xff/24/16 set up by the
// caller, and teXY are scratch registers for table index/lookup values.
rk0=r8;     rk1=r9;
prsave=r10;
maskff=r11;
twenty4=r14;
sixteen=r15;

te00=r16;   te11=r17;   te22=r18;   te33=r19;
te01=r20;   te12=r21;   te23=r22;   te30=r23;
te02=r24;   te13=r25;   te20=r26;   te31=r27;
te03=r28;   te10=r29;   te21=r30;   te32=r31;

// these are rotating...
// (each tN sits one register below sN, so a register rotation at
// br.ctop makes the freshly computed tN visible as sN on the next pass)
t0=r32;     s0=r33;
t1=r34;     s1=r35;
t2=r36;     s2=r37;
t3=r38;     s3=r39;

te0=r40;    te1=r41;    te2=r42;    te3=r43;

#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP	addp4
# define KSZ	4
# define LDKEY	ld4
#else
# define ADDP	add
#endif

// This implies that AES_KEY comprises 32-bit key schedule elements
// even on LP64 platforms.
#ifndef	KSZ
# define KSZ	4
# define LDKEY	ld4
#endif

.proc	_ia64_AES_encrypt#
// Internal block-encryption core; called via b6, not a C-callable entry.
//
// Input:	rk0-rk1
//		te0
//		te3	as AES_KEY->rounds!!!
//		s0-s3
//		maskff,twenty4,sixteen
// Output:	r16,r20,r24,r28 as s0-s3
// Clobber:	r16-r31,rk0-rk1,r32-r43
//
// te1/te2/te3 are derived here as te0+1024/+2048/+3072. The loop at
// .Le_top is driven by br.ctop with ar.lc=rounds-3 and ar.ec=3.
// NOTE(review): under p17 all four table pointers are advanced by 4096,
// presumably to switch to the final-round tables -- confirm against the
// AES_Te layout, which is not visible in this excerpt.
.align	32
_ia64_AES_encrypt:
{ .mmi;	alloc	r16=ar.pfs,12,0,0,8
	LDKEY	t0=[rk0],2*KSZ
	mov	pr.rot=1<<16	}
{ .mmi;	LDKEY	t1=[rk1],2*KSZ
	add	te1=1024,te0
	add	te3=-3,te3	};;

{ .mib;	LDKEY	t2=[rk0],2*KSZ
	mov	ar.ec=3		}
{ .mib;	LDKEY	t3=[rk1],2*KSZ
	add	te2=2048,te0
	brp.loop.imp	.Le_top,.Le_end-16	};;

// Initial AddRoundKey: s[i] ^= rk[i]; te3 stops being the round count
// and becomes the fourth table pointer once ar.lc has been loaded.
{ .mmi;	xor	s0=s0,t0
	xor	s1=s1,t1
	mov	ar.lc=te3	}
{ .mmi;	xor	s2=s2,t2
	xor	s3=s3,t3
	add	te3=3072,te0	};;

// Trailing comments below read "<group>/<word>:<what>", where <group>
// appears to be the instruction-group number within one pass and <word>
// the output state word (0-3) the operation contributes to.
.align	32
.Le_top:
{ .mmi;	(p0)	LDKEY	t0=[rk0],2*KSZ		// 0/0:rk[0]
	(p0)	and	te33=s3,maskff		// 0/0:s3&0xff
	(p0)	extr.u	te22=s2,8,8	}	// 0/0:s2>>8&0xff
{ .mmi;	(p0)	LDKEY	t1=[rk1],2*KSZ		// 0/1:rk[1]
	(p0)	and	te30=s0,maskff		// 0/1:s0&0xff
	(p0)	shr.u	te00=s0,twenty4	};;	// 0/0:s0>>24
{ .mmi;	(p0)	LDKEY	t2=[rk0],2*KSZ		// 1/2:rk[2]
	(p0)	shladd	te33=te33,2,te3		// 1/0:te3+s3&0xff
	(p0)	extr.u	te23=s3,8,8	}	// 1/1:s3>>8&0xff
{ .mmi;	(p0)	LDKEY	t3=[rk1],2*KSZ		// 1/3:rk[3]
	(p0)	shladd	te30=te30,2,te3		// 1/1:te3+s0
	(p0)	shr.u	te01=s1,twenty4	};;	// 1/1:s1>>24
{ .mmi;	(p0)	ld4	te33=[te33]		// 2/0:te3[s3&0xff]
	(p0)	shladd	te22=te22,2,te2		// 2/0:te2+s2>>8&0xff
	(p0)	extr.u	te20=s0,8,8	}	// 2/2:s0>>8&0xff
{ .mmi;	(p0)	ld4	te30=[te30]		// 2/1:te3[s0]
	(p0)	shladd	te23=te23,2,te2		// 2/1:te2+s3>>8
	(p0)	shr.u	te02=s2,twenty4	};;	// 2/2:s2>>24
{ .mmi;	(p0)	ld4	te22=[te22]		// 3/0:te2[s2>>8]
	(p0)	shladd	te20=te20,2,te2		// 3/2:te2+s0>>8
	(p0)	extr.u	te21=s1,8,8	}	// 3/3:s1>>8&0xff
{ .mmi;	(p0)	ld4	te23=[te23]		// 3/1:te2[s3>>8]
	(p0)	shladd	te00=te00,2,te0		// 3/0:te0+s0>>24
	(p0)	shr.u	te03=s3,twenty4	};;	// 3/3:s3>>24
{ .mmi;	(p0)	ld4	te20=[te20]		// 4/2:te2[s0>>8]
	(p0)	shladd	te21=te21,2,te2		// 4/3:te2+s1>>8
	(p0)	extr.u	te11=s1,16,8	}	// 4/0:s1>>16&0xff
{ .mmi;	(p0)	ld4	te00=[te00]		// 4/0:te0[s0>>24]
	(p0)	shladd	te01=te01,2,te0		// 4/1:te0+s1>>24
	(p0)	shr.u	te13=s3,sixteen	};;	// 4/2:s3>>16
{ .mmi;	(p0)	ld4	te21=[te21]		// 5/3:te2[s1>>8]
	(p0)	shladd	te11=te11,2,te1		// 5/0:te1+s1>>16
	(p0)	extr.u	te12=s2,16,8	}	// 5/1:s2>>16&0xff
{ .mmi;	(p0)	ld4	te01=[te01]		// 5/1:te0[s1>>24]
	(p0)	shladd	te02=te02,2,te0		// 5/2:te0+s2>>24
	(p0)	and	te31=s1,maskff	};;	// 5/2:s1&0xff
{ .mmi;	(p0)	ld4	te11=[te11]		// 6/0:te1[s1>>16]
	(p0)	shladd	te12=te12,2,te1		// 6/1:te1+s2>>16
	(p0)	extr.u	te10=s0,16,8	}	// 6/3:s0>>16&0xff
{ .mmi;	(p0)	ld4	te02=[te02]		// 6/2:te0[s2>>24]
	(p0)	shladd	te03=te03,2,te0		// 6/3:te0+s3>>24
	(p0)	and	te32=s2,maskff	};;	// 6/3:s2&0xff

{ .mmi;	(p0)	ld4	te12=[te12]		// 7/1:te1[s2>>16]
	(p0)	shladd	te31=te31,2,te3		// 7/2:te3+s1&0xff
	(p0)	and	te13=te13,maskff}	// 7/2:s3>>16&0xff
{ .mmi;	(p0)	ld4	te03=[te03]		// 7/3:te0[s3>>24]
	(p0)	shladd	te32=te32,2,te3		// 7/3:te3+s2
	(p0)	xor	t0=t0,te33	};;	// 7/0:
{ .mmi;	(p0)	ld4	te31=[te31]		// 8/2:te3[s1]
	(p0)	shladd	te13=te13,2,te1		// 8/2:te1+s3>>16
	(p0)	xor	t0=t0,te22	}	// 8/0:
{ .mmi;	(p0)	ld4	te32=[te32]		// 8/3:te3[s2]
	(p0)	shladd	te10=te10,2,te1		// 8/3:te1+s0>>16
	(p0)	xor	t1=t1,te30	};;	// 8/1:
{ .mmi;	(p0)	ld4	te13=[te13]		// 9/2:te1[s3>>16]
	(p0)	xor	t0=t0,te00		// 9/0:
	(p0)	xor	t1=t1,te23	}	// 9/1:
{ .mmi;	(p0)	ld4	te10=[te10]		// 9/3:te1[s0>>16]
	(p0)	xor	t2=t2,te20		// 9/2:
	(p0)	xor	t3=t3,te21	};;	// 9/3:
{ .mmi;	(p0)	xor	t0=t0,te11		// 10/0:done!
	(p0)	xor	t1=t1,te01		// 10/1:
	(p0)	xor	t2=t2,te02	}	// 10/2:
{ .mmi;	(p0)	xor	t3=t3,te03		// 10/3:
	(p16)	cmp.eq	p0,p17=r0,r0 	};;	// 10/clear (p17)
{ .mmi;	(p0)	xor	t1=t1,te12		// 11/1:done!
	(p0)	xor	t2=t2,te31		// 11/2:
	(p0)	xor	t3=t3,te32	}	// 11/3:
{ .mmi;	(p17)	add	te0=4096,te0		// 11/
	(p17)	add	te1=4096,te1	};;	// 11/
{ .mib;	(p0)	xor	t2=t2,te13		// 12/2:done!
	(p0)	xor	t3=t3,te10	}	// 12/3:done!
{ .mib;	(p17)	add	te2=4096,te2		// 12/
	(p17)	add	te3=4096,te3		// 12/
	br.ctop.sptk	.Le_top		};;
.Le_end:
// Copy the final state into the static registers the caller reads.
{ .mib;	mov	r16=s0
	mov	r20=s1			}
{ .mib;	mov	r24=s2
	mov	r28=s3
	br.ret.sptk	b6		};;
.endp	_ia64_AES_encrypt#

// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
//
// C-callable wrapper: loads the 16-byte input into out1/out3/out5/out7
// (which become s0-s3 in the callee's frame), sets up rk0/rk1 and the
// constants, calls _ia64_AES_encrypt through b6 and stores r16/r20/r24/r28
// to the output buffer. ar.pfs, ar.lc and the predicates are saved in
// r2, r3 and prsave respectively.
.global	AES_encrypt#
.proc	AES_encrypt#
.align	32
.skip	16
AES_encrypt:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
	.save	ar.lc,r3
{ .mmi;	alloc	r2=ar.pfs,3,0,12,0
	addl	out8=@ltoff(AES_Te#),gp
	mov	r3=ar.lc		}
{ .mmi;	and	out0=3,in0
	ADDP	in0=0,in0
	ADDP	out11=KSZ*60,in2	};;	// &AES_KEY->rounds

	.body
{ .mmi;	ld8	out8=[out8]			// Te0
	ld4	out11=[out11]			// AES_KEY->rounds
	mov	prsave=pr		}

#if defined(_HPUX_SOURCE)	// HPUX is big-endian, cut 15+15 cycles...
// Fast path: word loads/stores when in/out are 4-byte aligned.
{ .mib;	cmp.ne	p6,p0=out0,r0
	add	out0=4,in0
(p6)	br.dpnt.many	.Le_i_unaligned	};;

{ .mmi;	ld4	out1=[in0],8		// s0
	and	out9=3,in1
	mov	twenty4=24		}
{ .mmi;	ld4	out3=[out0],8		// s1
	ADDP	rk0=0,in2
	mov	sixteen=16		};;
{ .mmi;	ld4	out5=[in0]		// s2
	cmp.ne	p6,p0=out9,r0
	mov	maskff=0xff		}
{ .mmb;	ld4	out7=[out0]		// s3
	ADDP	rk1=KSZ,in2
	br.call.sptk.many	b6=_ia64_AES_encrypt	};;

{ .mib;	ADDP	in0=4,in1
	ADDP	in1=0,in1
(p6)	br.spnt	.Le_o_unaligned		};;

{ .mii;	mov	ar.pfs=r2
	mov	ar.lc=r3		}
{ .mmi;	st4	[in1]=r16,8		// s0
	st4	[in0]=r20,8		// s1
	mov	pr=prsave,0x1ffff	};;
{ .mmb;	st4	[in1]=r24		// s2
	st4	[in0]=r28		// s3
	br.ret.sptk.many	b0	};;
#endif

// Byte-wise input path: assemble each 32-bit state word from four ld1
// loads, first byte in bits 31..24 (i.e. big-endian byte order).
.align	32
.Le_i_unaligned:
{ .mmi;	add	out0=1,in0
	add	out2=2,in0
	add	out4=3,in0	};;
{ .mmi;	ld1	r16=[in0],4
	ld1	r17=[out0],4	}//;;
{ .mmi;	ld1	r18=[out2],4
	ld1	out1=[out4],4	};;	// s0
{ .mmi;	ld1	r20=[in0],4
	ld1	r21=[out0],4	}//;;
{ .mmi;	ld1	r22=[out2],4
	ld1	out3=[out4],4	};;	// s1
{ .mmi;	ld1	r24=[in0],4
	ld1	r25=[out0],4	}//;;
{ .mmi;	ld1	r26=[out2],4
	ld1	out5=[out4],4	};;	// s2
{ .mmi;	ld1	r28=[in0]
	ld1	r29=[out0]	}//;;
{ .mmi;	ld1	r30=[out2]
	ld1	out7=[out4]	};;	// s3

{ .mii;
	dep	out1=r16,out1,24,8	//;;
	dep	out3=r20,out3,24,8	}//;;
{ .mii;	ADDP	rk0=0,in2
	dep	out5=r24,out5,24,8	//;;
	dep	out7=r28,out7,24,8	};;
{ .mii;	ADDP	rk1=KSZ,in2
	dep	out1=r17,out1,16,8	//;;
	dep	out3=r21,out3,16,8	}//;;
{ .mii;	mov	twenty4=24
	dep	out5=r25,out5,16,8	//;;
	dep	out7=r29,out7,16,8	};;
{ .mii;	mov	sixteen=16
	dep	out1=r18,out1,8,8	//;;
	dep	out3=r22,out3,8,8	}//;;
{ .mii;	mov	maskff=0xff
	dep	out5=r26,out5,8,8	//;;
	dep	out7=r30,out7,8,8	};;

{ .mib;	br.call.sptk.many	b6=_ia64_AES_encrypt	};;

// Byte-wise output path: split r16/r20/r24/r28 into bytes and store them
// with st1, most significant byte at the lowest address.
.Le_o_unaligned:
{ .mii;	ADDP	out0=0,in1
	extr.u	r17=r16,8,8			// s0
	shr.u	r19=r16,twenty4		}//;;
{ .mii;	ADDP	out1=1,in1
	extr.u	r18=r16,16,8
	shr.u	r23=r20,twenty4		}//;;	// s1
{ .mii;	ADDP	out2=2,in1
	extr.u	r21=r20,8,8
	shr.u	r22=r20,sixteen	}//;;
{ .mii;	ADDP	out3=3,in1
	extr.u	r25=r24,8,8			// s2
	shr.u	r27=r24,twenty4		};;
{ .mii;	st1	[out3]=r16,4
	extr.u	r26=r24,16,8
	shr.u	r31=r28,twenty4	}//;;	// s3
{ .mii;	st1	[out2]=r17,4
	extr.u	r29=r28,8,8
	shr.u	r30=r28,sixteen		}//;;

{ .mmi;	st1	[out1]=r18,4
	st1	[out0]=r19,4		};;
{ .mmi;	st1	[out3]=r20,4
	st1	[out2]=r21,4		}//;;
{ .mmi;	st1	[out1]=r22,4
	st1	[out0]=r23,4		};;
{ .mmi;	st1	[out3]=r24,4
	st1	[out2]=r25,4
	mov	pr=prsave,0x1ffff	}//;;
{ .mmi;	st1	[out1]=r26,4
	st1	[out0]=r27,4
	mov	ar.pfs=r2		};;
// NOTE(review): the visible text stops here. The rest of AES_encrypt
// (including its .endp) is not present in this excerpt and must be
// restored from the complete file before this will assemble.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -