⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 submul_1.asm

📁 a very popular packet of cryptography tools,it encloses the most common used algorithm and protocols
💻 ASM
📖 第 1 页 / 共 2 页
字号:
dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.
dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
dnl  MA 02111-1307, USA.
dnl  This approaches ?? cycles/limb on PA8000 and 6.75 cycles/limb on PA8500
dnl  for huge operands.
dnl  The feed-in and wind-down code has not yet been scheduled.  Many cycles
dnl  could be saved there per call.
dnl  DESCRIPTION:
dnl  The main loop "BIG" is 4-way unrolled, mainly to allow
dnl  effective use of ADD,DC.  Delays in moving data via the cache from the FP
dnl  registers to the IU registers, have demanded a deep software pipeline, and
dnl  a lot of stack slots for partial products in flight.
dnl
dnl  CODE STRUCTURE:
dnl  save-some-registers
dnl  do 0, 1, 2, or 3 limbs
dnl  if done, restore-some-regs and return
dnl  save-many-regs
dnl  do 4, 8, ... limb
dnl  restore-all-regs
dnl  STACK LAYOUT:
dnl  HP-PA stack grows upwards.  
We could allocate 8 fewer slots by using thednl  slots marked FREE, as well as some slots in the caller's "frame marker".dnldnl -00 <- r30dnl -08  FREEdnl -10  tmpdnl -18  tmpdnl -20  tmpdnl -28  tmpdnl -30  tmpdnl -38  tmpdnl -40  tmpdnl -48  tmpdnl -50  tmpdnl -58  tmpdnl -60  tmpdnl -68  tmpdnl -70  tmpdnl -78  tmpdnl -80  tmpdnl -88  tmpdnl -90  FREEdnl -98  FREEdnl -a0  FREEdnl -a8  FREEdnl -b0  r13dnl -b8  r12dnl -c0  r11dnl -c8  r10dnl -d0  r8dnl -d8  r8dnl -e0  r7dnl -e8  r6dnl -f0  r5dnl -f8  r4dnl -100 r3dnl  Previous frame:dnl  [unused area]dnl -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.include(`../config.m4')dnl INPUT PARAMETERS:define(`rp',`%r26')	dnldefine(`up',`%r25')	dnldefine(`n',`%r24')	dnldefine(`vlimb',`%r23')	dnldefine(`climb',`%r23')	dnlifdef(`HAVE_ABI_2_0w',`	.level	2.0w',`	.level	2.0')PROLOGUE(mpn_submul_1)ifdef(`HAVE_ABI_2_0w',`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot')	std,ma		%r3, 0x100(%r30)	std		%r4, -0xf8(%r30)	std		%r5, -0xf0(%r30)	ldo		0(%r0), climb		C clear climb	fldd		-0x138(%r30), %fr8	C put vlimb in fp registerdefine(`p032a1',`%r1')	dnldefine(`p032a2',`%r19')	dnldefine(`m032',`%r20')	dnldefine(`m096',`%r21')	dnldefine(`p000a',`%r22')	dnldefine(`p064a',`%r29')	dnldefine(`s000',`%r31')	dnldefine(`ma000',`%r4')	dnldefine(`ma064',`%r20')	dnldefine(`r000',`%r3')	dnl	extrd,u		n, 63, 2, %r5	cmpb,=		%r5, %r0, L(BIG)	nop	fldd		0(up), %fr4	ldo		8(up), up	xmpyu		%fr8R, %fr4L, %fr22	xmpyu		%fr8L, %fr4R, %fr23	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71	xmpyu		%fr8R, %fr4R, %fr24	xmpyu		%fr8L, %fr4L, %fr25	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79	addib,<>	-1, %r5, L(two_or_more)	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61L(one)	ldd		-0x78(%r30), p032a1	ldd		-0x70(%r30), p032a2	ldd		-0x80(%r30), p000a	b		L(0_one_out)	ldd		-0x68(%r30), p064aL(two_or_more)	fldd		0(up), %fr4	ldo		8(up), up	xmpyu		%fr8R, %fr4L, 
%fr22	xmpyu		%fr8L, %fr4R, %fr23	ldd		-0x78(%r30), p032a1	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71	xmpyu		%fr8R, %fr4R, %fr24	xmpyu		%fr8L, %fr4L, %fr25	ldd		-0x70(%r30), p032a2	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69	ldd		-0x80(%r30), p000a	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79	ldd		-0x68(%r30), p064a	addib,<>	-1, %r5, L(three_or_more)	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61L(two)	add		p032a1, p032a2, m032	add,dc		%r0, %r0, m096	depd,z		m032, 31, 32, ma000	extrd,u		m032, 31, 32, ma064	ldd		0(rp), r000	b		L(0_two_out)	depd		m096, 31, 32, ma064L(three_or_more)	fldd		0(up), %fr4	add		p032a1, p032a2, m032	add,dc		%r0, %r0, m096	depd,z		m032, 31, 32, ma000	extrd,u		m032, 31, 32, ma064	ldd		0(rp), r000dnl	addib,=		-1, %r5, L(0_out)	depd		m096, 31, 32, ma064L(oop0)dnl	xmpyu		%fr8R, %fr4L, %fr22dnl	xmpyu		%fr8L, %fr4R, %fr23dnl	ldd		-0x78(%r30), p032a1dnl	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71dnldnl	xmpyu		%fr8R, %fr4R, %fr24dnl	xmpyu		%fr8L, %fr4L, %fr25dnl	ldd		-0x70(%r30), p032a2dnl	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69dnldnl	ldo		8(rp), rpdnl	add		climb, p000a, s000dnl	ldd		-0x80(%r30), p000adnl	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79dnldnl	add,dc		p064a, %r0, climbdnl	ldo		8(up), updnl	ldd		-0x68(%r30), p064adnl	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61dnldnl	add		ma000, s000, s000dnl	add,dc		ma064, climb, climbdnl	fldd		0(up), %fr4dnldnl	sub		r000, s000, s000dnl	sub,db		%r0, climb, climbdnl	sub		%r0, climb, climbdnl	std		s000, -8(rp)dnldnl	add		p032a1, p032a2, m032dnl	add,dc		%r0, %r0, m096dnldnl	depd,z		m032, 31, 32, ma000dnl	extrd,u		m032, 31, 32, ma064dnl	ldd		0(rp), r000dnl	addib,<>	-1, %r5, L(oop0)dnl	depd		m096, 31, 32, ma064L(0_out)	ldo		8(up), up	xmpyu		%fr8R, %fr4L, %fr22	xmpyu		%fr8L, %fr4R, %fr23	ldd		-0x78(%r30), p032a1	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71	xmpyu		%fr8R, %fr4R, %fr24	xmpyu		%fr8L, %fr4L, 
%fr25	ldd		-0x70(%r30), p032a2	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69	ldo		8(rp), rp	add		climb, p000a, s000	ldd		-0x80(%r30), p000a	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79	add,dc		p064a, %r0, climb	ldd		-0x68(%r30), p064a	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61	add		ma000, s000, s000	add,dc		ma064, climb, climb	sub		r000, s000, s000	sub,db		%r0, climb, climb	sub		%r0, climb, climb	std		s000, -8(rp)	add		p032a1, p032a2, m032	add,dc		%r0, %r0, m096	depd,z		m032, 31, 32, ma000	extrd,u		m032, 31, 32, ma064	ldd		0(rp), r000	depd		m096, 31, 32, ma064L(0_two_out)	ldd		-0x78(%r30), p032a1	ldd		-0x70(%r30), p032a2	ldo		8(rp), rp	add		climb, p000a, s000	ldd		-0x80(%r30), p000a	add,dc		p064a, %r0, climb	ldd		-0x68(%r30), p064a	add		ma000, s000, s000	add,dc		ma064, climb, climb	sub		r000, s000, s000	sub,db		%r0, climb, climb	sub		%r0, climb, climb	std		s000, -8(rp)L(0_one_out)	add		p032a1, p032a2, m032	add,dc		%r0, %r0, m096	depd,z		m032, 31, 32, ma000	extrd,u		m032, 31, 32, ma064	ldd		0(rp), r000	depd		m096, 31, 32, ma064	add		climb, p000a, s000	add,dc		p064a, %r0, climb	add		ma000, s000, s000	add,dc		ma064, climb, climb	sub		r000, s000, s000	sub,db		%r0, climb, climb	sub		%r0, climb, climb	std		s000, 0(rp)	cmpib,>=	4, n, L(done)	ldo		8(rp), rpdnl 4-way unrolled code.L(BIG)define(`p032a1',`%r1')	dnldefine(`p032a2',`%r19')	dnldefine(`p096b1',`%r20')	dnldefine(`p096b2',`%r21')	dnldefine(`p160c1',`%r22')	dnldefine(`p160c2',`%r29')	dnldefine(`p224d1',`%r31')	dnldefine(`p224d2',`%r3')	dnl			dnldefine(`m032',`%r4')	dnldefine(`m096',`%r5')	dnldefine(`m160',`%r6')	dnldefine(`m224',`%r7')	dnldefine(`m288',`%r8')	dnl			dnldefine(`p000a',`%r1')	dnldefine(`p064a',`%r19')	dnldefine(`p064b',`%r20')	dnldefine(`p128b',`%r21')	dnldefine(`p128c',`%r22')	dnldefine(`p192c',`%r29')	dnldefine(`p192d',`%r31')	dnldefine(`p256d',`%r3')	dnl			dnldefine(`s000',`%r10')	dnldefine(`s064',`%r11')	dnldefine(`s128',`%r12')	dnldefine(`s192',`%r13')	dnl			
dnldefine(`ma000',`%r9')	dnldefine(`ma064',`%r4')	dnldefine(`ma128',`%r5')	dnldefine(`ma192',`%r6')	dnldefine(`ma256',`%r7')	dnl			dnldefine(`r000',`%r1')	dnldefine(`r064',`%r19')	dnldefine(`r128',`%r20')	dnldefine(`r192',`%r21')	dnl	std		%r6, -0xe8(%r30)	std		%r7, -0xe0(%r30)	std		%r8, -0xd8(%r30)	std		%r9, -0xd0(%r30)	std		%r10, -0xc8(%r30)	std		%r11, -0xc0(%r30)	std		%r12, -0xb8(%r30)	std		%r13, -0xb0(%r30)ifdef(`HAVE_ABI_2_0w',`	extrd,u		n, 61, 62, n		C right shift 2',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend')L(4_or_more)	fldd		0(up), %fr4	fldd		8(up), %fr5	fldd		16(up), %fr6

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -