📄 mul_1.asm
字号:
dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
dnl MA 02111-1307, USA.
dnl This approaches ?? cycles/limb on PA8000 and 5.625 cycles/limb on PA8500
dnl for huge operands. These numbers are close to optimal.
dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
dnl could be saved there per call.
dnl DESCRIPTION:
dnl The main loop "BIG" is 4-way unrolled, mainly to allow
dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
dnl registers to the IU registers have demanded a deep software pipeline, and
dnl a lot of stack slots for partial products in flight.
dnl
dnl CODE STRUCTURE:
dnl save-some-registers
dnl do 0, 1, 2, or 3 limbs
dnl if done, restore-some-regs and return
dnl save-many-regs
dnl do 4, 8, ... limbs
dnl restore-all-regs
dnl STACK LAYOUT:
dnl HP-PA stack grows upwards.
We could allocate 8 fewer slots by using thednl slots marked FREE, as well as some slots in the caller's "frame marker".dnldnl -00 <- r30dnl -08 FREEdnl -10 tmpdnl -18 tmpdnl -20 tmpdnl -28 tmpdnl -30 tmpdnl -38 tmpdnl -40 tmpdnl -48 tmpdnl -50 tmpdnl -58 tmpdnl -60 tmpdnl -68 tmpdnl -70 tmpdnl -78 tmpdnl -80 tmpdnl -88 tmpdnl -90 FREEdnl -98 FREEdnl -a0 FREEdnl -a8 FREEdnl -b0 r13dnl -b8 r12dnl -c0 r11dnl -c8 r10dnl -d0 r9dnl -d8 r8dnl -e0 r7dnl -e8 r6dnl -f0 r5dnl -f8 r4dnl -100 r3dnl Previous frame:dnl [unused area]dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.include(`../config.m4')dnl INPUT PARAMETERS:define(`rp',`%r26') dnldefine(`up',`%r25') dnldefine(`n',`%r24') dnldefine(`vlimb',`%r23') dnldefine(`climb',`%r23') dnlifdef(`HAVE_ABI_2_0w',` .level 2.0w',` .level 2.0')PROLOGUE(mpn_mul_1)ifdef(`HAVE_ABI_2_0w',` std vlimb, -0x38(%r30) C store vlimb into "home" slot') std,ma %r3, 0x100(%r30) std %r4, -0xf8(%r30) std %r5, -0xf0(%r30) ldo 0(%r0), climb C clear climb fldd -0x138(%r30), %fr8 C put vlimb in fp registerdefine(`p032a1',`%r1') dnldefine(`p032a2',`%r19') dnldefine(`m032',`%r20') dnldefine(`m096',`%r21') dnldefine(`p000a',`%r22') dnldefine(`p064a',`%r29') dnldefine(`s000',`%r31') dnldefine(`ma000',`%r4') dnldefine(`ma064',`%r20') dnlC define(`r000',`%r3') dnl FIXME don't save r3 for n < 4. 
extrd,u n, 63, 2, %r5 cmpb,= %r5, %r0, L(BIG) nop fldd 0(up), %fr4 ldo 8(up), up xmpyu %fr8R, %fr4L, %fr22 xmpyu %fr8L, %fr4R, %fr23 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 xmpyu %fr8R, %fr4R, %fr24 xmpyu %fr8L, %fr4L, %fr25 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 addib,<> -1, %r5, L(two_or_more) fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61L(one) ldd -0x78(%r30), p032a1 ldd -0x70(%r30), p032a2 ldd -0x80(%r30), p000a b L(0_one_out) ldd -0x68(%r30), p064aL(two_or_more) fldd 0(up), %fr4 ldo 8(up), up xmpyu %fr8R, %fr4L, %fr22 xmpyu %fr8L, %fr4R, %fr23 ldd -0x78(%r30), p032a1 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 xmpyu %fr8R, %fr4R, %fr24 xmpyu %fr8L, %fr4L, %fr25 ldd -0x70(%r30), p032a2 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 ldd -0x80(%r30), p000a fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 ldd -0x68(%r30), p064a addib,<> -1, %r5, L(three_or_more) fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61L(two) add p032a1, p032a2, m032 add,dc %r0, %r0, m096 depd,z m032, 31, 32, ma000 extrd,u m032, 31, 32, ma064 b L(0_two_out) depd m096, 31, 32, ma064L(three_or_more) fldd 0(up), %fr4 add p032a1, p032a2, m032 add,dc %r0, %r0, m096 depd,z m032, 31, 32, ma000 extrd,u m032, 31, 32, ma064dnl addib,= -1, %r5, L(0_out) depd m096, 31, 32, ma064L(oop0)dnl xmpyu %fr8R, %fr4L, %fr22dnl xmpyu %fr8L, %fr4R, %fr23dnl ldd -0x78(%r30), p032a1dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71dnldnl xmpyu %fr8R, %fr4R, %fr24dnl xmpyu %fr8L, %fr4L, %fr25dnl ldd -0x70(%r30), p032a2dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69dnldnl ldo 8(rp), rpdnl add climb, p000a, s000dnl ldd -0x80(%r30), p000adnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79dnldnl add,dc p064a, %r0, climbdnl ldo 8(up), updnl ldd -0x68(%r30), p064adnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61dnldnl add ma000, s000, s000dnl add,dc ma064, climb, climbdnl 
fldd 0(up), %fr4dnldnl std s000, -8(rp)dnldnl add p032a1, p032a2, m032dnl add,dc %r0, %r0, m096dnldnl depd,z m032, 31, 32, ma000dnl extrd,u m032, 31, 32, ma064dnl addib,<> -1, %r5, L(oop0)dnl depd m096, 31, 32, ma064L(0_out) ldo 8(up), up xmpyu %fr8R, %fr4L, %fr22 xmpyu %fr8L, %fr4R, %fr23 ldd -0x78(%r30), p032a1 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 xmpyu %fr8R, %fr4R, %fr24 xmpyu %fr8L, %fr4L, %fr25 ldd -0x70(%r30), p032a2 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 ldo 8(rp), rp add climb, p000a, s000 ldd -0x80(%r30), p000a fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 add,dc p064a, %r0, climb ldd -0x68(%r30), p064a fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 add ma000, s000, s000 add,dc ma064, climb, climb std s000, -8(rp) add p032a1, p032a2, m032 add,dc %r0, %r0, m096 depd,z m032, 31, 32, ma000 extrd,u m032, 31, 32, ma064 depd m096, 31, 32, ma064L(0_two_out) ldd -0x78(%r30), p032a1 ldd -0x70(%r30), p032a2 ldo 8(rp), rp add climb, p000a, s000 ldd -0x80(%r30), p000a add,dc p064a, %r0, climb ldd -0x68(%r30), p064a add ma000, s000, s000 add,dc ma064, climb, climb std s000, -8(rp)L(0_one_out) add p032a1, p032a2, m032 add,dc %r0, %r0, m096 depd,z m032, 31, 32, ma000 extrd,u m032, 31, 32, ma064 depd m096, 31, 32, ma064 add climb, p000a, s000 add,dc p064a, %r0, climb add ma000, s000, s000 add,dc ma064, climb, climb std s000, 0(rp) cmpib,>= 4, n, L(done) ldo 8(rp), rpdnl 4-way unrolled code.L(BIG)define(`p032a1',`%r1') dnldefine(`p032a2',`%r19') dnldefine(`p096b1',`%r20') dnldefine(`p096b2',`%r21') dnldefine(`p160c1',`%r22') dnldefine(`p160c2',`%r29') dnldefine(`p224d1',`%r31') dnldefine(`p224d2',`%r3') dnl dnldefine(`m032',`%r4') dnldefine(`m096',`%r5') dnldefine(`m160',`%r6') dnldefine(`m224',`%r7') dnldefine(`m288',`%r8') dnl dnldefine(`p000a',`%r1') dnldefine(`p064a',`%r19') dnldefine(`p064b',`%r20') dnldefine(`p128b',`%r21') dnldefine(`p128c',`%r22') dnldefine(`p192c',`%r29') dnldefine(`p192d',`%r31') 
dnldefine(`p256d',`%r3') dnl dnldefine(`s000',`%r10') dnldefine(`s064',`%r11') dnldefine(`s128',`%r12') dnldefine(`s192',`%r13') dnl dnldefine(`ma000',`%r9') dnldefine(`ma064',`%r4') dnldefine(`ma128',`%r5') dnldefine(`ma192',`%r6') dnldefine(`ma256',`%r7') dnl std %r6, -0xe8(%r30) std %r7, -0xe0(%r30) std %r8, -0xd8(%r30) std %r9, -0xd0(%r30) std %r10, -0xc8(%r30) std %r11, -0xc0(%r30) std %r12, -0xb8(%r30) std %r13, -0xb0(%r30)ifdef(`HAVE_ABI_2_0w',` extrd,u n, 61, 62, n C right shift 2',` extrd,u n, 61, 30, n C right shift 2, zero extend')
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -