📄 ppc_fmul.s

📁 powerpc 405 优化过的硬浮点库
💻 S
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* fpopt/ppc_fmul.S, pl_FPE_common, pl_linux 11/24/03 16:17:34                                                                  */
/*----------------------------------------------------------------------------- */
/*  Copyright (c) 2003, IBM Corporation                                         */
/*  All rights reserved.                                                        */
/*                                                                              */
/*  Redistribution and use in source and binary forms, with or                  */
/*  without modification, are permitted provided that the following             */
/*  conditions are met:                                                         */
/*                                                                              */
/*    * Redistributions of source code must retain the above                    */
/*      copyright notice, this list of conditions and the following             */
/*      disclaimer.                                                             */
/*    * Redistributions in binary form must reproduce the above                 */
/*      copyright notice, this list of conditions and the following             */
/*      disclaimer in the documentation and/or other materials                  */
/*      provided with the distribution.                                         */
/*    * Neither the name of IBM nor the names of its contributors               */
/*      may be used to endorse or promote products derived from this            */
/*      software without specific prior written permission.                     */
/*                                                                              */
/*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND                      */
/*  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,                 */
/*  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF                    */
/*  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                    */
/*  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS           */
/*  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,         */
/*  OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,                    */
/*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR          */
/*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY         */
/*  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                */
/*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE           */
/*  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.    */
/*                                                                              */
/*----------------------------------------------------------------------------- */
/*                                                                              */
/* Function: multiply two double floating point values.  frt = fpa * fpb        */
/* Input:    r3,r4(fpa)                                                         */
/*           r5,r6(fpb)                                                         */
/* Output:   r3,r4(frt)                                                         */
/* Notes:   1. No stack frame is created for the normal case of this function,  */
/*             so the following registers must be preserved, as required by     */
/*             ABI specification:                                               */
/*               LR, CR0, R1, R2, R13-R31                                       */
/*          2. For the full function case, a stack frame is created, following  */
/*             ABI specification, and non-volatile registers are saved and      */
/*             restored on the stack.                                           */
/*          3. operation performed according to IEEE754-1985 standard with      */
/*             rounding mode = nearest even.                                    */
/*                                                                              */
/*----------------------------------------------------------------------------- */

#include <ppc4xx.inc>
#include "fpeLib.inc"

function_prolog(__muldf3)

     mfcr      r0                     /* save cr */
     mtctr     r13                    /* save r13 */
     
/* load fpa into     r8,r9,r10 and cr6.  load fpb into r11, r12, r13     and cr7 */
     mr        r9,r3                  /* load fpa.exp, fpa.S, fpa.hi */
     mr        r12,r5                 /* load fpb.exp, fpb.S, fpb.hi */
     rlwinm    r8,r9,32-20,0x7ff      /* isolate exponent of fpa into r8 */
     rlwinm    r11,r12,32-20,0x7ff    /* isolate exponent of fpb into r11 */
     mr        r10,r4                 /* load fpa.lo */
                                      /* load fpb.lo (already in r6) */
     
/* test for normal:                                                             */
     cmpwi     cr0,r8,0x7ff           /* if (fpa.exp == DEXPMAX) */
     cmpwi     cr1,r8,0               /* if (fpa.exp == 0) */
     cmpwi     cr2,r11,0x7ff          /* if (fpb.exp == DEXPMAX) */
     cmpwi     cr3,r11,0              /* if (fpb.exp == 0) */
     beq       full                        /*fpa.exp is max */
     beq       cr2,full               /* fpb.exp is max */
     beq       cr1,tst0a              /* fpa.exp is 0  */
     beq       cr3,tst0b              /* fpb.exp is 0  */

/* normal case only:                                                            */
     oris      r9,r9,0x0010           /*    fpa.hi |= 0x00100000; */
     oris      r12,r12,0x0010         /*    fpb.hi |= 0x00100000; */
     
/* Calculate exponent                                                         */
     add       r8,r8,r11              /* result.expt = fpa.exp + fpb.exp; */
                                      /* ------- note: r11 is now free */
     addic.    r8,r8,-1022            /* result.exp -= DEXPBIAS; check for negative */
                                      /* result.exp += 1 (for multiplying left-just */
                                      /* numbers if result is left-just) */
     cmpwi     cr2,r8,0x7fd           /* if (result.exp >= DEXPMAX-2) */
     ble       full
     bge       cr2,full    
     
/* Multiply the numbers:                                                      */
/* Start by shifting everything left 11 bits, so that the MSW is full:        */
     rlwinm    r9,r9,11,0,20          /* r9 = fpa.hi */
     rlwimi    r9,r10,11,21,31
     rlwinm    r10,r10,11,0,20        /* r10 = fpa.lo */

     rlwinm    r12,r12,11,0,20        /* r12 = fpb.hi */
     rlwimi    r12,r6,11,21,31
     rlwinm    r13,r6,11,0,20         /* r13 = fpb.lo */
     
/* Multiply a-high and     b-high:                                            */
     mulhwu    r7,r9,r12              /* r7 gets high word of result */
     mullw     r11,r9,r12             /* r11 gets low word of result */
    
/* Multiply a-high and b-low (upper half only):                           */
     mulhwu    r13,r9,r13
     addc      r13,r11,r13
     addze     r7,r7
     
/* Multiply a-low and b-high (upper half only):                               */
     mulhwu    r10,r10,r12
     addc      r13,r13,r10
     addze.    r7,r7
                                      /* r7/r13 now have result hi/lo */
                                      /* ------- note:     r9-r12 are now free */
/*   There are two possibilities for the result:                        */
/*   Either (bit 0 is 1) or (bit 0 is 0 and bit 1 is 1).            */
/*   If bit 0 is 0, then shift left 1 bit to normalize:                 */
     blt       msb_is_bit0
     addc      r13,r13,r13               /* shift r7/r13 left 1 bit */
     adde      r7,r7,r7               /* */
     addic.    r8,r8,-1               /* shifted case: exponent is 1 less */
                                      /* than non-shifted */
     beq       full                   /* if exponent out of range, go to full handler */
msb_is_bit0:

/*   See if we need to round. (NOTE: the exact .5 case needs more precision */
/*   than what we have calculated; in that case, go to the full handler.)   */
/*   53 bits (bits 0-52 of the 2-word result) are significant. The sticky bit   */
/*   is bit 53 = bit (53-32= 21) of the low word (r13). If it is 0, then no */
/*   rounding.                                                                  */
/*   BUT we don't really know the sticky bit for sure yet -- the uncalculated */
/*   low-order 64 bits of the 128-bit product could conceivably cause the low   */
/*   bits of our LSW to round up, changing the state of bit 21.             */
/*   The round-up is the result of a carry-out from 3 neglected summands, */
/*   so at most it adds 2 to the low word. But we have already              */
/*   shifted left one bit, just above, so by now the carry-out could add    */
/*   as much as 4. The addition of a 3-bit number can                           */
/*   propagate up to bit 21 ONLY if bits 22-29 are all ones:        */
     rlwinm.   r10,r13,0,22,31
     cmpwi     cr2,r10,0x3fc          /* special case if bits 22-29 all 1's */
     rlwinm    r11,r13,0,20,21         /* test LSb and sticky bit */
     bge       cr2,full
     cmpwi     cr3,r11,0x400          /* special rounding if LSb=0 and sticky=1 */
     addic     r13,r13,0x400            /* standard rounding = add 1 to sticky */
     bne+      cr3,finish_std_round
     beq       full                   /* if LSb=0 and sticky=1 and 22-31 all */
                                      /* zeros, then special case */
finish_std_round:
     addze.    r7,r7                  /* add carry-out of low word to high */
                                      /* -- if     this overflowed us, then we */
                                      /* need to shift back right 1 bit */
                                      /* and increment the exponent */
                                      /* to reflect this shift: */
     bne+      std_done
     addi      r8,r8,1
     rlwinm    r13,r13,32-1,1,31
     rlwimi    r13,r7,32-1,0,0
     rlwinm    r7,r7,32-1,1,31

std_done:

/*   We're done! Just assemble the result back into IEEE format:          */
/*   r3 = IEEE MSW, r4 = IEEE LSW                                         */

     xor       r9,r3,r5              /* get product sign */

/*   Shift right 11 bits (= left 21 bits) to put the MSb into bit 11, but */
/*   mask it off since it's implied:                                        */
     rlwinm    r3,r7,32-11,12,31
     
/*   Bits 0-10 of the IEEE LSW represent bits 21-31 of the result, so they  */
/*   come from bits 21-31 of the high word (r7):                            */
     rlwinm    r4,r7,21,0,10
     
/*   Bits 11-31 of the IEEE LSW represent bits 32-52 of the result, so they     */
/*   come from bits 0-20 of the low word (r13):                          */
     rlwimi    r4,r13,32-11,11,31
     
/*   Put the exponent into bits 1-11 of the MSW:                            */
     rlwimi    r3,r8,20,1,11
     
/*   Insert sign bit:                                                       */
     rlwimi    r3,r9,0,0,0

     mtcr      r0                     /* restore cr */
     mfctr     r13                    /* restore r13 */
     blr                              /* return; */
                                   
tst0a:
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -