📄 softfloat.c

📁 <Floating Point Unit Core> fpupack.vhd pre_norm_addsub.vhd addsub_28.vhd post_norm_addsub.
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
    }
    aSign = extractFloat32Sign( a );
    bSign = extractFloat32Sign( b );
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
    return ( a != b ) && ( aSign ^ ( a < b ) );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the 32-bit two's complement integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32 float64_to_int32( float64 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits32 aSig0, aSig1, absZ, aSigExtra;
    int32 z;
    int8 roundingMode;

    aSig1 = extractFloat64Frac1( a );
    aSig0 = extractFloat64Frac0( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    shiftCount = aExp - 0x413;
    if ( 0 <= shiftCount ) {
        if ( 0x41E < aExp ) {
            if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
            goto invalid;
        }
        shortShift64Left(
            aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
        if ( 0x80000000 < absZ ) goto invalid;
    }
    else {
        aSig1 = ( aSig1 != 0 );
        if ( aExp < 0x3FE ) {
            aSigExtra = aExp | aSig0 | aSig1;
            absZ = 0;
        }
        else {
            aSig0 |= 0x00100000;
            aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
            absZ = aSig0>>( - shiftCount );
        }
    }
    roundingMode = float_rounding_mode;
    if ( roundingMode == float_round_nearest_even ) {
        if ( (sbits32) aSigExtra < 0 ) {
            ++absZ;
            if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
        }
        z = aSign ? - absZ : absZ;
    }
    else {
        aSigExtra = ( aSigExtra != 0 );
        if ( aSign ) {
            z = - (   absZ
                    + ( ( roundingMode == float_round_down ) & aSigExtra ) );
        }
        else {
            z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
        }
    }
    if ( ( aSign ^ ( z < 0 ) ) && z ) {
 invalid:
        float_raise( float_flag_invalid );
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    }
    if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the 32-bit two's complement integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int32 float64_to_int32_round_to_zero( float64 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits32 aSig0, aSig1, absZ, aSigExtra;
    int32 z;

    aSig1 = extractFloat64Frac1( a );
    aSig0 = extractFloat64Frac0( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    shiftCount = aExp - 0x413;
    if ( 0 <= shiftCount ) {
        if ( 0x41E < aExp ) {
            if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
            goto invalid;
        }
        shortShift64Left(
            aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
    }
    else {
        if ( aExp < 0x3FF ) {
            if ( aExp | aSig0 | aSig1 ) {
                float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        aSig0 |= 0x00100000;
        aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
        absZ = aSig0>>( - shiftCount );
    }
    z = aSign ? - absZ : absZ;
    if ( ( aSign ^ ( z < 0 ) ) && z ) {
 invalid:
        float_raise( float_flag_invalid );
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
    }
    if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the single-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float32 float64_to_float32( float64 a )
{
    flag aSign;
    int16 aExp;
    bits32 aSig0, aSig1, zSig;
    bits32 allZero;

    aSig1 = extractFloat64Frac1( a );
    aSig0 = extractFloat64Frac0( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32( float64ToCommonNaN( a ) );
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
    if ( aExp ) zSig |= 0x40000000;
    return roundAndPackFloat32( aSign, aExp - 0x381, zSig );

}

/*----------------------------------------------------------------------------
| Rounds the double-precision floating-point value `a' to an integer,
| and returns the result as a double-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_round_to_int( float64 a )
{
    flag aSign;
    int16 aExp;
    bits32 lastBitMask, roundBitsMask;
    int8 roundingMode;
    float64 z;

    aExp = extractFloat64Exp( a );
    if ( 0x413 <= aExp ) {
        if ( 0x433 <= aExp ) {
            if (    ( aExp == 0x7FF )
                 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
                return propagateFloat64NaN( a, a );
            }
            return a;
        }
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        roundingMode = float_rounding_mode;
        if ( roundingMode == float_round_nearest_even ) {
            if ( lastBitMask ) {
                add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                if ( (sbits32) z.low < 0 ) {
                    ++z.high;
                    if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            if (   extractFloat64Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
            }
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp <= 0x3FE ) {
            if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_exception_flags |= float_flag_inexact;
            aSign = extractFloat64Sign( a );
            switch ( float_rounding_mode ) {
             case float_round_nearest_even:
                if (    ( aExp == 0x3FE )
                     && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
                   ) {
                    return packFloat64( aSign, 0x3FF, 0, 0 );
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat64( 1, 0x3FF, 0, 0 )
                    : packFloat64( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat64( 1, 0, 0, 0 )
                    : packFloat64( 0, 0x3FF, 0, 0 );
            }
            return packFloat64( aSign, 0, 0, 0 );
        }
        lastBitMask = 1;
        lastBitMask <<= 0x413 - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        roundingMode = float_rounding_mode;
        if ( roundingMode == float_round_nearest_even ) {
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            if (   extractFloat64Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
        }
        z.high &= ~ roundBitsMask;
    }
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the double-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int16 expDiff;

    aSig1 = extractFloat64Frac1( a );
    aSig0 = extractFloat64Frac0( a );
    aExp = extractFloat64Exp( a );
    bSig1 = extractFloat64Frac1( b );
    bSig0 = extractFloat64Frac0( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        if ( aExp == 0x7FF ) {
            if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
            return a;
        }
        if ( bExp == 0 ) {
            --expDiff;
        }
        else {
            bSig0 |= 0x00100000;
        }
        shift64ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        if ( bExp == 0x7FF ) {
            if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
            return packFloat64( zSign, 0x7FF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= 0x00100000;
        }
        shift64ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        if ( aExp == 0x7FF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat64NaN( a, b );
            }
            return a;
        }
        add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
        zSig2 = 0;
        zSig0 |= 0x00200000;
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= 0x00100000;
    add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    if ( zSig0 < 0x00200000 ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int16 expDiff;
    float64 z;

    aSig1 = extractFloat
上一页 1 2 3 45
💿 文件大小 467 K
👤 上传用户 alin
📂 所属分类 VHDL/FPGA/Verilog
🏷️ 相关标签

#vhd #post_norm_addsub #pre_norm_addsub #Floating
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -