Documentation
¶
Index ¶
- Constants
- Variables
- func Configure(config *Config)
- func DebugInfo() map[string]interface{}
- func DisableFastArithmetic()
- func DisableFastConversion()
- func EnableFastArithmetic()
- func EnableFastConversion()
- func Equal(a, b Float8) bool
- func GetMemoryUsage() int
- func GetVersion() string
- func Greater(a, b Float8) bool
- func GreaterEqual(a, b Float8) bool
- func Initialize()
- func Less(a, b Float8) bool
- func LessEqual(a, b Float8) bool
- func ToSlice32(f8s []Float8) []float32
- type ArithmeticMode
- type Config
- type ConversionMode
- type Float8
- func Add(a, b Float8) Float8
- func AddSlice(a, b []Float8) []Float8
- func AddWithMode(a, b Float8, mode ArithmeticMode) Float8
- func Ceil(f Float8) Float8
- func Clamp(f, min, max Float8) Float8
- func CopySign(f, sign Float8) Float8
- func Cos(f Float8) Float8
- func Div(a, b Float8) Float8
- func DivWithMode(a, b Float8, mode ArithmeticMode) Float8
- func Exp(f Float8) Float8
- func Floor(f Float8) Float8
- func Fmod(x, y Float8) Float8
- func FromBits(bits uint8) Float8
- func FromFloat64(f float64) Float8
- func FromInt(i int) Float8
- func Lerp(a, b, t Float8) Float8
- func Log(f Float8) Float8
- func Max(a, b Float8) Float8
- func Min(a, b Float8) Float8
- func Mul(a, b Float8) Float8
- func MulSlice(a, b []Float8) []Float8
- func MulWithMode(a, b Float8, mode ArithmeticMode) Float8
- func One() Float8
- func Parse(s string) (Float8, error)
- func Pow(f, exp Float8) Float8
- func Round(f Float8) Float8
- func ScaleSlice(s []Float8, scalar Float8) []Float8
- func Sign(f Float8) Float8
- func Sin(f Float8) Float8
- func Sqrt(f Float8) Float8
- func Sub(a, b Float8) Float8
- func SubWithMode(a, b Float8, mode ArithmeticMode) Float8
- func SumSlice(s []Float8) Float8
- func Tan(f Float8) Float8
- func ToFloat8(f32 float32) Float8
- func ToFloat8WithMode(f32 float32, mode ConversionMode) (Float8, error)
- func ToSlice8(f32s []float32) []Float8
- func Trunc(f Float8) Float8
- func Zero() Float8
- func (f Float8) Abs() Float8
- func (f Float8) Bits() uint8
- func (f Float8) GoString() string
- func (f Float8) IsFinite() bool
- func (f Float8) IsInf() bool
- func (f Float8) IsNaN() bool
- func (f Float8) IsNormal() bool
- func (f Float8) IsValid() bool
- func (f Float8) IsZero() bool
- func (f Float8) Neg() Float8
- func (f Float8) Sign() int
- func (f Float8) String() string
- func (f Float8) ToFloat32() float32
- func (f Float8) ToFloat64() float64
- func (f Float8) ToInt() int
- type Float8Error
Constants ¶
const ( Version = "2.0.0" VersionMajor = 2 VersionMinor = 0 VersionPatch = 0 )
Version information
const ( SignMask = 0b10000000 // 0x80 - Sign bit mask ExponentMask = 0b01111000 // 0x78 - Exponent bits mask MantissaMask = 0b00000111 // 0x07 - Mantissa bits mask MantissaLen = 3 // Number of mantissa bits // Exponent bias and limits // See https://en.wikipedia.org/wiki/Exponent_bias // bias = 2^(|exponent|-1) - 1 ExponentBias = 7 // Bias for 4-bit exponent ExponentMax = 15 // Maximum exponent value ExponentMin = -7 // Minimum exponent value // Float32 constants for conversion Float32Bias = 127 // IEEE 754 single precision bias // Special values PositiveZero Float8 = 0x00 NegativeZero Float8 = 0x80 PositiveInfinity Float8 = 0x78 // IEEE 754 E4M3FN: S.1111.000 = 0.1111.000₂ NegativeInfinity Float8 = 0xF8 // IEEE 754 E4M3FN: S.1111.000 = 1.1111.000₂ NaN Float8 = 0x7F // IEEE 754 E4M3FN: S.1111.111 (0x7F or 0xFF) MaxValue Float8 = 0x7E // Largest finite positive value MinValue Float8 = 0xFE // Most negative finite value (largest magnitude) SmallestPositive Float8 = 0x01 // Smallest positive subnormal value (exponent bits 0000) )
Bit masks and constants for Float8 format
Variables ¶
var ( E = ToFloat8(2.718281828459045) // Euler's number Pi = ToFloat8(3.141592653589793) // Pi Phi = ToFloat8(1.618033988749895) // Golden ratio Sqrt2 = ToFloat8(1.4142135623730951) // Square root of 2 SqrtE = ToFloat8(1.6487212707001282) // Square root of E SqrtPi = ToFloat8(1.7724538509055159) // Square root of Pi Ln2 = ToFloat8(0.6931471805599453) // Natural logarithm of 2 Log2E = ToFloat8(1.4426950408889634) // Base-2 logarithm of E Ln10 = ToFloat8(2.302585092994046) // Natural logarithm of 10 Log10E = ToFloat8(0.4342944819032518) // Base-10 logarithm of E )
Constants as Float8 values
var ( ErrOverflow = &Float8Error{Op: "convert", Msg: "value too large for float8"} ErrUnderflow = &Float8Error{Op: "convert", Msg: "value too small for float8"} ErrNaN = &Float8Error{Op: "convert", Msg: "NaN not representable in float8"} )
Common error instances
var DefaultArithmeticMode = ArithmeticAuto
Global arithmetic mode
var DefaultConversionMode = ModeDefault
Global conversion mode (can be changed for different behavior)
Functions ¶
func Configure ¶
func Configure(config *Config)
Configure applies the given configuration to the package
func DebugInfo ¶
func DebugInfo() map[string]interface{}
DebugInfo returns debugging information about the package state
func DisableFastArithmetic ¶
func DisableFastArithmetic()
DisableFastArithmetic disables lookup tables and uses algorithmic operations
func DisableFastConversion ¶
func DisableFastConversion()
DisableFastConversion disables lookup table and uses algorithmic conversion
func EnableFastArithmetic ¶
func EnableFastArithmetic()
EnableFastArithmetic enables lookup tables for arithmetic operations
func EnableFastConversion ¶
func EnableFastConversion()
EnableFastConversion enables lookup table for ToFloat32 conversion
func GetMemoryUsage ¶
func GetMemoryUsage() int
GetMemoryUsage returns the current memory usage of lookup tables in bytes
func ToSlice32 ¶
ToSlice32 converts a slice of Float8 to float32 with optimized performance.
This function is optimized for batch conversion of Float8 values to float32. It handles all special values correctly, including negative zero, infinity, and NaN.
Parameters:
- f8s: The input slice of Float8 values to convert. May be nil or empty.
Returns:
- nil if the input slice is nil
- A new slice containing the converted float32 values
Note: The conversion from Float8 to float32 is always exact since Float8 is a subset of float32. For large slices, consider using a pool of []float32 to reduce allocations.
Types ¶
type ArithmeticMode ¶
type ArithmeticMode int
ArithmeticMode defines which implementation to use for arithmetic operations
const ( // ArithmeticAuto chooses the best implementation automatically ArithmeticAuto ArithmeticMode = iota // ArithmeticAlgorithmic forces algorithmic implementation ArithmeticAlgorithmic // ArithmeticLookup forces lookup table implementation (if available) ArithmeticLookup )
type Config ¶
type Config struct {
EnableFastArithmetic bool
EnableFastConversion bool
DefaultMode ConversionMode
ArithmeticMode ArithmeticMode
}
Config holds package configuration options
func DefaultConfig ¶
func DefaultConfig() *Config
DefaultConfig returns the default package configuration
type ConversionMode ¶
type ConversionMode int
ConversionMode defines how conversions handle edge cases
const ( // ModeDefault uses standard IEEE 754 rounding behavior ModeDefault ConversionMode = iota // ModeStrict returns errors for overflow/underflow ModeStrict // ModeFast uses lookup tables when available (default for arithmetic) ModeFast )
type Float8 ¶
type Float8 uint8
Float8 represents an 8-bit floating-point number using the IEEE 754 FP8 E4M3FN format. This format is commonly used in machine learning for reduced-precision arithmetic.
Bit layout:
- 1 bit : Sign (0 = positive, 1 = negative)
- 4 bits : Exponent (biased by 7, range [-6, 7])
- 3 bits : Mantissa (3 explicit bits, 1 implicit leading bit for normal numbers)
Special values:
- PositiveZero/NegativeZero: Exponent=0000, Mantissa=000
- PositiveInfinity/NegativeInfinity: Exponent=1111, Mantissa=000
- NaN: Exponent=1111, Mantissa=111
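Note: this implementation deviates from the standard E4M3FN variant, which has no infinities and two NaNs (0x7F and 0xFF): here the bit patterns 0x78 and 0xF8 are reserved for ±Infinity, as listed above.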
func Add ¶
Add returns the sum of the operands a and b.
This is a convenience function that calls AddWithMode with DefaultArithmeticMode. For more control over the arithmetic behavior, use AddWithMode directly.
Special cases:
Add(+0, ±0) = +0 Add(-0, -0) = -0 Add(±Inf, ∓Inf) = +0 (IEEE 754 specifies NaN; this implementation returns +0) Add(NaN, x) = NaN Add(x, NaN) = NaN
For finite numbers, the result is rounded to the nearest representable Float8 value using the current rounding mode (typically round-to-nearest-even).
func AddSlice ¶
AddSlice performs element-wise addition of two Float8 slices.
This function adds corresponding elements of the input slices and returns a new slice with the results. The input slices must have the same length; otherwise, the function will panic.
Parameters:
- a, b: Slices of Float8 values to be added element-wise.
Returns:
- A new slice where each element is the sum of the corresponding elements in a and b.
Panics:
- If the input slices have different lengths.
Example:
a := []Float8{1.0, 2.0, 3.0}
b := []Float8{4.0, 5.0, 6.0}
result := AddSlice(a, b) // Returns [5.0, 7.0, 9.0]
func AddWithMode ¶
func AddWithMode(a, b Float8, mode ArithmeticMode) Float8
AddWithMode returns the sum of the operands a and b using the specified arithmetic mode.
The arithmetic mode determines how the addition is performed:
- ArithmeticAuto: Uses the fastest available method (lookup tables if enabled)
- ArithmeticLookup: Forces use of lookup tables (panics if not available)
- ArithmeticAlgorithmic: Uses the algorithmic implementation
Special cases are handled according to IEEE 754 rules:
- If either operand is NaN, the result is NaN
- Infinities of the same sign add to infinity of that sign
- Infinities of opposite signs return +0 in this implementation (IEEE 754 specifies NaN)
- The sign of a zero result is the sign of the sum of the operands
For finite numbers, the result is rounded to the nearest representable Float8 value. If the exact result is exactly halfway between two representable values, it is rounded to the value with an even least significant bit (round-to-nearest-even).
func Ceil ¶
Ceil returns the least integer value greater than or equal to f.
Special cases are:
Ceil(±0) = ±0 Ceil(±Inf) = ±Inf Ceil(NaN) = NaN
For finite x, the result is the least integer value ≥ x. The result is exact (no rounding occurs).
func Cos ¶
Cos returns the cosine of f (in radians).
Special cases are:
Cos(±0) = 1 Cos(±Inf) = NaN Cos(NaN) = NaN
For finite x, the result is the cosine of x in the range [-1, 1]. The result is rounded to the nearest representable Float8 value.
func Div ¶
Div returns the quotient a/b of the operands a and b.
This is a convenience function that calls DivWithMode with DefaultArithmeticMode. For more control over the arithmetic behavior, use DivWithMode directly.
Special cases:
Div(±0, ±0) = NaN Div(±Inf, ±Inf) = NaN Div(x, ±0) = ±Inf for x finite and not zero (sign obeys rule for signs) Div(±Inf, y) = ±Inf for y finite and not zero (sign obeys rule for signs) Div(x, y) = NaN if x or y is NaN
The sign of the result follows the standard sign rules for division. For finite numbers, the result is rounded to the nearest representable Float8 value. Division by zero results in ±Inf with the sign determined by the rule of signs.
func DivWithMode ¶
func DivWithMode(a, b Float8, mode ArithmeticMode) Float8
DivWithMode performs division with specified arithmetic mode
func Floor ¶
Floor returns the greatest integer value less than or equal to f.
Special cases are:
Floor(±0) = ±0 Floor(±Inf) = ±Inf Floor(NaN) = NaN
For finite x, the result is the greatest integer value ≤ x. The result is exact (no rounding occurs).
func Fmod ¶
Fmod returns the floating-point remainder of x/y.
The result has the same sign as x and magnitude less than the magnitude of y.
Special cases are:
Fmod(±0, y) = ±0 for y != 0 Fmod(±Inf, y) = NaN Fmod(x, 0) = NaN Fmod(NaN, y) = NaN Fmod(x, NaN) = NaN Fmod(x, ±Inf) = x for x not infinite
For finite x and y (y ≠ 0), the result is x - n*y where n is the quotient x/y truncated toward zero, so the result keeps the sign of x and has magnitude less than |y|. The result is rounded to the nearest representable Float8 value.
func FromFloat64 ¶
FromFloat64 converts a float64 to Float8 (with potential precision loss)
func Log ¶
Log returns the natural logarithm of f.
Special cases are:
Log(+Inf) = +Inf Log(0) = -Inf Log(x < 0) = NaN Log(NaN) = NaN
For finite x > 0, the result is the natural logarithm of x. The result is rounded to the nearest representable Float8 value.
func Max ¶
Max returns the larger of two Float8 values. If either value is NaN, returns NaN. Max(+Inf, x) returns +Inf Max(-Inf, x) returns x (if x is finite or +Inf) Max(x, +Inf) returns +Inf Max(x, -Inf) returns x (if x is finite or +Inf)
func Min ¶
Min returns the smaller of two Float8 values. If either value is NaN, returns NaN. Min(+Inf, x) returns x (if x is finite or -Inf) Min(-Inf, x) returns -Inf Min(x, +Inf) returns x (if x is finite or -Inf) Min(x, -Inf) returns -Inf
func Mul ¶
Mul returns the product of the operands a and b.
This is a convenience function that calls MulWithMode with DefaultArithmeticMode. For more control over the arithmetic behavior, use MulWithMode directly.
Special cases:
Mul(±0, ±Inf) = NaN Mul(±Inf, ±0) = NaN Mul(±0, ±0) = ±0 (sign obeys the rule for signs of zero products) Mul(±0, y) = ±0 for y finite and not zero Mul(±Inf, y) = ±Inf for y finite and not zero Mul(x, y) = NaN if x or y is NaN
The sign of the result follows the standard sign rules for multiplication. For finite numbers, the result is rounded to the nearest representable Float8 value.
func MulSlice ¶
MulSlice performs element-wise multiplication of two Float8 slices.
This function multiplies corresponding elements of the input slices and returns a new slice with the results. The input slices must have the same length; otherwise, the function will panic.
Parameters:
- a, b: Slices of Float8 values to be multiplied element-wise.
Returns:
- A new slice where each element is the product of the corresponding elements in a and b.
Panics:
- If the input slices have different lengths.
Example:
a := []Float8{1.0, 2.0, 3.0}
b := []Float8{4.0, 5.0, 6.0}
result := MulSlice(a, b) // Returns [4.0, 10.0, 18.0]
func MulWithMode ¶
func MulWithMode(a, b Float8, mode ArithmeticMode) Float8
MulWithMode performs multiplication with specified arithmetic mode
func Pow ¶
Pow returns f raised to the power of exp.
Special cases are:
Pow(±0, exp) = ±0 for exp > 0 Pow(±0, exp) = +Inf for exp < 0 Pow(1, exp) = 1 for any exp (even NaN) Pow(f, 0) = 1 for any f (including NaN, +Inf, -Inf) Pow(f, 1) = f for any f Pow(NaN, exp) = NaN Pow(f, NaN) = NaN Pow(±0, -Inf) = +Inf Pow(±0, +Inf) = +0 Pow(+Inf, exp) = +Inf for exp > 0 Pow(+Inf, exp) = +0 for exp < 0 Pow(-Inf, exp) = -0 for exp a negative odd integer Pow(-Inf, exp) = +0 for exp a negative non-odd integer Pow(-Inf, exp) = -Inf for exp a positive odd integer Pow(-Inf, exp) = +Inf for exp a positive non-odd integer Pow(-1, ±Inf) = 1 Pow(f, +Inf) = +Inf for |f| > 1 Pow(f, -Inf) = +0 for |f| > 1 Pow(f, +Inf) = +0 for |f| < 1 Pow(f, -Inf) = +Inf for |f| < 1
The result is rounded to the nearest representable Float8 value.
func Round ¶
Round returns the nearest integer value to f, rounding ties to even.
Special cases are:
Round(±0) = ±0 Round(±Inf) = ±Inf Round(NaN) = NaN
For finite x, the result is the nearest integer to x. Ties are rounded to the nearest even integer. The result is exact (no rounding occurs).
func ScaleSlice ¶
ScaleSlice multiplies each element in the slice by a scalar
func Sin ¶
Sin returns the sine of f (in radians).
Special cases are:
Sin(±0) = ±0 Sin(±Inf) = NaN Sin(NaN) = NaN
For finite x, the result is the sine of x in the range [-1, 1]. The result is rounded to the nearest representable Float8 value.
func Sqrt ¶
Sqrt returns the square root of the Float8 value.
Special cases are:
Sqrt(+0) = +0 Sqrt(-0) = -0 Sqrt(+Inf) = +Inf Sqrt(x) = NaN if x < 0 (including -Inf) Sqrt(NaN) = NaN
For finite x ≥ 0, the result is the greatest Float8 value y such that y² ≤ x. The result is rounded to the nearest representable Float8 value.
func Sub ¶
Sub returns the difference of a-b, i.e., the result of subtracting b from a.
This is a convenience function that calls SubWithMode with DefaultArithmeticMode. For more control over the arithmetic behavior, use SubWithMode directly.
Special cases:
Sub(+0, +0) = +0 Sub(+0, -0) = +0 Sub(-0, +0) = -0 Sub(-0, -0) = +0 Sub(±Inf, ±Inf) = +0 (IEEE 754 specifies NaN; this implementation returns +0) Sub(NaN, x) = NaN Sub(x, NaN) = NaN
For finite numbers, the result is rounded to the nearest representable Float8 value.
func SubWithMode ¶
func SubWithMode(a, b Float8, mode ArithmeticMode) Float8
SubWithMode performs subtraction with specified arithmetic mode
func SumSlice ¶
SumSlice returns the sum of all elements in the slice.
This function computes the sum of all Float8 values in the input slice. If the slice is empty, it returns PositiveZero.
The summation is performed using the standard addition rules for Float8, including proper handling of special values (NaN, Inf, etc.).
Parameters:
- s: The input slice of Float8 values to sum.
Returns:
- The sum of all elements in the slice.
- If the slice is empty, returns PositiveZero.
- If any element is NaN, the result is NaN.
Example:
s := []Float8{1.0, 2.0, 3.0, 4.0}
sum := SumSlice(s) // Returns 10.0
func Tan ¶
Tan returns the tangent of f (in radians).
Special cases are:
Tan(±0) = ±0 Tan(±Inf) = NaN Tan(NaN) = NaN
For finite x, the result is the tangent of x. The result is rounded to the nearest representable Float8 value. Note that the result may be extremely large or small for inputs near (2n+1)π/2.
func ToFloat8 ¶
ToFloat8 converts a float32 value to Float8 format using the default conversion mode.
This is a convenience function that calls ToFloat8WithMode with DefaultConversionMode. For more control over the conversion process, use ToFloat8WithMode directly.
Special cases:
- Converts +0.0 to PositiveZero (0x00)
- Converts -0.0 to NegativeZero (0x80)
- Converts +Inf to PositiveInfinity (0x78)
- Converts -Inf to NegativeInfinity (0xF8)
- Converts NaN to NaN (0x7F or 0xFF)
For finite numbers, the conversion may lose precision or result in overflow/underflow. The default mode handles these cases by saturating to the maximum/minimum representable values.
func ToFloat8WithMode ¶
func ToFloat8WithMode(f32 float32, mode ConversionMode) (Float8, error)
ToFloat8WithMode converts a float32 to Float8 with the specified conversion mode.
The conversion mode determines how edge cases are handled:
- ModeDefault: Uses standard IEEE 754 rounding behavior, saturating on overflow
- ModeStrict: Returns an error for overflow/underflow/NaN
- ModeFast: Uses lookup tables when available (if enabled)
Special cases are handled as follows:
- ±0.0 is converted to the corresponding Float8 zero (preserving sign)
- ±Inf is converted to the corresponding Float8 infinity
- NaN is handled according to the conversion mode
For finite numbers, the conversion follows these steps:
- Extract sign, exponent, and mantissa from the float32
- Adjust the exponent for the Float8 format (E4M3FN)
- Round the mantissa to 3 bits (plus implicit leading bit)
- Handle overflow/underflow according to the conversion mode
Returns the converted Float8 value and an error if the conversion fails in strict mode.
func ToSlice8 ¶
ToSlice8 converts a slice of float32 to Float8 with optimized performance.
This function is optimized for batch conversion of float32 values to Float8. It handles special values correctly, including negative zero, infinity, and NaN.
Parameters:
- f32s: The input slice of float32 values to convert. May be nil or empty.
Returns:
- nil if the input slice is nil
- A non-nil empty slice if the input slice is empty
- A new slice containing the converted Float8 values
Note: This function preserves negative zero by checking the sign bit of zero values. For large slices, consider using a pool of []Float8 to reduce allocations.
func Trunc ¶
Trunc returns the integer value of f with any fractional part removed.
Special cases are:
Trunc(±0) = ±0 Trunc(±Inf) = ±Inf Trunc(NaN) = NaN
For finite x, the result is the integer part of x with the sign of x. This is equivalent to rounding toward zero. The result is exact (no rounding occurs).
func (Float8) Abs ¶
Abs returns the absolute value of f.
Special cases are:
Abs(±Inf) = +Inf Abs(NaN) = NaN Abs(±0) = +0
For all other values, Abs clears the sign bit to return a positive number.
func (Float8) IsFinite ¶
IsFinite reports whether f is a finite value (not infinite and not NaN).
A Float8 value is finite if its exponent is not all 1s (0x0F). This includes both normal numbers (with an implicit leading 1 bit) and subnormal numbers (with an implicit leading 0 bit).
Returns:
- true if f is a finite number (including zero and subnormals)
- false if f is infinity or NaN
func (Float8) IsInf ¶
IsInf reports whether f is an infinity, either positive or negative.
In this implementation, infinity values have all exponent bits set (0x78 for +Inf, 0xF8 for -Inf) and a zero mantissa, mirroring the encoding scheme of float32/float64. This differs from the standard E4M3FN format, which defines no infinities.
Returns:
- true if f is positive or negative infinity
- false otherwise, including for NaN and finite values
func (Float8) IsNaN ¶
IsNaN reports whether f is a "not-a-number" (NaN) value.
In the E4M3FN format, NaN is represented with all exponent bits set (0x0F) and all mantissa bits set (0x07). This results in two possible NaN values: 0x7F (positive NaN) and 0xFF (negative NaN).
Returns:
- true if f is a NaN value
- false otherwise, including for infinity and finite values
func (Float8) IsNormal ¶
IsNormal returns true if the Float8 is a normal (non-zero, non-infinite) number
func (Float8) IsZero ¶
IsZero reports whether f represents the floating-point value zero (either positive or negative).
According to IEEE 754, both +0 and -0 are considered zero, though they may have different bit patterns and behave differently in certain operations (like 1/+0 = +Inf, 1/-0 = -Inf).
Returns:
- true if f is +0 or -0
- false otherwise, including for NaN and infinity values
func (Float8) Sign ¶
Sign returns the sign of the Float8 value.
The return values are:
- 1 if f > 0
- -1 if f < 0
- 0 if f is zero (including -0) or NaN
Note that negative zero is treated as zero (returns 0), following the IEEE 754 standard where +0 and -0 compare as equal. However, they can be distinguished using bitwise operations or by examining the sign bit directly.
For NaN values, Sign returns 0, consistent with math/big.Float's behavior.
func (Float8) ToFloat32 ¶
ToFloat32 converts a Float8 value to float32.
This conversion is always exact since Float8 is a subset of float32. Special values are preserved:
- PositiveZero/NegativeZero → ±0.0
- PositiveInfinity/NegativeInfinity → ±Inf
- NaN → NaN
The conversion uses a fast path for common values and falls back to algorithmic conversion for other values.
type Float8Error ¶
type Float8Error struct {
Op string // Operation that caused the error
Value float32 // Input value that caused the error (if applicable)
Msg string // Error message
}
Float8Error represents errors that can occur during Float8 operations
func (*Float8Error) Error ¶
func (e *Float8Error) Error() string