-
-
Save BoRueiHong/863b201c0f0f3c6a6b7e17b62cbb8463 to your computer and use it in GitHub Desktop.
float16 to float32
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // based on https://gist.github.com/martin-kallman/5049614 | |
| // float32 | |
| // Martin Kallman | |
| // | |
| // Fast half-precision to single-precision floating point conversion | |
| // - Supports signed zero and denormals-as-zero (DAZ) | |
| // - Does not support infinities or NaN | |
| // - Few, partially pipelinable, non-branching instructions, | |
| // - Core operations ~6 clock cycles on modern x86-64 | |
/*
 * float32 - expand an IEEE 754 binary16 bit pattern into a binary32 float.
 *
 * out: destination for the converted value.
 * in:  half-precision bit pattern (1 sign, 5 exponent, 10 mantissa bits).
 *
 * Zero and subnormal inputs (exponent field == 0) produce a signed zero.
 * Infinity/NaN encodings (exponent field == 0x1f) are not special-cased:
 * they are rebiased like ordinary values, so 0x7c00 yields 65536.0f.
 */
void float32(float *__restrict out, const uint16_t in) {
    const uint32_t sign = ((uint32_t)in & 0x8000u) << 16;  /* bit 15 -> bit 31 */
    const uint32_t exponent = (uint32_t)in & 0x7c00u;

    /* Move exponent+mantissa up to the binary32 layout (10 -> 23 mantissa
     * bits), then rebias the exponent from 15 to 127: (127 - 15) << 23. */
    const uint32_t rebased = (((uint32_t)in & 0x7fffu) << 13) + 0x38000000u;

    /* Denormals-as-zero: an all-zero exponent field collapses to zero. */
    const uint32_t magnitude = (exponent == 0u) ? 0u : rebased;

    /* Reinterpret through a union; storing via a casted uint32_t pointer
     * would violate the strict-aliasing rule. */
    union { uint32_t u; float f; } pun;
    pun.u = magnitude | sign;
    *out = pun.f;
}
| // float16 | |
| // Martin Kallman | |
| // | |
| // Fast single-precision to half-precision floating point conversion | |
| // - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ), | |
| // clamp-to-max | |
| // - Does not support infinities or NaN | |
| // - Few, partially pipelinable, non-branching instructions, | |
| // - Core operations ~10 clock cycles on modern x86-64 | |
/*
 * float16 - squeeze a binary32 float into an IEEE 754 binary16 bit pattern.
 *
 * out: destination for the half-precision bit pattern.
 * in:  single-precision value to convert.
 *
 * The mantissa is truncated (no rounding). Inputs whose magnitude is below
 * the smallest normal half (2^-14), including zero and float subnormals,
 * become a signed zero. Inputs whose exponent exceeds the half range,
 * including infinity and NaN, are clamped to the largest finite half
 * (0x7bff, i.e. 65504) with the input's sign.
 */
void float16(uint16_t *__restrict out, const float in) {
    /* Reinterpret through a union; reading the float via a casted
     * uint32_t pointer would violate the strict-aliasing rule. */
    union { float f; uint32_t u; } pun;
    pun.f = in;

    const uint32_t sign = (pun.u & 0x80000000u) >> 16;  /* bit 31 -> bit 15 */
    const uint32_t exponent = pun.u & 0x7f800000u;

    /* Drop 13 mantissa bits (23 -> 10) and rebias the exponent from 127 to
     * 15: (127 - 15) << 10. The subtraction may wrap for tiny inputs; those
     * are overwritten by the flush below. */
    uint32_t magnitude = ((pun.u & 0x7fffffffu) >> 13) - 0x1c000u;

    /* Flush-to-zero: biased exponent below 113 (2^-14) has no normal half
     * representation. This also covers exponent == 0 (zero / subnormals). */
    magnitude = (exponent < 0x38800000u) ? 0u : magnitude;

    /* Clamp-to-max: biased exponent above 142 (2^15), i.e. 142 << 23. */
    magnitude = (exponent > 0x47000000u) ? 0x7bffu : magnitude;

    *out = (uint16_t)(magnitude | sign);
}
/* Absolute value of x. The argument is expanded more than once, so it must
 * be free of side effects. */
#define ABS(x) ((x) >= 0 ? (x) : -(x))

/*
 * Round-trip smoke test: convert one float to half precision and back,
 * print both values with their absolute difference, and assert that the
 * difference stays below 0.1.
 */
int main() {
    const float source_value = -42.42f;

    /* float -> half */
    uint16_t half_bits = 0;
    float16(&half_bits, source_value);

    /* half -> float */
    float restored = 0.0f;
    float32(&restored, half_bits);

    const float abs_error = ABS(source_value - restored);
    printf("orig %f quantized %f absdiff %f\n", source_value, restored, abs_error);
    assert(abs_error < 0.1f);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment