/* * Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of NVIDIA Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Release 1.2 * * (1) Deployed new implementation of div_dbldbl() and sqrt_dbldbl() based on * Newton-Raphson iteration, providing significant speedup. * (2) Added new function rsqrt_dbldbl() which provides reciprocal square root. * * Release 1.1 * * (1) Fixed a bug affecting add_dbldbl() and sub_dbldbl() that in very rare * cases returned results with reduced accuracy. * (2) Replaced the somewhat inaccurate error bounds with the experimentally * observed maximum relative error. */ #if !defined(DBLDBL_H_) #define DBLDBL_H_ #if defined(__cplusplus) extern "C" { #endif /* __cplusplus */ #include /* import sqrt() */ /* The head of a double-double number is stored in the most significant part of a double2 (the y-component). The tail is stored in the least significant part of the double2 (the x-component). All double-double operands must be normalized on both input to and return from all basic operations, i.e. the magnitude of the tail shall be <= 0.5 ulp of the head. */ typedef double2 dbldbl; /* Create a double-double from two doubles. No normalization is performed, so the head and tail components passed in must satisfy the normalization requirement. To create a double-double from two arbitrary double-precision numbers, use add_double_to_dbldbl(). */ __device__ __forceinline__ dbldbl make_dbldbl (double head, double tail) { dbldbl z; z.x = tail; z.y = head; return z; } /* Return the head of a double-double number */ __device__ __forceinline__ double get_dbldbl_head (dbldbl a) { return a.y; } /* Return the tail of a double-double number */ __device__ __forceinline__ double get_dbldbl_tail (dbldbl a) { return a.x; } /* Compute error-free sum of two unordered doubles. See Knuth, TAOCP vol. 2 */ __device__ __forceinline__ dbldbl add_double_to_dbldbl (double a, double b) { double t1, t2; dbldbl z; z.y = __dadd_rn (a, b); t1 = __dadd_rn (z.y, -a); t2 = __dadd_rn (z.y, -t1); t1 = __dadd_rn (b, -t1); t2 = __dadd_rn (a, -t2); z.x = __dadd_rn (t1, t2); return z; } /* Compute error-free product of two doubles. Take full advantage of FMA */ __device__ __forceinline__ dbldbl mul_double_to_dbldbl (double a, double b) { dbldbl z; z.y = __dmul_rn (a, b); z.x = __fma_rn (a, b, -z.y); return z; } /* Negate a double-double number, by separately negating head and tail */ __device__ __forceinline__ dbldbl neg_dbldbl (dbldbl a) { dbldbl z; z.y = -a.y; z.x = -a.x; return z; } /* Compute high-accuracy sum of two double-double operands. In the absence of underflow and overflow, the maximum relative error observed with 10 billion test cases was 3.0716194922303448e-32 (~= 2**-104.6826). This implementation is based on: Andrew Thall, Extended-Precision Floating-Point Numbers for GPU Computation. Retrieved on 7/12/2011 from http://andrewthall.org/papers/df64_qf128.pdf. */ __device__ __forceinline__ dbldbl add_dbldbl (dbldbl a, dbldbl b) { dbldbl z; double t1, t2, t3, t4, t5, e; t1 = __dadd_rn (a.y, b.y); t2 = __dadd_rn (t1, -a.y); t3 = __dadd_rn (__dadd_rn (a.y, t2 - t1), __dadd_rn (b.y, -t2)); t4 = __dadd_rn (a.x, b.x); t2 = __dadd_rn (t4, -a.x); t5 = __dadd_rn (__dadd_rn (a.x, t2 - t4), __dadd_rn (b.x, -t2)); t3 = __dadd_rn (t3, t4); t4 = __dadd_rn (t1, t3); t3 = __dadd_rn (t1 - t4, t3); t3 = __dadd_rn (t3, t5); z.y = e = __dadd_rn (t4, t3); z.x = __dadd_rn (t4 - e, t3); return z; } /* Compute high-accuracy difference of two double-double operands. In the absence of underflow and overflow, the maximum relative error observed with 10 billion test cases was 3.0716194922303448e-32 (~= 2**-104.6826). This implementation is based on: Andrew Thall, Extended-Precision Floating-Point Numbers for GPU Computation. Retrieved on 7/12/2011 from http://andrewthall.org/papers/df64_qf128.pdf. */ __device__ __forceinline__ dbldbl sub_dbldbl (dbldbl a, dbldbl b) { dbldbl z; double t1, t2, t3, t4, t5, e; t1 = __dadd_rn (a.y, -b.y); t2 = __dadd_rn (t1, -a.y); t3 = __dadd_rn (__dadd_rn (a.y, t2 - t1), - __dadd_rn (b.y, t2)); t4 = __dadd_rn (a.x, -b.x); t2 = __dadd_rn (t4, -a.x); t5 = __dadd_rn (__dadd_rn (a.x, t2 - t4), - __dadd_rn (b.x, t2)); t3 = __dadd_rn (t3, t4); t4 = __dadd_rn (t1, t3); t3 = __dadd_rn (t1 - t4, t3); t3 = __dadd_rn (t3, t5); z.y = e = __dadd_rn (t4, t3); z.x = __dadd_rn (t4 - e, t3); return z; } /* Compute high-accuracy product of two double-double operands, taking full advantage of FMA. In the absence of underflow and overflow, the maximum relative error observed with 10 billion test cases was 5.238480533564479e-32 (~= 2**-103.9125). */ __device__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b) { dbldbl t, z; double e; t.y = __dmul_rn (a.y, b.y); t.x = __fma_rn (a.y, b.y, -t.y); t.x = __fma_rn (a.x, b.x, t.x); t.x = __fma_rn (a.y, b.x, t.x); t.x = __fma_rn (a.x, b.y, t.x); z.y = e = __dadd_rn (t.y, t.x); z.x = __dadd_rn (t.y - e, t.x); return z; } /* Compute high-accuracy quotient of two double-double operands, using Newton- Raphson iteration. Based on: T. Nagai, H. Yoshida, H. Kuroda, Y. Kanada. Fast Quadruple Precision Arithmetic Library on Parallel Computer SR11000/J2. In Proceedings of the 8th International Conference on Computational Science, ICCS '08, Part I, pp. 446-455. In the absence of underflow and overflow, the maximum relative error observed with 10 billion test cases was 1.0161322480099059e-31 (~= 2**-102.9566). */ __device__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b) { dbldbl t, z; double e, r; r = 1.0 / b.y; t.y = __dmul_rn (a.y, r); e = __fma_rn (b.y, -t.y, a.y); t.y = __fma_rn (r, e, t.y); t.x = __fma_rn (b.y, -t.y, a.y); t.x = __dadd_rn (a.x, t.x); t.x = __fma_rn (b.x, -t.y, t.x); e = __dmul_rn (r, t.x); t.x = __fma_rn (b.y, -e, t.x); t.x = __fma_rn (r, t.x, e); z.y = e = __dadd_rn (t.y, t.x); z.x = __dadd_rn (t.y - e, t.x); return z; } /* Compute high-accuracy square root of a double-double number. Newton-Raphson iteration based on equation 4 from a paper by Alan Karp and Peter Markstein, High Precision Division and Square Root, ACM TOMS, vol. 23, no. 4, December 1997, pp. 561-589. In the absence of underflow and overflow, the maximum relative error observed with 10 billion test cases was 3.7564109505601846e-32 (~= 2**-104.3923). */ __device__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a) { dbldbl t, z; double e, y, s, r; r = rsqrt (a.y); if (a.y == 0.0) r = 0.0; y = __dmul_rn (a.y, r); s = __fma_rn (y, -y, a.y); r = __dmul_rn (0.5, r); z.y = e = __dadd_rn (s, a.x); z.x = __dadd_rn (s - e, a.x); t.y = __dmul_rn (r, z.y); t.x = __fma_rn (r, z.y, -t.y); t.x = __fma_rn (r, z.x, t.x); r = __dadd_rn (y, t.y); s = __dadd_rn (y - r, t.y); s = __dadd_rn (s, t.x); z.y = e = __dadd_rn (r, s); z.x = __dadd_rn (r - e, s); return z; } /* Compute high-accuracy reciprocal square root of a double-double number. Based on Newton-Raphson iteration. In the absence of underflow and overflow, the maximum relative error observed with 10 billion test cases was 6.4937771666026349e-32 (~= 2**-103.6026) */ __device__ __forceinline__ dbldbl rsqrt_dbldbl (dbldbl a) { dbldbl z; double r, s, e; r = rsqrt (a.y); e = __dmul_rn (a.y, r); s = __fma_rn (e, -r, 1.0); e = __fma_rn (a.y, r, -e); s = __fma_rn (e, -r, s); e = __dmul_rn (a.x, r); s = __fma_rn (e, -r, s); e = 0.5 * r; z.y = __dmul_rn (e, s); z.x = __fma_rn (e, s, -z.y); s = __dadd_rn (r, z.y); r = __dadd_rn (r, -s); r = __dadd_rn (r, z.y); r = __dadd_rn (r, z.x); z.y = e = __dadd_rn (s, r); z.x = __dadd_rn (s - e, r); return z; } #if defined(__cplusplus) } #endif /* __cplusplus */ #endif /* DBLDBL_H_ */