From 32b35070f5d78c251999de36990603fd21c0943c Mon Sep 17 00:00:00 2001
From: pleroy <pascal.h.leroy@gmail.com>
Date: Wed, 9 Oct 2024 22:16:58 +0200
Subject: [PATCH] Support for argument reduction.

---
 benchmarks/elementary_functions_benchmark.cpp |  2 +-
 functions/sin_cos_test.cpp                    |  7 ++--
 numerics/sin_cos.cpp                          | 32 ++++++++++++++++---
 3 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/benchmarks/elementary_functions_benchmark.cpp b/benchmarks/elementary_functions_benchmark.cpp
index 83af680d80..58e5f31b45 100644
--- a/benchmarks/elementary_functions_benchmark.cpp
+++ b/benchmarks/elementary_functions_benchmark.cpp
@@ -28,7 +28,7 @@ void BM_EvaluateElementaryFunction(benchmark::State& state) {
   using Argument = double;
 
   std::mt19937_64 random(42);
-  std::uniform_real_distribution<> uniformly_at(0, π / 4);
+  std::uniform_real_distribution<> uniformly_at(-2 * π, 2 * π);
 
   Argument a[number_of_iterations];
   for (std::int64_t i = 0; i < number_of_iterations; ++i) {
diff --git a/functions/sin_cos_test.cpp b/functions/sin_cos_test.cpp
index de665b45c0..25a650a924 100644
--- a/functions/sin_cos_test.cpp
+++ b/functions/sin_cos_test.cpp
@@ -23,8 +23,7 @@ class SinCosTest : public ::testing::Test {};
 
 TEST_F(SinCosTest, Random) {
   std::mt19937_64 random(42);
-  // TODO(phl): Negative angles.
-  std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4);
+  std::uniform_real_distribution<> uniformly_at(-2 * π, 2 * π);
 
   cpp_bin_float_50 max_sin_ulps_error = 0;
   cpp_bin_float_50 max_cos_ulps_error = 0;
@@ -69,8 +68,8 @@ TEST_F(SinCosTest, Random) {
   }
 
   // This implementation is not quite correctly rounded, but not far from it.
-  EXPECT_LE(max_sin_ulps_error, 0.500003);
-  EXPECT_LE(max_cos_ulps_error, 0.500001);
+  EXPECT_LE(max_sin_ulps_error, 0.500138);
+  EXPECT_LE(max_cos_ulps_error, 0.500146);
 
   LOG(ERROR) << "Sin error: " << max_sin_ulps_error << std::setprecision(25)
              << " ulps for argument: " << worst_sin_argument
diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp
index 0b2c20cc83..88e59ac220 100644
--- a/numerics/sin_cos.cpp
+++ b/numerics/sin_cos.cpp
@@ -8,6 +8,7 @@
 #include "numerics/double_precision.hpp"
 #include "numerics/fma.hpp"
 #include "numerics/polynomial_evaluators.hpp"
+#include "quantities/elementary_functions.hpp"
 
 namespace principia {
 namespace numerics {
@@ -18,6 +19,7 @@ using namespace principia::numerics::_accurate_tables;
 using namespace principia::numerics::_double_precision;
 using namespace principia::numerics::_fma;
 using namespace principia::numerics::_polynomial_evaluators;
+using namespace principia::quantities::_elementary_functions;
 
 using Argument = double;
 using Value = double;
@@ -27,7 +29,7 @@ constexpr Argument table_spacing_reciprocal = 512.0;
 
 // π / 2 split so that the high half has 18 zeros at the end of its mantissa.
 constexpr std::int64_t π_over_2_zeroes = 18;
-constexpr Argument π_over_2_threshold = 1 << π_over_2_zeroes;//NONONO
+constexpr Argument π_over_2_threshold = π / 2 * (1 << π_over_2_zeroes);
 constexpr Argument π_over_2_high = 0x1.921fb'5444p0;
 constexpr Argument π_over_2_low = 0x2.d1846'9898c'c5170'1b839p-40;
 
@@ -39,6 +41,17 @@ static const __m128d sign_bit =
     _mm_castsi128_pd(_mm_cvtsi64_si128(0x8000'0000'0000'0000));
 }  // namespace masks
 
+template<FMAPolicy fma_policy>
+double FusedMultiplyAdd(double const a, double const b, double const c) {
+  if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
+      (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
+    using quantities::_elementary_functions::FusedMultiplyAdd;
+    return FusedMultiplyAdd(a, b, c);
+  } else {
+    return a * b + c;
+  }
+}
+
 void Reduce(Argument const x,
             DoublePrecision<Argument>& x_reduced,
             std::int64_t& quadrant) {
@@ -90,7 +103,8 @@ Value SinImplementation(DoublePrecision<Argument> const argument) {
   if (abs_x < sin_near_zero_cutoff) {
     double const x² = x * x;
     double const x³ = x² * x;
-    return x + (x³ * SinPolynomialNearZero<fma_policy>(x²) + e);
+    return x + FusedMultiplyAdd<fma_policy>(
+                   x³, SinPolynomialNearZero<fma_policy>(x²), e);
   } else {
     __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
     auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
@@ -100,6 +114,10 @@ Value SinImplementation(DoublePrecision<Argument> const argument) {
     double const sin_x₀ =
         _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
     double const& cos_x₀ = accurate_values.cos_x;
+  // [GB91] incorporates `e` in the computation of `h`.  However, `e` can be
+  // very small and `h` can be as large as 1/512.  We note that the only
+  // influence of `e` is on the computation of `cos_x₀ * e`, so we incorporate
+  // `e` with the `SinPolynomial` below.
     double const h = x - x₀;
 
     DoublePrecision<double> const sin_x₀_plus_h_cos_x₀ =
@@ -108,7 +126,8 @@ Value SinImplementation(DoublePrecision<Argument> const argument) {
     double const h³ = h² * h;
     return sin_x₀_plus_h_cos_x₀.value +
            ((sin_x₀ * h² * CosPolynomial<fma_policy>(h²) +
-             cos_x₀ * (h³ * SinPolynomial<fma_policy>(h²) + e)) +
+             cos_x₀ * FusedMultiplyAdd<fma_policy>(
+                          h³, SinPolynomial<fma_policy>(h²), e)) +
             sin_x₀_plus_h_cos_x₀.error);
   }
 }
@@ -127,6 +146,10 @@ Value CosImplementation(DoublePrecision<Argument> const argument) {
   double const sin_x₀ =
       _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
   double const& cos_x₀ = accurate_values.cos_x;
+  // [GB91] incorporates `e` in the computation of `h`.  However, `e` can be
+  // very small and `h` can be as large as 1/512.  We note that the only
+  // influence of `e` is on the computation of `sin_x₀ * e`, so we incorporate
+  // `e` with the `SinPolynomial` below.
   double const h = x - x₀;
 
   DoublePrecision<double> const cos_x₀_minus_h_sin_x₀ =
@@ -135,7 +158,8 @@ Value CosImplementation(DoublePrecision<Argument> const argument) {
   double const h³ = h² * h;
   return cos_x₀_minus_h_sin_x₀.value +
          ((cos_x₀ * h² * CosPolynomial<fma_policy>(h²) -
-           sin_x₀ * (h³ * SinPolynomial<fma_policy>(h²) + e)) +
+           sin_x₀ * FusedMultiplyAdd<fma_policy>(
+                        h³, SinPolynomial<fma_policy>(h²), e)) +
           cos_x₀_minus_h_sin_x₀.error);
 }