GenericPacketMathFunctions.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2007 Julien Pommier
5 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
6 // Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 
12 /* The exp and log functions of this file initially come from
13  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
14  */
15 
16 #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
17 #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
18 
19 #include "../../InternalHeaderCheck.h"
20 
21 namespace Eigen {
22 namespace internal {
23 
24 // Creates a Scalar integer type with same bit-width.
25 template<typename T> struct make_integer;
26 template<> struct make_integer<float> { typedef numext::int32_t type; };
27 template<> struct make_integer<double> { typedef numext::int64_t type; };
28 template<> struct make_integer<half> { typedef numext::int16_t type; };
29 template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
30 
31 template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
32 Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
33  typedef typename unpacket_traits<Packet>::type Scalar;
34  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
35  static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
36  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
37 }
38 
39 // Safely applies frexp, correctly handles denormals.
40 // Assumes IEEE floating point format.
41 template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
42 Packet pfrexp_generic(const Packet& a, Packet& exponent) {
43  typedef typename unpacket_traits<Packet>::type Scalar;
44  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
45  static constexpr int
46  TotalBits = sizeof(Scalar) * CHAR_BIT,
47  MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
48  ExponentBits = TotalBits - MantissaBits - 1;
49 
50  EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
51  ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
52  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
53  const Packet half = pset1<Packet>(Scalar(0.5));
54  const Packet zero = pzero(a);
55  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
56 
57  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
58  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
59  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
60  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
61  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
62  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
63  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
64 
65  // Determine exponent offset: -126 if normal, -126-24 if denormal
66  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(ExponentBits-1)) - ScalarUI(2)); // -126
67  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
68  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
69  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
70 
71  // Determine exponent and mantissa from normalized_a.
72  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
73  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
74  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
75  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255
76  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
77  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
78  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
79  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
80  return m;
81 }
82 
83 // Safely applies ldexp, correctly handles overflows, underflows and denormals.
84 // Assumes IEEE floating point format.
85 template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
86 Packet pldexp_generic(const Packet& a, const Packet& exponent) {
87  // We want to return a * 2^exponent, allowing for all possible integer
88  // exponents without overflowing or underflowing in intermediate
89  // computations.
90  //
91  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
92  // to consider for a float is:
93  // -255-23 -> 255+23
94  // Below -278 any finite float 'a' will become zero, and above +278 any
95  // finite float will become inf, including when 'a' is the smallest possible
96  // denormal.
97  //
98  // Unfortunately, 2^(278) cannot be represented using either one or two
99  // finite normal floats, so we must split the scale factor into at least
100  // three parts. It turns out to be faster to split 'exponent' into four
101  // factors, since [exponent>>2] is much faster to compute that [exponent/3].
102  //
103  // Set e = min(max(exponent, -278), 278);
104  // b = floor(e/4);
105  // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
106  //
107  // This will avoid any intermediate overflows and correctly handle 0, inf,
108  // NaN cases.
109  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
110  typedef typename unpacket_traits<Packet>::type Scalar;
111  typedef typename unpacket_traits<PacketI>::type ScalarI;
112  static constexpr int
113  TotalBits = sizeof(Scalar) * CHAR_BIT,
114  MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
115  ExponentBits = TotalBits - MantissaBits - 1;
116 
117  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<ExponentBits) + ScalarI(MantissaBits - 1))); // 278
118  const PacketI bias = pset1<PacketI>((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1)); // 127
119  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
120  PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
121  Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^b
122  Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
123  b = psub(psub(psub(e, b), b), b); // e - 3b
124  c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^(e-3*b)
125  out = pmul(out, c);
126  return out;
127 }
128 
129 // Explicitly multiplies
130 // a * (2^e)
131 // clamping e to the range
132 // [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
133 //
134 // This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
135 // if 2^e doesn't fit into a normal floating-point Scalar.
136 //
137 // Assumes IEEE floating point format
138 template<typename Packet>
139 struct pldexp_fast_impl {
140  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
141  typedef typename unpacket_traits<Packet>::type Scalar;
142  typedef typename unpacket_traits<PacketI>::type ScalarI;
143  static constexpr int
144  TotalBits = sizeof(Scalar) * CHAR_BIT,
145  MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
146  ExponentBits = TotalBits - MantissaBits - 1;
147 
148  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
149  Packet run(const Packet& a, const Packet& exponent) {
150  const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1))); // 127
151  const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<ExponentBits) - ScalarI(1))); // 255
152  // restrict biased exponent between 0 and 255 for float.
153  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
154  // return a * (2^e)
155  return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
156  }
157 };
158 
159 // Natural or base 2 logarithm.
160 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
161 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
162 // be easily approximated by a polynomial centered on m=1 for stability.
163 // TODO(gonnet): Further reduce the interval allowing for lower-degree
164 // polynomial interpolants -> ... -> profit!
165 template <typename Packet, bool base2>
167 Packet plog_impl_float(const Packet _x)
168 {
169  const Packet cst_1 = pset1<Packet>(1.0f);
170  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
171  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
172 
173  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
174  Packet e, x;
175  // extract significant in the range [0.5,1) and exponent
176  x = pfrexp(_x,e);
177 
178  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
179  // and shift by -1. The values are then centered around 0, which improves
180  // the stability of the polynomial evaluation.
181  // if( x < SQRTHF ) {
182  // e -= 1;
183  // x = x + x - 1.0;
184  // } else { x = x - 1.0; }
185  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
186  Packet tmp = pand(x, mask);
187  x = psub(x, cst_1);
188  e = psub(e, pand(cst_1, mask));
189  x = padd(x, tmp);
190 
191  // Polynomial coefficients for rational (3,3) r(x) = p(x)/q(x)
192  // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
193  const Packet cst_p1 = pset1<Packet>(1.0000000190281136f);
194  const Packet cst_p2 = pset1<Packet>(1.0000000190281063f);
195  const Packet cst_p3 = pset1<Packet>(0.18256296349849254f);
196  const Packet cst_q1 = pset1<Packet>(1.4999999999999927f);
197  const Packet cst_q2 = pset1<Packet>(0.59923249590823520f);
198  const Packet cst_q3 = pset1<Packet>(0.049616247954120038f);
199 
200  Packet p = pmadd(x, cst_p3, cst_p2);
201  p = pmadd(x, p, cst_p1);
202  p = pmul(x, p);
203  Packet q = pmadd(x, cst_q3, cst_q2);
204  q = pmadd(x, q, cst_q1);
205  q = pmadd(x, q, cst_1);
206  x = pdiv(p, q);
207 
208  // Add the logarithm of the exponent back to the result of the interpolation.
209  if (base2) {
210  const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
211  x = pmadd(x, cst_log2e, e);
212  } else {
213  const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
214  x = pmadd(e, cst_ln2, x);
215  }
216 
217  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
218  Packet iszero_mask = pcmp_eq(_x,pzero(_x));
219  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
220  // Filter out invalid inputs, i.e.:
221  // - negative arg will be NAN
222  // - 0 will be -INF
223  // - +INF will be +INF
224  return pselect(iszero_mask, cst_minus_inf,
225  por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
226 }
227 
228 template <typename Packet>
230 Packet plog_float(const Packet _x)
231 {
232  return plog_impl_float<Packet, /* base2 */ false>(_x);
233 }
234 
235 template <typename Packet>
237 Packet plog2_float(const Packet _x)
238 {
239  return plog_impl_float<Packet, /* base2 */ true>(_x);
240 }
241 
242 /* Returns the base e (2.718...) or base 2 logarithm of x.
243  * The argument is separated into its exponent and fractional parts.
244  * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
245  * is approximated by
246  *
247  * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
248  *
249  * for more detail see: http://www.netlib.org/cephes/
250  */
251 template <typename Packet, bool base2>
253 Packet plog_impl_double(const Packet _x)
254 {
255  Packet x = _x;
256 
257  const Packet cst_1 = pset1<Packet>(1.0);
258  const Packet cst_neg_half = pset1<Packet>(-0.5);
259  const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
260  const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
261 
262 
263  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
264  // 1/sqrt(2) <= x < sqrt(2)
265  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
266  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
267  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
268  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
269  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
270  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
271  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
272 
273  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
274  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
275  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
276  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
277  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
278  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
279 
280  Packet e;
281  // extract significant in the range [0.5,1) and exponent
282  x = pfrexp(x,e);
283 
284  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
285  // and shift by -1. The values are then centered around 0, which improves
286  // the stability of the polynomial evaluation.
287  // if( x < SQRTHF ) {
288  // e -= 1;
289  // x = x + x - 1.0;
290  // } else { x = x - 1.0; }
291  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
292  Packet tmp = pand(x, mask);
293  x = psub(x, cst_1);
294  e = psub(e, pand(cst_1, mask));
295  x = padd(x, tmp);
296 
297  Packet x2 = pmul(x, x);
298  Packet x3 = pmul(x2, x);
299 
300  // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
301  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
302  Packet y, y1, y_;
303  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
304  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
305  y = pmadd(y, x, cst_cephes_log_p2);
306  y1 = pmadd(y1, x, cst_cephes_log_p5);
307  y_ = pmadd(y, x3, y1);
308 
309  y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
310  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
311  y = pmadd(y, x, cst_cephes_log_q2);
312  y1 = pmadd(y1, x, cst_cephes_log_q5);
313  y = pmadd(y, x3, y1);
314 
315  y_ = pmul(y_, x3);
316  y = pdiv(y_, y);
317 
318  y = pmadd(cst_neg_half, x2, y);
319  x = padd(x, y);
320 
321  // Add the logarithm of the exponent back to the result of the interpolation.
322  if (base2) {
323  const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
324  x = pmadd(x, cst_log2e, e);
325  } else {
326  const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
327  x = pmadd(e, cst_ln2, x);
328  }
329 
330  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
331  Packet iszero_mask = pcmp_eq(_x,pzero(_x));
332  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
333  // Filter out invalid inputs, i.e.:
334  // - negative arg will be NAN
335  // - 0 will be -INF
336  // - +INF will be +INF
337  return pselect(iszero_mask, cst_minus_inf,
338  por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
339 }
340 
341 template <typename Packet>
343 Packet plog_double(const Packet _x)
344 {
345  return plog_impl_double<Packet, /* base2 */ false>(_x);
346 }
347 
348 template <typename Packet>
350 Packet plog2_double(const Packet _x)
351 {
352  return plog_impl_double<Packet, /* base2 */ true>(_x);
353 }
354 
358 template<typename Packet>
359 Packet generic_plog1p(const Packet& x)
360 {
361  typedef typename unpacket_traits<Packet>::type ScalarType;
362  const Packet one = pset1<Packet>(ScalarType(1));
363  Packet xp1 = padd(x, one);
364  Packet small_mask = pcmp_eq(xp1, one);
365  Packet log1 = plog(xp1);
366  Packet inf_mask = pcmp_eq(xp1, log1);
367  Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
368  return pselect(por(small_mask, inf_mask), x, log_large);
369 }
370 
374 template<typename Packet>
375 Packet generic_expm1(const Packet& x)
376 {
377  typedef typename unpacket_traits<Packet>::type ScalarType;
378  const Packet one = pset1<Packet>(ScalarType(1));
379  const Packet neg_one = pset1<Packet>(ScalarType(-1));
380  Packet u = pexp(x);
381  Packet one_mask = pcmp_eq(u, one);
382  Packet u_minus_one = psub(u, one);
383  Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
384  Packet logu = plog(u);
385  // The following comparison is to catch the case where
386  // exp(x) = +inf. It is written in this way to avoid having
387  // to form the constant +inf, which depends on the packet
388  // type.
389  Packet pos_inf_mask = pcmp_eq(logu, u);
390  Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
391  expm1 = pselect(pos_inf_mask, u, expm1);
392  return pselect(one_mask,
393  x,
394  pselect(neg_one_mask,
395  neg_one,
396  expm1));
397 }
398 
399 
400 // Exponential function. Works by writing "x = m*log(2) + r" where
401 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
402 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
403 // exp(r) is computed using a 6th order minimax polynomial approximation.
404 template <typename Packet>
406 Packet pexp_float(const Packet _x)
407 {
408  const Packet cst_zero = pset1<Packet>(0.0f);
409  const Packet cst_one = pset1<Packet>(1.0f);
410  const Packet cst_half = pset1<Packet>(0.5f);
411  const Packet cst_exp_hi = pset1<Packet>(88.723f);
412  const Packet cst_exp_lo = pset1<Packet>(-104.f);
413 
414  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
415  const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
416  const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
417  const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
418  const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
419  const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
420 
421  // Clamp x.
422  Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
423  Packet x = pmin(_x, cst_exp_hi);
424 
425  // Express exp(x) as exp(m*ln(2) + r), start by extracting
426  // m = floor(x/ln(2) + 0.5).
427  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
428 
429  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
430  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
431  // truncation errors.
432  const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
433  const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
434  Packet r = pmadd(m, cst_cephes_exp_C1, x);
435  r = pmadd(m, cst_cephes_exp_C2, r);
436 
437  // Evaluate the 6th order polynomial approximation to exp(r)
438  // with r in the interval [-ln(2)/2;ln(2)/2].
439  const Packet r2 = pmul(r, r);
440  Packet p_even = pmadd(r2, cst_p6, cst_p4);
441  const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
442  p_even = pmadd(r2, p_even, cst_p2);
443  const Packet p_low = padd(r, cst_one);
444  Packet y = pmadd(r, p_odd, p_even);
445  y = pmadd(r2, y, p_low);
446 
447  // Return 2^m * exp(r).
448  // TODO: replace pldexp with faster implementation since y in [-1, 1).
449  return pselect(zero_mask, cst_zero, pmax(pldexp(y,m), _x));
450 }
451 
452 template <typename Packet>
454 Packet pexp_double(const Packet _x)
455 {
456  Packet x = _x;
457  const Packet cst_zero = pset1<Packet>(0.0);
458  const Packet cst_1 = pset1<Packet>(1.0);
459  const Packet cst_2 = pset1<Packet>(2.0);
460  const Packet cst_half = pset1<Packet>(0.5);
461 
462  const Packet cst_exp_hi = pset1<Packet>(709.784);
463  const Packet cst_exp_lo = pset1<Packet>(-709.784);
464 
465  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
466  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
467  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
468  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
469  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
470  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
471  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
472  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
473  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
474  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
475 
476  Packet tmp, fx;
477 
478  // clamp x
479  Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
480  x = pmin(x, cst_exp_hi);
481  // Express exp(x) as exp(g + n*log(2)).
482  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
483 
484  // Get the integer modulus of log(2), i.e. the "n" described above.
485  fx = pfloor(fx);
486 
487  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
488  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
489  // digits right.
490  tmp = pmul(fx, cst_cephes_exp_C1);
491  Packet z = pmul(fx, cst_cephes_exp_C2);
492  x = psub(x, tmp);
493  x = psub(x, z);
494 
495  Packet x2 = pmul(x, x);
496 
497  // Evaluate the numerator polynomial of the rational interpolant.
498  Packet px = cst_cephes_exp_p0;
499  px = pmadd(px, x2, cst_cephes_exp_p1);
500  px = pmadd(px, x2, cst_cephes_exp_p2);
501  px = pmul(px, x);
502 
503  // Evaluate the denominator polynomial of the rational interpolant.
504  Packet qx = cst_cephes_exp_q0;
505  qx = pmadd(qx, x2, cst_cephes_exp_q1);
506  qx = pmadd(qx, x2, cst_cephes_exp_q2);
507  qx = pmadd(qx, x2, cst_cephes_exp_q3);
508 
509  // I don't really get this bit, copied from the SSE2 routines, so...
510  // TODO(gonnet): Figure out what is going on here, perhaps find a better
511  // rational interpolant?
512  x = pdiv(px, psub(qx, px));
513  x = pmadd(cst_2, x, cst_1);
514 
515  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
516  // non-finite values in the input.
517  // TODO: replace pldexp with faster implementation since x in [-1, 1).
518  return pselect(zero_mask, cst_zero, pmax(pldexp(x,fx), _x));
519 }
520 
521 // The following code is inspired by the following stack-overflow answer:
522 // https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
523 // It has been largely optimized:
524 // - By-pass calls to frexp.
525 // - Aligned loads of required 96 bits of 2/pi. This is accomplished by
526 // (1) balancing the mantissa and exponent to the required bits of 2/pi are
527 // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
528 // - Avoid a branch in rounding and extraction of the remaining fractional part.
529 // Overall, I measured a speed up higher than x2 on x86-64.
530 inline float trig_reduce_huge (float xf, Eigen::numext::int32_t *quadrant)
531 {
536 
537  const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
538  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
539 
540  // 192 bits of 2/pi for Payne-Hanek reduction
541  // Bits are introduced by packet of 8 to enable aligned reads.
542  static const uint32_t two_over_pi [] =
543  {
544  0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
545  0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
546  0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
547  0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
548  0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
549  0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
550  0x10e41000, 0xe4100000
551  };
552 
553  uint32_t xi = numext::bit_cast<uint32_t>(xf);
554  // Below, -118 = -126 + 8.
555  // -126 is to get the exponent,
556  // +8 is to enable alignment of 2/pi's bits on 8 bits.
557  // This is possible because the fractional part of x as only 24 meaningful bits.
558  uint32_t e = (xi >> 23) - 118;
559  // Extract the mantissa and shift it to align it wrt the exponent
560  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
561 
562  uint32_t i = e >> 3;
563  uint32_t twoopi_1 = two_over_pi[i-1];
564  uint32_t twoopi_2 = two_over_pi[i+3];
565  uint32_t twoopi_3 = two_over_pi[i+7];
566 
567  // Compute x * 2/pi in 2.62-bit fixed-point format.
568  uint64_t p;
569  p = uint64_t(xi) * twoopi_3;
570  p = uint64_t(xi) * twoopi_2 + (p >> 32);
571  p = (uint64_t(xi * twoopi_1) << 32) + p;
572 
573  // Round to nearest: add 0.5 and extract integral part.
574  uint64_t q = (p + zero_dot_five) >> 62;
575  *quadrant = int(q);
576  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
577  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
578  // r = (p-q)*pi/2,
579  // where the product can be be carried out with sufficient accuracy using double precision.
580  p -= q<<62;
581  return float(double(int64_t(p)) * pio2_62);
582 }
583 
584 template<bool ComputeSine,typename Packet>
586 #if EIGEN_COMP_GNUC_STRICT
587 __attribute__((optimize("-fno-unsafe-math-optimizations")))
588 #endif
589 Packet psincos_float(const Packet& _x)
590 {
591  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
592 
593  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
594  const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
595  const PacketI csti_1 = pset1<PacketI>(1);
596  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
597 
598  Packet x = pabs(_x);
599 
600  // Scale x by 2/Pi to find x's octant.
601  Packet y = pmul(x, cst_2oPI);
602 
603  // Rounding trick to find nearest integer:
604  Packet y_round = padd(y, cst_rounding_magic);
606  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
607  y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
608 
609  // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
610  // using "Extended precision modular arithmetic"
611  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
612  // This version requires true FMA for high accuracy
613  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
614  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
615  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
616  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
617  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
618  #else
619  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
620  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
621  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
622 
623  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
624  // and 2 ULP up to:
625  const float huge_th = ComputeSine ? 25966.f : 18838.f;
626  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
628  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
630  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
631  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
632 
633  // For the record, the following set of coefficients maintain 2ULP up
634  // to a slightly larger range:
635  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
636  // but it slightly fails to maintain 1ULP for two values of sin below pi.
637  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
638  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
639  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
640  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
641 
642  // For the record, with only 3 iterations it is possible to maintain
643  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
644  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
645  #endif
646 
647  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
648  {
649  const int PacketSize = unpacket_traits<Packet>::size;
650  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
651  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
652  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
653  pstoreu(vals, pabs(_x));
654  pstoreu(x_cpy, x);
655  pstoreu(y_int2, y_int);
656  for(int k=0; k<PacketSize;++k)
657  {
658  float val = vals[k];
659  if(val>=huge_th && (numext::isfinite)(val))
660  x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
661  }
662  x = ploadu<Packet>(x_cpy);
663  y_int = ploadu<PacketI>(y_int2);
664  }
665 
666  // Compute the sign to apply to the polynomial.
667  // sin: sign = second_bit(y_int) xor signbit(_x)
668  // cos: sign = second_bit(y_int+1)
669  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
670  : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
671  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
672 
673  // Get the polynomial selection mask from the second bit of y_int
674  // We'll calculate both (sin and cos) polynomials and then select from the two.
675  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
676 
677  Packet x2 = pmul(x,x);
678 
679  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
680  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
681  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
682  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
683  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
684  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
685 
686  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
687  // octave/matlab code to compute those coefficients:
688  // x = (0:0.0001:pi/4)';
689  // A = [x.^3 x.^5 x.^7];
690  // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
691  // c = (A'*diag(w)*A)\‍(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
692  // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
693  //
694  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
695  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
696  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
697  y2 = pmul(y2, x2);
698  y2 = pmadd(y2, x, x);
699 
700  // Select the correct result from the two polynomials.
701  y = ComputeSine ? pselect(poly_mask,y2,y1)
702  : pselect(poly_mask,y1,y2);
703 
704  // Update the sign and filter huge inputs
705  return pxor(y, sign_bit);
706 }
707 
708 template<typename Packet>
710 Packet psin_float(const Packet& x)
711 {
712  return psincos_float<true>(x);
713 }
714 
715 template<typename Packet>
717 Packet pcos_float(const Packet& x)
718 {
719  return psincos_float<false>(x);
720 }
721 
722 // Generic implementation of acos(x).
723 template<typename Packet>
725 Packet pacos_float(const Packet& x_in) {
726  typedef typename unpacket_traits<Packet>::type Scalar;
727  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
728 
729  const Packet cst_one = pset1<Packet>(Scalar(1));
730  const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
731  const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
732  const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
733  const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
734  const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
735  const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
736  const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
737  const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
738 
739  // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
740  // function, by a 6'th order polynomial.
741  // For x in [-1:0) we use that acos(-x) = pi - acos(x).
742  const Packet neg_mask = psignbit(x_in);
743  const Packet abs_x = pabs(x_in);
744 
745  // Evaluate the polynomial using Horner's rule:
746  // P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
747  // We evaluate even and odd terms independently to increase
748  // instruction level parallelism.
749  Packet x2 = pmul(x_in,x_in);
750  Packet p_even = pmadd(p6, x2, p4);
751  Packet p_odd = pmadd(p5, x2, p3);
752  p_even = pmadd(p_even, x2, p2);
753  p_odd = pmadd(p_odd, x2, p1);
754  p_even = pmadd(p_even, x2, p0);
755  Packet p = pmadd(p_odd, abs_x, p_even);
756 
757  // The polynomial approximates acos(x)/sqrt(1-x), so
758  // multiply by sqrt(1-x) to get acos(x).
759  // Conveniently returns NaN for arguments outside [-1:1].
760  Packet denom = psqrt(psub(cst_one, abs_x));
761  Packet result = pmul(denom, p);
762  // Undo mapping for negative arguments.
763  return pselect(neg_mask, psub(cst_pi, result), result);
764 }
765 
766 // Generic implementation of asin(x).
767 template<typename Packet>
769 Packet pasin_float(const Packet& x_in) {
770  typedef typename unpacket_traits<Packet>::type Scalar;
771  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
772 
773  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
774 
775  const Packet cst_half = pset1<Packet>(0.5f);
776  const Packet cst_one = pset1<Packet>(1.0f);
777  const Packet cst_two = pset1<Packet>(2.0f);
778  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
779  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
780  // even terms only.
781  const Packet p9 = pset1<Packet>(5.08838854730129241943359375e-2f);
782  const Packet p7 = pset1<Packet>(3.95139865577220916748046875e-2f);
783  const Packet p5 = pset1<Packet>(7.550220191478729248046875e-2f);
784  const Packet p3 = pset1<Packet>(0.16664917767047882080078125f);
785  const Packet p1 = pset1<Packet>(1.00000011920928955078125f);
786 
787  const Packet abs_x = pabs(x_in);
788  const Packet sign_mask = pandnot(x_in, abs_x);
789  const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
790 
791  // For arguments |x| > 0.5, we map x back to [0:0.5] using
792  // the transformation x_large = sqrt(0.5*(1-x)), and use the
793  // identity
794  // asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
795 
796  const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
797  const Packet large_mask = pcmp_lt(cst_half, abs_x);
798  const Packet x = pselect(large_mask, x_large, abs_x);
799  const Packet x2 = pmul(x, x);
800 
801  // Compute polynomial.
802  // x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
803 
804  Packet p = pmadd(p9, x2, p7);
805  p = pmadd(p, x2, p5);
806  p = pmadd(p, x2, p3);
807  p = pmadd(p, x2, p1);
808  p = pmul(p, x);
809 
810  const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
811  p = pselect(large_mask, p_large, p);
812  // Flip the sign for negative arguments.
813  p = pxor(p, sign_mask);
814  // Return NaN for arguments outside [-1:1].
815  return por(invalid_mask, p);
816 }
817 
818 // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
819 template<typename Packet>
821 Packet patan_reduced_float(const Packet& x) {
822  const Packet q0 = pset1<Packet>(-0.3333314359188079833984375f);
823  const Packet q2 = pset1<Packet>(0.19993579387664794921875f);
824  const Packet q4 = pset1<Packet>(-0.14209578931331634521484375f);
825  const Packet q6 = pset1<Packet>(0.1066047251224517822265625f);
826  const Packet q8 = pset1<Packet>(-7.5408883392810821533203125e-2f);
827  const Packet q10 = pset1<Packet>(4.3082617223262786865234375e-2f);
828  const Packet q12 = pset1<Packet>(-1.62907354533672332763671875e-2f);
829  const Packet q14 = pset1<Packet>(2.90188402868807315826416015625e-3f);
830 
831  // Approximate atan(x) by a polynomial of the form
832  // P(x) = x + x^3 * Q(x^2),
833  // where Q(x^2) is a 7th order polynomial in x^2.
834  // We evaluate even and odd terms in x^2 in parallel
835  // to take advantage of instruction level parallelism
836  // and hardware with multiple FMA units.
837 
838  // note: if x == -0, this returns +0
839  const Packet x2 = pmul(x, x);
840  const Packet x4 = pmul(x2, x2);
841  Packet q_odd = pmadd(q14, x4, q10);
842  Packet q_even = pmadd(q12, x4, q8);
843  q_odd = pmadd(q_odd, x4, q6);
844  q_even = pmadd(q_even, x4, q4);
845  q_odd = pmadd(q_odd, x4, q2);
846  q_even = pmadd(q_even, x4, q0);
847  const Packet q = pmadd(q_odd, x2, q_even);
848  return pmadd(q, pmul(x, x2), x);
849 }
850 
851 template<typename Packet>
853 Packet patan_float(const Packet& x_in) {
854  typedef typename unpacket_traits<Packet>::type Scalar;
855  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
856 
857  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
858 
859  const Packet cst_signmask = pset1<Packet>(-0.0f);
860  const Packet cst_one = pset1<Packet>(1.0f);
861  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
862 
863  // "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
864  // "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
865  // calculated using Sollya.
866 
867  const Packet abs_x = pabs(x_in);
868  const Packet x_signmask = pand(x_in, cst_signmask);
869  const Packet large_mask = pcmp_lt(cst_one, abs_x);
870  const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
871  const Packet p = patan_reduced_float(x);
872  // Apply transformations according to the range reduction masks.
873  Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
874  // Return correct sign
875  return pxor(result, x_signmask);
876 }
877 
878 // Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)]
879 // with 2 ulp accuracy.
880 template <typename Packet>
882 patan_reduced_double(const Packet& x) {
883  const Packet q0 =
884  pset1<Packet>(-0.33333333333330028569463365784031338989734649658203);
885  const Packet q2 =
886  pset1<Packet>(0.199999999990664090177006073645316064357757568359375);
887  const Packet q4 =
888  pset1<Packet>(-0.142857141937123677255527809393242932856082916259766);
889  const Packet q6 =
890  pset1<Packet>(0.111111065991039953404495577160560060292482376098633);
891  const Packet q8 =
892  pset1<Packet>(-9.0907812986129224452902519715280504897236824035645e-2);
893  const Packet q10 =
894  pset1<Packet>(7.6900542950704739442180368769186316058039665222168e-2);
895  const Packet q12 =
896  pset1<Packet>(-6.6410112986494976294871150912513257935643196105957e-2);
897  const Packet q14 =
898  pset1<Packet>(5.6920144995467943094258345126945641823112964630127e-2);
899  const Packet q16 =
900  pset1<Packet>(-4.3577020814990513608577771265117917209863662719727e-2);
901  const Packet q18 =
902  pset1<Packet>(2.1244050233624342527427586446719942614436149597168e-2);
903 
904  // Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form
905  // P(x) = x + x^3 * Q(x^2),
906  // where Q(x^2) is a 9th order polynomial in x^2.
907  // We evaluate even and odd terms in x^2 in parallel
908  // to take advantage of instruction level parallelism
909  // and hardware with multiple FMA units.
910  const Packet x2 = pmul(x, x);
911  const Packet x4 = pmul(x2, x2);
912  Packet q_odd = pmadd(q18, x4, q14);
913  Packet q_even = pmadd(q16, x4, q12);
914  q_odd = pmadd(q_odd, x4, q10);
915  q_even = pmadd(q_even, x4, q8);
916  q_odd = pmadd(q_odd, x4, q6);
917  q_even = pmadd(q_even, x4, q4);
918  q_odd = pmadd(q_odd, x4, q2);
919  q_even = pmadd(q_even, x4, q0);
920  const Packet p = pmadd(q_odd, x2, q_even);
921  return pmadd(p, pmul(x, x2), x);
922 }
923 
924 template<typename Packet>
926 Packet patan_double(const Packet& x_in) {
927  typedef typename unpacket_traits<Packet>::type Scalar;
928  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
929 
930  constexpr double kPiOverTwo = static_cast<double>(EIGEN_PI / 2);
931  constexpr double kPiOverFour = static_cast<double>(EIGEN_PI / 4);
932  constexpr double kTanPiOverEight = 0.4142135623730950488016887;
933  constexpr double kTan3PiOverEight = 2.4142135623730950488016887;
934 
935  const Packet cst_signmask = pset1<Packet>(-0.0);
936  const Packet cst_one = pset1<Packet>(1.0);
937  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
938  const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
939  const Packet cst_large = pset1<Packet>(kTan3PiOverEight);
940  const Packet cst_medium = pset1<Packet>(kTanPiOverEight);
941 
942  // Use the same range reduction strategy (to [0:tan(pi/8)]) as the
943  // Cephes library:
944  // "Large": For x >= tan(3*pi/8), use atan(1/x) = pi/2 - atan(x).
945  // "Medium": For x in [tan(pi/8) : tan(3*pi/8)),
946  // use atan(x) = pi/4 + atan((x-1)/(x+1)).
947  // "Small": For x < tan(pi/8), approximate atan(x) directly by a polynomial
948  // calculated using Sollya.
949 
950  const Packet abs_x = pabs(x_in);
951  const Packet x_signmask = pand(x_in, cst_signmask);
952  const Packet large_mask = pcmp_lt(cst_large, abs_x);
953  const Packet medium_mask = pandnot(pcmp_lt(cst_medium, abs_x), large_mask);
954 
955  Packet x = abs_x;
956  x = pselect(large_mask, preciprocal(abs_x), x);
957  x = pselect(medium_mask, pdiv(psub(abs_x, cst_one), padd(abs_x, cst_one)), x);
958 
959  // Compute approximation of p ~= atan(x') where x' is the argument reduced to
960  // [0:tan(pi/8)].
961  Packet p = patan_reduced_double(x);
962 
963  // Apply transformations according to the range reduction masks.
964  p = pselect(large_mask, psub(cst_pi_over_two, p), p);
965  p = pselect(medium_mask, padd(cst_pi_over_four, p), p);
966  // Return the correct sign
967  return pxor(p, x_signmask);
968 }
969 
970 template<typename Packet>
972 Packet patanh_float(const Packet& x) {
973  typedef typename unpacket_traits<Packet>::type Scalar;
974  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
975  const Packet half = pset1<Packet>(0.5f);
976  const Packet x_gt_half = pcmp_le(half, pabs(x));
977  // For |x| in [0:0.5] we use a polynomial approximation of the form
978  // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
979  const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
980  const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
981  const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
982  const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
983  const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
984  const Packet x2 = pmul(x,x);
985  Packet p = pmadd(C11, x2, C9);
986  p = pmadd(x2, p, C7);
987  p = pmadd(x2, p, C5);
988  p = pmadd(x2, p, C3);
989  p = pmadd(pmul(x,x2), p, x);
990 
991  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
992  const Packet one = pset1<Packet>(1.0f);
993  Packet r = pdiv(padd(one, x), psub(one, x));
994  r = pmul(half, plog(r));
995  return pselect(x_gt_half, r, p);
996 }
997 
998 template<typename Packet>
1000 Packet pdiv_complex(const Packet& x, const Packet& y) {
1001  typedef typename unpacket_traits<Packet>::as_real RealPacket;
1002  // In the following we annotate the code for the case where the inputs
1003  // are a pair length-2 SIMD vectors representing a single pair of complex
1004  // numbers x = a + i*b, y = c + i*d.
1005  const RealPacket y_abs = pabs(y.v); // |c|, |d|
1006  const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
1007  const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
1008  const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
1009  // Compute scaled denominator.
1010  const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
1011  const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
1012  Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d + b * c
1013  // Divide elementwise by denom.
1014  result_scaled = Packet(pdiv(result_scaled.v, denom));
1015  // Rescale result
1016  return Packet(pdiv(result_scaled.v, y_max));
1017 }
1018 
1019 template<typename Packet>
1021 Packet psqrt_complex(const Packet& a) {
1022  typedef typename unpacket_traits<Packet>::type Scalar;
1023  typedef typename Scalar::value_type RealScalar;
1024  typedef typename unpacket_traits<Packet>::as_real RealPacket;
1025 
1026  // Computes the principal sqrt of the complex numbers in the input.
1027  //
1028  // For example, for packets containing 2 complex numbers stored in interleaved format
1029  // a = [a0, a1] = [x0, y0, x1, y1],
1030  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
1031  // b = [b0, b1] = [u0, v0, u1, v1],
1032  // such that b0^2 = a0, b1^2 = a1.
1033  //
1034  // To derive the formula for the complex square roots, let's consider the equation for
1035  // a single complex square root of the number x + i*y. We want to find real numbers
1036  // u and v such that
1037  // (u + i*v)^2 = x + i*y <=>
1038  // u^2 - v^2 + i*2*u*v = x + i*v.
1039  // By equating the real and imaginary parts we get:
1040  // u^2 - v^2 = x
1041  // 2*u*v = y.
1042  //
1043  // For x >= 0, this has the numerically stable solution
1044  // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
1045  // v = 0.5 * (y / u)
1046  // and for x < 0,
1047  // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
1048  // u = 0.5 * (y / v)
1049  //
1050  // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
1051  // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
1052 
1053  // In the following, without lack of generality, we have annotated the code, assuming
1054  // that the input is a packet of 2 complex numbers.
1055  //
1056  // Step 1. Compute l = [l0, l0, l1, l1], where
1057  // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2)
1058  // To avoid over- and underflow, we use the stable formula for each hypotenuse
1059  // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
1060  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
1061 
1062  RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
1063  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
1064  RealPacket a_max = pmax(a_abs, a_abs_flip);
1065  RealPacket a_min = pmin(a_abs, a_abs_flip);
1066  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
1067  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
1068  RealPacket r = pdiv(a_min, a_max);
1069  const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
1070  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
1071  // Set l to a_max if a_min is zero.
1072  l = pselect(a_min_zero_mask, a_max, l);
1073 
1074  // Step 2. Compute [rho0, *, rho1, *], where
1075  // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|))
1076  // We don't care about the imaginary parts computed here. They will be overwritten later.
1077  const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
1078  Packet rho;
1079  rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
1080 
1081  // Step 3. Compute [rho0, eta0, rho1, eta1], where
1082  // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
1083  // set eta = 0 of input is 0 + i0.
1084  RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
1085  RealPacket real_mask = peven_mask(a.v);
1086  Packet positive_real_result;
1087  // Compute result for inputs with positive real part.
1088  positive_real_result.v = pselect(real_mask, rho.v, eta);
1089 
1090  // Step 4. Compute solution for inputs with negative real part:
1091  // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
1092  const RealPacket cst_imag_sign_mask =
1093  pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
1094  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
1095  Packet negative_real_result;
1096  // Notice that rho is positive, so taking it's absolute value is a noop.
1097  negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
1098 
1099  // Step 5. Select solution branch based on the sign of the real parts.
1100  Packet negative_real_mask;
1101  negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
1102  negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
1103  Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
1104 
1105  // Step 6. Handle special cases for infinities:
1106  // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
1107  // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
1108  // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
1109  // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
1110  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
1111  Packet is_inf;
1112  is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
1113  Packet is_real_inf;
1114  is_real_inf.v = pand(is_inf.v, real_mask);
1115  is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
1116  // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
1117  Packet real_inf_result;
1118  real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
1119  real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
1120  // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
1121  Packet is_imag_inf;
1122  is_imag_inf.v = pandnot(is_inf.v, real_mask);
1123  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
1124  Packet imag_inf_result;
1125  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
1126  // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
1127  Packet result_is_nan = pisnan(result);
1128  result = por(result_is_nan, result);
1129 
1130  return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
1131 }
1132 
1133 
1134 template <typename Packet>
1135 struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1136  !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1137  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1138  using Scalar = typename unpacket_traits<Packet>::type;
1139  const Packet cst_one = pset1<Packet>(Scalar(1));
1140  const Packet cst_zero = pzero(a);
1141 
1142  const Packet abs_a = pabs(a);
1143  const Packet sign_mask = pandnot(a, abs_a);
1144  const Packet nonzero_mask = pcmp_lt(cst_zero, abs_a);
1145 
1146  return pselect(nonzero_mask, por(sign_mask, cst_one), abs_a);
1147  }
1148 };
1149 
1150 template <typename Packet>
1151 struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1152  NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
1153  NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1154  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1155  using Scalar = typename unpacket_traits<Packet>::type;
1156  const Packet cst_one = pset1<Packet>(Scalar(1));
1157  const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
1158  const Packet cst_zero = pzero(a);
1159 
1160  const Packet positive_mask = pcmp_lt(cst_zero, a);
1161  const Packet positive = pand(positive_mask, cst_one);
1162  const Packet negative_mask = pcmp_lt(a, cst_zero);
1163  const Packet negative = pand(negative_mask, cst_minus_one);
1164 
1165  return por(positive, negative);
1166  }
1167 };
1168 
1169 template <typename Packet>
1170 struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1171  !NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
1172  NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1173  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1174  using Scalar = typename unpacket_traits<Packet>::type;
1175  const Packet cst_one = pset1<Packet>(Scalar(1));
1176  const Packet cst_zero = pzero(a);
1177 
1178  const Packet zero_mask = pcmp_eq(cst_zero, a);
1179  return pandnot(cst_one, zero_mask);
1180  }
1181 };
1182 
1183 // \internal \returns the the sign of a complex number z, defined as z / abs(z).
1184 template <typename Packet>
1185 struct psign_impl<Packet, std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1186  unpacket_traits<Packet>::vectorizable>> {
1187  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1188  typedef typename unpacket_traits<Packet>::type Scalar;
1189  typedef typename Scalar::value_type RealScalar;
1190  typedef typename unpacket_traits<Packet>::as_real RealPacket;
1191 
1192  // Step 1. Compute (for each element z = x + i*y in a)
1193  // l = abs(z) = sqrt(x^2 + y^2).
1194  // To avoid over- and underflow, we use the stable formula for each hypotenuse
1195  // l = (zmin == 0 ? zmax : zmax * sqrt(1 + (zmin/zmax)**2)),
1196  // where zmax = max(|x|, |y|), zmin = min(|x|, |y|),
1197  RealPacket a_abs = pabs(a.v);
1198  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;
1199  RealPacket a_max = pmax(a_abs, a_abs_flip);
1200  RealPacket a_min = pmin(a_abs, a_abs_flip);
1201  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
1202  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
1203  RealPacket r = pdiv(a_min, a_max);
1204  const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
1205  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
1206  // Set l to a_max if a_min is zero, since the roundtrip sqrt(a_max^2) may be
1207  // lossy.
1208  l = pselect(a_min_zero_mask, a_max, l);
1209  // Step 2 compute a / abs(a).
1210  RealPacket sign_as_real = pandnot(pdiv(a.v, l), a_max_zero_mask);
1211  Packet sign;
1212  sign.v = sign_as_real;
1213  return sign;
1214  }
1215 };
1216 
1217 // TODO(rmlarsen): The following set of utilities for double word arithmetic
1218 // should perhaps be refactored as a separate file, since it would be generally
1219 // useful for special function implementation etc. Writing the algorithms in
1220 // terms if a double word type would also make the code more readable.
1221 
1222 // This function splits x into the nearest integer n and fractional part r,
1223 // such that x = n + r holds exactly.
1224 template<typename Packet>
1225 EIGEN_STRONG_INLINE
1226 void absolute_split(const Packet& x, Packet& n, Packet& r) {
1227  n = pround(x);
1228  r = psub(x, n);
1229 }
1230 
1231 // This function computes the sum {s, r}, such that x + y = s_hi + s_lo
1232 // holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
1233 template<typename Packet>
1234 EIGEN_STRONG_INLINE
1235 void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
1236  s_hi = padd(x, y);
1237  const Packet t = psub(s_hi, x);
1238  s_lo = psub(y, t);
1239 }
1240 
1241 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1242 // This function implements the extended precision product of
1243 // a pair of floating point numbers. Given {x, y}, it computes the pair
1244 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
1245 // p_hi = fl(x * y).
1246 template<typename Packet>
1247 EIGEN_STRONG_INLINE
1248 void twoprod(const Packet& x, const Packet& y,
1249  Packet& p_hi, Packet& p_lo) {
1250  p_hi = pmul(x, y);
1251  p_lo = pmsub(x, y, p_hi);
1252 }
1253 
1254 #else
1255 
1256 // This function implements the Veltkamp splitting. Given a floating point
1257 // number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
1258 // exactly and that half of the significant of x fits in x_hi.
1259 // This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
1260 // 3rd edition, Birkh\"auser, 2016.
1261 template<typename Packet>
1262 EIGEN_STRONG_INLINE
1263 void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
1264  typedef typename unpacket_traits<Packet>::type Scalar;
1265  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
1266  const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr.
1267  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
1268  Packet rho = psub(x, gamma);
1269  x_hi = padd(rho, gamma);
1270  x_lo = psub(x, x_hi);
1271 }
1272 
1273 // This function implements Dekker's algorithm for products x * y.
1274 // Given floating point numbers {x, y} computes the pair
1275 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
1276 // p_hi = fl(x * y).
1277 template<typename Packet>
1278 EIGEN_STRONG_INLINE
1279 void twoprod(const Packet& x, const Packet& y,
1280  Packet& p_hi, Packet& p_lo) {
1281  Packet x_hi, x_lo, y_hi, y_lo;
1282  veltkamp_splitting(x, x_hi, x_lo);
1283  veltkamp_splitting(y, y_hi, y_lo);
1284 
1285  p_hi = pmul(x, y);
1286  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
1287  p_lo = pmadd(x_hi, y_lo, p_lo);
1288  p_lo = pmadd(x_lo, y_hi, p_lo);
1289  p_lo = pmadd(x_lo, y_lo, p_lo);
1290 }
1291 
1292 #endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1293 
1294 
1295 // This function implements Dekker's algorithm for the addition
1296 // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
1297 // It returns the result as a pair {s_hi, s_lo} such that
1298 // x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
1299 // This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
1300 // 3rd edition, Birkh\"auser, 2016.
1301 template<typename Packet>
1302 EIGEN_STRONG_INLINE
1303  void twosum(const Packet& x_hi, const Packet& x_lo,
1304  const Packet& y_hi, const Packet& y_lo,
1305  Packet& s_hi, Packet& s_lo) {
1306  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
1307  Packet r_hi_1, r_lo_1;
1308  fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
1309  Packet r_hi_2, r_lo_2;
1310  fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
1311  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
1312 
1313  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
1314  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
1315  const Packet s = pselect(x_greater_mask, s1, s2);
1316 
1317  fast_twosum(r_hi, s, s_hi, s_lo);
1318 }
1319 
1320 // This is a version of twosum for double word numbers,
1321 // which assumes that |x_hi| >= |y_hi|.
1322 template<typename Packet>
1323 EIGEN_STRONG_INLINE
1324  void fast_twosum(const Packet& x_hi, const Packet& x_lo,
1325  const Packet& y_hi, const Packet& y_lo,
1326  Packet& s_hi, Packet& s_lo) {
1327  Packet r_hi, r_lo;
1328  fast_twosum(x_hi, y_hi, r_hi, r_lo);
1329  const Packet s = padd(padd(y_lo, r_lo), x_lo);
1330  fast_twosum(r_hi, s, s_hi, s_lo);
1331 }
1332 
1333 // This is a version of twosum for adding a floating point number x to
1334 // double word number {y_hi, y_lo} number, with the assumption
1335 // that |x| >= |y_hi|.
1336 template<typename Packet>
1337 EIGEN_STRONG_INLINE
1338 void fast_twosum(const Packet& x,
1339  const Packet& y_hi, const Packet& y_lo,
1340  Packet& s_hi, Packet& s_lo) {
1341  Packet r_hi, r_lo;
1342  fast_twosum(x, y_hi, r_hi, r_lo);
1343  const Packet s = padd(y_lo, r_lo);
1344  fast_twosum(r_hi, s, s_hi, s_lo);
1345 }
1346 
1347 // This function implements the multiplication of a double word
1348 // number represented by {x_hi, x_lo} by a floating point number y.
1349 // It returns the result as a pair {p_hi, p_lo} such that
1350 // (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error
1351 // of less than 2*2^{-2p}, where p is the number of significand bit
1352 // in the floating point type.
1353 // This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
1354 // 3rd edition, Birkh\"auser, 2016.
1355 template<typename Packet>
1356 EIGEN_STRONG_INLINE
1357 void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1358  Packet& p_hi, Packet& p_lo) {
1359  Packet c_hi, c_lo1;
1360  twoprod(x_hi, y, c_hi, c_lo1);
1361  const Packet c_lo2 = pmul(x_lo, y);
1362  Packet t_hi, t_lo1;
1363  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
1364  const Packet t_lo2 = padd(t_lo1, c_lo1);
1365  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
1366 }
1367 
1368 // This function implements the multiplication of two double word
1369 // numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
1370 // It returns the result as a pair {p_hi, p_lo} such that
1371 // (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
1372 // of less than 2*2^{-2p}, where p is the number of significand bit
1373 // in the floating point type.
1374 template<typename Packet>
1375 EIGEN_STRONG_INLINE
1376 void twoprod(const Packet& x_hi, const Packet& x_lo,
1377  const Packet& y_hi, const Packet& y_lo,
1378  Packet& p_hi, Packet& p_lo) {
1379  Packet p_hi_hi, p_hi_lo;
1380  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
1381  Packet p_lo_hi, p_lo_lo;
1382  twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
1383  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
1384 }
1385 
1386 // This function implements the division of double word {x_hi, x_lo}
1387 // by float y. This is Algorithm 15 from "Tight and rigourous error bounds
1388 // for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
1389 // 2017. https://hal.archives-ouvertes.fr/hal-01351529
1390 template <typename Packet>
1391 void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1392  Packet& z_hi, Packet& z_lo) {
1393  const Packet t_hi = pdiv(x_hi, y);
1394  Packet pi_hi, pi_lo;
1395  twoprod(t_hi, y, pi_hi, pi_lo);
1396  const Packet delta_hi = psub(x_hi, pi_hi);
1397  const Packet delta_t = psub(delta_hi, pi_lo);
1398  const Packet delta = padd(delta_t, x_lo);
1399  const Packet t_lo = pdiv(delta, y);
1400  fast_twosum(t_hi, t_lo, z_hi, z_lo);
1401 }
1402 
1403 // This function computes log2(x) and returns the result as a double word.
1404 template <typename Scalar>
1405 struct accurate_log2 {
1406  template <typename Packet>
1407  EIGEN_STRONG_INLINE
1408  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
1409  log2_x_hi = plog2(x);
1410  log2_x_lo = pzero(x);
1411  }
1412 };
1413 
1414 // This specialization uses a more accurate algorithm to compute log2(x) for
1415 // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
1416 // This additional accuracy is needed to counter the error-magnification
1417 // inherent in multiplying by a potentially large exponent in pow(x,y).
1418 // The minimax polynomial used was calculated using the Sollya tool.
1419 // See sollya.org.
1420 template <>
1421 struct accurate_log2<float> {
1422  template <typename Packet>
1423  EIGEN_STRONG_INLINE
1424  void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
1425  // The function log(1+x)/x is approximated in the interval
1426  // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
1427  // Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
1428  // where the degree 6 polynomial P(x) is evaluated in single precision,
1429  // while the remaining 4 terms of Q(x), as well as the final multiplication by x
1430  // to reconstruct log(1+x) are evaluated in extra precision using
1431  // double word arithmetic. C0 through C3 are extra precise constants
1432  // stored as double words.
1433  //
1434  // The polynomial coefficients were calculated using Sollya commands:
1435  // > n = 10;
1436  // > f = log2(1+x)/x;
1437  // > interval = [sqrt(0.5)-1;sqrt(2)-1];
1438  // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
1439 
1440  const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
1441  const Packet p5 = pset1<Packet>(-0.1690667718648f);
1442  const Packet p4 = pset1<Packet>( 0.1720575392246f);
1443  const Packet p3 = pset1<Packet>(-0.1789081543684f);
1444  const Packet p2 = pset1<Packet>( 0.2050433009862f);
1445  const Packet p1 = pset1<Packet>(-0.2404672354459f);
1446  const Packet p0 = pset1<Packet>( 0.2885761857032f);
1447 
1448  const Packet C3_hi = pset1<Packet>(-0.360674142838f);
1449  const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
1450  const Packet C2_hi = pset1<Packet>(0.480897903442f);
1451  const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
1452  const Packet C1_hi = pset1<Packet>(-0.721347510815f);
1453  const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
1454  const Packet C0_hi = pset1<Packet>(1.44269502163f);
1455  const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
1456  const Packet one = pset1<Packet>(1.0f);
1457 
1458  const Packet x = psub(z, one);
1459  // Evaluate P(x) in working precision.
1460  // We evaluate it in multiple parts to improve instruction level
1461  // parallelism.
1462  Packet x2 = pmul(x,x);
1463  Packet p_even = pmadd(p6, x2, p4);
1464  p_even = pmadd(p_even, x2, p2);
1465  p_even = pmadd(p_even, x2, p0);
1466  Packet p_odd = pmadd(p5, x2, p3);
1467  p_odd = pmadd(p_odd, x2, p1);
1468  Packet p = pmadd(p_odd, x, p_even);
1469 
1470  // Now evaluate the low-order tems of Q(x) in double word precision.
1471  // In the following, due to the alternating signs and the fact that
1472  // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
1473  // fast_twosum instead of the slower twosum.
1474  Packet q_hi, q_lo;
1475  Packet t_hi, t_lo;
1476  // C3 + x * p(x)
1477  twoprod(p, x, t_hi, t_lo);
1478  fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
1479  // C2 + x * p(x)
1480  twoprod(q_hi, q_lo, x, t_hi, t_lo);
1481  fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
1482  // C1 + x * p(x)
1483  twoprod(q_hi, q_lo, x, t_hi, t_lo);
1484  fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
1485  // C0 + x * p(x)
1486  twoprod(q_hi, q_lo, x, t_hi, t_lo);
1487  fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
1488 
1489  // log(z) ~= x * Q(x)
1490  twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
1491  }
1492 };
1493 
1494 // This specialization uses a more accurate algorithm to compute log2(x) for
1495 // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
1496 // This additional accuracy is needed to counter the error-magnification
1497 // inherent in multiplying by a potentially large exponent in pow(x,y).
1498 // The minimax polynomial used was calculated using the Sollya tool.
1499 // See sollya.org.
1500 
1501 template <>
1502 struct accurate_log2<double> {
1503  template <typename Packet>
1504  EIGEN_STRONG_INLINE
1505  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
1506  // We use a transformation of variables:
1507  // r = c * (x-1) / (x+1),
1508  // such that
1509  // log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
1510  // The function f(r) can be approximated well using an odd polynomial
1511  // of the form
1512  // P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
1513  // For the implementation of log2<double> here, Q is of degree 6 with
1514  // coefficient represented in working precision (double), while C is a
1515  // constant represented in extra precision as a double word to achieve
1516  // full accuracy.
1517  //
1518  // The polynomial coefficients were computed by the Sollya script:
1519  //
1520  // c = 2 / log(2);
1521  // trans = c * (x-1)/(x+1);
1522  // itrans = (1+x/c)/(1-x/c);
1523  // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
1524  // print(interval);
1525  // f = log2(itrans(x));
1526  // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
1527  const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
1528  const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
1529  const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
1530  const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
1531  const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
1532  const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
1533  const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
1534  const Packet C_hi = pset1<Packet>(0.0400377511598501157);
1535  const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
1536  const Packet one = pset1<Packet>(1.0);
1537 
1538  const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
1539  const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
1540  // c * (x - 1)
1541  Packet t_hi, t_lo;
1542  // t = c * (x-1)
1543  twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
1544  // r = c * (x-1) / (x+1),
1545  Packet r_hi, r_lo;
1546  doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
1547 
1548  // r2 = r * r
1549  Packet r2_hi, r2_lo;
1550  twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
1551  // r4 = r2 * r2
1552  Packet r4_hi, r4_lo;
1553  twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
1554 
1555  // Evaluate Q(r^2) in working precision. We evaluate it in two parts
1556  // (even and odd in r^2) to improve instruction level parallelism.
1557  Packet q_even = pmadd(q12, r4_hi, q8);
1558  Packet q_odd = pmadd(q10, r4_hi, q6);
1559  q_even = pmadd(q_even, r4_hi, q4);
1560  q_odd = pmadd(q_odd, r4_hi, q2);
1561  q_even = pmadd(q_even, r4_hi, q0);
1562  Packet q = pmadd(q_odd, r2_hi, q_even);
1563 
1564  // Now evaluate the low order terms of P(x) in double word precision.
1565  // In the following, due to the increasing magnitude of the coefficients
1566  // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
1567  // of the slower twosum.
1568  // Q(r^2) * r^2
1569  Packet p_hi, p_lo;
1570  twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
1571  // Q(r^2) * r^2 + C
1572  Packet p1_hi, p1_lo;
1573  fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
1574  // (Q(r^2) * r^2 + C) * r^2
1575  Packet p2_hi, p2_lo;
1576  twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
1577  // ((Q(r^2) * r^2 + C) * r^2 + 1)
1578  Packet p3_hi, p3_lo;
1579  fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
1580 
1581  // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
1582  twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
1583  }
1584 };
1585 
1586 // This function computes exp2(x) (i.e. 2**x).
1587 template <typename Scalar>
1588 struct fast_accurate_exp2 {
1589  template <typename Packet>
1590  EIGEN_STRONG_INLINE
1591  Packet operator()(const Packet& x) {
1592  // TODO(rmlarsen): Add a pexp2 packetop.
1593  return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
1594  }
1595 };
1596 
1597 // This specialization uses a faster algorithm to compute exp2(x) for floats
1598 // in [-0.5;0.5] with a relative accuracy of 1 ulp.
1599 // The minimax polynomial used was calculated using the Sollya tool.
1600 // See sollya.org.
1601 template <>
1602 struct fast_accurate_exp2<float> {
1603  template <typename Packet>
1604  EIGEN_STRONG_INLINE
1605  Packet operator()(const Packet& x) {
1606  // This function approximates exp2(x) by a degree 6 polynomial of the form
1607  // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
1608  // single precision, and the remaining steps are evaluated with extra precision using
1609  // double word arithmetic. C is an extra precise constant stored as a double word.
1610  //
1611  // The polynomial coefficients were calculated using Sollya commands:
1612  // > n = 6;
1613  // > f = 2^x;
1614  // > interval = [-0.5;0.5];
1615  // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
1616 
1617  const Packet p4 = pset1<Packet>(1.539513905e-4f);
1618  const Packet p3 = pset1<Packet>(1.340007293e-3f);
1619  const Packet p2 = pset1<Packet>(9.618283249e-3f);
1620  const Packet p1 = pset1<Packet>(5.550328270e-2f);
1621  const Packet p0 = pset1<Packet>(0.2402264923f);
1622 
1623  const Packet C_hi = pset1<Packet>(0.6931471825f);
1624  const Packet C_lo = pset1<Packet>(2.36836577e-08f);
1625  const Packet one = pset1<Packet>(1.0f);
1626 
1627  // Evaluate P(x) in working precision.
1628  // We evaluate even and odd parts of the polynomial separately
1629  // to gain some instruction level parallelism.
1630  Packet x2 = pmul(x,x);
1631  Packet p_even = pmadd(p4, x2, p2);
1632  Packet p_odd = pmadd(p3, x2, p1);
1633  p_even = pmadd(p_even, x2, p0);
1634  Packet p = pmadd(p_odd, x, p_even);
1635 
1636  // Evaluate the remaining terms of Q(x) with extra precision using
1637  // double word arithmetic.
1638  Packet p_hi, p_lo;
1639  // x * p(x)
1640  twoprod(p, x, p_hi, p_lo);
1641  // C + x * p(x)
1642  Packet q1_hi, q1_lo;
1643  twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
1644  // x * (C + x * p(x))
1645  Packet q2_hi, q2_lo;
1646  twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
1647  // 1 + x * (C + x * p(x))
1648  Packet q3_hi, q3_lo;
1649  // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
1650  // for adding it to unity here.
1651  fast_twosum(one, q2_hi, q3_hi, q3_lo);
1652  return padd(q3_hi, padd(q2_lo, q3_lo));
1653  }
1654 };
1655 
1656 // in [-0.5;0.5] with a relative accuracy of 1 ulp.
1657 // The minimax polynomial used was calculated using the Sollya tool.
1658 // See sollya.org.
1659 template <>
1660 struct fast_accurate_exp2<double> {
1661  template <typename Packet>
1662  EIGEN_STRONG_INLINE
1663  Packet operator()(const Packet& x) {
1664  // This function approximates exp2(x) by a degree 10 polynomial of the form
1665  // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
1666  // single precision, and the remaining steps are evaluated with extra precision using
1667  // double word arithmetic. C is an extra precise constant stored as a double word.
1668  //
1669  // The polynomial coefficients were calculated using Sollya commands:
1670  // > n = 11;
1671  // > f = 2^x;
1672  // > interval = [-0.5;0.5];
1673  // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating);
1674 
1675  const Packet p9 = pset1<Packet>(4.431642109085495276e-10);
1676  const Packet p8 = pset1<Packet>(7.073829923303358410e-9);
1677  const Packet p7 = pset1<Packet>(1.017822306737031311e-7);
1678  const Packet p6 = pset1<Packet>(1.321543498017646657e-6);
1679  const Packet p5 = pset1<Packet>(1.525273342728892877e-5);
1680  const Packet p4 = pset1<Packet>(1.540353045780084423e-4);
1681  const Packet p3 = pset1<Packet>(1.333355814685869807e-3);
1682  const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
1683  const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
1684  const Packet p0 = pset1<Packet>(0.240226506959101332);
1685  const Packet C_hi = pset1<Packet>(0.693147180559945286);
1686  const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
1687  const Packet one = pset1<Packet>(1.0);
1688 
1689  // Evaluate P(x) in working precision.
1690  // We evaluate even and odd parts of the polynomial separately
1691  // to gain some instruction level parallelism.
1692  Packet x2 = pmul(x,x);
1693  Packet p_even = pmadd(p8, x2, p6);
1694  Packet p_odd = pmadd(p9, x2, p7);
1695  p_even = pmadd(p_even, x2, p4);
1696  p_odd = pmadd(p_odd, x2, p5);
1697  p_even = pmadd(p_even, x2, p2);
1698  p_odd = pmadd(p_odd, x2, p3);
1699  p_even = pmadd(p_even, x2, p0);
1700  p_odd = pmadd(p_odd, x2, p1);
1701  Packet p = pmadd(p_odd, x, p_even);
1702 
1703  // Evaluate the remaining terms of Q(x) with extra precision using
1704  // double word arithmetic.
1705  Packet p_hi, p_lo;
1706  // x * p(x)
1707  twoprod(p, x, p_hi, p_lo);
1708  // C + x * p(x)
1709  Packet q1_hi, q1_lo;
1710  twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
1711  // x * (C + x * p(x))
1712  Packet q2_hi, q2_lo;
1713  twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
1714  // 1 + x * (C + x * p(x))
1715  Packet q3_hi, q3_lo;
1716  // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
1717  // for adding it to unity here.
1718  fast_twosum(one, q2_hi, q3_hi, q3_lo);
1719  return padd(q3_hi, padd(q2_lo, q3_lo));
1720  }
1721 };
1722 
1723 // This function implements the non-trivial case of pow(x,y) where x is
1724 // positive and y is (possibly) non-integer.
1725 // Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
1726 // TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
1727 // easier to specialize or turn off for specific types and/or backends.x
1728 template <typename Packet>
1729 EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
1730  typedef typename unpacket_traits<Packet>::type Scalar;
1731  // Split x into exponent e_x and mantissa m_x.
1732  Packet e_x;
1733  Packet m_x = pfrexp(x, e_x);
1734 
1735  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
1736  EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
1737  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
1738  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
1739  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
1740 
1741  // Compute log2(m_x) with 6 extra bits of accuracy.
1742  Packet rx_hi, rx_lo;
1743  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
1744 
1745  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
1746  // precision using double word arithmetic.
1747  Packet f1_hi, f1_lo, f2_hi, f2_lo;
1748  twoprod(e_x, y, f1_hi, f1_lo);
1749  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
1750  // Sum the two terms in f using double word arithmetic. We know
1751  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
1752  // This means that we can use fast_twosum(f1,f2).
1753  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
1754  // accuracy by violating the assumption of fast_twosum, because
1755  // it's a no-op.
1756  Packet f_hi, f_lo;
1757  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
1758 
1759  // Split f into integer and fractional parts.
1760  Packet n_z, r_z;
1761  absolute_split(f_hi, n_z, r_z);
1762  r_z = padd(r_z, f_lo);
1763  Packet n_r;
1764  absolute_split(r_z, n_r, r_z);
1765  n_z = padd(n_z, n_r);
1766 
1767  // We now have an accurate split of f = n_z + r_z and can compute
1768  // x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
1769  // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy
1770  // using a specialized algorithm. Multiplication by the second factor can
1771  // be done exactly using pldexp(), since it is an integer power of 2.
1772  const Packet e_r = fast_accurate_exp2<Scalar>()(r_z);
1773  return pldexp(e_r, n_z);
1774 }
1775 
1776 // Generic implementation of pow(x,y).
1777 template <typename Packet>
1779  typedef typename unpacket_traits<Packet>::type Scalar;
1780 
1781  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
1782  const Packet cst_neg_inf = pset1<Packet>(-NumTraits<Scalar>::infinity());
1783  const Packet cst_zero = pset1<Packet>(Scalar(0));
1784  const Packet cst_one = pset1<Packet>(Scalar(1));
1785  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
1786 
1787  const Packet abs_x = pabs(x);
1788  // Predicates for sign and magnitude of x.
1789  const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero);
1790  const Packet x_has_signbit = psignbit(x);
1791  const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
1792  const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
1793  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
1794  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
1795  const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
1796  const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
1797  const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg);
1798  const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg);
1799  const Packet x_is_nan = pisnan(x);
1800 
1801  // Predicates for sign and magnitude of y.
1802  const Packet abs_y = pabs(y);
1803  const Packet y_is_one = pcmp_eq(y, cst_one);
1804  const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero);
1805  const Packet y_is_neg = pcmp_lt(y, cst_zero);
1806  const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg));
1807  const Packet y_is_nan = pisnan(y);
1808  const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf);
1809  EIGEN_CONSTEXPR Scalar huge_exponent =
1811  const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
1812 
1813  // Predicates for whether y is integer and/or even.
1814  const Packet y_is_int = pcmp_eq(pfloor(y), y);
1815  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
1816  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
1817 
1818  // Predicates encoding special cases for the value of pow(x,y)
1819  const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
1820  const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
1821  const Packet pow_is_one =
1822  por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
1823  const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
1824  pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)),
1825  pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg));
1826  const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
1827  pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
1828  pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
1829  const Packet pow_is_neg_zero = pand(pandnot(y_is_int, y_is_even),
1830  por(pand(y_is_neg, pand(abs_x_is_inf, x_is_neg)), pand(y_is_pos, x_is_neg_zero)));
1831  const Packet inf_val =
1832  pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even),
1833  cst_neg_inf, cst_pos_inf);
1834  // General computation of pow(x,y) for positive x or negative x and integer y.
1835  const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
1836  const Packet pow_abs = generic_pow_impl(abs_x, y);
1837  return pselect(y_is_one, x,
1838  pselect(pow_is_one, cst_one,
1839  pselect(pow_is_nan, cst_nan,
1840  pselect(pow_is_inf, inf_val,
1841  pselect(pow_is_neg_zero, pnegate(cst_zero),
1842  pselect(pow_is_zero, cst_zero,
1843  pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
1844 }
1845 
1846 /* polevl (modified for Eigen)
1847  *
1848  * Evaluate polynomial
1849  *
1850  *
1851  *
1852  * SYNOPSIS:
1853  *
1854  * int N;
1855  * Scalar x, y, coef[N+1];
1856  *
1857  * y = polevl<decltype(x), N>( x, coef);
1858  *
1859  *
1860  *
1861  * DESCRIPTION:
1862  *
1863  * Evaluates polynomial of degree N:
1864  *
1865  * 2 N
1866  * y = C + C x + C x +...+ C x
1867  * 0 1 2 N
1868  *
1869  * Coefficients are stored in reverse order:
1870  *
1871  * coef[0] = C , ..., coef[N] = C .
1872  * N 0
1873  *
1874  * The function p1evl() assumes that coef[N] = 1.0 and is
1875  * omitted from the array. Its calling arguments are
1876  * otherwise the same as polevl().
1877  *
1878  *
1879  * The Eigen implementation is templatized. For best speed, store
1880  * coef as a const array (constexpr), e.g.
1881  *
1882  * const double coef[] = {1.0, 2.0, 3.0, ...};
1883  *
1884  */
1885 template <typename Packet, int N>
1886 struct ppolevl {
1887  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
1888  EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
1889  return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
1890  }
1891 };
1892 
1893 template <typename Packet>
1894 struct ppolevl<Packet, 0> {
1895  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
1897  return pset1<Packet>(coeff[0]);
1898  }
1899 };
1900 
1901 /* chbevl (modified for Eigen)
1902  *
1903  * Evaluate Chebyshev series
1904  *
1905  *
1906  *
1907  * SYNOPSIS:
1908  *
1909  * int N;
1910  * Scalar x, y, coef[N], chebevl();
1911  *
1912  * y = chbevl( x, coef, N );
1913  *
1914  *
1915  *
1916  * DESCRIPTION:
1917  *
1918  * Evaluates the series
1919  *
1920  * N-1
1921  * - '
1922  * y = > coef[i] T (x/2)
1923  * - i
1924  * i=0
1925  *
1926  * of Chebyshev polynomials Ti at argument x/2.
1927  *
1928  * Coefficients are stored in reverse order, i.e. the zero
1929  * order term is last in the array. Note N is the number of
1930  * coefficients, not the order.
1931  *
1932  * If coefficients are for the interval a to b, x must
1933  * have been transformed to x -> 2(2x - b - a)/(b-a) before
1934  * entering the routine. This maps x from (a, b) to (-1, 1),
1935  * over which the Chebyshev polynomials are defined.
1936  *
1937  * If the coefficients are for the inverted interval, in
1938  * which (a, b) is mapped to (1/b, 1/a), the transformation
1939  * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
1940  * this becomes x -> 4a/x - 1.
1941  *
1942  *
1943  *
1944  * SPEED:
1945  *
1946  * Taking advantage of the recurrence properties of the
1947  * Chebyshev polynomials, the routine requires one more
1948  * addition per loop than evaluating a nested polynomial of
1949  * the same degree.
1950  *
1951  */
1952 
1953 template <typename Packet, int N>
1954 struct pchebevl {
1956  static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
1957  typedef typename unpacket_traits<Packet>::type Scalar;
1958  Packet b0 = pset1<Packet>(coef[0]);
1959  Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
1960  Packet b2;
1961 
1962  for (int i = 1; i < N; i++) {
1963  b2 = b1;
1964  b1 = b0;
1965  b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
1966  }
1967 
1968  return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
1969  }
1970 };
1971 
1972 namespace unary_pow {
1973 
1974 template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
1975 struct exponent_helper {
1976  using safe_abs_type = ScalarExponent;
1977  static constexpr ScalarExponent one_half = ScalarExponent(0.5);
1978  // these routines assume that exp is an integer stored as a floating point type
1979  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
1980  return numext::abs(exp);
1981  }
1982  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
1983  eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
1984  ScalarExponent exp_div_2 = exp * one_half;
1985  ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
1986  return exp_div_2 != floor_exp_div_2;
1987  }
1988  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
1989  ScalarExponent exp_div_2 = exp * one_half;
1990  return numext::floor(exp_div_2);
1991  }
1992 };
1993 
1994 template <typename ScalarExponent>
1995 struct exponent_helper<ScalarExponent, true> {
1996  // if `exp` is a signed integer type, cast it to its unsigned counterpart to safely store its absolute value
1997  // consider the (rare) case where `exp` is an int32_t: abs(-2147483648) != 2147483648
1998  using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
1999  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
2000  ScalarExponent mask = numext::signbit(exp);
2001  safe_abs_type result = safe_abs_type(exp ^ mask);
2002  return result + safe_abs_type(ScalarExponent(1) & mask);
2003  }
2004  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
2005  return exp % safe_abs_type(2) != safe_abs_type(0);
2006  }
2007  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
2008  return exp >> safe_abs_type(1);
2009  }
2010 };
2011 
2012 template <typename Packet, typename ScalarExponent,
2013  bool ReciprocateIfExponentIsNegative =
2014  !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
2015 struct reciprocate {
2016  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2017  using Scalar = typename unpacket_traits<Packet>::type;
2018  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2019  return exponent < 0 ? pdiv(cst_pos_one, x) : x;
2020  }
2021 };
2022 
2023 template <typename Packet, typename ScalarExponent>
2024 struct reciprocate<Packet, ScalarExponent, false> {
2025  // pdiv not defined, nor necessary for integer base types
2026  // if the exponent is unsigned, then the exponent cannot be negative
2027  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
2028 };
2029 
2030 template <typename Packet, typename ScalarExponent>
2031 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
2032  using Scalar = typename unpacket_traits<Packet>::type;
2033  using ExponentHelper = exponent_helper<ScalarExponent>;
2034  using AbsExponentType = typename ExponentHelper::safe_abs_type;
2035  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2036  if (exponent == ScalarExponent(0)) return cst_pos_one;
2037 
2038  Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);
2039  Packet y = cst_pos_one;
2040  AbsExponentType m = ExponentHelper::safe_abs(exponent);
2041 
2042  while (m > 1) {
2043  bool odd = ExponentHelper::is_odd(m);
2044  if (odd) y = pmul(y, result);
2045  result = pmul(result, result);
2046  m = ExponentHelper::floor_div_two(m);
2047  }
2048 
2049  return pmul(y, result);
2050 }
2051 
2052 template <typename Packet>
2053 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x,
2054  const typename unpacket_traits<Packet>::type& exponent) {
2055  const Packet exponent_packet = pset1<Packet>(exponent);
2056  return generic_pow_impl(x, exponent_packet);
2057 }
2058 
2059 template <typename Packet, typename ScalarExponent>
2060 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
2061  const ScalarExponent& exponent) {
2062  using Scalar = typename unpacket_traits<Packet>::type;
2063 
2064  // non-integer base and exponent case
2065 
2066  const Scalar pos_zero = Scalar(0);
2067  const Scalar all_ones = ptrue<Scalar>(Scalar());
2068  const Scalar pos_one = Scalar(1);
2069  const Scalar pos_inf = NumTraits<Scalar>::infinity();
2070 
2071  const Packet cst_pos_zero = pzero(x);
2072  const Packet cst_pos_one = pset1<Packet>(pos_one);
2073  const Packet cst_pos_inf = pset1<Packet>(pos_inf);
2074 
2075  const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
2076  const bool exponent_is_neg = exponent < ScalarExponent(0);
2077  const bool exponent_is_pos = exponent > ScalarExponent(0);
2078 
2079  const Packet exp_is_not_fin = pset1<Packet>(exponent_is_not_fin ? all_ones : pos_zero);
2080  const Packet exp_is_neg = pset1<Packet>(exponent_is_neg ? all_ones : pos_zero);
2081  const Packet exp_is_pos = pset1<Packet>(exponent_is_pos ? all_ones : pos_zero);
2082  const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
2083  const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
2084 
2085  const Packet x_is_le_zero = pcmp_le(x, cst_pos_zero);
2086  const Packet x_is_ge_zero = pcmp_le(cst_pos_zero, x);
2087  const Packet x_is_zero = pand(x_is_le_zero, x_is_ge_zero);
2088 
2089  const Packet abs_x = pabs(x);
2090  const Packet abs_x_is_le_one = pcmp_le(abs_x, cst_pos_one);
2091  const Packet abs_x_is_ge_one = pcmp_le(cst_pos_one, abs_x);
2092  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
2093  const Packet abs_x_is_one = pand(abs_x_is_le_one, abs_x_is_ge_one);
2094 
2095  Packet pow_is_inf_if_exp_is_neg = por(x_is_zero, pand(abs_x_is_le_one, exp_is_inf));
2096  Packet pow_is_inf_if_exp_is_pos = por(abs_x_is_inf, pand(abs_x_is_ge_one, exp_is_inf));
2097  Packet pow_is_one = pand(abs_x_is_one, por(exp_is_inf, x_is_ge_zero));
2098 
2099  Packet result = powx;
2100  result = por(x_is_le_zero, result);
2101  result = pselect(pow_is_inf_if_exp_is_neg, pand(cst_pos_inf, exp_is_neg), result);
2102  result = pselect(pow_is_inf_if_exp_is_pos, pand(cst_pos_inf, exp_is_pos), result);
2103  result = por(exp_is_nan, result);
2104  result = pselect(pow_is_one, cst_pos_one, result);
2105  return result;
2106 }
2107 
2108 template <typename Packet, typename ScalarExponent,
2109  std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
2110 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
2111  using Scalar = typename unpacket_traits<Packet>::type;
2112 
2113  // singed integer base, signed integer exponent case
2114 
2115  // This routine handles negative exponents.
2116  // The return value is either 0, 1, or -1.
2117 
2118  const Scalar pos_zero = Scalar(0);
2119  const Scalar all_ones = ptrue<Scalar>(Scalar());
2120  const Scalar pos_one = Scalar(1);
2121 
2122  const Packet cst_pos_one = pset1<Packet>(pos_one);
2123 
2124  const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
2125 
2126  const Packet exp_is_odd = pset1<Packet>(exponent_is_odd ? all_ones : pos_zero);
2127 
2128  const Packet abs_x = pabs(x);
2129  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
2130 
2131  Packet result = pselect(exp_is_odd, x, abs_x);
2132  result = pand(abs_x_is_one, result);
2133  return result;
2134 }
2135 
2136 template <typename Packet, typename ScalarExponent,
2137  std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
2138 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
2139  using Scalar = typename unpacket_traits<Packet>::type;
2140 
2141  // unsigned integer base, signed integer exponent case
2142 
2143  // This routine handles negative exponents.
2144  // The return value is either 0 or 1
2145 
2146  const Scalar pos_one = Scalar(1);
2147 
2148  const Packet cst_pos_one = pset1<Packet>(pos_one);
2149 
2150  const Packet x_is_one = pcmp_eq(x, cst_pos_one);
2151 
2152  return pand(x_is_one, x);
2153 }
2154 
2155 
2156 } // end namespace unary_pow
2157 
2158 template <typename Packet, typename ScalarExponent,
2159  bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
2160  bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
2161  bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
2162 struct unary_pow_impl;
2163 
2164 template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
2165 struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
2166  typedef typename unpacket_traits<Packet>::type Scalar;
2167  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2168  const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
2169  if (exponent_is_integer) {
2170  return unary_pow::int_pow(x, exponent);
2171  } else {
2172  Packet result = unary_pow::gen_pow(x, exponent);
2173  result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
2174  return result;
2175  }
2176  }
2177 };
2178 
2179 template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
2180 struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {
2181  typedef typename unpacket_traits<Packet>::type Scalar;
2182  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2183  return unary_pow::int_pow(x, exponent);
2184  }
2185 };
2186 
2187 template <typename Packet, typename ScalarExponent>
2188 struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {
2189  typedef typename unpacket_traits<Packet>::type Scalar;
2190  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2191  if (exponent < ScalarExponent(0)) {
2192  return unary_pow::handle_negative_exponent(x, exponent);
2193  } else {
2194  return unary_pow::int_pow(x, exponent);
2195  }
2196  }
2197 };
2198 
2199 template <typename Packet, typename ScalarExponent>
2200 struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
2201  typedef typename unpacket_traits<Packet>::type Scalar;
2202  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2203  return unary_pow::int_pow(x, exponent);
2204  }
2205 };
2206 
2207 } // end namespace internal
2208 } // end namespace Eigen
2209 
2210 #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
Matrix3f m
Array< int, Dynamic, 1 > v
Array< int, 3, 1 > b
int n
#define EIGEN_ALIGN_TO_BOUNDARY(n)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
Array33i c
IndexedView_or_Block operator()(const RowIndices &rowIndices, const ColIndices &colIndices)
#define EIGEN_CONSTEXPR
Definition: Macros.h:747
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:957
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:883
#define eigen_assert(x)
Definition: Macros.h:902
#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Definition: Macros.h:892
#define EIGEN_OPTIMIZATION_BARRIER(X)
Definition: Macros.h:1039
#define EIGEN_LOG2E
Definition: MathFunctions.h:17
#define EIGEN_LN2
Definition: MathFunctions.h:18
#define EIGEN_PI
Definition: MathFunctions.h:16
Vector3f p0
Vector3f p1
#define EIGEN_STATIC_ASSERT(X, MSG)
Definition: StaticAssert.h:26
float * p
bfloat16() min(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:684
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x)
Packet pmin(const Packet &a, const Packet &b)
void twoprod(const Packet &x, const Packet &y, Packet &p_hi, Packet &p_lo)
Packet padd(const Packet &a, const Packet &b)
void veltkamp_splitting(const Packet &x, Packet &x_hi, Packet &x_lo)
Packet8f pzero(const Packet8f &)
void doubleword_div_fp(const Packet &x_hi, const Packet &x_lo, const Packet &y, Packet &z_hi, Packet &z_lo)
void twosum(const Packet &x_hi, const Packet &x_lo, const Packet &y_hi, const Packet &y_lo, Packet &s_hi, Packet &s_lo)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x)
Packet8h ptrue(const Packet8h &a)
Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet &x_in)
float trig_reduce_huge(float xf, Eigen::numext::int32_t *quadrant)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet &x_in)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet &x)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2(const Packet &a)
const Scalar & y
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog(const Packet &a)
Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Packet4f pabs(const Packet4f &a)
Packet pmax(const Packet &a, const Packet &b)
Packet2cf pnegate(const Packet2cf &a)
Packet generic_pow_impl(const Packet &x, const Packet &y)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_double(const Packet &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet &x_in)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet &a)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet &a)
Packet1cd pcplxflip(const Packet1cd &x)
Definition: MSA/Complex.h:617
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet &x, const Packet &y)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Packet generic_plog1p(const Packet &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet &x)
void pstoreu(Scalar *to, const Packet &from)
Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Packet pmul(const Packet &a, const Packet &b)
Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Packet pmsub(const Packet &a, const Packet &b, const Packet &c)
Packet pfrexp_generic(const Packet &a, Packet &exponent)
Packet8h pfrexp(const Packet8h &a, Packet8h &exponent)
Packet pldexp_generic(const Packet &a, const Packet &exponent)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pround(const Packet &a)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet &x, const Packet &y)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x)
Packet generic_expm1(const Packet &x)
Packet4f psqrt(const Packet4f &a)
Packet pnmadd(const Packet &a, const Packet &b, const Packet &c)
Packet psub(const Packet &a, const Packet &b)
Packet8h pand(const Packet8h &a, const Packet8h &b)
Packet8h pldexp(const Packet8h &a, const Packet8h &exponent)
Packet8h pxor(const Packet8h &a, const Packet8h &b)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x)
Packet8bf psignbit(const Packet8bf &a)
Packet pdiv(const Packet &a, const Packet &b)
Packet preciprocal(const Packet &a)
Packet2cf pconj(const Packet2cf &a)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x)
void absolute_split(const Packet &x, Packet &n, Packet &r)
void fast_twosum(const Packet &x, const Packet &y, Packet &s_hi, Packet &s_lo)
Packet8h por(const Packet8h &a, const Packet8h &b)
Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)))
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_float(const Packet &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x)
Packet8f pisnan(const Packet8f &a)
bool predux_any(const Packet4f &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet &x_in)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pfloor(const Packet &a)
Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psincos_float(const Packet &_x)
Packet8f peven_mask(const Packet8f &)
Scalar round(const Scalar &x)
std::int32_t int32_t
Definition: Meta.h:40
std::int16_t int16_t
Definition: Meta.h:38
std::int64_t int64_t
Definition: Meta.h:42
Scalar() floor(const Scalar &x)
static constexpr EIGEN_ALWAYS_INLINE Scalar signbit(const Scalar &x)
std::uint32_t uint32_t
Definition: Meta.h:39
EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
Definition: BFloat16.h:790
std::uint64_t uint64_t
Definition: Meta.h:41
EIGEN_ALWAYS_INLINE std::enable_if_t< NumTraits< T >::IsSigned||NumTraits< T >::IsComplex, typename NumTraits< T >::Real > abs(const T &x)
: InteropHeaders
Definition: Core:139
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_sign_op< typename Derived::Scalar >, const Derived > sign(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_expm1_op< typename Derived::Scalar >, const Derived > expm1(const Eigen::ArrayBase< Derived > &x)
Definition: BFloat16.h:222
Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
Definition: NumTraits.h:231