arch/SSE/MathFunctions.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2007 Julien Pommier
5 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 /* The sin and cos functions of this file come from
12  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
13  */
14 
15 #ifndef EIGEN_MATH_FUNCTIONS_SSE_H
16 #define EIGEN_MATH_FUNCTIONS_SSE_H
17 
18 #include "../../InternalHeaderCheck.h"
19 
20 namespace Eigen {
21 
22 namespace internal {
23 
26 
27 // Notice that for newer processors, it is counterproductive to use Newton
28 // iteration for square root. In particular, Skylake and Zen2 processors
29 // have approximately doubled throughput of the _mm_sqrt_ps instruction
30 // compared to their predecessors.
// Packet square-root specializations for the SSE packet types.
// NOTE(review): the listing's embedded numbering skips lines 31, 33 and 35,
// so the declaration prefix of each specialization (presumably
// `template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS`) is not
// shown here — confirm against the upstream header.

// Packet4f: forwards directly to the `_mm_sqrt_ps` intrinsic (no Newton
// refinement).
32 Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
// Packet2d: forwards directly to the `_mm_sqrt_pd` intrinsic.
34 Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
// Packet16b: returns the input unchanged.
// NOTE(review): presumably valid because the elements are booleans (0 or 1),
// for which sqrt is the identity — confirm.
36 Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
37 
38 #if EIGEN_FAST_MATH
39 // Even on Skylake, using Newton iteration is a win for reciprocal square root.
// NOTE(review): the function header (listing lines 40-41) is missing from
// this listing; the surrounding comment and the file's cross-reference index
// suggest this is the body of prsqrt<Packet4f>(const Packet4f& x) — confirm
// against the upstream header.
// Starts from the `_mm_rsqrt_ps(x)` estimate and passes it, together with x,
// to generic_rsqrt_newton_step configured with Steps=1.
42  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
43 }
44 
45 #ifdef EIGEN_VECTORIZE_FMA
46 // Trying to speed up reciprocal using Newton-Raphson is counterproductive
47 // unless FMA is available. Without FMA, pdiv(pset1<Packet>(Scalar(1)), a) is
48 // 30% faster.
// Reciprocal of a Packet4f. Only compiled when both EIGEN_FAST_MATH and
// EIGEN_VECTORIZE_FMA are enabled (see the enclosing #if / #ifdef).
// Starts from the `_mm_rcp_ps(x)` estimate and passes it, together with x,
// to generic_reciprocal_newton_step configured with Steps=1.
49 template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
50  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
51 }
52 #endif
53 
54 #endif
55 
56 } // end namespace internal
57 
58 namespace numext {
59 
// Scalar float square root implemented with SSE intrinsics.
// NOTE(review): listing line 61 (presumably the function attributes such as
// EIGEN_ALWAYS_INLINE) is missing from this listing — confirm upstream.
60 template<>
62 float sqrt(const float &x)
63 {
// _mm_set_ss places x in a __m128, _mm_sqrt_ss takes the scalar square root,
// and pfirst reads the scalar result back out of the Packet4f wrapper.
64  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
65 }
66 
// Scalar double square root implemented with SSE2 intrinsics.
// NOTE(review): listing line 68 (presumably the function attributes) is
// missing from this listing — confirm upstream.
67 template<>
69 double sqrt(const double &x)
70 {
71 #if EIGEN_COMP_GNUC_STRICT
72  // This works around a GCC bug generating poor code for _mm_sqrt_pd
73  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
// GCC path: call the scalar builtin on the value loaded by _mm_set_sd, then
// read the result back with pfirst.
74  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
75 #else
// Other compilers: use the packed _mm_sqrt_pd on the value loaded by
// _mm_set_sd; pfirst then reads back a single scalar from the result.
76  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
77 #endif
78 }
79 
80 } // end namespace numext
81 
82 } // end namespace Eigen
83 
84 #endif // EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET)
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET)
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:836
#define EIGEN_UNUSED
Definition: Macros.h:932
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:883
#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Definition: Macros.h:892
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt< Packet4f >(const Packet4f &x)
eigen_packet_wrapper< __m128i, 1 > Packet16b
bfloat16 pfirst(const Packet8bf &a)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt< Packet16b >(const Packet16b &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt< Packet2d >(const Packet2d &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt< Packet4f >(const Packet4f &x)
__vector float Packet4f
EIGEN_ALWAYS_INLINE float sqrt(const float &x)
: InteropHeaders
Definition: Core:139