// TensorCostModel.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

#include "./InternalHeaderCheck.h"

namespace Eigen {
// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
27 class TensorOpCost {
28  public:
29  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
30  // model based on minimal reciprocal throughput numbers from Intel or
31  // Agner Fog's tables would be better than what is there now.
32  template <typename ArgType>
33  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
34  return internal::functor_traits<
35  internal::scalar_product_op<ArgType, ArgType> >::Cost;
36  }
37  template <typename ArgType>
38  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
39  return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
40  }
41  template <typename ArgType>
42  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
43  return internal::functor_traits<
44  internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
45  }
46  template <typename ArgType>
47  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
48  return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
49  }
50  template <typename SrcType, typename TargetType>
51  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
52  return internal::functor_traits<
53  internal::scalar_cast_op<SrcType, TargetType> >::Cost;
54  }
55 
63 
66  bool vectorized, double packet_size)
69  compute_cycles_(vectorized ? compute_cycles / packet_size
70  : compute_cycles) {
74  }
75 
76  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
77  return bytes_loaded_;
78  }
79  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
80  return bytes_stored_;
81  }
82  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
83  return compute_cycles_;
84  }
85  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
86  double load_cost, double store_cost, double compute_cost) const {
87  return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
88  compute_cost * compute_cycles_;
89  }
90 
91  // Drop memory access component. Intended for cases when memory accesses are
92  // sequential or are completely masked by computations.
94  bytes_loaded_ = 0;
95  bytes_stored_ = 0;
96  }
97 
98  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
100  const TensorOpCost& rhs) const {
105  }
106 
107  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
109  const TensorOpCost& rhs) const {
114  }
115 
117  const TensorOpCost& rhs) {
118  bytes_loaded_ += rhs.bytes_loaded();
119  bytes_stored_ += rhs.bytes_stored();
121  return *this;
122  }
123 
124  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
125  bytes_loaded_ *= rhs;
126  bytes_stored_ *= rhs;
127  compute_cycles_ *= rhs;
128  return *this;
129  }
130 
131  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
132  TensorOpCost lhs, const TensorOpCost& rhs) {
133  lhs += rhs;
134  return lhs;
135  }
136  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
137  TensorOpCost lhs, double rhs) {
138  lhs *= rhs;
139  return lhs;
140  }
141  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
142  double lhs, TensorOpCost rhs) {
143  rhs *= lhs;
144  return rhs;
145  }
146 
147  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
148  return os << "[bytes_loaded = " << tc.bytes_loaded()
149  << ", bytes_stored = " << tc.bytes_stored()
150  << ", compute_cycles = " << tc.compute_cycles() << "]";
151  }
152 
153  private:
157 };
158 
// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
// in [1:max_threads] instead of just switching multi-threading off for small
// work units.
162 template <typename Device>
164  public:
165  // Scaling from Eigen compute cost to device cycles.
166  static const int kDeviceCyclesPerComputeCycle = 1;
167 
168  // Costs in device cycles.
169  static const int kStartupCycles = 100000;
170  static const int kPerThreadCycles = 100000;
171  static const int kTaskSize = 40000;
172 
173  // Returns the number of threads in [1:max_threads] to use for
174  // evaluating an expression with the given output size and cost per
175  // coefficient.
176  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
177  double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
178  double cost = totalCost(output_size, cost_per_coeff);
179  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
180  // Make sure we don't invoke undefined behavior when we convert to an int.
181  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
182  return numext::mini(max_threads,
183  numext::maxi<int>(1, static_cast<int>(threads)));
184  }
185 
186  // taskSize assesses parallel task size.
187  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
188  // granularity needs to be increased to mitigate parallelization overheads.
189  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
190  double output_size, const TensorOpCost& cost_per_coeff) {
191  return totalCost(output_size, cost_per_coeff) / kTaskSize;
192  }
193 
194  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
195  double output_size, const TensorOpCost& cost_per_coeff) {
196  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
197  // 11 is L2 cache latency on Haswell.
198  // We don't know whether data is in L1, L2 or L3. But we are most interested
199  // in single-threaded computational time around 100us-10ms (smaller time
200  // is too small for parallelization, larger time is not interesting
201  // either because we are probably using all available threads already).
202  // And for the target time range, L2 seems to be what matters. Data set
203  // fitting into L1 is too small to take noticeable time. Data set fitting
204  // only into L3 presumably will take more than 10ms to load and process.
205  const double kLoadCycles = 1.0 / 64 * 11;
206  const double kStoreCycles = 1.0 / 64 * 11;
207  // Scaling from Eigen compute cost to device cycles.
208  return output_size *
209  cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
211  }
212 };

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_DEVICE_FUNC
#define eigen_assert(x)
static double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
static const int kDeviceCyclesPerComputeCycle
static double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
static const int kPerThreadCycles
static const int kStartupCycles
static int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
static const int kTaskSize
static int CastCost()
double total_cost(double load_cost, double store_cost, double compute_cost) const
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
static int MulCost()
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
TensorOpCost cwiseMin(const TensorOpCost &rhs) const
static int AddCost()
TensorOpCost & operator*=(double rhs)
static int DivCost()
static int ModCost()
double bytes_stored() const
friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
TensorOpCost cwiseMax(const TensorOpCost &rhs) const
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
double bytes_loaded() const
friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
double compute_cycles() const
TensorOpCost & operator+=(const TensorOpCost &rhs)
EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
: TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend