Eigen::TensorCostModel< Device > Class Template Reference

Static Public Member Functions

static int numThreads (double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
 
static double taskSize (double output_size, const TensorOpCost &cost_per_coeff)
 
static double totalCost (double output_size, const TensorOpCost &cost_per_coeff)
 

Static Public Attributes

static const int kDeviceCyclesPerComputeCycle
 
static const int kPerThreadCycles
 
static const int kStartupCycles
 
static const int kTaskSize
 

Detailed Description

template<typename Device>
class Eigen::TensorCostModel< Device >

Definition at line 163 of file TensorCostModel.h.

Member Function Documentation

◆ numThreads()

template<typename Device >
static int Eigen::TensorCostModel< Device >::numThreads ( double  output_size,
const TensorOpCost cost_per_coeff,
int  max_threads 
)
inlinestatic

Definition at line 176 of file TensorCostModel.h.

177  {
178  double cost = totalCost(output_size, cost_per_coeff);
179  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
180  // Make sure we don't invoke undefined behavior when we convert to an int.
181  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
182  return numext::mini(max_threads,
183  numext::maxi<int>(1, static_cast<int>(threads)));
184  }
static double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
static const int kPerThreadCycles
static const int kStartupCycles
EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)

◆ taskSize()

template<typename Device >
static double Eigen::TensorCostModel< Device >::taskSize ( double  output_size,
const TensorOpCost cost_per_coeff 
)
inlinestatic

Definition at line 189 of file TensorCostModel.h.

190  {
191  return totalCost(output_size, cost_per_coeff) / kTaskSize;
192  }
static const int kTaskSize

◆ totalCost()

template<typename Device >
static double Eigen::TensorCostModel< Device >::totalCost ( double  output_size,
const TensorOpCost cost_per_coeff 
)
inlinestatic

Definition at line 194 of file TensorCostModel.h.

195  {
196  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
197  // 11 is L2 cache latency on Haswell.
198  // We don't know whether data is in L1, L2 or L3. But we are most interested
199  // in single-threaded computational time around 100us-10ms (smaller time
200  // is too small for parallelization, larger time is not interesting
201  // either because we are probably using all available threads already).
202  // And for the target time range, L2 seems to be what matters. Data set
203  // fitting into L1 is too small to take noticeable time. Data set fitting
204  // only into L3 presumably will take more than 10ms to load and process.
205  const double kLoadCycles = 1.0 / 64 * 11;
206  const double kStoreCycles = 1.0 / 64 * 11;
207  // Scaling from Eigen compute cost to device cycles.
208  return output_size *
209  cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
210  kDeviceCyclesPerComputeCycle);
211  }
static const int kDeviceCyclesPerComputeCycle

Member Data Documentation

◆ kDeviceCyclesPerComputeCycle

template<typename Device >
const int Eigen::TensorCostModel< Device >::kDeviceCyclesPerComputeCycle
static

Definition at line 166 of file TensorCostModel.h.

◆ kPerThreadCycles

template<typename Device >
const int Eigen::TensorCostModel< Device >::kPerThreadCycles
static

Definition at line 170 of file TensorCostModel.h.

◆ kStartupCycles

template<typename Device >
const int Eigen::TensorCostModel< Device >::kStartupCycles
static

Definition at line 169 of file TensorCostModel.h.

◆ kTaskSize

template<typename Device >
const int Eigen::TensorCostModel< Device >::kTaskSize
static

Definition at line 171 of file TensorCostModel.h.


The documentation for this class was generated from the following file: