10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
32 template <
typename ArgType>
34 return internal::functor_traits<
35 internal::scalar_product_op<ArgType, ArgType> >::Cost;
37 template <
typename ArgType>
39 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
41 template <
typename ArgType>
43 return internal::functor_traits<
44 internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
46 template <
typename ArgType>
48 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
50 template <
typename SrcType,
typename TargetType>
52 return internal::functor_traits<
53 internal::scalar_cast_op<SrcType, TargetType> >::Cost;
66 bool vectorized,
double packet_size)
86 double load_cost,
double store_cost,
double compute_cost)
const {
162 template <
typename Device>
177 double output_size,
const TensorOpCost& cost_per_coeff,
int max_threads) {
178 double cost =
totalCost(output_size, cost_per_coeff);
183 numext::maxi<int>(1,
static_cast<int>(threads)));
190 double output_size,
const TensorOpCost& cost_per_coeff) {
195 double output_size,
const TensorOpCost& cost_per_coeff) {
205 const double kLoadCycles = 1.0 / 64 * 11;
206 const double kStoreCycles = 1.0 / 64 * 11;
209 cost_per_coeff.
total_cost(kLoadCycles, kStoreCycles,
#define EIGEN_DEVICE_FUNC
static double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
static const int kDeviceCyclesPerComputeCycle
static double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
static const int kPerThreadCycles
static const int kStartupCycles
static int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
static const int kTaskSize
double total_cost(double load_cost, double store_cost, double compute_cost) const
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
TensorOpCost cwiseMin(const TensorOpCost &rhs) const
TensorOpCost & operator*=(double rhs)
double bytes_stored() const
friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
TensorOpCost cwiseMax(const TensorOpCost &rhs) const
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
double bytes_loaded() const
friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
double compute_cycles() const
TensorOpCost & operator+=(const TensorOpCost &rhs)
EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
: TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend