TensorBroadcasting.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

/** \class TensorBroadcasting
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor broadcasting class.
  */
namespace internal {
template<typename Broadcast, typename XprType>
struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Broadcast, typename XprType>
struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
{
  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
};

template<typename Broadcast, typename XprType>
struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
{
  typedef TensorBroadcastingOp<Broadcast, XprType> type;
};

template <typename Dims>
struct is_input_scalar {
  static const bool value = false;
};
template <>
struct is_input_scalar<Sizes<> > {
  static const bool value = true;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
struct is_input_scalar<Sizes<Indices...> > {
  static const bool value = (Sizes<Indices...>::total_size == 1);
};
#endif

}  // end namespace internal


template<typename Broadcast, typename XprType>
class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
{
  public:
  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
      : m_xpr(expr), m_broadcast(broadcast) {}

  EIGEN_DEVICE_FUNC
  const Broadcast& broadcast() const { return m_broadcast; }

  EIGEN_DEVICE_FUNC
  const internal::remove_all_t<typename XprType::Nested>&
  expression() const { return m_xpr; }

  protected:
  typename XprType::Nested m_xpr;
  const Broadcast m_broadcast;
};
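
// Usage sketch (illustrative; not part of the original header): this op is
// normally created via TensorBase::broadcast(). Each output dimension is the
// corresponding input dimension times its broadcast factor:
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setConstant(1.0f);
//   Eigen::array<Eigen::Index, 2> bcast{{2, 3}};
//   Eigen::Tensor<float, 2> r = t.broadcast(bcast);  // r has dimensions 4x9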


// Eval as rvalue
template<typename Broadcast, typename ArgType, typename Device>
struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
{
  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be standard layout;
  bool isCopy, nByOne, oneByN;
  public:
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned         = true,
    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
    PreferBlockAccess = true,
    RawAccess         = false
  };

  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  // We do block based broadcasting using a trick with 2x tensor rank and 0
  // strides. See the block method implementation for details.
  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
      ArgTensorBlock;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : isCopy(false), nByOne(false), oneByN(false),
        m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
  {
    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
    // and store the result in a scalar. Instead one should first reshape the scalar into an
    // N-D tensor (N >= 1) with a single element, and then broadcast that.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
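    // Illustrative sketch of the reshape-then-broadcast workaround (assuming
    // the usual Tensor API):
    //   Eigen::Tensor<float, 0> s;   // rank-0 (scalar) tensor: cannot broadcast
    //   s() = 2.0f;
    //   auto r = s.reshape(Eigen::array<Eigen::Index, 1>{{1}})
    //             .broadcast(Eigen::array<Eigen::Index, 1>{{5}});  // rank-1, 5 copies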
    const InputDimensions& input_dims = m_impl.dimensions();
    isCopy = true;
    for (int i = 0; i < NumDims; ++i) {
      eigen_assert(input_dims[i] > 0);
      m_dimensions[i] = input_dims[i] * m_broadcast[i];
      if (m_broadcast[i] != 1) {
        isCopy = false;
      }
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStrides[0] = 1;
      m_outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
      }
    } else {
      m_inputStrides[NumDims-1] = 1;
      m_outputStrides[NumDims-1] = 1;
      for (int i = NumDims-2; i >= 0; --i) {
        m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
        m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
      }
    }
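    // Worked example (illustrative): a col-major input of dimensions [2, 3]
    // broadcast by [2, 1] yields output dimensions [4, 3]; the input strides
    // are then [1, 2] and the output strides [1, 4].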

    if (input_dims[0] == 1) {
      oneByN = true;
      for (int i = 1; i < NumDims; ++i) {
        if (m_broadcast[i] != 1) {
          oneByN = false;
          break;
        }
      }
    } else if (input_dims[NumDims-1] == 1) {
      nByOne = true;
      for (int i = 0; i < NumDims-1; ++i) {
        if (m_broadcast[i] != 1) {
          nByOne = false;
          break;
        }
      }
    }

    // Handle special formats like NCHW, where the input shape is '[1, N..., 1]'
    // and the broadcast shape is '[N, 1..., N]'.
    if (!oneByN && !nByOne) {
      if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
        nByOne = true;
        oneByN = true;
        for (int i = 1; i < NumDims-1; ++i) {
          if (m_broadcast[i] != 1) {
            nByOne = false;
            oneByN = false;
            break;
          }
        }
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
      return m_impl.coeff(0);
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (isCopy) {
        return m_impl.coeff(index);
      } else {
        return coeffColMajor(index);
      }
    } else {
      if (isCopy) {
        return m_impl.coeff(index);
      } else {
        return coeffRowMajor(index);
      }
    }
  }

  // TODO: attempt to speed this up. The integer divisions and modulo are slow.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = NumDims - 1; i > 0; --i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    if (internal::index_statically_eq<Broadcast>(0, 1)) {
      eigen_assert(index < m_impl.dimensions()[0]);
      inputIndex += index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
        eigen_assert(index % m_impl.dimensions()[0] == 0);
      } else {
        inputIndex += (index % m_impl.dimensions()[0]);
      }
    }
    return inputIndex;
  }
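  // Worked example (illustrative): with a col-major input [2, 3] broadcast by
  // [2, 1] (output [4, 3], output strides [1, 4]), output index 5 (coords
  // {1, 1}) gives idx = 5 / 4 = 1 for dim 1, contributing
  // 1 * m_inputStrides[1] = 2, and 1 % 2 = 1 for dim 0, so inputIndex == 3,
  // i.e. input coords {1, 1}.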

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
  {
    return m_impl.coeff(indexColMajor(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
      inputIndex += index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
      } else {
        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
      }
    }
    return inputIndex;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
  {
    return m_impl.coeff(indexRowMajor(index));
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
      return internal::pset1<PacketReturnType>(m_impl.coeff(0));
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (isCopy) {
        #ifdef EIGEN_GPU_COMPILE_PHASE
        // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speed up by
        // enforcing unaligned loads here. The reason is unclear though.
        return m_impl.template packet<Unaligned>(index);
        #else
        return m_impl.template packet<LoadMode>(index);
        #endif
      } else if (oneByN && !nByOne) {
        return packetNByOne<LoadMode>(index);
      } else if (!oneByN && nByOne) {
        return packetOneByN<LoadMode>(index);
      } else if (oneByN && nByOne) {
        return packetOneByNByOne<LoadMode>(index);
      } else {
        return packetColMajor<LoadMode>(index);
      }
    } else {
      if (isCopy) {
        #ifdef EIGEN_GPU_COMPILE_PHASE
        // See above.
        return m_impl.template packet<Unaligned>(index);
        #else
        return m_impl.template packet<LoadMode>(index);
        #endif
      } else if (oneByN && !nByOne) {
        return packetOneByN<LoadMode>(index);
      } else if (!oneByN && nByOne) {
        return packetNByOne<LoadMode>(index);
      } else if (oneByN && nByOne) {
        return packetOneByNByOne<LoadMode>(index);
      } else {
        return packetRowMajor<LoadMode>(index);
      }
    }
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
  (Index index) const
  {
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    Index startDim, endDim;
    Index inputIndex, outputOffset, batchedIndex;

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      startDim = NumDims - 1;
      endDim = 1;
    } else {
      startDim = 0;
      endDim = NumDims - 2;
    }

    batchedIndex = index % m_outputStrides[startDim];
    inputIndex   = batchedIndex / m_outputStrides[endDim];
    outputOffset = batchedIndex % m_outputStrides[endDim];

    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
      values[0] = m_impl.coeff(inputIndex);
      return internal::pload1<PacketReturnType>(values);
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
        if (outputOffset + cur < m_outputStrides[endDim]) {
          values[i] = m_impl.coeff(inputIndex);
        } else {
          ++inputIndex;
          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
          values[i] = m_impl.coeff(inputIndex);
          outputOffset = 0;
          cur = 0;
        }
      }
      return internal::pload<PacketReturnType>(values);
    }
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
  {
    // Consider the flattened tensor [v0, ..., vN]: this path concatenates
    // m_broadcast[dim] copies,
    //   [v0, ..., vN, v0, ..., vN, ...]
    // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
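    // Worked example (illustrative): a flattened input [v0, v1, v2] broadcast
    // twice reads as [v0, v1, v2, v0, v1, v2]; a packet that starts at output
    // index 2 wraps around at M == 3, so it is assembled coefficient by
    // coefficient in the scalar loop below.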
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // Size of the flattened tensor.
    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
        m_inputStrides[NumDims - 1] : m_inputStrides[0];
    Index inputIndex = index % M;
    if (inputIndex + PacketSize <= M) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        if (inputIndex > M - 1) {
          inputIndex = 0;
        }
        values[i] = m_impl.coeff(inputIndex++);
      }
      return internal::pload<PacketReturnType>(values);
    }
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
  {
    // Consider the flattened tensor [v0, ..., vN]: this path interleaves
    // m_broadcast[dim] copies,
    //   [v0, v0, ..., v1, v1, ..., vN, vN, ...]
    // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
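    // Worked example (illustrative): [v0, v1] with an interleave factor of 3
    // reads as [v0, v0, v0, v1, v1, v1]; a packet that starts at output index
    // 2 crosses the v0/v1 boundary, so it is assembled coefficient by
    // coefficient in the scalar loop below.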
    eigen_assert(index + PacketSize-1 < dimensions().TotalSize());

    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
        m_broadcast[0] : m_broadcast[NumDims - 1];

    Index inputIndex = index / M;
    Index outputOffset = index % M;
    if (outputOffset + PacketSize <= M) {
      return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        if (outputOffset < M) {
          values[i] = m_impl.coeff(inputIndex);
          ++outputOffset;
        } else {
          values[i] = m_impl.coeff(++inputIndex);
          outputOffset = 1;  // Next offset.
        }
      }
      return internal::pload<PacketReturnType>(values);
    }
  }

  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
  // the alignment at compile time.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index originalIndex = index;

    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = NumDims - 1; i > 0; --i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    Index innermostLoc;
    if (internal::index_statically_eq<Broadcast>(0, 1)) {
      eigen_assert(index < m_impl.dimensions()[0]);
      innermostLoc = index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
        eigen_assert(index % m_impl.dimensions()[0] == 0);
        innermostLoc = 0;
      } else {
        innermostLoc = index % m_impl.dimensions()[0];
      }
    }
    inputIndex += innermostLoc;

    // TODO: this could be extended to the second dimension if we're not
    // broadcasting along the first dimension, and so on.
    if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      values[0] = m_impl.coeff(inputIndex);
      EIGEN_UNROLL_LOOP
      for (int i = 1; i < PacketSize; ++i) {
        if (innermostLoc + i < m_impl.dimensions()[0]) {
          values[i] = m_impl.coeff(inputIndex+i);
        } else {
          values[i] = coeffColMajor(originalIndex+i);
        }
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index originalIndex = index;

    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    Index innermostLoc;
    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
      innermostLoc = index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
        innermostLoc = 0;
      } else {
        innermostLoc = index % m_impl.dimensions()[NumDims-1];
      }
    }
    inputIndex += innermostLoc;

    // TODO: this could be extended to the second-to-last dimension if we're not
    // broadcasting along the last dimension, and so on.
    if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      values[0] = m_impl.coeff(inputIndex);
      EIGEN_UNROLL_LOOP
      for (int i = 1; i < PacketSize; ++i) {
        if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
          values[i] = m_impl.coeff(inputIndex+i);
        } else {
          values[i] = coeffRowMajor(originalIndex+i);
        }
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    double compute_cost = TensorOpCost::AddCost<Index>();
    if (!isCopy && NumDims > 0) {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i > 0; --i) {
        compute_cost += TensorOpCost::DivCost<Index>();
        if (internal::index_statically_eq<Broadcast>(i, 1)) {
          compute_cost +=
              TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
        } else {
          if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
            compute_cost += TensorOpCost::MulCost<Index>() +
                            TensorOpCost::ModCost<Index>() +
                            TensorOpCost::AddCost<Index>();
          }
        }
        compute_cost +=
            TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
      }
    }
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    // TODO(wuke): Targeting the L1 cache size is 30% faster than targeting
    // L{-1} on large tensors. But this might need further tuning.
    const size_t target_size = m_device.firstLevelCacheSize();
    return internal::TensorBlockResourceRequirements::merge(
        m_impl.getResourceRequirements(),
        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool /*root_of_expr_ast*/ = false) const {
    BlockBroadcastingParams params = blockBroadcastingParams(desc);

    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
      return emptyBlock();
    }

    // Prepare storage for the materialized broadcasting result.
    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(desc, scratch);
    ScalarNoConst* materialized_output = block_storage.data();

    // We may need to materialize input blocks.
    size_t materialized_input_size = 0;
    ScalarNoConst* materialized_input = NULL;

    // Initialize the block broadcasting iterator state for the outer dimensions
    // (outer with regard to the bcast dimension). Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockBroadcastingIteratorState, NumDims> it;
    int idx = 0;

    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
      const Index dim = IsColMajor ? i : NumDims - 1 - i;
      it[idx].size = params.output_dims[dim];
      it[idx].count = 0;
      it[idx].output_stride = m_outputStrides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // Write output into the beginning of `materialized_output`.
    Index output_offset = 0;

    // We fill the output block by broadcasting along the bcast dim and
    // iterating over the outer dimensions.
    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();

    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
      Index bcast_offset = desc.offset() + output_offset;

      // Broadcast along the bcast dimension.
      num_output_coeffs += BroadcastBlockAlongBcastDim(
          params, bcast_offset, scratch, bcast_output, &materialized_input,
          &materialized_input_size);

      // Switch to the next outer dimension.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

  Broadcast functor() const { return m_broadcast; }

 private:
  static constexpr bool IsColMajor =
      static_cast<int>(Layout) == static_cast<int>(ColMajor);

  // We build the general-case block broadcasting on top of a primitive that
  // broadcasts only along the inner dimension(s), up to and including the
  // first dimension whose block size is smaller than the full broadcast size
  // (this dimension is called `bcast_dim`).
  //
  // Example:
  //   dim:        0  1  2   (ColMajor)
  //   input size: [9, 3, 6]
  //   block size: [9, 2, 6]
  //
  // We compute the broadcasted block by iterating over the outer dimensions
  // beyond `bcast_dim` (only dimension `2` in this example) and computing
  // broadcasts along the `bcast_dim` (dimension `1` in this example).

  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
  // single block along the broadcasting dimension. Sizes and strides along the
  // `bcast_dim` might be invalid; they will be adjusted later in
  // `BroadcastBlockAlongBcastDim`.
  struct BlockBroadcastingParams {
    Dimensions input_dims;      // input expression dimensions
    Dimensions output_dims;     // output block sizes
    Dimensions output_strides;  // output block strides

    int inner_dim_count;   // count of inner dimensions matching in size
    int bcast_dim;         // broadcasting dimension index
    Index bcast_dim_size;  // broadcasting dimension size
    Index inner_dim_size;  // inner dimensions size

    // Block sizes and strides for the input block where all dimensions before
    // `bcast_dim` are equal to `1`.
    Dimensions input_block_sizes;
    Dimensions input_block_strides;

    // Block sizes and strides for blocks with extra dimensions and strides `0`.
    BroadcastDimensions bcast_block_sizes;
    BroadcastDimensions bcast_block_strides;
    BroadcastDimensions bcast_input_strides;
  };

  struct BlockBroadcastingIteratorState {
    Index size;
    Index count;
    Index output_stride;
    Index output_span;
  };

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
  blockBroadcastingParams(TensorBlockDesc& desc) const {
    BlockBroadcastingParams params;

    params.input_dims = Dimensions(m_impl.dimensions());

    // Output block sizes and strides.
    params.output_dims = desc.dimensions();
    params.output_strides = internal::strides<Layout>(params.output_dims);

    // Find the broadcasting dimension (the first dimension whose output block
    // size is smaller than the full broadcast size).
    params.bcast_dim = 0;
    params.bcast_dim_size = 1;
    params.inner_dim_size = 1;

    // Count the number of inner dimensions that have the same size in the block
    // and in the broadcast expression.
    params.inner_dim_count = 0;

    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;

      if (params.output_dims[dim] == m_dimensions[dim]) {
        params.inner_dim_size *= params.output_dims[dim];
        ++params.inner_dim_count;
        continue;
      }

      // The first non-matching dimension is the broadcasting dimension.
      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
      params.bcast_dim = dim;
      params.bcast_dim_size = params.output_dims[dim];
      break;
    }

    // Calculate the input block size for looking into the input.
    for (int i = 0; i < params.inner_dim_count; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      params.input_block_sizes[dim] = params.input_dims[dim];
    }
    for (int i = params.inner_dim_count; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      params.input_block_sizes[dim] = 1;
    }
    params.input_block_strides =
        internal::strides<Layout>(params.input_block_sizes);

    // Broadcast with the 0-stride trick: create one extra dimension for each
    // broadcast factor, and set its input stride to 0.
    //
    // When ColMajor:
    //
    //  - bcast_block_sizes:
    //      [d_0, b_0, d_1, b_1, ...]
    //
    //  - bcast_block_strides:
    //      [output_block_strides[0], output_block_strides[0] * d_0,
    //       output_block_strides[1], output_block_strides[1] * d_1,
    //       ...]
    //
    //  - bcast_input_strides:
    //      [input_block_strides[0], 0,
    //       input_block_strides[1], 0,
    //       ...].
    //
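    // Worked example (illustrative): a col-major input of size [2, 3]
    // broadcast by [2, 2] into an output block covering the whole [4, 6]
    // output gives
    //   bcast_block_sizes   = [2, 2, 3, 2]
    //   bcast_block_strides = [1, 2, 4, 12]
    //   bcast_input_strides = [1, 0, 2, 0].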
    for (int i = 0; i < params.inner_dim_count; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;

      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;

      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
      params.bcast_block_strides[broadcast_dim] =
          params.output_strides[dim] * params.input_dims[dim];
      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
      params.bcast_input_strides[broadcast_dim] = 0;
    }

    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
      params.bcast_block_sizes[dim] = 1;
      params.bcast_block_strides[dim] = 0;
      params.bcast_input_strides[dim] = 0;
    }

    return params;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
    DSizes<Index, NumDims> dimensions;
    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
      BlockBroadcastingParams params, Index bcast_offset,
      TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
      ScalarNoConst** materialized_input,
      size_t* materialized_input_size) const {
    if (params.bcast_dim_size == 1) {
      // We just need one block read using the ready-set values above.
      return BroadcastBlock(
          params.input_block_sizes, params.input_block_strides,
          params.bcast_block_sizes, params.bcast_block_strides,
          params.bcast_input_strides, bcast_offset, 0, scratch,
          materialized_output, materialized_input, materialized_input_size);

    } else if (params.input_dims[params.bcast_dim] == 1) {
      // Broadcast the bcast dimension (< NumDims) by bcast_dim_size.
      const int broadcast_bcast_dim =
          IsColMajor ? 2 * params.inner_dim_count + 1
                     : 2 * NumDims - 2 * params.inner_dim_count - 2;

      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
      params.bcast_input_strides[broadcast_bcast_dim] = 0;
      params.bcast_block_strides[broadcast_bcast_dim] =
          params.output_strides[params.bcast_dim];

      return BroadcastBlock(
          params.input_block_sizes, params.input_block_strides,
          params.bcast_block_sizes, params.bcast_block_strides,
          params.bcast_input_strides, bcast_offset, 0, scratch,
          materialized_output, materialized_input, materialized_input_size);

    } else {
      // Keep track of the total number of coefficients written to the
      // output block.
      Index num_output_coeffs = 0;

      // The general case. Let's denote the output block as
      //
      //   x[..., a:a+bcast_dim_size, :, ..., :]
      //
      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
      // (< NumDims). We need to split the a:a+bcast_dim_size range into up to
      // 3 sub-blocks:
      //
      // (1) a:b, where b is the smallest multiple of
      //     input_dims[bcast_dim] in [a, a+bcast_dim_size].
      //
      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim]
      //     in [a, a+bcast_dim_size].
      //
      // (3) c:a+bcast_dim_size.
      //
      // Or, when b and c do not exist, we just need to process the whole block
      // together.

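      // Worked example (illustrative): with input_dims[bcast_dim] == 3 and an
      // output slice 2:9 (a == 2, bcast_dim_size == 7), b == 3 and c == 9: a
      // head a:b of size 1, a middle b:c of two full input copies, and an
      // empty tail c:a+bcast_dim_size.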
      // Find a.
      const Index bcast_dim_left_index =
          bcast_offset / m_outputStrides[params.bcast_dim];

      // Find b and c.
      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];

      // First multiple after a. This is b when <= bcast_dim_left_index +
      // bcast_dim_size.
      const Index first_multiple =
          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
          input_bcast_dim_size;

      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
        // b exists, so does c. Find it.
        const Index last_multiple =
            (bcast_dim_left_index + params.bcast_dim_size) /
            input_bcast_dim_size * input_bcast_dim_size;
        const int copy_bcast_dim =
            IsColMajor ? 2 * params.inner_dim_count
                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
        const int broadcast_bcast_dim =
            IsColMajor ? 2 * params.inner_dim_count + 1
                       : 2 * NumDims - 2 * params.inner_dim_count - 2;

        if (first_multiple > bcast_dim_left_index) {
          const Index head_size = first_multiple - bcast_dim_left_index;
          params.input_block_sizes[params.bcast_dim] = head_size;
          params.bcast_block_sizes[copy_bcast_dim] = head_size;
          params.bcast_input_strides[copy_bcast_dim] =
              params.input_block_strides[params.bcast_dim];
          params.bcast_block_strides[copy_bcast_dim] =
              params.output_strides[params.bcast_dim];
          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
          params.bcast_input_strides[broadcast_bcast_dim] = 0;
          params.bcast_block_strides[broadcast_bcast_dim] =
              params.output_strides[params.bcast_dim] *
              params.input_dims[params.bcast_dim];

          num_output_coeffs += BroadcastBlock(
              params.input_block_sizes, params.input_block_strides,
              params.bcast_block_sizes, params.bcast_block_strides,
              params.bcast_input_strides, bcast_offset, 0, scratch,
              materialized_output, materialized_input, materialized_input_size);
        }
        if (first_multiple < last_multiple) {
          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
          params.bcast_input_strides[copy_bcast_dim] =
              params.input_block_strides[params.bcast_dim];
          params.bcast_block_strides[copy_bcast_dim] =
              params.output_strides[params.bcast_dim];
          params.bcast_block_sizes[broadcast_bcast_dim] =
              (last_multiple - first_multiple) / input_bcast_dim_size;
          params.bcast_input_strides[broadcast_bcast_dim] = 0;
          params.bcast_block_strides[broadcast_bcast_dim] =
              params.output_strides[params.bcast_dim] *
              params.input_dims[params.bcast_dim];
          const Index offset = (first_multiple - bcast_dim_left_index) *
                               m_outputStrides[params.bcast_dim];

          num_output_coeffs += BroadcastBlock(
              params.input_block_sizes, params.input_block_strides,
              params.bcast_block_sizes, params.bcast_block_strides,
              params.bcast_input_strides, bcast_offset, offset, scratch,
              materialized_output, materialized_input, materialized_input_size);
        }
        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
          const Index tail_size =
              bcast_dim_left_index + params.bcast_dim_size - last_multiple;
          params.input_block_sizes[params.bcast_dim] = tail_size;
          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
          params.bcast_input_strides[copy_bcast_dim] =
              params.input_block_strides[params.bcast_dim];
          params.bcast_block_strides[copy_bcast_dim] =
              params.output_strides[params.bcast_dim];
          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
          params.bcast_input_strides[broadcast_bcast_dim] = 0;
          params.bcast_block_strides[broadcast_bcast_dim] =
              params.output_strides[params.bcast_dim] *
              params.input_dims[params.bcast_dim];
          const Index offset = (last_multiple - bcast_dim_left_index) *
                               m_outputStrides[params.bcast_dim];

          num_output_coeffs += BroadcastBlock(
              params.input_block_sizes, params.input_block_strides,
              params.bcast_block_sizes, params.bcast_block_strides,
              params.bcast_input_strides, bcast_offset, offset, scratch,
              materialized_output, materialized_input, materialized_input_size);
        }
      } else {
        // b and c do not exist.
        const int copy_bcast_dim =
            IsColMajor ? 2 * params.inner_dim_count
                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
        params.bcast_input_strides[copy_bcast_dim] =
            params.input_block_strides[params.bcast_dim];
        params.bcast_block_strides[copy_bcast_dim] =
            params.output_strides[params.bcast_dim];

        num_output_coeffs += BroadcastBlock(
            params.input_block_sizes, params.input_block_strides,
            params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, 0, scratch,
            materialized_output, materialized_input, materialized_input_size);
      }

      return num_output_coeffs;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
      const Dimensions& input_block_sizes,
      const Dimensions& input_block_strides,
      const BroadcastDimensions& bcast_block_sizes,
      const BroadcastDimensions& bcast_block_strides,
      const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
      Index offset, TensorBlockScratch& scratch,
      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
      size_t* materialized_input_size) const {
    // ---------------------------------------------------------------------- //
    // Tensor block descriptor for reading a block from the input.
    const Index input_offset = bcast_offset + offset;
    TensorBlockDesc input_desc(
        IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
        input_block_sizes);

    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);

    // ---------------------------------------------------------------------- //
    // Materialize the input block into a temporary memory buffer only if it's
    // not already available in the arg block.
    const ScalarNoConst* input_buffer = NULL;

    if (input_block.data() != NULL) {
      // The input block already has raw data, so there is no need to
      // materialize it.
      input_buffer = input_block.data();

    } else {
      // Otherwise we have to do block assignment into a temporary buffer.

      // Maybe reuse a previously allocated buffer, or allocate a new one with
      // the scratch allocator.
      const size_t input_total_size = input_block_sizes.TotalSize();
      if (*materialized_input == NULL ||
          *materialized_input_size < input_total_size) {
        *materialized_input_size = input_total_size;
        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
        *materialized_input = static_cast<ScalarNoConst*>(mem);
      }

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(input_block_sizes, input_block_strides,
                                        *materialized_input),
          input_block.expr());

      input_buffer = *materialized_input;
    }

    // ---------------------------------------------------------------------- //
    // Copy data from the materialized input block to the materialized output,
    // using the given broadcast strides (strides with zeroes).
    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
        TensorBlockIO;

    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
                                    materialized_output + offset);

    return TensorBlockIO::Copy(dst, src);
  }

 protected:
  const Device EIGEN_DEVICE_REF m_device;
  const std::remove_reference_t<Broadcast> m_broadcast;
  Dimensions m_dimensions;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_inputStrides;
  TensorEvaluator<ArgType, Device> m_impl;
};


}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H