TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScanOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
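// A minimal usage sketch: TensorScanOp is not normally constructed directly
// but through the TensorBase entry points cumsum()/cumprod(); the tensor `t`
// below is purely illustrative.
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   // Inclusive running sum along dimension 0.
//   Eigen::Tensor<float, 2> s = t.cumsum(0);
//   // Exclusive variant: each output element excludes its own input element.
//   Eigen::Tensor<float, 2> e = t.cumsum(0, /*exclusive=*/true);
//   // Running product along dimension 1.
//   Eigen::Tensor<float, 2> p = t.cumprod(1);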

namespace internal {

template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}
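// Worked example (illustrative, assuming a SumReducer as the accumulator Op):
// scanning the contiguous slice [1, 2, 3, 4] writes
//   inclusive: data = [1, 3, 6, 10]
//   exclusive: data = [0, 1, 3,  6]
// i.e. the exclusive variant stores the running total *before* folding in the
// current coefficient, so element i holds the reduction of elements 0..i-1,
// starting from the reducer's initialize() value.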

template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the calculated offset
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}
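// Note on the strided packet path: ReducePacket advances along the scan axis
// exactly like ReduceScalar, but each packet lane tracks a different scan
// line. With a 4-wide Packet, for example, the packet loaded at `curr` covers
// the lines starting at offsets curr..curr+3, so four independent scans
// proceed in lockstep; loads and stores are unaligned because `offset` is
// arbitrary.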

template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      // Calculate the starting offset for the packet scan
      Index offset = idx1 + idx2;
      ReducePacket(self, offset, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};
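// Worked example for the vectorized block (illustrative numbers): with
// stride() == 10 and a 4-wide Packet, the first loop issues packet scans at
// idx2 = 0 and idx2 = 4 (covering scan lines 0-7 of the block), and the
// scalar tail loop handles the remaining lines at idx2 = 8 and idx2 = 9.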

// Single-threaded CPU implementation of scan
template <typename Self, typename Reducer, typename Device,
          bool Vectorize =
              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
               internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) const {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and
    // idx2.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};
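// Worked example of the offset decomposition (illustrative shape): for a
// ColMajor tensor of dimensions (4, 5, 6) scanned along axis 1, stride() == 4
// and size() == 5, so idx1 advances in steps of 20 (one block per index along
// dimension 2) and ReduceBlock walks idx2 over the 4 scan lines inside each
// block; the line starting at idx1 + idx2 touches elements idx1 + idx2 + k*4
// for k = 0..4, so no division or modulo is needed to recover coordinates.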

#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cachelines among
// threads. Currently set to twice the cache line size on Intel and ARM
// processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * divup(block_size, items_per_cacheline);
}
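// Worked example (illustrative numbers): in the packet path with float data
// and a 4-wide packet, item_size = 4 * sizeof(float) = 16 bytes, so
// items_per_cacheline = 128 / 16 = 8; a requested block_size of 10 is rounded
// up to divup(10, 8) * 8 = 16 items, so neighboring shards never straddle the
// same 128-byte region of `data` (assuming `data` itself is suitably aligned).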

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                       16 * PacketSize * self.size(), true, PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) {
            return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });
      num_scalars -= num_packets * PacketSize;
    }
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size,
                       16 * PacketSize * inner_block_size, Vectorize,
                       PacketSize),
          [=](Index blk_size) {
            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over the inner packet/scalar dimension when the scan
      // axis is not an inner dimension.
      ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size;
           idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
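// Example of the dispatch heuristic above (illustrative shape): for a ColMajor
// tensor of shape (1024, 512) scanned along axis 0, stride() == 1 and
// inner_block_size == 1024, so total_size (524288) >= stride() *
// inner_block_size and the 512 independent columns are handed to the thread
// pool as outer blocks. Scanning the same tensor along axis 1 gives
// stride() == 1024 and inner_block_size == 524288; the outer-block test fails,
// so the work is split instead across the 1024 scan lines inside the single
// block via the Parallel ReduceBlock specializations.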
#endif // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
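// Launch-configuration example (illustrative shape): each GPU thread owns one
// scan line, so a 30 x 50 tensor scanned along axis 0 has total_size = 1500,
// size() = 30 and therefore 50 scan lines; num_blocks = (50 + 63) / 64 = 1
// block of block_size = 64 threads, and the bounds check at the top of
// ScanKernel makes the 14 surplus threads do nothing.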
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)

} // namespace internal

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  typedef const ArgType ChildType;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,
    RawAccess = true
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1), m_consume_dim(op.axis()),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed through unsigned integers, so use an
      // unsigned type to let the compiler know. This silences the spurious
      // warning "'*((void*)(& evaluator)+64)[18446744073709551615]' may be
      // used uninitialized in this function".
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }
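  // Stride example (illustrative shape): for a ColMajor tensor of dimensions
  // (2, 3, 4) scanned along axis 2, m_stride = 2 * 3 = 6 and m_size = 4, so
  // consecutive elements of one scan line are 6 entries apart in memory; for
  // the same tensor stored RowMajor, the second loop above runs instead, and
  // scanning axis 0 would give m_stride = 4 * 3 = 12.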

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
    return m_consume_dim;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }
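  // A sketch of how this hook is typically reached (assuming the usual
  // assignment path through the tensor executor): assigning
  // `out = t.cumsum(1);` passes the destination buffer in as `data`, the
  // launcher scans straight into it, and returning false tells the caller no
  // per-coefficient evaluation is needed. When the scan is consumed lazily as
  // part of a larger expression, `data` is null, the result is materialized
  // into the temporary m_output, and coeff()/packet() below read from it.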

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  Index m_consume_dim;
  EvaluatorPointerType m_output;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H