TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 #include "./InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 
17 /** \class TensorConvolution
18  * \ingroup CXX11_Tensor_Module
19  *
20  * \brief Tensor convolution class.
21  *
22  *
23  */
24 namespace internal {
25 
26 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
27 class IndexMapper {
28  public:
29  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
30  const array<Index, NumKernelDims>& indices) {
31 
32  array<Index, NumDims> dimensions = input_dims;
33  for (int i = 0; i < NumKernelDims; ++i) {
34  const Index index = indices[i];
35  const Index input_dim = input_dims[index];
36  const Index kernel_dim = kernel_dims[i];
37  const Index result_dim = input_dim - kernel_dim + 1;
38  dimensions[index] = result_dim;
39  }
40 
41  array<Index, NumDims> inputStrides;
42  array<Index, NumDims> outputStrides;
43  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
44  inputStrides[0] = 1;
45  outputStrides[0] = 1;
46  for (int i = 1; i < NumDims; ++i) {
47  inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
48  outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
49  }
50  } else {
51  inputStrides[NumDims - 1] = 1;
52  outputStrides[NumDims - 1] = 1;
53  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
54  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
55  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
56  }
57  }
58 
59  array<Index, NumDims> gpuInputDimensions;
60  array<Index, NumDims> gpuOutputDimensions;
61  array<Index, NumDims> tmp = dimensions;
62  array<Index, NumDims> ordering;
63  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
64  ? 0
65  : NumDims - NumKernelDims;
66  for (int i = 0; i < NumKernelDims; ++i) {
67  const Index index = i + offset;
68  ordering[index] = indices[i];
69  tmp[indices[i]] = -1;
70  gpuInputDimensions[index] = input_dims[indices[i]];
71  gpuOutputDimensions[index] = dimensions[indices[i]];
72  }
73 
74  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
75  ? NumKernelDims
76  : 0;
77  for (int i = 0; i < NumDims; ++i) {
78  if (tmp[i] >= 0) {
79  ordering[written] = i;
80  gpuInputDimensions[written] = input_dims[i];
81  gpuOutputDimensions[written] = dimensions[i];
82  ++written;
83  }
84  }
85 
86  for (int i = 0; i < NumDims; ++i) {
87  m_inputStrides[i] = inputStrides[ordering[i]];
88  m_outputStrides[i] = outputStrides[ordering[i]];
89  }
90 
91  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
92  for (int i = 0; i < NumDims; ++i) {
93  if (i > NumKernelDims) {
94  m_gpuInputStrides[i] =
95  m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
96  m_gpuOutputStrides[i] =
97  m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
98  } else {
99  m_gpuInputStrides[i] = 1;
100  m_gpuOutputStrides[i] = 1;
101  }
102  }
103  } else {
104  for (int i = NumDims - 1; i >= 0; --i) {
105  if (static_cast<size_t>(i + 1) < offset) {
106  m_gpuInputStrides[i] =
107  m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
108  m_gpuOutputStrides[i] =
109  m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
110  } else {
111  m_gpuInputStrides[i] = 1;
112  m_gpuOutputStrides[i] = 1;
113  }
114  }
115  }
116  }
117 
118  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
119  Index inputIndex = 0;
120  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
121  for (int d = NumDims - 1; d > NumKernelDims; --d) {
122  const Index idx = p / m_gpuInputStrides[d];
123  inputIndex += idx * m_inputStrides[d];
124  p -= idx * m_gpuInputStrides[d];
125  }
126  if (NumKernelDims < NumDims) {
127  inputIndex += p * m_inputStrides[NumKernelDims];
128  }
129  } else {
130  std::ptrdiff_t limit = 0;
131  if (NumKernelDims < NumDims) {
132  limit = NumDims - NumKernelDims - 1;
133  }
134  for (int d = 0; d < limit; ++d) {
135  const Index idx = p / m_gpuInputStrides[d];
136  inputIndex += idx * m_inputStrides[d];
137  p -= idx * m_gpuInputStrides[d];
138  }
139  inputIndex += p * m_inputStrides[limit];
140  }
141  return inputIndex;
142  }
143 
144  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
145  Index outputIndex = 0;
146  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
147  for (int d = NumDims - 1; d > NumKernelDims; --d) {
148  const Index idx = p / m_gpuOutputStrides[d];
149  outputIndex += idx * m_outputStrides[d];
150  p -= idx * m_gpuOutputStrides[d];
151  }
152  if (NumKernelDims < NumDims) {
153  outputIndex += p * m_outputStrides[NumKernelDims];
154  }
155  } else {
156  std::ptrdiff_t limit = 0;
157  if (NumKernelDims < NumDims) {
158  limit = NumDims - NumKernelDims - 1;
159  }
160  for (int d = 0; d < limit; ++d) {
161  const Index idx = p / m_gpuOutputStrides[d];
162  outputIndex += idx * m_outputStrides[d];
163  p -= idx * m_gpuOutputStrides[d];
164  }
165  outputIndex += p * m_outputStrides[limit];
166  }
167  return outputIndex;
168  }
169 
170  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
171  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
172  ? 0
173  : NumDims - NumKernelDims;
174  return i * m_inputStrides[offset];
175  }
176 
177  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
178  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
179  ? 0
180  : NumDims - NumKernelDims;
181  return i * m_outputStrides[offset];
182  }
183 
184  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
185  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
186  ? 0
187  : NumDims - NumKernelDims;
188  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
189  }
190 
191  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
192  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
193  ? 0
194  : NumDims - NumKernelDims;
195  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
196  }
197 
198  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
199  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
200  ? 0
201  : NumDims - NumKernelDims;
202  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
203  k * m_inputStrides[offset + 2];
204  }
205 
206  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
207  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
208  ? 0
209  : NumDims - NumKernelDims;
210  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
211  k * m_outputStrides[offset + 2];
212  }
213 
214  private:
215  static constexpr int NumDims = internal::array_size<InputDims>::value;
216  array<Index, NumDims> m_inputStrides;
217  array<Index, NumDims> m_outputStrides;
218  array<Index, NumDims> m_gpuInputStrides;
219  array<Index, NumDims> m_gpuOutputStrides;
220 };
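
The mapping above implements a "valid" convolution: every convolved dimension shrinks from input_dim to input_dim - kernel_dim + 1, and input/output strides are rebuilt for either storage order. A minimal standalone sketch of the same arithmetic, using a hypothetical column-major 3-D shape convolved along dimension 1 (shape and kernel size are made up for illustration):

#include <array>
#include <cstddef>
#include <iostream>

int main() {
  // Hypothetical 3-D input of shape (5, 7, 4); kernel of size 3 applied
  // along dimension 1, i.e. a "valid" convolution as in IndexMapper.
  std::array<std::ptrdiff_t, 3> input_dims = {5, 7, 4};
  std::array<std::ptrdiff_t, 3> output_dims = input_dims;
  const int conv_dim = 1;
  const std::ptrdiff_t kernel_dim = 3;
  output_dims[conv_dim] = input_dims[conv_dim] - kernel_dim + 1;  // 7 - 3 + 1 = 5

  // Column-major strides, rebuilt the same way the constructor above does.
  std::array<std::ptrdiff_t, 3> input_strides, output_strides;
  input_strides[0] = output_strides[0] = 1;
  for (int i = 1; i < 3; ++i) {
    input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
    output_strides[i] = output_strides[i - 1] * output_dims[i - 1];
  }

  std::cout << "output shape: " << output_dims[0] << "x" << output_dims[1]
            << "x" << output_dims[2] << "\n";                        // 5x5x4
  std::cout << "input strides: " << input_strides[0] << ","
            << input_strides[1] << "," << input_strides[2] << "\n";  // 1,5,35
  std::cout << "output strides: " << output_strides[0] << ","
            << output_strides[1] << "," << output_strides[2] << "\n";  // 1,5,25
  return 0;
}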
221 
222 
223 
224 template<typename Dimensions, typename InputXprType, typename KernelXprType>
225 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
226 {
227  // Type promotion to handle the case where the types of the lhs and the rhs are different.
228  typedef typename promote_storage_type<typename InputXprType::Scalar,
229  typename KernelXprType::Scalar>::ret Scalar;
230  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
231  typename traits<KernelXprType>::StorageKind>::ret StorageKind;
232  typedef typename promote_index_type<typename traits<InputXprType>::Index,
233  typename traits<KernelXprType>::Index>::type Index;
234  typedef typename InputXprType::Nested LhsNested;
235  typedef typename KernelXprType::Nested RhsNested;
236  typedef std::remove_reference_t<LhsNested> LhsNested_;
237  typedef std::remove_reference_t<RhsNested> RhsNested_;
238  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
239  static constexpr int Layout = traits<InputXprType>::Layout;
240  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
241  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType> PointerType;
242 
243  enum {
244  Flags = 0
245  };
246 };
247 
248 template<typename Dimensions, typename InputXprType, typename KernelXprType>
249 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
250 {
251  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
252 };
253 
254 template<typename Dimensions, typename InputXprType, typename KernelXprType>
255 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
256 {
257  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
258 };
259 
260 } // end namespace internal
261 
262 
263 
264 template<typename Indices, typename InputXprType, typename KernelXprType>
265 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
266 {
267  public:
268  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
269  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
270  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
271  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
272  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
273  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
274  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
275 
276  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
277  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
278 
279  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
280  const Indices& indices() const { return m_indices; }
281 
283  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
284  const internal::remove_all_t<typename InputXprType::Nested>&
285  inputExpression() const { return m_input_xpr; }
286 
287  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
288  const internal::remove_all_t<typename KernelXprType::Nested>&
289  kernelExpression() const { return m_kernel_xpr; }
290 
291  protected:
292  typename InputXprType::Nested m_input_xpr;
293  typename KernelXprType::Nested m_kernel_xpr;
294  const Indices m_indices;
295 };
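
TensorConvolutionOp is normally not constructed directly; it is returned by TensorBase::convolve and evaluated on assignment. A small usage sketch follows; the shapes, kernel values, and index type are illustrative choices, not taken from this file:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Convolve a 20x30 input with a 1-D kernel of size 3 along dimension 0.
  // A "valid" convolution shrinks that dimension to 20 - 3 + 1 = 18.
  Eigen::Tensor<float, 2> input(20, 30);
  Eigen::Tensor<float, 1> kernel(3);
  input.setRandom();
  kernel.setValues({0.25f, 0.5f, 0.25f});

  Eigen::array<Eigen::Index, 1> dims = {0};  // dimensions to convolve over
  Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);

  std::cout << "output: " << output.dimension(0) << " x "
            << output.dimension(1) << std::endl;  // 18 x 30
  return 0;
}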
296 
297 
298 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
299 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
300 {
301  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
302 
303  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
304  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
305  typedef typename XprType::Index Index;
306  typedef DSizes<Index, NumDims> Dimensions;
307 
308  typedef typename XprType::Scalar Scalar;
309  typedef typename XprType::CoeffReturnType CoeffReturnType;
310  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
311  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
312  typedef StorageMemory<Scalar, Device> Storage;
313  typedef typename Storage::Type EvaluatorPointerType;
314 
315  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
316  enum {
317  IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
318  PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
319  BlockAccess = false,
320  PreferBlockAccess = false,
321  CoordAccess = false, // to be implemented
322  RawAccess = false
323  };
324 
325  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
326  typedef internal::TensorBlockNotImplemented TensorBlock;
327  //===--------------------------------------------------------------------===//
328 
329  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
330  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
331  {
332  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
333 
334  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
335  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
336 
337  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
338  m_inputStride[0] = 1;
339  for (int i = 1; i < NumDims; ++i) {
340  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
341  }
342  } else {
343  m_inputStride[NumDims - 1] = 1;
344  for (int i = NumDims - 2; i >= 0; --i) {
345  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
346  }
347  }
348 
349  m_dimensions = m_inputImpl.dimensions();
350  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
351  for (int i = 0; i < NumKernelDims; ++i) {
352  const Index index = op.indices()[i];
353  const Index input_dim = input_dims[index];
354  const Index kernel_dim = kernel_dims[i];
355  const Index result_dim = input_dim - kernel_dim + 1;
356  m_dimensions[index] = result_dim;
357  if (i > 0) {
358  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
359  } else {
360  m_kernelStride[0] = 1;
361  }
362  m_indexStride[i] = m_inputStride[index];
363  }
364 
365  m_outputStride[0] = 1;
366  for (int i = 1; i < NumDims; ++i) {
367  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
368  }
369  } else {
370  for (int i = NumKernelDims - 1; i >= 0; --i) {
371  const Index index = op.indices()[i];
372  const Index input_dim = input_dims[index];
373  const Index kernel_dim = kernel_dims[i];
374  const Index result_dim = input_dim - kernel_dim + 1;
375  m_dimensions[index] = result_dim;
376  if (i < NumKernelDims - 1) {
377  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
378  } else {
379  m_kernelStride[NumKernelDims - 1] = 1;
380  }
381  m_indexStride[i] = m_inputStride[index];
382  }
383 
384  m_outputStride[NumDims - 1] = 1;
385  for (int i = NumDims - 2; i >= 0; --i) {
386  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
387  }
388  }
389  }
390 
391  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
392 
393  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
394  m_inputImpl.evalSubExprsIfNeeded(NULL);
395  preloadKernel();
396  return true;
397  }
398  EIGEN_STRONG_INLINE void cleanup() {
399  m_inputImpl.cleanup();
400  if (m_local_kernel) {
401  m_device.deallocate((void*)m_kernel);
402  m_local_kernel = false;
403  }
404  m_kernel = NULL;
405  }
406 
407  void evalTo(typename XprType::Scalar* buffer) {
408  evalSubExprsIfNeeded(NULL);
409  for (int i = 0; i < dimensions().TotalSize(); ++i) {
410  buffer[i] += coeff(i);
411  }
412  cleanup();
413  }
414 
415  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
416  {
417  CoeffReturnType result = CoeffReturnType(0);
418  convolve(firstInput(index), 0, NumKernelDims-1, result);
419  return result;
420  }
421 
422  template<int LoadMode>
423  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
424  {
425  Index indices[2] = {index, index+PacketSize-1};
426  Index startInputs[2] = {0, 0};
427  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
428  for (int i = NumDims - 1; i > 0; --i) {
429  const Index idx0 = indices[0] / m_outputStride[i];
430  const Index idx1 = indices[1] / m_outputStride[i];
431  startInputs[0] += idx0 * m_inputStride[i];
432  startInputs[1] += idx1 * m_inputStride[i];
433  indices[0] -= idx0 * m_outputStride[i];
434  indices[1] -= idx1 * m_outputStride[i];
435  }
436  } else {
437  for (int i = 0; i < NumDims - 1; ++i) {
438  const Index idx0 = indices[0] / m_outputStride[i];
439  const Index idx1 = indices[1] / m_outputStride[i];
440  startInputs[0] += idx0 * m_inputStride[i];
441  startInputs[1] += idx1 * m_inputStride[i];
442  indices[0] -= idx0 * m_outputStride[i];
443  indices[1] -= idx1 * m_outputStride[i];
444  }
445  }
446  startInputs[0] += indices[0];
447  startInputs[1] += indices[1];
448 
449  if (startInputs[1]-startInputs[0] == PacketSize-1) {
450  PacketReturnType result = internal::pset1<PacketReturnType>(0);
451  convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
452  return result;
453  } else {
454  EIGEN_ALIGN_MAX Scalar data[PacketSize];
455  data[0] = Scalar(0);
456  convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
457  for (int i = 1; i < PacketSize-1; ++i) {
458  data[i] = Scalar(0);
459  convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
460  }
461  data[PacketSize-1] = Scalar(0);
462  convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
463  return internal::pload<PacketReturnType>(data);
464  }
465  }
466 
467  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
468  costPerCoeff(bool vectorized) const {
469  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
470  // We ignore the use of fused multiply-add.
471  const double convolve_compute_cost =
472  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
473  const double firstIndex_compute_cost =
474  NumDims *
475  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
476  TensorOpCost::DivCost<Index>());
477  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
478  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
479  m_kernelImpl.costPerCoeff(vectorized) +
480  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
481  PacketSize));
482  }
483 
484  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
485 
486  private:
487  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
488  Index startInput = 0;
489  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
490  for (int i = NumDims - 1; i > 0; --i) {
491  const Index idx = index / m_outputStride[i];
492  startInput += idx * m_inputStride[i];
493  index -= idx * m_outputStride[i];
494  }
495  } else {
496  for (int i = 0; i < NumDims - 1; ++i) {
497  const Index idx = index / m_outputStride[i];
498  startInput += idx * m_inputStride[i];
499  index -= idx * m_outputStride[i];
500  }
501  }
502  startInput += index;
503  return startInput;
504  }
505 
506  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
507  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
508  const Index input = firstIndex + j * m_indexStride[DimIndex];
509  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
510  if (DimIndex > 0) {
511  convolve(input, kernel, DimIndex-1, accum);
512  } else {
513  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
514  }
515  }
516  }
517 
518  template <typename Packet>
519  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
520  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
521  const Index input = firstIndex + j * m_indexStride[DimIndex];
522  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
523  if (DimIndex > 0) {
524  convolvePacket(input, kernel, DimIndex-1, accum);
525  } else {
526  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
527  }
528  }
529  }
530 
531  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
532  // Don't make a local copy of the kernel unless we have to (i.e. it's an
533  // expression that needs to be evaluated)
534  const Scalar* in_place = m_kernelImpl.data();
535  if (in_place) {
536  m_kernel = in_place;
537  m_local_kernel = false;
538  } else {
539  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
540  Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
541  typedef TensorEvalToOp<const KernelArgType> EvalTo;
542  EvalTo evalToTmp(local, m_kernelArg);
543  const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
544  internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
545 
546  m_kernel = local;
547  m_local_kernel = true;
548  }
549  }
550 
551  array<Index, NumDims> m_inputStride;
552  array<Index, NumDims> m_outputStride;
553 
554  array<Index, NumKernelDims> m_indexStride;
555  array<Index, NumKernelDims> m_kernelStride;
556  TensorEvaluator<InputArgType, Device> m_inputImpl;
557  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
558  Dimensions m_dimensions;
559 
560  KernelArgType m_kernelArg;
561  const Scalar* m_kernel;
562  bool m_local_kernel;
563  const Device EIGEN_DEVICE_REF m_device;
564 };
565 
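
For reference, the scalar path above (firstInput followed by the recursive convolve) reduces, in the 1-D case, to the textbook direct convolution loop: output[i] is the sum over j of input[i + j] * kernel[j]. A plain standalone sketch, with made-up data, of what coeff(index) accumulates:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical 1-D "valid" convolution; the recursive convolve() above
  // computes exactly one output[i] per call to coeff(i).
  std::vector<float> input = {1, 2, 3, 4, 5};
  std::vector<float> kernel = {0.5f, 0.5f};
  const std::size_t out_size = input.size() - kernel.size() + 1;  // 4

  std::vector<float> output(out_size, 0.0f);
  for (std::size_t i = 0; i < out_size; ++i) {
    for (std::size_t j = 0; j < kernel.size(); ++j) {
      // Mirrors: accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      output[i] += input[i + j] * kernel[j];
    }
  }

  for (float v : output) std::cout << v << " ";  // 1.5 2.5 3.5 4.5
  std::cout << "\n";
  return 0;
}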
566 
567 
568 
569 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
570 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
571 
572 template <int StaticKernelSize>
573 struct GetKernelSize {
574  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
575  return StaticKernelSize;
576  }
577 };
578 template <>
579 struct GetKernelSize<Dynamic> {
580  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
581  return kernelSize;
582  }
583 };
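
GetKernelSize lets the GPU kernels below be instantiated either with a compile-time kernel size (so the inner loops can be fully unrolled) or with Dynamic, in which case the runtime argument is used. The same dispatch idiom in isolation; the names and the sentinel value are made up and only stand in for Eigen's:

#include <iostream>

constexpr int kDynamic = -1;  // stand-in for Eigen's Dynamic sentinel

template <int StaticSize>
struct GetSize {
  int operator()(int /*runtimeSize*/) const { return StaticSize; }
};

template <>
struct GetSize<kDynamic> {
  int operator()(int runtimeSize) const { return runtimeSize; }
};

template <int StaticSize>
float dot_with_kernel(const float* data, const float* kernel, int runtimeSize) {
  float result = 0.0f;
  // With a static size the trip count is a compile-time constant,
  // so the compiler can unroll this loop completely.
  for (int k = 0; k < GetSize<StaticSize>()(runtimeSize); ++k) {
    result += data[k] * kernel[k];
  }
  return result;
}

int main() {
  const float data[4] = {1, 2, 3, 4};
  const float kernel[4] = {1, 1, 1, 1};
  std::cout << dot_with_kernel<4>(data, kernel, 4) << "\n";         // static size: 10
  std::cout << dot_with_kernel<kDynamic>(data, kernel, 4) << "\n";  // runtime size: 10
  return 0;
}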
584 
585 template <typename InputEvaluator, typename Index, typename InputDims,
586  int StaticKernelSize>
587 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
588  InputEvaluator eval,
589  const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
590  indexMapper,
591  const float* __restrict kernel, const int numPlanes, const int numX,
592  const int maxX, const int kernelSize, float* buffer) {
593 #if defined(EIGEN_HIPCC)
594  HIP_DYNAMIC_SHARED(float, s)
595 #else
596  extern __shared__ float s[];
597 #endif
598 
599  const int first_x = blockIdx.x * maxX;
600  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
601  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
602  const int num_x_output = last_x - first_x + 1;
603 
604  const int first_plane = blockIdx.y * blockDim.y;
605  const int plane_stride = blockDim.y * gridDim.y;
606 
607  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
608  // Load inputs to shared memory
609  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
610  const int plane_kernel_offset = threadIdx.y * num_x_input;
611  #pragma unroll
612  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
613  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
614  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
615  }
616 
617  __syncthreads();
618 
619  // Compute the convolution
620  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
621 
622  #pragma unroll
623  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
624  const int kernel_offset = plane_kernel_offset + i;
625  float result = 0.0f;
626  #pragma unroll
627  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
628  result += s[k + kernel_offset] * kernel[k];
629  }
630  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
631  buffer[tensor_index] = result;
632  }
633  __syncthreads();
634  }
635 };
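
Each block of the 1-D kernel above stages a tile of outputs plus the kernel "halo" in shared memory, so the per-row footprint is (maxX + kernelSize - 1) floats, replicated blockDim.y times; the launch code checks that against the device budget. A small worked sizing example (all numbers are illustrative, not queried from a real device):

#include <cstddef>
#include <iostream>

int main() {
  // Illustrative figures: a 48 KB shared-memory budget, a kernel of size 7,
  // four planes per block (block_size_y = 4), 1024 outputs per block in x.
  const std::size_t max_shared_mem = 48 * 1024;  // bytes
  const int kernel_size = 7;
  const int block_size_y = 4;
  const int maxX = 1024;

  // Same formula as the launch code below: tile + halo, per y-row of the block.
  const std::size_t shared_mem =
      static_cast<std::size_t>(block_size_y) * (maxX + kernel_size - 1) * sizeof(float);

  std::cout << shared_mem << " bytes needed, " << max_shared_mem
            << " bytes available\n";  // 16480 <= 49152
  return 0;
}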
636 
637 template <typename InputEvaluator, typename Index, typename InputDims,
638  int StaticKernelSizeX, int StaticKernelSizeY>
639 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
640  InputEvaluator eval,
641  const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
642  indexMapper,
643  const float* __restrict kernel, const int numPlanes, const int numX,
644  const int maxX, const int numY, const int maxY, const int kernelSizeX,
645  const int kernelSizeY, float* buffer) {
646 #if defined(EIGEN_HIPCC)
647  HIP_DYNAMIC_SHARED(float, s)
648 #else
649  extern __shared__ float s[];
650 #endif
651 
652  const int first_x = blockIdx.x * maxX;
653  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
654  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
655  const int num_x_output = last_x - first_x + 1;
656 
657  const int first_y = blockIdx.y * maxY;
658  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
659  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
660  const int num_y_output = last_y - first_y + 1;
661 
662  const int first_plane = blockIdx.z * blockDim.z;
663  const int plane_stride = blockDim.z * gridDim.z;
664 
665  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
666 
667  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
668  const int plane_kernel_offset = threadIdx.z * num_y_input;
669 
670  // Load inputs to shared memory
671  #pragma unroll
672  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
673  const int input_offset = num_x_input * (j + plane_kernel_offset);
674  #pragma unroll
675  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
676  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
677  s[i + input_offset] = eval.coeff(tensor_index);
678  }
679  }
680 
681  __syncthreads();
682 
683  // Convolution
684  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
685 
686  #pragma unroll
687  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
688  #pragma unroll
689  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
690  float result = 0.0f;
691  #pragma unroll
692  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
693  const int kernel_offset = kernelSizeX * l;
694  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
695  #pragma unroll
696  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
697  result += s[k + input_offset] * kernel[k + kernel_offset];
698  }
699  }
700  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
701  buffer[tensor_index] = result;
702  }
703  }
704 
705  __syncthreads();
706  }
707 };
708 
709 template <typename InputEvaluator, typename Index, typename InputDims>
710 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
711  InputEvaluator eval,
712  const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
713  indexMapper,
714  const float* __restrict kernel, const size_t numPlanes, const size_t numX,
715  const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
716  const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
717  const size_t kernelSizeZ, float* buffer) {
718 #if defined(EIGEN_HIPCC)
719  HIP_DYNAMIC_SHARED(float, s)
720 #else
721  extern __shared__ float s[];
722 #endif
723 
724  // Load inputs to shared memory
725  const int first_x = blockIdx.x * maxX;
726  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
727  const int num_x_input = last_x - first_x + kernelSizeX;
728 
729  const int first_y = blockIdx.y * maxY;
730  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
731  const int num_y_input = last_y - first_y + kernelSizeY;
732 
733  const int first_z = blockIdx.z * maxZ;
734  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
735  const int num_z_input = last_z - first_z + kernelSizeZ;
736 
737  for (int p = 0; p < numPlanes; ++p) {
738 
739  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
740  const int plane_kernel_offset = 0;
741 
742  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
743  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
744  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
745  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
746  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
747  }
748  }
749  }
750 
751  __syncthreads();
752 
753  // Convolution
754  const int num_z_output = last_z - first_z + 1;
755  const int num_y_output = last_y - first_y + 1;
756  const int num_x_output = last_x - first_x + 1;
757  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
758 
759  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
760  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
761  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
762  float result = 0.0f;
763  for (int n = 0; n < kernelSizeZ; ++n) {
764  for (int m = 0; m < kernelSizeY; ++m) {
765  for (int l = 0; l < kernelSizeX; ++l) {
766  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
767  }
768  }
769  }
770  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
771  buffer[tensor_index] = result;
772  }
773  }
774  }
775  __syncthreads();
776  }
777 };
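
The 3-D kernel linearizes its shared-memory tile x-fastest: element (i, j, k) lives at s[i + num_x_input * (j + num_y_input * k)]. A tiny standalone check of that indexing, with made-up tile extents:

#include <iostream>
#include <vector>

int main() {
  // Hypothetical tile extents for the shared-memory buffer used above.
  const int num_x_input = 4, num_y_input = 3, num_z_input = 2;
  std::vector<float> s(num_x_input * num_y_input * num_z_input);

  // Store each element's own linear offset so the flattening is easy to inspect.
  for (int k = 0; k < num_z_input; ++k)
    for (int j = 0; j < num_y_input; ++j)
      for (int i = 0; i < num_x_input; ++i)
        s[i + num_x_input * (j + num_y_input * k)] =
            static_cast<float>(i + num_x_input * (j + num_y_input * k));

  // (i=1, j=2, k=1)  ->  1 + 4 * (2 + 3 * 1) = 21
  std::cout << s[1 + num_x_input * (2 + num_y_input * 1)] << "\n";  // 21
  return 0;
}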
778 
779 
780 
781 template<typename Indices, typename InputArgType, typename KernelArgType>
782 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
783 {
784  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
785 
786  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
787  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
788  typedef typename XprType::Index Index;
789  typedef DSizes<Index, NumDims> Dimensions;
790  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
791 
792  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
793  enum {
794  IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
795  PacketAccess = false,
796  BlockAccess = false,
797  PreferBlockAccess = false,
798  CoordAccess = false, // to be implemented
799  RawAccess = false
800  };
801 
802  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
803  typedef internal::TensorBlockNotImplemented TensorBlock;
804  //===--------------------------------------------------------------------===//
805 
806  TensorEvaluator(const XprType& op, const GpuDevice& device)
807  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
808  {
809  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
810 
811  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
812  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
813 
814  m_dimensions = m_inputImpl.dimensions();
815  for (int i = 0; i < NumKernelDims; ++i) {
816  const Index index = op.indices()[i];
817  const Index input_dim = input_dims[index];
818  const Index kernel_dim = kernel_dims[i];
819  const Index result_dim = input_dim - kernel_dim + 1;
820  m_dimensions[index] = result_dim;
821  }
822  }
823 
824  typedef typename XprType::CoeffReturnType CoeffReturnType;
825  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
826  typedef typename InputArgType::Scalar Scalar;
827  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
828 
829  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
830 
831  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
832  preloadKernel();
833  m_inputImpl.evalSubExprsIfNeeded(NULL);
834  if (data) {
835  executeEval(data);
836  return false;
837  } else {
838  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
839  executeEval(m_buf);
840  return true;
841  }
842  }
843 
844  EIGEN_STRONG_INLINE void cleanup() {
845  m_inputImpl.cleanup();
846  if (m_buf) {
847  m_device.deallocate(m_buf);
848  m_buf = NULL;
849  }
850  if (m_local_kernel) {
851  m_device.deallocate((void*)m_kernel);
852  m_local_kernel = false;
853  }
854  m_kernel = NULL;
855  }
856 
857  EIGEN_STRONG_INLINE void preloadKernel() {
858  // Don't make a local copy of the kernel unless we have to (i.e. it's an
859  // expression that needs to be evaluated)
860  const Scalar* in_place = m_kernelImpl.data();
861  if (in_place) {
862  m_kernel = in_place;
863  m_local_kernel = false;
864  } else {
865  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
866  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
867  typedef TensorEvalToOp<const KernelArgType> EvalTo;
868  EvalTo evalToTmp(local, m_kernelArg);
869  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
870  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
871 
872  m_kernel = local;
873  m_local_kernel = true;
874  }
875  }
876 
877  static unsigned int ceil(unsigned int num, unsigned int denom) {
878  const unsigned int rounded_toward_zero = num / denom;
879  if (num > rounded_toward_zero * denom) {
880  return rounded_toward_zero + 1;
881  }
882  return rounded_toward_zero;
883  }
884 
885  void executeEval(Scalar* data) const {
886  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
887 
888  const int maxSharedMem = m_device.sharedMemPerBlock();
889  const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
890  const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
891  const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
892  const int warpSize = 32;
893 
894  switch (NumKernelDims) {
895  case 1: {
896  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
897 
898  const int numX = dimensions()[m_indices[0]];
899  const int numP = dimensions().TotalSize() / numX;
900  int maxX;
901  dim3 block_size;
902 
903  const int single_stride_dim =
904  static_cast<int>(Layout) == static_cast<int>(ColMajor)
905  ? 0
906  : m_inputImpl.dimensions().rank() - 1;
907  if (m_indices[0] == single_stride_dim) {
908  // Maximize the reuse
909  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
910  maxX = numext::mini<int>(inner_dim, numX);
911  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
912  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
913  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
914  }
915  else {
916  // Read as much as possible along the innermost dimension, that is the plane
917  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
918  const int maxP = numext::mini<int>(inner_dim, numP);
919  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
920 
921  block_size.x = numext::mini(warpSize, maxX);
922  block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
923  }
924 
925  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
926  gpu_assert(shared_mem <= maxSharedMem);
927 
928  const int num_x_blocks = ceil(numX, maxX);
929  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
930  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
931 
932  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
933 
934 
935  //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
936 
937  const array<Index, 1> indices(m_indices[0]);
938  const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
939  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
940  m_inputImpl.dimensions(), kernel_dims, indices);
941  switch(kernel_size) {
942  case 4: {
943  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
944  break;
945  }
946  case 7: {
947  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
948  break;
949  }
950  default: {
951  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
952  }
953  }
954  break;
955  }
956 
957  case 2: {
958  const int idxX =
959  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
960  const int idxY =
961  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
962  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
963  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
964 
965  const int numX = dimensions()[m_indices[idxX]];
966  const int numY = dimensions()[m_indices[idxY]];
967  const int numP = dimensions().TotalSize() / (numX*numY);
968 
969  const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
970 
971  // Snap maxX to warp size
972  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
973  const int maxX = numext::mini<int>(inner_dim, numX);
974  const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
975  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
976 
977  dim3 block_size;
978  block_size.x = numext::mini(1024, maxX);
979  block_size.y = numext::mini<int>(1024/block_size.x, maxY);
980  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
981 
982  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
983  gpu_assert(shared_mem <= maxSharedMem);
984 
985  const int num_x_blocks = ceil(numX, maxX);
986  const int num_y_blocks = ceil(numY, maxY);
987  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
988  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
989 
990  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
991 
992 
993  //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
994 
995  const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
996  const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
997  m_kernelImpl.dimensions()[idxY]);
998  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
999  m_inputImpl.dimensions(), kernel_dims, indices);
1000  switch (kernel_size_x) {
1001  case 4: {
1002  switch (kernel_size_y) {
1003  case 7: {
1004  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
1005  break;
1006  }
1007  default: {
1008  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
1009  break;
1010  }
1011  }
1012  break;
1013  }
1014  case 7: {
1015  switch (kernel_size_y) {
1016  case 4: {
1017  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1018  break;
1019  }
1020  default: {
1021  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1022  break;
1023  }
1024  }
1025  break;
1026  }
1027  default: {
1028  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1029  break;
1030  }
1031  }
1032  break;
1033  }
1034 
1035  case 3: {
1036  const int idxX =
1037  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1038  const int idxY =
1039  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1040  const int idxZ =
1041  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1042 
1043  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1044  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1045  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1046 
1047  const int numX = dimensions()[m_indices[idxX]];
1048  const int numY = dimensions()[m_indices[idxY]];
1049  const int numZ = dimensions()[m_indices[idxZ]];
1050  const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1051 
1052  const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1053  const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1054  const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1055 
1056  dim3 block_size;
1057  block_size.x = numext::mini(32, maxX);
1058  block_size.y = numext::mini(32, maxY);
1059  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1060  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1061 
1062  const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1063  gpu_assert(shared_mem <= maxSharedMem);
1064 
1065  //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1066  const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1067  m_indices[idxZ]);
1068  const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1069  m_kernelImpl.dimensions()[idxY],
1070  m_kernelImpl.dimensions()[idxZ]);
1071  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1072  m_inputImpl.dimensions(), kernel_dims, indices);
1073 
1074  LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1075  break;
1076  }
1077 
1078  default: {
1079  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1080  }
1081  }
1082  }
1083 
1084  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1085  {
1086  eigen_assert(m_buf);
1087  eigen_assert(index < m_dimensions.TotalSize());
1088  return m_buf[index];
1089  }
1090 
1091  template<int LoadMode>
1092  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1093  {
1094  eigen_assert(m_buf);
1095  eigen_assert(index < m_dimensions.TotalSize());
1096  return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1097  }
1098 
1099  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1100  costPerCoeff(bool vectorized) const {
1101  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1102  // model.
1103  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1104  // We ignore the use of fused multiply-add.
1105  const double convolve_compute_cost =
1106  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1107  const double firstIndex_compute_cost =
1108  NumDims *
1109  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1110  TensorOpCost::DivCost<Index>());
1111  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1112  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1113  m_kernelImpl.costPerCoeff(vectorized) +
1114  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1115  PacketSize));
1116  }
1117 
1118  private:
1119  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1120  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1121  KernelArgType m_kernelArg;
1122  Indices m_indices;
1123  Dimensions m_dimensions;
1124  Scalar* m_buf;
1125  const Scalar* m_kernel;
1126  bool m_local_kernel;
1127 
1128  const GpuDevice& m_device;
1129 };
1130 #endif
1131 
1132 
1133 } // end namespace Eigen
1134 
1135 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H