// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

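/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief SYCL evaluators for 1D, 2D and 3D tensor convolutions.
 *
 * Each kernel tiles the input into work-group local memory (tile extent +
 * kernel extent - 1 per convolved dimension) and lets every work-item
 * accumulate one output coefficient.
 *
 * A minimal usage sketch, assuming a SYCL-enabled build; the sizes and
 * variable names below are illustrative, not part of this file:
 *
 * \code
 * Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
 * Eigen::SyclDevice sycl_device(&queue_interface);
 * Eigen::Tensor<float, 2> input(10, 10), kernel(3, 3), result(8, 8);
 * float* d_in = static_cast<float*>(sycl_device.allocate(input.size() * sizeof(float)));
 * float* d_k = static_cast<float*>(sycl_device.allocate(kernel.size() * sizeof(float)));
 * float* d_out = static_cast<float*>(sycl_device.allocate(result.size() * sizeof(float)));
 * Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, 10, 10), k(d_k, 3, 3), out(d_out, 8, 8);
 * sycl_device.memcpyHostToDevice(d_in, input.data(), input.size() * sizeof(float));
 * sycl_device.memcpyHostToDevice(d_k, kernel.data(), kernel.size() * sizeof(float));
 * Eigen::array<Eigen::Index, 2> dims({0, 1});
 * out.device(sycl_device) = in.convolve(k, dims);  // valid convolution: 10 - 3 + 1 = 8
 * sycl_device.memcpyDeviceToHost(result.data(), d_out, result.size() * sizeof(float));
 * \endcode
 */
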
enum class convolution_type { CONV1D, CONV2D, CONV3D };

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;

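// 1D specialization: each work-group stages a tile of
// (local_range[0] + kernelSize - 1) input coefficients per plane into local
// memory, then each work-item accumulates one output coefficient.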
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // number of input coefficients that must be staged in shared (local) memory for each plane
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // fill the local memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution (first_output_start is the output start along x)
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};

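// 2D specialization: the same tiling scheme in two dimensions; the local tile
// is (local_range[0] + kernel_size[0] - 1) x (local_range[1] + kernel_size[1] - 1)
// per plane, and the third nd_range dimension indexes the planes.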
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // number of input coefficients that must be staged in shared (local) memory for each plane
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};

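// 3D specialization: all three nd_range dimensions are spent on the kernel
// dimensions, so the kernel iterates over the remaining planes (numP)
// serially, with a barrier between iterations because the local tile is
// reused for each plane.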
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // fill the local memory
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution
      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};

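// Evaluator for TensorConvolutionOp on a SYCL device: computes the output
// dimensions, materializes the kernel on the device if necessary, and
// dispatches one of the EigenConvolutionKernel specializations above.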
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Eigen::SyclDevice>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;
  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
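    // A "valid" convolution shrinks each convolved dimension to
    // input - kernel + 1 (e.g. a 10x10 input convolved with a 3x3 kernel
    // over both dimensions yields an 8x8 result).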
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }
  // accessors used by the SYCL executor to reach the device and the evaluated buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{1, 1};
        auto local_range = cl::sycl::range<2>{1, 1};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
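        // Local memory holds the tile plus the kernel halo for each plane in
        // the work-group: e.g. local_range = {64, 4} with kernel_size = 5
        // requires (64 + 5 - 1) * 4 = 272 scalars (illustrative numbers).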
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])).wait();
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

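        // The 2D tile needs a halo in both convolved dimensions: e.g.
        // local_range = {16, 16, 1} with a 3x3 kernel requires
        // (16 + 3 - 1) * (16 + 3 - 1) * 1 = 324 scalars (illustrative numbers).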
        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}).wait();
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP).wait();
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H