TensorDeviceSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace TensorSycl {
namespace internal {

struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes<3>>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl

// All devices that support OpenCL and can consume SPIR or SPIR-V (even an AMD
// CPU driven by the Intel OpenCL runtime) can use the Eigen SYCL backend, and
// consequently TensorFlow via the Eigen SYCL backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
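// Example (illustrative sketch, not part of this header): enumerating the
// supported devices and building a plain SYCL queue for the first one. The
// names `devices` and `q` are assumptions introduced for this example.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   if (!devices.empty()) {
//     cl::sycl::queue q(devices[0]);
//     std::cout << devices[0].get_info<cl::sycl::info::device::name>()
//               << std::endl;
//   }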

class QueueInterface {
 public:
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
        m_thread_pool(num_threads),
        m_device_info(m_queue) {}

  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(
            dev_or_sel,
            [this](cl::sycl::exception_list l) {
              this->exception_caught_ = this->sycl_async_handler(l);
            },
            num_threads) {}

  explicit QueueInterface(
      const cl::sycl::queue &q,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES,
                                                  num_bytes, m_queue);
#else
    return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
#endif
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }

  EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const {
    deallocate_temp(const_cast<void *>(p));
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    cl::sycl::free(p, m_queue);
  }
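
  // Example (illustrative sketch, not part of this header): a typical
  // allocate / copy / free round-trip through a QueueInterface. The names
  // `queue_interface`, `host_in`, `host_out` and `dev` are assumptions made
  // for this example only.
  //
  //   Eigen::QueueInterface queue_interface(
  //       Eigen::get_sycl_supported_devices()[0]);
  //   float host_in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  //   float host_out[8];
  //   float *dev =
  //       static_cast<float *>(queue_interface.allocate(sizeof(host_in)));
  //   queue_interface.memcpyHostToDevice(dev, host_in, sizeof(host_in), {});
  //   queue_interface.memcpyDeviceToHost(host_out, dev, sizeof(host_out), {});
  //   queue_interface.synchronize();
  //   queue_interface.deallocate(dev);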

  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    if (n == 0) {
      if (callback) callback();
      return;
    }
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }

  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    if (n == 0) {
      return;
    }
    m_queue.memcpy(dst, src, n).wait();
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    if (n == 0) {
      return;
    }
    m_queue.memset(data, c, n).wait();
  }

  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    if (begin == end) {
      return;
    }
    const size_t count = end - begin;
    m_queue.fill(begin, value, count).wait();
  }

  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(
      const Lhs &lhs, const Rhs &rhs, OutPtr outptr, Range thread_range,
      Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range,
                       sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };

    return m_queue.submit(kernel_functor);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr,
                                                            OutPtr &outptr,
                                                            Range thread_range,
                                                            Index scratchSize,
                                                            T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range,
                       sycl_kernel(scratch, inptr, outptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }
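
  // Example (sketch of a hypothetical kernel functor, not one of Eigen's
  // internal kernels): a type usable as `sycl_kernel` with
  // unary_kernel_launcher must be constructible from the local-memory
  // accessor followed by the input pointer, the output pointer and any extra
  // arguments, and callable with the item type matching `thread_range`.
  // `ScaleByTwoKernel`, `Scratch`, `InPtr` and `OutPtr` are illustrative
  // names, not part of this header.
  //
  //   template <typename Scratch, typename InPtr, typename OutPtr>
  //   struct ScaleByTwoKernel {
  //     Scratch scratch;  // local memory, unused in this toy example
  //     InPtr in;
  //     OutPtr out;
  //     size_t n;
  //     ScaleByTwoKernel(Scratch s, InPtr i, OutPtr o, size_t n_)
  //         : scratch(s), in(i), out(o), n(n_) {}
  //     void operator()(cl::sycl::nd_item<1> item) const {
  //       const size_t gid = item.get_global_id(0);
  //       if (gid < n) out[gid] = in[gid] * 2;
  //     }
  //   };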

  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(
      const InPtr &inptr, Range thread_range, Index scratchSize,
      T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
    };

    return m_queue.submit(kernel_functor);
  }

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
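
  // Worked example (sketch): assuming the device's nearest power-of-two
  // work-group size is 256 and EIGEN_SYCL_LOCAL_THREAD_DIM0 and
  // EIGEN_SYCL_LOCAL_THREAD_DIM1 are both 16 (both assumptions about the
  // configuration), then
  //
  //   Index tileSize, rng, GRange;
  //   parallel_for_setup(Index(1000), tileSize, rng, GRange);
  //
  // yields tileSize = 256, rng = 1000 and GRange = 1024, i.e. the global
  // range is 1000 rounded up to the next multiple of the tile size.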

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }

  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }

  // This function returns the nearest power-of-two work-group size that is
  // less than or equal to the maximum device work-group size.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }

  // This function returns the nearest power of two to wGSize:
  // if roundUp is true, the result is >= wGSize;
  // otherwise, the result is <= wGSize.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
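
  // Worked example (sketch): getPowerOfTwo(48, false) returns 32, the largest
  // power of two <= 48; getPowerOfTwo(48, true) returns 64, the smallest power
  // of two >= 48; an exact power of two such as 64 is returned unchanged in
  // either mode.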

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

 protected:
  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  bool exception_caught_ = false;
  mutable cl::sycl::queue m_queue;
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};

struct SyclDeviceBase {
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};

// Here is a SYCL device struct which accepts the SYCL queue interface
// as an input.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return data;
  }

  // Runtime checks on device suitability could be added here; for now every
  // device is considered suitable.
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }

  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    queue_stream()->fill(begin, end, value);
  }

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the l2 cache for the time being, and
    // there is no l3 cache on sycl devices.
    return firstLevelCacheSize();
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }

  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }

  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }

  // This function checks if the runtime recorded an error for the
  // underlying stream device.
  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }

  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }

  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
    return queue_stream()
        ->template binary_kernel_launcher<OutScalar, KernelType>(var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
    return queue_stream()
        ->template unary_kernel_launcher<OutScalar, KernelType>(var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
    return queue_stream()
        ->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
  }
};
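
// Example (illustrative sketch, not part of this header): evaluating a tensor
// expression on a SyclDevice. It assumes the unsupported Eigen Tensor module
// (<unsupported/Eigen/CXX11/Tensor>) is included with EIGEN_USE_SYCL defined;
// the buffer and tensor names are assumptions made for this example.
//
//   Eigen::QueueInterface queue_interface(
//       Eigen::get_sycl_supported_devices()[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 1> in(64), out(64);
//   in.setRandom();
//   float *d_in = static_cast<float *>(
//       sycl_device.allocate(in.size() * sizeof(float)));
//   float *d_out = static_cast<float *>(
//       sycl_device.allocate(out.size() * sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 64);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 64);
//   sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
//   gpu_out.device(sycl_device) = gpu_in * 2.0f;
//   sycl_device.memcpyDeviceToHost(out.data(), d_out,
//                                  out.size() * sizeof(float));
//   sycl_device.synchronize();
//   sycl_device.deallocate(d_in);
//   sycl_device.deallocate(d_out);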
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H