#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace TensorSycl {
namespace internal {
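/// SyclDeviceInfo caches the cl::sycl::info device queries made below so the
/// rest of the backend can read them without touching the SYCL runtime on
/// every call.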
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(queue.get_device()
                           .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(queue.get_device()
                                .template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
        max_mem_alloc_size(queue.get_device()
                               .template get_info<cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(queue.get_device()
                                .template get_info<cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(queue.get_device()
                           .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device().get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(queue.get_device()
                          .template get_info<cl::sycl::info::device::vendor>()) {}
  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl
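// Returns the SYCL devices the Eigen SYCL backend will use. When
// EIGEN_SYCL_USE_DEFAULT_SELECTOR is defined only the default-selected device
// is returned; otherwise every device on every platform is enumerated and the
// combinations rejected by unsupported_condition below are filtered out.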
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
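// QueueInterface wraps the in-order SYCL queue used by the tensor evaluators
// together with USM allocation helpers, a host thread pool that runs
// completion callbacks, and the cached SyclDeviceInfo for the queue's device.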
class QueueInterface {
 public:
  /// Create the queue from a device, or from a selector that picks one.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
        m_thread_pool(num_threads),
        m_device_info(m_queue) {}
  /// Same as above, but installs the default async handler, which records any
  /// asynchronous SYCL exception in exception_caught_.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}
  explicit QueueInterface(
      const cl::sycl::queue &q,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}
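  // Device memory management. Allocations are SYCL USM device pointers tied to
  // m_queue; allocate() honours EIGEN_MAX_ALIGN_BYTES when it is non-zero.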
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES,
                                                  num_bytes, m_queue);
#else
    return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
#endif
  }
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return data;
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }
  EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const {
    deallocate_temp(const_cast<void *>(p));
  }
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    cl::sycl::free(p, m_queue);
  }
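  // Copies between host and device memory. The *ToDevice/*ToHost variants are
  // asynchronous when a callback is supplied (the wait happens on the thread
  // pool, see synchronize_and_callback); memcpy/memset/fill block until done.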
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    if (n == 0) {
      if (callback) callback();
      return;
    }
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    m_queue.memcpy(dst, src, n).wait();
  }
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    m_queue.memset(data, c, n).wait();
  }
  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    const size_t count = end - begin;
    m_queue.fill(begin, value, count).wait();
  }
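  // Kernel launchers: each submits one command group that creates a local
  // (work-group) scratch accessor of scratchSize elements and runs the given
  // sycl_kernel functor over thread_range. They differ only in the operand
  // pointers forwarded to the kernel (lhs/rhs/out, in/out, or a single input).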
  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(
      const Lhs &lhs, const Rhs &rhs, OutPtr outptr, Range thread_range,
      Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range,
                       sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(
      const InPtr &inptr, OutPtr outptr, Range thread_range,
      Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range,
                       sycl_kernel(scratch, inptr, outptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(
      const InPtr &inptr, Range thread_range, Index scratchSize,
      T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }
  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }
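  // parallel_for_setup picks a power-of-two work-group (tile/local) size
  // capped by EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 and
  // rounds each global range up to a multiple of the local range. The 1D, 2D
  // and 3D overloads apply the same logic per dimension.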
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      GRange = tileSize;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
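  // Device capability queries. has_local_memory() can be forced on/off with
  // EIGEN_SYCL_LOCAL_MEM / EIGEN_SYCL_NO_LOCAL_MEM; the remaining getters
  // simply read the values cached in m_device_info.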
  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // SYCL/OpenCL does not expose this directly; keep a conservative constant.
    return 2;
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }
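  // Rounds wGSize to a power of two using the classic bit-smearing trick:
  // up to the next power of two when roundUp is true, otherwise down to the
  // largest power of two that does not exceed it.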
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }
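  // When a callback is given, the event wait (and the callback itself) is
  // pushed onto the host thread pool so the caller is not blocked; otherwise
  // the whole queue is drained synchronously. sycl_async_handler records
  // whether any asynchronous SYCL exception was delivered.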
  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }
  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }
  bool exception_caught_ = false;

  mutable cl::sycl::queue m_queue;
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
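// Thin base that exposes the (non-owning) QueueInterface pointer to the
// device types below; the caller keeps ownership of the QueueInterface.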
struct SyclDeviceBase {
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};
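// SyclDevice is the device type handed to the tensor expression evaluators;
// every member simply forwards to the shared QueueInterface. Typical usage
// (illustrative sketch only; the selector passed to QueueInterface and the
// tensor expression are up to the caller):
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   out_tensor.device(sycl_device) = in_tensor * 2.0f;
//   sycl_device.synchronize();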
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  // Tile/range setup helpers, forwarded to QueueInterface.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return queue_stream()->get(data);
  }
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }
  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    queue_stream()->fill(begin, end, value);
  }
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // No attempt is made to model an L2/L3 cache on SYCL devices.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }
  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
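  // Kernel launcher forwarders; the template arguments select the kernel
  // functor and output scalar, everything else is passed through unchanged.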
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
    return queue_stream()
        ->template binary_kernel_launcher<OutScalar, KernelType>(var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
    return queue_stream()
        ->template unary_kernel_launcher<OutScalar, KernelType>(var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
    return queue_stream()
        ->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H