#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

#include "./InternalHeaderCheck.h"

static const int kGpuScratchSize = 1024;

// An interface abstracting the HIP / CUDA stream that GpuDevice runs on.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the device on which the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize bytes.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initially set to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, so that it
  // is always 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};

class GpuDeviceProperties {
 public:
  GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }

  EIGEN_STRONG_INLINE bool isInitialized() const {
    return initialized_;
  }

  void initialize() {
    // Atomics and thread fences are used instead of std::mutex because
    // mutexes do not compile with nvcc.
    if (first_.exchange(false)) {
      // We are the first thread to reach this point: query and cache the
      // properties of all visible devices.
      int num_devices;
      gpuError_t status = gpuGetDeviceCount(&num_devices);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the number of GPU devices: "
                  << gpuGetErrorString(status) << std::endl;
        gpu_assert(status == gpuSuccess);
      }
      device_properties_ = new gpuDeviceProp_t[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = gpuGetDeviceProperties(&device_properties_[i], i);
        if (status != gpuSuccess) {
          std::cerr << "Failed to initialize GPU device #" << i << ": "
                    << gpuGetErrorString(status) << std::endl;
          gpu_assert(status == gpuSuccess);
        }
      }
      std::atomic_thread_fence(std::memory_order_release);
      initialized_ = true;
    } else {
      // Wait for the other thread to finish the initialization.
      while (!initialized_) {
        std::atomic_thread_fence(std::memory_order_acquire);
        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
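// Query sketch (illustrative, not part of the original header): once the
// properties cache has been initialized, per-device lookups are cheap reads.
//
//   const gpuDeviceProp_t& props = Eigen::GetGpuDeviceProperties(/*device=*/0);
//   int sm_count    = props.multiProcessorCount;   // number of multiprocessors
//   int max_threads = props.maxThreadsPerBlock;    // max threads per block
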
static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuError_t status = gpuGetDevice(&device_);
    if (status != gpuSuccess) {
      std::cerr << "Failed to get the GPU devices "
                << gpuGetErrorString(status) << std::endl;
      gpu_assert(status == gpuSuccess);
    }
  }
  // Use the default stream on the specified device.
  GpuStreamDevice(int device)
      : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. It is the caller's responsibility to ensure that
  // the stream can run on the specified device. If no device is specified, the
  // stream is assumed to be associated with the current device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuError_t status = gpuGetDevice(&device_);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the GPU devices "
                  << gpuGetErrorString(status) << std::endl;
        gpu_assert(status == gpuSuccess);
      }
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
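// Construction sketch (illustrative, not part of the original header): the
// constructors cover wrapping the default stream or a caller-owned stream.
//
//   Eigen::GpuStreamDevice dev0;              // default stream, current device
//   Eigen::GpuStreamDevice dev1(1);           // default stream, device #1
//   gpuStream_t stream;                       // created by the caller, e.g. with
//                                             // cudaStreamCreate()/hipStreamCreate()
//   Eigen::GpuStreamDevice dev2(&stream, 1);  // caller-owned stream, device #1
//
// The stream is not owned by GpuStreamDevice, so the caller must keep it alive
// for the lifetime of dev2 and destroy it afterwards.
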
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks)
      : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }

  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    stream_->deallocate(buffer);
  }

  template<typename Type>
  EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream());
    gpu_assert(err == gpuSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    gpu_assert(err == gpuSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  template<typename T>
  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    const size_t count = end - begin;
    // Split the value into bytes and run a memset per byte position.
    const int value_size = sizeof(value);
    char* buffer = (char*)begin;
    char* value_bytes = (char*)(&value);
    gpuError_t err;

    // If all bytes of the value are identical, a single memset is much faster.
    bool use_single_memset = true;
    for (int i = 1; i < value_size; ++i) {
      if (value_bytes[i] != value_bytes[0]) {
        use_single_memset = false;
      }
    }

    if (use_single_memset) {
      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
      gpu_assert(err == gpuSuccess);
    } else {
      for (int b = 0; b < value_size; ++b) {
        err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream());
        gpu_assert(err == gpuSuccess);
      }
    }
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // There is no L3 cache on CUDA/HIP devices, so report the L1 size.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: "
                << gpuGetErrorString(err) << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  inline bool ok() const {
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
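// Usage sketch (illustrative, assumes the unsupported Eigen::Tensor module is
// included and a CUDA/HIP capable build): a GpuDevice is typically built on
// top of a GpuStreamDevice and handed to tensor expressions via .device().
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//
//   float* d_in  = static_cast<float*>(gpu_device.allocate(100 * sizeof(float)));
//   float* d_out = static_cast<float*>(gpu_device.allocate(100 * sizeof(float)));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, 100), out(d_out, 100);
//   out.device(gpu_device) = in * 2.0f;   // evaluated asynchronously on the stream
//
//   gpu_device.synchronize();             // wait for the expression to finish
//   gpu_device.deallocate(d_in);
//   gpu_device.deallocate(d_out);
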
#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
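// Launch sketch (illustrative): the macro expands to a HIP or CUDA launch on
// the device's stream. For a hypothetical kernel
// __global__ void scale(float* data, int n, float factor), a launch could be:
//
//   LAUNCH_GPU_KERNEL(scale, /*gridsize=*/32, /*blocksize=*/256,
//                     /*sharedmem=*/0, gpu_device, d_data, n, 2.0f);
//
// where gpu_device is an Eigen::GpuDevice and d_data a device pointer; scale,
// d_data and n are placeholder names, not part of this header.
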
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  gpu_assert(status == gpuSuccess);
#endif
}