TensorDeviceGpu.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// This header defines the gpu* macros, which resolve to their equivalent
// hip* or cuda* versions depending on the compiler in use.
// A separate header (included at the end of this file) will undefine them all.
#include "TensorGpuHipCudaDefines.h"

#include "./InternalHeaderCheck.h"

namespace Eigen {

static const int kGpuScratchSize = 1024;

// This defines an interface that GpuDevice can use to run on
// HIP / CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of size 1k
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, so that
  // the semaphore is always equal to 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
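
// Illustrative usage sketch (not part of the original header): a
// StreamInterface is typically obtained through the GpuStreamDevice
// implementation defined further down in this file and then handed to a
// GpuDevice, which forwards allocations and stream queries to it.
//
//   Eigen::GpuStreamDevice stream;          // default stream on the current device
//   Eigen::GpuDevice gpu_device(&stream);   // device object used by Tensor expressions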

class GpuDeviceProperties {
 public:
  GpuDeviceProperties() :
      initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }

  EIGEN_STRONG_INLINE bool isInitialized() const {
    return initialized_;
  }

  void initialize() {
    if (!initialized_) {
      // Attempts to ensure proper behavior in the case of multiple threads
      // calling this function simultaneously. This would be trivial to
      // implement if we could use std::mutex, but unfortunately std::mutex
      // doesn't compile with nvcc, so we resort to atomics and thread fences
      // instead. Note that if the caller uses a compiler that doesn't support
      // c++11 we can't ensure that the initialization is thread safe.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #"
                      << i
                      << ": "
                      << gpuGetErrorString(status)
                      << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to initialize the properties.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
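
// Illustrative usage sketch (not part of the original header): reading the
// cached properties of device 0. multiProcessorCount and maxThreadsPerBlock
// are standard cudaDeviceProp / hipDeviceProp_t fields.
//
//   const gpuDeviceProp_t& prop = Eigen::GetGpuDeviceProperties(0);
//   const int sm_count    = prop.multiProcessorCount;
//   const int max_threads = prop.maxThreadsPerBlock;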

static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuError_t status = gpuGetDevice(&device_);
    if (status != gpuSuccess) {
      std::cerr << "Failed to get the current GPU device: "
                << gpuGetErrorString(status)
                << std::endl;
      gpu_assert(status == gpuSuccess);
    }
  }
  // Use the default stream on the specified device
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the caller's responsibility to
  // ensure that the stream can run on the specified device. If no device is
  // specified, the code assumes that the stream is associated with the
  // current GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuError_t status = gpuGetDevice(&device_);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the current GPU device: "
                  << gpuGetErrorString(status)
                  << std::endl;
        gpu_assert(status == gpuSuccess);
      }
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
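
// Illustrative usage sketch (not part of the original header): wrapping an
// existing stream (e.g. one created with cudaStreamCreate or hipStreamCreate).
// Only a pointer to the stream is stored, so the stream object must outlive
// the GpuStreamDevice, and the caller remains responsible for destroying it.
//
//   gpuStream_t my_stream = /* stream created by the caller */;
//   Eigen::GpuStreamDevice stream_device(&my_stream, /*device=*/0);
//   Eigen::GpuDevice gpu_device(&stream_device);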

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    stream_->deallocate(buffer);
  }

  template<typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
                                    stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst)
    EIGEN_UNUSED_VARIABLE(src)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(buffer)
    EIGEN_UNUSED_VARIABLE(c)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  template<typename T>
  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    const size_t count = end - begin;
    // Split value into bytes and run memset with stride.
    const int value_size = sizeof(value);
    char* buffer = (char*)begin;
    char* value_bytes = (char*)(&value);
    gpuError_t err;
    EIGEN_UNUSED_VARIABLE(err)

    // If all value bytes are equal, then a single memset can be much faster.
    bool use_single_memset = true;
    for (int i=1; i<value_size; ++i) {
      if (value_bytes[i] != value_bytes[0]) {
        use_single_memset = false;
      }
    }

    if (use_single_memset) {
      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
      gpu_assert(err == gpuSuccess);
    } else {
      for (int b=0; b<value_size; ++b) {
        err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
        gpu_assert(err == gpuSuccess);
      }
    }
#else
    EIGEN_UNUSED_VARIABLE(begin)
    EIGEN_UNUSED_VARIABLE(end)
    EIGEN_UNUSED_VARIABLE(value)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on HIP/CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: "
                << gpuGetErrorString(err)
                << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
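
// Illustrative usage sketch (not part of the original header): evaluating a
// tensor expression on the GPU from a translation unit compiled with
// EIGEN_USE_GPU by nvcc/hipcc. Buffer names and sizes are made up for the
// example; memcpyHostToDevice/memcpyDeviceToHost enqueue asynchronous copies
// on the underlying stream, and synchronize() waits for the stream to drain.
//
//   const int n = 1024;
//   std::vector<float> host_in(n, 1.0f), host_out(n);
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//
//   float* d_in  = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   float* d_out = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   gpu_device.memcpyHostToDevice(d_in, host_in.data(), n * sizeof(float));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, n), out(d_out, n);
//   out.device(gpu_device) = in * in + in.constant(2.0f);
//
//   gpu_device.memcpyDeviceToHost(host_out.data(), d_out, n * sizeof(float));
//   gpu_device.synchronize();
//   gpu_device.deallocate(d_in);
//   gpu_device.deallocate(d_out);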

#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif

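// Illustrative usage sketch (not part of the original header): launching a
// hand-written kernel through LAUNCH_GPU_KERNEL. The kernel, buffer and
// launch geometry below are made up for the example; gpu_device is a
// GpuDevice as constructed above.
//
//   __global__ void scale_kernel(float* data, int n, float factor) {
//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) data[i] *= factor;
//   }
//
//   const int block_size = gpu_device.maxGpuThreadsPerBlock();
//   const int num_blocks = (n + block_size - 1) / block_size;
//   LAUNCH_GPU_KERNEL(scale_kernel, num_blocks, block_size, 0, gpu_device, d_data, n, 2.0f);
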
// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif

} // end namespace Eigen

// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"

#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H