// HIP: Heterogeneous-computing Interface for Portability
// amd_hip_runtime.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

//#pragma once
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H

#include <hip/amd_detail/amd_hip_common.h>

#if !defined(__HIPCC_RTC__)
#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#endif // __cplusplus
#endif // !defined(__HIPCC_RTC__)

#ifdef __cplusplus
extern "C" {
#endif

// Build identification hooks for the ROCm debugger API (amd-dbgapi).

// Returns the build name string of this HIP runtime.
const char* amd_dbgapi_get_build_name();

// Returns the git hash string this HIP runtime was built from.
const char* amd_dbgapi_get_git_hash();

// Returns a numeric build id for this HIP runtime.
size_t amd_dbgapi_get_build_id();

#ifdef __cplusplus
} /* extern "C" */
#endif

//---
// Top part of file can be compiled with any compiler

#if !defined(__HIPCC_RTC__)
#ifdef __cplusplus
#include <cmath>
#include <cstdint>
#include <tuple>
#else
#include <math.h>
#include <stdint.h>
#endif // __cplusplus
#else
// hipRTC compiles without the C/C++ standard headers. Provide the minimal
// fixed-width integer typedefs this header relies on, unless the user opted
// out via __HIP_NO_STD_DEFS__.
#if !__HIP_NO_STD_DEFS__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
#endif // __HIP_NO_STD_DEFS__
#endif // !defined(__HIPCC_RTC__)

#if __HIP_CLANG_ONLY__

#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

// CUDA source compatibility alias.
#define CUDA_SUCCESS hipSuccess

#if !defined(__HIPCC_RTC__)
#include <hip/hip_runtime_api.h>
#include <hip/amd_detail/amd_hip_atomic.h>
#include <hip/amd_detail/amd_device_functions.h>
#include <hip/amd_detail/amd_surface_functions.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/texture_indirect_functions.h>
extern int HIP_TRACE_API;
#endif // !defined(__HIPCC_RTC__)

#ifdef __cplusplus
#include <hip/amd_detail/hip_ldg.h>
#endif

// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif

// Feature tests:
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:

// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)

// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif /* Device feature flags */


// __launch_bounds__ accepts one or two arguments; select_impl_ dispatches on
// arity to the matching AMDGPU attribute form.
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)

#if !defined(__HIPCC_RTC__)
// Host-side stub: there is no dynamic group-segment base pointer on the host.
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
#endif // !defined(__HIPCC_RTC__)

// End doxygen API:
//
// hip-clang functions
//
#if !defined(__HIPCC_RTC__)
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X

// Legacy launch-parameter placeholder kept for source compatibility.
typedef int hipLaunchParm;

// Recursively packs the address of each element of `formals` into the
// `_vargs` pointer array, in the layout hipLaunchKernel expects.
// Base case: all sizeof...(Ts) arguments have been packed.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}

// Recursive case: pack argument n, then recurse on n + 1.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    // Store the address of the n-th formal; the launch machinery reads the
    // argument value through this pointer, so `formals` must outlive the launch.
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}

// Converts the actual-argument tuple to the kernel's formal parameter types,
// statically asserting that the argument count matches the kernel signature.
// The kernel pointer is used only for type deduction and is never called.
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    std::tuple<Formals...> to_formals{std::move(actuals)};
    return to_formals;
}

#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
// Template form of the kernel launcher: validates the argument count/types at
// compile time, packs the argument addresses, and forwards to hipLaunchKernel.
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    auto tup_ = std::tuple<Args...>{args...};
    auto tup = validateArgsCountType(kernel, tup_);
    // `tup` stays alive until hipLaunchKernel returns, so the pointers packed
    // by pArgs remain valid for the duration of the launch call.
    void* _Args[count];
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
// Macro form: expands directly to a triple-chevron launch.
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
    do {                                                                                          \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);        \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif

#include <hip/hip_runtime_api.h>
#endif // !defined(__HIPCC_RTC__)

// OCKL device-library intrinsics backing the coordinate builtins below.
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);

// Function objects mapping a dimension index (0=x, 1=y, 2=z) to the
// corresponding coordinate; used to instantiate __HIP_Coordinates below.
struct __HIP_BlockIdx {
  __device__
  std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
};
struct __HIP_BlockDim {
  __device__
  std::uint32_t operator()(std::uint32_t x) const noexcept {
    return __ockl_get_local_size(x);
  }
};
struct __HIP_GridDim {
  __device__
  std::uint32_t operator()(std::uint32_t x) const noexcept {
    return __ockl_get_num_groups(x);
  }
};
struct __HIP_ThreadIdx {
  __device__
  std::uint32_t operator()(std::uint32_t x) const noexcept {
    return __ockl_get_local_id(x);
  }
};

#if defined(__HIPCC_RTC__)
// hipRTC does not include the host-side headers that define dim3, so provide
// it here for RTC compilations only. Unspecified components default to 1.
typedef struct dim3 {
  uint32_t x;
  uint32_t y;
  uint32_t z;
#ifdef __cplusplus
  constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
#endif
} dim3;
#endif // defined(__HIPCC_RTC__)

extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(unsigned int);

#ifdef __cplusplus
// Lazily-evaluated coordinate wrapper: instantiated with one of the functor
// types above (e.g. __HIP_ThreadIdx) so that reading .x/.y/.z expands to a
// call of F{}(0/1/2), mimicking CUDA's builtin coordinate variables.
template <typename F> struct __HIP_Coordinates {
  using R = decltype(F{}(0));

  struct __X {
    __device__ operator R() const noexcept { return F{}(0); }
    __device__ R operator+=(const R& rhs) { return F{}(0) + rhs; }
  };
  struct __Y {
    __device__ operator R() const noexcept { return F{}(1); }
    __device__ R operator+=(const R& rhs) { return F{}(1) + rhs; }
  };
  struct __Z {
    __device__ operator R() const noexcept { return F{}(2); }
    __device__ R operator+=(const R& rhs) { return F{}(2) + rhs; }
  };

  // weak linkage (unsupported by MSVC) presumably avoids multiple-definition
  // issues when this header is included in many translation units.
#if !defined(_MSC_VER)
  __attribute__((weak))
#endif
  __device__ static constexpr __X x{};
#if !defined(_MSC_VER)
  __attribute__((weak))
#endif
  __device__ static constexpr __Y y{};
#if !defined(_MSC_VER)
  __attribute__((weak))
#endif
  __device__ static constexpr __Z z{};

  // Materialize all three components into a concrete dim3.
  __device__ operator dim3() const { return dim3(x, y, z); }
};

// Out-of-class definitions for the static constexpr members (required when
// they are ODR-used in pre-C++17 translation units).
template <typename F>
constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;

// gridDim * blockDim (in either operand order) along one axis equals the
// global work size, so these overloads fold the product into a single
// __ockl_get_global_size call instead of two intrinsic calls and a multiply.
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
                        __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
  return __ockl_get_global_size(0);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
                        __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
  return __ockl_get_global_size(0);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
                        __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
  return __ockl_get_global_size(1);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
                        __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
  return __ockl_get_global_size(1);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
                        __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
  return __ockl_get_global_size(2);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
                        __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
  return __ockl_get_global_size(2);
}

// CUDA-style builtin coordinate variables.
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
#endif // __cplusplus

// Macro forms of the coordinate builtins, defined directly on the OCKL
// intrinsics (the redeclarations are harmless duplicates of those above).
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
#define hipThreadIdx_x (__ockl_get_local_id(0))
#define hipThreadIdx_y (__ockl_get_local_id(1))
#define hipThreadIdx_z (__ockl_get_local_id(2))

extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
#define hipBlockIdx_x (__ockl_get_group_id(0))
#define hipBlockIdx_y (__ockl_get_group_id(1))
#define hipBlockIdx_z (__ockl_get_group_id(2))

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
#define hipBlockDim_x (__ockl_get_local_size(0))
#define hipBlockDim_y (__ockl_get_local_size(1))
#define hipBlockDim_z (__ockl_get_local_size(2))

extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
#define hipGridDim_x (__ockl_get_num_groups(0))
#define hipGridDim_y (__ockl_get_num_groups(1))
#define hipGridDim_z (__ockl_get_num_groups(2))

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_math_functions.h>
#endif

#if __HIP_HCC_COMPAT_MODE__
// Define HCC work item functions in terms of HIP builtin variables.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) { \
  if (i==0) \
    return hip_var.x; \
  else if(i==1) \
    return hip_var.y; \
  else \
    return hip_var.z; \
}

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")

extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
// Absolute (global) work-item id along dimension `dim` (0=x, 1=y, 2=z).
inline __device__ __attribute__((always_inline)) unsigned int
hc_get_workitem_absolute_id(int dim)
{
  return (unsigned int)__ockl_get_global_id(dim);
}

#endif

#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if !defined(__HIPCC_RTC__)
// Support std::complex.
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
// Workaround for using libc++ with HIP-Clang.
// The following headers require the clang include path before the standard
// C++ include path. However the libc++ include path must come before the
// clang include path. To work around this, we pass -isystem with the parent
// directory of the clang include path instead of the clang include path itself.
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif // !defined(__HIPCC_RTC__)
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif // __HIP_CLANG_ONLY__

#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
// (Removed doxygen tooltip residue: "#define __host__ ... host_defines.h:166".
// An empty __host__ definition here would clobber the real HIP macro.)