HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_warp_functions.h
1/*
2Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
26__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
27 union { int i; unsigned u; float f; } tmp; tmp.u = src;
28 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
29 return tmp.u;
30}
31
32__device__ static inline float __hip_ds_bpermutef(int index, float src) {
33 union { int i; unsigned u; float f; } tmp; tmp.f = src;
34 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
35 return tmp.f;
36}
37
38__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
39 union { int i; unsigned u; float f; } tmp; tmp.u = src;
40 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
41 return tmp.u;
42}
43
44__device__ static inline float __hip_ds_permutef(int index, float src) {
45 union { int i; unsigned u; float f; } tmp; tmp.f = src;
46 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
47 return tmp.f;
48}
49
50#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
52
53template <int pattern>
54__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
55 union { int i; unsigned u; float f; } tmp; tmp.u = src;
56 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
57 return tmp.u;
58}
59
60template <int pattern>
61__device__ static inline float __hip_ds_swizzlef_N(float src) {
62 union { int i; unsigned u; float f; } tmp; tmp.f = src;
63 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
64 return tmp.f;
65}
66
67#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
69
70template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
71__device__ static inline int __hip_move_dpp_N(int src) {
72 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
73 bound_ctrl);
74}
75
76static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
77
78__device__
79inline
80int __shfl(int var, int src_lane, int width = warpSize) {
81 int self = __lane_id();
82 int index = (src_lane & (width - 1)) + (self & ~(width-1));
83 return __builtin_amdgcn_ds_bpermute(index<<2, var);
84}
85__device__
86inline
87unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
88 union { int i; unsigned u; float f; } tmp; tmp.u = var;
89 tmp.i = __shfl(tmp.i, src_lane, width);
90 return tmp.u;
91}
92__device__
93inline
94float __shfl(float var, int src_lane, int width = warpSize) {
95 union { int i; unsigned u; float f; } tmp; tmp.f = var;
96 tmp.i = __shfl(tmp.i, src_lane, width);
97 return tmp.f;
98}
99__device__
100inline
101double __shfl(double var, int src_lane, int width = warpSize) {
102 static_assert(sizeof(double) == 2 * sizeof(int), "");
103 static_assert(sizeof(double) == sizeof(uint64_t), "");
104
105 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
106 tmp[0] = __shfl(tmp[0], src_lane, width);
107 tmp[1] = __shfl(tmp[1], src_lane, width);
108
109 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
110 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
111 return tmp1;
112}
113__device__
114inline
115long __shfl(long var, int src_lane, int width = warpSize)
116{
117 #ifndef _MSC_VER
118 static_assert(sizeof(long) == 2 * sizeof(int), "");
119 static_assert(sizeof(long) == sizeof(uint64_t), "");
120
121 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
122 tmp[0] = __shfl(tmp[0], src_lane, width);
123 tmp[1] = __shfl(tmp[1], src_lane, width);
124
125 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
126 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
127 return tmp1;
128 #else
129 static_assert(sizeof(long) == sizeof(int), "");
130 return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
131 #endif
132}
133__device__
134inline
135unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
136 #ifndef _MSC_VER
137 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
138 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
139
140 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
141 tmp[0] = __shfl(tmp[0], src_lane, width);
142 tmp[1] = __shfl(tmp[1], src_lane, width);
143
144 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
145 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
146 return tmp1;
147 #else
148 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
149 return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
150 #endif
151}
152__device__
153inline
154long long __shfl(long long var, int src_lane, int width = warpSize)
155{
156 static_assert(sizeof(long long) == 2 * sizeof(int), "");
157 static_assert(sizeof(long long) == sizeof(uint64_t), "");
158
159 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
160 tmp[0] = __shfl(tmp[0], src_lane, width);
161 tmp[1] = __shfl(tmp[1], src_lane, width);
162
163 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
164 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
165 return tmp1;
166}
167__device__
168inline
169unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
170 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
171 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
172
173 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
174 tmp[0] = __shfl(tmp[0], src_lane, width);
175 tmp[1] = __shfl(tmp[1], src_lane, width);
176
177 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
178 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
179 return tmp1;
180}
181
182__device__
183inline
184int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
185 int self = __lane_id();
186 int index = self - lane_delta;
187 index = (index < (self & ~(width-1)))?self:index;
188 return __builtin_amdgcn_ds_bpermute(index<<2, var);
189}
190__device__
191inline
192unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
193 union { int i; unsigned u; float f; } tmp; tmp.u = var;
194 tmp.i = __shfl_up(tmp.i, lane_delta, width);
195 return tmp.u;
196}
197__device__
198inline
199float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
200 union { int i; unsigned u; float f; } tmp; tmp.f = var;
201 tmp.i = __shfl_up(tmp.i, lane_delta, width);
202 return tmp.f;
203}
204__device__
205inline
206double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
207 static_assert(sizeof(double) == 2 * sizeof(int), "");
208 static_assert(sizeof(double) == sizeof(uint64_t), "");
209
210 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
211 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
212 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
213
214 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
215 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
216 return tmp1;
217}
218__device__
219inline
220long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
221{
222 #ifndef _MSC_VER
223 static_assert(sizeof(long) == 2 * sizeof(int), "");
224 static_assert(sizeof(long) == sizeof(uint64_t), "");
225
226 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
227 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
228 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
229
230 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
231 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
232 return tmp1;
233 #else
234 static_assert(sizeof(long) == sizeof(int), "");
235 return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
236 #endif
237}
238
239__device__
240inline
241unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
242{
243 #ifndef _MSC_VER
244 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
245 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
246
247 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
248 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
249 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
250
251 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
252 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
253 return tmp1;
254 #else
255 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
256 return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
257 #endif
258}
259
260__device__
261inline
262long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
263{
264 static_assert(sizeof(long long) == 2 * sizeof(int), "");
265 static_assert(sizeof(long long) == sizeof(uint64_t), "");
266 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
267 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
268 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
269 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
270 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
271 return tmp1;
272}
273
274__device__
275inline
276unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
277{
278 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
279 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
280 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
281 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
282 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
283 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
284 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
285 return tmp1;
286}
287
288__device__
289inline
290int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
291 int self = __lane_id();
292 int index = self + lane_delta;
293 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
294 return __builtin_amdgcn_ds_bpermute(index<<2, var);
295}
296__device__
297inline
298unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
299 union { int i; unsigned u; float f; } tmp; tmp.u = var;
300 tmp.i = __shfl_down(tmp.i, lane_delta, width);
301 return tmp.u;
302}
303__device__
304inline
305float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
306 union { int i; unsigned u; float f; } tmp; tmp.f = var;
307 tmp.i = __shfl_down(tmp.i, lane_delta, width);
308 return tmp.f;
309}
310__device__
311inline
312double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
313 static_assert(sizeof(double) == 2 * sizeof(int), "");
314 static_assert(sizeof(double) == sizeof(uint64_t), "");
315
316 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
317 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
318 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
319
320 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
321 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
322 return tmp1;
323}
324__device__
325inline
326long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
327{
328 #ifndef _MSC_VER
329 static_assert(sizeof(long) == 2 * sizeof(int), "");
330 static_assert(sizeof(long) == sizeof(uint64_t), "");
331
332 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
333 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
334 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
335
336 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
337 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
338 return tmp1;
339 #else
340 static_assert(sizeof(long) == sizeof(int), "");
341 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
342 #endif
343}
344__device__
345inline
346unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
347{
348 #ifndef _MSC_VER
349 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
350 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
351
352 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
353 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
354 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
355
356 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
357 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
358 return tmp1;
359 #else
360 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
361 return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
362 #endif
363}
364__device__
365inline
366long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
367{
368 static_assert(sizeof(long long) == 2 * sizeof(int), "");
369 static_assert(sizeof(long long) == sizeof(uint64_t), "");
370 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
371 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
372 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
373 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
374 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
375 return tmp1;
376}
377__device__
378inline
379unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
380{
381 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
382 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
383 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
384 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
385 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
386 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
387 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
388 return tmp1;
389}
390
391__device__
392inline
393int __shfl_xor(int var, int lane_mask, int width = warpSize) {
394 int self = __lane_id();
395 int index = self^lane_mask;
396 index = index >= ((self+width)&~(width-1))?self:index;
397 return __builtin_amdgcn_ds_bpermute(index<<2, var);
398}
399__device__
400inline
401unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
402 union { int i; unsigned u; float f; } tmp; tmp.u = var;
403 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
404 return tmp.u;
405}
406__device__
407inline
408float __shfl_xor(float var, int lane_mask, int width = warpSize) {
409 union { int i; unsigned u; float f; } tmp; tmp.f = var;
410 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
411 return tmp.f;
412}
413__device__
414inline
415double __shfl_xor(double var, int lane_mask, int width = warpSize) {
416 static_assert(sizeof(double) == 2 * sizeof(int), "");
417 static_assert(sizeof(double) == sizeof(uint64_t), "");
418
419 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
420 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
421 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
422
423 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
424 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
425 return tmp1;
426}
427__device__
428inline
429long __shfl_xor(long var, int lane_mask, int width = warpSize)
430{
431 #ifndef _MSC_VER
432 static_assert(sizeof(long) == 2 * sizeof(int), "");
433 static_assert(sizeof(long) == sizeof(uint64_t), "");
434
435 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
436 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
437 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
438
439 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
440 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
441 return tmp1;
442 #else
443 static_assert(sizeof(long) == sizeof(int), "");
444 return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
445 #endif
446}
447__device__
448inline
449unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
450{
451 #ifndef _MSC_VER
452 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
453 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
454
455 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
456 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
457 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
458
459 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
460 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
461 return tmp1;
462 #else
463 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
464 return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
465 #endif
466}
467__device__
468inline
469long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
470{
471 static_assert(sizeof(long long) == 2 * sizeof(int), "");
472 static_assert(sizeof(long long) == sizeof(uint64_t), "");
473 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
474 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
475 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
476 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
477 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
478 return tmp1;
479}
480__device__
481inline
482unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
483{
484 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
485 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
486 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
487 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
488 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
489 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
490 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
491 return tmp1;
492}
493
494#endif