// core/stdarch/crates/core_arch/src/amdgpu/mod.rs
1//! amdgpu intrinsics
2//!
3//! The reference is the [LLVM amdgpu guide] and the [LLVM implementation].
4//! The order of intrinsics here follows the order in the [LLVM implementation].
5//!
6//! [LLVM amdgpu guide]: https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics
7//! [LLVM implementation]: https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
8
// Raw bindings to the `llvm.amdgcn.*` intrinsics wrapped by the public
// functions below. The `"unadjusted"` ABI passes arguments to the LLVM
// intrinsics without any calling-convention adjustment. Declarations marked
// `safe` have no preconditions; the remaining ones require constant and/or
// wavefront-uniform operands and are only reachable through `unsafe` wrappers.
#[allow(improper_ctypes)]
unsafe extern "unadjusted" {
    // Workitem/workgroup identification and dispatch queries.
    #[link_name = "llvm.amdgcn.workitem.id.x"]
    safe fn llvm_workitem_id_x() -> u32;
    #[link_name = "llvm.amdgcn.workitem.id.y"]
    safe fn llvm_workitem_id_y() -> u32;
    #[link_name = "llvm.amdgcn.workitem.id.z"]
    safe fn llvm_workitem_id_z() -> u32;

    #[link_name = "llvm.amdgcn.workgroup.id.x"]
    safe fn llvm_workgroup_id_x() -> u32;
    #[link_name = "llvm.amdgcn.workgroup.id.y"]
    safe fn llvm_workgroup_id_y() -> u32;
    #[link_name = "llvm.amdgcn.workgroup.id.z"]
    safe fn llvm_workgroup_id_z() -> u32;

    #[link_name = "llvm.amdgcn.groupstaticsize"]
    safe fn llvm_groupstaticsize() -> u32;
    #[link_name = "llvm.amdgcn.dispatch.id"]
    safe fn llvm_dispatch_id() -> u64;

    #[link_name = "llvm.amdgcn.wavefrontsize"]
    safe fn llvm_wavefrontsize() -> u32;

    // Barriers and instruction-scheduling control.
    #[link_name = "llvm.amdgcn.s.barrier"]
    safe fn llvm_s_barrier();
    #[link_name = "llvm.amdgcn.s.barrier.signal"]
    fn llvm_s_barrier_signal(barrier_type: i32);
    #[link_name = "llvm.amdgcn.s.barrier.signal.isfirst"]
    fn llvm_s_barrier_signal_isfirst(barrier_type: i32) -> bool;
    #[link_name = "llvm.amdgcn.s.barrier.wait"]
    fn llvm_s_barrier_wait(barrier_type: i16);
    #[link_name = "llvm.amdgcn.s.get.barrier.state"]
    fn llvm_s_get_barrier_state(barrier_type: i32) -> u32;
    #[link_name = "llvm.amdgcn.wave.barrier"]
    safe fn llvm_wave_barrier();
    #[link_name = "llvm.amdgcn.sched.barrier"]
    fn llvm_sched_barrier(mask: u32);
    #[link_name = "llvm.amdgcn.sched.group.barrier"]
    fn llvm_sched_group_barrier(mask: u32, size: u32, sync_id: u32);

    #[link_name = "llvm.amdgcn.s.sleep"]
    safe fn llvm_s_sleep(count: u32);

    #[link_name = "llvm.amdgcn.s.sethalt"]
    safe fn llvm_s_sethalt(value: u32) -> !;

    #[link_name = "llvm.amdgcn.s.getpc"]
    safe fn llvm_s_getpc() -> i64;

    // Cross-lane queries and reductions.
    #[link_name = "llvm.amdgcn.mbcnt.lo"]
    safe fn llvm_mbcnt_lo(value: u32, init: u32) -> u32;
    #[link_name = "llvm.amdgcn.mbcnt.hi"]
    safe fn llvm_mbcnt_hi(value: u32, init: u32) -> u32;

    #[link_name = "llvm.amdgcn.ballot"]
    safe fn llvm_ballot(b: bool) -> u64;

    #[link_name = "llvm.amdgcn.inverse.ballot"]
    safe fn llvm_inverse_ballot(value: u64) -> bool;

    #[link_name = "llvm.amdgcn.wave.reduce.umin"]
    safe fn llvm_wave_reduce_umin(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.min"]
    safe fn llvm_wave_reduce_min(value: i32, strategy: u32) -> i32;
    #[link_name = "llvm.amdgcn.wave.reduce.umax"]
    safe fn llvm_wave_reduce_umax(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.max"]
    safe fn llvm_wave_reduce_max(value: i32, strategy: u32) -> i32;
    #[link_name = "llvm.amdgcn.wave.reduce.add"]
    safe fn llvm_wave_reduce_add(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.and"]
    safe fn llvm_wave_reduce_and(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.or"]
    safe fn llvm_wave_reduce_or(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.xor"]
    safe fn llvm_wave_reduce_xor(value: u32, strategy: u32) -> u32;

    // The following intrinsics can have multiple sizes; the type suffix in the
    // link name selects the monomorphization (`.i32` / `.i64`).

    #[link_name = "llvm.amdgcn.readfirstlane.i32"]
    safe fn llvm_readfirstlane_u32(value: u32) -> u32;
    #[link_name = "llvm.amdgcn.readfirstlane.i64"]
    safe fn llvm_readfirstlane_u64(value: u64) -> u64;
    #[link_name = "llvm.amdgcn.readlane.i32"]
    fn llvm_readlane_u32(value: u32, lane: u32) -> u32;
    #[link_name = "llvm.amdgcn.readlane.i64"]
    fn llvm_readlane_u64(value: u64, lane: u32) -> u64;
    #[link_name = "llvm.amdgcn.writelane.i32"]
    fn llvm_writelane_u32(value: u32, lane: u32, default: u32) -> u32;
    #[link_name = "llvm.amdgcn.writelane.i64"]
    fn llvm_writelane_u64(value: u64, lane: u32, default: u64) -> u64;

    #[link_name = "llvm.amdgcn.endpgm"]
    safe fn llvm_endpgm() -> !;

    #[link_name = "llvm.amdgcn.update.dpp.i32"]
    fn llvm_update_dpp(
        old: u32,
        src: u32,
        dpp_ctrl: u32,
        row_mask: u32,
        bank_mask: u32,
        bound_control: bool,
    ) -> u32;

    #[link_name = "llvm.amdgcn.s.memrealtime"]
    safe fn llvm_s_memrealtime() -> u64;

    // Data-share (LDS-crossbar) and byte permutes.
    #[link_name = "llvm.amdgcn.ds.permute"]
    fn llvm_ds_permute(lane: u32, value: u32) -> u32;
    #[link_name = "llvm.amdgcn.ds.bpermute"]
    fn llvm_ds_bpermute(lane: u32, value: u32) -> u32;
    #[link_name = "llvm.amdgcn.perm"]
    fn llvm_perm(src0: u32, src1: u32, selector: u32) -> u32;

    // gfx10
    #[link_name = "llvm.amdgcn.permlane16.i32"]
    fn llvm_permlane16_u32(
        old: u32,
        src0: u32,
        src1: u32,
        src2: u32,
        fi: bool,
        bound_control: bool,
    ) -> u32;

    // gfx10
    #[link_name = "llvm.amdgcn.permlanex16.i32"]
    fn llvm_permlanex16_u32(
        old: u32,
        src0: u32,
        src1: u32,
        src2: u32,
        fi: bool,
        bound_control: bool,
    ) -> u32;

    #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"]
    safe fn llvm_s_get_waveid_in_workgroup() -> u32;

    // gfx11
    #[link_name = "llvm.amdgcn.permlane64.i32"]
    fn llvm_permlane64_u32(value: u32) -> u32;

    // gfx12
    #[link_name = "llvm.amdgcn.permlane16.var"]
    fn llvm_permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;

    // gfx12
    #[link_name = "llvm.amdgcn.permlanex16.var"]
    fn llvm_permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;

    #[link_name = "llvm.amdgcn.wave.id"]
    safe fn llvm_wave_id() -> u32;

    // gfx950
    #[link_name = "llvm.amdgcn.permlane16.swap"]
    fn llvm_permlane16_swap(
        vdst_old: u32,
        vsrc_src0: u32,
        fi: bool,
        bound_control: bool,
    ) -> (u32, u32);

    // gfx950
    #[link_name = "llvm.amdgcn.permlane32.swap"]
    fn llvm_permlane32_swap(
        vdst_old: u32,
        vsrc_src0: u32,
        fi: bool,
        bound_control: bool,
    ) -> (u32, u32);
}
183
/// Returns the x coordinate of the workitem index within the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workitem_id_x() -> u32 {
    // Thin wrapper over `llvm.amdgcn.workitem.id.x`.
    llvm_workitem_id_x()
}
/// Returns the y coordinate of the workitem index within the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workitem_id_y() -> u32 {
    // Thin wrapper over `llvm.amdgcn.workitem.id.y`.
    llvm_workitem_id_y()
}
/// Returns the z coordinate of the workitem index within the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workitem_id_z() -> u32 {
    // Thin wrapper over `llvm.amdgcn.workitem.id.z`.
    llvm_workitem_id_z()
}
202
/// Returns the x coordinate of the workgroup index within the dispatch.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workgroup_id_x() -> u32 {
    // Thin wrapper over `llvm.amdgcn.workgroup.id.x`.
    llvm_workgroup_id_x()
}
/// Returns the y coordinate of the workgroup index within the dispatch.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workgroup_id_y() -> u32 {
    // Thin wrapper over `llvm.amdgcn.workgroup.id.y`.
    llvm_workgroup_id_y()
}
/// Returns the z coordinate of the workgroup index within the dispatch.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workgroup_id_z() -> u32 {
    // Thin wrapper over `llvm.amdgcn.workgroup.id.z`.
    llvm_workgroup_id_z()
}
221
/// Returns the size of statically allocated shared memory (LDS) for this program in bytes.
///
/// Dynamically allocated shared memory is not included.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn groupstaticsize() -> u32 {
    llvm_groupstaticsize()
}
/// Returns the id of the dispatch that is currently executed.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn dispatch_id() -> u64 {
    // Thin wrapper over `llvm.amdgcn.dispatch.id`.
    llvm_dispatch_id()
}
234
/// Returns the number of threads (lanes) in a wavefront.
///
/// Is always a power of 2 (32 or 64 depending on target and compilation mode).
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wavefrontsize() -> u32 {
    llvm_wavefrontsize()
}
243
/// Synchronize all wavefronts in a workgroup.
///
/// Each wavefront in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_barrier() {
    llvm_s_barrier()
}
254
/// Signal a specific barrier type.
///
/// Only for non-named barriers.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_barrier_signal<const BARRIER_TYPE: i32>() {
    // SAFETY (caller contract): BARRIER_TYPE must be a valid non-named barrier type.
    unsafe { llvm_s_barrier_signal(BARRIER_TYPE) }
}
265
/// Signal a specific barrier type and report whether this wavefront signalled first.
///
/// Only for non-named barriers.
/// Provides access to the s_barrier_signal_isfirst instruction;
/// additionally ensures that the result value is valid even when
/// the intrinsic is used from a wavefront that is not running in a workgroup.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_barrier_signal_isfirst<const BARRIER_TYPE: i32>() -> bool {
    unsafe { llvm_s_barrier_signal_isfirst(BARRIER_TYPE) }
}
279
/// Wait for a specific barrier type.
///
/// Only for non-named barriers.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_barrier_wait<const BARRIER_TYPE: i16>() {
    // SAFETY (caller contract): BARRIER_TYPE must be a valid non-named barrier type.
    unsafe { llvm_s_barrier_wait(BARRIER_TYPE) }
}
290
/// Get the state of a specific barrier type.
///
/// The barrier type must be uniform across the wavefront, otherwise behavior is undefined.
/// (Here it is a const generic, so this holds by construction.)
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_get_barrier_state<const BARRIER_TYPE: i32>() -> u32 {
    unsafe { llvm_s_get_barrier_state(BARRIER_TYPE) }
}
301
/// A barrier for only the threads within the current wavefront.
///
/// Does not result in an instruction but restricts the compiler
/// (acts as a code-motion barrier during compilation).
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_barrier() {
    llvm_wave_barrier()
}
312
/// Prevent movement of some instruction types.
///
/// Controls the types of instructions that may be allowed to cross the intrinsic during instruction scheduling.
/// The parameter is a mask for the instruction types that can cross the intrinsic.
///
/// - 0x0000: No instructions may be scheduled across `sched_barrier`.
/// - 0x0001: All, non-memory, non-side-effect producing instructions may be scheduled across `sched_barrier`, i.e. allow ALU instructions to pass.
/// - 0x0002: VALU instructions may be scheduled across `sched_barrier`.
/// - 0x0004: SALU instructions may be scheduled across `sched_barrier`.
/// - 0x0008: MFMA/WMMA instructions may be scheduled across `sched_barrier`.
/// - 0x0010: All VMEM instructions may be scheduled across `sched_barrier`.
/// - 0x0020: VMEM read instructions may be scheduled across `sched_barrier`.
/// - 0x0040: VMEM write instructions may be scheduled across `sched_barrier`.
/// - 0x0080: All DS instructions may be scheduled across `sched_barrier`.
/// - 0x0100: All DS read instructions may be scheduled across `sched_barrier`.
/// - 0x0200: All DS write instructions may be scheduled across `sched_barrier`.
/// - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across `sched_barrier`.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn sched_barrier<const MASK: u32>() {
    // Only the 11 mask bits documented above are valid.
    static_assert_uimm_bits!(MASK, 11);
    unsafe { llvm_sched_barrier(MASK) }
}
338
/// Creates schedule groups with specific properties to create custom scheduling pipelines.
///
/// The ordering between groups is enforced by the instruction scheduler.
/// The intrinsic applies to the code that precedes the intrinsic.
/// The intrinsic takes three values that control the behavior of the schedule groups.
///
/// - `mask`: Classify instruction groups using the [`sched_barrier`] mask values.
/// - `size`: The number of instructions that are in the group.
/// - `sync_id`: Order is enforced between groups with matching values.
///
/// The mask can include multiple instruction types. It is undefined behavior to set values beyond the range of valid masks.
///
/// Combining multiple `sched_group_barrier` intrinsics enables an ordering of specific instruction types during instruction scheduling.
/// For example, the following enforces a sequence of 1 VMEM read, followed by 1 VALU instruction, followed by 5 MFMA instructions.
///
/// ```rust,ignore
/// // 1 VMEM read
/// sched_group_barrier::<32, 1, 0>();
/// // 1 VALU
/// sched_group_barrier::<2, 1, 0>();
/// // 5 MFMA
/// sched_group_barrier::<8, 5, 0>();
/// ```
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn sched_group_barrier<const MASK: u32, const SIZE: u32, const SYNC_ID: u32>() {
    // Same 11 valid mask bits as `sched_barrier`; SIZE/SYNC_ID are unchecked.
    static_assert_uimm_bits!(MASK, 11);
    unsafe { llvm_sched_group_barrier(MASK, SIZE, SYNC_ID) }
}
370
/// Sleeps for approximately `COUNT * 64` cycles.
///
/// `COUNT` must be a constant.
/// Only the lower 7 bits of `COUNT` are used.
/// If `COUNT == 0x8000`, sleep forever until woken up, or killed.
// NOTE(review): `0x8000` lies outside the "lower 7 bits" range stated above —
// presumably bit 15 selects the sleep-forever mode; confirm against the ISA manual.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_sleep<const COUNT: u32>() {
    llvm_s_sleep(COUNT)
}
381
/// Stop execution of the kernel.
///
/// This usually signals an error state.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_sethalt<const VALUE: u32>() -> ! {
    // Only the low 3 bits of the halt value are valid.
    static_assert_uimm_bits!(VALUE, 3);
    llvm_s_sethalt(VALUE)
}
393
/// Returns the current program counter.
///
/// Provides access to the s_getpc_b64 instruction, but with the return value sign-extended
/// from the width of the underlying PC hardware register even on processors where the
/// s_getpc_b64 instruction returns a zero-extended value.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_getpc() -> i64 {
    llvm_s_getpc()
}
404
/// Masked bit count, low 32 lanes.
///
/// Computes the number of bits set in `value`, masked with a thread mask
/// which contains 1 for all active threads less than the current thread within a wavefront.
/// `init` is added to the result.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn mbcnt_lo(value: u32, init: u32) -> u32 {
    llvm_mbcnt_lo(value, init)
}
/// Masked bit count, high 32 lanes.
///
/// Computes the number of bits set in `value`, masked with a thread mask
/// which contains 1 for all active threads less than the current thread within a wavefront.
/// `init` is added to the result.
// Typically chained as `mbcnt_hi(hi, mbcnt_lo(lo, init))` to count across 64 lanes.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn mbcnt_hi(value: u32, init: u32) -> u32 {
    llvm_mbcnt_hi(value, init)
}
425
/// Returns a bitfield containing the result of its boolean argument
/// in all active lanes, and zero in all inactive lanes.
///
/// Bit `i` of the result corresponds to lane `i`; on wave32 targets the upper
/// 32 bits are zero.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn ballot(b: bool) -> u64 {
    llvm_ballot(b)
}
435
/// Indexes into the `value` with the current lane id and returns for each lane
/// if the corresponding bit is set.
///
/// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`.
/// This means `inverse_ballot(ballot(b)) == b`.
/// The inverse of `ballot(inverse_ballot(value)) ~= value` is not always true as inactive lanes are set to zero by `ballot`.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn inverse_ballot(value: u64) -> bool {
    llvm_inverse_ballot(value)
}
449
/// Performs an arithmetic min reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_umin<const STRATEGY: u32>(value: u32) -> u32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_umin(value, STRATEGY)
}
/// Performs an arithmetic min reduction on the signed values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_min<const STRATEGY: u32>(value: i32) -> i32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_min(value, STRATEGY)
}
482
/// Performs an arithmetic max reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_umax<const STRATEGY: u32>(value: u32) -> u32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_umax(value, STRATEGY)
}
/// Performs an arithmetic max reduction on the signed values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_max<const STRATEGY: u32>(value: i32) -> i32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_max(value, STRATEGY)
}
515
/// Performs an arithmetic add reduction on the values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_add<const STRATEGY: u32>(value: u32) -> u32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_add(value, STRATEGY)
}
532
/// Performs a bitwise and reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_and<const STRATEGY: u32>(value: u32) -> u32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_and(value, STRATEGY)
}
/// Performs a bitwise or reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_or<const STRATEGY: u32>(value: u32) -> u32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_or(value, STRATEGY)
}
/// Performs a bitwise xor reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_xor<const STRATEGY: u32>(value: u32) -> u32 {
    // Reject strategies beyond the three documented above at compile time.
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_xor(value, STRATEGY)
}
581
582// The following intrinsics can have multiple sizes
583
/// Get `value` from the first active lane in the wavefront.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn readfirstlane_u32(value: u32) -> u32 {
    // 32-bit variant (`llvm.amdgcn.readfirstlane.i32`).
    llvm_readfirstlane_u32(value)
}
/// Get `value` from the first active lane in the wavefront.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn readfirstlane_u64(value: u64) -> u64 {
    // 64-bit variant (`llvm.amdgcn.readfirstlane.i64`).
    llvm_readfirstlane_u64(value)
}
/// Get `value` from the lane at index `lane` in the wavefront.
///
/// The lane argument must be uniform across the currently active threads
/// of the current wavefront. Otherwise, the result is undefined.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 {
    // SAFETY (caller contract): `lane` is wavefront-uniform.
    unsafe { llvm_readlane_u32(value, lane) }
}
/// Get `value` from the lane at index `lane` in the wavefront.
///
/// The lane argument must be uniform across the currently active threads
/// of the current wavefront. Otherwise, the result is undefined.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 {
    // SAFETY (caller contract): `lane` is wavefront-uniform.
    unsafe { llvm_readlane_u64(value, lane) }
}
/// Return `value` for the lane at index `lane` in the wavefront.
/// Return `default` for all other lanes.
///
/// The value to write and lane select arguments must be uniform across the
/// currently active threads of the current wavefront. Otherwise, the result is
/// undefined.
///
/// `value` is the value returned by `lane`.
/// `default` is the value returned by all lanes other than `lane`.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
    // SAFETY (caller contract): `value` and `lane` are wavefront-uniform.
    unsafe { llvm_writelane_u32(value, lane, default) }
}
/// Return `value` for the lane at index `lane` in the wavefront.
/// Return `default` for all other lanes.
///
/// The value to write and lane select arguments must be uniform across the
/// currently active threads of the current wavefront. Otherwise, the result is
/// undefined.
///
/// `value` is the value returned by `lane`.
/// `default` is the value returned by all lanes other than `lane`.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
    // SAFETY (caller contract): `value` and `lane` are wavefront-uniform.
    unsafe { llvm_writelane_u64(value, lane, default) }
}
654
/// Stop execution of the wavefront.
///
/// This usually signals the end of a successful execution.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn endpgm() -> ! {
    llvm_endpgm()
}
665
/// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU.
/// It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control.
/// This operation is equivalent to a sequence of `v_mov_b32` operations.
///
/// `llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
/// Should be equivalent to:
/// ```asm
/// v_mov_b32 <dest> <old>
/// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
/// ```
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn update_dpp<
    const DPP_CTRL: u32,
    const ROW_MASK: u32,
    const BANK_MASK: u32,
    const BOUND_CONTROL: bool,
>(
    old: u32,
    src: u32,
) -> u32 {
    // SAFETY (caller contract): DPP_CTRL must encode a valid DPP control value.
    unsafe { llvm_update_dpp(old, src, DPP_CTRL, ROW_MASK, BANK_MASK, BOUND_CONTROL) }
}
691
/// Measures time based on a fixed frequency.
///
/// Provides a real-time clock counter that runs at constant speed (typically 100 MHz) independent of ALU clock speeds.
/// The clock is consistent across the chip, so can be used for measuring between different wavefronts.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_memrealtime() -> u64 {
    llvm_s_memrealtime()
}
701
/// Scatter data across all lanes in a wavefront.
///
/// Writes `value` to the lane `lane` (forward permute: each lane pushes its value to a destination lane).
///
/// Reading from inactive lanes returns `0`.
/// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 {
    unsafe { llvm_ds_permute(lane, value) }
}
/// Gather data across all lanes in a wavefront.
///
/// Backward permute: each lane returns the `value` supplied to this call by lane `lane`,
/// i.e. every lane pulls a value from the lane it selects.
///
/// Reading from inactive lanes returns `0`.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn ds_bpermute(lane: u32, value: u32) -> u32 {
    unsafe { llvm_ds_bpermute(lane, value) }
}
/// Permute a 64-bit value.
///
/// `selector` selects between different patterns in which the 64-bit value represented
/// by the `src0`/`src1` pair is permuted; the permuted result is returned as a `u32`.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn perm(src0: u32, src1: u32, selector: u32) -> u32 {
    unsafe { llvm_perm(src0, src1, selector) }
}
735
// gfx10
/// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
///
/// The third and fourth inputs (`src1`, `src2`) must be uniform across the current wavefront.
/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane16_u32<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
    src2: u32,
) -> u32 {
    // SAFETY (caller contract): `src1` and `src2` are wavefront-uniform.
    unsafe { llvm_permlane16_u32(old, src0, src1, src2, FI, BOUND_CONTROL) }
}
753
// gfx10
/// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
///
/// The third and fourth inputs (`src1`, `src2`) must be uniform across the current wavefront.
/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlanex16_u32<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
    src2: u32,
) -> u32 {
    // SAFETY (caller contract): `src1` and `src2` are wavefront-uniform.
    unsafe { llvm_permlanex16_u32(old, src0, src1, src2, FI, BOUND_CONTROL) }
}
771
/// Get the index of the current wavefront in the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_get_waveid_in_workgroup() -> u32 {
    // Thin wrapper over `llvm.amdgcn.s.get.waveid.in.workgroup`.
    llvm_s_get_waveid_in_workgroup()
}
778
// gfx11
/// Swap `value` between upper and lower 32 lanes in a wavefront.
///
/// Does nothing for wave32.
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane64_u32(value: u32) -> u32 {
    unsafe { llvm_permlane64_u32(value) }
}
790
// gfx12
/// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
///
/// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane (`src1`).
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane16_var<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
) -> u32 {
    unsafe { llvm_permlane16_var(old, src0, src1, FI, BOUND_CONTROL) }
}
806
// gfx12
/// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
///
/// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane (`src1`).
///
#[doc = include_str!("intrinsic_is_convergent.md")]
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlanex16_var<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
) -> u32 {
    unsafe { llvm_permlanex16_var(old, src0, src1, FI, BOUND_CONTROL) }
}
822
823/// Get the index of the current wavefront in the workgroup.
824#[inline]
825#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
826pub fn wave_id() -> u32 {
827    llvm_wave_id()
828}
829
830// gfx950
831/// Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
832///
833/// Swaps the values across lanes of first 2 operands.
834/// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes).
835/// Returns a pair for the swapped registers.
836/// The first element of the return corresponds to the swapped element of the first argument.
837///
838#[doc = include_str!("intrinsic_is_convergent.md")]
839#[inline]
840#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
841pub unsafe fn permlane16_swap<const FI: bool, const BOUND_CONTROL: bool>(
842    vdst_old: u32,
843    vsrc_src0: u32,
844) -> (u32, u32) {
845    unsafe { llvm_permlane16_swap(vdst_old, vsrc_src0, FI, BOUND_CONTROL) }
846}
847
848// gfx950
849/// Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
850///
851/// Swaps the values across lanes of first 2 operands.
852/// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
853/// Returns a pair for the swapped registers.
854/// The first element of the return corresponds to the swapped element of the first argument.
855///
856#[doc = include_str!("intrinsic_is_convergent.md")]
857#[inline]
858#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
859pub unsafe fn permlane32_swap<const FI: bool, const BOUND_CONTROL: bool>(
860    vdst_old: u32,
861    vsrc_src0: u32,
862) -> (u32, u32) {
863    unsafe { llvm_permlane32_swap(vdst_old, vsrc_src0, FI, BOUND_CONTROL) }
864}
865
866// Functions to generate code, used to check that the intrinsics build.
867// Marked as no_mangle, so the compiler does not remove the functions.
// To test, comment out the `#[cfg(test)]` line below (so the module compiles in a normal build) and run
869// NORUN=1 NOSTD=1 TARGET=amdgcn-amd-amdhsa CARGO_UNSTABLE_BUILD_STD=core ci/run.sh
870//
871// Note that depending on the target-cpu set in run.sh, some of these intrinsics are not available
872// and compilation fails with `Cannot select: intrinsic %llvm.amdgcn...`.
873// Uncomment these intrinsics to check.
#[cfg(test)]
mod tests {
    use super::*;

    // Each probe forwards directly to one public wrapper so the generated code
    // exercises exactly one intrinsic; `no_mangle` keeps the symbol (and thus
    // the code) from being dropped as unused.

    #[unsafe(no_mangle)]
    fn test_workitem_id_x() -> u32 {
        workitem_id_x()
    }
    #[unsafe(no_mangle)]
    fn test_workitem_id_y() -> u32 {
        workitem_id_y()
    }
    #[unsafe(no_mangle)]
    fn test_workitem_id_z() -> u32 {
        workitem_id_z()
    }

    #[unsafe(no_mangle)]
    fn test_workgroup_id_x() -> u32 {
        workgroup_id_x()
    }
    #[unsafe(no_mangle)]
    fn test_workgroup_id_y() -> u32 {
        workgroup_id_y()
    }
    #[unsafe(no_mangle)]
    fn test_workgroup_id_z() -> u32 {
        workgroup_id_z()
    }

    #[unsafe(no_mangle)]
    fn test_groupstaticsize() -> u32 {
        groupstaticsize()
    }
    #[unsafe(no_mangle)]
    fn test_dispatch_id() -> u64 {
        dispatch_id()
    }

    #[unsafe(no_mangle)]
    fn test_wavefrontsize() -> u32 {
        wavefrontsize()
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier() {
        s_barrier()
    }

    // NOTE(review): `-1` is used as the barrier-type const below — presumably
    // the default/workgroup barrier; confirm against the LLVM AMDGPU docs.
    #[unsafe(no_mangle)]
    fn test_s_barrier_signal() {
        unsafe { s_barrier_signal::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier_signal_isfirst() -> bool {
        unsafe { s_barrier_signal_isfirst::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier_wait() {
        unsafe { s_barrier_wait::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_get_barrier_state() -> u32 {
        unsafe { s_get_barrier_state::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_wave_barrier() {
        wave_barrier()
    }

    #[unsafe(no_mangle)]
    fn test_sched_barrier() {
        unsafe { sched_barrier::<1>() }
    }

    #[unsafe(no_mangle)]
    fn test_sched_group_barrier() {
        unsafe { sched_group_barrier::<1, 1, 0>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_sleep() {
        s_sleep::<1>()
    }

    #[unsafe(no_mangle)]
    fn test_s_sethalt() -> ! {
        s_sethalt::<1>()
    }

    #[unsafe(no_mangle)]
    fn test_s_getpc() -> i64 {
        s_getpc()
    }

    #[unsafe(no_mangle)]
    fn test_mbcnt_lo(value: u32, init: u32) -> u32 {
        mbcnt_lo(value, init)
    }
    #[unsafe(no_mangle)]
    fn test_mbcnt_hi(value: u32, init: u32) -> u32 {
        mbcnt_hi(value, init)
    }

    #[unsafe(no_mangle)]
    fn test_ballot(b: bool) -> u64 {
        ballot(b)
    }

    #[unsafe(no_mangle)]
    fn test_inverse_ballot(value: u64) -> bool {
        inverse_ballot(value)
    }

    // Wave-reduce probes: the const generic is set to 0 for every reduction.
    #[unsafe(no_mangle)]
    fn test_wave_reduce_umin(value: u32) -> u32 {
        wave_reduce_umin::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_min(value: i32) -> i32 {
        wave_reduce_min::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_umax(value: u32) -> u32 {
        wave_reduce_umax::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_max(value: i32) -> i32 {
        wave_reduce_max::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_add(value: u32) -> u32 {
        wave_reduce_add::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_and(value: u32) -> u32 {
        wave_reduce_and::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_or(value: u32) -> u32 {
        wave_reduce_or::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_xor(value: u32) -> u32 {
        wave_reduce_xor::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_readfirstlane_u32(value: u32) -> u32 {
        readfirstlane_u32(value)
    }
    #[unsafe(no_mangle)]
    fn test_readfirstlane_u64(value: u64) -> u64 {
        readfirstlane_u64(value)
    }
    #[unsafe(no_mangle)]
    fn test_readlane_u32(value: u32, lane: u32) -> u32 {
        unsafe { readlane_u32(value, lane) }
    }
    #[unsafe(no_mangle)]
    fn test_readlane_u64(value: u64, lane: u32) -> u64 {
        unsafe { readlane_u64(value, lane) }
    }
    #[unsafe(no_mangle)]
    fn test_writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
        unsafe { writelane_u32(value, lane, default) }
    }
    #[unsafe(no_mangle)]
    fn test_writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
        unsafe { writelane_u64(value, lane, default) }
    }

    #[unsafe(no_mangle)]
    fn test_endpgm() -> ! {
        endpgm()
    }

    #[unsafe(no_mangle)]
    fn test_update_dpp(old: u32, src: u32) -> u32 {
        unsafe { update_dpp::<0, 0, 0, true>(old, src) }
    }

    #[unsafe(no_mangle)]
    fn test_s_memrealtime() -> u64 {
        s_memrealtime()
    }

    #[unsafe(no_mangle)]
    fn test_ds_permute(lane: u32, value: u32) -> u32 {
        unsafe { ds_permute(lane, value) }
    }
    #[unsafe(no_mangle)]
    fn test_ds_bpermute(lane: u32, value: u32) -> u32 {
        unsafe { ds_bpermute(lane, value) }
    }
    #[unsafe(no_mangle)]
    fn test_perm(src0: u32, src1: u32, selector: u32) -> u32 {
        unsafe { perm(src0, src1, selector) }
    }

    #[unsafe(no_mangle)]
    fn test_permlane16_u32(old: u32, src0: u32, src1: u32, src2: u32) -> u32 {
        unsafe { permlane16_u32::<false, true>(old, src0, src1, src2) }
    }

    #[unsafe(no_mangle)]
    fn test_permlanex16_u32(old: u32, src0: u32, src1: u32, src2: u32) -> u32 {
        unsafe { permlanex16_u32::<false, true>(old, src0, src1, src2) }
    }

    #[unsafe(no_mangle)]
    fn test_s_get_waveid_in_workgroup() -> u32 {
        s_get_waveid_in_workgroup()
    }

    #[unsafe(no_mangle)]
    fn test_permlane64_u32(value: u32) -> u32 {
        unsafe { permlane64_u32(value) }
    }

    #[unsafe(no_mangle)]
    fn test_permlane16_var(old: u32, src0: u32, src1: u32) -> u32 {
        unsafe { permlane16_var::<false, true>(old, src0, src1) }
    }

    #[unsafe(no_mangle)]
    fn test_permlanex16_var(old: u32, src0: u32, src1: u32) -> u32 {
        unsafe { permlanex16_var::<false, true>(old, src0, src1) }
    }

    #[unsafe(no_mangle)]
    fn test_wave_id() -> u32 {
        wave_id()
    }

    #[unsafe(no_mangle)]
    fn test_permlane16_swap(vdst_old: u32, vsrc_src0: u32) -> (u32, u32) {
        unsafe { permlane16_swap::<false, true>(vdst_old, vsrc_src0) }
    }

    #[unsafe(no_mangle)]
    fn test_permlane32_swap(vdst_old: u32, vsrc_src0: u32) -> (u32, u32) {
        unsafe { permlane32_swap::<false, true>(vdst_old, vsrc_src0) }
    }
}