Skip to main content

core/stdarch/crates/core_arch/src/x86_64/
amx.rs

1use crate::core_arch::x86_64::{__tile1024i, Tile};
2use crate::core_arch::{simd::*, x86::*};
3
4#[cfg(test)]
5use stdarch_test::assert_instr;
6
/// Load tile configuration from a 64-byte memory location specified by `mem_addr`.
/// The tile configuration format is specified below, and includes the tile type palette,
/// the number of bytes per row, and the number of rows. If the specified `palette_id` is zero,
/// that signifies the init state for both the tile config and the tile data, and the tiles are zeroed.
/// Any invalid configuration will result in a #GP fault.
///
/// ```intel
/// // format of memory payload. each field is a byte.
/// //      0: palette
/// //      1: start_row
/// //   2-15: reserved, must be zero
/// //  16-17: tile0.colsb
/// //  18-19: tile1.colsb
/// //  20-21: tile2.colsb
/// //         ...
/// //  30-31: tile7.colsb
/// //  32-47: reserved, must be zero
/// //     48: tile0.rows
/// //     49: tile1.rows
/// //     50: tile2.rows
/// //         ...
/// //     55: tile7.rows
/// //  56-63: reserved, must be zero
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_loadconfig&ig_expand=6875)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(ldtilecfg))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_loadconfig(mem_addr: *const u8) {
    // `mem_addr` is handed to the `ldtilecfg` intrinsic unchanged; no validation happens here.
    ldtilecfg(mem_addr);
}
40
/// Stores the current tile configuration to a 64-byte memory location specified by `mem_addr`.
/// The tile configuration format is as specified in [`_tile_loadconfig`], and includes the tile type palette,
/// the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_storeconfig&ig_expand=6879)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(sttilecfg))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_storeconfig(mem_addr: *mut u8) {
    // `mem_addr` is handed to the `sttilecfg` intrinsic unchanged; no validation happens here.
    sttilecfg(mem_addr);
}
53
/// Load tile rows from memory specified by `base` address and `stride` into destination tile `DST`
/// using the tile configuration previously configured via [`_tile_loadconfig`].
///
/// `DST` is the tile index; it must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_loadd&ig_expand=6877)
#[inline]
#[rustc_legacy_const_generics(0)]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tileloadd, DST = 0))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_loadd<const DST: i32>(base: *const u8, stride: usize) {
    // Compile-time check that the tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    // The intrinsic takes the tile index as `i8` and the stride as `u64`.
    tileloadd64(DST as i8, base, stride as u64);
}
66
/// Load tile rows from memory specified by `base` address and `stride` into destination tile `dst`. The shape
/// of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced: its `rows` and `colsb` fields are read to describe the shape, and its
/// `tile` field is overwritten with the loaded data. It must therefore point to a valid [`__tile1024i`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_loadd&ig_expand=6877)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tileloadd))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_loadd(dst: *mut __tile1024i, base: *const u8, stride: usize) {
    // Shape comes from `*dst` itself; only the `tile` field is written.
    (*dst).tile = tileloadd64_internal((*dst).rows, (*dst).colsb, base, stride as u64);
}
78
/// Release the tile configuration to return to the init state, which releases all storage it currently holds.
///
/// Takes no arguments and returns nothing; it simply invokes the `tilerelease` intrinsic.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_release&ig_expand=6878)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tilerelease))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_release() {
    tilerelease();
}
89
/// Store the tile selected by the const parameter `DST` to memory specified by `base` address and `stride`
/// using the tile configuration previously configured via [`_tile_loadconfig`].
///
/// Despite its name, `DST` identifies the *source* tile of the store. It must fit in 3 unsigned bits,
/// which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_stored&ig_expand=6881)
#[inline]
#[rustc_legacy_const_generics(0)]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tilestored, DST = 0))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_stored<const DST: i32>(base: *mut u8, stride: usize) {
    // Compile-time check that the tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    // The intrinsic takes the tile index as `i8` and the stride as `u64`.
    tilestored64(DST as i8, base, stride as u64);
}
102
/// Store the tile specified by `src` to memory specified by `base` address and `stride`. The shape of the tile
/// is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `src` is taken by value; its `rows` and `colsb` fields describe the shape and its `tile` field
/// carries the data handed to the intrinsic.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_stored&ig_expand=6881)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tilestored))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_stored(base: *mut u8, stride: usize, src: __tile1024i) {
    // Argument order for the intrinsic: shape, destination memory, stride, then the tile data.
    tilestored64_internal(src.rows, src.colsb, base, stride as u64, src.tile);
}
114
/// Load tile rows from memory specified by `base` address and `stride` into destination tile `DST` using the tile
/// configuration previously configured via [`_tile_loadconfig`]. This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data caching can be optimized accordingly.
///
/// `DST` is the tile index; it must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_stream_loadd&ig_expand=6883)
#[inline]
#[rustc_legacy_const_generics(0)]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tileloaddt1, DST = 0))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_stream_loadd<const DST: i32>(base: *const u8, stride: usize) {
    // Compile-time check that the tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    // The intrinsic takes the tile index as `i8` and the stride as `u64`.
    tileloaddt164(DST as i8, base, stride as u64);
}
129
/// Load tile rows from memory specified by `base` address and `stride` into destination tile `dst`. The shape
/// of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
/// This intrinsic provides a hint to the implementation that the data will likely not be reused in the
/// near future and the data caching can be optimized accordingly.
///
/// `dst` is dereferenced: its `rows` and `colsb` fields are read to describe the shape, and its
/// `tile` field is overwritten with the loaded data. It must therefore point to a valid [`__tile1024i`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_stream_loadd&ig_expand=6883)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tileloaddt1))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_stream_loadd(dst: *mut __tile1024i, base: *const u8, stride: usize) {
    // Shape comes from `*dst` itself; only the `tile` field is written.
    (*dst).tile = tileloaddt164_internal((*dst).rows, (*dst).colsb, base, stride as u64);
}
143
/// Zero the tile specified by the const parameter `DST`.
///
/// `DST` is the tile index; it must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_zero&ig_expand=6885)
#[inline]
#[rustc_legacy_const_generics(0)]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tilezero, DST = 0))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_zero<const DST: i32>() {
    // Compile-time check that the tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    tilezero(DST as i8);
}
156
/// Zero the tile specified by `dst`. The shape of the tile is specified in the struct of [`__tile1024i`].
/// The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced: its `rows` and `colsb` fields are read and its `tile` field is overwritten.
/// It must therefore point to a valid [`__tile1024i`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_zero&ig_expand=6885)
#[inline]
#[target_feature(enable = "amx-tile")]
#[cfg_attr(test, assert_instr(tilezero))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_zero(dst: *mut __tile1024i) {
    // Only `(*dst).tile` is written; the shape fields are read, not modified.
    (*dst).tile = tilezero_internal((*dst).rows, (*dst).colsb);
}
168
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles `A` and `B`,
/// accumulating the intermediate single-precision (32-bit) floating-point elements
/// with elements in `DST`, and store the 32-bit result back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbf16ps&ig_expand=6864)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-bf16")]
#[cfg_attr(test, assert_instr(tdpbf16ps, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbf16ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbf16ps(DST as i8, A as i8, B as i8);
}
185
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles `a` and `b`,
/// accumulating the intermediate single-precision (32-bit) floating-point elements
/// with elements in `dst`, and store the 32-bit result back to tile `dst`. The shape of the tile
/// is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_dpbf16ps&ig_expand=6864)
#[inline]
#[target_feature(enable = "amx-bf16")]
#[cfg_attr(test, assert_instr(tdpbf16ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbf16ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbf16ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
199
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in `A` with corresponding
/// signed 8-bit integers in `B`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `DST`, and store the 32-bit result back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbssd&ig_expand=6866)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbssd, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbssd<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbssd(DST as i8, A as i8, B as i8);
}
217
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in `a` with corresponding
/// signed 8-bit integers in `b`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `dst`, and store the 32-bit result back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_dpbssd&ig_expand=6866)
#[inline]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbssd))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbssd(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbssd_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
232
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in `A` with corresponding
/// unsigned 8-bit integers in `B`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `DST`, and store the 32-bit result back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbsud&ig_expand=6868)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbsud, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbsud<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbsud(DST as i8, A as i8, B as i8);
}
250
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in `a` with corresponding
/// unsigned 8-bit integers in `b`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `dst`, and store the 32-bit result back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_dpbsud&ig_expand=6868)
#[inline]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbsud))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbsud(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbsud_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
265
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in `A` with corresponding
/// signed 8-bit integers in `B`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `DST`, and store the 32-bit result back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbusd&ig_expand=6870)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbusd, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbusd<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbusd(DST as i8, A as i8, B as i8);
}
283
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in `a` with corresponding
/// signed 8-bit integers in `b`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `dst`, and store the 32-bit result back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_dpbusd&ig_expand=6870)
#[inline]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbusd))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbusd(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbusd_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
298
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in `A` with corresponding
/// unsigned 8-bit integers in `B`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `DST`, and store the 32-bit result back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpbuud&ig_expand=6872)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbuud, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbuud<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbuud(DST as i8, A as i8, B as i8);
}
316
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in `a` with corresponding
/// unsigned 8-bit integers in `b`, producing 4 intermediate 32-bit results.
/// Sum these 4 results with the corresponding 32-bit integer in `dst`, and store the 32-bit result back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_dpbuud&ig_expand=6872)
#[inline]
#[target_feature(enable = "amx-int8")]
#[cfg_attr(test, assert_instr(tdpbuud))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbuud(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbuud_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
331
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles `A` and `B`,
/// accumulating the intermediate single-precision (32-bit) floating-point elements
/// with elements in `DST`, and store the 32-bit result back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_dpfp16ps&ig_expand=6874)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-fp16")]
#[cfg_attr(test, assert_instr(tdpfp16ps, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpfp16ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpfp16ps(DST as i8, A as i8, B as i8);
}
348
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles `a` and `b`,
/// accumulating the intermediate single-precision (32-bit) floating-point elements
/// with elements in `dst`, and store the 32-bit result back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_dpfp16ps&ig_expand=6874)
#[inline]
#[target_feature(enable = "amx-fp16")]
#[cfg_attr(test, assert_instr(tdpfp16ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpfp16ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpfp16ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
362
/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
/// Each dword element in input tiles `A` and `B` is interpreted as a complex number with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination of (row of `A`, column of `B`),
/// it performs a set of multiplications and accumulations on all corresponding complex numbers (one from `A` and one from `B`).
/// The imaginary part of the `A` element is multiplied with the real part of the corresponding `B` element, and the real part of
/// the `A` element is multiplied with the imaginary part of the corresponding `B` element. The two accumulated results are added,
/// and then accumulated into the corresponding row and column of `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_cmmimfp16ps&ig_expand=6860)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-complex")]
#[cfg_attr(test, assert_instr(tcmmimfp16ps, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_cmmimfp16ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tcmmimfp16ps(DST as i8, A as i8, B as i8);
}
383
/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
/// Each dword element in input tiles `a` and `b` is interpreted as a complex number with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination of (row of `a`, column of `b`),
/// it performs a set of multiplications and accumulations on all corresponding complex numbers (one from `a` and one from `b`).
/// The imaginary part of the `a` element is multiplied with the real part of the corresponding `b` element, and the real part of
/// the `a` element is multiplied with the imaginary part of the corresponding `b` element. The two accumulated results are added,
/// and then accumulated into the corresponding row and column of `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_cmmimfp16ps&ig_expand=6860)
#[inline]
#[target_feature(enable = "amx-complex")]
#[cfg_attr(test, assert_instr(tcmmimfp16ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_cmmimfp16ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tcmmimfp16ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
401
/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
/// Each dword element in input tiles `A` and `B` is interpreted as a complex number with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination of (row of `A`, column of `B`),
/// it performs a set of multiplications and accumulations on all corresponding complex numbers (one from `A` and one from `B`).
/// The real part of the `A` element is multiplied with the real part of the corresponding `B` element, and the negated imaginary part of
/// the `A` element is multiplied with the imaginary part of the corresponding `B` element.
/// The two accumulated results are added, and then accumulated into the corresponding row and column of `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tile_cmmrlfp16ps&ig_expand=6862)
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-complex")]
#[cfg_attr(test, assert_instr(tcmmrlfp16ps, DST = 0, A = 1, B = 2))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_cmmrlfp16ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tcmmrlfp16ps(DST as i8, A as i8, B as i8);
}
422
/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
/// Each dword element in input tiles `a` and `b` is interpreted as a complex number with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination of (row of `a`, column of `b`),
/// it performs a set of multiplications and accumulations on all corresponding complex numbers (one from `a` and one from `b`).
/// The real part of the `a` element is multiplied with the real part of the corresponding `b` element, and the negated imaginary part of
/// the `a` element is multiplied with the imaginary part of the corresponding `b` element.
/// The two accumulated results are added, and then accumulated into the corresponding row and column of `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=__tile_cmmrlfp16ps&ig_expand=6862)
#[inline]
#[target_feature(enable = "amx-complex")]
#[cfg_attr(test, assert_instr(tcmmrlfp16ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_cmmrlfp16ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tcmmrlfp16ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
440
/// Compute dot-product of BF8 (8-bit E5M2) floating-point elements in tile `A` and BF8 (8-bit E5M2)
/// floating-point elements in tile `B`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `DST`, and store the 32-bit result
/// back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(
    all(test, not(target_vendor = "apple")),
    assert_instr(tdpbf8ps, DST = 0, A = 1, B = 2)
)]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbf8ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbf8ps(DST as i8, A as i8, B as i8);
}
459
/// Compute dot-product of BF8 (8-bit E5M2) floating-point elements in tile `a` and BF8 (8-bit E5M2)
/// floating-point elements in tile `b`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `dst`, and store the 32-bit result
/// back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
#[inline]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tdpbf8ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbf8ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbf8ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
472
/// Compute dot-product of BF8 (8-bit E5M2) floating-point elements in tile `A` and HF8
/// (8-bit E4M3) floating-point elements in tile `B`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `DST`, and store the 32-bit result
/// back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(
    all(test, not(target_vendor = "apple")),
    assert_instr(tdpbhf8ps, DST = 0, A = 1, B = 2)
)]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dpbhf8ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdpbhf8ps(DST as i8, A as i8, B as i8);
}
491
/// Compute dot-product of BF8 (8-bit E5M2) floating-point elements in tile `a` and HF8
/// (8-bit E4M3) floating-point elements in tile `b`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `dst`, and store the 32-bit result
/// back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
#[inline]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tdpbhf8ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dpbhf8ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdpbhf8ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
504
/// Compute dot-product of HF8 (8-bit E4M3) floating-point elements in tile `A` and BF8
/// (8-bit E5M2) floating-point elements in tile `B`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `DST`, and store the 32-bit result
/// back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(
    all(test, not(target_vendor = "apple")),
    assert_instr(tdphbf8ps, DST = 0, A = 1, B = 2)
)]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dphbf8ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdphbf8ps(DST as i8, A as i8, B as i8);
}
523
/// Compute dot-product of HF8 (8-bit E4M3) floating-point elements in tile `a` and BF8
/// (8-bit E5M2) floating-point elements in tile `b`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `dst`, and store the 32-bit result
/// back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
#[inline]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tdphbf8ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dphbf8ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdphbf8ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
536
/// Compute dot-product of HF8 (8-bit E4M3) floating-point elements in tile `A` and HF8 (8-bit E4M3)
/// floating-point elements in tile `B`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `DST`, and store the 32-bit result
/// back to tile `DST`.
///
/// `DST`, `A` and `B` are tile indices; each must fit in 3 unsigned bits, which is checked at compile time.
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(
    all(test, not(target_vendor = "apple")),
    assert_instr(tdphf8ps, DST = 0, A = 1, B = 2)
)]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_dphf8ps<const DST: i32, const A: i32, const B: i32>() {
    // Compile-time checks that every tile index fits in 3 unsigned bits.
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    tdphf8ps(DST as i8, A as i8, B as i8);
}
555
/// Compute dot-product of HF8 (8-bit E4M3) floating-point elements in tile `a` and HF8 (8-bit E4M3)
/// floating-point elements in tile `b`, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in `dst`, and store the 32-bit result
/// back to tile `dst`.
/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
///
/// `dst` is dereferenced, so it must point to a valid [`__tile1024i`]; only its `tile` field is
/// read and written here.
#[inline]
#[target_feature(enable = "amx-fp8")]
#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tdphf8ps))]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn __tile_dphf8ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
    // Shape arguments are `a.rows`, `b.colsb`, `a.colsb`; `(*dst).rows` / `(*dst).colsb` are not read.
    // The current `(*dst).tile` is passed in as the accumulator input and replaced by the result.
    (*dst).tile = tdphf8ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
}
568
569/// Load tile rows from memory specified by base address and stride into destination tile dst
570/// using the tile configuration previously configured via [`_tile_loadconfig`].
571/// Additionally, this intrinsic indicates the source memory location is likely to become
572/// read-shared by multiple processors, i.e., read in the future by at least one other processor
573/// before it is written, assuming it is ever written in the future.
574#[inline]
575#[rustc_legacy_const_generics(0)]
576#[target_feature(enable = "amx-movrs")]
577#[cfg_attr(
578    all(test, not(target_vendor = "apple")),
579    assert_instr(tileloaddrs, DST = 0)
580)]
581#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
582pub unsafe fn _tile_loaddrs<const DST: i32>(base: *const u8, stride: usize) {
583    static_assert_uimm_bits!(DST, 3);
584    tileloaddrs64(DST as i8, base, stride as u64);
585}
586
587/// Load tile rows from memory specified by base address and stride into destination tile dst.
588/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
589/// Additionally, this intrinsic indicates the source memory location is likely to become
590/// read-shared by multiple processors, i.e., read in the future by at least one other processor
591/// before it is written, assuming it is ever written in the future.
592#[inline]
593#[target_feature(enable = "amx-movrs")]
594#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tileloaddrs))]
595#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
596pub unsafe fn __tile_loaddrs(dst: *mut __tile1024i, base: *const u8, stride: usize) {
597    (*dst).tile = tileloaddrs64_internal((*dst).rows, (*dst).colsb, base, stride as u64);
598}
599
600/// Load tile rows from memory specified by base address and stride into destination tile dst
601/// using the tile configuration previously configured via [`_tile_loadconfig`].
602/// Provides a hint to the implementation that the data would be reused but does not need
603/// to be resident in the nearest cache levels.
604/// Additionally, this intrinsic indicates the source memory location is likely to become
605/// read-shared by multiple processors, i.e., read in the future by at least one other processor
606/// before it is written, assuming it is ever written in the future.
607#[inline]
608#[rustc_legacy_const_generics(0)]
609#[target_feature(enable = "amx-movrs")]
610#[cfg_attr(
611    all(test, not(target_vendor = "apple")),
612    assert_instr(tileloaddrst1, DST = 0)
613)]
614#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
615pub unsafe fn _tile_stream_loaddrs<const DST: i32>(base: *const u8, stride: usize) {
616    static_assert_uimm_bits!(DST, 3);
617    tileloaddrst164(DST as i8, base, stride as u64);
618}
619
620/// Load tile rows from memory specified by base address and stride into destination tile dst.
621/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
622/// Provides a hint to the implementation that the data would be reused but does not need
623/// to be resident in the nearest cache levels.
624/// Additionally, this intrinsic indicates the source memory location is likely to become
625/// read-shared by multiple processors, i.e., read in the future by at least one other processor
626/// before it is written, assuming it is ever written in the future.
627#[inline]
628#[target_feature(enable = "amx-movrs")]
629#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tileloaddrst1))]
630#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
631pub unsafe fn __tile_stream_loaddrs(dst: *mut __tile1024i, base: *const u8, stride: usize) {
632    (*dst).tile = tileloaddrst164_internal((*dst).rows, (*dst).colsb, base, stride as u64);
633}
634
/// Perform matrix multiplication of two tiles a and b, containing packed single precision (32-bit)
/// floating-point elements, which are converted to TF32 (tensor-float32) format, and accumulate the
/// results into a packed single precision tile.
/// For each possible combination of (row of a, column of b), it performs
///  - convert to TF32
///  - multiply the corresponding elements of a and b
///  - accumulate the results into the corresponding row and column of dst using round-to-nearest-even
///    rounding mode.
///
/// Output FP32 denormals are always flushed to zero, input single precision denormals are always
/// handled and *not* treated as zero.
///
/// The tiles are selected by the const parameters `DST`, `A` and `B`; each one is checked at
/// compile time with `static_assert_uimm_bits!` to fit in 3 bits.
#[inline]
#[rustc_legacy_const_generics(0, 1, 2)]
#[target_feature(enable = "amx-tf32")]
#[cfg_attr(
    all(test, not(target_vendor = "apple")),
    assert_instr(tmmultf32ps, DST = 0, A = 1, B = 2)
)]
#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
pub unsafe fn _tile_mmultf32ps<const DST: i32, const A: i32, const B: i32>() {
    static_assert_uimm_bits!(DST, 3);
    static_assert_uimm_bits!(A, 3);
    static_assert_uimm_bits!(B, 3);
    // The const parameters are passed directly as the intrinsic's tile selectors.
    tmmultf32ps(DST as i8, A as i8, B as i8);
}
659
660/// Perform matrix multiplication of two tiles a and b, containing packed single precision (32-bit)
661/// floating-point elements, which are converted to TF32 (tensor-float32) format, and accumulate the
662///  results into a packed single precision tile.
663/// For each possible combination of (row of a, column of b), it performs
664///  - convert to TF32
665///  - multiply the corresponding elements of a and b
666///  - accumulate the results into the corresponding row and column of dst using round-to-nearest-even
667/// rounding mode.
668/// Output FP32 denormals are always flushed to zero, input single precision denormals are always
669/// handled and *not* treated as zero.
670/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
671#[inline]
672#[target_feature(enable = "amx-tf32")]
673#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tmmultf32ps))]
674#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
675pub unsafe fn __tile_mmultf32ps(dst: *mut __tile1024i, a: __tile1024i, b: __tile1024i) {
676    (*dst).tile = tmmultf32ps_internal(a.rows, b.colsb, a.colsb, (*dst).tile, a.tile, b.tile);
677}
678
679/// Moves a row from a tile register to a zmm register, converting the packed 32-bit signed integer
680/// elements to packed single-precision (32-bit) floating-point elements.
681#[inline]
682#[rustc_legacy_const_generics(0)]
683#[target_feature(enable = "amx-avx512,avx10.2")]
684#[cfg_attr(
685    all(test, not(target_vendor = "apple")),
686    assert_instr(tcvtrowd2ps, TILE = 0)
687)]
688#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
689pub unsafe fn _tile_cvtrowd2ps<const TILE: i32>(row: u32) -> __m512 {
690    static_assert_uimm_bits!(TILE, 3);
691    tcvtrowd2ps(TILE as i8, row).as_m512()
692}
693
694/// Moves a row from a tile register to a zmm register, converting the packed 32-bit signed integer
695/// elements to packed single-precision (32-bit) floating-point elements.
696#[inline]
697#[rustc_legacy_const_generics(0, 1)]
698#[target_feature(enable = "amx-avx512,avx10.2")]
699#[cfg_attr(
700    all(test, not(target_vendor = "apple")),
701    assert_instr(tcvtrowd2ps, TILE = 0, ROW = 0)
702)]
703#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
704pub unsafe fn _tile_cvtrowd2psi<const TILE: i32, const ROW: i32>() -> __m512 {
705    static_assert_uimm_bits!(TILE, 3);
706    static_assert_uimm_bits!(ROW, 6);
707    tcvtrowd2psi(TILE as i8, ROW as u32).as_m512()
708}
709
710/// Moves a row from a tile register to a zmm register, converting the packed 32-bit signed integer
711/// elements to packed single-precision (32-bit) floating-point elements.
712/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
713#[inline]
714#[target_feature(enable = "amx-avx512,avx10.2")]
715#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tcvtrowd2ps))]
716#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
717pub unsafe fn __tile_cvtrowd2ps(src: __tile1024i, row: u32) -> __m512 {
718    tcvtrowd2ps_internal(src.rows, src.colsb, src.tile, row).as_m512()
719}
720
721/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
722/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
723/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
724#[inline]
725#[rustc_legacy_const_generics(0)]
726#[target_feature(enable = "amx-avx512,avx10.2")]
727#[cfg_attr(
728    all(test, not(target_vendor = "apple")),
729    assert_instr(tcvtrowps2phh, TILE = 0)
730)]
731#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
732pub unsafe fn _tile_cvtrowps2phh<const TILE: i32>(row: u32) -> __m512h {
733    static_assert_uimm_bits!(TILE, 3);
734    tcvtrowps2phh(TILE as i8, row).as_m512h()
735}
736
737/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
738/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
739/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
740#[inline]
741#[rustc_legacy_const_generics(0, 1)]
742#[target_feature(enable = "amx-avx512,avx10.2")]
743#[cfg_attr(
744    all(test, not(target_vendor = "apple")),
745    assert_instr(tcvtrowps2phh, TILE = 0, ROW = 0)
746)]
747#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
748pub unsafe fn _tile_cvtrowps2phhi<const TILE: i32, const ROW: i32>() -> __m512h {
749    static_assert_uimm_bits!(TILE, 3);
750    static_assert_uimm_bits!(ROW, 6);
751    tcvtrowps2phhi(TILE as i8, ROW as u32).as_m512h()
752}
753
754/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
755/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
756/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
757/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
758#[inline]
759#[target_feature(enable = "amx-avx512,avx10.2")]
760#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tcvtrowps2phh))]
761#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
762pub unsafe fn __tile_cvtrowps2phh(src: __tile1024i, row: u32) -> __m512h {
763    tcvtrowps2phh_internal(src.rows, src.colsb, src.tile, row).as_m512h()
764}
765
766/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
767/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
768/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
769#[inline]
770#[rustc_legacy_const_generics(0)]
771#[target_feature(enable = "amx-avx512,avx10.2")]
772#[cfg_attr(
773    all(test, not(target_vendor = "apple")),
774    assert_instr(tcvtrowps2phl, TILE = 0)
775)]
776#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
777pub unsafe fn _tile_cvtrowps2phl<const TILE: i32>(row: u32) -> __m512h {
778    static_assert_uimm_bits!(TILE, 3);
779    tcvtrowps2phl(TILE as i8, row).as_m512h()
780}
781
782/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
783/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
784/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
785#[inline]
786#[rustc_legacy_const_generics(0, 1)]
787#[target_feature(enable = "amx-avx512,avx10.2")]
788#[cfg_attr(
789    all(test, not(target_vendor = "apple")),
790    assert_instr(tcvtrowps2phl, TILE = 0, ROW = 0)
791)]
792#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
793pub unsafe fn _tile_cvtrowps2phli<const TILE: i32, const ROW: i32>() -> __m512h {
794    static_assert_uimm_bits!(TILE, 3);
795    static_assert_uimm_bits!(ROW, 6);
796    tcvtrowps2phli(TILE as i8, ROW as u32).as_m512h()
797}
798
799/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
800/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting
801/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
802/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
803#[inline]
804#[target_feature(enable = "amx-avx512,avx10.2")]
805#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tcvtrowps2phl))]
806#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
807pub unsafe fn __tile_cvtrowps2phl(src: __tile1024i, row: u32) -> __m512h {
808    tcvtrowps2phl_internal(src.rows, src.colsb, src.tile, row).as_m512h()
809}
810
811/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
812/// floating-point elements to packed BF16 (16-bit) floating-point elements. The resulting
813/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
814#[inline]
815#[rustc_legacy_const_generics(0)]
816#[target_feature(enable = "amx-avx512,avx10.2")]
817#[cfg_attr(
818    all(test, not(target_vendor = "apple")),
819    assert_instr(tcvtrowps2bf16h, TILE = 0)
820)]
821#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
822pub unsafe fn _tile_cvtrowps2bf16h<const TILE: i32>(row: u32) -> __m512bh {
823    static_assert_uimm_bits!(TILE, 3);
824    tcvtrowps2bf16h(TILE as i8, row).as_m512bh()
825}
826
827/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
828/// floating-point elements to packed BF16 (16-bit) floating-point elements. The resulting
829/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
830#[inline]
831#[rustc_legacy_const_generics(0, 1)]
832#[target_feature(enable = "amx-avx512,avx10.2")]
833#[cfg_attr(
834    all(test, not(target_vendor = "apple")),
835    assert_instr(tcvtrowps2bf16h, TILE = 0, ROW = 0)
836)]
837#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
838pub unsafe fn _tile_cvtrowps2bf16hi<const TILE: i32, const ROW: i32>() -> __m512bh {
839    static_assert_uimm_bits!(TILE, 3);
840    static_assert_uimm_bits!(ROW, 6);
841    tcvtrowps2bf16hi(TILE as i8, ROW as u32).as_m512bh()
842}
843
844/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
845/// floating-point elements to packed BF16 (16-bit) floating-point elements. The resulting
846/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector.
847/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
848#[inline]
849#[target_feature(enable = "amx-avx512,avx10.2")]
850#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tcvtrowps2bf16h))]
851#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
852pub unsafe fn __tile_cvtrowps2bf16h(src: __tile1024i, row: u32) -> __m512bh {
853    tcvtrowps2bf16h_internal(src.rows, src.colsb, src.tile, row).as_m512bh()
854}
855
856/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
857/// floating-point elements to packed BF16 (16-bit) floating-point elements. The resulting
858/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
859#[inline]
860#[rustc_legacy_const_generics(0)]
861#[target_feature(enable = "amx-avx512,avx10.2")]
862#[cfg_attr(
863    all(test, not(target_vendor = "apple")),
864    assert_instr(tcvtrowps2bf16l, TILE = 0)
865)]
866#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
867pub unsafe fn _tile_cvtrowps2bf16l<const TILE: i32>(row: u32) -> __m512bh {
868    static_assert_uimm_bits!(TILE, 3);
869    tcvtrowps2bf16l(TILE as i8, row).as_m512bh()
870}
871
872/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
873/// floating-point elements to packed BF16 (16-bit) floating-point elements. The resulting
874/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
875#[inline]
876#[rustc_legacy_const_generics(0, 1)]
877#[target_feature(enable = "amx-avx512,avx10.2")]
878#[cfg_attr(
879    all(test, not(target_vendor = "apple")),
880    assert_instr(tcvtrowps2bf16l, TILE = 0, ROW = 0)
881)]
882#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
883pub unsafe fn _tile_cvtrowps2bf16li<const TILE: i32, const ROW: i32>() -> __m512bh {
884    static_assert_uimm_bits!(TILE, 3);
885    static_assert_uimm_bits!(ROW, 6);
886    tcvtrowps2bf16li(TILE as i8, ROW as u32).as_m512bh()
887}
888
889/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit)
890/// floating-point elements to packed BF16 (16-bit) floating-point elements. The resulting
891/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector.
892/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
893#[inline]
894#[target_feature(enable = "amx-avx512,avx10.2")]
895#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tcvtrowps2bf16l))]
896#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
897pub unsafe fn __tile_cvtrowps2bf16l(src: __tile1024i, row: u32) -> __m512bh {
898    tcvtrowps2bf16l_internal(src.rows, src.colsb, src.tile, row).as_m512bh()
899}
900
901/// Moves one row of tile data into a zmm vector register
902#[inline]
903#[rustc_legacy_const_generics(0)]
904#[target_feature(enable = "amx-avx512,avx10.2")]
905#[cfg_attr(
906    all(test, not(target_vendor = "apple")),
907    assert_instr(tilemovrow, TILE = 0)
908)]
909#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
910pub unsafe fn _tile_movrow<const TILE: i32>(row: u32) -> __m512i {
911    static_assert_uimm_bits!(TILE, 3);
912    tilemovrow(TILE as i8, row).as_m512i()
913}
914
915/// Moves one row of tile data into a zmm vector register
916#[inline]
917#[rustc_legacy_const_generics(0, 1)]
918#[target_feature(enable = "amx-avx512,avx10.2")]
919#[cfg_attr(
920    all(test, not(target_vendor = "apple")),
921    assert_instr(tilemovrow, TILE = 0, ROW = 0)
922)]
923#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
924pub unsafe fn _tile_movrowi<const TILE: i32, const ROW: i32>() -> __m512i {
925    static_assert_uimm_bits!(TILE, 3);
926    static_assert_uimm_bits!(ROW, 6);
927    tilemovrowi(TILE as i8, ROW as u32).as_m512i()
928}
929
930/// Moves one row of tile data into a zmm vector register
931/// The shape of the tile is specified in the struct of [`__tile1024i`]. The register of the tile is allocated by the compiler.
932#[inline]
933#[target_feature(enable = "amx-avx512,avx10.2")]
934#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(tilemovrow))]
935#[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
936pub unsafe fn __tile_movrow(src: __tile1024i, row: u32) -> __m512i {
937    tilemovrow_internal(src.rows, src.colsb, src.tile, row).as_m512i()
938}
939
// Declarations of the LLVM intrinsics (`llvm.x86.*`) that back the public wrappers in this file.
//
// Most operations come in two forms:
//  - a plain form whose tile operands are `i8` values (the wrappers pass their const tile
//    parameters here), and
//  - an `.internal` form that takes the tile shape (`rows`/`colsb`, or `m`/`n`/`k` for the
//    multiply-accumulate operations) together with `Tile` values, and returns the result.
//
// NOTE(review): `improper_ctypes` is presumably allowed because `Tile` and the SIMD vector
// types are not FFI-safe in the eyes of the lint; confirm before removing.
#[allow(improper_ctypes)]
unsafe extern "unadjusted" {
    // Tile configuration load/store.
    #[link_name = "llvm.x86.ldtilecfg"]
    fn ldtilecfg(mem_addr: *const u8);
    #[link_name = "llvm.x86.sttilecfg"]
    fn sttilecfg(mem_addr: *mut u8);

    // Tile loads.
    #[link_name = "llvm.x86.tileloadd64"]
    fn tileloadd64(dst: i8, base: *const u8, stride: u64);
    #[link_name = "llvm.x86.tileloadd64.internal"]
    fn tileloadd64_internal(rows: u16, colsb: u16, base: *const u8, stride: u64) -> Tile;

    #[link_name = "llvm.x86.tileloaddt164"]
    fn tileloaddt164(dst: i8, base: *const u8, stride: u64);
    #[link_name = "llvm.x86.tileloaddt164.internal"]
    fn tileloaddt164_internal(rows: u16, colsb: u16, base: *const u8, stride: u64) -> Tile;

    // Tile release, store and zero.
    #[link_name = "llvm.x86.tilerelease"]
    fn tilerelease();

    #[link_name = "llvm.x86.tilestored64"]
    fn tilestored64(dst: i8, base: *mut u8, stride: u64);
    #[link_name = "llvm.x86.tilestored64.internal"]
    fn tilestored64_internal(rows: u16, colsb: u16, base: *mut u8, stride: u64, src: Tile);

    #[link_name = "llvm.x86.tilezero"]
    fn tilezero(dst: i8);
    #[link_name = "llvm.x86.tilezero.internal"]
    fn tilezero_internal(rows: u16, colsb: u16) -> Tile;

    // Multiply-accumulate operations: `dst` is both an input (accumulator) and the result.
    #[link_name = "llvm.x86.tdpbf16ps"]
    fn tdpbf16ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbf16ps.internal"]
    fn tdpbf16ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdpbuud"]
    fn tdpbuud(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbuud.internal"]
    fn tdpbuud_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdpbusd"]
    fn tdpbusd(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbusd.internal"]
    fn tdpbusd_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdpbsud"]
    fn tdpbsud(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbsud.internal"]
    fn tdpbsud_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdpbssd"]
    fn tdpbssd(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbssd.internal"]
    fn tdpbssd_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdpfp16ps"]
    fn tdpfp16ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpfp16ps.internal"]
    fn tdpfp16ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tcmmimfp16ps"]
    fn tcmmimfp16ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tcmmimfp16ps.internal"]
    fn tcmmimfp16ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tcmmrlfp16ps"]
    fn tcmmrlfp16ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tcmmrlfp16ps.internal"]
    fn tcmmrlfp16ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    // FP8 dot-products (used by the `amx-fp8` wrappers).
    #[link_name = "llvm.x86.tdpbf8ps"]
    fn tdpbf8ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbf8ps.internal"]
    fn tdpbf8ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdpbhf8ps"]
    fn tdpbhf8ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdpbhf8ps.internal"]
    fn tdpbhf8ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdphbf8ps"]
    fn tdphbf8ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdphbf8ps.internal"]
    fn tdphbf8ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    #[link_name = "llvm.x86.tdphf8ps"]
    fn tdphf8ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tdphf8ps.internal"]
    fn tdphf8ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    // Read-shared tile loads (used by the `amx-movrs` wrappers).
    #[link_name = "llvm.x86.tileloaddrs64"]
    fn tileloaddrs64(dst: i8, base: *const u8, stride: u64);
    #[link_name = "llvm.x86.tileloaddrs64.internal"]
    fn tileloaddrs64_internal(rows: u16, colsb: u16, base: *const u8, stride: u64) -> Tile;

    #[link_name = "llvm.x86.tileloaddrst164"]
    fn tileloaddrst164(dst: i8, base: *const u8, stride: u64);
    #[link_name = "llvm.x86.tileloaddrst164.internal"]
    fn tileloaddrst164_internal(rows: u16, colsb: u16, base: *const u8, stride: u64) -> Tile;

    // TF32 matrix multiply (used by the `amx-tf32` wrappers).
    #[link_name = "llvm.x86.tmmultf32ps"]
    fn tmmultf32ps(dst: i8, a: i8, b: i8);
    #[link_name = "llvm.x86.tmmultf32ps.internal"]
    fn tmmultf32ps_internal(m: u16, n: u16, k: u16, dst: Tile, a: Tile, b: Tile) -> Tile;

    // Tile row to 512-bit vector moves/conversions. Each has three forms: runtime row index,
    // const row index (name ending in `i`), and `.internal` taking the tile by value.
    #[link_name = "llvm.x86.tcvtrowd2ps"]
    fn tcvtrowd2ps(tile: i8, row: u32) -> f32x16;
    #[link_name = "llvm.x86.tcvtrowd2psi"]
    fn tcvtrowd2psi(tile: i8, row: u32) -> f32x16;
    #[link_name = "llvm.x86.tcvtrowd2ps.internal"]
    fn tcvtrowd2ps_internal(rows: u16, colsb: u16, src: Tile, row: u32) -> f32x16;

    #[link_name = "llvm.x86.tcvtrowps2phh"]
    fn tcvtrowps2phh(tile: i8, row: u32) -> f16x32;
    #[link_name = "llvm.x86.tcvtrowps2phhi"]
    fn tcvtrowps2phhi(tile: i8, row: u32) -> f16x32;
    #[link_name = "llvm.x86.tcvtrowps2phh.internal"]
    fn tcvtrowps2phh_internal(rows: u16, colsb: u16, src: Tile, row: u32) -> f16x32;

    #[link_name = "llvm.x86.tcvtrowps2phl"]
    fn tcvtrowps2phl(tile: i8, row: u32) -> f16x32;
    #[link_name = "llvm.x86.tcvtrowps2phli"]
    fn tcvtrowps2phli(tile: i8, row: u32) -> f16x32;
    #[link_name = "llvm.x86.tcvtrowps2phl.internal"]
    fn tcvtrowps2phl_internal(rows: u16, colsb: u16, src: Tile, row: u32) -> f16x32;

    #[link_name = "llvm.x86.tcvtrowps2bf16h"]
    fn tcvtrowps2bf16h(tile: i8, row: u32) -> u16x32;
    #[link_name = "llvm.x86.tcvtrowps2bf16hi"]
    fn tcvtrowps2bf16hi(tile: i8, row: u32) -> u16x32;
    #[link_name = "llvm.x86.tcvtrowps2bf16h.internal"]
    fn tcvtrowps2bf16h_internal(rows: u16, colsb: u16, src: Tile, row: u32) -> u16x32;

    #[link_name = "llvm.x86.tcvtrowps2bf16l"]
    fn tcvtrowps2bf16l(tile: i8, row: u32) -> u16x32;
    #[link_name = "llvm.x86.tcvtrowps2bf16li"]
    fn tcvtrowps2bf16li(tile: i8, row: u32) -> u16x32;
    #[link_name = "llvm.x86.tcvtrowps2bf16l.internal"]
    fn tcvtrowps2bf16l_internal(rows: u16, colsb: u16, src: Tile, row: u32) -> u16x32;

    #[link_name = "llvm.x86.tilemovrow"]
    fn tilemovrow(tile: i8, row: u32) -> i32x16;
    #[link_name = "llvm.x86.tilemovrowi"]
    fn tilemovrowi(tile: i8, row: u32) -> i32x16;
    #[link_name = "llvm.x86.tilemovrow.internal"]
    fn tilemovrow_internal(rows: u16, colsb: u16, src: Tile, row: u32) -> i32x16;
}
1087
1088#[cfg(test)]
1089mod tests {
1090    use crate::core_arch::x86::_mm_cvtness_sbh;
1091    use crate::core_arch::x86_64::*;
1092    use core::array;
1093    use stdarch_test::simd_test;
1094    #[cfg(target_os = "linux")]
1095    use syscalls::{Sysno, syscall};
1096
    /// Test-side mirror of the 64-byte tile configuration payload that is passed to
    /// `_tile_loadconfig` / `_tile_storeconfig`.
    ///
    /// `repr(C, packed)` keeps the fields in declaration order with no padding, so the struct
    /// is exactly 1 + 1 + 14 + 16 + 16 + 8 + 8 = 64 bytes; the field order must not change.
    #[allow(non_camel_case_types)]
    #[repr(C, packed)]
    #[derive(Copy, Clone, Default, Debug, PartialEq)]
    struct __tilecfg {
        /// palette id: 0 or 1
        palette: u8,
        /// start row (byte 1 of the payload)
        start_row: u8,
        /// reserved, must be zero
        reserved_a0: [u8; 14],
        /// number of bytes of one row in each tile
        colsb: [u16; 8],
        /// reserved, must be zero
        reserved_b0: [u16; 8],
        /// number of rows in each tile
        rows: [u8; 8],
        /// reserved, must be zero
        reserved_c0: [u8; 8],
    }
1115
1116    impl __tilecfg {
1117        fn new(palette: u8, start_row: u8, colsb: [u16; 8], rows: [u8; 8]) -> Self {
1118            Self {
1119                palette,
1120                start_row,
1121                reserved_a0: [0u8; 14],
1122                colsb,
1123                reserved_b0: [0u16; 8],
1124                rows,
1125                reserved_c0: [0u8; 8],
1126            }
1127        }
1128
1129        const fn as_ptr(&self) -> *const u8 {
1130            self as *const Self as *const u8
1131        }
1132
1133        fn as_mut_ptr(&mut self) -> *mut u8 {
1134            self as *mut Self as *mut u8
1135        }
1136    }
1137
    /// AMX initialisation hook for non-Linux targets: a no-op, so tests can call `_init_amx()`
    /// unconditionally.
    #[cfg(not(target_os = "linux"))]
    #[target_feature(enable = "amx-tile")]
    fn _init_amx() {}
1141
1142    #[cfg(target_os = "linux")]
1143    #[target_feature(enable = "amx-tile")]
1144    #[inline]
1145    fn _init_amx() {
1146        let mut ret: usize;
1147        let mut xfeatures: usize = 0;
1148        ret = unsafe {
1149            syscall!(Sysno::arch_prctl, 0x1022, &raw mut xfeatures)
1150                .expect("arch_prctl ARCH_GET_XCOMP_PERM syscall failed")
1151        };
1152        if ret != 0 {
1153            panic!("Failed to get XFEATURES");
1154        } else {
1155            match 0b11 & (xfeatures >> 17) {
1156                0 => panic!("AMX is not available"),
1157                1 => {
1158                    ret = unsafe {
1159                        syscall!(Sysno::arch_prctl, 0x1023, 18)
1160                            .expect("arch_prctl ARCH_REQ_XCOMP_PERM syscall failed")
1161                    };
1162                    if ret != 0 {
1163                        panic!("Failed to enable AMX");
1164                    }
1165                }
1166                3 => {}
1167                _ => unreachable!(),
1168            }
1169        }
1170    }
1171
1172    impl __tile1024i {
1173        #[inline]
1174        #[target_feature(enable = "amx-tile")]
1175        fn zeroed(rows: u16, colsb: u16) -> Self {
1176            Self {
1177                rows,
1178                colsb,
1179                tile: unsafe { super::tilezero_internal(rows, colsb) },
1180            }
1181        }
1182    }
1183
1184    #[simd_test(enable = "amx-tile")]
1185    fn test_tile_loadconfig() {
1186        unsafe {
1187            let config = __tilecfg::default();
1188            _tile_loadconfig(config.as_ptr());
1189            _tile_release();
1190        }
1191    }
1192
1193    #[simd_test(enable = "amx-tile")]
1194    fn test_tile_storeconfig() {
1195        unsafe {
1196            let config = __tilecfg::new(1, 0, [32; 8], [8; 8]);
1197            _tile_loadconfig(config.as_ptr());
1198            let mut _config = __tilecfg::default();
1199            _tile_storeconfig(_config.as_mut_ptr());
1200            _tile_release();
1201            assert_eq!(config, _config);
1202        }
1203    }
1204
1205    #[simd_test(enable = "amx-tile")]
1206    fn test_tile_zero() {
1207        unsafe {
1208            _init_amx();
1209            let mut config = __tilecfg::default();
1210            config.palette = 1;
1211            config.colsb[0] = 64;
1212            config.rows[0] = 16;
1213            _tile_loadconfig(config.as_ptr());
1214            _tile_zero::<0>();
1215            let mut out = [[1_i8; 64]; 16];
1216            _tile_stored::<0>(out.as_mut_ptr().cast(), 64);
1217            _tile_release();
1218            assert_eq!(out, [[0; 64]; 16]);
1219        }
1220    }
1221
1222    #[simd_test(enable = "amx-tile")]
1223    fn test__tile_zero() {
1224        unsafe {
1225            _init_amx();
1226
1227            let tile = __tile1024i::zeroed(16, 64);
1228
1229            let mut out = [[1_i8; 64]; 16];
1230            __tile_stored(out.as_mut_ptr().cast(), 64, tile);
1231
1232            assert_eq!(out, [[0; 64]; 16]);
1233        }
1234    }
1235
1236    #[simd_test(enable = "amx-tile")]
1237    fn test_tile_stored() {
1238        unsafe {
1239            _init_amx();
1240            let mut config = __tilecfg::default();
1241            config.palette = 1;
1242            config.colsb[0] = 64;
1243            config.rows[0] = 16;
1244            _tile_loadconfig(config.as_ptr());
1245            _tile_zero::<0>();
1246            let mut out = [[1_i8; 64]; 16];
1247            _tile_stored::<0>(out.as_mut_ptr().cast(), 64);
1248            _tile_release();
1249            assert_eq!(out, [[0; 64]; 16]);
1250        }
1251    }
1252
1253    #[simd_test(enable = "amx-tile")]
1254    fn test__tile_stored() {
1255        unsafe {
1256            _init_amx();
1257
1258            let tile = __tile1024i::zeroed(16, 64);
1259
1260            let mut out = [[1_i8; 64]; 16];
1261            __tile_stored(out.as_mut_ptr().cast(), 64, tile);
1262
1263            assert_eq!(out, [[0; 64]; 16]);
1264        }
1265    }
1266
1267    #[simd_test(enable = "amx-tile")]
1268    fn test_tile_loadd() {
1269        unsafe {
1270            _init_amx();
1271            let mut config = __tilecfg::default();
1272            config.palette = 1;
1273            config.colsb[0] = 64;
1274            config.rows[0] = 16;
1275            _tile_loadconfig(config.as_ptr());
1276            _tile_zero::<0>();
1277            let mat = [1_i8; 1024];
1278            _tile_loadd::<0>(mat.as_ptr().cast(), 64);
1279            let mut out = [[0_i8; 64]; 16];
1280            _tile_stored::<0>(out.as_mut_ptr().cast(), 64);
1281            _tile_release();
1282            assert_eq!(out, [[1; 64]; 16]);
1283        }
1284    }
1285
1286    #[simd_test(enable = "amx-tile")]
1287    fn test__tile_loadd() {
1288        unsafe {
1289            _init_amx();
1290
1291            let mut tile = __tile1024i::zeroed(16, 64);
1292
1293            let mat = [1_i8; 1024];
1294            __tile_loadd(&mut tile, mat.as_ptr().cast(), 64);
1295            let mut out = [[0_i8; 64]; 16];
1296            __tile_stored(out.as_mut_ptr().cast(), 64, tile);
1297
1298            assert_eq!(out, [[1; 64]; 16]);
1299        }
1300    }
1301
1302    #[simd_test(enable = "amx-tile")]
1303    fn test_tile_stream_loadd() {
1304        unsafe {
1305            _init_amx();
1306            let mut config = __tilecfg::default();
1307            config.palette = 1;
1308            config.colsb[0] = 64;
1309            config.rows[0] = 16;
1310            _tile_loadconfig(config.as_ptr());
1311            _tile_zero::<0>();
1312            let mat = [1_i8; 1024];
1313            _tile_stream_loadd::<0>(mat.as_ptr().cast(), 64);
1314            let mut out = [[0_i8; 64]; 16];
1315            _tile_stored::<0>(out.as_mut_ptr().cast(), 64);
1316            _tile_release();
1317            assert_eq!(out, [[1; 64]; 16]);
1318        }
1319    }
1320
1321    #[simd_test(enable = "amx-tile")]
1322    fn test__tile_stream_loadd() {
1323        unsafe {
1324            _init_amx();
1325
1326            let mut tile = __tile1024i::zeroed(16, 64);
1327
1328            let mat = [1_i8; 1024];
1329            __tile_stream_loadd(&mut tile, mat.as_ptr().cast(), 64);
1330            let mut out = [[0_i8; 64]; 16];
1331            __tile_stored(out.as_mut_ptr().cast(), 64, tile);
1332
1333            assert_eq!(out, [[1; 64]; 16]);
1334        }
1335    }
1336
1337    #[simd_test(enable = "amx-tile")]
1338    fn test_tile_release() {
1339        unsafe {
1340            _tile_release();
1341        }
1342    }
1343
1344    const BF16_1: u16 = 0x3f80;
1345    const BF16_2: u16 = 0x4000;
1346
1347    #[simd_test(enable = "amx-bf16")]
1348    fn test_tile_dpbf16ps() {
1349        unsafe {
1350            _init_amx();
1351            let ones = [BF16_1; 512];
1352            let twos = [BF16_2; 512];
1353            let mut res = [[0f32; 16]; 16];
1354            let mut config = __tilecfg::default();
1355            config.palette = 1;
1356            (0..=2).for_each(|i| {
1357                config.colsb[i] = 64;
1358                config.rows[i] = 16;
1359            });
1360            _tile_loadconfig(config.as_ptr());
1361            _tile_zero::<0>();
1362            _tile_loadd::<1>(ones.as_ptr().cast(), 64);
1363            _tile_loadd::<2>(twos.as_ptr().cast(), 64);
1364            _tile_dpbf16ps::<0, 1, 2>();
1365            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1366            _tile_release();
1367            assert_eq!(res, [[64f32; 16]; 16]);
1368        }
1369    }
1370
1371    #[simd_test(enable = "amx-bf16")]
1372    fn test__tile_dpbf16ps() {
1373        unsafe {
1374            _init_amx();
1375            let ones = [BF16_1; 512];
1376            let twos = [BF16_2; 512];
1377            let mut res = [[0f32; 16]; 16];
1378
1379            let mut a = __tile1024i::zeroed(16, 64);
1380            let mut b = __tile1024i::zeroed(16, 64);
1381            let mut c = __tile1024i::zeroed(16, 64);
1382
1383            __tile_loadd(&mut a, ones.as_ptr().cast(), 64);
1384            __tile_loadd(&mut b, twos.as_ptr().cast(), 64);
1385            __tile_dpbf16ps(&mut c, a, b);
1386            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1387
1388            assert_eq!(res, [[64f32; 16]; 16]);
1389        }
1390    }
1391
1392    #[simd_test(enable = "amx-int8")]
1393    fn test_tile_dpbssd() {
1394        unsafe {
1395            _init_amx();
1396            let ones = [-1_i8; 1024];
1397            let twos = [-2_i8; 1024];
1398            let mut res = [[0_i32; 16]; 16];
1399            let mut config = __tilecfg::default();
1400            config.palette = 1;
1401            (0..=2).for_each(|i| {
1402                config.colsb[i] = 64;
1403                config.rows[i] = 16;
1404            });
1405            _tile_loadconfig(config.as_ptr());
1406            _tile_zero::<0>();
1407            _tile_loadd::<1>(ones.as_ptr().cast(), 64);
1408            _tile_loadd::<2>(twos.as_ptr().cast(), 64);
1409            _tile_dpbssd::<0, 1, 2>();
1410            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1411            _tile_release();
1412            assert_eq!(res, [[128_i32; 16]; 16]);
1413        }
1414    }
1415
1416    #[simd_test(enable = "amx-int8")]
1417    fn test__tile_dpbssd() {
1418        unsafe {
1419            _init_amx();
1420            let ones = [-1_i8; 1024];
1421            let twos = [-2_i8; 1024];
1422            let mut res = [[0_i32; 16]; 16];
1423
1424            let mut a = __tile1024i::zeroed(16, 64);
1425            let mut b = __tile1024i::zeroed(16, 64);
1426            let mut c = __tile1024i::zeroed(16, 64);
1427
1428            __tile_loadd(&mut a, ones.as_ptr().cast(), 64);
1429            __tile_loadd(&mut b, twos.as_ptr().cast(), 64);
1430            __tile_dpbssd(&mut c, a, b);
1431            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1432
1433            assert_eq!(res, [[128_i32; 16]; 16]);
1434        }
1435    }
1436
1437    #[simd_test(enable = "amx-int8")]
1438    fn test_tile_dpbsud() {
1439        unsafe {
1440            _init_amx();
1441            let ones = [-1_i8; 1024];
1442            let twos = [2_u8; 1024];
1443            let mut res = [[0_i32; 16]; 16];
1444            let mut config = __tilecfg::default();
1445            config.palette = 1;
1446            (0..=2).for_each(|i| {
1447                config.colsb[i] = 64;
1448                config.rows[i] = 16;
1449            });
1450            _tile_loadconfig(config.as_ptr());
1451            _tile_zero::<0>();
1452            _tile_loadd::<1>(ones.as_ptr().cast(), 64);
1453            _tile_loadd::<2>(twos.as_ptr(), 64);
1454            _tile_dpbsud::<0, 1, 2>();
1455            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1456            _tile_release();
1457            assert_eq!(res, [[-128_i32; 16]; 16]);
1458        }
1459    }
1460
1461    #[simd_test(enable = "amx-int8")]
1462    fn test__tile_dpbsud() {
1463        unsafe {
1464            _init_amx();
1465            let ones = [-1_i8; 1024];
1466            let twos = [2_u8; 1024];
1467            let mut res = [[0_i32; 16]; 16];
1468
1469            let mut a = __tile1024i::zeroed(16, 64);
1470            let mut b = __tile1024i::zeroed(16, 64);
1471            let mut c = __tile1024i::zeroed(16, 64);
1472
1473            __tile_loadd(&mut a, ones.as_ptr().cast(), 64);
1474            __tile_loadd(&mut b, twos.as_ptr(), 64);
1475            __tile_dpbsud(&mut c, a, b);
1476            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1477
1478            assert_eq!(res, [[-128_i32; 16]; 16]);
1479        }
1480    }
1481
1482    #[simd_test(enable = "amx-int8")]
1483    fn test_tile_dpbusd() {
1484        unsafe {
1485            _init_amx();
1486            let ones = [1_u8; 1024];
1487            let twos = [-2_i8; 1024];
1488            let mut res = [[0_i32; 16]; 16];
1489            let mut config = __tilecfg::default();
1490            config.palette = 1;
1491            (0..=2).for_each(|i| {
1492                config.colsb[i] = 64;
1493                config.rows[i] = 16;
1494            });
1495            _tile_loadconfig(config.as_ptr());
1496            _tile_zero::<0>();
1497            _tile_loadd::<1>(ones.as_ptr(), 64);
1498            _tile_loadd::<2>(twos.as_ptr().cast(), 64);
1499            _tile_dpbusd::<0, 1, 2>();
1500            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1501            _tile_release();
1502            assert_eq!(res, [[-128_i32; 16]; 16]);
1503        }
1504    }
1505
1506    #[simd_test(enable = "amx-int8")]
1507    fn test__tile_dpbusd() {
1508        unsafe {
1509            _init_amx();
1510            let ones = [1_u8; 1024];
1511            let twos = [-2_i8; 1024];
1512            let mut res = [[0_i32; 16]; 16];
1513
1514            let mut a = __tile1024i::zeroed(16, 64);
1515            let mut b = __tile1024i::zeroed(16, 64);
1516            let mut c = __tile1024i::zeroed(16, 64);
1517
1518            __tile_loadd(&mut a, ones.as_ptr(), 64);
1519            __tile_loadd(&mut b, twos.as_ptr().cast(), 64);
1520            __tile_dpbusd(&mut c, a, b);
1521            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1522
1523            assert_eq!(res, [[-128_i32; 16]; 16]);
1524        }
1525    }
1526
1527    #[simd_test(enable = "amx-int8")]
1528    fn test_tile_dpbuud() {
1529        unsafe {
1530            _init_amx();
1531            let ones = [1_u8; 1024];
1532            let twos = [2_u8; 1024];
1533            let mut res = [[0_i32; 16]; 16];
1534            let mut config = __tilecfg::default();
1535            config.palette = 1;
1536            (0..=2).for_each(|i| {
1537                config.colsb[i] = 64;
1538                config.rows[i] = 16;
1539            });
1540            _tile_loadconfig(config.as_ptr());
1541            _tile_zero::<0>();
1542            _tile_loadd::<1>(ones.as_ptr(), 64);
1543            _tile_loadd::<2>(twos.as_ptr(), 64);
1544            _tile_dpbuud::<0, 1, 2>();
1545            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1546            _tile_release();
1547            assert_eq!(res, [[128_i32; 16]; 16]);
1548        }
1549    }
1550
1551    #[simd_test(enable = "amx-int8")]
1552    fn test__tile_dpbuud() {
1553        unsafe {
1554            _init_amx();
1555            let ones = [1_u8; 1024];
1556            let twos = [2_u8; 1024];
1557            let mut res = [[0_i32; 16]; 16];
1558
1559            let mut a = __tile1024i::zeroed(16, 64);
1560            let mut b = __tile1024i::zeroed(16, 64);
1561            let mut c = __tile1024i::zeroed(16, 64);
1562
1563            __tile_loadd(&mut a, ones.as_ptr(), 64);
1564            __tile_loadd(&mut b, twos.as_ptr(), 64);
1565            __tile_dpbuud(&mut c, a, b);
1566            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1567
1568            assert_eq!(res, [[128_i32; 16]; 16]);
1569        }
1570    }
1571
1572    #[simd_test(enable = "amx-fp16")]
1573    fn test_tile_dpfp16ps() {
1574        unsafe {
1575            _init_amx();
1576            let ones = [1f16; 512];
1577            let twos = [2f16; 512];
1578            let mut res = [[0f32; 16]; 16];
1579            let mut config = __tilecfg::default();
1580            config.palette = 1;
1581            (0..=2).for_each(|i| {
1582                config.colsb[i] = 64;
1583                config.rows[i] = 16;
1584            });
1585            _tile_loadconfig(config.as_ptr());
1586            _tile_zero::<0>();
1587            _tile_loadd::<1>(ones.as_ptr().cast(), 64);
1588            _tile_loadd::<2>(twos.as_ptr().cast(), 64);
1589            _tile_dpfp16ps::<0, 1, 2>();
1590            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1591            _tile_release();
1592            assert_eq!(res, [[64f32; 16]; 16]);
1593        }
1594    }
1595
1596    #[simd_test(enable = "amx-fp16")]
1597    fn test__tile_dpfp16ps() {
1598        unsafe {
1599            _init_amx();
1600            let ones = [1f16; 512];
1601            let twos = [2f16; 512];
1602            let mut res = [[0f32; 16]; 16];
1603
1604            let mut a = __tile1024i::zeroed(16, 64);
1605            let mut b = __tile1024i::zeroed(16, 64);
1606            let mut c = __tile1024i::zeroed(16, 64);
1607
1608            __tile_loadd(&mut a, ones.as_ptr().cast(), 64);
1609            __tile_loadd(&mut b, twos.as_ptr().cast(), 64);
1610            __tile_dpfp16ps(&mut c, a, b);
1611            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1612
1613            assert_eq!(res, [[64f32; 16]; 16]);
1614        }
1615    }
1616
1617    #[simd_test(enable = "amx-complex")]
1618    fn test_tile_cmmimfp16ps() {
1619        unsafe {
1620            _init_amx();
1621            let ones = [1f16; 512];
1622            let twos = [2f16; 512];
1623            let mut res = [[0f32; 16]; 16];
1624            let mut config = __tilecfg::default();
1625            config.palette = 1;
1626            (0..=2).for_each(|i| {
1627                config.colsb[i] = 64;
1628                config.rows[i] = 16;
1629            });
1630            _tile_loadconfig(config.as_ptr());
1631            _tile_zero::<0>();
1632            _tile_loadd::<1>(ones.as_ptr().cast(), 64);
1633            _tile_loadd::<2>(twos.as_ptr().cast(), 64);
1634            _tile_cmmimfp16ps::<0, 1, 2>();
1635            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1636            _tile_release();
1637            assert_eq!(res, [[64f32; 16]; 16]);
1638        }
1639    }
1640
1641    #[simd_test(enable = "amx-complex")]
1642    fn test__tile_cmmimfp16ps() {
1643        unsafe {
1644            _init_amx();
1645            let ones = [1f16; 512];
1646            let twos = [2f16; 512];
1647            let mut res = [[0f32; 16]; 16];
1648
1649            let mut a = __tile1024i::zeroed(16, 64);
1650            let mut b = __tile1024i::zeroed(16, 64);
1651            let mut c = __tile1024i::zeroed(16, 64);
1652
1653            __tile_loadd(&mut a, ones.as_ptr().cast(), 64);
1654            __tile_loadd(&mut b, twos.as_ptr().cast(), 64);
1655            __tile_cmmimfp16ps(&mut c, a, b);
1656            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1657
1658            assert_eq!(res, [[64f32; 16]; 16]);
1659        }
1660    }
1661
1662    #[simd_test(enable = "amx-complex")]
1663    fn test_tile_cmmrlfp16ps() {
1664        unsafe {
1665            _init_amx();
1666            let ones = [1f16; 512];
1667            let twos = [2f16; 512];
1668            let mut res = [[0f32; 16]; 16];
1669            let mut config = __tilecfg::default();
1670            config.palette = 1;
1671            (0..=2).for_each(|i| {
1672                config.colsb[i] = 64;
1673                config.rows[i] = 16;
1674            });
1675            _tile_loadconfig(config.as_ptr());
1676            _tile_zero::<0>();
1677            _tile_loadd::<1>(ones.as_ptr().cast(), 64);
1678            _tile_loadd::<2>(twos.as_ptr().cast(), 64);
1679            _tile_cmmrlfp16ps::<0, 1, 2>();
1680            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1681            _tile_release();
1682            assert_eq!(res, [[0f32; 16]; 16]);
1683        }
1684    }
1685
1686    #[simd_test(enable = "amx-complex")]
1687    fn test__tile_cmmrlfp16ps() {
1688        unsafe {
1689            _init_amx();
1690            let ones = [1f16; 512];
1691            let twos = [2f16; 512];
1692            let mut res = [[0f32; 16]; 16];
1693
1694            let mut a = __tile1024i::zeroed(16, 64);
1695            let mut b = __tile1024i::zeroed(16, 64);
1696            let mut c = __tile1024i::zeroed(16, 64);
1697
1698            __tile_loadd(&mut a, ones.as_ptr().cast(), 64);
1699            __tile_loadd(&mut b, twos.as_ptr().cast(), 64);
1700            __tile_cmmrlfp16ps(&mut c, a, b);
1701            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1702
1703            assert_eq!(res, [[0f32; 16]; 16]);
1704        }
1705    }
1706
1707    const BF8_ONE: u8 = 0x3c;
1708    const BF8_TWO: u8 = 0x40;
1709    const HF8_ONE: u8 = 0x38;
1710    const HF8_TWO: u8 = 0x40;
1711
1712    #[simd_test(enable = "amx-fp8")]
1713    fn test_tile_dpbf8ps() {
1714        unsafe {
1715            _init_amx();
1716            let ones = [BF8_ONE; 1024];
1717            let twos = [BF8_TWO; 1024];
1718            let mut res = [[0.0_f32; 16]; 16];
1719            let mut config = __tilecfg::default();
1720            config.palette = 1;
1721            (0..=2).for_each(|i| {
1722                config.colsb[i] = 64;
1723                config.rows[i] = 16;
1724            });
1725            _tile_loadconfig(config.as_ptr());
1726            _tile_zero::<0>();
1727            _tile_loadd::<1>(ones.as_ptr(), 64);
1728            _tile_loadd::<2>(twos.as_ptr(), 64);
1729            _tile_dpbf8ps::<0, 1, 2>();
1730            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1731            _tile_release();
1732            assert_eq!(res, [[128.0_f32; 16]; 16]);
1733        }
1734    }
1735
1736    #[simd_test(enable = "amx-fp8")]
1737    fn test__tile_dpbf8ps() {
1738        unsafe {
1739            _init_amx();
1740            let ones = [BF8_ONE; 1024];
1741            let twos = [BF8_TWO; 1024];
1742            let mut res = [[0.0_f32; 16]; 16];
1743
1744            let mut a = __tile1024i::zeroed(16, 64);
1745            let mut b = __tile1024i::zeroed(16, 64);
1746            let mut c = __tile1024i::zeroed(16, 64);
1747
1748            __tile_loadd(&mut a, ones.as_ptr(), 64);
1749            __tile_loadd(&mut b, twos.as_ptr(), 64);
1750            __tile_dpbf8ps(&mut c, a, b);
1751            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1752
1753            assert_eq!(res, [[128.0_f32; 16]; 16]);
1754        }
1755    }
1756
1757    #[simd_test(enable = "amx-fp8")]
1758    fn test_tile_dpbhf8ps() {
1759        unsafe {
1760            _init_amx();
1761            let ones = [BF8_ONE; 1024];
1762            let twos = [HF8_TWO; 1024];
1763            let mut res = [[0.0_f32; 16]; 16];
1764            let mut config = __tilecfg::default();
1765            config.palette = 1;
1766            (0..=2).for_each(|i| {
1767                config.colsb[i] = 64;
1768                config.rows[i] = 16;
1769            });
1770            _tile_loadconfig(config.as_ptr());
1771            _tile_zero::<0>();
1772            _tile_loadd::<1>(ones.as_ptr(), 64);
1773            _tile_loadd::<2>(twos.as_ptr(), 64);
1774            _tile_dpbhf8ps::<0, 1, 2>();
1775            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1776            _tile_release();
1777            assert_eq!(res, [[128.0_f32; 16]; 16]);
1778        }
1779    }
1780
1781    #[simd_test(enable = "amx-fp8")]
1782    fn test__tile_dpbhf8ps() {
1783        unsafe {
1784            _init_amx();
1785            let ones = [BF8_ONE; 1024];
1786            let twos = [HF8_TWO; 1024];
1787            let mut res = [[0.0_f32; 16]; 16];
1788
1789            let mut a = __tile1024i::zeroed(16, 64);
1790            let mut b = __tile1024i::zeroed(16, 64);
1791            let mut c = __tile1024i::zeroed(16, 64);
1792
1793            __tile_loadd(&mut a, ones.as_ptr(), 64);
1794            __tile_loadd(&mut b, twos.as_ptr(), 64);
1795            __tile_dpbhf8ps(&mut c, a, b);
1796            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1797
1798            assert_eq!(res, [[128.0_f32; 16]; 16]);
1799        }
1800    }
1801
1802    #[simd_test(enable = "amx-fp8")]
1803    fn test_tile_dphbf8ps() {
1804        unsafe {
1805            _init_amx();
1806            let ones = [HF8_ONE; 1024];
1807            let twos = [BF8_TWO; 1024];
1808            let mut res = [[0.0_f32; 16]; 16];
1809            let mut config = __tilecfg::default();
1810            config.palette = 1;
1811            (0..=2).for_each(|i| {
1812                config.colsb[i] = 64;
1813                config.rows[i] = 16;
1814            });
1815            _tile_loadconfig(config.as_ptr());
1816            _tile_zero::<0>();
1817            _tile_loadd::<1>(ones.as_ptr(), 64);
1818            _tile_loadd::<2>(twos.as_ptr(), 64);
1819            _tile_dphbf8ps::<0, 1, 2>();
1820            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1821            _tile_release();
1822            assert_eq!(res, [[128.0_f32; 16]; 16]);
1823        }
1824    }
1825
1826    #[simd_test(enable = "amx-fp8")]
1827    fn test__tile_dphbf8ps() {
1828        unsafe {
1829            _init_amx();
1830            let ones = [HF8_ONE; 1024];
1831            let twos = [BF8_TWO; 1024];
1832            let mut res = [[0.0_f32; 16]; 16];
1833
1834            let mut a = __tile1024i::zeroed(16, 64);
1835            let mut b = __tile1024i::zeroed(16, 64);
1836            let mut c = __tile1024i::zeroed(16, 64);
1837
1838            __tile_loadd(&mut a, ones.as_ptr(), 64);
1839            __tile_loadd(&mut b, twos.as_ptr(), 64);
1840            __tile_dphbf8ps(&mut c, a, b);
1841            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1842
1843            assert_eq!(res, [[128.0_f32; 16]; 16]);
1844        }
1845    }
1846
1847    #[simd_test(enable = "amx-fp8")]
1848    fn test_tile_dphf8ps() {
1849        unsafe {
1850            _init_amx();
1851            let ones = [HF8_ONE; 1024];
1852            let twos = [HF8_TWO; 1024];
1853            let mut res = [[0.0_f32; 16]; 16];
1854            let mut config = __tilecfg::default();
1855            config.palette = 1;
1856            (0..=2).for_each(|i| {
1857                config.colsb[i] = 64;
1858                config.rows[i] = 16;
1859            });
1860            _tile_loadconfig(config.as_ptr());
1861            _tile_zero::<0>();
1862            _tile_loadd::<1>(ones.as_ptr(), 64);
1863            _tile_loadd::<2>(twos.as_ptr(), 64);
1864            _tile_dphf8ps::<0, 1, 2>();
1865            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
1866            _tile_release();
1867            assert_eq!(res, [[128.0_f32; 16]; 16]);
1868        }
1869    }
1870
1871    #[simd_test(enable = "amx-fp8")]
1872    fn test__tile_dphf8ps() {
1873        unsafe {
1874            _init_amx();
1875            let ones = [HF8_ONE; 1024];
1876            let twos = [HF8_TWO; 1024];
1877            let mut res = [[0.0_f32; 16]; 16];
1878
1879            let mut a = __tile1024i::zeroed(16, 64);
1880            let mut b = __tile1024i::zeroed(16, 64);
1881            let mut c = __tile1024i::zeroed(16, 64);
1882
1883            __tile_loadd(&mut a, ones.as_ptr(), 64);
1884            __tile_loadd(&mut b, twos.as_ptr(), 64);
1885            __tile_dphf8ps(&mut c, a, b);
1886            __tile_stored(res.as_mut_ptr().cast(), 64, c);
1887
1888            assert_eq!(res, [[128.0_f32; 16]; 16]);
1889        }
1890    }
1891
1892    #[simd_test(enable = "amx-movrs")]
1893    fn test_tile_loaddrs() {
1894        unsafe {
1895            _init_amx();
1896            let mut config = __tilecfg::default();
1897            config.palette = 1;
1898            config.colsb[0] = 64;
1899            config.rows[0] = 16;
1900            _tile_loadconfig(config.as_ptr());
1901            _tile_zero::<0>();
1902            let mat = [1_i8; 1024];
1903            _tile_loaddrs::<0>(mat.as_ptr().cast(), 64);
1904            let mut out = [[0_i8; 64]; 16];
1905            _tile_stored::<0>(out.as_mut_ptr().cast(), 64);
1906            _tile_release();
1907            assert_eq!(out, [[1; 64]; 16]);
1908        }
1909    }
1910
1911    #[simd_test(enable = "amx-movrs")]
1912    fn test__tile_loaddrs() {
1913        unsafe {
1914            _init_amx();
1915
1916            let mut tile = __tile1024i::zeroed(16, 64);
1917
1918            let mat = [1_i8; 1024];
1919            __tile_loaddrs(&mut tile, mat.as_ptr().cast(), 64);
1920            let mut out = [[0_i8; 64]; 16];
1921            __tile_stored(out.as_mut_ptr().cast(), 64, tile);
1922
1923            assert_eq!(out, [[1; 64]; 16]);
1924        }
1925    }
1926
1927    #[simd_test(enable = "amx-movrs")]
1928    fn test_tile_stream_loaddrs() {
1929        unsafe {
1930            _init_amx();
1931            let mut config = __tilecfg::default();
1932            config.palette = 1;
1933            config.colsb[0] = 64;
1934            config.rows[0] = 16;
1935            _tile_loadconfig(config.as_ptr());
1936            _tile_zero::<0>();
1937            let mat = [1_i8; 1024];
1938            _tile_stream_loaddrs::<0>(mat.as_ptr().cast(), 64);
1939            let mut out = [[0_i8; 64]; 16];
1940            _tile_stored::<0>(out.as_mut_ptr().cast(), 64);
1941            _tile_release();
1942            assert_eq!(out, [[1; 64]; 16]);
1943        }
1944    }
1945
1946    #[simd_test(enable = "amx-movrs")]
1947    fn test__tile_stream_loaddrs() {
1948        unsafe {
1949            _init_amx();
1950
1951            let mut tile = __tile1024i::zeroed(16, 64);
1952
1953            let mat = [1_i8; 1024];
1954            __tile_stream_loaddrs(&mut tile, mat.as_ptr().cast(), 64);
1955            let mut out = [[0_i8; 64]; 16];
1956            __tile_stored(out.as_mut_ptr().cast(), 64, tile);
1957
1958            assert_eq!(out, [[1; 64]; 16]);
1959        }
1960    }
1961
1962    #[simd_test(enable = "amx-avx512,avx10.2")]
1963    fn test_tile_movrow() {
1964        unsafe {
1965            _init_amx();
1966            let array: [[u8; 64]; 16] = array::from_fn(|i| [i as _; _]);
1967
1968            let mut config = __tilecfg::default();
1969            config.palette = 1;
1970            config.colsb[0] = 64;
1971            config.rows[0] = 16;
1972            _tile_loadconfig(config.as_ptr());
1973            _tile_loadd::<0>(array.as_ptr().cast(), 64);
1974            for i in 0..16 {
1975                let row = _tile_movrow::<0>(i);
1976                assert_eq!(*row.as_u8x64().as_array(), [i as _; _]);
1977            }
1978        }
1979    }
1980
1981    macro_rules! wrap_imm4 {
1982        ($name:ident :: <$TILE:literal>, $row:expr) => {
1983            match $row {
1984                0 => $name::<$TILE, 0>(),
1985                1 => $name::<$TILE, 1>(),
1986                2 => $name::<$TILE, 2>(),
1987                3 => $name::<$TILE, 3>(),
1988                4 => $name::<$TILE, 4>(),
1989                5 => $name::<$TILE, 5>(),
1990                6 => $name::<$TILE, 6>(),
1991                7 => $name::<$TILE, 7>(),
1992                8 => $name::<$TILE, 8>(),
1993                9 => $name::<$TILE, 9>(),
1994                10 => $name::<$TILE, 10>(),
1995                11 => $name::<$TILE, 11>(),
1996                12 => $name::<$TILE, 12>(),
1997                13 => $name::<$TILE, 13>(),
1998                14 => $name::<$TILE, 14>(),
1999                15 => $name::<$TILE, 15>(),
2000                _ => panic!("row index out of range"),
2001            }
2002        };
2003    }
2004
2005    #[simd_test(enable = "amx-avx512,avx10.2")]
2006    fn test_tile_movrowi() {
2007        unsafe {
2008            _init_amx();
2009            let array: [[u8; 64]; 16] = array::from_fn(|i| [i as _; _]);
2010
2011            let mut config = __tilecfg::default();
2012            config.palette = 1;
2013            config.colsb[0] = 64;
2014            config.rows[0] = 16;
2015            _tile_loadconfig(config.as_ptr());
2016            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2017
2018            for i in 0..16 {
2019                let row = wrap_imm4!(_tile_movrowi::<0>, i);
2020                assert_eq!(*row.as_u8x64().as_array(), [i as _; _]);
2021            }
2022        }
2023    }
2024
2025    #[simd_test(enable = "amx-avx512,avx10.2")]
2026    fn test__tile_movrow() {
2027        unsafe {
2028            _init_amx();
2029            let array: [[u8; 64]; 16] = array::from_fn(|i| [i as _; _]);
2030
2031            let mut tile = __tile1024i::zeroed(16, 64);
2032            __tile_loadd(&mut tile, array.as_ptr().cast(), 64);
2033
2034            for i in 0..16 {
2035                let row = __tile_movrow(tile, i);
2036                assert_eq!(*row.as_u8x64().as_array(), [i as _; _]);
2037            }
2038        }
2039    }
2040
2041    #[simd_test(enable = "amx-avx512,avx10.2")]
2042    fn test_tile_cvtrowd2ps() {
2043        unsafe {
2044            _init_amx();
2045            let array: [[u32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2046
2047            let mut config = __tilecfg::default();
2048            config.palette = 1;
2049            config.colsb[0] = 64;
2050            config.rows[0] = 16;
2051            _tile_loadconfig(config.as_ptr());
2052            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2053            for i in 0..16 {
2054                let row = _tile_cvtrowd2ps::<0>(i);
2055                assert_eq!(*row.as_f32x16().as_array(), [i as _; _]);
2056            }
2057        }
2058    }
2059
2060    #[simd_test(enable = "amx-avx512,avx10.2")]
2061    fn test_tile_cvtrowd2psi() {
2062        unsafe {
2063            _init_amx();
2064            let array: [[u32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2065
2066            let mut config = __tilecfg::default();
2067            config.palette = 1;
2068            config.colsb[0] = 64;
2069            config.rows[0] = 16;
2070            _tile_loadconfig(config.as_ptr());
2071            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2072
2073            for i in 0..16 {
2074                let row = wrap_imm4!(_tile_cvtrowd2psi::<0>, i);
2075                assert_eq!(*row.as_f32x16().as_array(), [i as _; _]);
2076            }
2077        }
2078    }
2079
2080    #[simd_test(enable = "amx-avx512,avx10.2")]
2081    fn test__tile_cvtrowd2ps() {
2082        unsafe {
2083            _init_amx();
2084            let array: [[u32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2085
2086            let mut tile = __tile1024i::zeroed(16, 64);
2087            __tile_loadd(&mut tile, array.as_ptr().cast(), 64);
2088
2089            for i in 0..16 {
2090                let row = __tile_cvtrowd2ps(tile, i);
2091                assert_eq!(*row.as_f32x16().as_array(), [i as _; _]);
2092            }
2093        }
2094    }
2095
2096    #[simd_test(enable = "amx-avx512,avx10.2")]
2097    fn test_tile_cvtrowps2phh() {
2098        unsafe {
2099            _init_amx();
2100            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2101
2102            let mut config = __tilecfg::default();
2103            config.palette = 1;
2104            config.colsb[0] = 64;
2105            config.rows[0] = 16;
2106            _tile_loadconfig(config.as_ptr());
2107            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2108            for i in 0..16 {
2109                let row = _tile_cvtrowps2phh::<0>(i);
2110                assert_eq!(
2111                    *row.as_f16x32().as_array(),
2112                    array::from_fn(|j| if j & 1 == 0 { 0.0 } else { i as _ })
2113                );
2114            }
2115        }
2116    }
2117
2118    #[simd_test(enable = "amx-avx512,avx10.2")]
2119    fn test_tile_cvtrowps2phhi() {
2120        unsafe {
2121            _init_amx();
2122            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2123
2124            let mut config = __tilecfg::default();
2125            config.palette = 1;
2126            config.colsb[0] = 64;
2127            config.rows[0] = 16;
2128            _tile_loadconfig(config.as_ptr());
2129            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2130            for i in 0..16 {
2131                let row = wrap_imm4!(_tile_cvtrowps2phhi::<0>, i);
2132                assert_eq!(
2133                    *row.as_f16x32().as_array(),
2134                    array::from_fn(|j| if j & 1 == 0 { 0.0 } else { i as _ })
2135                );
2136            }
2137        }
2138    }
2139
2140    #[simd_test(enable = "amx-avx512,avx10.2")]
2141    fn test__tile_cvtrowps2phh() {
2142        unsafe {
2143            _init_amx();
2144            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2145
2146            let mut tile = __tile1024i::zeroed(16, 64);
2147            __tile_loadd(&mut tile, array.as_ptr().cast(), 64);
2148
2149            for i in 0..16 {
2150                let row = __tile_cvtrowps2phh(tile, i);
2151                assert_eq!(
2152                    *row.as_f16x32().as_array(),
2153                    array::from_fn(|j| if j & 1 == 0 { 0.0 } else { i as _ })
2154                );
2155            }
2156        }
2157    }
2158
2159    #[simd_test(enable = "amx-avx512,avx10.2")]
2160    fn test_tile_cvtrowps2phl() {
2161        unsafe {
2162            _init_amx();
2163            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2164
2165            let mut config = __tilecfg::default();
2166            config.palette = 1;
2167            config.colsb[0] = 64;
2168            config.rows[0] = 16;
2169            _tile_loadconfig(config.as_ptr());
2170            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2171            for i in 0..16 {
2172                let row = _tile_cvtrowps2phl::<0>(i);
2173                assert_eq!(
2174                    *row.as_f16x32().as_array(),
2175                    array::from_fn(|j| if j & 1 == 0 { i as _ } else { 0.0 })
2176                );
2177            }
2178        }
2179    }
2180
2181    #[simd_test(enable = "amx-avx512,avx10.2")]
2182    fn test_tile_cvtrowps2phli() {
2183        unsafe {
2184            _init_amx();
2185            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2186
2187            let mut config = __tilecfg::default();
2188            config.palette = 1;
2189            config.colsb[0] = 64;
2190            config.rows[0] = 16;
2191            _tile_loadconfig(config.as_ptr());
2192            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2193            for i in 0..16 {
2194                let row = wrap_imm4!(_tile_cvtrowps2phli::<0>, i);
2195                assert_eq!(
2196                    *row.as_f16x32().as_array(),
2197                    array::from_fn(|j| if j & 1 == 0 { i as _ } else { 0.0 })
2198                );
2199            }
2200        }
2201    }
2202
2203    #[simd_test(enable = "amx-avx512,avx10.2")]
2204    fn test__tile_cvtrowps2phl() {
2205        unsafe {
2206            _init_amx();
2207            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2208
2209            let mut tile = __tile1024i::zeroed(16, 64);
2210            __tile_loadd(&mut tile, array.as_ptr().cast(), 64);
2211
2212            for i in 0..16 {
2213                let row = __tile_cvtrowps2phl(tile, i);
2214                assert_eq!(
2215                    *row.as_f16x32().as_array(),
2216                    array::from_fn(|j| if j & 1 == 0 { i as _ } else { 0.0 })
2217                );
2218            }
2219        }
2220    }
2221
2222    #[simd_test(enable = "amx-avx512,avx10.2")]
2223    fn test_tile_cvtrowps2bf16h() {
2224        unsafe {
2225            _init_amx();
2226            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2227
2228            let mut config = __tilecfg::default();
2229            config.palette = 1;
2230            config.colsb[0] = 64;
2231            config.rows[0] = 16;
2232            _tile_loadconfig(config.as_ptr());
2233            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2234            for i in 0..16 {
2235                let row = _tile_cvtrowps2bf16h::<0>(i);
2236                assert_eq!(
2237                    *row.as_u16x32().as_array(),
2238                    array::from_fn(|j| if j & 1 == 0 {
2239                        0
2240                    } else {
2241                        _mm_cvtness_sbh(i as _).to_bits()
2242                    })
2243                );
2244            }
2245        }
2246    }
2247
2248    #[simd_test(enable = "amx-avx512,avx10.2")]
2249    fn test_tile_cvtrowps2bf16hi() {
2250        unsafe {
2251            _init_amx();
2252            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2253
2254            let mut config = __tilecfg::default();
2255            config.palette = 1;
2256            config.colsb[0] = 64;
2257            config.rows[0] = 16;
2258            _tile_loadconfig(config.as_ptr());
2259            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2260            for i in 0..16 {
2261                let row = wrap_imm4!(_tile_cvtrowps2bf16hi::<0>, i);
2262                assert_eq!(
2263                    *row.as_u16x32().as_array(),
2264                    array::from_fn(|j| if j & 1 == 0 {
2265                        0
2266                    } else {
2267                        _mm_cvtness_sbh(i as _).to_bits()
2268                    })
2269                );
2270            }
2271        }
2272    }
2273
2274    #[simd_test(enable = "amx-avx512,avx10.2")]
2275    fn test__tile_cvtrowps2bf16h() {
2276        unsafe {
2277            _init_amx();
2278            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2279
2280            let mut tile = __tile1024i::zeroed(16, 64);
2281            __tile_loadd(&mut tile, array.as_ptr().cast(), 64);
2282
2283            for i in 0..16 {
2284                let row = __tile_cvtrowps2bf16h(tile, i);
2285                assert_eq!(
2286                    *row.as_u16x32().as_array(),
2287                    array::from_fn(|j| if j & 1 == 0 {
2288                        0
2289                    } else {
2290                        _mm_cvtness_sbh(i as _).to_bits()
2291                    })
2292                );
2293            }
2294        }
2295    }
2296
2297    #[simd_test(enable = "amx-avx512,avx10.2")]
2298    fn test_tile_cvtrowps2bf16l() {
2299        unsafe {
2300            _init_amx();
2301            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2302
2303            let mut config = __tilecfg::default();
2304            config.palette = 1;
2305            config.colsb[0] = 64;
2306            config.rows[0] = 16;
2307            _tile_loadconfig(config.as_ptr());
2308            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2309            for i in 0..16 {
2310                let row = _tile_cvtrowps2bf16l::<0>(i);
2311                assert_eq!(
2312                    *row.as_u16x32().as_array(),
2313                    array::from_fn(|j| if j & 1 == 0 {
2314                        _mm_cvtness_sbh(i as _).to_bits()
2315                    } else {
2316                        0
2317                    })
2318                );
2319            }
2320        }
2321    }
2322
2323    #[simd_test(enable = "amx-avx512,avx10.2")]
2324    fn test_tile_cvtrowps2bf16li() {
2325        unsafe {
2326            _init_amx();
2327            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2328
2329            let mut config = __tilecfg::default();
2330            config.palette = 1;
2331            config.colsb[0] = 64;
2332            config.rows[0] = 16;
2333            _tile_loadconfig(config.as_ptr());
2334            _tile_loadd::<0>(array.as_ptr().cast(), 64);
2335            for i in 0..16 {
2336                let row = wrap_imm4!(_tile_cvtrowps2bf16li::<0>, i);
2337                assert_eq!(
2338                    *row.as_u16x32().as_array(),
2339                    array::from_fn(|j| if j & 1 == 0 {
2340                        _mm_cvtness_sbh(i as _).to_bits()
2341                    } else {
2342                        0
2343                    })
2344                );
2345            }
2346        }
2347    }
2348
2349    #[simd_test(enable = "amx-avx512,avx10.2")]
2350    fn test__tile_cvtrowps2bf16l() {
2351        unsafe {
2352            _init_amx();
2353            let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2354
2355            let mut tile = __tile1024i::zeroed(16, 64);
2356            __tile_loadd(&mut tile, array.as_ptr().cast(), 64);
2357
2358            for i in 0..16 {
2359                let row = __tile_cvtrowps2bf16l(tile, i);
2360                assert_eq!(
2361                    *row.as_u16x32().as_array(),
2362                    array::from_fn(|j| if j & 1 == 0 {
2363                        _mm_cvtness_sbh(i as _).to_bits()
2364                    } else {
2365                        0
2366                    })
2367                );
2368            }
2369        }
2370    }
2371
2372    #[simd_test(enable = "amx-tf32")]
2373    fn test_tile_mmultf32ps() {
2374        unsafe {
2375            _init_amx();
2376            let a: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2377            let b: [[f32; 16]; 16] = [array::from_fn(|j| j as _); _];
2378            let mut res = [[0.0; 16]; 16];
2379
2380            let mut config = __tilecfg::default();
2381            config.palette = 1;
2382            (0..=2).for_each(|i| {
2383                config.colsb[i] = 64;
2384                config.rows[i] = 16;
2385            });
2386            _tile_loadconfig(config.as_ptr());
2387            _tile_zero::<0>();
2388            _tile_loadd::<1>(a.as_ptr().cast(), 64);
2389            _tile_loadd::<2>(b.as_ptr().cast(), 64);
2390            _tile_mmultf32ps::<0, 1, 2>();
2391            _tile_stored::<0>(res.as_mut_ptr().cast(), 64);
2392            _tile_release();
2393
2394            let expected = array::from_fn(|i| array::from_fn(|j| 16.0 * i as f32 * j as f32));
2395            assert_eq!(res, expected);
2396        }
2397    }
2398
2399    #[simd_test(enable = "amx-tf32")]
2400    fn test__tile_mmultf32ps() {
2401        unsafe {
2402            _init_amx();
2403            let a: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]);
2404            let b: [[f32; 16]; 16] = [array::from_fn(|j| j as _); _];
2405            let mut res = [[0.0; 16]; 16];
2406
2407            let mut tile_a = __tile1024i::zeroed(16, 64);
2408            let mut tile_b = __tile1024i::zeroed(16, 64);
2409            let mut tile_c = __tile1024i::zeroed(16, 64);
2410
2411            __tile_loadd(&mut tile_a, a.as_ptr().cast(), 64);
2412            __tile_loadd(&mut tile_b, b.as_ptr().cast(), 64);
2413            __tile_mmultf32ps(&mut tile_c, tile_a, tile_b);
2414            __tile_stored(res.as_mut_ptr().cast(), 64, tile_c);
2415
2416            let expected = array::from_fn(|i| array::from_fn(|j| 16.0 * i as f32 * j as f32));
2417            assert_eq!(res, expected);
2418        }
2419    }
2420}