Skip to main content

core/stdarch/crates/core_arch/src/x86/
sse2.rs

1//! Streaming SIMD Extensions 2 (SSE2)
2
3#[cfg(test)]
4use stdarch_test::assert_instr;
5
6use crate::{
7    core_arch::{simd::*, x86::*},
8    intrinsics::simd::*,
9    intrinsics::sqrtf64,
10    mem, ptr,
11};
12
13/// Provides a hint to the processor that the code sequence is a spin-wait loop.
14///
15/// This can help improve the performance and power consumption of spin-wait
16/// loops.
17///
18/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
19#[inline]
20#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
21#[stable(feature = "simd_x86", since = "1.27.0")]
22pub fn _mm_pause() {
23    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
24    // the SSE2 target-feature - therefore it does not require any target features
25    unsafe { pause() }
26}
27
28/// Invalidates and flushes the cache line that contains `p` from all levels of
29/// the cache hierarchy.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
32///
33/// # Safety
34///
35/// Unlike the prefetch intrinsics, `CLFLUSH` is subject to all the permission
36/// checking and faults associated with a byte load, so `p` must point to a
37/// byte that is valid for reads.
38#[inline]
39#[target_feature(enable = "sse2")]
40#[cfg_attr(test, assert_instr(clflush))]
41#[stable(feature = "simd_x86", since = "1.27.0")]
42pub unsafe fn _mm_clflush(p: *const u8) {
43    clflush(p)
44}
45
46/// Performs a serializing operation on all load-from-memory instructions
47/// that were issued prior to this instruction.
48///
49/// Guarantees that every load instruction that precedes, in program order, is
50/// globally visible before any load instruction which follows the fence in
51/// program order.
52///
53/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
54#[inline]
55#[target_feature(enable = "sse2")]
56#[cfg_attr(test, assert_instr(lfence))]
57#[stable(feature = "simd_x86", since = "1.27.0")]
58pub fn _mm_lfence() {
59    unsafe { lfence() }
60}
61
62/// Performs a serializing operation on all load-from-memory and store-to-memory
63/// instructions that were issued prior to this instruction.
64///
65/// Guarantees that every memory access that precedes, in program order, the
66/// memory fence instruction is globally visible before any memory instruction
67/// which follows the fence in program order.
68///
69/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
70#[inline]
71#[target_feature(enable = "sse2")]
72#[cfg_attr(test, assert_instr(mfence))]
73#[stable(feature = "simd_x86", since = "1.27.0")]
74pub fn _mm_mfence() {
75    unsafe { mfence() }
76}
77
78/// Adds packed 8-bit integers in `a` and `b`.
79///
80/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
81#[inline]
82#[target_feature(enable = "sse2")]
83#[cfg_attr(test, assert_instr(paddb))]
84#[stable(feature = "simd_x86", since = "1.27.0")]
85#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
86pub const fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
87    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
88}
89
90/// Adds packed 16-bit integers in `a` and `b`.
91///
92/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
93#[inline]
94#[target_feature(enable = "sse2")]
95#[cfg_attr(test, assert_instr(paddw))]
96#[stable(feature = "simd_x86", since = "1.27.0")]
97#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
98pub const fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
99    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
100}
101
102/// Adds packed 32-bit integers in `a` and `b`.
103///
104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
105#[inline]
106#[target_feature(enable = "sse2")]
107#[cfg_attr(test, assert_instr(paddd))]
108#[stable(feature = "simd_x86", since = "1.27.0")]
109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
110pub const fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
111    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
112}
113
114/// Adds packed 64-bit integers in `a` and `b`.
115///
116/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
117#[inline]
118#[target_feature(enable = "sse2")]
119#[cfg_attr(test, assert_instr(paddq))]
120#[stable(feature = "simd_x86", since = "1.27.0")]
121#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
122pub const fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
123    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
124}
125
126/// Adds packed 8-bit integers in `a` and `b` using saturation.
127///
128/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
129#[inline]
130#[target_feature(enable = "sse2")]
131#[cfg_attr(test, assert_instr(paddsb))]
132#[stable(feature = "simd_x86", since = "1.27.0")]
133#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
134pub const fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
135    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
136}
137
138/// Adds packed 16-bit integers in `a` and `b` using saturation.
139///
140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
141#[inline]
142#[target_feature(enable = "sse2")]
143#[cfg_attr(test, assert_instr(paddsw))]
144#[stable(feature = "simd_x86", since = "1.27.0")]
145#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
146pub const fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
147    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
148}
149
150/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
151///
152/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
153#[inline]
154#[target_feature(enable = "sse2")]
155#[cfg_attr(test, assert_instr(paddusb))]
156#[stable(feature = "simd_x86", since = "1.27.0")]
157#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
158pub const fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
159    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
160}
161
162/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
163///
164/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
165#[inline]
166#[target_feature(enable = "sse2")]
167#[cfg_attr(test, assert_instr(paddusw))]
168#[stable(feature = "simd_x86", since = "1.27.0")]
169#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
170pub const fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
171    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
172}
173
174/// Averages packed unsigned 8-bit integers in `a` and `b`.
175///
176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
177#[inline]
178#[target_feature(enable = "sse2")]
179#[cfg_attr(test, assert_instr(pavgb))]
180#[stable(feature = "simd_x86", since = "1.27.0")]
181#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
182pub const fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
183    unsafe {
184        let a = simd_cast::<_, u16x16>(a.as_u8x16());
185        let b = simd_cast::<_, u16x16>(b.as_u8x16());
186        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
187        transmute(simd_cast::<_, u8x16>(r))
188    }
189}
190
191/// Averages packed unsigned 16-bit integers in `a` and `b`.
192///
193/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
194#[inline]
195#[target_feature(enable = "sse2")]
196#[cfg_attr(test, assert_instr(pavgw))]
197#[stable(feature = "simd_x86", since = "1.27.0")]
198#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
199pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
200    unsafe {
201        let a = simd_cast::<_, u32x8>(a.as_u16x8());
202        let b = simd_cast::<_, u32x8>(b.as_u16x8());
203        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
204        transmute(simd_cast::<_, u16x8>(r))
205    }
206}
207
208/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
209///
210/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
211/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
212/// intermediate 32-bit integers.
213///
214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
215#[inline]
216#[target_feature(enable = "sse2")]
217#[cfg_attr(test, assert_instr(pmaddwd))]
218#[stable(feature = "simd_x86", since = "1.27.0")]
219pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
220    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
221    //
222    // ```rust
223    // #[target_feature(enable = "sse2")]
224    // unsafe fn widening_add(mad: __m128i) -> __m128i {
225    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
226    // }
227    // ```
228    //
229    // If we implement this using generic vector intrinsics, the optimizer
230    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
231    // For this reason, we use x86 intrinsics.
232    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
233}
234
235/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
236/// maximum values.
237///
238/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
239#[inline]
240#[target_feature(enable = "sse2")]
241#[cfg_attr(test, assert_instr(pmaxsw))]
242#[stable(feature = "simd_x86", since = "1.27.0")]
243#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
244pub const fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
245    unsafe { simd_imax(a.as_i16x8(), b.as_i16x8()).as_m128i() }
246}
247
248/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
249/// packed maximum values.
250///
251/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
252#[inline]
253#[target_feature(enable = "sse2")]
254#[cfg_attr(test, assert_instr(pmaxub))]
255#[stable(feature = "simd_x86", since = "1.27.0")]
256#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
257pub const fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
258    unsafe { simd_imax(a.as_u8x16(), b.as_u8x16()).as_m128i() }
259}
260
261/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
262/// minimum values.
263///
264/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
265#[inline]
266#[target_feature(enable = "sse2")]
267#[cfg_attr(test, assert_instr(pminsw))]
268#[stable(feature = "simd_x86", since = "1.27.0")]
269#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
270pub const fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
271    unsafe { simd_imin(a.as_i16x8(), b.as_i16x8()).as_m128i() }
272}
273
274/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
275/// packed minimum values.
276///
277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
278#[inline]
279#[target_feature(enable = "sse2")]
280#[cfg_attr(test, assert_instr(pminub))]
281#[stable(feature = "simd_x86", since = "1.27.0")]
282#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
283pub const fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
284    unsafe { simd_imin(a.as_u8x16(), b.as_u8x16()).as_m128i() }
285}
286
287/// Multiplies the packed 16-bit integers in `a` and `b`.
288///
289/// The multiplication produces intermediate 32-bit integers, and returns the
290/// high 16 bits of the intermediate integers.
291///
292/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
293#[inline]
294#[target_feature(enable = "sse2")]
295#[cfg_attr(test, assert_instr(pmulhw))]
296#[stable(feature = "simd_x86", since = "1.27.0")]
297#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
298pub const fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
299    unsafe {
300        let a = simd_cast::<_, i32x8>(a.as_i16x8());
301        let b = simd_cast::<_, i32x8>(b.as_i16x8());
302        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
303        transmute(simd_cast::<i32x8, i16x8>(r))
304    }
305}
306
307/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
308///
309/// The multiplication produces intermediate 32-bit integers, and returns the
310/// high 16 bits of the intermediate integers.
311///
312/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
313#[inline]
314#[target_feature(enable = "sse2")]
315#[cfg_attr(test, assert_instr(pmulhuw))]
316#[stable(feature = "simd_x86", since = "1.27.0")]
317#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
318pub const fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
319    unsafe {
320        let a = simd_cast::<_, u32x8>(a.as_u16x8());
321        let b = simd_cast::<_, u32x8>(b.as_u16x8());
322        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
323        transmute(simd_cast::<u32x8, u16x8>(r))
324    }
325}
326
327/// Multiplies the packed 16-bit integers in `a` and `b`.
328///
329/// The multiplication produces intermediate 32-bit integers, and returns the
330/// low 16 bits of the intermediate integers.
331///
332/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
333#[inline]
334#[target_feature(enable = "sse2")]
335#[cfg_attr(test, assert_instr(pmullw))]
336#[stable(feature = "simd_x86", since = "1.27.0")]
337#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
338pub const fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
339    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
340}
341
342/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
343/// in `a` and `b`.
344///
345/// Returns the unsigned 64-bit results.
346///
347/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
348#[inline]
349#[target_feature(enable = "sse2")]
350#[cfg_attr(test, assert_instr(pmuludq))]
351#[stable(feature = "simd_x86", since = "1.27.0")]
352#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
353pub const fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
354    unsafe {
355        let a = a.as_u64x2();
356        let b = b.as_u64x2();
357        let mask = u64x2::splat(u32::MAX as u64);
358        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
359    }
360}
361
362/// Sum the absolute differences of packed unsigned 8-bit integers.
363///
364/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
365/// and `b`, then horizontally sum each consecutive 8 differences to produce
366/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
367/// the low 16 bits of 64-bit elements returned.
368///
369/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
370#[inline]
371#[target_feature(enable = "sse2")]
372#[cfg_attr(test, assert_instr(psadbw))]
373#[stable(feature = "simd_x86", since = "1.27.0")]
374pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
375    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
376}
377
378/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
379///
380/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
381#[inline]
382#[target_feature(enable = "sse2")]
383#[cfg_attr(test, assert_instr(psubb))]
384#[stable(feature = "simd_x86", since = "1.27.0")]
385#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
386pub const fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
387    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
388}
389
390/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
391///
392/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
393#[inline]
394#[target_feature(enable = "sse2")]
395#[cfg_attr(test, assert_instr(psubw))]
396#[stable(feature = "simd_x86", since = "1.27.0")]
397#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
398pub const fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
399    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
400}
401
402/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
403///
404/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
405#[inline]
406#[target_feature(enable = "sse2")]
407#[cfg_attr(test, assert_instr(psubd))]
408#[stable(feature = "simd_x86", since = "1.27.0")]
409#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
410pub const fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
411    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
412}
413
414/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
415///
416/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
417#[inline]
418#[target_feature(enable = "sse2")]
419#[cfg_attr(test, assert_instr(psubq))]
420#[stable(feature = "simd_x86", since = "1.27.0")]
421#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
422pub const fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
423    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
424}
425
426/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
427/// using saturation.
428///
429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
430#[inline]
431#[target_feature(enable = "sse2")]
432#[cfg_attr(test, assert_instr(psubsb))]
433#[stable(feature = "simd_x86", since = "1.27.0")]
434#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
435pub const fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
436    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
437}
438
439/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
440/// using saturation.
441///
442/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
443#[inline]
444#[target_feature(enable = "sse2")]
445#[cfg_attr(test, assert_instr(psubsw))]
446#[stable(feature = "simd_x86", since = "1.27.0")]
447#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
448pub const fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
449    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
450}
451
452/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
453/// integers in `a` using saturation.
454///
455/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
456#[inline]
457#[target_feature(enable = "sse2")]
458#[cfg_attr(test, assert_instr(psubusb))]
459#[stable(feature = "simd_x86", since = "1.27.0")]
460#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
461pub const fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
462    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
463}
464
465/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
466/// integers in `a` using saturation.
467///
468/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
469#[inline]
470#[target_feature(enable = "sse2")]
471#[cfg_attr(test, assert_instr(psubusw))]
472#[stable(feature = "simd_x86", since = "1.27.0")]
473#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
474pub const fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
475    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
476}
477
478/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
479///
480/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
481#[inline]
482#[target_feature(enable = "sse2")]
483#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
484#[rustc_legacy_const_generics(1)]
485#[stable(feature = "simd_x86", since = "1.27.0")]
486#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
487pub const fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
488    static_assert_uimm_bits!(IMM8, 8);
489    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
490}
491
492/// Implementation detail: converts the immediate argument of the
493/// `_mm_slli_si128` intrinsic into a compile-time constant.
494#[inline]
495#[target_feature(enable = "sse2")]
496#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
497const unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
498    const fn mask(shift: i32, i: u32) -> u32 {
499        let shift = shift as u32 & 0xff;
500        if shift > 15 { i } else { 16 - shift + i }
501    }
502    transmute::<i8x16, _>(simd_shuffle!(
503        i8x16::ZERO,
504        a.as_i8x16(),
505        [
506            mask(IMM8, 0),
507            mask(IMM8, 1),
508            mask(IMM8, 2),
509            mask(IMM8, 3),
510            mask(IMM8, 4),
511            mask(IMM8, 5),
512            mask(IMM8, 6),
513            mask(IMM8, 7),
514            mask(IMM8, 8),
515            mask(IMM8, 9),
516            mask(IMM8, 10),
517            mask(IMM8, 11),
518            mask(IMM8, 12),
519            mask(IMM8, 13),
520            mask(IMM8, 14),
521            mask(IMM8, 15),
522        ],
523    ))
524}
525
526/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
527///
528/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
529#[inline]
530#[target_feature(enable = "sse2")]
531#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
532#[rustc_legacy_const_generics(1)]
533#[stable(feature = "simd_x86", since = "1.27.0")]
534#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
535pub const fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
536    unsafe {
537        static_assert_uimm_bits!(IMM8, 8);
538        _mm_slli_si128_impl::<IMM8>(a)
539    }
540}
541
542/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
543///
544/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
545#[inline]
546#[target_feature(enable = "sse2")]
547#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
548#[rustc_legacy_const_generics(1)]
549#[stable(feature = "simd_x86", since = "1.27.0")]
550#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
551pub const fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
552    unsafe {
553        static_assert_uimm_bits!(IMM8, 8);
554        _mm_srli_si128_impl::<IMM8>(a)
555    }
556}
557
558/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
559///
560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
561#[inline]
562#[target_feature(enable = "sse2")]
563#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
564#[rustc_legacy_const_generics(1)]
565#[stable(feature = "simd_x86", since = "1.27.0")]
566#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
567pub const fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
568    static_assert_uimm_bits!(IMM8, 8);
569    unsafe {
570        if IMM8 >= 16 {
571            _mm_setzero_si128()
572        } else {
573            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
574        }
575    }
576}
577
578/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
579/// zeros.
580///
581/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
582#[inline]
583#[target_feature(enable = "sse2")]
584#[cfg_attr(test, assert_instr(psllw))]
585#[stable(feature = "simd_x86", since = "1.27.0")]
586pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
587    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
588}
589
590/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
591///
592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
593#[inline]
594#[target_feature(enable = "sse2")]
595#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
596#[rustc_legacy_const_generics(1)]
597#[stable(feature = "simd_x86", since = "1.27.0")]
598#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
599pub const fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
600    static_assert_uimm_bits!(IMM8, 8);
601    unsafe {
602        if IMM8 >= 32 {
603            _mm_setzero_si128()
604        } else {
605            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
606        }
607    }
608}
609
610/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
611/// zeros.
612///
613/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
614#[inline]
615#[target_feature(enable = "sse2")]
616#[cfg_attr(test, assert_instr(pslld))]
617#[stable(feature = "simd_x86", since = "1.27.0")]
618pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
619    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
620}
621
622/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
623///
624/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
625#[inline]
626#[target_feature(enable = "sse2")]
627#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
628#[rustc_legacy_const_generics(1)]
629#[stable(feature = "simd_x86", since = "1.27.0")]
630#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
631pub const fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
632    static_assert_uimm_bits!(IMM8, 8);
633    unsafe {
634        if IMM8 >= 64 {
635            _mm_setzero_si128()
636        } else {
637            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
638        }
639    }
640}
641
642/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
643/// zeros.
644///
645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
646#[inline]
647#[target_feature(enable = "sse2")]
648#[cfg_attr(test, assert_instr(psllq))]
649#[stable(feature = "simd_x86", since = "1.27.0")]
650pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
651    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
652}
653
654/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
655/// bits.
656///
657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
658#[inline]
659#[target_feature(enable = "sse2")]
660#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
661#[rustc_legacy_const_generics(1)]
662#[stable(feature = "simd_x86", since = "1.27.0")]
663#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
664pub const fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
665    static_assert_uimm_bits!(IMM8, 8);
666    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
667}
668
669/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
670/// bits.
671///
672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
673#[inline]
674#[target_feature(enable = "sse2")]
675#[cfg_attr(test, assert_instr(psraw))]
676#[stable(feature = "simd_x86", since = "1.27.0")]
677pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
678    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
679}
680
681/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
682/// bits.
683///
684/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
685#[inline]
686#[target_feature(enable = "sse2")]
687#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
688#[rustc_legacy_const_generics(1)]
689#[stable(feature = "simd_x86", since = "1.27.0")]
690#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
691pub const fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
692    static_assert_uimm_bits!(IMM8, 8);
693    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
694}
695
696/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
697/// bits.
698///
699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
700#[inline]
701#[target_feature(enable = "sse2")]
702#[cfg_attr(test, assert_instr(psrad))]
703#[stable(feature = "simd_x86", since = "1.27.0")]
704pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
705    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
706}
707
708/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
709///
710/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
711#[inline]
712#[target_feature(enable = "sse2")]
713#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
714#[rustc_legacy_const_generics(1)]
715#[stable(feature = "simd_x86", since = "1.27.0")]
716#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
717pub const fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
718    static_assert_uimm_bits!(IMM8, 8);
719    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
720}
721
722/// Implementation detail: converts the immediate argument of the
723/// `_mm_srli_si128` intrinsic into a compile-time constant.
724#[inline]
725#[target_feature(enable = "sse2")]
726#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
727const unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
728    const fn mask(shift: i32, i: u32) -> u32 {
729        if (shift as u32) > 15 {
730            i + 16
731        } else {
732            i + (shift as u32)
733        }
734    }
735    let x: i8x16 = simd_shuffle!(
736        a.as_i8x16(),
737        i8x16::ZERO,
738        [
739            mask(IMM8, 0),
740            mask(IMM8, 1),
741            mask(IMM8, 2),
742            mask(IMM8, 3),
743            mask(IMM8, 4),
744            mask(IMM8, 5),
745            mask(IMM8, 6),
746            mask(IMM8, 7),
747            mask(IMM8, 8),
748            mask(IMM8, 9),
749            mask(IMM8, 10),
750            mask(IMM8, 11),
751            mask(IMM8, 12),
752            mask(IMM8, 13),
753            mask(IMM8, 14),
754            mask(IMM8, 15),
755        ],
756    );
757    transmute(x)
758}
759
760/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
761/// zeros.
762///
763/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
764#[inline]
765#[target_feature(enable = "sse2")]
766#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
767#[rustc_legacy_const_generics(1)]
768#[stable(feature = "simd_x86", since = "1.27.0")]
769#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
770pub const fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
771    static_assert_uimm_bits!(IMM8, 8);
772    unsafe {
773        if IMM8 >= 16 {
774            _mm_setzero_si128()
775        } else {
776            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
777        }
778    }
779}
780
781/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
782/// zeros.
783///
784/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
785#[inline]
786#[target_feature(enable = "sse2")]
787#[cfg_attr(test, assert_instr(psrlw))]
788#[stable(feature = "simd_x86", since = "1.27.0")]
789pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
790    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
791}
792
793/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
794/// zeros.
795///
796/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
797#[inline]
798#[target_feature(enable = "sse2")]
799#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
800#[rustc_legacy_const_generics(1)]
801#[stable(feature = "simd_x86", since = "1.27.0")]
802#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
803pub const fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
804    static_assert_uimm_bits!(IMM8, 8);
805    unsafe {
806        if IMM8 >= 32 {
807            _mm_setzero_si128()
808        } else {
809            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
810        }
811    }
812}
813
814/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
815/// zeros.
816///
817/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
818#[inline]
819#[target_feature(enable = "sse2")]
820#[cfg_attr(test, assert_instr(psrld))]
821#[stable(feature = "simd_x86", since = "1.27.0")]
822pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
823    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
824}
825
826/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
827/// zeros.
828///
829/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
830#[inline]
831#[target_feature(enable = "sse2")]
832#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
833#[rustc_legacy_const_generics(1)]
834#[stable(feature = "simd_x86", since = "1.27.0")]
835#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
836pub const fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
837    static_assert_uimm_bits!(IMM8, 8);
838    unsafe {
839        if IMM8 >= 64 {
840            _mm_setzero_si128()
841        } else {
842            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
843        }
844    }
845}
846
847/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
848/// zeros.
849///
850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
851#[inline]
852#[target_feature(enable = "sse2")]
853#[cfg_attr(test, assert_instr(psrlq))]
854#[stable(feature = "simd_x86", since = "1.27.0")]
855pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
856    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
857}
858
859/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
860/// `b`.
861///
862/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
863#[inline]
864#[target_feature(enable = "sse2")]
865#[cfg_attr(test, assert_instr(andps))]
866#[stable(feature = "simd_x86", since = "1.27.0")]
867#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
868pub const fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
869    unsafe { simd_and(a, b) }
870}
871
872/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
873/// then AND with `b`.
874///
875/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
876#[inline]
877#[target_feature(enable = "sse2")]
878#[cfg_attr(test, assert_instr(andnps))]
879#[stable(feature = "simd_x86", since = "1.27.0")]
880#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
881pub const fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
882    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
883}
884
885/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
886/// `b`.
887///
888/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
889#[inline]
890#[target_feature(enable = "sse2")]
891#[cfg_attr(test, assert_instr(orps))]
892#[stable(feature = "simd_x86", since = "1.27.0")]
893#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
894pub const fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
895    unsafe { simd_or(a, b) }
896}
897
898/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
899/// `b`.
900///
901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
902#[inline]
903#[target_feature(enable = "sse2")]
904#[cfg_attr(test, assert_instr(xorps))]
905#[stable(feature = "simd_x86", since = "1.27.0")]
906#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
907pub const fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
908    unsafe { simd_xor(a, b) }
909}
910
911/// Compares packed 8-bit integers in `a` and `b` for equality.
912///
913/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
914#[inline]
915#[target_feature(enable = "sse2")]
916#[cfg_attr(test, assert_instr(pcmpeqb))]
917#[stable(feature = "simd_x86", since = "1.27.0")]
918#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
919pub const fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
920    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
921}
922
923/// Compares packed 16-bit integers in `a` and `b` for equality.
924///
925/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
926#[inline]
927#[target_feature(enable = "sse2")]
928#[cfg_attr(test, assert_instr(pcmpeqw))]
929#[stable(feature = "simd_x86", since = "1.27.0")]
930#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
931pub const fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
932    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
933}
934
935/// Compares packed 32-bit integers in `a` and `b` for equality.
936///
937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
938#[inline]
939#[target_feature(enable = "sse2")]
940#[cfg_attr(test, assert_instr(pcmpeqd))]
941#[stable(feature = "simd_x86", since = "1.27.0")]
942#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
943pub const fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
944    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
945}
946
947/// Compares packed 8-bit integers in `a` and `b` for greater-than.
948///
949/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
950#[inline]
951#[target_feature(enable = "sse2")]
952#[cfg_attr(test, assert_instr(pcmpgtb))]
953#[stable(feature = "simd_x86", since = "1.27.0")]
954#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
955pub const fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
956    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
957}
958
959/// Compares packed 16-bit integers in `a` and `b` for greater-than.
960///
961/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
962#[inline]
963#[target_feature(enable = "sse2")]
964#[cfg_attr(test, assert_instr(pcmpgtw))]
965#[stable(feature = "simd_x86", since = "1.27.0")]
966#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
967pub const fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
968    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
969}
970
971/// Compares packed 32-bit integers in `a` and `b` for greater-than.
972///
973/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
974#[inline]
975#[target_feature(enable = "sse2")]
976#[cfg_attr(test, assert_instr(pcmpgtd))]
977#[stable(feature = "simd_x86", since = "1.27.0")]
978#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
979pub const fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
980    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
981}
982
983/// Compares packed 8-bit integers in `a` and `b` for less-than.
984///
985/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
986#[inline]
987#[target_feature(enable = "sse2")]
988#[cfg_attr(test, assert_instr(pcmpgtb))]
989#[stable(feature = "simd_x86", since = "1.27.0")]
990#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
991pub const fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
992    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
993}
994
995/// Compares packed 16-bit integers in `a` and `b` for less-than.
996///
997/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
998#[inline]
999#[target_feature(enable = "sse2")]
1000#[cfg_attr(test, assert_instr(pcmpgtw))]
1001#[stable(feature = "simd_x86", since = "1.27.0")]
1002#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1003pub const fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
1004    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
1005}
1006
1007/// Compares packed 32-bit integers in `a` and `b` for less-than.
1008///
1009/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
1010#[inline]
1011#[target_feature(enable = "sse2")]
1012#[cfg_attr(test, assert_instr(pcmpgtd))]
1013#[stable(feature = "simd_x86", since = "1.27.0")]
1014#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1015pub const fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
1016    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
1017}
1018
1019/// Converts the lower two packed 32-bit integers in `a` to packed
1020/// double-precision (64-bit) floating-point elements.
1021///
1022/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
1023#[inline]
1024#[target_feature(enable = "sse2")]
1025#[cfg_attr(test, assert_instr(cvtdq2pd))]
1026#[stable(feature = "simd_x86", since = "1.27.0")]
1027#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1028pub const fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
1029    unsafe {
1030        let a = a.as_i32x4();
1031        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
1032    }
1033}
1034
1035/// Returns `a` with its lower element replaced by `b` after converting it to
1036/// an `f64`.
1037///
1038/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
1039#[inline]
1040#[target_feature(enable = "sse2")]
1041#[cfg_attr(test, assert_instr(cvtsi2sd))]
1042#[stable(feature = "simd_x86", since = "1.27.0")]
1043#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1044pub const fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
1045    unsafe { simd_insert!(a, 0, b as f64) }
1046}
1047
1048/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
1049/// floating-point elements.
1050///
1051/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
1052#[inline]
1053#[target_feature(enable = "sse2")]
1054#[cfg_attr(test, assert_instr(cvtdq2ps))]
1055#[stable(feature = "simd_x86", since = "1.27.0")]
1056#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1057pub const fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
1058    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
1059}
1060
1061/// Converts packed single-precision (32-bit) floating-point elements in `a`
1062/// to packed 32-bit integers.
1063///
1064/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
1065#[inline]
1066#[target_feature(enable = "sse2")]
1067#[cfg_attr(test, assert_instr(cvtps2dq))]
1068#[stable(feature = "simd_x86", since = "1.27.0")]
1069pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
1070    unsafe { transmute(cvtps2dq(a)) }
1071}
1072
1073/// Returns a vector whose lowest element is `a` and all higher elements are
1074/// `0`.
1075///
1076/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
1077#[inline]
1078#[target_feature(enable = "sse2")]
1079#[stable(feature = "simd_x86", since = "1.27.0")]
1080#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1081pub const fn _mm_cvtsi32_si128(a: i32) -> __m128i {
1082    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
1083}
1084
1085/// Returns the lowest element of `a`.
1086///
1087/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
1088#[inline]
1089#[target_feature(enable = "sse2")]
1090#[stable(feature = "simd_x86", since = "1.27.0")]
1091#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1092pub const fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
1093    unsafe { simd_extract!(a.as_i32x4(), 0) }
1094}
1095
1096/// Sets packed 64-bit integers with the supplied values, from highest to
1097/// lowest.
1098///
1099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
1100#[inline]
1101#[target_feature(enable = "sse2")]
1102// no particular instruction to test
1103#[stable(feature = "simd_x86", since = "1.27.0")]
1104#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1105pub const fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
1106    unsafe { transmute(i64x2::new(e0, e1)) }
1107}
1108
1109/// Sets packed 32-bit integers with the supplied values.
1110///
1111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
1112#[inline]
1113#[target_feature(enable = "sse2")]
1114// no particular instruction to test
1115#[stable(feature = "simd_x86", since = "1.27.0")]
1116#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1117pub const fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1118    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
1119}
1120
1121/// Sets packed 16-bit integers with the supplied values.
1122///
1123/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
1124#[inline]
1125#[target_feature(enable = "sse2")]
1126// no particular instruction to test
1127#[stable(feature = "simd_x86", since = "1.27.0")]
1128#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1129pub const fn _mm_set_epi16(
1130    e7: i16,
1131    e6: i16,
1132    e5: i16,
1133    e4: i16,
1134    e3: i16,
1135    e2: i16,
1136    e1: i16,
1137    e0: i16,
1138) -> __m128i {
1139    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
1140}
1141
1142/// Sets packed 8-bit integers with the supplied values.
1143///
1144/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
1145#[inline]
1146#[target_feature(enable = "sse2")]
1147// no particular instruction to test
1148#[stable(feature = "simd_x86", since = "1.27.0")]
1149#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1150pub const fn _mm_set_epi8(
1151    e15: i8,
1152    e14: i8,
1153    e13: i8,
1154    e12: i8,
1155    e11: i8,
1156    e10: i8,
1157    e9: i8,
1158    e8: i8,
1159    e7: i8,
1160    e6: i8,
1161    e5: i8,
1162    e4: i8,
1163    e3: i8,
1164    e2: i8,
1165    e1: i8,
1166    e0: i8,
1167) -> __m128i {
1168    unsafe {
1169        #[rustfmt::skip]
1170        transmute(i8x16::new(
1171            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1172        ))
1173    }
1174}
1175
1176/// Broadcasts 64-bit integer `a` to all elements.
1177///
1178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
1179#[inline]
1180#[target_feature(enable = "sse2")]
1181// no particular instruction to test
1182#[stable(feature = "simd_x86", since = "1.27.0")]
1183#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1184pub const fn _mm_set1_epi64x(a: i64) -> __m128i {
1185    i64x2::splat(a).as_m128i()
1186}
1187
1188/// Broadcasts 32-bit integer `a` to all elements.
1189///
1190/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
1191#[inline]
1192#[target_feature(enable = "sse2")]
1193// no particular instruction to test
1194#[stable(feature = "simd_x86", since = "1.27.0")]
1195#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1196pub const fn _mm_set1_epi32(a: i32) -> __m128i {
1197    i32x4::splat(a).as_m128i()
1198}
1199
1200/// Broadcasts 16-bit integer `a` to all elements.
1201///
1202/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
1203#[inline]
1204#[target_feature(enable = "sse2")]
1205// no particular instruction to test
1206#[stable(feature = "simd_x86", since = "1.27.0")]
1207#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1208pub const fn _mm_set1_epi16(a: i16) -> __m128i {
1209    i16x8::splat(a).as_m128i()
1210}
1211
1212/// Broadcasts 8-bit integer `a` to all elements.
1213///
1214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
1215#[inline]
1216#[target_feature(enable = "sse2")]
1217// no particular instruction to test
1218#[stable(feature = "simd_x86", since = "1.27.0")]
1219#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1220pub const fn _mm_set1_epi8(a: i8) -> __m128i {
1221    i8x16::splat(a).as_m128i()
1222}
1223
1224/// Sets packed 32-bit integers with the supplied values in reverse order.
1225///
1226/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
1227#[inline]
1228#[target_feature(enable = "sse2")]
1229// no particular instruction to test
1230#[stable(feature = "simd_x86", since = "1.27.0")]
1231#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1232pub const fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1233    _mm_set_epi32(e0, e1, e2, e3)
1234}
1235
1236/// Sets packed 16-bit integers with the supplied values in reverse order.
1237///
1238/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
1239#[inline]
1240#[target_feature(enable = "sse2")]
1241// no particular instruction to test
1242#[stable(feature = "simd_x86", since = "1.27.0")]
1243#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1244pub const fn _mm_setr_epi16(
1245    e7: i16,
1246    e6: i16,
1247    e5: i16,
1248    e4: i16,
1249    e3: i16,
1250    e2: i16,
1251    e1: i16,
1252    e0: i16,
1253) -> __m128i {
1254    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
1255}
1256
1257/// Sets packed 8-bit integers with the supplied values in reverse order.
1258///
1259/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
1260#[inline]
1261#[target_feature(enable = "sse2")]
1262// no particular instruction to test
1263#[stable(feature = "simd_x86", since = "1.27.0")]
1264#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1265pub const fn _mm_setr_epi8(
1266    e15: i8,
1267    e14: i8,
1268    e13: i8,
1269    e12: i8,
1270    e11: i8,
1271    e10: i8,
1272    e9: i8,
1273    e8: i8,
1274    e7: i8,
1275    e6: i8,
1276    e5: i8,
1277    e4: i8,
1278    e3: i8,
1279    e2: i8,
1280    e1: i8,
1281    e0: i8,
1282) -> __m128i {
1283    #[rustfmt::skip]
1284    _mm_set_epi8(
1285        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1286    )
1287}
1288
1289/// Returns a vector with all elements set to zero.
1290///
1291/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
1292#[inline]
1293#[target_feature(enable = "sse2")]
1294#[cfg_attr(test, assert_instr(xorps))]
1295#[stable(feature = "simd_x86", since = "1.27.0")]
1296#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1297pub const fn _mm_setzero_si128() -> __m128i {
1298    const { unsafe { mem::zeroed() } }
1299}
1300
1301/// Loads 64-bit integer from memory into first element of returned vector.
1302///
1303/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
1304#[inline]
1305#[target_feature(enable = "sse2")]
1306#[stable(feature = "simd_x86", since = "1.27.0")]
1307#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1308pub const unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
1309    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
1310}
1311
1312/// Loads 128-bits of integer data from memory into a new vector.
1313///
1314/// `mem_addr` must be aligned on a 16-byte boundary.
1315///
1316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
1317#[inline]
1318#[target_feature(enable = "sse2")]
1319#[cfg_attr(
1320    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1321    assert_instr(movaps)
1322)]
1323#[stable(feature = "simd_x86", since = "1.27.0")]
1324#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1325pub const unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
1326    *mem_addr
1327}
1328
1329/// Loads 128-bits of integer data from memory into a new vector.
1330///
1331/// `mem_addr` does not need to be aligned on any particular boundary.
1332///
1333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
1334#[inline]
1335#[target_feature(enable = "sse2")]
1336#[cfg_attr(test, assert_instr(movups))]
1337#[stable(feature = "simd_x86", since = "1.27.0")]
1338#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1339pub const unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
1340    let mut dst: __m128i = _mm_undefined_si128();
1341    ptr::copy_nonoverlapping(
1342        mem_addr as *const u8,
1343        ptr::addr_of_mut!(dst) as *mut u8,
1344        mem::size_of::<__m128i>(),
1345    );
1346    dst
1347}
1348
1349/// Conditionally store 8-bit integer elements from `a` into memory using
1350/// `mask` flagged as non-temporal (unlikely to be used again soon).
1351///
1352/// Elements are not stored when the highest bit is not set in the
1353/// corresponding element.
1354///
1355/// `mem_addr` should correspond to a 128-bit memory location and does not need
1356/// to be aligned on any particular boundary.
1357///
1358/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
1359///
1360/// # Safety of non-temporal stores
1361///
1362/// After using this intrinsic, but before any other access to the memory that this intrinsic
1363/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1364/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1365/// return.
1366///
1367/// See [`_mm_sfence`] for details.
1368#[inline]
1369#[target_feature(enable = "sse2")]
1370#[cfg_attr(test, assert_instr(maskmovdqu))]
1371#[stable(feature = "simd_x86", since = "1.27.0")]
1372pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
1373    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
1374}
1375
1376/// Stores 128-bits of integer data from `a` into memory.
1377///
1378/// `mem_addr` must be aligned on a 16-byte boundary.
1379///
1380/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
1381#[inline]
1382#[target_feature(enable = "sse2")]
1383#[cfg_attr(
1384    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1385    assert_instr(movaps)
1386)]
1387#[stable(feature = "simd_x86", since = "1.27.0")]
1388#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1389pub const unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
1390    *mem_addr = a;
1391}
1392
1393/// Stores 128-bits of integer data from `a` into memory.
1394///
1395/// `mem_addr` does not need to be aligned on any particular boundary.
1396///
1397/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
1398#[inline]
1399#[target_feature(enable = "sse2")]
1400#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
1401#[stable(feature = "simd_x86", since = "1.27.0")]
1402#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1403pub const unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
1404    mem_addr.write_unaligned(a);
1405}
1406
1407/// Stores the lower 64-bit integer `a` to a memory location.
1408///
1409/// `mem_addr` does not need to be aligned on any particular boundary.
1410///
1411/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
1412#[inline]
1413#[target_feature(enable = "sse2")]
1414#[stable(feature = "simd_x86", since = "1.27.0")]
1415#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1416pub const unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
1417    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
1418}
1419
1420/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
1421/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1422/// used again soon).
1423///
1424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
1425///
1426/// # Safety of non-temporal stores
1427///
1428/// After using this intrinsic, but before any other access to the memory that this intrinsic
1429/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1430/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1431/// return.
1432///
1433/// See [`_mm_sfence`] for details.
1434#[inline]
1435#[target_feature(enable = "sse2")]
1436#[cfg_attr(test, assert_instr(movntdq))]
1437#[stable(feature = "simd_x86", since = "1.27.0")]
1438pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
1439    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1440    crate::arch::asm!(
1441        vps!("movntdq",  ",{a}"),
1442        p = in(reg) mem_addr,
1443        a = in(xmm_reg) a,
1444        options(nostack, preserves_flags),
1445    );
1446}
1447
1448/// Stores a 32-bit integer value in the specified memory location.
1449/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1450/// used again soon).
1451///
1452/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1453///
1454/// # Safety of non-temporal stores
1455///
1456/// After using this intrinsic, but before any other access to the memory that this intrinsic
1457/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1458/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1459/// return.
1460///
1461/// See [`_mm_sfence`] for details.
1462#[inline]
1463#[target_feature(enable = "sse2")]
1464#[cfg_attr(test, assert_instr(movnti))]
1465#[stable(feature = "simd_x86", since = "1.27.0")]
1466pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1467    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1468    crate::arch::asm!(
1469        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1470        p = in(reg) mem_addr,
1471        a = in(reg) a,
1472        options(nostack, preserves_flags),
1473    );
1474}
1475
1476/// Returns a vector where the low element is extracted from `a` and its upper
1477/// element is zero.
1478///
1479/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1480#[inline]
1481#[target_feature(enable = "sse2")]
1482// FIXME movd on msvc, movd on i686
1483#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
1484#[stable(feature = "simd_x86", since = "1.27.0")]
1485#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1486pub const fn _mm_move_epi64(a: __m128i) -> __m128i {
1487    unsafe {
1488        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1489        transmute(r)
1490    }
1491}
1492
1493/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
1494/// using signed saturation.
1495///
1496/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
1497#[inline]
1498#[target_feature(enable = "sse2")]
1499#[cfg_attr(test, assert_instr(packsswb))]
1500#[stable(feature = "simd_x86", since = "1.27.0")]
1501#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1502pub const fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
1503    unsafe {
1504        let max = simd_splat(i8::MAX as i16);
1505        let min = simd_splat(i8::MIN as i16);
1506
1507        let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
1508            .as_m128i()
1509            .as_i8x16();
1510        let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min)
1511            .as_m128i()
1512            .as_i8x16();
1513
1514        // Shuffle the low i8 of each i16 from two concatenated vectors into
1515        // the low bits of the result register.
1516        const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
1517        let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
1518
1519        result.as_m128i()
1520    }
1521}
1522
1523/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
1524/// using signed saturation.
1525///
1526/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1527#[inline]
1528#[target_feature(enable = "sse2")]
1529#[cfg_attr(test, assert_instr(packssdw))]
1530#[stable(feature = "simd_x86", since = "1.27.0")]
1531#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1532pub const fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1533    unsafe {
1534        let max = simd_splat(i16::MAX as i32);
1535        let min = simd_splat(i16::MIN as i32);
1536
1537        let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min);
1538        let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min);
1539
1540        let clamped_a: i16x4 = simd_cast(clamped_a);
1541        let clamped_b: i16x4 = simd_cast(clamped_b);
1542
1543        let a: i64 = transmute(clamped_a);
1544        let b: i64 = transmute(clamped_b);
1545        i64x2::new(a, b).as_m128i()
1546    }
1547}
1548
1549/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
1550/// using unsigned saturation.
1551///
1552/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1553#[inline]
1554#[target_feature(enable = "sse2")]
1555#[cfg_attr(test, assert_instr(packuswb))]
1556#[stable(feature = "simd_x86", since = "1.27.0")]
1557#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1558pub const fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1559    unsafe {
1560        let max = simd_splat(u8::MAX as i16);
1561        let min = simd_splat(u8::MIN as i16);
1562
1563        let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min)
1564            .as_m128i()
1565            .as_i8x16();
1566        let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min)
1567            .as_m128i()
1568            .as_i8x16();
1569
1570        // Shuffle the low bytes of each i16 from two concatenated vectors into
1571        // the low bits of the result register.
1572        // Without `simd_shuffle`, this intrinsic will cause the AVX-512BW
1573        // `_mm_mask_packus_epi16` and `_mm_maskz_packus_epi16` tests to fail.
1574        const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
1575        let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
1576
1577        result.as_m128i()
1578    }
1579}
1580
1581/// Returns the `imm8` element of `a`.
1582///
1583/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
1584#[inline]
1585#[target_feature(enable = "sse2")]
1586#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1587#[rustc_legacy_const_generics(1)]
1588#[stable(feature = "simd_x86", since = "1.27.0")]
1589#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1590pub const fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1591    static_assert_uimm_bits!(IMM8, 3);
1592    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
1593}
1594
1595/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
1596///
1597/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1598#[inline]
1599#[target_feature(enable = "sse2")]
1600#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1601#[rustc_legacy_const_generics(2)]
1602#[stable(feature = "simd_x86", since = "1.27.0")]
1603#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1604pub const fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1605    static_assert_uimm_bits!(IMM8, 3);
1606    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
1607}
1608
1609/// Returns a mask of the most significant bit of each element in `a`.
1610///
1611/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
1612#[inline]
1613#[target_feature(enable = "sse2")]
1614#[cfg_attr(test, assert_instr(pmovmskb))]
1615#[stable(feature = "simd_x86", since = "1.27.0")]
1616#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1617pub const fn _mm_movemask_epi8(a: __m128i) -> i32 {
1618    unsafe {
1619        let z = i8x16::ZERO;
1620        let m: i8x16 = simd_lt(a.as_i8x16(), z);
1621        simd_bitmask::<_, u16>(m) as u32 as i32
1622    }
1623}
1624
1625/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1626///
1627/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
1628#[inline]
1629#[target_feature(enable = "sse2")]
1630#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1631#[rustc_legacy_const_generics(1)]
1632#[stable(feature = "simd_x86", since = "1.27.0")]
1633#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1634pub const fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1635    static_assert_uimm_bits!(IMM8, 8);
1636    unsafe {
1637        let a = a.as_i32x4();
1638        let x: i32x4 = simd_shuffle!(
1639            a,
1640            a,
1641            [
1642                IMM8 as u32 & 0b11,
1643                (IMM8 as u32 >> 2) & 0b11,
1644                (IMM8 as u32 >> 4) & 0b11,
1645                (IMM8 as u32 >> 6) & 0b11,
1646            ],
1647        );
1648        transmute(x)
1649    }
1650}
1651
1652/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1653/// `IMM8`.
1654///
1655/// Put the results in the high 64 bits of the returned vector, with the low 64
1656/// bits being copied from `a`.
1657///
1658/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1659#[inline]
1660#[target_feature(enable = "sse2")]
1661#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1662#[rustc_legacy_const_generics(1)]
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1665pub const fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1666    static_assert_uimm_bits!(IMM8, 8);
1667    unsafe {
1668        let a = a.as_i16x8();
1669        let x: i16x8 = simd_shuffle!(
1670            a,
1671            a,
1672            [
1673                0,
1674                1,
1675                2,
1676                3,
1677                (IMM8 as u32 & 0b11) + 4,
1678                ((IMM8 as u32 >> 2) & 0b11) + 4,
1679                ((IMM8 as u32 >> 4) & 0b11) + 4,
1680                ((IMM8 as u32 >> 6) & 0b11) + 4,
1681            ],
1682        );
1683        transmute(x)
1684    }
1685}
1686
1687/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1688/// `IMM8`.
1689///
1690/// Put the results in the low 64 bits of the returned vector, with the high 64
1691/// bits being copied from `a`.
1692///
1693/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1694#[inline]
1695#[target_feature(enable = "sse2")]
1696#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1697#[rustc_legacy_const_generics(1)]
1698#[stable(feature = "simd_x86", since = "1.27.0")]
1699#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1700pub const fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1701    static_assert_uimm_bits!(IMM8, 8);
1702    unsafe {
1703        let a = a.as_i16x8();
1704        let x: i16x8 = simd_shuffle!(
1705            a,
1706            a,
1707            [
1708                IMM8 as u32 & 0b11,
1709                (IMM8 as u32 >> 2) & 0b11,
1710                (IMM8 as u32 >> 4) & 0b11,
1711                (IMM8 as u32 >> 6) & 0b11,
1712                4,
1713                5,
1714                6,
1715                7,
1716            ],
1717        );
1718        transmute(x)
1719    }
1720}
1721
1722/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`.
1723///
1724/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1725#[inline]
1726#[target_feature(enable = "sse2")]
1727#[cfg_attr(test, assert_instr(punpckhbw))]
1728#[stable(feature = "simd_x86", since = "1.27.0")]
1729#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1730pub const fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1731    unsafe {
1732        transmute::<i8x16, _>(simd_shuffle!(
1733            a.as_i8x16(),
1734            b.as_i8x16(),
1735            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1736        ))
1737    }
1738}
1739
1740/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`.
1741///
1742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1743#[inline]
1744#[target_feature(enable = "sse2")]
1745#[cfg_attr(test, assert_instr(punpckhwd))]
1746#[stable(feature = "simd_x86", since = "1.27.0")]
1747#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1748pub const fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1749    unsafe {
1750        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1751        transmute::<i16x8, _>(x)
1752    }
1753}
1754
1755/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`.
1756///
1757/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1758#[inline]
1759#[target_feature(enable = "sse2")]
1760#[cfg_attr(test, assert_instr(unpckhps))]
1761#[stable(feature = "simd_x86", since = "1.27.0")]
1762#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1763pub const fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1764    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
1765}
1766
1767/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
1768///
1769/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1770#[inline]
1771#[target_feature(enable = "sse2")]
1772#[cfg_attr(test, assert_instr(unpckhpd))]
1773#[stable(feature = "simd_x86", since = "1.27.0")]
1774#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1775pub const fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1776    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
1777}
1778
1779/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
1780///
1781/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
1782#[inline]
1783#[target_feature(enable = "sse2")]
1784#[cfg_attr(test, assert_instr(punpcklbw))]
1785#[stable(feature = "simd_x86", since = "1.27.0")]
1786#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1787pub const fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1788    unsafe {
1789        transmute::<i8x16, _>(simd_shuffle!(
1790            a.as_i8x16(),
1791            b.as_i8x16(),
1792            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1793        ))
1794    }
1795}
1796
1797/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`.
1798///
1799/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1800#[inline]
1801#[target_feature(enable = "sse2")]
1802#[cfg_attr(test, assert_instr(punpcklwd))]
1803#[stable(feature = "simd_x86", since = "1.27.0")]
1804#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1805pub const fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1806    unsafe {
1807        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1808        transmute::<i16x8, _>(x)
1809    }
1810}
1811
1812/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`.
1813///
1814/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1815#[inline]
1816#[target_feature(enable = "sse2")]
1817#[cfg_attr(test, assert_instr(unpcklps))]
1818#[stable(feature = "simd_x86", since = "1.27.0")]
1819#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1820pub const fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1821    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
1822}
1823
1824/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
1825///
1826/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1827#[inline]
1828#[target_feature(enable = "sse2")]
1829#[cfg_attr(test, assert_instr(movlhps))]
1830#[stable(feature = "simd_x86", since = "1.27.0")]
1831#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1832pub const fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1833    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
1834}
1835
1836/// Returns a new vector with the low element of `a` replaced by the sum of the
1837/// low elements of `a` and `b`.
1838///
1839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
1840#[inline]
1841#[target_feature(enable = "sse2")]
1842#[cfg_attr(test, assert_instr(addsd))]
1843#[stable(feature = "simd_x86", since = "1.27.0")]
1844#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1845pub const fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1846    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
1847}
1848
1849/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1850/// `b`.
1851///
1852/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1853#[inline]
1854#[target_feature(enable = "sse2")]
1855#[cfg_attr(test, assert_instr(addpd))]
1856#[stable(feature = "simd_x86", since = "1.27.0")]
1857#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1858pub const fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1859    unsafe { simd_add(a, b) }
1860}
1861
1862/// Returns a new vector with the low element of `a` replaced by the result of
1863/// diving the lower element of `a` by the lower element of `b`.
1864///
1865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1866#[inline]
1867#[target_feature(enable = "sse2")]
1868#[cfg_attr(test, assert_instr(divsd))]
1869#[stable(feature = "simd_x86", since = "1.27.0")]
1870#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1871pub const fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1872    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
1873}
1874
1875/// Divide packed double-precision (64-bit) floating-point elements in `a` by
1876/// packed elements in `b`.
1877///
1878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1879#[inline]
1880#[target_feature(enable = "sse2")]
1881#[cfg_attr(test, assert_instr(divpd))]
1882#[stable(feature = "simd_x86", since = "1.27.0")]
1883#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1884pub const fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1885    unsafe { simd_div(a, b) }
1886}
1887
1888/// Returns a new vector with the low element of `a` replaced by the maximum
1889/// of the lower elements of `a` and `b`.
1890///
1891/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1892#[inline]
1893#[target_feature(enable = "sse2")]
1894#[cfg_attr(test, assert_instr(maxsd))]
1895#[stable(feature = "simd_x86", since = "1.27.0")]
1896pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1897    unsafe { maxsd(a, b) }
1898}
1899
1900/// Returns a new vector with the maximum values from corresponding elements in
1901/// `a` and `b`.
1902///
1903/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1904#[inline]
1905#[target_feature(enable = "sse2")]
1906#[cfg_attr(test, assert_instr(maxpd))]
1907#[stable(feature = "simd_x86", since = "1.27.0")]
1908pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1909    unsafe { maxpd(a, b) }
1910}
1911
1912/// Returns a new vector with the low element of `a` replaced by the minimum
1913/// of the lower elements of `a` and `b`.
1914///
1915/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1916#[inline]
1917#[target_feature(enable = "sse2")]
1918#[cfg_attr(test, assert_instr(minsd))]
1919#[stable(feature = "simd_x86", since = "1.27.0")]
1920pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1921    unsafe { minsd(a, b) }
1922}
1923
1924/// Returns a new vector with the minimum values from corresponding elements in
1925/// `a` and `b`.
1926///
1927/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1928#[inline]
1929#[target_feature(enable = "sse2")]
1930#[cfg_attr(test, assert_instr(minpd))]
1931#[stable(feature = "simd_x86", since = "1.27.0")]
1932pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1933    unsafe { minpd(a, b) }
1934}
1935
1936/// Returns a new vector with the low element of `a` replaced by multiplying the
1937/// low elements of `a` and `b`.
1938///
1939/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1940#[inline]
1941#[target_feature(enable = "sse2")]
1942#[cfg_attr(test, assert_instr(mulsd))]
1943#[stable(feature = "simd_x86", since = "1.27.0")]
1944#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1945pub const fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1946    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
1947}
1948
1949/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1950/// and `b`.
1951///
1952/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1953#[inline]
1954#[target_feature(enable = "sse2")]
1955#[cfg_attr(test, assert_instr(mulpd))]
1956#[stable(feature = "simd_x86", since = "1.27.0")]
1957#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1958pub const fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1959    unsafe { simd_mul(a, b) }
1960}
1961
1962/// Returns a new vector with the low element of `a` replaced by the square
1963/// root of the lower element `b`.
1964///
1965/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
1966#[inline]
1967#[target_feature(enable = "sse2")]
1968#[cfg_attr(test, assert_instr(sqrtsd))]
1969#[stable(feature = "simd_x86", since = "1.27.0")]
1970pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1971    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
1972}
1973
1974/// Returns a new vector with the square root of each of the values in `a`.
1975///
1976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1977#[inline]
1978#[target_feature(enable = "sse2")]
1979#[cfg_attr(test, assert_instr(sqrtpd))]
1980#[stable(feature = "simd_x86", since = "1.27.0")]
1981pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1982    unsafe { simd_fsqrt(a) }
1983}
1984
1985/// Returns a new vector with the low element of `a` replaced by subtracting the
1986/// low element by `b` from the low element of `a`.
1987///
1988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1989#[inline]
1990#[target_feature(enable = "sse2")]
1991#[cfg_attr(test, assert_instr(subsd))]
1992#[stable(feature = "simd_x86", since = "1.27.0")]
1993#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1994pub const fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1995    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
1996}
1997
1998/// Subtract packed double-precision (64-bit) floating-point elements in `b`
1999/// from `a`.
2000///
2001/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
2002#[inline]
2003#[target_feature(enable = "sse2")]
2004#[cfg_attr(test, assert_instr(subpd))]
2005#[stable(feature = "simd_x86", since = "1.27.0")]
2006#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2007pub const fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
2008    unsafe { simd_sub(a, b) }
2009}
2010
2011/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
2012/// elements in `a` and `b`.
2013///
2014/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
2015#[inline]
2016#[target_feature(enable = "sse2")]
2017#[cfg_attr(test, assert_instr(andps))]
2018#[stable(feature = "simd_x86", since = "1.27.0")]
2019#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2020pub const fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
2021    unsafe {
2022        let a: __m128i = transmute(a);
2023        let b: __m128i = transmute(b);
2024        transmute(_mm_and_si128(a, b))
2025    }
2026}
2027
2028/// Computes the bitwise NOT of `a` and then AND with `b`.
2029///
2030/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
2031#[inline]
2032#[target_feature(enable = "sse2")]
2033#[cfg_attr(test, assert_instr(andnps))]
2034#[stable(feature = "simd_x86", since = "1.27.0")]
2035#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2036pub const fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
2037    unsafe {
2038        let a: __m128i = transmute(a);
2039        let b: __m128i = transmute(b);
2040        transmute(_mm_andnot_si128(a, b))
2041    }
2042}
2043
2044/// Computes the bitwise OR of `a` and `b`.
2045///
2046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
2047#[inline]
2048#[target_feature(enable = "sse2")]
2049#[cfg_attr(test, assert_instr(orps))]
2050#[stable(feature = "simd_x86", since = "1.27.0")]
2051#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2052pub const fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
2053    unsafe {
2054        let a: __m128i = transmute(a);
2055        let b: __m128i = transmute(b);
2056        transmute(_mm_or_si128(a, b))
2057    }
2058}
2059
2060/// Computes the bitwise XOR of `a` and `b`.
2061///
2062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
2063#[inline]
2064#[target_feature(enable = "sse2")]
2065#[cfg_attr(test, assert_instr(xorps))]
2066#[stable(feature = "simd_x86", since = "1.27.0")]
2067#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2068pub const fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
2069    unsafe {
2070        let a: __m128i = transmute(a);
2071        let b: __m128i = transmute(b);
2072        transmute(_mm_xor_si128(a, b))
2073    }
2074}
2075
2076/// Returns a new vector with the low element of `a` replaced by the equality
2077/// comparison of the lower elements of `a` and `b`.
2078///
2079/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
2080#[inline]
2081#[target_feature(enable = "sse2")]
2082#[cfg_attr(test, assert_instr(cmpeqsd))]
2083#[stable(feature = "simd_x86", since = "1.27.0")]
2084pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
2085    unsafe { cmpsd(a, b, 0) }
2086}
2087
2088/// Returns a new vector with the low element of `a` replaced by the less-than
2089/// comparison of the lower elements of `a` and `b`.
2090///
2091/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
2092#[inline]
2093#[target_feature(enable = "sse2")]
2094#[cfg_attr(test, assert_instr(cmpltsd))]
2095#[stable(feature = "simd_x86", since = "1.27.0")]
2096pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
2097    unsafe { cmpsd(a, b, 1) }
2098}
2099
2100/// Returns a new vector with the low element of `a` replaced by the
2101/// less-than-or-equal comparison of the lower elements of `a` and `b`.
2102///
2103/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
2104#[inline]
2105#[target_feature(enable = "sse2")]
2106#[cfg_attr(test, assert_instr(cmplesd))]
2107#[stable(feature = "simd_x86", since = "1.27.0")]
2108pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
2109    unsafe { cmpsd(a, b, 2) }
2110}
2111
2112/// Returns a new vector with the low element of `a` replaced by the
2113/// greater-than comparison of the lower elements of `a` and `b`.
2114///
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
2116#[inline]
2117#[target_feature(enable = "sse2")]
2118#[cfg_attr(test, assert_instr(cmpltsd))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
2121    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2122}
2123
2124/// Returns a new vector with the low element of `a` replaced by the
2125/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
2126///
2127/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
2128#[inline]
2129#[target_feature(enable = "sse2")]
2130#[cfg_attr(test, assert_instr(cmplesd))]
2131#[stable(feature = "simd_x86", since = "1.27.0")]
2132pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
2133    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2134}
2135
2136/// Returns a new vector with the low element of `a` replaced by the result
2137/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
2138/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
2139/// otherwise.
2140///
2141/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
2142#[inline]
2143#[target_feature(enable = "sse2")]
2144#[cfg_attr(test, assert_instr(cmpordsd))]
2145#[stable(feature = "simd_x86", since = "1.27.0")]
2146pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
2147    unsafe { cmpsd(a, b, 7) }
2148}
2149
2150/// Returns a new vector with the low element of `a` replaced by the result of
2151/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
2152/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
2153///
2154/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
2155#[inline]
2156#[target_feature(enable = "sse2")]
2157#[cfg_attr(test, assert_instr(cmpunordsd))]
2158#[stable(feature = "simd_x86", since = "1.27.0")]
2159pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
2160    unsafe { cmpsd(a, b, 3) }
2161}
2162
2163/// Returns a new vector with the low element of `a` replaced by the not-equal
2164/// comparison of the lower elements of `a` and `b`.
2165///
2166/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
2167#[inline]
2168#[target_feature(enable = "sse2")]
2169#[cfg_attr(test, assert_instr(cmpneqsd))]
2170#[stable(feature = "simd_x86", since = "1.27.0")]
2171pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
2172    unsafe { cmpsd(a, b, 4) }
2173}
2174
2175/// Returns a new vector with the low element of `a` replaced by the
2176/// not-less-than comparison of the lower elements of `a` and `b`.
2177///
2178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
2179#[inline]
2180#[target_feature(enable = "sse2")]
2181#[cfg_attr(test, assert_instr(cmpnltsd))]
2182#[stable(feature = "simd_x86", since = "1.27.0")]
2183pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
2184    unsafe { cmpsd(a, b, 5) }
2185}
2186
2187/// Returns a new vector with the low element of `a` replaced by the
2188/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
2189///
2190/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
2191#[inline]
2192#[target_feature(enable = "sse2")]
2193#[cfg_attr(test, assert_instr(cmpnlesd))]
2194#[stable(feature = "simd_x86", since = "1.27.0")]
2195pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
2196    unsafe { cmpsd(a, b, 6) }
2197}
2198
2199/// Returns a new vector with the low element of `a` replaced by the
2200/// not-greater-than comparison of the lower elements of `a` and `b`.
2201///
2202/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
2203#[inline]
2204#[target_feature(enable = "sse2")]
2205#[cfg_attr(test, assert_instr(cmpnltsd))]
2206#[stable(feature = "simd_x86", since = "1.27.0")]
2207pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
2208    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2209}
2210
2211/// Returns a new vector with the low element of `a` replaced by the
2212/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
2213///
2214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
2215#[inline]
2216#[target_feature(enable = "sse2")]
2217#[cfg_attr(test, assert_instr(cmpnlesd))]
2218#[stable(feature = "simd_x86", since = "1.27.0")]
2219pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
2220    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2221}
2222
/// Compares corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `0` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpeqpd`.
    unsafe { cmppd(a, b, 0) }
}
2233
/// Compares corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `1` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpltpd`.
    unsafe { cmppd(a, b, 1) }
}
2244
/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `2` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmplepd`.
    unsafe { cmppd(a, b, 2) }
}
2255
/// Compares corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    // `a > b` is expressed as `b < a`, which is why the asserted instruction
    // is `cmpltpd` and the operands are swapped.
    _mm_cmplt_pd(b, a)
}
2266
/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    // `a >= b` is expressed as `b <= a`, which is why the asserted instruction
    // is `cmplepd` and the operands are swapped.
    _mm_cmple_pd(b, a)
}
2277
/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `7` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpordpd`.
    unsafe { cmppd(a, b, 7) }
}
2288
/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `3` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpunordpd`.
    unsafe { cmppd(a, b, 3) }
}
2299
/// Compares corresponding elements in `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `4` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpneqpd`.
    unsafe { cmppd(a, b, 4) }
}
2310
/// Compares corresponding elements in `a` and `b` for not-less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `5` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpnltpd`.
    unsafe { cmppd(a, b, 5) }
}
2321
/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
    // The immediate `6` is the predicate selector handed to `cmppd`; the
    // `assert_instr` attribute above checks that this lowers to `cmpnlepd`.
    unsafe { cmppd(a, b, 6) }
}
2332
/// Compares corresponding elements in `a` and `b` for not-greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
    // "`a` not-greater-than `b`" is expressed as "`b` not-less-than `a`",
    // which is why the asserted instruction is `cmpnltpd` and the operands
    // are swapped.
    _mm_cmpnlt_pd(b, a)
}
2343
/// Compares corresponding elements in `a` and `b` for
/// not-greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
    // "`a` not-greater-than-or-equal `b`" is expressed as
    // "`b` not-less-than-or-equal `a`", which is why the asserted instruction
    // is `cmpnlepd` and the operands are swapped.
    _mm_cmpnle_pd(b, a)
}
2355
/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): the `i32` result is presumably `1` when the comparison
    // holds and `0` otherwise — confirm against Intel's documentation.
    unsafe { comieqsd(a, b) }
}
2366
/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): the `i32` result is presumably `1` when the comparison
    // holds and `0` otherwise — confirm against Intel's documentation.
    unsafe { comiltsd(a, b) }
}
2377
/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): the `i32` result is presumably `1` when the comparison
    // holds and `0` otherwise — confirm against Intel's documentation.
    unsafe { comilesd(a, b) }
}
2388
/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): the `i32` result is presumably `1` when the comparison
    // holds and `0` otherwise — confirm against Intel's documentation.
    unsafe { comigtsd(a, b) }
}
2399
/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): the `i32` result is presumably `1` when the comparison
    // holds and `0` otherwise — confirm against Intel's documentation.
    unsafe { comigesd(a, b) }
}
2410
/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): the `i32` result is presumably `1` when the comparison
    // holds and `0` otherwise — confirm against Intel's documentation.
    unsafe { comineqsd(a, b) }
}
2421
/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): documented identically to `_mm_comieq_sd`; the visible
    // difference is the asserted instruction (`ucomisd` rather than `comisd`),
    // which presumably only changes NaN signalling — confirm against Intel's
    // documentation.
    unsafe { ucomieqsd(a, b) }
}
2432
/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): documented identically to `_mm_comilt_sd`; the visible
    // difference is the asserted instruction (`ucomisd` rather than `comisd`),
    // which presumably only changes NaN signalling — confirm against Intel's
    // documentation.
    unsafe { ucomiltsd(a, b) }
}
2443
/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): documented identically to `_mm_comile_sd`; the visible
    // difference is the asserted instruction (`ucomisd` rather than `comisd`),
    // which presumably only changes NaN signalling — confirm against Intel's
    // documentation.
    unsafe { ucomilesd(a, b) }
}
2454
/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): documented identically to `_mm_comigt_sd`; the visible
    // difference is the asserted instruction (`ucomisd` rather than `comisd`),
    // which presumably only changes NaN signalling — confirm against Intel's
    // documentation.
    unsafe { ucomigtsd(a, b) }
}
2465
/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): documented identically to `_mm_comige_sd`; the visible
    // difference is the asserted instruction (`ucomisd` rather than `comisd`),
    // which presumably only changes NaN signalling — confirm against Intel's
    // documentation.
    unsafe { ucomigesd(a, b) }
}
2476
/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    // NOTE(review): documented identically to `_mm_comineq_sd`; the visible
    // difference is the asserted instruction (`ucomisd` rather than `comisd`),
    // which presumably only changes NaN signalling — confirm against Intel's
    // documentation.
    unsafe { ucomineqsd(a, b) }
}
2487
2488/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2489/// packed single-precision (32-bit) floating-point elements
2490///
2491/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2492#[inline]
2493#[target_feature(enable = "sse2")]
2494#[cfg_attr(test, assert_instr(cvtpd2ps))]
2495#[stable(feature = "simd_x86", since = "1.27.0")]
2496#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2497pub const fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2498    unsafe {
2499        let r = simd_cast::<_, f32x2>(a.as_f64x2());
2500        let zero = f32x2::ZERO;
2501        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2502    }
2503}
2504
2505/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2506/// packed
2507/// double-precision (64-bit) floating-point elements.
2508///
2509/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2510#[inline]
2511#[target_feature(enable = "sse2")]
2512#[cfg_attr(test, assert_instr(cvtps2pd))]
2513#[stable(feature = "simd_x86", since = "1.27.0")]
2514#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2515pub const fn _mm_cvtps_pd(a: __m128) -> __m128d {
2516    unsafe {
2517        let a = a.as_f32x4();
2518        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2519    }
2520}
2521
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    // The vector returned by `cvtpd2dq` is reinterpreted bit-for-bit as
    // `__m128i`.
    // NOTE(review): only two doubles are converted, so the upper two 32-bit
    // lanes of the result are presumably zero — confirm against Intel's
    // documentation.
    unsafe { transmute(cvtpd2dq(a)) }
}
2533
/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    // The `assert_instr` attribute above checks that this lowers to
    // `cvtsd2si`; see `_mm_cvttsd_si32` for the truncating variant.
    unsafe { cvtsd2si(a) }
}
2545
/// Converts the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper element from `a`
/// to the upper element of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    // NOTE(review): `__m128` holds four `f32` lanes, so "the upper element"
    // in the summary presumably means the upper three lanes of `a` — confirm
    // against Intel's documentation.
    unsafe { cvtsd2ss(a, b) }
}
2559
/// Returns the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    // Lane 0 is the lower element; the extracted type is inferred from the
    // `f64` return type.
    unsafe { simd_extract!(a, 0) }
}
2570
2571/// Converts the lower single-precision (32-bit) floating-point element in `b`
2572/// to a double-precision (64-bit) floating-point element, store the result in
2573/// the lower element of the return value, and copies the upper element from `a`
2574/// to the upper element the return value.
2575///
2576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2577#[inline]
2578#[target_feature(enable = "sse2")]
2579#[cfg_attr(test, assert_instr(cvtss2sd))]
2580#[stable(feature = "simd_x86", since = "1.27.0")]
2581#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2582pub const fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2583    unsafe {
2584        let elt: f32 = simd_extract!(b, 0);
2585        simd_insert!(a, 0, elt as f64)
2586    }
2587}
2588
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    // The vector returned by `cvttpd2dq` is reinterpreted bit-for-bit as
    // `__m128i`.
    // NOTE(review): only two doubles are converted, so the upper two 32-bit
    // lanes of the result are presumably zero — confirm against Intel's
    // documentation.
    unsafe { transmute(cvttpd2dq(a)) }
}
2600
/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    // The `assert_instr` attribute above checks that this lowers to
    // `cvttsd2si`; see `_mm_cvtsd_si32` for the non-truncating variant.
    unsafe { cvttsd2si(a) }
}
2612
/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    // The vector returned by `cvttps2dq` is reinterpreted bit-for-bit as
    // `__m128i`.
    unsafe { transmute(cvttps2dq(a)) }
}
2624
/// Copies double-precision (64-bit) floating-point element `a` to the lower
/// element of the packed 64-bit return value and sets the upper element to
/// `0.0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_sd(a: f64) -> __m128d {
    // `_mm_set_pd` takes the upper element first, so `0.0` lands in the upper
    // lane and `a` in the lower lane.
    _mm_set_pd(0.0, a)
}
2636
/// Broadcasts double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_pd(a: f64) -> __m128d {
    // Both lanes receive the same value, so argument order is irrelevant here.
    _mm_set_pd(a, a)
}
2648
2649/// Broadcasts double-precision (64-bit) floating-point value a to all elements
2650/// of the return value.
2651///
2652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2653#[inline]
2654#[target_feature(enable = "sse2")]
2655#[stable(feature = "simd_x86", since = "1.27.0")]
2656#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2657pub const fn _mm_set_pd1(a: f64) -> __m128d {
2658    _mm_set_pd(a, a)
2659}
2660
/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// `a` becomes the upper element (lane 1) and `b` the lower element (lane 0).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    // The backing array is indexed from the lowest lane, hence `[b, a]`.
    __m128d([b, a])
}
2672
/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// `a` becomes the lower element (lane 0) and `b` the upper element (lane 1).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
    // `_mm_set_pd` takes the upper element first, so the arguments are
    // swapped.
    _mm_set_pd(b, a)
}
2684
/// Returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_pd() -> __m128d {
    // The all-zero bit pattern is produced inside a `const` block, so the
    // value is computed at compile time.
    const { unsafe { mem::zeroed() } }
}
2697
2698/// Returns a mask of the most significant bit of each element in `a`.
2699///
2700/// The mask is stored in the 2 least significant bits of the return value.
2701/// All other bits are set to `0`.
2702///
2703/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
2704#[inline]
2705#[target_feature(enable = "sse2")]
2706#[cfg_attr(test, assert_instr(movmskpd))]
2707#[stable(feature = "simd_x86", since = "1.27.0")]
2708#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2709pub const fn _mm_movemask_pd(a: __m128d) -> i32 {
2710    // Propagate the highest bit to the rest, because simd_bitmask
2711    // requires all-1 or all-0.
2712    unsafe {
2713        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2714        simd_bitmask::<i64x2, u8>(mask) as i32
2715    }
2716}
2717
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
///
/// # Safety
///
/// `mem_addr` is reinterpreted as a `*const __m128d` and dereferenced, so it
/// must be valid for a read of a whole `__m128d` and must satisfy the
/// alignment requirement described above.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
    // The cast raises the pointee alignment from `f64` to `__m128d`; the
    // clippy lint is silenced above because the caller guarantees alignment.
    *(mem_addr as *const __m128d)
}
2736
2737/// Loads a 64-bit double-precision value to the low element of a
2738/// 128-bit integer vector and clears the upper element.
2739///
2740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2741#[inline]
2742#[target_feature(enable = "sse2")]
2743#[cfg_attr(test, assert_instr(movsd))]
2744#[stable(feature = "simd_x86", since = "1.27.0")]
2745#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2746pub const unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2747    _mm_setr_pd(*mem_addr, 0.)
2748}
2749
2750/// Loads a double-precision value into the high-order bits of a 128-bit
2751/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2752/// bits of the first operand.
2753///
2754/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2755#[inline]
2756#[target_feature(enable = "sse2")]
2757#[cfg_attr(test, assert_instr(movhps))]
2758#[stable(feature = "simd_x86", since = "1.27.0")]
2759#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2760pub const unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2761    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2762}
2763
2764/// Loads a double-precision value into the low-order bits of a 128-bit
2765/// vector of `[2 x double]`. The high-order bits are copied from the
2766/// high-order bits of the first operand.
2767///
2768/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2769#[inline]
2770#[target_feature(enable = "sse2")]
2771#[cfg_attr(test, assert_instr(movlps))]
2772#[stable(feature = "simd_x86", since = "1.27.0")]
2773#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2774pub const unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2775    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2776}
2777
/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
/// aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    //
    // `mem_addr` is bound to the `p` operand and the vector to the `a`
    // operand. `vps!` is defined outside this function; presumably it builds
    // the instruction text with `{p}` as the memory destination — confirm
    // against the macro definition.
    crate::arch::asm!(
        vps!("movntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2807
/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
///
/// # Safety
///
/// `mem_addr` is written through as a `*mut f64`, so it must be valid for a
/// write of one `f64` and aligned for `f64`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
    // Lane 0 is the lower element of `a`.
    *mem_addr = simd_extract!(a, 0)
}
2820
/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
///
/// # Safety
///
/// `mem_addr` is reinterpreted as a `*mut __m128d` and written through, so it
/// must be valid for a write of a whole `__m128d` and must satisfy the
/// alignment requirement described above.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
    // The cast raises the pointee alignment from `f64` to `__m128d`; the
    // clippy lint is silenced above because the caller guarantees alignment.
    *(mem_addr as *mut __m128d) = a;
}
2838
/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
///
/// # Safety
///
/// A whole `__m128d` is written starting at `mem_addr`, so the pointer must
/// be valid for a write of that size.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
    // `write_unaligned` is what lifts the alignment requirement.
    mem_addr.cast::<__m128d>().write_unaligned(a);
}
2852
/// Stores the 16-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
///
/// # Safety
///
/// An `i16` is written starting at `mem_addr`, so the pointer must be valid
/// for a write of that size.
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
    // Lane 0 of the `i16x8` view of `a` is written with an unaligned store.
    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
}
2865
/// Stores the 32-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
///
/// # Safety
///
/// An `i32` is written starting at `mem_addr`, so the pointer must be valid
/// for a write of that size.
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
    // Lane 0 of the `i32x4` view of `a` is written with an unaligned store.
    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
}
2878
/// Stores the 64-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
///
/// # Safety
///
/// An `i64` is written starting at `mem_addr`, so the pointer must be valid
/// for a write of that size.
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
    // Lane 0 of the `i64x2` view of `a` is written with an unaligned store.
    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
}
2891
2892/// Stores the lower double-precision (64-bit) floating-point element from `a`
2893/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2894/// 16-byte boundary or a general-protection exception may be generated.
2895///
2896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2897#[inline]
2898#[target_feature(enable = "sse2")]
2899#[stable(feature = "simd_x86", since = "1.27.0")]
2900#[allow(clippy::cast_ptr_alignment)]
2901#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2902pub const unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2903    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2904    *(mem_addr as *mut __m128d) = b;
2905}
2906
2907/// Stores the lower double-precision (64-bit) floating-point element from `a`
2908/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2909/// 16-byte boundary or a general-protection exception may be generated.
2910///
2911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2912#[inline]
2913#[target_feature(enable = "sse2")]
2914#[stable(feature = "simd_x86", since = "1.27.0")]
2915#[allow(clippy::cast_ptr_alignment)]
2916#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2917pub const unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2918    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2919    *(mem_addr as *mut __m128d) = b;
2920}
2921
2922/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2923/// memory in reverse order.
2924/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2925/// exception may be generated.
2926///
2927/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2928#[inline]
2929#[target_feature(enable = "sse2")]
2930#[stable(feature = "simd_x86", since = "1.27.0")]
2931#[allow(clippy::cast_ptr_alignment)]
2932#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2933pub const unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2934    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2935    *(mem_addr as *mut __m128d) = b;
2936}
2937
/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
///
/// # Safety
///
/// `mem_addr` is written through as a `*mut f64`, so it must be valid for a
/// write of one `f64` and aligned for `f64`.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
    // Lane 1 is the upper element of `a`.
    *mem_addr = simd_extract!(a, 1);
}
2950
2951/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2952/// memory location.
2953///
2954/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2955#[inline]
2956#[target_feature(enable = "sse2")]
2957#[cfg_attr(test, assert_instr(movlps))]
2958#[stable(feature = "simd_x86", since = "1.27.0")]
2959#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2960pub const unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2961    *mem_addr = simd_extract!(a, 0);
2962}
2963
2964/// Loads a double-precision (64-bit) floating-point element from memory
2965/// into both elements of returned vector.
2966///
2967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2968#[inline]
2969#[target_feature(enable = "sse2")]
2970// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2971#[stable(feature = "simd_x86", since = "1.27.0")]
2972#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2973pub const unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2974    let d = *mem_addr;
2975    _mm_setr_pd(d, d)
2976}
2977
2978/// Loads a double-precision (64-bit) floating-point element from memory
2979/// into both elements of returned vector.
2980///
2981/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2982#[inline]
2983#[target_feature(enable = "sse2")]
2984// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2985#[stable(feature = "simd_x86", since = "1.27.0")]
2986#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2987pub const unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2988    _mm_load1_pd(mem_addr)
2989}
2990
2991/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2992/// the returned vector in reverse order. `mem_addr` must be aligned on a
2993/// 16-byte boundary or a general-protection exception may be generated.
2994///
2995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
2996#[inline]
2997#[target_feature(enable = "sse2")]
2998#[cfg_attr(
2999    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
3000    assert_instr(movaps)
3001)]
3002#[stable(feature = "simd_x86", since = "1.27.0")]
3003#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3004pub const unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
3005    let a = _mm_load_pd(mem_addr);
3006    simd_shuffle!(a, a, [1, 0])
3007}
3008
3009/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
3010/// floating-point elements) from memory into the returned vector.
3011/// `mem_addr` does not need to be aligned on any particular boundary.
3012///
3013/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
3014#[inline]
3015#[target_feature(enable = "sse2")]
3016#[cfg_attr(test, assert_instr(movups))]
3017#[stable(feature = "simd_x86", since = "1.27.0")]
3018#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3019pub const unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
3020    let mut dst = _mm_undefined_pd();
3021    ptr::copy_nonoverlapping(
3022        mem_addr as *const u8,
3023        ptr::addr_of_mut!(dst) as *mut u8,
3024        mem::size_of::<__m128d>(),
3025    );
3026    dst
3027}
3028
3029/// Loads unaligned 16-bits of integer data from memory into new vector.
3030///
3031/// `mem_addr` does not need to be aligned on any particular boundary.
3032///
3033/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
3034#[inline]
3035#[target_feature(enable = "sse2")]
3036#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3037#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3038pub const unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
3039    transmute(i16x8::new(
3040        ptr::read_unaligned(mem_addr as *const i16),
3041        0,
3042        0,
3043        0,
3044        0,
3045        0,
3046        0,
3047        0,
3048    ))
3049}
3050
3051/// Loads unaligned 32-bits of integer data from memory into new vector.
3052///
3053/// `mem_addr` does not need to be aligned on any particular boundary.
3054///
3055/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
3056#[inline]
3057#[target_feature(enable = "sse2")]
3058#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3059#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3060pub const unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
3061    transmute(i32x4::new(
3062        ptr::read_unaligned(mem_addr as *const i32),
3063        0,
3064        0,
3065        0,
3066    ))
3067}
3068
3069/// Loads unaligned 64-bits of integer data from memory into new vector.
3070///
3071/// `mem_addr` does not need to be aligned on any particular boundary.
3072///
3073/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
3074#[inline]
3075#[target_feature(enable = "sse2")]
3076#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
3077#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3078pub const unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
3079    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
3080}
3081
3082/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
3083/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
3084/// parameter as a specifier.
3085///
3086/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
3087#[inline]
3088#[target_feature(enable = "sse2")]
3089#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
3090#[rustc_legacy_const_generics(2)]
3091#[stable(feature = "simd_x86", since = "1.27.0")]
3092#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3093pub const fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
3094    static_assert_uimm_bits!(MASK, 8);
3095    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
3096}
3097
3098/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
3099/// 64 bits are set to the lower 64 bits of the second parameter. The upper
3100/// 64 bits are set to the upper 64 bits of the first parameter.
3101///
3102/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
3103#[inline]
3104#[target_feature(enable = "sse2")]
3105#[cfg_attr(test, assert_instr(movsd))]
3106#[stable(feature = "simd_x86", since = "1.27.0")]
3107#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3108pub const fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
3109    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
3110}
3111
3112/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
3113/// floating-point vector of `[4 x float]`.
3114///
3115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
3116#[inline]
3117#[target_feature(enable = "sse2")]
3118#[stable(feature = "simd_x86", since = "1.27.0")]
3119#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3120pub const fn _mm_castpd_ps(a: __m128d) -> __m128 {
3121    unsafe { transmute(a) }
3122}
3123
3124/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
3125/// integer vector.
3126///
3127/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
3128#[inline]
3129#[target_feature(enable = "sse2")]
3130#[stable(feature = "simd_x86", since = "1.27.0")]
3131#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3132pub const fn _mm_castpd_si128(a: __m128d) -> __m128i {
3133    unsafe { transmute(a) }
3134}
3135
3136/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
3137/// floating-point vector of `[2 x double]`.
3138///
3139/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
3140#[inline]
3141#[target_feature(enable = "sse2")]
3142#[stable(feature = "simd_x86", since = "1.27.0")]
3143#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3144pub const fn _mm_castps_pd(a: __m128) -> __m128d {
3145    unsafe { transmute(a) }
3146}
3147
3148/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
3149/// integer vector.
3150///
3151/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
3152#[inline]
3153#[target_feature(enable = "sse2")]
3154#[stable(feature = "simd_x86", since = "1.27.0")]
3155#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3156pub const fn _mm_castps_si128(a: __m128) -> __m128i {
3157    unsafe { transmute(a) }
3158}
3159
3160/// Casts a 128-bit integer vector into a 128-bit floating-point vector
3161/// of `[2 x double]`.
3162///
3163/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
3164#[inline]
3165#[target_feature(enable = "sse2")]
3166#[stable(feature = "simd_x86", since = "1.27.0")]
3167#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3168pub const fn _mm_castsi128_pd(a: __m128i) -> __m128d {
3169    unsafe { transmute(a) }
3170}
3171
3172/// Casts a 128-bit integer vector into a 128-bit floating-point vector
3173/// of `[4 x float]`.
3174///
3175/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
3176#[inline]
3177#[target_feature(enable = "sse2")]
3178#[stable(feature = "simd_x86", since = "1.27.0")]
3179#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3180pub const fn _mm_castsi128_ps(a: __m128i) -> __m128 {
3181    unsafe { transmute(a) }
3182}
3183
3184/// Returns vector of type __m128d with indeterminate elements.with indetermination elements.
3185/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3186/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3187/// In practice, this is typically equivalent to [`mem::zeroed`].
3188///
3189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
3190#[inline]
3191#[target_feature(enable = "sse2")]
3192#[stable(feature = "simd_x86", since = "1.27.0")]
3193#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3194pub const fn _mm_undefined_pd() -> __m128d {
3195    const { unsafe { mem::zeroed() } }
3196}
3197
3198/// Returns vector of type __m128i with indeterminate elements.with indetermination elements.
3199/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3200/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3201/// In practice, this is typically equivalent to [`mem::zeroed`].
3202///
3203/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
3204#[inline]
3205#[target_feature(enable = "sse2")]
3206#[stable(feature = "simd_x86", since = "1.27.0")]
3207#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3208pub const fn _mm_undefined_si128() -> __m128i {
3209    const { unsafe { mem::zeroed() } }
3210}
3211
3212/// The resulting `__m128d` element is composed by the low-order values of
3213/// the two `__m128d` interleaved input elements, i.e.:
3214///
3215/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3216/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3217///
3218/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
3219#[inline]
3220#[target_feature(enable = "sse2")]
3221#[cfg_attr(test, assert_instr(unpckhpd))]
3222#[stable(feature = "simd_x86", since = "1.27.0")]
3223#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3224pub const fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3225    unsafe { simd_shuffle!(a, b, [1, 3]) }
3226}
3227
3228/// The resulting `__m128d` element is composed by the high-order values of
3229/// the two `__m128d` interleaved input elements, i.e.:
3230///
3231/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3232/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3233///
3234/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
3235#[inline]
3236#[target_feature(enable = "sse2")]
3237#[cfg_attr(test, assert_instr(movlhps))]
3238#[stable(feature = "simd_x86", since = "1.27.0")]
3239#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3240pub const fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3241    unsafe { simd_shuffle!(a, b, [0, 2]) }
3242}
3243
3244#[allow(improper_ctypes)]
3245unsafe extern "C" {
3246    #[link_name = "llvm.x86.sse2.pause"]
3247    fn pause();
3248    #[link_name = "llvm.x86.sse2.clflush"]
3249    fn clflush(p: *const u8);
3250    #[link_name = "llvm.x86.sse2.lfence"]
3251    fn lfence();
3252    #[link_name = "llvm.x86.sse2.mfence"]
3253    fn mfence();
3254    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3255    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3256    #[link_name = "llvm.x86.sse2.psad.bw"]
3257    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3258    #[link_name = "llvm.x86.sse2.psll.w"]
3259    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3260    #[link_name = "llvm.x86.sse2.psll.d"]
3261    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3262    #[link_name = "llvm.x86.sse2.psll.q"]
3263    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3264    #[link_name = "llvm.x86.sse2.psra.w"]
3265    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3266    #[link_name = "llvm.x86.sse2.psra.d"]
3267    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3268    #[link_name = "llvm.x86.sse2.psrl.w"]
3269    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3270    #[link_name = "llvm.x86.sse2.psrl.d"]
3271    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3272    #[link_name = "llvm.x86.sse2.psrl.q"]
3273    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3274    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3275    fn cvtps2dq(a: __m128) -> i32x4;
3276    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3277    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3278    #[link_name = "llvm.x86.sse2.max.sd"]
3279    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3280    #[link_name = "llvm.x86.sse2.max.pd"]
3281    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3282    #[link_name = "llvm.x86.sse2.min.sd"]
3283    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3284    #[link_name = "llvm.x86.sse2.min.pd"]
3285    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3286    #[link_name = "llvm.x86.sse2.cmp.sd"]
3287    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3288    #[link_name = "llvm.x86.sse2.cmp.pd"]
3289    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3290    #[link_name = "llvm.x86.sse2.comieq.sd"]
3291    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3292    #[link_name = "llvm.x86.sse2.comilt.sd"]
3293    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3294    #[link_name = "llvm.x86.sse2.comile.sd"]
3295    fn comilesd(a: __m128d, b: __m128d) -> i32;
3296    #[link_name = "llvm.x86.sse2.comigt.sd"]
3297    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3298    #[link_name = "llvm.x86.sse2.comige.sd"]
3299    fn comigesd(a: __m128d, b: __m128d) -> i32;
3300    #[link_name = "llvm.x86.sse2.comineq.sd"]
3301    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3302    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3303    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3304    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3305    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3306    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3307    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3308    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3309    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3310    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3311    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3312    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3313    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3314    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3315    fn cvtpd2dq(a: __m128d) -> i32x4;
3316    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3317    fn cvtsd2si(a: __m128d) -> i32;
3318    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3319    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3320    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3321    fn cvttpd2dq(a: __m128d) -> i32x4;
3322    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3323    fn cvttsd2si(a: __m128d) -> i32;
3324    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3325    fn cvttps2dq(a: __m128) -> i32x4;
3326}
3327
3328#[cfg(test)]
3329mod tests {
3330    use crate::core_arch::assert_eq_const as assert_eq;
3331    use crate::{
3332        core_arch::{simd::*, x86::*},
3333        hint::black_box,
3334    };
3335    use std::{boxed, f32, f64, mem, ptr};
3336    use stdarch_test::simd_test;
3337
3338    const NAN: f64 = f64::NAN;
3339
3340    #[test]
3341    fn test_mm_pause() {
3342        _mm_pause()
3343    }
3344
3345    #[simd_test(enable = "sse2")]
3346    fn test_mm_clflush() {
3347        let x = 0_u8;
3348        unsafe {
3349            _mm_clflush(ptr::addr_of!(x));
3350        }
3351    }
3352
3353    #[simd_test(enable = "sse2")]
3354    // Miri cannot support this until it is clear how it fits in the Rust memory model
3355    #[cfg_attr(miri, ignore)]
3356    fn test_mm_lfence() {
3357        _mm_lfence();
3358    }
3359
3360    #[simd_test(enable = "sse2")]
3361    // Miri cannot support this until it is clear how it fits in the Rust memory model
3362    #[cfg_attr(miri, ignore)]
3363    fn test_mm_mfence() {
3364        _mm_mfence();
3365    }
3366
3367    #[simd_test(enable = "sse2")]
3368    const fn test_mm_add_epi8() {
3369        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3370        #[rustfmt::skip]
3371        let b = _mm_setr_epi8(
3372            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3373        );
3374        let r = _mm_add_epi8(a, b);
3375        #[rustfmt::skip]
3376        let e = _mm_setr_epi8(
3377            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3378        );
3379        assert_eq_m128i(r, e);
3380    }
3381
3382    #[simd_test(enable = "sse2")]
3383    fn test_mm_add_epi8_overflow() {
3384        let a = _mm_set1_epi8(0x7F);
3385        let b = _mm_set1_epi8(1);
3386        let r = _mm_add_epi8(a, b);
3387        assert_eq_m128i(r, _mm_set1_epi8(-128));
3388    }
3389
3390    #[simd_test(enable = "sse2")]
3391    const fn test_mm_add_epi16() {
3392        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3393        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3394        let r = _mm_add_epi16(a, b);
3395        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3396        assert_eq_m128i(r, e);
3397    }
3398
3399    #[simd_test(enable = "sse2")]
3400    const fn test_mm_add_epi32() {
3401        let a = _mm_setr_epi32(0, 1, 2, 3);
3402        let b = _mm_setr_epi32(4, 5, 6, 7);
3403        let r = _mm_add_epi32(a, b);
3404        let e = _mm_setr_epi32(4, 6, 8, 10);
3405        assert_eq_m128i(r, e);
3406    }
3407
3408    #[simd_test(enable = "sse2")]
3409    const fn test_mm_add_epi64() {
3410        let a = _mm_setr_epi64x(0, 1);
3411        let b = _mm_setr_epi64x(2, 3);
3412        let r = _mm_add_epi64(a, b);
3413        let e = _mm_setr_epi64x(2, 4);
3414        assert_eq_m128i(r, e);
3415    }
3416
3417    #[simd_test(enable = "sse2")]
3418    const fn test_mm_adds_epi8() {
3419        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3420        #[rustfmt::skip]
3421        let b = _mm_setr_epi8(
3422            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3423        );
3424        let r = _mm_adds_epi8(a, b);
3425        #[rustfmt::skip]
3426        let e = _mm_setr_epi8(
3427            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3428        );
3429        assert_eq_m128i(r, e);
3430    }
3431
3432    #[simd_test(enable = "sse2")]
3433    fn test_mm_adds_epi8_saturate_positive() {
3434        let a = _mm_set1_epi8(0x7F);
3435        let b = _mm_set1_epi8(1);
3436        let r = _mm_adds_epi8(a, b);
3437        assert_eq_m128i(r, a);
3438    }
3439
3440    #[simd_test(enable = "sse2")]
3441    fn test_mm_adds_epi8_saturate_negative() {
3442        let a = _mm_set1_epi8(-0x80);
3443        let b = _mm_set1_epi8(-1);
3444        let r = _mm_adds_epi8(a, b);
3445        assert_eq_m128i(r, a);
3446    }
3447
3448    #[simd_test(enable = "sse2")]
3449    const fn test_mm_adds_epi16() {
3450        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3451        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3452        let r = _mm_adds_epi16(a, b);
3453        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3454        assert_eq_m128i(r, e);
3455    }
3456
3457    #[simd_test(enable = "sse2")]
3458    fn test_mm_adds_epi16_saturate_positive() {
3459        let a = _mm_set1_epi16(0x7FFF);
3460        let b = _mm_set1_epi16(1);
3461        let r = _mm_adds_epi16(a, b);
3462        assert_eq_m128i(r, a);
3463    }
3464
3465    #[simd_test(enable = "sse2")]
3466    fn test_mm_adds_epi16_saturate_negative() {
3467        let a = _mm_set1_epi16(-0x8000);
3468        let b = _mm_set1_epi16(-1);
3469        let r = _mm_adds_epi16(a, b);
3470        assert_eq_m128i(r, a);
3471    }
3472
3473    #[simd_test(enable = "sse2")]
3474    const fn test_mm_adds_epu8() {
3475        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3476        #[rustfmt::skip]
3477        let b = _mm_setr_epi8(
3478            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3479        );
3480        let r = _mm_adds_epu8(a, b);
3481        #[rustfmt::skip]
3482        let e = _mm_setr_epi8(
3483            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3484        );
3485        assert_eq_m128i(r, e);
3486    }
3487
3488    #[simd_test(enable = "sse2")]
3489    fn test_mm_adds_epu8_saturate() {
3490        let a = _mm_set1_epi8(!0);
3491        let b = _mm_set1_epi8(1);
3492        let r = _mm_adds_epu8(a, b);
3493        assert_eq_m128i(r, a);
3494    }
3495
3496    #[simd_test(enable = "sse2")]
3497    const fn test_mm_adds_epu16() {
3498        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3499        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3500        let r = _mm_adds_epu16(a, b);
3501        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3502        assert_eq_m128i(r, e);
3503    }
3504
3505    #[simd_test(enable = "sse2")]
3506    fn test_mm_adds_epu16_saturate() {
3507        let a = _mm_set1_epi16(!0);
3508        let b = _mm_set1_epi16(1);
3509        let r = _mm_adds_epu16(a, b);
3510        assert_eq_m128i(r, a);
3511    }
3512
3513    #[simd_test(enable = "sse2")]
3514    const fn test_mm_avg_epu8() {
3515        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3516        let r = _mm_avg_epu8(a, b);
3517        assert_eq_m128i(r, _mm_set1_epi8(6));
3518    }
3519
3520    #[simd_test(enable = "sse2")]
3521    const fn test_mm_avg_epu16() {
3522        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3523        let r = _mm_avg_epu16(a, b);
3524        assert_eq_m128i(r, _mm_set1_epi16(6));
3525    }
3526
3527    #[simd_test(enable = "sse2")]
3528    fn test_mm_madd_epi16() {
3529        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3530        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3531        let r = _mm_madd_epi16(a, b);
3532        let e = _mm_setr_epi32(29, 81, 149, 233);
3533        assert_eq_m128i(r, e);
3534
3535        // Test large values.
3536        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3537        let a = _mm_setr_epi16(
3538            i16::MAX,
3539            i16::MAX,
3540            i16::MIN,
3541            i16::MIN,
3542            i16::MIN,
3543            i16::MAX,
3544            0,
3545            0,
3546        );
3547        let b = _mm_setr_epi16(
3548            i16::MAX,
3549            i16::MAX,
3550            i16::MIN,
3551            i16::MIN,
3552            i16::MAX,
3553            i16::MIN,
3554            0,
3555            0,
3556        );
3557        let r = _mm_madd_epi16(a, b);
3558        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3559        assert_eq_m128i(r, e);
3560    }
3561
3562    #[simd_test(enable = "sse2")]
3563    const fn test_mm_max_epi16() {
3564        let a = _mm_set1_epi16(1);
3565        let b = _mm_set1_epi16(-1);
3566        let r = _mm_max_epi16(a, b);
3567        assert_eq_m128i(r, a);
3568    }
3569
3570    #[simd_test(enable = "sse2")]
3571    const fn test_mm_max_epu8() {
3572        let a = _mm_set1_epi8(1);
3573        let b = _mm_set1_epi8(!0);
3574        let r = _mm_max_epu8(a, b);
3575        assert_eq_m128i(r, b);
3576    }
3577
3578    #[simd_test(enable = "sse2")]
3579    const fn test_mm_min_epi16() {
3580        let a = _mm_set1_epi16(1);
3581        let b = _mm_set1_epi16(-1);
3582        let r = _mm_min_epi16(a, b);
3583        assert_eq_m128i(r, b);
3584    }
3585
3586    #[simd_test(enable = "sse2")]
3587    const fn test_mm_min_epu8() {
3588        let a = _mm_set1_epi8(1);
3589        let b = _mm_set1_epi8(!0);
3590        let r = _mm_min_epu8(a, b);
3591        assert_eq_m128i(r, a);
3592    }
3593
3594    #[simd_test(enable = "sse2")]
3595    const fn test_mm_mulhi_epi16() {
3596        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3597        let r = _mm_mulhi_epi16(a, b);
3598        assert_eq_m128i(r, _mm_set1_epi16(-16));
3599    }
3600
3601    #[simd_test(enable = "sse2")]
3602    const fn test_mm_mulhi_epu16() {
3603        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3604        let r = _mm_mulhi_epu16(a, b);
3605        assert_eq_m128i(r, _mm_set1_epi16(15));
3606    }
3607
3608    #[simd_test(enable = "sse2")]
3609    const fn test_mm_mullo_epi16() {
3610        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3611        let r = _mm_mullo_epi16(a, b);
3612        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3613    }
3614
3615    #[simd_test(enable = "sse2")]
3616    const fn test_mm_mul_epu32() {
3617        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3618        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3619        let r = _mm_mul_epu32(a, b);
3620        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3621        assert_eq_m128i(r, e);
3622    }
3623
3624    #[simd_test(enable = "sse2")]
3625    fn test_mm_sad_epu8() {
3626        #[rustfmt::skip]
3627        let a = _mm_setr_epi8(
3628            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3629            1, 2, 3, 4,
3630            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3631            1, 2, 3, 4,
3632        );
3633        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3634        let r = _mm_sad_epu8(a, b);
3635        let e = _mm_setr_epi64x(1020, 614);
3636        assert_eq_m128i(r, e);
3637    }
3638
3639    #[simd_test(enable = "sse2")]
3640    const fn test_mm_sub_epi8() {
3641        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3642        let r = _mm_sub_epi8(a, b);
3643        assert_eq_m128i(r, _mm_set1_epi8(-1));
3644    }
3645
3646    #[simd_test(enable = "sse2")]
3647    const fn test_mm_sub_epi16() {
3648        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3649        let r = _mm_sub_epi16(a, b);
3650        assert_eq_m128i(r, _mm_set1_epi16(-1));
3651    }
3652
3653    #[simd_test(enable = "sse2")]
3654    const fn test_mm_sub_epi32() {
3655        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3656        let r = _mm_sub_epi32(a, b);
3657        assert_eq_m128i(r, _mm_set1_epi32(-1));
3658    }
3659
3660    #[simd_test(enable = "sse2")]
3661    const fn test_mm_sub_epi64() {
3662        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3663        let r = _mm_sub_epi64(a, b);
3664        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3665    }
3666
3667    #[simd_test(enable = "sse2")]
3668    const fn test_mm_subs_epi8() {
3669        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3670        let r = _mm_subs_epi8(a, b);
3671        assert_eq_m128i(r, _mm_set1_epi8(3));
3672    }
3673
3674    #[simd_test(enable = "sse2")]
3675    fn test_mm_subs_epi8_saturate_positive() {
3676        let a = _mm_set1_epi8(0x7F);
3677        let b = _mm_set1_epi8(-1);
3678        let r = _mm_subs_epi8(a, b);
3679        assert_eq_m128i(r, a);
3680    }
3681
3682    #[simd_test(enable = "sse2")]
3683    fn test_mm_subs_epi8_saturate_negative() {
3684        let a = _mm_set1_epi8(-0x80);
3685        let b = _mm_set1_epi8(1);
3686        let r = _mm_subs_epi8(a, b);
3687        assert_eq_m128i(r, a);
3688    }
3689
3690    #[simd_test(enable = "sse2")]
3691    const fn test_mm_subs_epi16() {
3692        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3693        let r = _mm_subs_epi16(a, b);
3694        assert_eq_m128i(r, _mm_set1_epi16(3));
3695    }
3696
3697    #[simd_test(enable = "sse2")]
3698    fn test_mm_subs_epi16_saturate_positive() {
3699        let a = _mm_set1_epi16(0x7FFF);
3700        let b = _mm_set1_epi16(-1);
3701        let r = _mm_subs_epi16(a, b);
3702        assert_eq_m128i(r, a);
3703    }
3704
3705    #[simd_test(enable = "sse2")]
3706    fn test_mm_subs_epi16_saturate_negative() {
3707        let a = _mm_set1_epi16(-0x8000);
3708        let b = _mm_set1_epi16(1);
3709        let r = _mm_subs_epi16(a, b);
3710        assert_eq_m128i(r, a);
3711    }
3712
3713    #[simd_test(enable = "sse2")]
3714    const fn test_mm_subs_epu8() {
3715        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3716        let r = _mm_subs_epu8(a, b);
3717        assert_eq_m128i(r, _mm_set1_epi8(3));
3718    }
3719
3720    #[simd_test(enable = "sse2")]
3721    fn test_mm_subs_epu8_saturate() {
3722        let a = _mm_set1_epi8(0);
3723        let b = _mm_set1_epi8(1);
3724        let r = _mm_subs_epu8(a, b);
3725        assert_eq_m128i(r, a);
3726    }
3727
3728    #[simd_test(enable = "sse2")]
3729    const fn test_mm_subs_epu16() {
3730        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3731        let r = _mm_subs_epu16(a, b);
3732        assert_eq_m128i(r, _mm_set1_epi16(3));
3733    }
3734
3735    #[simd_test(enable = "sse2")]
3736    fn test_mm_subs_epu16_saturate() {
3737        let a = _mm_set1_epi16(0);
3738        let b = _mm_set1_epi16(1);
3739        let r = _mm_subs_epu16(a, b);
3740        assert_eq_m128i(r, a);
3741    }
3742
3743    #[simd_test(enable = "sse2")]
3744    const fn test_mm_slli_si128() {
3745        #[rustfmt::skip]
3746        let a = _mm_setr_epi8(
3747            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3748        );
3749        let r = _mm_slli_si128::<1>(a);
3750        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3751        assert_eq_m128i(r, e);
3752
3753        #[rustfmt::skip]
3754        let a = _mm_setr_epi8(
3755            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3756        );
3757        let r = _mm_slli_si128::<15>(a);
3758        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3759        assert_eq_m128i(r, e);
3760
3761        #[rustfmt::skip]
3762        let a = _mm_setr_epi8(
3763            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3764        );
3765        let r = _mm_slli_si128::<16>(a);
3766        assert_eq_m128i(r, _mm_set1_epi8(0));
3767    }
3768
3769    #[simd_test(enable = "sse2")]
3770    const fn test_mm_slli_epi16() {
3771        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3772        let r = _mm_slli_epi16::<4>(a);
3773        assert_eq_m128i(
3774            r,
3775            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3776        );
3777        let r = _mm_slli_epi16::<16>(a);
3778        assert_eq_m128i(r, _mm_set1_epi16(0));
3779    }
3780
3781    #[simd_test(enable = "sse2")]
3782    fn test_mm_sll_epi16() {
3783        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3784        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3785        assert_eq_m128i(
3786            r,
3787            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3788        );
3789        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3790        assert_eq_m128i(r, a);
3791        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3792        assert_eq_m128i(r, _mm_set1_epi16(0));
3793        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3794        assert_eq_m128i(r, _mm_set1_epi16(0));
3795    }
3796
3797    #[simd_test(enable = "sse2")]
3798    const fn test_mm_slli_epi32() {
3799        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3800        let r = _mm_slli_epi32::<4>(a);
3801        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3802        let r = _mm_slli_epi32::<32>(a);
3803        assert_eq_m128i(r, _mm_set1_epi32(0));
3804    }
3805
3806    #[simd_test(enable = "sse2")]
3807    fn test_mm_sll_epi32() {
3808        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3809        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3810        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3811        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3812        assert_eq_m128i(r, a);
3813        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3814        assert_eq_m128i(r, _mm_set1_epi32(0));
3815        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3816        assert_eq_m128i(r, _mm_set1_epi32(0));
3817    }
3818
3819    #[simd_test(enable = "sse2")]
3820    const fn test_mm_slli_epi64() {
3821        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3822        let r = _mm_slli_epi64::<4>(a);
3823        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3824        let r = _mm_slli_epi64::<64>(a);
3825        assert_eq_m128i(r, _mm_set1_epi64x(0));
3826    }
3827
3828    #[simd_test(enable = "sse2")]
3829    fn test_mm_sll_epi64() {
3830        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3831        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3832        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3833        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3834        assert_eq_m128i(r, a);
3835        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3836        assert_eq_m128i(r, _mm_set1_epi64x(0));
3837        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3838        assert_eq_m128i(r, _mm_set1_epi64x(0));
3839    }
3840
3841    #[simd_test(enable = "sse2")]
3842    const fn test_mm_srai_epi16() {
3843        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3844        let r = _mm_srai_epi16::<4>(a);
3845        assert_eq_m128i(
3846            r,
3847            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3848        );
3849        let r = _mm_srai_epi16::<16>(a);
3850        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3851    }
3852
3853    #[simd_test(enable = "sse2")]
3854    fn test_mm_sra_epi16() {
3855        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3856        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3857        assert_eq_m128i(
3858            r,
3859            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3860        );
3861        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3862        assert_eq_m128i(r, a);
3863        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3864        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3865        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3866        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3867    }
3868
3869    #[simd_test(enable = "sse2")]
3870    const fn test_mm_srai_epi32() {
3871        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3872        let r = _mm_srai_epi32::<4>(a);
3873        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3874        let r = _mm_srai_epi32::<32>(a);
3875        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3876    }
3877
3878    #[simd_test(enable = "sse2")]
3879    fn test_mm_sra_epi32() {
3880        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3881        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3882        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3883        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3884        assert_eq_m128i(r, a);
3885        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3886        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3887        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3888        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3889    }
3890
3891    #[simd_test(enable = "sse2")]
3892    const fn test_mm_srli_si128() {
3893        #[rustfmt::skip]
3894        let a = _mm_setr_epi8(
3895            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3896        );
3897        let r = _mm_srli_si128::<1>(a);
3898        #[rustfmt::skip]
3899        let e = _mm_setr_epi8(
3900            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3901        );
3902        assert_eq_m128i(r, e);
3903
3904        #[rustfmt::skip]
3905        let a = _mm_setr_epi8(
3906            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3907        );
3908        let r = _mm_srli_si128::<15>(a);
3909        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3910        assert_eq_m128i(r, e);
3911
3912        #[rustfmt::skip]
3913        let a = _mm_setr_epi8(
3914            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3915        );
3916        let r = _mm_srli_si128::<16>(a);
3917        assert_eq_m128i(r, _mm_set1_epi8(0));
3918    }
3919
3920    #[simd_test(enable = "sse2")]
3921    const fn test_mm_srli_epi16() {
3922        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3923        let r = _mm_srli_epi16::<4>(a);
3924        assert_eq_m128i(
3925            r,
3926            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3927        );
3928        let r = _mm_srli_epi16::<16>(a);
3929        assert_eq_m128i(r, _mm_set1_epi16(0));
3930    }
3931
3932    #[simd_test(enable = "sse2")]
3933    fn test_mm_srl_epi16() {
3934        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3935        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3936        assert_eq_m128i(
3937            r,
3938            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3939        );
3940        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3941        assert_eq_m128i(r, a);
3942        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3943        assert_eq_m128i(r, _mm_set1_epi16(0));
3944        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3945        assert_eq_m128i(r, _mm_set1_epi16(0));
3946    }
3947
3948    #[simd_test(enable = "sse2")]
3949    const fn test_mm_srli_epi32() {
3950        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3951        let r = _mm_srli_epi32::<4>(a);
3952        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3953        let r = _mm_srli_epi32::<32>(a);
3954        assert_eq_m128i(r, _mm_set1_epi32(0));
3955    }
3956
3957    #[simd_test(enable = "sse2")]
3958    fn test_mm_srl_epi32() {
3959        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3960        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3961        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3962        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3963        assert_eq_m128i(r, a);
3964        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3965        assert_eq_m128i(r, _mm_set1_epi32(0));
3966        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3967        assert_eq_m128i(r, _mm_set1_epi32(0));
3968    }
3969
3970    #[simd_test(enable = "sse2")]
3971    const fn test_mm_srli_epi64() {
3972        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3973        let r = _mm_srli_epi64::<4>(a);
3974        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3975        let r = _mm_srli_epi64::<64>(a);
3976        assert_eq_m128i(r, _mm_set1_epi64x(0));
3977    }
3978
3979    #[simd_test(enable = "sse2")]
3980    fn test_mm_srl_epi64() {
3981        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3982        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3983        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3984        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3985        assert_eq_m128i(r, a);
3986        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3987        assert_eq_m128i(r, _mm_set1_epi64x(0));
3988        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3989        assert_eq_m128i(r, _mm_set1_epi64x(0));
3990    }
3991
3992    #[simd_test(enable = "sse2")]
3993    const fn test_mm_and_si128() {
3994        let a = _mm_set1_epi8(5);
3995        let b = _mm_set1_epi8(3);
3996        let r = _mm_and_si128(a, b);
3997        assert_eq_m128i(r, _mm_set1_epi8(1));
3998    }
3999
4000    #[simd_test(enable = "sse2")]
4001    const fn test_mm_andnot_si128() {
4002        let a = _mm_set1_epi8(5);
4003        let b = _mm_set1_epi8(3);
4004        let r = _mm_andnot_si128(a, b);
4005        assert_eq_m128i(r, _mm_set1_epi8(2));
4006    }
4007
4008    #[simd_test(enable = "sse2")]
4009    const fn test_mm_or_si128() {
4010        let a = _mm_set1_epi8(5);
4011        let b = _mm_set1_epi8(3);
4012        let r = _mm_or_si128(a, b);
4013        assert_eq_m128i(r, _mm_set1_epi8(7));
4014    }
4015
4016    #[simd_test(enable = "sse2")]
4017    const fn test_mm_xor_si128() {
4018        let a = _mm_set1_epi8(5);
4019        let b = _mm_set1_epi8(3);
4020        let r = _mm_xor_si128(a, b);
4021        assert_eq_m128i(r, _mm_set1_epi8(6));
4022    }
4023
4024    #[simd_test(enable = "sse2")]
4025    const fn test_mm_cmpeq_epi8() {
4026        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4027        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
4028        let r = _mm_cmpeq_epi8(a, b);
4029        #[rustfmt::skip]
4030        assert_eq_m128i(
4031            r,
4032            _mm_setr_epi8(
4033                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4034            )
4035        );
4036    }
4037
4038    #[simd_test(enable = "sse2")]
4039    const fn test_mm_cmpeq_epi16() {
4040        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4041        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
4042        let r = _mm_cmpeq_epi16(a, b);
4043        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
4044    }
4045
4046    #[simd_test(enable = "sse2")]
4047    const fn test_mm_cmpeq_epi32() {
4048        let a = _mm_setr_epi32(0, 1, 2, 3);
4049        let b = _mm_setr_epi32(3, 2, 2, 0);
4050        let r = _mm_cmpeq_epi32(a, b);
4051        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
4052    }
4053
4054    #[simd_test(enable = "sse2")]
4055    const fn test_mm_cmpgt_epi8() {
4056        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4057        let b = _mm_set1_epi8(0);
4058        let r = _mm_cmpgt_epi8(a, b);
4059        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4060        assert_eq_m128i(r, e);
4061    }
4062
4063    #[simd_test(enable = "sse2")]
4064    const fn test_mm_cmpgt_epi16() {
4065        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
4066        let b = _mm_set1_epi16(0);
4067        let r = _mm_cmpgt_epi16(a, b);
4068        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
4069        assert_eq_m128i(r, e);
4070    }
4071
4072    #[simd_test(enable = "sse2")]
4073    const fn test_mm_cmpgt_epi32() {
4074        let a = _mm_set_epi32(5, 0, 0, 0);
4075        let b = _mm_set1_epi32(0);
4076        let r = _mm_cmpgt_epi32(a, b);
4077        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
4078    }
4079
4080    #[simd_test(enable = "sse2")]
4081    const fn test_mm_cmplt_epi8() {
4082        let a = _mm_set1_epi8(0);
4083        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4084        let r = _mm_cmplt_epi8(a, b);
4085        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4086        assert_eq_m128i(r, e);
4087    }
4088
4089    #[simd_test(enable = "sse2")]
4090    const fn test_mm_cmplt_epi16() {
4091        let a = _mm_set1_epi16(0);
4092        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
4093        let r = _mm_cmplt_epi16(a, b);
4094        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
4095        assert_eq_m128i(r, e);
4096    }
4097
4098    #[simd_test(enable = "sse2")]
4099    const fn test_mm_cmplt_epi32() {
4100        let a = _mm_set1_epi32(0);
4101        let b = _mm_set_epi32(5, 0, 0, 0);
4102        let r = _mm_cmplt_epi32(a, b);
4103        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
4104    }
4105
4106    #[simd_test(enable = "sse2")]
4107    const fn test_mm_cvtepi32_pd() {
4108        let a = _mm_set_epi32(35, 25, 15, 5);
4109        let r = _mm_cvtepi32_pd(a);
4110        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
4111    }
4112
4113    #[simd_test(enable = "sse2")]
4114    const fn test_mm_cvtsi32_sd() {
4115        let a = _mm_set1_pd(3.5);
4116        let r = _mm_cvtsi32_sd(a, 5);
4117        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
4118    }
4119
4120    #[simd_test(enable = "sse2")]
4121    const fn test_mm_cvtepi32_ps() {
4122        let a = _mm_setr_epi32(1, 2, 3, 4);
4123        let r = _mm_cvtepi32_ps(a);
4124        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
4125    }
4126
4127    #[simd_test(enable = "sse2")]
4128    fn test_mm_cvtps_epi32() {
4129        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
4130        let r = _mm_cvtps_epi32(a);
4131        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
4132    }
4133
4134    #[simd_test(enable = "sse2")]
4135    const fn test_mm_cvtsi32_si128() {
4136        let r = _mm_cvtsi32_si128(5);
4137        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
4138    }
4139
4140    #[simd_test(enable = "sse2")]
4141    const fn test_mm_cvtsi128_si32() {
4142        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
4143        assert_eq!(r, 5);
4144    }
4145
4146    #[simd_test(enable = "sse2")]
4147    const fn test_mm_set_epi64x() {
4148        let r = _mm_set_epi64x(0, 1);
4149        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
4150    }
4151
4152    #[simd_test(enable = "sse2")]
4153    const fn test_mm_set_epi32() {
4154        let r = _mm_set_epi32(0, 1, 2, 3);
4155        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
4156    }
4157
4158    #[simd_test(enable = "sse2")]
4159    const fn test_mm_set_epi16() {
4160        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4161        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
4162    }
4163
4164    #[simd_test(enable = "sse2")]
4165    const fn test_mm_set_epi8() {
4166        #[rustfmt::skip]
4167        let r = _mm_set_epi8(
4168            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4169        );
4170        #[rustfmt::skip]
4171        let e = _mm_setr_epi8(
4172            15, 14, 13, 12, 11, 10, 9, 8,
4173            7, 6, 5, 4, 3, 2, 1, 0,
4174        );
4175        assert_eq_m128i(r, e);
4176    }
4177
4178    #[simd_test(enable = "sse2")]
4179    const fn test_mm_set1_epi64x() {
4180        let r = _mm_set1_epi64x(1);
4181        assert_eq_m128i(r, _mm_set1_epi64x(1));
4182    }
4183
4184    #[simd_test(enable = "sse2")]
4185    const fn test_mm_set1_epi32() {
4186        let r = _mm_set1_epi32(1);
4187        assert_eq_m128i(r, _mm_set1_epi32(1));
4188    }
4189
4190    #[simd_test(enable = "sse2")]
4191    const fn test_mm_set1_epi16() {
4192        let r = _mm_set1_epi16(1);
4193        assert_eq_m128i(r, _mm_set1_epi16(1));
4194    }
4195
4196    #[simd_test(enable = "sse2")]
4197    const fn test_mm_set1_epi8() {
4198        let r = _mm_set1_epi8(1);
4199        assert_eq_m128i(r, _mm_set1_epi8(1));
4200    }
4201
4202    #[simd_test(enable = "sse2")]
4203    const fn test_mm_setr_epi32() {
4204        let r = _mm_setr_epi32(0, 1, 2, 3);
4205        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
4206    }
4207
4208    #[simd_test(enable = "sse2")]
4209    const fn test_mm_setr_epi16() {
4210        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4211        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4212    }
4213
4214    #[simd_test(enable = "sse2")]
4215    const fn test_mm_setr_epi8() {
4216        #[rustfmt::skip]
4217        let r = _mm_setr_epi8(
4218            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4219        );
4220        #[rustfmt::skip]
4221        let e = _mm_setr_epi8(
4222            0, 1, 2, 3, 4, 5, 6, 7,
4223            8, 9, 10, 11, 12, 13, 14, 15,
4224        );
4225        assert_eq_m128i(r, e);
4226    }
4227
4228    #[simd_test(enable = "sse2")]
4229    const fn test_mm_setzero_si128() {
4230        let r = _mm_setzero_si128();
4231        assert_eq_m128i(r, _mm_set1_epi64x(0));
4232    }
4233
4234    #[simd_test(enable = "sse2")]
4235    const fn test_mm_loadl_epi64() {
4236        let a = _mm_setr_epi64x(6, 5);
4237        let r = unsafe { _mm_loadl_epi64(ptr::addr_of!(a)) };
4238        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4239    }
4240
4241    #[simd_test(enable = "sse2")]
4242    const fn test_mm_load_si128() {
4243        let a = _mm_set_epi64x(5, 6);
4244        let r = unsafe { _mm_load_si128(ptr::addr_of!(a) as *const _) };
4245        assert_eq_m128i(a, r);
4246    }
4247
4248    #[simd_test(enable = "sse2")]
4249    const fn test_mm_loadu_si128() {
4250        let a = _mm_set_epi64x(5, 6);
4251        let r = unsafe { _mm_loadu_si128(ptr::addr_of!(a) as *const _) };
4252        assert_eq_m128i(a, r);
4253    }
4254
4255    #[simd_test(enable = "sse2")]
4256    // Miri cannot support this until it is clear how it fits in the Rust memory model
4257    // (non-temporal store)
4258    #[cfg_attr(miri, ignore)]
4259    fn test_mm_maskmoveu_si128() {
4260        let a = _mm_set1_epi8(9);
4261        #[rustfmt::skip]
4262        let mask = _mm_set_epi8(
4263            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4264            0, 0, 0, 0, 0, 0, 0, 0,
4265        );
4266        let mut r = _mm_set1_epi8(0);
4267        unsafe {
4268            _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4269        }
4270        _mm_sfence();
4271        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4272        assert_eq_m128i(r, e);
4273    }
4274
4275    #[simd_test(enable = "sse2")]
4276    const fn test_mm_store_si128() {
4277        let a = _mm_set1_epi8(9);
4278        let mut r = _mm_set1_epi8(0);
4279        unsafe {
4280            _mm_store_si128(&mut r, a);
4281        }
4282        assert_eq_m128i(r, a);
4283    }
4284
4285    #[simd_test(enable = "sse2")]
4286    const fn test_mm_storeu_si128() {
4287        let a = _mm_set1_epi8(9);
4288        let mut r = _mm_set1_epi8(0);
4289        unsafe {
4290            _mm_storeu_si128(&mut r, a);
4291        }
4292        assert_eq_m128i(r, a);
4293    }
4294
4295    #[simd_test(enable = "sse2")]
4296    const fn test_mm_storel_epi64() {
4297        let a = _mm_setr_epi64x(2, 9);
4298        let mut r = _mm_set1_epi8(0);
4299        unsafe {
4300            _mm_storel_epi64(&mut r, a);
4301        }
4302        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4303    }
4304
4305    #[simd_test(enable = "sse2")]
4306    // Miri cannot support this until it is clear how it fits in the Rust memory model
4307    // (non-temporal store)
4308    #[cfg_attr(miri, ignore)]
4309    fn test_mm_stream_si128() {
4310        let a = _mm_setr_epi32(1, 2, 3, 4);
4311        let mut r = _mm_undefined_si128();
4312        unsafe {
4313            _mm_stream_si128(ptr::addr_of_mut!(r), a);
4314        }
4315        _mm_sfence();
4316        assert_eq_m128i(r, a);
4317    }
4318
4319    #[simd_test(enable = "sse2")]
4320    // Miri cannot support this until it is clear how it fits in the Rust memory model
4321    // (non-temporal store)
4322    #[cfg_attr(miri, ignore)]
4323    fn test_mm_stream_si32() {
4324        let a: i32 = 7;
4325        let mut mem = boxed::Box::<i32>::new(-1);
4326        unsafe {
4327            _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4328        }
4329        _mm_sfence();
4330        assert_eq!(a, *mem);
4331    }
4332
4333    #[simd_test(enable = "sse2")]
4334    const fn test_mm_move_epi64() {
4335        let a = _mm_setr_epi64x(5, 6);
4336        let r = _mm_move_epi64(a);
4337        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4338    }
4339
4340    #[simd_test(enable = "sse2")]
4341    const fn test_mm_packs_epi16() {
4342        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4343        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4344        let r = _mm_packs_epi16(a, b);
4345        #[rustfmt::skip]
4346        assert_eq_m128i(
4347            r,
4348            _mm_setr_epi8(
4349                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4350            )
4351        );
4352    }
4353
4354    #[simd_test(enable = "sse2")]
4355    const fn test_mm_packs_epi32() {
4356        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4357        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4358        let r = _mm_packs_epi32(a, b);
4359        assert_eq_m128i(
4360            r,
4361            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4362        );
4363    }
4364
4365    #[simd_test(enable = "sse2")]
4366    const fn test_mm_packus_epi16() {
4367        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4368        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4369        let r = _mm_packus_epi16(a, b);
4370        assert_eq_m128i(
4371            r,
4372            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4373        );
4374    }
4375
4376    #[simd_test(enable = "sse2")]
4377    const fn test_mm_extract_epi16() {
4378        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4379        let r1 = _mm_extract_epi16::<0>(a);
4380        let r2 = _mm_extract_epi16::<3>(a);
4381        assert_eq!(r1, 0xFFFF);
4382        assert_eq!(r2, 3);
4383    }
4384
4385    #[simd_test(enable = "sse2")]
4386    const fn test_mm_insert_epi16() {
4387        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4388        let r = _mm_insert_epi16::<0>(a, 9);
4389        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4390        assert_eq_m128i(r, e);
4391    }
4392
4393    #[simd_test(enable = "sse2")]
4394    const fn test_mm_movemask_epi8() {
4395        #[rustfmt::skip]
4396        let a = _mm_setr_epi8(
4397            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4398            0b0101, 0b1111_0000u8 as i8, 0, 0,
4399            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4400            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4401        );
4402        let r = _mm_movemask_epi8(a);
4403        assert_eq!(r, 0b10100110_00100101);
4404    }
4405
4406    #[simd_test(enable = "sse2")]
4407    const fn test_mm_shuffle_epi32() {
4408        let a = _mm_setr_epi32(5, 10, 15, 20);
4409        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
4410        let e = _mm_setr_epi32(20, 10, 10, 5);
4411        assert_eq_m128i(r, e);
4412    }
4413
4414    #[simd_test(enable = "sse2")]
4415    const fn test_mm_shufflehi_epi16() {
4416        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
4417        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
4418        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
4419        assert_eq_m128i(r, e);
4420    }
4421
4422    #[simd_test(enable = "sse2")]
4423    const fn test_mm_shufflelo_epi16() {
4424        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
4425        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
4426        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
4427        assert_eq_m128i(r, e);
4428    }
4429
4430    #[simd_test(enable = "sse2")]
4431    const fn test_mm_unpackhi_epi8() {
4432        #[rustfmt::skip]
4433        let a = _mm_setr_epi8(
4434            0, 1, 2, 3, 4, 5, 6, 7,
4435            8, 9, 10, 11, 12, 13, 14, 15,
4436        );
4437        #[rustfmt::skip]
4438        let b = _mm_setr_epi8(
4439            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4440        );
4441        let r = _mm_unpackhi_epi8(a, b);
4442        #[rustfmt::skip]
4443        let e = _mm_setr_epi8(
4444            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
4445        );
4446        assert_eq_m128i(r, e);
4447    }
4448
4449    #[simd_test(enable = "sse2")]
4450    const fn test_mm_unpackhi_epi16() {
4451        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4452        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4453        let r = _mm_unpackhi_epi16(a, b);
4454        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
4455        assert_eq_m128i(r, e);
4456    }
4457
4458    #[simd_test(enable = "sse2")]
4459    const fn test_mm_unpackhi_epi32() {
4460        let a = _mm_setr_epi32(0, 1, 2, 3);
4461        let b = _mm_setr_epi32(4, 5, 6, 7);
4462        let r = _mm_unpackhi_epi32(a, b);
4463        let e = _mm_setr_epi32(2, 6, 3, 7);
4464        assert_eq_m128i(r, e);
4465    }
4466
4467    #[simd_test(enable = "sse2")]
4468    const fn test_mm_unpackhi_epi64() {
4469        let a = _mm_setr_epi64x(0, 1);
4470        let b = _mm_setr_epi64x(2, 3);
4471        let r = _mm_unpackhi_epi64(a, b);
4472        let e = _mm_setr_epi64x(1, 3);
4473        assert_eq_m128i(r, e);
4474    }
4475
4476    #[simd_test(enable = "sse2")]
4477    const fn test_mm_unpacklo_epi8() {
4478        #[rustfmt::skip]
4479        let a = _mm_setr_epi8(
4480            0, 1, 2, 3, 4, 5, 6, 7,
4481            8, 9, 10, 11, 12, 13, 14, 15,
4482        );
4483        #[rustfmt::skip]
4484        let b = _mm_setr_epi8(
4485            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4486        );
4487        let r = _mm_unpacklo_epi8(a, b);
4488        #[rustfmt::skip]
4489        let e = _mm_setr_epi8(
4490            0, 16, 1, 17, 2, 18, 3, 19,
4491            4, 20, 5, 21, 6, 22, 7, 23,
4492        );
4493        assert_eq_m128i(r, e);
4494    }
4495
4496    #[simd_test(enable = "sse2")]
4497    const fn test_mm_unpacklo_epi16() {
4498        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4499        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4500        let r = _mm_unpacklo_epi16(a, b);
4501        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
4502        assert_eq_m128i(r, e);
4503    }
4504
4505    #[simd_test(enable = "sse2")]
4506    const fn test_mm_unpacklo_epi32() {
4507        let a = _mm_setr_epi32(0, 1, 2, 3);
4508        let b = _mm_setr_epi32(4, 5, 6, 7);
4509        let r = _mm_unpacklo_epi32(a, b);
4510        let e = _mm_setr_epi32(0, 4, 1, 5);
4511        assert_eq_m128i(r, e);
4512    }
4513
4514    #[simd_test(enable = "sse2")]
4515    const fn test_mm_unpacklo_epi64() {
4516        let a = _mm_setr_epi64x(0, 1);
4517        let b = _mm_setr_epi64x(2, 3);
4518        let r = _mm_unpacklo_epi64(a, b);
4519        let e = _mm_setr_epi64x(0, 2);
4520        assert_eq_m128i(r, e);
4521    }
4522
4523    #[simd_test(enable = "sse2")]
4524    const fn test_mm_add_sd() {
4525        let a = _mm_setr_pd(1.0, 2.0);
4526        let b = _mm_setr_pd(5.0, 10.0);
4527        let r = _mm_add_sd(a, b);
4528        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
4529    }
4530
4531    #[simd_test(enable = "sse2")]
4532    const fn test_mm_add_pd() {
4533        let a = _mm_setr_pd(1.0, 2.0);
4534        let b = _mm_setr_pd(5.0, 10.0);
4535        let r = _mm_add_pd(a, b);
4536        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
4537    }
4538
4539    #[simd_test(enable = "sse2")]
4540    const fn test_mm_div_sd() {
4541        let a = _mm_setr_pd(1.0, 2.0);
4542        let b = _mm_setr_pd(5.0, 10.0);
4543        let r = _mm_div_sd(a, b);
4544        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
4545    }
4546
4547    #[simd_test(enable = "sse2")]
4548    const fn test_mm_div_pd() {
4549        let a = _mm_setr_pd(1.0, 2.0);
4550        let b = _mm_setr_pd(5.0, 10.0);
4551        let r = _mm_div_pd(a, b);
4552        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
4553    }
4554
4555    #[simd_test(enable = "sse2")]
4556    fn test_mm_max_sd() {
4557        let a = _mm_setr_pd(1.0, 2.0);
4558        let b = _mm_setr_pd(5.0, 10.0);
4559        let r = _mm_max_sd(a, b);
4560        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4561    }
4562
4563    #[simd_test(enable = "sse2")]
4564    fn test_mm_max_pd() {
4565        let a = _mm_setr_pd(1.0, 2.0);
4566        let b = _mm_setr_pd(5.0, 10.0);
4567        let r = _mm_max_pd(a, b);
4568        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
4569
4570        // Check SSE(2)-specific semantics for -0.0 handling.
4571        let a = _mm_setr_pd(-0.0, 0.0);
4572        let b = _mm_setr_pd(0.0, 0.0);
4573        // Cast to __m128i to compare exact bit patterns
4574        let r1 = _mm_castpd_si128(_mm_max_pd(a, b));
4575        let r2 = _mm_castpd_si128(_mm_max_pd(b, a));
4576        let a = _mm_castpd_si128(a);
4577        let b = _mm_castpd_si128(b);
4578        assert_eq_m128i(r1, b);
4579        assert_eq_m128i(r2, a);
4580        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
4581    }
4582
4583    #[simd_test(enable = "sse2")]
4584    fn test_mm_min_sd() {
4585        let a = _mm_setr_pd(1.0, 2.0);
4586        let b = _mm_setr_pd(5.0, 10.0);
4587        let r = _mm_min_sd(a, b);
4588        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4589    }
4590
4591    #[simd_test(enable = "sse2")]
4592    fn test_mm_min_pd() {
4593        let a = _mm_setr_pd(1.0, 2.0);
4594        let b = _mm_setr_pd(5.0, 10.0);
4595        let r = _mm_min_pd(a, b);
4596        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4597
4598        // Check SSE(2)-specific semantics for -0.0 handling.
4599        let a = _mm_setr_pd(-0.0, 0.0);
4600        let b = _mm_setr_pd(0.0, 0.0);
4601        // Cast to __m128i to compare exact bit patterns
4602        let r1 = _mm_castpd_si128(_mm_min_pd(a, b));
4603        let r2 = _mm_castpd_si128(_mm_min_pd(b, a));
4604        let a = _mm_castpd_si128(a);
4605        let b = _mm_castpd_si128(b);
4606        assert_eq_m128i(r1, b);
4607        assert_eq_m128i(r2, a);
4608        assert_ne!(a.as_u8x16(), b.as_u8x16()); // sanity check that -0.0 is actually present
4609    }
4610
4611    #[simd_test(enable = "sse2")]
4612    const fn test_mm_mul_sd() {
4613        let a = _mm_setr_pd(1.0, 2.0);
4614        let b = _mm_setr_pd(5.0, 10.0);
4615        let r = _mm_mul_sd(a, b);
4616        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4617    }
4618
4619    #[simd_test(enable = "sse2")]
4620    const fn test_mm_mul_pd() {
4621        let a = _mm_setr_pd(1.0, 2.0);
4622        let b = _mm_setr_pd(5.0, 10.0);
4623        let r = _mm_mul_pd(a, b);
4624        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
4625    }
4626
4627    #[simd_test(enable = "sse2")]
4628    fn test_mm_sqrt_sd() {
4629        let a = _mm_setr_pd(1.0, 2.0);
4630        let b = _mm_setr_pd(5.0, 10.0);
4631        let r = _mm_sqrt_sd(a, b);
4632        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
4633    }
4634
4635    #[simd_test(enable = "sse2")]
4636    fn test_mm_sqrt_pd() {
4637        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
4638        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
4639    }
4640
4641    #[simd_test(enable = "sse2")]
4642    const fn test_mm_sub_sd() {
4643        let a = _mm_setr_pd(1.0, 2.0);
4644        let b = _mm_setr_pd(5.0, 10.0);
4645        let r = _mm_sub_sd(a, b);
4646        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
4647    }
4648
4649    #[simd_test(enable = "sse2")]
4650    const fn test_mm_sub_pd() {
4651        let a = _mm_setr_pd(1.0, 2.0);
4652        let b = _mm_setr_pd(5.0, 10.0);
4653        let r = _mm_sub_pd(a, b);
4654        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
4655    }
4656
4657    #[simd_test(enable = "sse2")]
4658    const fn test_mm_and_pd() {
4659        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
4660        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
4661        let r = _mm_and_pd(a, b);
4662        let e = f64x2::from_bits(u64x2::splat(1)).as_m128d();
4663        assert_eq_m128d(r, e);
4664    }
4665
4666    #[simd_test(enable = "sse2")]
4667    const fn test_mm_andnot_pd() {
4668        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
4669        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
4670        let r = _mm_andnot_pd(a, b);
4671        let e = f64x2::from_bits(u64x2::splat(2)).as_m128d();
4672        assert_eq_m128d(r, e);
4673    }
4674
4675    #[simd_test(enable = "sse2")]
4676    const fn test_mm_or_pd() {
4677        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
4678        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
4679        let r = _mm_or_pd(a, b);
4680        let e = f64x2::from_bits(u64x2::splat(7)).as_m128d();
4681        assert_eq_m128d(r, e);
4682    }
4683
4684    #[simd_test(enable = "sse2")]
4685    const fn test_mm_xor_pd() {
4686        let a = f64x2::from_bits(u64x2::splat(5)).as_m128d();
4687        let b = f64x2::from_bits(u64x2::splat(3)).as_m128d();
4688        let r = _mm_xor_pd(a, b);
4689        let e = f64x2::from_bits(u64x2::splat(6)).as_m128d();
4690        assert_eq_m128d(r, e);
4691    }
4692
4693    #[simd_test(enable = "sse2")]
4694    fn test_mm_cmpeq_sd() {
4695        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4696        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4697        let r = _mm_castpd_si128(_mm_cmpeq_sd(a, b));
4698        assert_eq_m128i(r, e);
4699    }
4700
4701    #[simd_test(enable = "sse2")]
4702    fn test_mm_cmplt_sd() {
4703        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4704        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4705        let r = _mm_castpd_si128(_mm_cmplt_sd(a, b));
4706        assert_eq_m128i(r, e);
4707    }
4708
4709    #[simd_test(enable = "sse2")]
4710    fn test_mm_cmple_sd() {
4711        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4712        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4713        let r = _mm_castpd_si128(_mm_cmple_sd(a, b));
4714        assert_eq_m128i(r, e);
4715    }
4716
4717    #[simd_test(enable = "sse2")]
4718    fn test_mm_cmpgt_sd() {
4719        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4720        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4721        let r = _mm_castpd_si128(_mm_cmpgt_sd(a, b));
4722        assert_eq_m128i(r, e);
4723    }
4724
4725    #[simd_test(enable = "sse2")]
4726    fn test_mm_cmpge_sd() {
4727        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4728        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4729        let r = _mm_castpd_si128(_mm_cmpge_sd(a, b));
4730        assert_eq_m128i(r, e);
4731    }
4732
4733    #[simd_test(enable = "sse2")]
4734    fn test_mm_cmpord_sd() {
4735        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4736        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4737        let r = _mm_castpd_si128(_mm_cmpord_sd(a, b));
4738        assert_eq_m128i(r, e);
4739    }
4740
4741    #[simd_test(enable = "sse2")]
4742    fn test_mm_cmpunord_sd() {
4743        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4744        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4745        let r = _mm_castpd_si128(_mm_cmpunord_sd(a, b));
4746        assert_eq_m128i(r, e);
4747    }
4748
4749    #[simd_test(enable = "sse2")]
4750    fn test_mm_cmpneq_sd() {
4751        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4752        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4753        let r = _mm_castpd_si128(_mm_cmpneq_sd(a, b));
4754        assert_eq_m128i(r, e);
4755    }
4756
4757    #[simd_test(enable = "sse2")]
4758    fn test_mm_cmpnlt_sd() {
4759        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4760        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4761        let r = _mm_castpd_si128(_mm_cmpnlt_sd(a, b));
4762        assert_eq_m128i(r, e);
4763    }
4764
4765    #[simd_test(enable = "sse2")]
4766    fn test_mm_cmpnle_sd() {
4767        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4768        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4769        let r = _mm_castpd_si128(_mm_cmpnle_sd(a, b));
4770        assert_eq_m128i(r, e);
4771    }
4772
4773    #[simd_test(enable = "sse2")]
4774    fn test_mm_cmpngt_sd() {
4775        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4776        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4777        let r = _mm_castpd_si128(_mm_cmpngt_sd(a, b));
4778        assert_eq_m128i(r, e);
4779    }
4780
4781    #[simd_test(enable = "sse2")]
4782    fn test_mm_cmpnge_sd() {
4783        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4784        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4785        let r = _mm_castpd_si128(_mm_cmpnge_sd(a, b));
4786        assert_eq_m128i(r, e);
4787    }
4788
4789    #[simd_test(enable = "sse2")]
4790    fn test_mm_cmpeq_pd() {
4791        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4792        let e = _mm_setr_epi64x(!0, 0);
4793        let r = _mm_castpd_si128(_mm_cmpeq_pd(a, b));
4794        assert_eq_m128i(r, e);
4795    }
4796
4797    #[simd_test(enable = "sse2")]
4798    fn test_mm_cmplt_pd() {
4799        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4800        let e = _mm_setr_epi64x(0, !0);
4801        let r = _mm_castpd_si128(_mm_cmplt_pd(a, b));
4802        assert_eq_m128i(r, e);
4803    }
4804
4805    #[simd_test(enable = "sse2")]
4806    fn test_mm_cmple_pd() {
4807        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4808        let e = _mm_setr_epi64x(!0, !0);
4809        let r = _mm_castpd_si128(_mm_cmple_pd(a, b));
4810        assert_eq_m128i(r, e);
4811    }
4812
4813    #[simd_test(enable = "sse2")]
4814    fn test_mm_cmpgt_pd() {
4815        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4816        let e = _mm_setr_epi64x(0, 0);
4817        let r = _mm_castpd_si128(_mm_cmpgt_pd(a, b));
4818        assert_eq_m128i(r, e);
4819    }
4820
4821    #[simd_test(enable = "sse2")]
4822    fn test_mm_cmpge_pd() {
4823        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4824        let e = _mm_setr_epi64x(!0, 0);
4825        let r = _mm_castpd_si128(_mm_cmpge_pd(a, b));
4826        assert_eq_m128i(r, e);
4827    }
4828
4829    #[simd_test(enable = "sse2")]
4830    fn test_mm_cmpord_pd() {
4831        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4832        let e = _mm_setr_epi64x(0, !0);
4833        let r = _mm_castpd_si128(_mm_cmpord_pd(a, b));
4834        assert_eq_m128i(r, e);
4835    }
4836
4837    #[simd_test(enable = "sse2")]
4838    fn test_mm_cmpunord_pd() {
4839        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4840        let e = _mm_setr_epi64x(!0, 0);
4841        let r = _mm_castpd_si128(_mm_cmpunord_pd(a, b));
4842        assert_eq_m128i(r, e);
4843    }
4844
4845    #[simd_test(enable = "sse2")]
4846    fn test_mm_cmpneq_pd() {
4847        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4848        let e = _mm_setr_epi64x(!0, !0);
4849        let r = _mm_castpd_si128(_mm_cmpneq_pd(a, b));
4850        assert_eq_m128i(r, e);
4851    }
4852
4853    #[simd_test(enable = "sse2")]
4854    fn test_mm_cmpnlt_pd() {
4855        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4856        let e = _mm_setr_epi64x(0, 0);
4857        let r = _mm_castpd_si128(_mm_cmpnlt_pd(a, b));
4858        assert_eq_m128i(r, e);
4859    }
4860
4861    #[simd_test(enable = "sse2")]
4862    fn test_mm_cmpnle_pd() {
4863        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4864        let e = _mm_setr_epi64x(0, 0);
4865        let r = _mm_castpd_si128(_mm_cmpnle_pd(a, b));
4866        assert_eq_m128i(r, e);
4867    }
4868
4869    #[simd_test(enable = "sse2")]
4870    fn test_mm_cmpngt_pd() {
4871        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4872        let e = _mm_setr_epi64x(0, !0);
4873        let r = _mm_castpd_si128(_mm_cmpngt_pd(a, b));
4874        assert_eq_m128i(r, e);
4875    }
4876
4877    #[simd_test(enable = "sse2")]
4878    fn test_mm_cmpnge_pd() {
4879        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4880        let e = _mm_setr_epi64x(0, !0);
4881        let r = _mm_castpd_si128(_mm_cmpnge_pd(a, b));
4882        assert_eq_m128i(r, e);
4883    }
4884
4885    #[simd_test(enable = "sse2")]
4886    fn test_mm_comieq_sd() {
4887        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4888        assert!(_mm_comieq_sd(a, b) != 0);
4889
4890        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
4891        assert!(_mm_comieq_sd(a, b) == 0);
4892    }
4893
4894    #[simd_test(enable = "sse2")]
4895    fn test_mm_comilt_sd() {
4896        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4897        assert!(_mm_comilt_sd(a, b) == 0);
4898    }
4899
4900    #[simd_test(enable = "sse2")]
4901    fn test_mm_comile_sd() {
4902        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4903        assert!(_mm_comile_sd(a, b) != 0);
4904    }
4905
4906    #[simd_test(enable = "sse2")]
4907    fn test_mm_comigt_sd() {
4908        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4909        assert!(_mm_comigt_sd(a, b) == 0);
4910    }
4911
4912    #[simd_test(enable = "sse2")]
4913    fn test_mm_comige_sd() {
4914        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4915        assert!(_mm_comige_sd(a, b) != 0);
4916    }
4917
4918    #[simd_test(enable = "sse2")]
4919    fn test_mm_comineq_sd() {
4920        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4921        assert!(_mm_comineq_sd(a, b) == 0);
4922    }
4923
4924    #[simd_test(enable = "sse2")]
4925    fn test_mm_ucomieq_sd() {
4926        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4927        assert!(_mm_ucomieq_sd(a, b) != 0);
4928
4929        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
4930        assert!(_mm_ucomieq_sd(a, b) == 0);
4931    }
4932
4933    #[simd_test(enable = "sse2")]
4934    fn test_mm_ucomilt_sd() {
4935        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4936        assert!(_mm_ucomilt_sd(a, b) == 0);
4937    }
4938
4939    #[simd_test(enable = "sse2")]
4940    fn test_mm_ucomile_sd() {
4941        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4942        assert!(_mm_ucomile_sd(a, b) != 0);
4943    }
4944
4945    #[simd_test(enable = "sse2")]
4946    fn test_mm_ucomigt_sd() {
4947        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4948        assert!(_mm_ucomigt_sd(a, b) == 0);
4949    }
4950
4951    #[simd_test(enable = "sse2")]
4952    fn test_mm_ucomige_sd() {
4953        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4954        assert!(_mm_ucomige_sd(a, b) != 0);
4955    }
4956
4957    #[simd_test(enable = "sse2")]
4958    fn test_mm_ucomineq_sd() {
4959        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4960        assert!(_mm_ucomineq_sd(a, b) == 0);
4961    }
4962
4963    #[simd_test(enable = "sse2")]
4964    const fn test_mm_movemask_pd() {
4965        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
4966        assert_eq!(r, 0b01);
4967
4968        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
4969        assert_eq!(r, 0b11);
4970    }
4971
4972    #[repr(align(16))]
4973    struct Memory {
4974        data: [f64; 4],
4975    }
4976
4977    #[simd_test(enable = "sse2")]
4978    const fn test_mm_load_pd() {
4979        let mem = Memory {
4980            data: [1.0f64, 2.0, 3.0, 4.0],
4981        };
4982        let vals = &mem.data;
4983        let d = vals.as_ptr();
4984
4985        let r = unsafe { _mm_load_pd(d) };
4986        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4987    }
4988
4989    #[simd_test(enable = "sse2")]
4990    const fn test_mm_load_sd() {
4991        let a = 1.;
4992        let expected = _mm_setr_pd(a, 0.);
4993        let r = unsafe { _mm_load_sd(&a) };
4994        assert_eq_m128d(r, expected);
4995    }
4996
4997    #[simd_test(enable = "sse2")]
4998    const fn test_mm_loadh_pd() {
4999        let a = _mm_setr_pd(1., 2.);
5000        let b = 3.;
5001        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
5002        let r = unsafe { _mm_loadh_pd(a, &b) };
5003        assert_eq_m128d(r, expected);
5004    }
5005
5006    #[simd_test(enable = "sse2")]
5007    const fn test_mm_loadl_pd() {
5008        let a = _mm_setr_pd(1., 2.);
5009        let b = 3.;
5010        let expected = _mm_setr_pd(3., get_m128d(a, 1));
5011        let r = unsafe { _mm_loadl_pd(a, &b) };
5012        assert_eq_m128d(r, expected);
5013    }
5014
5015    #[simd_test(enable = "sse2")]
5016    // Miri cannot support this until it is clear how it fits in the Rust memory model
5017    // (non-temporal store)
5018    #[cfg_attr(miri, ignore)]
5019    fn test_mm_stream_pd() {
5020        #[repr(align(128))]
5021        struct Memory {
5022            pub data: [f64; 2],
5023        }
5024        let a = _mm_set1_pd(7.0);
5025        let mut mem = Memory { data: [-1.0; 2] };
5026
5027        unsafe {
5028            _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
5029        }
5030        _mm_sfence();
5031        for i in 0..2 {
5032            assert_eq!(mem.data[i], get_m128d(a, i));
5033        }
5034    }
5035
5036    #[simd_test(enable = "sse2")]
5037    const fn test_mm_store_sd() {
5038        let mut dest = 0.;
5039        let a = _mm_setr_pd(1., 2.);
5040        unsafe {
5041            _mm_store_sd(&mut dest, a);
5042        }
5043        assert_eq!(dest, _mm_cvtsd_f64(a));
5044    }
5045
5046    #[simd_test(enable = "sse2")]
5047    const fn test_mm_store_pd() {
5048        let mut mem = Memory { data: [0.0f64; 4] };
5049        let vals = &mut mem.data;
5050        let a = _mm_setr_pd(1.0, 2.0);
5051        let d = vals.as_mut_ptr();
5052
5053        unsafe {
5054            _mm_store_pd(d, *black_box(&a));
5055        }
5056        assert_eq!(vals[0], 1.0);
5057        assert_eq!(vals[1], 2.0);
5058    }
5059
5060    #[simd_test(enable = "sse2")]
5061    const fn test_mm_storeu_pd() {
5062        // guaranteed to be aligned to 16 bytes
5063        let mut mem = Memory { data: [0.0f64; 4] };
5064        let vals = &mut mem.data;
5065        let a = _mm_setr_pd(1.0, 2.0);
5066
5067        // so p is *not* aligned to 16 bytes
5068        unsafe {
5069            let p = vals.as_mut_ptr().offset(1);
5070            _mm_storeu_pd(p, *black_box(&a));
5071        }
5072
5073        assert_eq!(*vals, [0.0, 1.0, 2.0, 0.0]);
5074    }
5075
5076    #[simd_test(enable = "sse2")]
5077    const fn test_mm_storeu_si16() {
5078        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
5079        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
5080        unsafe {
5081            _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
5082        }
5083        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
5084        assert_eq_m128i(r, e);
5085    }
5086
5087    #[simd_test(enable = "sse2")]
5088    const fn test_mm_storeu_si32() {
5089        let a = _mm_setr_epi32(1, 2, 3, 4);
5090        let mut r = _mm_setr_epi32(5, 6, 7, 8);
5091        unsafe {
5092            _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
5093        }
5094        let e = _mm_setr_epi32(1, 6, 7, 8);
5095        assert_eq_m128i(r, e);
5096    }
5097
5098    #[simd_test(enable = "sse2")]
5099    const fn test_mm_storeu_si64() {
5100        let a = _mm_setr_epi64x(1, 2);
5101        let mut r = _mm_setr_epi64x(3, 4);
5102        unsafe {
5103            _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
5104        }
5105        let e = _mm_setr_epi64x(1, 4);
5106        assert_eq_m128i(r, e);
5107    }
5108
5109    #[simd_test(enable = "sse2")]
5110    const fn test_mm_store1_pd() {
5111        let mut mem = Memory { data: [0.0f64; 4] };
5112        let vals = &mut mem.data;
5113        let a = _mm_setr_pd(1.0, 2.0);
5114        let d = vals.as_mut_ptr();
5115
5116        unsafe {
5117            _mm_store1_pd(d, *black_box(&a));
5118        }
5119        assert_eq!(vals[0], 1.0);
5120        assert_eq!(vals[1], 1.0);
5121    }
5122
5123    #[simd_test(enable = "sse2")]
5124    const fn test_mm_store_pd1() {
5125        let mut mem = Memory { data: [0.0f64; 4] };
5126        let vals = &mut mem.data;
5127        let a = _mm_setr_pd(1.0, 2.0);
5128        let d = vals.as_mut_ptr();
5129
5130        unsafe {
5131            _mm_store_pd1(d, *black_box(&a));
5132        }
5133        assert_eq!(vals[0], 1.0);
5134        assert_eq!(vals[1], 1.0);
5135    }
5136
5137    #[simd_test(enable = "sse2")]
5138    const fn test_mm_storer_pd() {
5139        let mut mem = Memory { data: [0.0f64; 4] };
5140        let vals = &mut mem.data;
5141        let a = _mm_setr_pd(1.0, 2.0);
5142        let d = vals.as_mut_ptr();
5143
5144        unsafe {
5145            _mm_storer_pd(d, *black_box(&a));
5146        }
5147        assert_eq!(vals[0], 2.0);
5148        assert_eq!(vals[1], 1.0);
5149    }
5150
5151    #[simd_test(enable = "sse2")]
5152    const fn test_mm_storeh_pd() {
5153        let mut dest = 0.;
5154        let a = _mm_setr_pd(1., 2.);
5155        unsafe {
5156            _mm_storeh_pd(&mut dest, a);
5157        }
5158        assert_eq!(dest, get_m128d(a, 1));
5159    }
5160
5161    #[simd_test(enable = "sse2")]
5162    const fn test_mm_storel_pd() {
5163        let mut dest = 0.;
5164        let a = _mm_setr_pd(1., 2.);
5165        unsafe {
5166            _mm_storel_pd(&mut dest, a);
5167        }
5168        assert_eq!(dest, _mm_cvtsd_f64(a));
5169    }
5170
5171    #[simd_test(enable = "sse2")]
5172    const fn test_mm_loadr_pd() {
5173        let mut mem = Memory {
5174            data: [1.0f64, 2.0, 3.0, 4.0],
5175        };
5176        let vals = &mut mem.data;
5177        let d = vals.as_ptr();
5178
5179        let r = unsafe { _mm_loadr_pd(d) };
5180        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
5181    }
5182
5183    #[simd_test(enable = "sse2")]
5184    const fn test_mm_loadu_pd() {
5185        // guaranteed to be aligned to 16 bytes
5186        let mut mem = Memory {
5187            data: [1.0f64, 2.0, 3.0, 4.0],
5188        };
5189        let vals = &mut mem.data;
5190
5191        // so this will *not* be aligned to 16 bytes
5192        let d = unsafe { vals.as_ptr().offset(1) };
5193
5194        let r = unsafe { _mm_loadu_pd(d) };
5195        let e = _mm_setr_pd(2.0, 3.0);
5196        assert_eq_m128d(r, e);
5197    }
5198
5199    #[simd_test(enable = "sse2")]
5200    const fn test_mm_loadu_si16() {
5201        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
5202        let r = unsafe { _mm_loadu_si16(ptr::addr_of!(a) as *const _) };
5203        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
5204    }
5205
5206    #[simd_test(enable = "sse2")]
5207    const fn test_mm_loadu_si32() {
5208        let a = _mm_setr_epi32(1, 2, 3, 4);
5209        let r = unsafe { _mm_loadu_si32(ptr::addr_of!(a) as *const _) };
5210        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
5211    }
5212
5213    #[simd_test(enable = "sse2")]
5214    const fn test_mm_loadu_si64() {
5215        let a = _mm_setr_epi64x(5, 6);
5216        let r = unsafe { _mm_loadu_si64(ptr::addr_of!(a) as *const _) };
5217        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
5218    }
5219
5220    #[simd_test(enable = "sse2")]
5221    const fn test_mm_cvtpd_ps() {
5222        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
5223        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
5224
5225        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
5226        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
5227
5228        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
5229        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
5230
5231        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
5232        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
5233    }
5234
5235    #[simd_test(enable = "sse2")]
5236    const fn test_mm_cvtps_pd() {
5237        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
5238        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
5239
5240        let r = _mm_cvtps_pd(_mm_setr_ps(
5241            f32::MAX,
5242            f32::INFINITY,
5243            f32::NEG_INFINITY,
5244            f32::MIN,
5245        ));
5246        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
5247    }
5248
5249    #[simd_test(enable = "sse2")]
5250    fn test_mm_cvtpd_epi32() {
5251        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
5252        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
5253
5254        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
5255        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
5256
5257        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
5258        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5259
5260        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
5261        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5262
5263        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
5264        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5265    }
5266
5267    #[simd_test(enable = "sse2")]
5268    fn test_mm_cvtsd_si32() {
5269        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
5270        assert_eq!(r, -2);
5271
5272        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
5273        assert_eq!(r, i32::MIN);
5274
5275        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
5276        assert_eq!(r, i32::MIN);
5277    }
5278
5279    #[simd_test(enable = "sse2")]
5280    fn test_mm_cvtsd_ss() {
5281        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
5282        let b = _mm_setr_pd(2.0, -5.0);
5283
5284        let r = _mm_cvtsd_ss(a, b);
5285
5286        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
5287
5288        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
5289        let b = _mm_setr_pd(f64::INFINITY, -5.0);
5290
5291        let r = _mm_cvtsd_ss(a, b);
5292
5293        assert_eq_m128(
5294            r,
5295            _mm_setr_ps(
5296                f32::INFINITY,
5297                f32::NEG_INFINITY,
5298                f32::MAX,
5299                f32::NEG_INFINITY,
5300            ),
5301        );
5302    }
5303
5304    #[simd_test(enable = "sse2")]
5305    const fn test_mm_cvtsd_f64() {
5306        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
5307        assert_eq!(r, -1.1);
5308    }
5309
5310    #[simd_test(enable = "sse2")]
5311    const fn test_mm_cvtss_sd() {
5312        let a = _mm_setr_pd(-1.1, 2.2);
5313        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
5314
5315        let r = _mm_cvtss_sd(a, b);
5316        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
5317
5318        let a = _mm_setr_pd(-1.1, f64::INFINITY);
5319        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
5320
5321        let r = _mm_cvtss_sd(a, b);
5322        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
5323    }
5324
5325    #[simd_test(enable = "sse2")]
5326    fn test_mm_cvttpd_epi32() {
5327        let a = _mm_setr_pd(-1.1, 2.2);
5328        let r = _mm_cvttpd_epi32(a);
5329        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
5330
5331        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5332        let r = _mm_cvttpd_epi32(a);
5333        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5334    }
5335
5336    #[simd_test(enable = "sse2")]
5337    fn test_mm_cvttsd_si32() {
5338        let a = _mm_setr_pd(-1.1, 2.2);
5339        let r = _mm_cvttsd_si32(a);
5340        assert_eq!(r, -1);
5341
5342        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5343        let r = _mm_cvttsd_si32(a);
5344        assert_eq!(r, i32::MIN);
5345    }
5346
5347    #[simd_test(enable = "sse2")]
5348    fn test_mm_cvttps_epi32() {
5349        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
5350        let r = _mm_cvttps_epi32(a);
5351        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
5352
5353        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
5354        let r = _mm_cvttps_epi32(a);
5355        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
5356    }
5357
5358    #[simd_test(enable = "sse2")]
5359    const fn test_mm_set_sd() {
5360        let r = _mm_set_sd(-1.0_f64);
5361        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
5362    }
5363
5364    #[simd_test(enable = "sse2")]
5365    const fn test_mm_set1_pd() {
5366        let r = _mm_set1_pd(-1.0_f64);
5367        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
5368    }
5369
5370    #[simd_test(enable = "sse2")]
5371    const fn test_mm_set_pd1() {
5372        let r = _mm_set_pd1(-2.0_f64);
5373        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
5374    }
5375
5376    #[simd_test(enable = "sse2")]
5377    const fn test_mm_set_pd() {
5378        let r = _mm_set_pd(1.0_f64, 5.0_f64);
5379        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
5380    }
5381
5382    #[simd_test(enable = "sse2")]
5383    const fn test_mm_setr_pd() {
5384        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
5385        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
5386    }
5387
5388    #[simd_test(enable = "sse2")]
5389    const fn test_mm_setzero_pd() {
5390        let r = _mm_setzero_pd();
5391        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
5392    }
5393
5394    #[simd_test(enable = "sse2")]
5395    const fn test_mm_load1_pd() {
5396        let d = -5.0;
5397        let r = unsafe { _mm_load1_pd(&d) };
5398        assert_eq_m128d(r, _mm_setr_pd(d, d));
5399    }
5400
5401    #[simd_test(enable = "sse2")]
5402    const fn test_mm_load_pd1() {
5403        let d = -5.0;
5404        let r = unsafe { _mm_load_pd1(&d) };
5405        assert_eq_m128d(r, _mm_setr_pd(d, d));
5406    }
5407
5408    #[simd_test(enable = "sse2")]
5409    const fn test_mm_unpackhi_pd() {
5410        let a = _mm_setr_pd(1.0, 2.0);
5411        let b = _mm_setr_pd(3.0, 4.0);
5412        let r = _mm_unpackhi_pd(a, b);
5413        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
5414    }
5415
5416    #[simd_test(enable = "sse2")]
5417    const fn test_mm_unpacklo_pd() {
5418        let a = _mm_setr_pd(1.0, 2.0);
5419        let b = _mm_setr_pd(3.0, 4.0);
5420        let r = _mm_unpacklo_pd(a, b);
5421        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
5422    }
5423
5424    #[simd_test(enable = "sse2")]
5425    const fn test_mm_shuffle_pd() {
5426        let a = _mm_setr_pd(1., 2.);
5427        let b = _mm_setr_pd(3., 4.);
5428        let expected = _mm_setr_pd(1., 3.);
5429        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
5430        assert_eq_m128d(r, expected);
5431    }
5432
5433    #[simd_test(enable = "sse2")]
5434    const fn test_mm_move_sd() {
5435        let a = _mm_setr_pd(1., 2.);
5436        let b = _mm_setr_pd(3., 4.);
5437        let expected = _mm_setr_pd(3., 2.);
5438        let r = _mm_move_sd(a, b);
5439        assert_eq_m128d(r, expected);
5440    }
5441
5442    #[simd_test(enable = "sse2")]
5443    const fn test_mm_castpd_ps() {
5444        let a = _mm_set1_pd(0.);
5445        let expected = _mm_set1_ps(0.);
5446        let r = _mm_castpd_ps(a);
5447        assert_eq_m128(r, expected);
5448    }
5449
5450    #[simd_test(enable = "sse2")]
5451    const fn test_mm_castpd_si128() {
5452        let a = _mm_set1_pd(0.);
5453        let expected = _mm_set1_epi64x(0);
5454        let r = _mm_castpd_si128(a);
5455        assert_eq_m128i(r, expected);
5456    }
5457
5458    #[simd_test(enable = "sse2")]
5459    const fn test_mm_castps_pd() {
5460        let a = _mm_set1_ps(0.);
5461        let expected = _mm_set1_pd(0.);
5462        let r = _mm_castps_pd(a);
5463        assert_eq_m128d(r, expected);
5464    }
5465
5466    #[simd_test(enable = "sse2")]
5467    const fn test_mm_castps_si128() {
5468        let a = _mm_set1_ps(0.);
5469        let expected = _mm_set1_epi32(0);
5470        let r = _mm_castps_si128(a);
5471        assert_eq_m128i(r, expected);
5472    }
5473
5474    #[simd_test(enable = "sse2")]
5475    const fn test_mm_castsi128_pd() {
5476        let a = _mm_set1_epi64x(0);
5477        let expected = _mm_set1_pd(0.);
5478        let r = _mm_castsi128_pd(a);
5479        assert_eq_m128d(r, expected);
5480    }
5481
5482    #[simd_test(enable = "sse2")]
5483    const fn test_mm_castsi128_ps() {
5484        let a = _mm_set1_epi32(0);
5485        let expected = _mm_set1_ps(0.);
5486        let r = _mm_castsi128_ps(a);
5487        assert_eq_m128(r, expected);
5488    }
5489}