diff --git a/coresimd/ppsv/api/minimal.rs b/coresimd/ppsv/api/minimal.rs
index 4470bd6c31..fd1ffdb82d 100644
--- a/coresimd/ppsv/api/minimal.rs
+++ b/coresimd/ppsv/api/minimal.rs
@@ -81,6 +81,10 @@ macro_rules! impl_minimal {
                 simd_insert(self, index as u32, new_value)
             }
         }
+
+        impl super::api::Simd for $id {
+            type Element = $elem_ty;
+        }
     }
 }
 
diff --git a/coresimd/ppsv/api/mod.rs b/coresimd/ppsv/api/mod.rs
index 857264b9f3..931ed5ccf6 100644
--- a/coresimd/ppsv/api/mod.rs
+++ b/coresimd/ppsv/api/mod.rs
@@ -112,9 +112,8 @@ mod partial_eq;
 // TODO:
 //#[macro_use]
 //mod partial_ord;
-// TODO:
-//#[macro_use]
-//mod shuffles;
+#[macro_use]
+pub mod shuffles;
 // TODO:
 //#[macro_use]
 //mod gather_scatter;
@@ -125,9 +124,15 @@ mod scalar_shifts;
 #[macro_use]
 mod shifts;
 
-/// Sealed trait used for constraining select implementations.
+/// Sealed trait used to constrain select implementations.
 pub trait Lanes<A> {}
 
+/// Sealed trait used to constrain vector shuffles.
+pub trait Simd {
+    /// Element type of the SIMD vector.
+    type Element;
+}
+
 /// Defines a portable packed SIMD floating-point vector type.
 macro_rules! simd_f_ty {
     ($id:ident : $elem_count:expr, $elem_ty:ident, $mask_ty:ident, $test_mod:ident,
diff --git a/coresimd/ppsv/api/shuffles.rs b/coresimd/ppsv/api/shuffles.rs
new file mode 100644
index 0000000000..3bd5bd3d76
--- /dev/null
+++ b/coresimd/ppsv/api/shuffles.rs
@@ -0,0 +1,393 @@
+//! Shuffle vectors
+#![allow(unused)]
+
+macro_rules! impl_shuffle {
+    ($_e:expr) => {
+        use super::Simd;
+
+        /// This trait is not public.
+        ///
+        /// It is only used to constrain the return
+        /// type of the vector shuffles.
+        pub trait Shuffle<A> {
+            /// The result type of the shuffle.
+            type Output;
+        }
+
+        // These implementations for the native
+        // types allow constraining the shuffles for
+        // each portable vector type by just implementing
+        // the Simd trait once for them.
+
+        impl Shuffle<[u32; 2]> for i8 {
+            type Output = super::super::i8x2;
+        }
+        impl Shuffle<[u32; 4]> for i8 {
+            type Output = super::super::i8x4;
+        }
+        impl Shuffle<[u32; 8]> for i8 {
+            type Output = super::super::i8x8;
+        }
+        impl Shuffle<[u32; 16]> for i8 {
+            type Output = super::super::i8x16;
+        }
+        impl Shuffle<[u32; 32]> for i8 {
+            type Output = super::super::i8x32;
+        }
+        impl Shuffle<[u32; 64]> for i8 {
+            type Output = super::super::i8x64;
+        }
+        impl Shuffle<[u32; 2]> for u8 {
+            type Output = super::super::u8x2;
+        }
+        impl Shuffle<[u32; 4]> for u8 {
+            type Output = super::super::u8x4;
+        }
+        impl Shuffle<[u32; 8]> for u8 {
+            type Output = super::super::u8x8;
+        }
+        impl Shuffle<[u32; 16]> for u8 {
+            type Output = super::super::u8x16;
+        }
+        impl Shuffle<[u32; 32]> for u8 {
+            type Output = super::super::u8x32;
+        }
+        impl Shuffle<[u32; 64]> for u8 {
+            type Output = super::super::u8x64;
+        }
+
+        impl Shuffle<[u32; 2]> for i16 {
+            type Output = super::super::i16x2;
+        }
+        impl Shuffle<[u32; 4]> for i16 {
+            type Output = super::super::i16x4;
+        }
+        impl Shuffle<[u32; 8]> for i16 {
+            type Output = super::super::i16x8;
+        }
+        impl Shuffle<[u32; 16]> for i16 {
+            type Output = super::super::i16x16;
+        }
+        impl Shuffle<[u32; 32]> for i16 {
+            type Output = super::super::i16x32;
+        }
+        impl Shuffle<[u32; 2]> for u16 {
+            type Output = super::super::u16x2;
+        }
+        impl Shuffle<[u32; 4]> for u16 {
+            type Output = super::super::u16x4;
+        }
+        impl Shuffle<[u32; 8]> for u16 {
+            type Output = super::super::u16x8;
+        }
+        impl Shuffle<[u32; 16]> for u16 {
+            type Output = super::super::u16x16;
+        }
+        impl Shuffle<[u32; 32]> for u16 {
+            type Output = super::super::u16x32;
+        }
+
+        impl Shuffle<[u32; 2]> for i32 {
+            type Output = super::super::i32x2;
+        }
+        impl Shuffle<[u32; 4]> for i32 {
+            type Output = super::super::i32x4;
+        }
+        impl Shuffle<[u32; 8]> for i32 {
+            type Output = super::super::i32x8;
+        }
+        impl Shuffle<[u32; 16]> for i32 {
+            type Output = super::super::i32x16;
+        }
+        impl Shuffle<[u32; 2]> for u32 {
+            type Output = super::super::u32x2;
+        }
+        impl Shuffle<[u32; 4]> for u32 {
+            type Output = super::super::u32x4;
+        }
+        impl Shuffle<[u32; 8]> for u32 {
+            type Output = super::super::u32x8;
+        }
+        impl Shuffle<[u32; 16]> for u32 {
+            type Output = super::super::u32x16;
+        }
+        impl Shuffle<[u32; 2]> for f32 {
+            type Output = super::super::f32x2;
+        }
+        impl Shuffle<[u32; 4]> for f32 {
+            type Output = super::super::f32x4;
+        }
+        impl Shuffle<[u32; 8]> for f32 {
+            type Output = super::super::f32x8;
+        }
+        impl Shuffle<[u32; 16]> for f32 {
+            type Output = super::super::f32x16;
+        }
+
+        impl Shuffle<[u32; 2]> for i64 {
+            type Output = super::super::i64x2;
+        }
+        impl Shuffle<[u32; 4]> for i64 {
+            type Output = super::super::i64x4;
+        }
+        impl Shuffle<[u32; 8]> for i64 {
+            type Output = super::super::i64x8;
+        }
+        impl Shuffle<[u32; 2]> for u64 {
+            type Output = super::super::u64x2;
+        }
+        impl Shuffle<[u32; 4]> for u64 {
+            type Output = super::super::u64x4;
+        }
+        impl Shuffle<[u32; 8]> for u64 {
+            type Output = super::super::u64x8;
+        }
+        impl Shuffle<[u32; 2]> for f64 {
+            type Output = super::super::f64x2;
+        }
+        impl Shuffle<[u32; 4]> for f64 {
+            type Output = super::super::f64x4;
+        }
+        impl Shuffle<[u32; 8]> for f64 {
+            type Output = super::super::f64x8;
+        }
+
+        /// The shuffle intrinsics are reimported here.
+        ///
+        /// At typeck both input vector types are required to be equal and thus
+        /// have the same length, the arrays of indices are required to have the
+        /// correct lengths, and the result type is constrained by the `where`
+        /// clauses below such that only the correct result types can type check.
+        ///
+        /// FIXME: The only way to produce a monomorphization-time error here is
+        /// to pass the intrinsic an element index that is out-of-bounds. Fixing
+        /// this probably requires checking that the indices are in-bounds in
+        /// MIR typeck.
+        mod intrinsics {
+            use super::{Simd, Shuffle};
+            extern "platform-intrinsic" {
+                pub fn simd_shuffle2<T: Simd, U>(a: T, b: T, indices: [u32; 2]) -> U
+                    where <T as Simd>::Element: Shuffle<[u32; 2], Output = U>;
+
+                pub fn simd_shuffle4<T: Simd, U>(a: T, b: T, indices: [u32; 4]) -> U
+                    where <T as Simd>::Element: Shuffle<[u32; 4], Output = U>;
+
+                pub fn simd_shuffle8<T: Simd, U>(a: T, b: T, indices: [u32; 8]) -> U
+                    where <T as Simd>::Element: Shuffle<[u32; 8], Output = U>;
+
+                pub fn simd_shuffle16<T: Simd, U>(a: T, b: T, indices: [u32; 16]) -> U
+                    where <T as Simd>::Element: Shuffle<[u32; 16], Output = U>;
+
+                pub fn simd_shuffle32<T: Simd, U>(a: T, b: T, indices: [u32; 32]) -> U
+                    where <T as Simd>::Element: Shuffle<[u32; 32], Output = U>;
+
+                pub fn simd_shuffle64<T: Simd, U>(a: T, b: T, indices: [u32; 64]) -> U
+                    where <T as Simd>::Element: Shuffle<[u32; 64], Output = U>;
+            }
+        }
+
+        pub use self::intrinsics::simd_shuffle2 as __shuffle_vector2;
+        pub use self::intrinsics::simd_shuffle4 as __shuffle_vector4;
+        pub use self::intrinsics::simd_shuffle8 as __shuffle_vector8;
+        pub use self::intrinsics::simd_shuffle16 as __shuffle_vector16;
+        pub use self::intrinsics::simd_shuffle32 as __shuffle_vector32;
+        pub use self::intrinsics::simd_shuffle64 as __shuffle_vector64;
+    }
+}
+
+vector_impl!([impl_shuffle, 0]);
+
+/// Shuffles vector elements.
+///
+/// This macro returns a new vector that contains a shuffle of the elements in
+/// one or two input vectors:
+///
+/// * `shuffle!(vec, [indices...])`: one-vector version
+/// * `shuffle!(vec0, vec1, [indices...])`: two-vector version
+///
+/// In the two-vector version both `vec0` and `vec1` must have the same type.
+/// The element type of the resulting vector is the element type of the input
+/// vector.
+///
+/// The number of `indices` must be a power-of-two in range `[2, 64]` and at
+/// most two times the number of lanes in the input vector. The length of the
+/// resulting vector equals the number of indices provided.
+///
+/// Given a vector with `N` lanes, the indices in range `[0, N)` refer to the
+/// `N` elements in the vector. In the two-vector version, the indices in range
+/// `[N, 2*N)` refer to elements in the second vector.
+///
+/// # Examples
+///
+/// ```
+/// # #![cfg_attr(not(dox), feature(stdsimd))]
+/// # #![cfg_attr(not(dox), no_std)]
+/// # #[cfg(not(dox))]
+/// # extern crate std as real_std;
+/// # #[cfg(not(dox))]
+/// # #[macro_use]
+/// # extern crate stdsimd as std;
+/// # use std::simd::*;
+/// # fn main() {
+/// // Shuffle allows reordering the elements of a vector:
+/// let x = i32x4::new(1, 2, 3, 4);
+/// let r = shuffle!(x, [2, 1, 3, 0]);
+/// assert_eq!(r, i32x4::new(3, 2, 4, 1));
+///
+/// // The resulting vector can be smaller than the input:
+/// let r = shuffle!(x, [1, 3]);
+/// assert_eq!(r, i32x2::new(2, 4));
+///
+/// // Equal:
+/// let r = shuffle!(x, [1, 3, 2, 0]);
+/// assert_eq!(r, i32x4::new(2, 4, 3, 1));
+///
+/// // Or larger:
+/// let r = shuffle!(x, [1, 3, 2, 2, 1, 3, 2, 2]);
+/// assert_eq!(r, i32x8::new(2, 4, 3, 3, 2, 4, 3, 3));
+/// // At most 2 * the number of lanes in the input vector.
+///
+/// // It also allows reordering elements of two vectors:
+/// let y = i32x4::new(5, 6, 7, 8);
+/// let r = shuffle!(x, y, [4, 0, 5, 1]);
+/// assert_eq!(r, i32x4::new(5, 1, 6, 2));
+/// // And this can be used to construct larger or smaller
+/// // vectors as well.
+/// # }
+/// ```
+#[macro_export]
+macro_rules! shuffle {
+    ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr]) => {
+        {
+            #[allow(unused_unsafe)]
+            let r = unsafe {
+                $crate::simd::__shuffle_vector2(
+                    $vec0, $vec1,
+                    [$l0, $l1]
+                )
+            };
+            r
+        }
+    };
+    ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr]) => {
+        {
+            #[allow(unused_unsafe)]
+            let r = unsafe {
+                $crate::simd::__shuffle_vector4(
+                    $vec0, $vec1,
+                    [$l0, $l1, $l2, $l3]
+                )
+            };
+            r
+        }
+    };
+    ($vec0:expr, $vec1:expr,
+     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,
+      $l4:expr, $l5:expr, $l6:expr, $l7:expr]) => {
+        {
+            #[allow(unused_unsafe)]
+            let r = unsafe {
+                $crate::simd::__shuffle_vector8(
+                    $vec0, $vec1,
+                    [$l0, $l1, $l2, $l3,
+                     $l4, $l5, $l6, $l7]
+                )
+            };
+            r
+        }
+    };
+    ($vec0:expr, $vec1:expr,
+     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,
+      $l4:expr, $l5:expr, $l6:expr, $l7:expr,
+      $l8:expr, $l9:expr, $l10:expr, $l11:expr,
+      $l12:expr, $l13:expr, $l14:expr, $l15:expr]) => {
+        {
+            #[allow(unused_unsafe)]
+            let r = unsafe {
+                $crate::simd::__shuffle_vector16(
+                    $vec0, $vec1,
+                    [$l0, $l1, $l2, $l3,
+                     $l4, $l5, $l6, $l7,
+                     $l8, $l9, $l10, $l11,
+                     $l12, $l13, $l14, $l15]
+                )
+            };
+            r
+        }
+    };
+    ($vec0:expr, $vec1:expr,
+     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,
+      $l4:expr, $l5:expr, $l6:expr, $l7:expr,
+      $l8:expr, $l9:expr, $l10:expr, $l11:expr,
+      $l12:expr, $l13:expr, $l14:expr, $l15:expr,
+      $l16:expr, $l17:expr, $l18:expr, $l19:expr,
+      $l20:expr, $l21:expr, $l22:expr, $l23:expr,
+      $l24:expr, $l25:expr, $l26:expr, $l27:expr,
+      $l28:expr, $l29:expr, $l30:expr, $l31:expr]) => {
+        {
+            #[allow(unused_unsafe)]
+            let r = unsafe {
+                $crate::simd::__shuffle_vector32(
+                    $vec0, $vec1,
+                    [$l0, $l1, $l2, $l3,
+                     $l4, $l5, $l6, $l7,
+                     $l8, $l9, $l10, $l11,
+                     $l12, $l13, $l14, $l15,
+                     $l16, $l17, $l18, $l19,
+                     $l20, $l21, $l22, $l23,
+                     $l24, $l25, $l26, $l27,
+                     $l28, $l29, $l30, $l31]
+                )
+            };
+            r
+        }
+    };
+    ($vec0:expr, $vec1:expr,
+     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,
+      $l4:expr, $l5:expr, $l6:expr, $l7:expr,
+      $l8:expr, $l9:expr, $l10:expr, $l11:expr,
+      $l12:expr, $l13:expr, $l14:expr, $l15:expr,
+      $l16:expr, $l17:expr, $l18:expr, $l19:expr,
+      $l20:expr, $l21:expr, $l22:expr, $l23:expr,
+      $l24:expr, $l25:expr, $l26:expr, $l27:expr,
+      $l28:expr, $l29:expr, $l30:expr, $l31:expr,
+      $l32:expr, $l33:expr, $l34:expr, $l35:expr,
+      $l36:expr, $l37:expr, $l38:expr, $l39:expr,
+      $l40:expr, $l41:expr, $l42:expr, $l43:expr,
+      $l44:expr, $l45:expr, $l46:expr, $l47:expr,
+      $l48:expr, $l49:expr, $l50:expr, $l51:expr,
+      $l52:expr, $l53:expr, $l54:expr, $l55:expr,
+      $l56:expr, $l57:expr, $l58:expr, $l59:expr,
+      $l60:expr, $l61:expr, $l62:expr, $l63:expr]) => {
+        {
+            #[allow(unused_unsafe)]
+            let r = unsafe {
+                $crate::simd::__shuffle_vector64(
+                    $vec0, $vec1,
+                    [$l0, $l1, $l2, $l3,
+                     $l4, $l5, $l6, $l7,
+                     $l8, $l9, $l10, $l11,
+                     $l12, $l13, $l14, $l15,
+                     $l16, $l17, $l18, $l19,
+                     $l20, $l21, $l22, $l23,
+                     $l24, $l25, $l26, $l27,
+                     $l28, $l29, $l30, $l31,
+                     $l32, $l33, $l34, $l35,
+                     $l36, $l37, $l38, $l39,
+                     $l40, $l41, $l42, $l43,
+                     $l44, $l45, $l46, $l47,
+                     $l48, $l49, $l50, $l51,
+                     $l52, $l53, $l54, $l55,
+                     $l56, $l57, $l58, $l59,
+                     $l60, $l61, $l62, $l63]
+                )
+            };
+            r
+        }
+    };
+    ($vec:expr, [$($l:expr),*]) => {
+        match $vec {
+            v => shuffle!(v, v, [$($l),*])
+        }
+    }
+}
diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs
index 4d5c92dad0..c6ef497060 100644
--- a/coresimd/ppsv/mod.rs
+++ b/coresimd/ppsv/mod.rs
@@ -86,3 +86,21 @@ impl<T> FromBits<T> for T {
 
 /// Work arounds code generation issues.
 mod codegen;
+
+/// Exposes private shuffle intrinsics
+/// used by the `shuffle!` macro.
+#[allow(unused)]
+macro_rules! expose_shuffles {
+    ($_e:expr) => {
+        pub use self::api::shuffles::{
+            __shuffle_vector2,
+            __shuffle_vector4,
+            __shuffle_vector8,
+            __shuffle_vector16,
+            __shuffle_vector32,
+            __shuffle_vector64,
+        };
+    }
+}
+
+vector_impl!([expose_shuffles, 0]);
diff --git a/coresimd/simd_llvm.rs b/coresimd/simd_llvm.rs
index c83c2d4b35..2ba3944bd4 100644
--- a/coresimd/simd_llvm.rs
+++ b/coresimd/simd_llvm.rs
@@ -15,6 +15,8 @@ extern "platform-intrinsic" {
     pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
     pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
     pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U;
+    pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U;
+    pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U;
 
     pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
     pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
diff --git a/crates/coresimd/tests/shuffles.rs b/crates/coresimd/tests/shuffles.rs
new file mode 100644
index 0000000000..a5d7ba95b1
--- /dev/null
+++ b/crates/coresimd/tests/shuffles.rs
@@ -0,0 +1,242 @@
+#![feature(stdsimd)]
+
+#[macro_use]
+extern crate coresimd;
+
+use coresimd::simd::*;
+
+#[test]
+fn shuffle2() {
+    let x = u8x2::new(3, 42);
+    let e = u8x2::new(42, 3);
+    let r = shuffle!(x, [1, 0]);
+    assert_eq!(r, e);
+
+    let y = u8x2::new(7, 12);
+    let e = u8x2::new(42, 12);
+    let r = shuffle!(x, y, [1, 3]);
+    assert_eq!(r, e);
+
+    let x = i16x4::new(1, 2, 3, 4);
+    let e = i16x2::new(2, 4);
+    let r = shuffle!(x, [1, 3]);
+    assert_eq!(r, e);
+
+    let y = i16x4::new(5, 6, 7, 8);
+    let e = i16x2::new(2, 7);
+    let r = shuffle!(x, y, [1, 6]);
+    assert_eq!(r, e);
+}
+
+#[test]
+fn shuffle4() {
+    let x = u8x2::new(3, 42);
+    let e = u8x4::new(42, 3, 42, 42);
+    let r = shuffle!(x, [1, 0, 1, 1]);
+    assert_eq!(r, e);
+
+    let x = u32x4::new(1, 2, 3, 4);
+    let e = u32x4::new(2, 4, 1, 3);
+    let r = shuffle!(x, [1, 3, 0, 2]);
+    assert_eq!(r, e);
+
+    let y = u32x4::new(5, 6, 7, 8);
+    let e = u32x4::new(3, 2, 6, 1);
+    let r = shuffle!(x, y, [2, 1, 5, 0]);
+    assert_eq!(r, e);
+
+    let x = i32x8::new(1, 2, 3, 4, 7, 3, 2, 1);
+    let e = i32x4::new(2, 7, 3, 3);
+    let r = shuffle!(x, [1, 4, 5, 5]);
+    assert_eq!(r, e);
+
+    let y = i32x8::new(5, 6, 7, 8, 1, 5, 2, 3);
+    let e = i32x4::new(3, 5, 7, 3);
+    let r = shuffle!(x, y, [15, 13, 4, 5]);
+    assert_eq!(r, e);
+}
+
+#[test]
+fn shuffle8() {
+    let x = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+    let e = f32x8::new(2., 8., 1., 3., 5., 2., 7., 4.);
+    let r = shuffle!(x, [1, 7, 0, 2, 4, 1, 6, 3]);
+    assert_eq!(r, e);
+
+    let y = f32x8::new(51., 61., 71., 81., 11., 21., 31., 41.);
+    let e = f32x8::new(2., 8., 51., 3., 71., 41., 7., 4.);
+    let r = shuffle!(x, y, [1, 7, 8, 2, 10, 15, 6, 3]);
+    assert_eq!(r, e);
+}
+
+#[test]
+fn shuffle16() {
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let x = u8x16::new(
+        0, 1, 2, 3,
+        4, 5, 6, 7,
+        8, 9, 10, 11,
+        12, 13, 14, 15,
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let y = u8x16::new(
+        16, 17, 18, 19,
+        20, 21, 22, 23,
+        24, 25, 26, 27,
+        28, 29, 30, 31
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let e = u8x16::new(
+        0, 1, 2, 3,
+        16, 17, 18, 19,
+        8, 9, 10, 11,
+        20, 21, 22, 23
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let r = shuffle!(
+        x, y,
+        [
+            0, 1, 2, 3,
+            16, 17, 18, 19,
+            8, 9, 10, 11,
+            20, 21, 22, 23
+        ]
+    );
+    assert_eq!(r, e);
+}
+
+#[test]
+fn shuffle32() {
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let x = u8x32::new(
+        0, 1, 2, 3,
+        4, 5, 6, 7,
+        8, 9, 10, 11,
+        12, 13, 14, 15,
+        16, 17, 18, 19,
+        20, 21, 22, 23,
+        24, 25, 26, 27,
+        28, 29, 30, 31
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let y = u8x32::new(
+        32, 33, 34, 35,
+        36, 37, 38, 39,
+        40, 41, 42, 43,
+        44, 45, 46, 47,
+        48, 49, 50, 51,
+        52, 53, 54, 55,
+        56, 57, 58, 59,
+        60, 61, 62, 63,
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let e = u8x32::new(
+        0, 1, 2, 3,
+        32, 33, 34, 35,
+        8, 9, 10, 11,
+        36, 37, 38, 39,
+        8, 9, 10, 11,
+        40, 41, 42, 43,
+        12, 13, 14, 15,
+        44, 45, 46, 47
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let r = shuffle!(
+        x, y,
+        [
+            0, 1, 2, 3,
+            32, 33, 34, 35,
+            8, 9, 10, 11,
+            36, 37, 38, 39,
+            8, 9, 10, 11,
+            40, 41, 42, 43,
+            12, 13, 14, 15,
+            44, 45, 46, 47
+        ]
+    );
+    assert_eq!(r, e);
+}
+
+#[test]
+fn shuffle64() {
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let x = u8x64::new(
+        0, 1, 2, 3,
+        4, 5, 6, 7,
+        8, 9, 10, 11,
+        12, 13, 14, 15,
+        16, 17, 18, 19,
+        20, 21, 22, 23,
+        24, 25, 26, 27,
+        28, 29, 30, 31,
+        32, 33, 34, 35,
+        36, 37, 38, 39,
+        40, 41, 42, 43,
+        44, 45, 46, 47,
+        48, 49, 50, 51,
+        52, 53, 54, 55,
+        56, 57, 58, 59,
+        60, 61, 62, 63,
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let y = u8x64::new(
+        64, 65, 66, 67,
+        68, 69, 70, 71,
+        72, 73, 74, 75,
+        76, 77, 78, 79,
+        80, 81, 82, 83,
+        84, 85, 86, 87,
+        88, 89, 90, 91,
+        92, 93, 94, 95,
+        96, 97, 98, 99,
+        100, 101, 102, 103,
+        104, 105, 106, 107,
+        108, 109, 110, 111,
+        112, 113, 114, 115,
+        116, 117, 118, 119,
+        120, 121, 122, 123,
+        124, 125, 126, 127,
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let e = u8x64::new(
+        0, 1, 2, 3,
+        64, 65, 66, 67,
+        8, 9, 10, 11,
+        68, 69, 70, 71,
+        8, 9, 10, 11,
+        72, 73, 74, 75,
+        12, 13, 14, 15,
+        76, 77, 78, 79,
+        16, 17, 18, 19,
+        80, 81, 82, 83,
+        20, 21, 22, 23,
+        84, 85, 86, 87,
+        88, 89, 90, 91,
+        24, 25, 26, 27,
+        92, 93, 94, 95,
+        28, 29, 30, 31
+    );
+    #[cfg_attr(rustfmt, rustfmt_skip)]
+    let r = shuffle!(
+        x, y,
+        [
+            0, 1, 2, 3,
+            64, 65, 66, 67,
+            8, 9, 10, 11,
+            68, 69, 70, 71,
+            8, 9, 10, 11,
+            72, 73, 74, 75,
+            12, 13, 14, 15,
+            76, 77, 78, 79,
+            16, 17, 18, 19,
+            80, 81, 82, 83,
+            20, 21, 22, 23,
+            84, 85, 86, 87,
+            88, 89, 90, 91,
+            24, 25, 26, 27,
+            92, 93, 94, 95,
+            28, 29, 30, 31
+        ]
+    );
+    assert_eq!(r, e);
+}
diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs
index abee4fcfd4..b7a6bac4bc 100644
--- a/crates/stdsimd/src/lib.rs
+++ b/crates/stdsimd/src/lib.rs
@@ -7,7 +7,7 @@
 //!
 //! [stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/
 
-#![feature(const_fn, integer_atomics, staged_api, stdsimd)]
+#![feature(const_fn, integer_atomics, staged_api, stdsimd, use_extern_macros)]
 #![feature(doc_cfg, allow_internal_unstable)]
 #![cfg_attr(feature = "cargo-clippy", allow(shadow_reuse))]
 #![cfg_attr(target_os = "linux", feature(linkage))]
@@ -28,6 +28,7 @@ extern crate std;
 mod stdsimd;
 
 pub use stdsimd::*;
+pub use coresimd::shuffle;
 
 #[allow(unused_imports)]
 use __do_not_use_this_import::fs;
diff --git a/crates/stdsimd/tests/shuffle.rs b/crates/stdsimd/tests/shuffle.rs
new file mode 100644
index 0000000000..cd9cd76bdf
--- /dev/null
+++ b/crates/stdsimd/tests/shuffle.rs
@@ -0,0 +1,19 @@
+#![feature(stdsimd)]
+#![deny(warnings)]
+
+#[macro_use]
+extern crate stdsimd;
+
+use stdsimd::simd::*;
+
+// Checks that `shuffle!` does not produce warnings (denied crate-wide above)
+// when used in an unsafe block, and that the shuffled lanes are as expected.
+#[test]
+fn shuffle_in_unsafe_block() {
+    unsafe {
+        let a = f32x4::new(0., 1., 2., 3.);
+        let b = f32x4::new(4., 5., 6., 7.);
+        let e = f32x4::new(0., 2., 4., 6.);
+        assert_eq!(e, shuffle!(a, b, [0, 2, 4, 6]));
+    }
+}