diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 014a9aca0f..e1c765e995 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -13,7 +13,7 @@ jobs:
     name: Check Style
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4
     - name: Install Rust
       run: rustup update nightly && rustup default nightly
     - run: ci/style.sh
@@ -23,7 +23,7 @@ jobs:
     needs: [style]
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4
     - name: Install Rust
       run: rustup update nightly && rustup default nightly
     - run: ci/dox.sh
@@ -43,7 +43,7 @@ jobs:
     needs: [style]
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4
     - name: Install Rust
       run: rustup update nightly && rustup default nightly
     - run: cargo test --manifest-path crates/stdarch-verify/Cargo.toml
@@ -53,7 +53,7 @@ jobs:
     needs: [style]
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4
     - name: Install Rust
       run: rustup update nightly && rustup default nightly
     - run: RUST_STD_DETECT_UNSTABLE=avx cargo test --features=std_detect_env_override --manifest-path crates/std_detect/Cargo.toml env_override_no_avx
@@ -164,7 +164,7 @@ jobs:
           os: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4
       with:
         submodules: recursive
     - name: Install Rust (rustup)
@@ -221,7 +221,24 @@ jobs:
     name: Build std_detect
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4
     - name: Install Rust
       run: rustup update nightly && rustup default nightly
     - run: ./ci/build-std-detect.sh
+
+  success:
+    needs:
+    - docs
+    - verify
+    - env_override
+    - test
+    - build-std-detect
+    runs-on: ubuntu-latest
+    # Branch protection counts a job that was skipped because one of its dependencies failed
+    # as passing. `always()` forces this job to run regardless, so the step below can turn any
+    # non-successful dependency into a real failure.
+    if: always()  # never let this job end up "skipped"
+    steps:
+    # Inspect every dependency's result by hand; `if: failure()` does not work for this.
+    - name: check if any dependency failed
+      run: jq --exit-status 'all(.result == "success")' <<< '${{ toJson(needs) }}'
diff --git a/crates/core_arch/src/aarch64/crc.rs b/crates/core_arch/src/aarch64/crc.rs
deleted file mode 100644
index 35940e0db8..0000000000
--- a/crates/core_arch/src/aarch64/crc.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-extern "unadjusted" {
-    #[link_name = "llvm.aarch64.crc32x"]
-    fn crc32x_(crc: u32, data: u64) -> u32;
-
-    #[link_name = "llvm.aarch64.crc32cx"]
-    fn crc32cx_(crc: u32, data: u64) -> u32;
-}
-
-#[cfg(test)]
-use stdarch_test::assert_instr;
-
-/// CRC32 single round checksum for quad words (64 bits).
-///
-/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32d)
-#[inline]
-#[target_feature(enable = "crc")]
-#[cfg_attr(test, assert_instr(crc32x))]
-#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
-pub unsafe fn __crc32d(crc: u32, data: u64) -> u32 {
-    crc32x_(crc, data)
-}
-
-/// CRC32-C single round checksum for quad words (64 bits).
-///
-/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cd)
-#[inline]
-#[target_feature(enable = "crc")]
-#[cfg_attr(test, assert_instr(crc32cx))]
-#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
-pub unsafe fn __crc32cd(crc: u32, data: u64) -> u32 {
-    crc32cx_(crc, data)
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::core_arch::{aarch64::*, simd::*};
-    use std::mem;
-    use stdarch_test::simd_test;
-
-    #[simd_test(enable = "crc")]
-    unsafe fn test_crc32d() {
-        assert_eq!(__crc32d(0, 0), 0);
-        assert_eq!(__crc32d(0, 18446744073709551615), 1147535477);
-    }
-
-    #[simd_test(enable = "crc")]
-    unsafe fn test_crc32cd() {
-        assert_eq!(__crc32cd(0, 0), 0);
-        assert_eq!(__crc32cd(0, 18446744073709551615), 3293575501);
-    }
-}
diff --git a/crates/core_arch/src/aarch64/mod.rs b/crates/core_arch/src/aarch64/mod.rs
index fefd2f4780..ebd7a31781 100644
--- a/crates/core_arch/src/aarch64/mod.rs
+++ b/crates/core_arch/src/aarch64/mod.rs
@@ -17,10 +17,6 @@ mod tme;
 #[unstable(feature = "stdarch_aarch64_tme", issue = "117216")]
 pub use self::tme::*;
 
-mod crc;
-#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
-pub use self::crc::*;
-
 mod prefetch;
 #[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")]
 pub use self::prefetch::*;
diff --git a/crates/core_arch/src/arm_shared/crc.rs b/crates/core_arch/src/arm_shared/crc.rs
index b1f716e1aa..8eedd21696 100644
--- a/crates/core_arch/src/arm_shared/crc.rs
+++ b/crates/core_arch/src/arm_shared/crc.rs
@@ -18,6 +18,10 @@ extern "unadjusted" {
     #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32cw")]
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.crc32cw")]
     fn crc32cw_(crc: u32, data: u32) -> u32;
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32x")]
+    fn crc32x_(crc: u32, data: u64) -> u32;
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crc32cx")]
+    fn crc32cx_(crc: u32, data: u64) -> u32;
 }
 
 #[cfg(test)]
@@ -95,12 +99,82 @@ pub unsafe fn __crc32cw(crc: u32, data: u32) -> u32 {
     crc32cw_(crc, data)
 }
 
+/// CRC32 single round checksum for 64-bit data.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32d)
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg(target_arch = "aarch64")]
+#[cfg_attr(test, assert_instr(crc32x))]
+#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
+pub unsafe fn __crc32d(crc: u32, data: u64) -> u32 {
+    crc32x_(crc, data)
+}
+
+/// CRC32 single round checksum for 64-bit data.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32d)
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(crc32w))]
+#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
+pub unsafe fn __crc32d(crc: u32, data: u64) -> u32 {
+    // On 32-bit ARM this intrinsic is a chain of two `crc32w` rounds, matching clang and gcc:
+    // the low 32 bits of `data` are folded in first, then the high 32 bits.
+    let low_word = data as u32;
+    let high_word = (data >> 32) as u32;
+    let after_low = crc32w_(crc, low_word);
+    crc32w_(after_low, high_word)
+}
+
+/// CRC32-C single round checksum for 64-bit data.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cd)
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg(target_arch = "aarch64")]
+#[cfg_attr(test, assert_instr(crc32cx))]
+#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
+pub unsafe fn __crc32cd(crc: u32, data: u64) -> u32 {
+    crc32cx_(crc, data)
+}
+
+/// CRC32-C single round checksum for 64-bit data.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/__crc32cd)
+#[inline]
+#[target_feature(enable = "crc")]
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(crc32cw))]
+#[unstable(feature = "stdarch_arm_crc32", issue = "117215")]
+pub unsafe fn __crc32cd(crc: u32, data: u64) -> u32 {
+    // On 32-bit ARM this intrinsic is a chain of two `crc32cw` rounds, matching clang and gcc:
+    // the low 32 bits of `data` are folded in first, then the high 32 bits.
+    let low_word = data as u32;
+    let high_word = (data >> 32) as u32;
+    let after_low = crc32cw_(crc, low_word);
+    crc32cw_(after_low, high_word)
+}
+
 #[cfg(test)]
 mod tests {
     use crate::core_arch::{arm_shared::*, simd::*};
     use std::mem;
     use stdarch_test::simd_test;
 
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32d() {
+        assert_eq!(__crc32d(0, 0), 0);
+        assert_eq!(__crc32d(0, 18446744073709551615), 1147535477);
+    }
+
+    #[simd_test(enable = "crc")]
+    unsafe fn test_crc32cd() {
+        assert_eq!(__crc32cd(0, 0), 0);
+        assert_eq!(__crc32cd(0, 18446744073709551615), 3293575501);
+    }
+
     #[simd_test(enable = "crc")]
     unsafe fn test_crc32b() {
         assert_eq!(__crc32b(0, 0), 0);
diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs
index 5b43549d34..2d12f5e99b 100644
--- a/crates/core_arch/src/arm_shared/neon/mod.rs
+++ b/crates/core_arch/src/arm_shared/neon/mod.rs
@@ -8707,8 +8707,7 @@ pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
 
 /// 8-bit integer matrix multiply-accumulate
 #[inline]
-#[cfg_attr(not(bootstrap), target_feature(enable = "i8mm"))]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "neon,i8mm")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smmla))]
@@ -8735,8 +8734,7 @@ pub unsafe fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t
 
 /// 8-bit integer matrix multiply-accumulate
 #[inline]
-#[cfg_attr(not(bootstrap), target_feature(enable = "i8mm"))]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "neon,i8mm")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ummla))]
@@ -8763,8 +8761,7 @@ pub unsafe fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x
 
 /// Unsigned and signed 8-bit integer matrix multiply-accumulate
 #[inline]
-#[cfg_attr(not(bootstrap), target_feature(enable = "i8mm"))]
-#[target_feature(enable = "neon")]
+#[target_feature(enable = "neon,i8mm")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usmmla))]
diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs
index bd4de67445..5dcd11fb68 100644
--- a/crates/core_arch/src/lib.rs
+++ b/crates/core_arch/src/lib.rs
@@ -35,7 +35,7 @@
     inline_const,
     generic_arg_infer
 )]
-#![cfg_attr(test, feature(test, abi_vectorcall))]
+#![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))]
 #![deny(clippy::missing_inline_in_public_items)]
 #![allow(
     clippy::identity_op,
diff --git a/crates/core_arch/src/powerpc/altivec.rs b/crates/core_arch/src/powerpc/altivec.rs
index 544fce89ab..4fc166da7a 100644
--- a/crates/core_arch/src/powerpc/altivec.rs
+++ b/crates/core_arch/src/powerpc/altivec.rs
@@ -244,6 +244,13 @@ extern "C" {
     #[link_name = "llvm.ppc.altivec.vcmpequw"]
     fn vcmpequw(a: vector_unsigned_int, b: vector_unsigned_int) -> vector_bool_int;
 
+    #[link_name = "llvm.ppc.altivec.vcmpneb"]
+    fn vcmpneb(a: vector_signed_char, b: vector_signed_char) -> vector_bool_char;
+    #[link_name = "llvm.ppc.altivec.vcmpneh"]
+    fn vcmpneh(a: vector_signed_short, b: vector_signed_short) -> vector_bool_short;
+    #[link_name = "llvm.ppc.altivec.vcmpnew"]
+    fn vcmpnew(a: vector_signed_int, b: vector_signed_int) -> vector_bool_int;
+
     #[link_name = "llvm.ppc.altivec.vcmpgefp"]
     fn vcmpgefp(a: vector_float, b: vector_float) -> vector_bool_int;
 
@@ -335,6 +342,28 @@ extern "C" {
 
     #[link_name = "llvm.ppc.altivec.vlogefp"]
     fn vlogefp(a: vector_float) -> vector_float;
+
+    #[link_name = "llvm.ppc.altivec.vsl"]
+    fn vsl(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vslo"]
+    fn vslo(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+    // Algebraic right shifts, one per element width (used by `vec_sra`).
+    #[link_name = "llvm.ppc.altivec.vsrab"]
+    fn vsrab(a: vector_signed_char, b: vector_unsigned_char) -> vector_signed_char;
+    #[link_name = "llvm.ppc.altivec.vsrah"]
+    fn vsrah(a: vector_signed_short, b: vector_unsigned_short) -> vector_signed_short;
+    #[link_name = "llvm.ppc.altivec.vsraw"]
+    fn vsraw(a: vector_signed_int, b: vector_unsigned_int) -> vector_signed_int;
+    // Right-shift counterparts of `vsl` and `vslo` (used by `vec_srl` and `vec_sro`).
+    #[link_name = "llvm.ppc.altivec.vsr"]
+    fn vsr(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+    #[link_name = "llvm.ppc.altivec.vsro"]
+    fn vsro(a: vector_signed_int, b: vector_signed_int) -> vector_signed_int;
+    // NOTE: every link name must spell LLVM's intrinsic name exactly, including the `v` prefix.
+    #[link_name = "llvm.ppc.altivec.vslv"]
+    fn vslv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
+    #[link_name = "llvm.ppc.altivec.vsrv"]
+    fn vsrv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char;
 }
 
 macro_rules! s_t_l {
@@ -389,6 +418,32 @@ macro_rules! t_t_l {
     };
 }
 
+macro_rules! t_t_s {
+    (i32) => {
+        i32x4
+    };
+    (i16) => {
+        i16x8
+    };
+    (i8) => {
+        i8x16
+    };
+
+    (u32) => {
+        u32x4
+    };
+    (u16) => {
+        u16x8
+    };
+    (u8) => {
+        u8x16
+    };
+
+    (f32) => {
+        f32x4
+    };
+}
+
 macro_rules! impl_from {
     ($s: ident) => {
         #[unstable(feature = "stdarch_powerpc", issue = "111145")]
@@ -606,6 +661,57 @@ mod sealed {
     impl_vec_xl! { vec_xl_u32 lxvd2x / lxv u32 }
     impl_vec_xl! { vec_xl_f32 lxvd2x / lxv f32 }
 
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorXst {
+        type Out;
+        unsafe fn vec_xst(self, a: isize, p: Self::Out);
+    }
+
+    macro_rules! impl_vec_xst {
+        ($fun:ident $notpwr9:ident / $pwr9:ident $ty:ident) => {
+            #[inline]
+            #[target_feature(enable = "altivec")]
+            #[cfg_attr(
+                all(test, not(target_feature = "power9-altivec")),
+                assert_instr($notpwr9)
+            )]
+            #[cfg_attr(all(test, target_feature = "power9-altivec"), assert_instr($pwr9))]
+            pub unsafe fn $fun(s: t_t_l!($ty), a: isize, b: *mut $ty) {
+                let addr = (b as *mut u8).offset(a);
+
+                // Workaround ptr::copy_nonoverlapping not being inlined
+                extern "rust-intrinsic" {
+                    #[rustc_nounwind]
+                    pub fn copy_nonoverlapping<T>(src: *const T, dst: *mut T, count: usize);
+                }
+
+                copy_nonoverlapping(
+                    &s as *const _ as *const u8,
+                    addr,
+                    mem::size_of::<t_t_l!($ty)>(),
+                );
+            }
+
+            #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+            impl VectorXst for t_t_l!($ty) {
+                type Out = *mut $ty;
+                #[inline]
+                #[target_feature(enable = "altivec")]
+                unsafe fn vec_xst(self, a: isize, b: Self::Out) {
+                    $fun(self, a, b)
+                }
+            }
+        };
+    }
+
+    impl_vec_xst! { vec_xst_i8 stxvd2x / stxv i8 }
+    impl_vec_xst! { vec_xst_u8 stxvd2x / stxv u8 }
+    impl_vec_xst! { vec_xst_i16 stxvd2x / stxv i16 }
+    impl_vec_xst! { vec_xst_u16 stxvd2x / stxv u16 }
+    impl_vec_xst! { vec_xst_i32 stxvd2x / stxv i32 }
+    impl_vec_xst! { vec_xst_u32 stxvd2x / stxv u32 }
+    impl_vec_xst! { vec_xst_f32 stxvd2x / stxv f32 }
+
     test_impl! { vec_floor(a: vector_float) -> vector_float [ vfloor, vrfim / xvrspim ] }
 
     test_impl! { vec_vexptefp(a: vector_float) -> vector_float [ vexptefp, vexptefp ] }
@@ -640,6 +746,34 @@ mod sealed {
 
     impl_vec_cmp! { [VectorCmpEq vec_cmpeq] (vec_vcmpequb, vec_vcmpequh, vec_vcmpequw) }
 
+    macro_rules! impl_cmpne {
+        ($fun:ident ($ty:ident) -> $r:ident $([ $pwr9:ident ])? ) => {
+            #[inline]
+            #[target_feature(enable = "altivec")]
+            $( #[cfg_attr(all(test, target_feature = "power9-altivec"), assert_instr($pwr9))] )?
+            unsafe fn $fun(a: $ty, b: $ty) -> $r {
+                $( if cfg!(target_feature = "power9-altivec") {
+                    transmute($pwr9(transmute(a), transmute(b)))
+                } else )? {
+                    let zero = transmute(i32x4::new(0, 0, 0, 0));
+                    vec_nor(vec_cmpeq(a, b), zero)
+                }
+            }
+        };
+    }
+
+    impl_cmpne! { vec_vcmpneb(vector_signed_char) -> vector_bool_char [ vcmpneb ] }
+    impl_cmpne! { vec_vcmpneh(vector_signed_short) -> vector_bool_short [ vcmpneh ] }
+    impl_cmpne! { vec_vcmpnew(vector_signed_int) -> vector_bool_int [ vcmpnew ] }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorCmpNe<Other> {
+        type Result;
+        unsafe fn vec_cmpne(self, b: Other) -> Self::Result;
+    }
+
+    impl_vec_cmp! { [VectorCmpNe vec_cmpne] (vec_vcmpneb, vec_vcmpneh, vec_vcmpnew) }
+
     test_impl! { vec_vcmpbfp(a: vector_float, b: vector_float) -> vector_signed_int [vcmpbfp, vcmpbfp] }
 
     #[inline]
@@ -2145,6 +2279,33 @@ mod sealed {
         }
     }
 
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorAdde {
+        unsafe fn vec_adde(self, b: Self, c: Self) -> Self;
+    }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    impl VectorAdde for vector_unsigned_int {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_adde(self, b: Self, c: Self) -> Self {
+            // Only bit 0 of each lane of `c` contributes: it is the carry-in added to `self + b`.
+            let carry_in = vec_and(c, transmute::<_, Self>(u32x4::splat(1)));
+            vec_add(vec_add(self, b), carry_in)
+        }
+    }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    impl VectorAdde for vector_signed_int {
+        #[inline]
+        #[target_feature(enable = "altivec")]
+        unsafe fn vec_adde(self, b: Self, c: Self) -> Self {
+            // Only bit 0 of each lane of `c` contributes: it is the carry-in added to `self + b`.
+            let carry_in = vec_and(c, transmute::<_, Self>(i32x4::splat(1)));
+            vec_add(vec_add(self, b), carry_in)
+        }
+    }
+
     #[unstable(feature = "stdarch_powerpc", issue = "111145")]
     pub trait VectorMladd<Other> {
         type Result;
@@ -2220,9 +2381,6 @@ mod sealed {
     vector_vnor! { vec_vnorsb i8 }
     vector_vnor! { vec_vnorsh i16 }
     vector_vnor! { vec_vnorsw i32 }
-    vector_vnor! { vec_vnorub u8 }
-    vector_vnor! { vec_vnoruh u16 }
-    vector_vnor! { vec_vnoruw u32 }
 
     #[unstable(feature = "stdarch_powerpc", issue = "111145")]
     pub trait VectorNor<Other> {
@@ -2230,7 +2388,7 @@ mod sealed {
         unsafe fn vec_nor(self, b: Other) -> Self::Result;
     }
 
-    impl_vec_trait! { [VectorNor vec_nor] 2 (vec_vnorub, vec_vnorsb, vec_vnoruh, vec_vnorsh, vec_vnoruw, vec_vnorsw) }
+    impl_vec_trait! { [VectorNor vec_nor]+ 2b (vec_vnorsb, vec_vnorsh, vec_vnorsw) }
 
     #[inline]
     #[target_feature(enable = "altivec")]
@@ -2569,6 +2727,264 @@ mod sealed {
     impl_vec_trait! { [VectorUnpackl vec_unpackl]+ vec_vupklsb (vector_bool_char) -> vector_bool_short }
     impl_vec_trait! { [VectorUnpackl vec_unpackl] vec_vupklsh (vector_signed_short) -> vector_signed_int }
     impl_vec_trait! { [VectorUnpackl vec_unpackl]+ vec_vupklsh (vector_bool_short) -> vector_bool_int }
+
+    macro_rules! impl_vec_shift {
+        ([$Trait:ident $m:ident] ($b:ident, $h:ident, $w:ident)) => {
+            impl_vec_trait!{ [$Trait $m]+ $b (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char }
+            impl_vec_trait!{ [$Trait $m]+ $b (vector_signed_char, vector_unsigned_char) -> vector_signed_char }
+            impl_vec_trait!{ [$Trait $m]+ $h (vector_unsigned_short, vector_unsigned_short) -> vector_unsigned_short }
+            impl_vec_trait!{ [$Trait $m]+ $h (vector_signed_short, vector_unsigned_short) -> vector_signed_short }
+            impl_vec_trait!{ [$Trait $m]+ $w (vector_unsigned_int, vector_unsigned_int) -> vector_unsigned_int }
+            impl_vec_trait!{ [$Trait $m]+ $w (vector_signed_int, vector_unsigned_int) -> vector_signed_int }
+        };
+    }
+
+    macro_rules! impl_shift {
+        ($fun:ident $intr:ident $ty:ident) => {
+            #[inline]
+            #[target_feature(enable = "altivec")]
+            #[cfg_attr(test, assert_instr($fun))]
+            unsafe fn $fun(a: t_t_l!($ty), b: t_t_l!($ty)) -> t_t_l!($ty) {
+                // Reduce every shift count modulo the lane width in bits (8, 16 or 32) so the
+                // generic SIMD shift below never receives a count that is out of range.
+                let lane_bits = <t_t_s!($ty)>::splat($ty::BITS as $ty);
+                let a = transmute(a);
+                let b = simd_rem(transmute(b), lane_bits);
+
+                transmute($intr(a, b))
+            }
+        };
+    }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSl<Other> {
+        type Result;
+        unsafe fn vec_sl(self, b: Other) -> Self::Result;
+    }
+
+    impl_shift! { vslb simd_shl u8 }
+    impl_shift! { vslh simd_shl u16 }
+    impl_shift! { vslw simd_shl u32 }
+
+    impl_vec_shift! { [VectorSl vec_sl] (vslb, vslh, vslw) }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSr<Other> {
+        type Result;
+        unsafe fn vec_sr(self, b: Other) -> Self::Result;
+    }
+
+    impl_shift! { vsrb simd_shr u8 }
+    impl_shift! { vsrh simd_shr u16 }
+    impl_shift! { vsrw simd_shr u32 }
+
+    impl_vec_shift! { [VectorSr vec_sr] (vsrb, vsrh, vsrw) }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSra<Other> {
+        type Result;
+        unsafe fn vec_sra(self, b: Other) -> Self::Result;
+    }
+
+    impl_vec_shift! { [VectorSra vec_sra] (vsrab, vsrah, vsraw) }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSld {
+        unsafe fn vec_sld<const UIMM4: i32>(self, b: Self) -> Self;
+        unsafe fn vec_sldw<const UIMM2: i32>(self, b: Self) -> Self;
+    }
+
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(vsldoi, UIMM4 = 1))]
+    unsafe fn vsldoi<const UIMM4: i32>(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+    ) -> vector_unsigned_char {
+        static_assert_uimm_bits!(UIMM4, 4);
+        let d = UIMM4 as u8;
+        if cfg!(target_endian = "little") {
+            let perm = u8x16::new(
+                16 - d,
+                17 - d,
+                18 - d,
+                19 - d,
+                20 - d,
+                21 - d,
+                22 - d,
+                23 - d,
+                24 - d,
+                25 - d,
+                26 - d,
+                27 - d,
+                28 - d,
+                29 - d,
+                30 - d,
+                31 - d,
+            );
+
+            vec_perm(b, a, transmute(perm))
+        } else {
+            let perm = u8x16::new(
+                d,
+                d + 1,
+                d + 2,
+                d + 3,
+                d + 4,
+                d + 5,
+                d + 6,
+                d + 7,
+                d + 8,
+                d + 9,
+                d + 10,
+                d + 11,
+                d + 12,
+                d + 13,
+                d + 14,
+                d + 15,
+            );
+            vec_perm(a, b, transmute(perm))
+        }
+    }
+
+    // TODO: collapse the two once generic_const_exprs are usable.
+    #[inline]
+    #[target_feature(enable = "altivec")]
+    #[cfg_attr(test, assert_instr(xxsldwi, UIMM2 = 1))]
+    unsafe fn xxsldwi<const UIMM2: i32>(
+        a: vector_unsigned_char,
+        b: vector_unsigned_char,
+    ) -> vector_unsigned_char {
+        static_assert_uimm_bits!(UIMM2, 2);
+        let d = (UIMM2 << 2) as u8;
+        if cfg!(target_endian = "little") {
+            let perm = u8x16::new(
+                16 - d,
+                17 - d,
+                18 - d,
+                19 - d,
+                20 - d,
+                21 - d,
+                22 - d,
+                23 - d,
+                24 - d,
+                25 - d,
+                26 - d,
+                27 - d,
+                28 - d,
+                29 - d,
+                30 - d,
+                31 - d,
+            );
+
+            vec_perm(b, a, transmute(perm))
+        } else {
+            let perm = u8x16::new(
+                d,
+                d + 1,
+                d + 2,
+                d + 3,
+                d + 4,
+                d + 5,
+                d + 6,
+                d + 7,
+                d + 8,
+                d + 9,
+                d + 10,
+                d + 11,
+                d + 12,
+                d + 13,
+                d + 14,
+                d + 15,
+            );
+            vec_perm(a, b, transmute(perm))
+        }
+    }
+
+    macro_rules! impl_vec_sld {
+        ($($ty:ident),+) => { $(
+            #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+            impl VectorSld for $ty {
+                #[inline]
+                #[target_feature(enable = "altivec")]
+                unsafe fn vec_sld<const UIMM4: i32>(self, b: Self) -> Self {
+                    transmute(vsldoi::<UIMM4>(transmute(self), transmute(b)))
+                }
+                #[inline]
+                #[target_feature(enable = "altivec")]
+                unsafe fn vec_sldw<const UIMM2: i32>(self, b: Self) -> Self {
+                    transmute(xxsldwi::<UIMM2>(transmute(self), transmute(b)))
+                }
+           }
+        )+ };
+    }
+
+    impl_vec_sld! { vector_bool_char, vector_signed_char, vector_unsigned_char }
+    impl_vec_sld! { vector_bool_short, vector_signed_short, vector_unsigned_short }
+    impl_vec_sld! { vector_bool_int, vector_signed_int, vector_unsigned_int }
+    impl_vec_sld! { vector_float }
+
+    macro_rules! impl_vec_shift_long {
+        ([$Trait:ident $m:ident] ($f:ident)) => {
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_char, vector_unsigned_char) -> vector_signed_char }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_short, vector_unsigned_char) -> vector_unsigned_short }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_short, vector_unsigned_char) -> vector_signed_short }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_int, vector_unsigned_char) -> vector_unsigned_int }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_int, vector_unsigned_char) -> vector_signed_int }
+        };
+    }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSll<Other> {
+        type Result;
+        unsafe fn vec_sll(self, b: Other) -> Self::Result;
+    }
+
+    impl_vec_shift_long! { [VectorSll vec_sll] (vsl) }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSrl<Other> {
+        type Result;
+        unsafe fn vec_srl(self, b: Other) -> Self::Result;
+    }
+
+    impl_vec_shift_long! { [VectorSrl vec_srl] (vsr) }
+
+    macro_rules! impl_vec_shift_octect {
+        ([$Trait:ident $m:ident] ($f:ident)) => {
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_char, vector_signed_char) -> vector_unsigned_char }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_char, vector_signed_char) -> vector_signed_char }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_short, vector_signed_char) -> vector_unsigned_short }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_short, vector_signed_char) -> vector_signed_short }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_int, vector_signed_char) -> vector_unsigned_int }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_int, vector_signed_char) -> vector_signed_int }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_float, vector_signed_char) -> vector_float }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_char, vector_unsigned_char) -> vector_unsigned_char }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_char, vector_unsigned_char) -> vector_signed_char }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_short, vector_unsigned_char) -> vector_unsigned_short }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_short, vector_unsigned_char) -> vector_signed_short }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_unsigned_int, vector_unsigned_char) -> vector_unsigned_int }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_signed_int, vector_unsigned_char) -> vector_signed_int }
+            impl_vec_trait!{ [$Trait $m]+ $f (vector_float, vector_unsigned_char) -> vector_float }
+        };
+    }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSlo<Other> {
+        type Result;
+        unsafe fn vec_slo(self, b: Other) -> Self::Result;
+    }
+
+    impl_vec_shift_octect! { [VectorSlo vec_slo] (vslo) }
+
+    #[unstable(feature = "stdarch_powerpc", issue = "111145")]
+    pub trait VectorSro<Other> {
+        type Result;
+        unsafe fn vec_sro(self, b: Other) -> Self::Result;
+    }
+
+    impl_vec_shift_octect! { [VectorSro vec_sro] (vsro) }
 }
 
 /// Vector Merge Low
@@ -2648,6 +3064,179 @@ where
     a.vec_unpackl()
 }
 
+/// Vector Shift Left (forwards to the sealed `VectorSl` implementation for `T`).
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sl<T, U>(a: T, b: U) -> <T as sealed::VectorSl<U>>::Result
+where
+    T: sealed::VectorSl<U>,
+{
+    <T as sealed::VectorSl<U>>::vec_sl(a, b)
+}
+
+/// Vector Shift Right (forwards to the sealed `VectorSr` implementation for `T`).
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sr<T, U>(a: T, b: U) -> <T as sealed::VectorSr<U>>::Result
+where
+    T: sealed::VectorSr<U>,
+{
+    <T as sealed::VectorSr<U>>::vec_sr(a, b)
+}
+
+/// Vector Shift Right Algebraic (forwards to the sealed `VectorSra` implementation for `T`).
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sra<T, U>(a: T, b: U) -> <T as sealed::VectorSra<U>>::Result
+where
+    T: sealed::VectorSra<U>,
+{
+    <T as sealed::VectorSra<U>>::vec_sra(a, b)
+}
+
+/// Vector Shift Left Double
+///
+/// ## Endian considerations
+///
+/// `vec_sld` is not endian-neutral: big-endian code that uses it has to be
+/// rewritten before it will behave as intended on little-endian targets.
+///
+/// Historically, `vec_sld` accepted shift amounts that are not a multiple of the element size
+/// for most types; the intent of such a shift is hard to determine and hard to rewrite
+/// automatically and efficiently for little-endian.
+///
+/// For that reason `a` and `b` are concatenated in big-endian fashion (left to right) and the
+/// shift is always to the left; results on little-endian targets are generally surprising.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sld<T, const UIMM4: i32>(a: T, b: T) -> T
+where
+    T: sealed::VectorSld,
+{
+    <T as sealed::VectorSld>::vec_sld::<UIMM4>(a, b)
+}
+
+/// Vector Shift Left Double by Words
+///
+/// ## Endian considerations
+///
+/// This intrinsic is not endian-neutral, so uses of `vec_sldw` in
+/// big-endian code must be rewritten for little-endian targets.
+///
+/// The concatenation of `a` and `b` is done in big-endian fashion (left to right), and the shift
+/// is always to the left. This will generally produce surprising results for little-endian targets.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sldw<T, const UIMM2: i32>(a: T, b: T) -> T
+where
+    T: sealed::VectorSld,
+{
+    a.vec_sldw::<UIMM2>(b)
+}
+
+/// Vector Shift Left Long
+///
+/// ## Endian considerations
+/// `vec_sll` is not endian-neutral: big-endian code that uses it has to be
+/// rewritten before it will behave as intended on little-endian targets.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sll<T, U>(a: T, b: U) -> <T as sealed::VectorSll<U>>::Result
+where
+    T: sealed::VectorSll<U>,
+{
+    <T as sealed::VectorSll<U>>::vec_sll(a, b)
+}
+
+/// Vector Shift Right Long
+///
+/// ## Endian considerations
+/// `vec_srl` is not endian-neutral: big-endian code that uses it has to be
+/// rewritten before it will behave as intended on little-endian targets.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_srl<T, U>(a: T, b: U) -> <T as sealed::VectorSrl<U>>::Result
+where
+    T: sealed::VectorSrl<U>,
+{
+    <T as sealed::VectorSrl<U>>::vec_srl(a, b)
+}
+
+/// Vector Shift Left by Octets
+///
+/// ## Endian considerations
+/// `vec_slo` is not endian-neutral, so big-endian code that uses it must be rewritten for
+/// little-endian targets: the shift count is taken from element 15 of `b` on big-endian, but
+/// from element 0 of `b` on little-endian.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_slo<T, U>(a: T, b: U) -> <T as sealed::VectorSlo<U>>::Result
+where
+    T: sealed::VectorSlo<U>,
+{
+    <T as sealed::VectorSlo<U>>::vec_slo(a, b)
+}
+
+/// Vector Shift Right by Octets
+///
+/// ## Endian considerations
+/// `vec_sro` is not endian-neutral, so big-endian code that uses it must be rewritten for
+/// little-endian targets: the shift count is taken from element 15 of `b` on big-endian, but
+/// from element 0 of `b` on little-endian.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_sro<T, U>(a: T, b: U) -> <T as sealed::VectorSro<U>>::Result
+where
+    T: sealed::VectorSro<U>,
+{
+    <T as sealed::VectorSro<U>>::vec_sro(a, b)
+}
+
+/// Vector Shift Left Variable
+///
+/// ## Result value
+/// Let `v` be a 17-byte vector formed from `a` in bytes `[0:15]` and a zero byte in element 16.
+/// Then each byte element `i` of the result `r` is determined as follows. The start bit `sb` is
+/// obtained from bits 5:7 of byte element `i` of `b`. Then the contents of bits `sb:sb+7` of the
+/// halfword in byte elements `i:i+1` of `v` are placed into byte element `i` of `r`.
+///
+/// ## Endian considerations
+/// All bit and byte element numbers are specified in big-endian order. This intrinsic is not
+/// endian-neutral.
+#[inline]
+#[target_feature(enable = "power9-altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_slv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char {
+    vslv(a, b)
+}
+
+/// Vector Shift Right Variable
+///
+/// ## Result value
+/// Let `v` be a 17-byte vector formed from a zero byte in element 0 and the elements of `a` in
+/// bytes `[1:16]`. Then each byte element `i` of `r` is determined as follows. The start bit
+/// `sb` is obtained from bits 5:7 of byte element `i` of `b`. Then the contents of bits
+/// `(8 - sb):(15 - sb)` of the halfword in bytes `i:i+1` of `v` are placed into byte `i` of `r`.
+///
+/// ## Endian considerations
+/// All bit and byte element numbers are specified in big-endian order. This intrinsic is not
+/// endian-neutral.
+#[inline]
+#[target_feature(enable = "power9-altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_srv(a: vector_unsigned_char, b: vector_unsigned_char) -> vector_unsigned_char {
+    vsrv(a, b)
+}
+
 /// Vector Load Indexed.
 #[inline]
 #[target_feature(enable = "altivec")]
@@ -2692,6 +3281,17 @@ where
     p.vec_xl(off)
 }
 
+/// VSX Unaligned Store of `v`, forwarded to the sealed `VectorXst` implementation for `T`.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_xst<T>(v: T, off: isize, p: <T as sealed::VectorXst>::Out)
+where
+    T: sealed::VectorXst,
+{
+    <T as sealed::VectorXst>::vec_xst(v, off, p)
+}
+
 /// Vector Base-2 Logarithm Estimate
 #[inline]
 #[target_feature(enable = "altivec")]
@@ -2766,6 +3366,21 @@ where
     a.vec_cmpeq(b)
 }
 
+/// Vector Compare Not Equal
+///
+/// ## Result value
+/// Every bit of an element of `r` is set to 1 when the corresponding elements of `a` and `b`
+/// differ, and to 0 when they are equal.
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_cmpne<T, U>(a: T, b: U) -> <T as sealed::VectorCmpNe<U>>::Result
+where
+    T: sealed::VectorCmpNe<U>,
+{
+    <T as sealed::VectorCmpNe<U>>::vec_cmpne(a, b)
+}
+
 /// Vector cmpb.
 #[inline]
 #[target_feature(enable = "altivec")]
@@ -2901,11 +3516,11 @@ where
 }
 
 splat! { vec_splat_u8, u8, u8x16 [vspltisb, "Vector Splat to Unsigned Byte"] }
-splat! { vec_splat_i8, i8, i8x16 [vspltisb, "Vector Splat to Signed Byte"] }
+splat! { vec_splat_s8, i8, i8x16 [vspltisb, "Vector Splat to Signed Byte"] }
 splat! { vec_splat_u16, u16, u16x8 [vspltish, "Vector Splat to Unsigned Halfword"] }
-splat! { vec_splat_i16, i16, i16x8 [vspltish, "Vector Splat to Signed Halfword"] }
+splat! { vec_splat_s16, i16, i16x8 [vspltish, "Vector Splat to Signed Halfword"] }
 splat! { vec_splat_u32, u32, u32x4 [vspltisw, "Vector Splat to Unsigned Word"] }
-splat! { vec_splat_i32, i32, i32x4 [vspltisw, "Vector Splat to Signed Word"] }
+splat! { vec_splat_s32, i32, i32x4 [vspltisw, "Vector Splat to Signed Word"] }
 
 /// Vector splats.
 #[inline]
@@ -2982,6 +3597,22 @@ where
     a.vec_add(b)
 }
 
+/// Vector Add Extended
+///
+/// ## Result value
+/// Each element of `r` is the sum of the corresponding elements of `a` and `b` plus the
+/// carry-in given by the corresponding element of `c` (1 if there is a carry, 0
+/// otherwise).
+#[inline]
+#[target_feature(enable = "altivec")]
+#[unstable(feature = "stdarch_powerpc", issue = "111145")]
+pub unsafe fn vec_adde<T>(a: T, b: T, c: T) -> T
+where
+    T: sealed::VectorAdde,
+{
+    <T as sealed::VectorAdde>::vec_adde(a, b, c)
+}
+
 /// Vector Convert to Floating-Point
 #[inline]
 #[target_feature(enable = "altivec")]
@@ -3579,6 +4210,21 @@ mod tests {
         }
     }
 
+    #[simd_test(enable = "altivec")]
+    unsafe fn test_vec_xst() {
+        // Store the byte pattern 0..=15 at every offset in 0..16 of a scratch buffer and read it back.
+        let pattern = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let v: vector_unsigned_char = transmute(pattern);
+        for off in 0..16 {
+            let mut buf = [0u8; 32];
+            let dst = buf.as_mut_ptr().offset(off);
+            vec_xst(v, 0, dst);
+            for (i, &byte) in buf[off as usize..].iter().take(16).enumerate() {
+                assert_eq!(i as u8, byte);
+            }
+        }
+    }
+
     #[simd_test(enable = "altivec")]
     unsafe fn test_vec_ldl() {
         let pat = [
@@ -3721,6 +4367,42 @@ mod tests {
         [false, true, true, false]
     }
 
+    test_vec_2! { test_vec_cmpne_i8, vec_cmpne, i8x16 -> m8x16,
+        [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false]
+    }
+
+    test_vec_2! { test_vec_cmpne_u8, vec_cmpne, u8x16 -> m8x16,
+        [1, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false]
+    }
+
+    test_vec_2! { test_vec_cmpne_i16, vec_cmpne, i16x8 -> m16x8,
+        [1, -1, 0, 0, 0, 0, 0, 0],
+        [0, 0, -1, 1, 0, 0, 0, 0],
+        [true, true, true, true, false, false, false, false]
+    }
+
+    test_vec_2! { test_vec_cmpne_u16, vec_cmpne, u16x8 -> m16x8,
+        [1, 255, 0, 0, 0, 0, 0, 0],
+        [0, 0, 255, 1, 0, 0, 0, 0],
+        [true, true, true, true, false, false, false, false]
+    }
+
+    test_vec_2! { test_vec_cmpne_i32, vec_cmpne, i32x4 -> m32x4,
+        [1, -1, 0, 0],
+        [0, -1, 0, 1],
+        [true, false, false, true]
+    }
+
+    test_vec_2! { test_vec_cmpne_u32, vec_cmpne, u32x4 -> m32x4,
+        [1, 255, 0, 0],
+        [0, 255,  0, 1],
+        [true, false, false, true]
+    }
+
     test_vec_2! { test_vec_all_eq_i8_false, vec_all_eq, i8x16 -> bool,
         [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
@@ -4769,9 +5451,9 @@ mod tests {
     test_vec_splat! { test_vec_splat_u8, vec_splat_u8, u8x16, -1, u8::MAX }
     test_vec_splat! { test_vec_splat_u16, vec_splat_u16, u16x8, -1, u16::MAX }
     test_vec_splat! { test_vec_splat_u32, vec_splat_u32, u32x4, -1, u32::MAX }
-    test_vec_splat! { test_vec_splat_i8, vec_splat_i8, i8x16, -1, -1 }
-    test_vec_splat! { test_vec_splat_i16, vec_splat_i16, i16x8, -1, -1 }
-    test_vec_splat! { test_vec_splat_i32, vec_splat_i32, i32x4, -1, -1 }
+    test_vec_splat! { test_vec_splat_s8, vec_splat_s8, i8x16, -1, -1 }
+    test_vec_splat! { test_vec_splat_s16, vec_splat_s16, i16x8, -1, -1 }
+    test_vec_splat! { test_vec_splat_s32, vec_splat_s32, i32x4, -1, -1 }
 
     macro_rules! test_vec_sub {
         { $name: ident, $ty: ident, [$($a:expr),+], [$($b:expr),+], [$($d:expr),+] } => {
diff --git a/crates/core_arch/src/simd_llvm.rs b/crates/core_arch/src/simd_llvm.rs
index decdecaaf4..5b6cd0b93b 100644
--- a/crates/core_arch/src/simd_llvm.rs
+++ b/crates/core_arch/src/simd_llvm.rs
@@ -24,6 +24,7 @@ extern "platform-intrinsic" {
     pub fn simd_sub<T>(x: T, y: T) -> T;
     pub fn simd_mul<T>(x: T, y: T) -> T;
     pub fn simd_div<T>(x: T, y: T) -> T;
+    pub fn simd_rem<T>(x: T, y: T) -> T;
     pub fn simd_shl<T>(x: T, y: T) -> T;
     pub fn simd_shr<T>(x: T, y: T) -> T;
     pub fn simd_and<T>(x: T, y: T) -> T;
diff --git a/crates/intrinsic-test/missing_arm.txt b/crates/intrinsic-test/missing_arm.txt
index 7439cd6e66..0ea4cec406 100644
--- a/crates/intrinsic-test/missing_arm.txt
+++ b/crates/intrinsic-test/missing_arm.txt
@@ -14,8 +14,6 @@ vbfmlaltq_laneq_f32
 vbfmmlaq_f32
 
 # Implemented in Clang and stdarch for A64 only even though CSV claims A32 support
-__crc32d
-__crc32cd
 vaddq_p64
 vbsl_p64
 vbslq_p64
diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs
index 828ac5c38a..f4f45750ed 100644
--- a/crates/std_detect/src/detect/arch/x86.rs
+++ b/crates/std_detect/src/detect/arch/x86.rs
@@ -75,6 +75,7 @@ features! {
     /// * `"avx512bitalg"`
     /// * `"avx512bf16"`
     /// * `"avx512vp2intersect"`
+    /// * `"avx512fp16"`
     /// * `"f16c"`
     /// * `"fma"`
     /// * `"bmi1"`
@@ -169,6 +170,8 @@ features! {
     /// AVX-512 BF16 (BFLOAT16 instructions)
     @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512vp2intersect: "avx512vp2intersect";
     /// AVX-512 P2INTERSECT
+    @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
+    /// AVX-512 FP16 (FLOAT16 instructions)
     @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
     /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
     @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";
diff --git a/crates/std_detect/src/detect/mod.rs b/crates/std_detect/src/detect/mod.rs
index 5ce4e54e23..c938abf17d 100644
--- a/crates/std_detect/src/detect/mod.rs
+++ b/crates/std_detect/src/detect/mod.rs
@@ -66,6 +66,9 @@ cfg_if! {
     } else if #[cfg(all(target_os = "windows", target_arch = "aarch64"))] {
         #[path = "os/windows/aarch64.rs"]
         mod os;
+    } else if #[cfg(all(target_os = "macos", target_arch = "aarch64", feature = "libc"))] {
+        #[path = "os/macos/aarch64.rs"]
+        mod os;
     } else {
         #[path = "os/other.rs"]
         mod os;
diff --git a/crates/std_detect/src/detect/os/macos/aarch64.rs b/crates/std_detect/src/detect/os/macos/aarch64.rs
new file mode 100644
index 0000000000..d7ebd956d6
--- /dev/null
+++ b/crates/std_detect/src/detect/os/macos/aarch64.rs
@@ -0,0 +1,98 @@
+//! Run-time feature detection for aarch64 on macOS.
+
+use crate::detect::{cache, Feature};
+
+/// Reads the sysctl value `name` into an `i32` and reports whether it is non-zero.
+///
+/// `name` must end in `\0` because it is handed to C as-is; any failure yields `false`.
+#[inline]
+fn _sysctlbyname(name: &str) -> bool {
+    use libc;
+    // Refuse unterminated names rather than let C read past the end of the slice.
+    if !name.ends_with('\0') {
+        return false;
+    }
+    let mut enabled: i32 = 0;
+    let mut enabled_len = core::mem::size_of::<i32>();
+    let ret = unsafe {
+        libc::sysctlbyname(
+            name.as_ptr() as *const libc::c_char,
+            &mut enabled as *mut i32 as *mut libc::c_void,
+            &mut enabled_len,
+            core::ptr::null_mut(),
+            0,
+        )
+    };
+    ret == 0 && enabled != 0
+}
+
+/// Detects CPU features by probing `hw.optional.*` sysctl keys with `_sysctlbyname`.
+pub(crate) fn detect_features() -> cache::Initializer {
+    let mut value = cache::Initializer::default();
+    // Sets the bit for `f` in `value` only when its probe came back `true`.
+    let mut enable_feature = |f, enable| {
+        if enable {
+            value.set(f as u32);
+        }
+    };
+    // Probe every key up front; `_sysctlbyname` yields `false` if the lookup fails.
+    let asimd = _sysctlbyname("hw.optional.AdvSIMD\0");
+    let pmull = _sysctlbyname("hw.optional.arm.FEAT_PMULL\0");
+    let fp = _sysctlbyname("hw.optional.floatingpoint\0");
+    let fp16 = _sysctlbyname("hw.optional.arm.FEAT_FP16\0");
+    let crc = _sysctlbyname("hw.optional.armv8_crc32\0");
+    let lse = _sysctlbyname("hw.optional.arm.FEAT_LSE\0");
+    let lse2 = _sysctlbyname("hw.optional.arm.FEAT_LSE2\0");
+    let rdm = _sysctlbyname("hw.optional.arm.FEAT_RDM\0");
+    let rcpc = _sysctlbyname("hw.optional.arm.FEAT_LRCPC\0");
+    let rcpc2 = _sysctlbyname("hw.optional.arm.FEAT_LRCPC2\0");
+    let dotprod = _sysctlbyname("hw.optional.arm.FEAT_DotProd\0");
+    let fhm = _sysctlbyname("hw.optional.arm.FEAT_FHM\0");
+    let flagm = _sysctlbyname("hw.optional.arm.FEAT_FlagM\0");
+    let ssbs = _sysctlbyname("hw.optional.arm.FEAT_SSBS\0");
+    let sb = _sysctlbyname("hw.optional.arm.FEAT_SB\0");
+    let paca = _sysctlbyname("hw.optional.arm.FEAT_PAuth\0");
+    let dpb = _sysctlbyname("hw.optional.arm.FEAT_DPB\0");
+    let dpb2 = _sysctlbyname("hw.optional.arm.FEAT_DPB2\0");
+    let frintts = _sysctlbyname("hw.optional.arm.FEAT_FRINTTS\0");
+    let i8mm = _sysctlbyname("hw.optional.arm.FEAT_I8MM\0");
+    let bf16 = _sysctlbyname("hw.optional.arm.FEAT_BF16\0");
+    let bti = _sysctlbyname("hw.optional.arm.FEAT_BTI\0");
+    let fcma = _sysctlbyname("hw.optional.arm.FEAT_FCMA\0");
+    let aes = _sysctlbyname("hw.optional.arm.FEAT_AES\0");
+    let sha1 = _sysctlbyname("hw.optional.arm.FEAT_SHA1\0");
+    let sha2 = _sysctlbyname("hw.optional.arm.FEAT_SHA256\0");
+    let sha3 = _sysctlbyname("hw.optional.arm.FEAT_SHA3\0");
+    let sha512 = _sysctlbyname("hw.optional.arm.FEAT_SHA512\0");
+    let jsconv = _sysctlbyname("hw.optional.arm.FEAT_JSCVT\0");
+    // `sha2` and `sha3` are composite: each needs both of its sysctl keys and `asimd`.
+    enable_feature(Feature::asimd, asimd);
+    enable_feature(Feature::pmull, pmull);
+    enable_feature(Feature::fp, fp);
+    enable_feature(Feature::fp16, fp16);
+    enable_feature(Feature::crc, crc);
+    enable_feature(Feature::lse, lse);
+    enable_feature(Feature::lse2, lse2);
+    enable_feature(Feature::rdm, rdm);
+    enable_feature(Feature::rcpc, rcpc);
+    enable_feature(Feature::rcpc2, rcpc2);
+    enable_feature(Feature::dotprod, dotprod);
+    enable_feature(Feature::fhm, fhm);
+    enable_feature(Feature::flagm, flagm);
+    enable_feature(Feature::ssbs, ssbs);
+    enable_feature(Feature::sb, sb);
+    enable_feature(Feature::paca, paca);
+    enable_feature(Feature::dpb, dpb);
+    enable_feature(Feature::dpb2, dpb2);
+    enable_feature(Feature::frintts, frintts);
+    enable_feature(Feature::i8mm, i8mm);
+    enable_feature(Feature::bf16, bf16);
+    enable_feature(Feature::bti, bti);
+    enable_feature(Feature::fcma, fcma);
+    enable_feature(Feature::aes, aes);
+    enable_feature(Feature::jsconv, jsconv);
+    enable_feature(Feature::sha2, sha1 && sha2 && asimd);
+    enable_feature(Feature::sha3, sha512 && sha3 && asimd);
+
+    value
+}
diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs
index d8dd84db49..4ff9ac5f13 100644
--- a/crates/std_detect/src/detect/os/x86.rs
+++ b/crates/std_detect/src/detect/os/x86.rs
@@ -69,12 +69,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
 
     // EAX = 7, ECX = 0: Queries "Extended Features";
     // Contains information about bmi,bmi2, and avx2 support.
-    let (extended_features_ebx, extended_features_ecx) = if max_basic_leaf >= 7 {
-        let CpuidResult { ebx, ecx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
-        (ebx, ecx)
-    } else {
-        (0, 0) // CPUID does not support "Extended Features"
-    };
+    let (extended_features_ebx, extended_features_ecx, extended_features_edx) =
+        if max_basic_leaf >= 7 {
+            let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
+            (ebx, ecx, edx)
+        } else {
+            (0, 0, 0) // CPUID does not support "Extended Features"
+        };
 
     // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
     // - EAX returns the max leaf value for extended information, that is,
@@ -217,6 +218,7 @@ pub(crate) fn detect_features() -> cache::Initializer {
                         enable(extended_features_ecx, 11, Feature::avx512vnni);
                         enable(extended_features_ecx, 12, Feature::avx512bitalg);
                         enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);
+                        enable(extended_features_edx, 23, Feature::avx512fp16);
                     }
                 }
             }
diff --git a/crates/std_detect/src/lib.rs b/crates/std_detect/src/lib.rs
index 9bdd647313..19cc021712 100644
--- a/crates/std_detect/src/lib.rs
+++ b/crates/std_detect/src/lib.rs
@@ -13,7 +13,7 @@
 //! * `powerpc64`: [`is_powerpc64_feature_detected`]
 //! * `loongarch`: [`is_loongarch_feature_detected`]
 
-#![stable(feature = "stdsimd", since = "1.27.0")]
+#![unstable(feature = "stdarch_internal", issue = "none")]
 #![feature(staged_api, doc_cfg, allow_internal_unstable)]
 #![deny(rust_2018_idioms)]
 #![allow(clippy::shadow_reuse)]
@@ -23,7 +23,14 @@
 // Temporary hack: needed to build against toolchains from before the mass feature renaming.
 // Remove this as soon as the stdarch submodule is updated on nightly.
 #![allow(stable_features)]
-#![feature(stdsimd)]
+#![cfg_attr(not(feature = "rustc-dep-of-std"), feature(stdsimd))]
+#![cfg_attr(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        feature = "rustc-dep-of-std"
+    ),
+    feature(stdarch_x86_has_cpuid)
+)]
 
 #[cfg(test)]
 #[macro_use]
@@ -35,5 +42,5 @@ extern crate std;
 extern crate alloc;
 
 #[doc(hidden)]
-#[stable(feature = "stdsimd", since = "1.27.0")]
+#[unstable(feature = "stdarch_internal", issue = "none")]
 pub mod detect;
diff --git a/crates/std_detect/tests/cpu-detection.rs b/crates/std_detect/tests/cpu-detection.rs
index cb57b849d6..1053de3a82 100644
--- a/crates/std_detect/tests/cpu-detection.rs
+++ b/crates/std_detect/tests/cpu-detection.rs
@@ -139,6 +139,38 @@ fn aarch64_bsd() {
     println!("sha2: {:?}", is_aarch64_feature_detected!("sha2"));
 }
 
+#[test]
+#[cfg(all(target_arch = "aarch64", target_os = "macos"))]
+fn aarch64_macos() {
+    println!("asimd: {:?}", is_aarch64_feature_detected!("asimd"));
+    println!("fp: {:?}", is_aarch64_feature_detected!("fp"));
+    println!("fp16: {:?}", is_aarch64_feature_detected!("fp16"));
+    println!("pmull: {:?}", is_aarch64_feature_detected!("pmull"));
+    println!("crc: {:?}", is_aarch64_feature_detected!("crc"));
+    println!("lse: {:?}", is_aarch64_feature_detected!("lse"));
+    println!("lse2: {:?}", is_aarch64_feature_detected!("lse2"));
+    println!("rdm: {:?}", is_aarch64_feature_detected!("rdm"));
+    println!("rcpc: {:?}", is_aarch64_feature_detected!("rcpc"));
+    println!("rcpc2: {:?}", is_aarch64_feature_detected!("rcpc2"));
+    println!("dotprod: {:?}", is_aarch64_feature_detected!("dotprod"));
+    println!("fhm: {:?}", is_aarch64_feature_detected!("fhm"));
+    println!("flagm: {:?}", is_aarch64_feature_detected!("flagm"));
+    println!("ssbs: {:?}", is_aarch64_feature_detected!("ssbs"));
+    println!("sb: {:?}", is_aarch64_feature_detected!("sb"));
+    println!("paca: {:?}", is_aarch64_feature_detected!("paca"));
+    println!("dpb: {:?}", is_aarch64_feature_detected!("dpb"));
+    println!("dpb2: {:?}", is_aarch64_feature_detected!("dpb2"));
+    println!("frintts: {:?}", is_aarch64_feature_detected!("frintts"));
+    println!("i8mm: {:?}", is_aarch64_feature_detected!("i8mm"));
+    println!("bf16: {:?}", is_aarch64_feature_detected!("bf16"));
+    println!("bti: {:?}", is_aarch64_feature_detected!("bti"));
+    println!("fcma: {:?}", is_aarch64_feature_detected!("fcma"));
+    println!("jsconv: {:?}", is_aarch64_feature_detected!("jsconv"));
+    println!("aes: {:?}", is_aarch64_feature_detected!("aes"));
+    println!("sha2: {:?}", is_aarch64_feature_detected!("sha2"));
+    println!("sha3: {:?}", is_aarch64_feature_detected!("sha3"));
+}
+
 #[test]
 #[cfg(all(target_arch = "powerpc", target_os = "linux"))]
 fn powerpc_linux() {
@@ -201,6 +233,7 @@ fn x86_all() {
         "avx512vp2intersect {:?}",
         is_x86_feature_detected!("avx512vp2intersect")
     );
+    println!("avx512fp16 {:?}", is_x86_feature_detected!("avx512fp16"));
     println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
     println!("fma: {:?}", is_x86_feature_detected!("fma"));
     println!("bmi1: {:?}", is_x86_feature_detected!("bmi1"));
diff --git a/crates/std_detect/tests/macro_trailing_commas.rs b/crates/std_detect/tests/macro_trailing_commas.rs
index 8304b225f5..d37629ec0a 100644
--- a/crates/std_detect/tests/macro_trailing_commas.rs
+++ b/crates/std_detect/tests/macro_trailing_commas.rs
@@ -1,3 +1,15 @@
+#![allow(internal_features)]
+#![cfg_attr(
+    any(
+        target_arch = "arm",
+        target_arch = "aarch64",
+        target_arch = "x86",
+        target_arch = "x86_64",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    ),
+    feature(stdarch_internal)
+)]
 #![cfg_attr(target_arch = "arm", feature(stdarch_arm_feature_detection))]
 #![cfg_attr(target_arch = "powerpc", feature(stdarch_powerpc_feature_detection))]
 #![cfg_attr(target_arch = "powerpc64", feature(stdarch_powerpc_feature_detection))]
diff --git a/crates/std_detect/tests/x86-specific.rs b/crates/std_detect/tests/x86-specific.rs
index 54bcab7b1e..ae7f677ed4 100644
--- a/crates/std_detect/tests/x86-specific.rs
+++ b/crates/std_detect/tests/x86-specific.rs
@@ -1,4 +1,6 @@
 #![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#![allow(internal_features)]
+#![feature(stdarch_internal)]
 
 extern crate cupid;
 #[macro_use]
@@ -49,6 +51,7 @@ fn dump() {
         "avx512vp2intersect {:?}",
         is_x86_feature_detected!("avx512vp2intersect")
     );
+    println!("avx512fp16 {:?}", is_x86_feature_detected!("avx512fp16"));
     println!("fma: {:?}", is_x86_feature_detected!("fma"));
     println!("abm: {:?}", is_x86_feature_detected!("abm"));
     println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
diff --git a/examples/connect5.rs b/examples/connect5.rs
index 53e9b8124d..a569689fad 100644
--- a/examples/connect5.rs
+++ b/examples/connect5.rs
@@ -28,9 +28,10 @@
 //! You should see a game self-playing. In the end of the game, it shows the average time for
 //! each move.
 
+#![allow(internal_features)]
 #![feature(avx512_target_feature)]
-#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512))]
-#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))]
+#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512, stdarch_internal))]
+#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512, stdarch_internal))]
 #![feature(stmt_expr_attributes)]
 
 use rand::seq::SliceRandom;
diff --git a/examples/hex.rs b/examples/hex.rs
index 490556e8bf..b73a306f5f 100644
--- a/examples/hex.rs
+++ b/examples/hex.rs
@@ -12,8 +12,13 @@
 //!
 //! and you should see `746573740a` get printed out.
 
+#![allow(internal_features)]
 #![feature(wasm_target_feature)]
 #![cfg_attr(test, feature(test))]
+#![cfg_attr(
+    any(target_arch = "x86", target_arch = "x86_64"),
+    feature(stdarch_internal)
+)]
 #![allow(
     clippy::unwrap_used,
     clippy::print_stdout,