dotnet · kg · May 19, 2023 · May 18, 2023 · May 18, 2023
@@ -3245,13 +3245,65 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
                 builder.appendU8(WasmOpcode.i32_eqz);
             append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
             return true;
+        case SimdIntrinsic3.V128_I2_SHUFFLE:
+        case SimdIntrinsic3.V128_I4_SHUFFLE:
+            // FIXME: I8
+            // FIXME: Many uses of these shuffles have constant shuffle indices,
+            //  which we could convert into bytes at compile time for vastly improved performance
+            return emit_shuffle(builder, ip, index === SimdIntrinsic3.V128_I2_SHUFFLE ? 8 : 4);
         default:
             return false;
     }
 
     return false;
 }
 
+// Implement variable-index i16 and i32 shuffles on top of wasm's only variable shuffle opcode
+//  (i8x16.swizzle, which works on bytes) by expanding each element index into the indices of
+//  the bytes that make up that element.
+// builder: receives the generated opcodes.
+// ip: the interpreter opcode; argument 2 is read as the offset of the source vector and
+//  argument 3 as the offset of the index vector.
+// elementCount: number of elements in the 128-bit vector. Only 8 (i16) and 4 (i32) are
+//  supported; anything else trips the assert below.
+// Always returns true once the code has been emitted.
+// An index that is out of range (negative, or >= elementCount when treated as unsigned)
+//  produces a zero element in the result.
+function emit_shuffle(builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number): boolean {
+    const elementSize = 16 / elementCount;
+    mono_assert((elementSize === 2) || (elementSize === 4), "Unsupported shuffle element size");
+    // NOTE(review): pLocals is presumably the base address consumed by append_simd_store at
+    //  the end of this function - confirm against that helper.
+    builder.local("pLocals");
+    // Load vec
+    append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+    // Load indices (one index per element, each elementSize bytes wide)
+    append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+    // Clamp every index to elementCount using an unsigned min before narrowing. Without this:
+    //  - the narrow opcodes treat their input as signed and saturate, so a negative index
+    //    would become 0 and select element 0;
+    //  - i8x16.shl wraps modulo 256, so an index like 128 (i16) or 64 (i32) would scale to
+    //    byte 0 and also select element 0.
+    // After clamping, every out-of-range index is exactly elementCount, which scales to byte
+    //  16 and is zeroed by the final swizzle.
+    // The constant is elementCount splatted into every element (little-endian bytes).
+    builder.appendSimd(WasmSimdOpcode.v128_const);
+    for (let i = 0; i < elementCount; i++) {
+        for (let j = 0; j < elementSize; j++)
+            builder.appendU8(j === 0 ? elementCount : 0);
+    }
+    builder.appendSimd(elementCount === 4 ? WasmSimdOpcode.i32x4_min_u : WasmSimdOpcode.i16x8_min_u);
+    // There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
+    if (elementCount === 4) {
+        // Load a zero vector (narrow takes two vectors), then
+        // i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
+        builder.i52_const(0);
+        builder.appendSimd(WasmSimdOpcode.i64x2_splat);
+        builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
+    }
+    // Load a zero vector (narrow takes two vectors)
+    builder.i52_const(0);
+    builder.appendSimd(WasmSimdOpcode.i64x2_splat);
+    // i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
+    builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
+    // Replicate each narrowed index across all the bytes of its element:
+    //  i16: i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
+    //  i32: i8{0, 1, 2, 3 ...} -> i8{0, 0, 0, 0, 1, 1, 1, 1 ...}
+    builder.appendSimd(WasmSimdOpcode.v128_const);
+    for (let i = 0; i < elementCount; i++) {
+        for (let j = 0; j < elementSize; j++)
+            builder.appendU8(i);
+    }
+    builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
+    // multiply indices by elementSize (shift left by 1 for i16, 2 for i32) to scale from
+    //  element indices to byte indices
+    builder.i32_const(elementCount === 4 ? 2 : 1);
+    builder.appendSimd(WasmSimdOpcode.i8x16_shl);
+    // now add each byte's offset within its element (0 .. elementSize - 1)
+    builder.appendSimd(WasmSimdOpcode.v128_const);
+    for (let i = 0; i < elementCount; i++) {
+        for (let j = 0; j < elementSize; j++)
+            builder.appendU8(j);
+    }
+    // we can do a bitwise or instead of an add since the shift above left the low bits of
+    //  every lane clear
+    builder.appendSimd(WasmSimdOpcode.v128_or);
+    // we now have two vectors on the stack, the values and the byte indices
+    builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
+    append_simd_store(builder, ip);
+    return true;
+}
+
+
 function emit_simd_4(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic4): boolean {
     const simple = <WasmSimdOpcode>cwraps.mono_jiterp_get_simd_opcode(3, index);
     if (simple) {