From 48a9d2ad97698d4aee872c1632f04e8354ef54ea Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Tue, 20 Apr 2021 17:39:21 +0200
Subject: [PATCH 01/10] Add support for avx512.

---
 lib/BUILD                               |   43 +
 lib/simmux.h                            |    8 +-
 lib/simulator_avx512.h                  | 8878 +++++++++++++++++++++++
 lib/statespace_avx512.h                 |  423 ++
 lib/umux.h                              |   10 +-
 lib/unitary_calculator_avx512.h         | 6385 ++++++++++++++++
 lib/unitaryspace_avx512.h               |  110 +
 tests/BUILD                             |   71 +-
 tests/make.sh                           |    4 +
 tests/simulator_avx512_test.cc          |   84 +
 tests/statespace_avx512_test.cc         |  105 +
 tests/statespace_avx_test.cc            |    4 +
 tests/statespace_basic_test.cc          |    4 +
 tests/statespace_sse_test.cc            |    4 +
 tests/statespace_testfixture.h          |  236 +-
 tests/unitary_calculator_avx512_test.cc |   65 +
 tests/unitary_calculator_testfixture.h  |    3 +-
 tests/unitaryspace_avx512_test.cc       |   46 +
 18 files changed, 16366 insertions(+), 117 deletions(-)
 create mode 100644 lib/simulator_avx512.h
 create mode 100644 lib/statespace_avx512.h
 create mode 100644 lib/unitary_calculator_avx512.h
 create mode 100644 lib/unitaryspace_avx512.h
 create mode 100644 tests/simulator_avx512_test.cc
 create mode 100644 tests/statespace_avx512_test.cc
 create mode 100644 tests/unitary_calculator_avx512_test.cc
 create mode 100644 tests/unitaryspace_avx512_test.cc

diff --git a/lib/BUILD b/lib/BUILD
index 3bfca5f90..4a960e1a0 100644
--- a/lib/BUILD
+++ b/lib/BUILD
@@ -33,18 +33,22 @@ cc_library(
         "seqfor.h",
         "simmux.h",
         "simulator_avx.h",
+        "simulator_avx512.h",
         "simulator_basic.h",
         "simulator_sse.h",
         "statespace_avx.h",
+        "statespace_avx512.h",
         "statespace_basic.h",
         "statespace_sse.h",
         "statespace.h",
         "umux.h",
         "unitaryspace.h",
         "unitaryspace_avx.h",
+        "unitaryspace_avx512.h",
         "unitaryspace_basic.h",
         "unitaryspace_sse.h",
         "unitary_calculator_avx.h",
+        "unitary_calculator_avx512.h",
         "unitary_calculator_basic.h",
         "unitary_calculator_sse.h",
         "util.h",
@@ -75,10 +79,12 @@ cc_library(
         "seqfor.h",
         "simmux.h",
         "simulator_avx.h",
+        "simulator_avx512.h",
         "simulator_basic.h",
         "simulator_sse.h",
         "statespace.h",
         "statespace_avx.h",
+        "statespace_avx512.h",
         "statespace_basic.h",
         "statespace_sse.h",
         "umux.h",
@@ -118,10 +124,12 @@ cc_library(
         "seqfor.h",
         "simmux.h",
         "simulator_avx.h",
+        "simulator_avx512.h",
         "simulator_basic.h",
         "simulator_sse.h",
         "statespace.h",
         "statespace_avx.h",
+        "statespace_avx512.h",
         "statespace_basic.h",
         "statespace_sse.h",
         "util.h",
@@ -330,6 +338,15 @@ cc_library(
     ],
 )
 
+cc_library(  # Header-only target (hdrs, no srcs) for the AVX512 state-space header.
+    name = "statespace_avx512",
+    hdrs = ["statespace_avx512.h"],
+    deps = [
+        ":statespace",
+        ":util",
+    ],
+)
+
 cc_library(
     name = "statespace_basic",
     hdrs = ["statespace_basic.h"],
@@ -359,6 +376,15 @@ cc_library(
     ],
 )
 
+cc_library(  # Header-only target (hdrs, no srcs) for the AVX512 simulator header.
+    name = "simulator_avx512",
+    hdrs = ["simulator_avx512.h"],
+    deps = [
+        ":bits",  # simulator_avx512.h includes "bits.h".
+        ":statespace_avx512",  # simulator_avx512.h includes "statespace_avx512.h".
+    ],
+)
+
 cc_library(
     name = "simulator_basic",
     hdrs = ["simulator_basic.h"],
@@ -383,6 +409,7 @@ cc_library(
     hdrs = ["simmux.h"],
     deps = [
         ":simulator_avx",
+        ":simulator_avx512",
         ":simulator_basic",
         ":simulator_sse",
     ],
@@ -450,6 +477,12 @@ cc_library(
     deps = [":unitaryspace"],
 )
 
+cc_library(  # Header-only target (hdrs, no srcs) for the AVX512 unitary-space header.
+    name = "unitaryspace_avx512",
+    hdrs = ["unitaryspace_avx512.h"],
+    deps = [":unitaryspace"],
+)
+
 cc_library(
     name = "unitaryspace_basic",
     hdrs = ["unitaryspace_basic.h"],
@@ -473,6 +506,15 @@ cc_library(
     ],
 )
 
+cc_library(  # Header-only target (hdrs, no srcs) for the AVX512 unitary calculator header.
+    name = "unitary_calculator_avx512",
+    hdrs = ["unitary_calculator_avx512.h"],
+    deps = [
+        ":bits",
+        ":unitaryspace_avx512",
+    ],
+)
+
 cc_library(
     name = "unitary_calculator_basic",
     hdrs = ["unitary_calculator_basic.h"],
@@ -498,6 +540,7 @@ cc_library(
     hdrs = ["umux.h"],
     deps = [
         ":unitary_calculator_avx",
+        ":unitary_calculator_avx512",
         ":unitary_calculator_basic",
         ":unitary_calculator_sse",
     ],
diff --git a/lib/simmux.h b/lib/simmux.h
index 626eab1cc..d3c4074ef 100644
--- a/lib/simmux.h
+++ b/lib/simmux.h
@@ -15,7 +15,13 @@
 #ifndef SIMMUX_H_
 #define SIMMUX_H_
 
-#ifdef __AVX2__
+#ifdef __AVX512F__
+# include "simulator_avx512.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorAVX512<For>;
+  }
+#elif __AVX2__
 # include "simulator_avx.h"
   namespace qsim {
     template <typename For>
diff --git a/lib/simulator_avx512.h b/lib/simulator_avx512.h
new file mode 100644
index 000000000..ddce44db3
--- /dev/null
+++ b/lib/simulator_avx512.h
@@ -0,0 +1,8878 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_AVX512_H_
+#define SIMULATOR_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+
+#include "bits.h"
+#include "statespace_avx512.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with AVX512 vectorization.
+ */
+template <typename For>
+class SimulatorAVX512 final {
+ public:
+  using StateSpace = StateSpaceAVX512<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // Constructor: every argument is used to initialize the for_ member.
+  explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {}  // args are passed on as lvalues (no std::forward).
+
+  /**
+   * Applies a gate using AVX512 instructions (1 to 6 target qubits; any other count is silently ignored).
+   * @param qs Indices of the qubits affected by this gate, assumed to be in strictly ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // Dispatch on gate size, then on how many targets have index > 3.
+    case 1:
+      if (qs[0] > 3) {  // Kernel suffix: one H per target with index > 3, one L per target <= 3.
+        ApplyGate1H(qs, matrix, state);
+      } else {
+        ApplyGate1L(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {  // The smallest index is > 3, so (given ascending order) all targets are.
+        ApplyGate2HH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate2HL(qs, matrix, state);
+      } else {
+        ApplyGate2LL(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        ApplyGate3HHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate3HHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate3HLL(qs, matrix, state);
+      } else {
+        ApplyGate3LLL(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        ApplyGate4HHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate4HHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate4HHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGate4HLLL(qs, matrix, state);
+      } else {
+        ApplyGate4LLLL(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        ApplyGate5HHHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate5HHHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate5HHHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGate5HHLLL(qs, matrix, state);
+      } else {  // Five strictly ascending indices cannot all be <= 3, so qs[4] > 3 here.
+        ApplyGate5HLLLL(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        ApplyGate6HHHHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate6HHHHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate6HHHHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGate6HHHLLL(qs, matrix, state);
+      } else {  // Six strictly ascending indices: at most four are <= 3, so two are > 3.
+        ApplyGate6HHLLLL(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: nothing is applied and state is left unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using AVX512 instructions (1 to 4 target qubits; other counts are silently ignored).
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits; when empty, the call is forwarded to ApplyGate.
+   * @param cmask Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, State& state) const {
+    if (cqs.size() == 0) {  // No controls: this is a plain gate application.
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {  // Targets pick the H/L prefix as in ApplyGate; cqs[0] > 3 picks the _H or _L suffix.
+    case 1:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {  // Only cqs[0] is tested; presumably cqs is ascending like qs (confirm with callers).
+          ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[3] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: nothing is applied and state is left unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using AVX512 instructions (operators on 1 to 6 qubits).
+   * @param qs Indices of the qubits the operator acts on, assumed to be in strictly ascending order.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value; 0 when qs.size() is not in the range 1 to 6.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // Same H/L dispatch as ApplyGate: H = index > 3, L = index <= 3.
+    case 1:
+      if (qs[0] > 3) {
+        return ExpectationValue1H(qs, matrix, state);
+      } else {
+        return ExpectationValue1L(qs, matrix, state);
+      }
+      break;  // Unreachable: both branches above return.
+    case 2:
+      if (qs[0] > 3) {
+        return ExpectationValue2HH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValue2HL(qs, matrix, state);
+      } else {
+        return ExpectationValue2LL(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        return ExpectationValue3HHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValue3HHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValue3HLL(qs, matrix, state);
+      } else {
+        return ExpectationValue3LLL(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        return ExpectationValue4HHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValue4HHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValue4HHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValue4HLLL(qs, matrix, state);
+      } else {
+        return ExpectationValue4LLLL(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        return ExpectationValue5HHHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValue5HHHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValue5HHHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValue5HHLLL(qs, matrix, state);
+      } else {  // Five strictly ascending indices cannot all be <= 3, so qs[4] > 3 here.
+        return ExpectationValue5HLLLL(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        return ExpectationValue6HHHHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValue6HHHHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValue6HHHHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValue6HHHLLL(qs, matrix, state);
+      } else {  // Six strictly ascending indices: at most four are <= 3, so two are > 3.
+        return ExpectationValue6HHLLLL(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: falls through to the return below.
+      break;
+    }
+
+    return 0;  // Only reached for unsupported operator sizes.
+  }
+
+  /**
+   * @return The size of SIMD register if applicable: 16, the number of 32-bit floats in one __m512.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 16;
+  }
+
+ private:
+  void ApplyGate1H(const std::vector<unsigned>& qs,  // One-qubit gate kernel; ApplyGate selects it when qs[0] > 3.
+                   const fp_type* matrix, State& state) const {
+    uint64_t xs[1];  // Offset (in fp_type elements) contributed by the target bit.
+    uint64_t ms[2];  // Bit masks used in f to build the base amplitude index k.
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 * 2^qs[0]; the factor 2 matches the 2 * k pointer scaling in f.
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // Index bits below the target bit.
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // Index bits above the target bit.
+
+    uint64_t xss[2];  // xss[i]: sum of the xs offsets for the bits set in i.
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // Parameters n and m are not read in this body.
+                const uint64_t* ms, const uint64_t* xss,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // Broadcast matrix element (re, im) and output accumulators (re, im).
+      __m512 rs[2], is[2];  // Input real / imaginary registers for target bit 0 and 1.
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);  // Bits of 16 * i spread around the target bit, which stays 0.
+
+      auto p0 = rstate + 2 * k;
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts ...
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // ... followed by the matching 16 imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into v; advances by 2 per (re, im) matrix element, row by row.
+
+      for (unsigned l = 0; l < 2; ++l) {  // One output row of the 2x2 matrix per iteration.
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // re = rs * ru - is * iu
+        in = _mm512_fmadd_ps(is[0], ru, in);  // im = rs * iu + is * ru
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {  // This n shadows the lambda parameter n.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Safe in place: all inputs were loaded into rs/is first.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // Each call of f covers 2^5 amplitudes (2 registers of 16).
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, rstate);  // presumably calls f(n, m, i, ...) for each i < size — confirm in For.
+  }
+
+  void ApplyGate1L(const std::vector<unsigned>& qs,  // One-qubit gate kernel; ApplyGate selects it when qs[0] <= 3.
+                   const fp_type* matrix, State& state) const {
+    unsigned p[16];  // Per-lane scratch indices (16 lanes per __m512).
+    __m512i idx[1];  // Lane permutation consumed by _mm512_permutexvar_ps in f.
+
+    auto s = StateSpace::Create(5);  // Scratch storage backing w. NOTE(review): allocated on every call.
+    __m512* w = (__m512*) s.get();  // Per-lane copy of the matrix; 4 registers (64 floats) are written below.
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);  // Lane-index bit that corresponds to the target qubit.
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // presumably lane j with its target bit advanced by i + 1 — confirm MaskedAdd.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from lane 15 down to lane 0.
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the target-bit value of lane j — confirm in bits.h.
+          p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2);  // Index in matrix of the real part used by lane j.
+        }
+
+        unsigned l = 2 * (2 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Parameters n and m are not read in this body.
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // Output accumulators (re, im).
+      __m512 rs[2], is[2];  // [0]: loaded block; [1]: the same block with lanes permuted by idx[0].
+
+      auto p0 = rstate + 32 * i;  // Block i: 16 real floats followed by 16 imaginary floats.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0);
+        is[2 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 per (re, im) register pair.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // re = rs * wr - is * wi
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // im = rs * wi + is * wr
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {  // This n shadows the lambda parameter n.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;  // Each call of f covers 2^4 amplitudes (one register of 16).
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, idx, rstate);  // presumably calls f(n, m, i, ...) for each i < size — confirm in For.
+  }
+
+  void ApplyGate2HH(const std::vector<unsigned>& qs,  // Two-qubit gate kernel; ApplyGate selects it when qs[0] > 3.
+                    const fp_type* matrix, State& state) const {
+    uint64_t xs[2];  // Offsets (in fp_type elements) contributed by each target bit.
+    uint64_t ms[3];  // Bit masks used in f to build the base amplitude index k.
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 * 2^qs[0]; the factor 2 matches the 2 * k pointer scaling in f.
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // Index bits below qs[0].
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // Index bits strictly between the two targets.
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // Index bits above qs[1].
+
+    uint64_t xss[4];  // xss[i]: sum of the xs offsets for the bits set in i.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // Parameters n and m are not read in this body.
+                const uint64_t* ms, const uint64_t* xss,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // Broadcast matrix element (re, im) and output accumulators (re, im).
+      __m512 rs[4], is[4];  // Input real / imaginary registers, one pair per target-bit pattern.
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);  // Both target bits stay 0 in k.
+
+      auto p0 = rstate + 2 * k;
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts ...
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // ... followed by the matching 16 imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into v; advances by 2 per (re, im) matrix element, row by row.
+
+      for (unsigned l = 0; l < 4; ++l) {  // One output row of the 4x4 matrix per iteration.
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // re = rs * ru - is * iu
+        in = _mm512_fmadd_ps(is[0], ru, in);  // im = rs * iu + is * ru
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // This n shadows the lambda parameter n.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Safe in place: all inputs were loaded into rs/is first.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;  // Each call of f covers 2^6 amplitudes (4 registers of 16).
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, rstate);  // presumably calls f(n, m, i, ...) for each i < size — confirm in For.
+  }
+
+  void ApplyGate2HL(const std::vector<unsigned>& qs,  // Two-qubit gate kernel; ApplyGate selects it when qs[0] <= 3 < qs[1].
+                    const fp_type* matrix, State& state) const {
+    uint64_t xs[1];  // Offset (in fp_type elements) contributed by the high target bit qs[1].
+    uint64_t ms[2];  // Bit masks used in f to build the base amplitude index k.
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // 2 * 2^qs[1]; the factor 2 matches the 2 * k pointer scaling in f.
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // Index bits below qs[1].
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // Index bits above qs[1].
+
+    uint64_t xss[2];  // xss[i]: sum of the xs offsets for the bits set in i.
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // Per-lane scratch indices (16 lanes per __m512).
+    __m512i idx[1];  // Lane permutation consumed by _mm512_permutexvar_ps in f.
+
+    auto s = StateSpace::Create(7);  // Scratch storage backing w. NOTE(review): allocated on every call.
+    __m512* w = (__m512*) s.get();  // Per-lane copy of the matrix; 16 registers (256 floats) are written below.
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);  // Lane-index bit that corresponds to the low target qubit.
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // presumably lane j with its target bit advanced by i + 1 — confirm MaskedAdd.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from lane 15 down to lane 0.
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the low-target-bit value of lane j — confirm in bits.h.
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);  // Index in matrix of the real part used by lane j.
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Parameters n and m are not read in this body.
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // Output accumulators (re, im).
+      __m512 rs[4], is[4];  // Even slots: loaded registers; odd slots: the same data permuted by idx[0].
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);  // Bits of 16 * i spread around bit qs[1], which stays 0.
+
+      auto p0 = rstate + 2 * k;
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts ...
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // ... followed by the matching 16 imaginary parts.
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 per (re, im) register pair.
+
+      for (unsigned l = 0; l < 2; ++l) {  // One stored register pair per value of the high target bit.
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // re = rs * wr - is * wi
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // im = rs * wi + is * wr
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // This n shadows the lambda parameter n.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Safe in place: all inputs were loaded into rs/is first.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // Each call of f covers 2^5 amplitudes (2 loaded registers of 16).
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);  // presumably calls f(n, m, i, ...) for each i < size — confirm in For.
+  }
+
+  void ApplyGate2LL(const std::vector<unsigned>& qs,  // Two-qubit gate kernel; ApplyGate selects it when qs[1] <= 3.
+                    const fp_type* matrix, State& state) const {
+    unsigned p[16];  // Per-lane scratch indices (16 lanes per __m512).
+    __m512i idx[3];  // Three lane permutations consumed by _mm512_permutexvar_ps in f.
+
+    auto s = StateSpace::Create(6);  // Scratch storage backing w. NOTE(review): allocated on every call.
+    __m512* w = (__m512*) s.get();  // Per-lane copy of the matrix; 8 registers (128 floats) are written below.
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // Lane-index bits that correspond to the two target qubits.
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // presumably lane j with its target bits advanced by i + 1 — confirm MaskedAdd.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from lane 15 down to lane 0.
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the two target-bit values of lane j — confirm in bits.h.
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);  // Index in matrix of the real part used by lane j.
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Parameters n and m are not read in this body.
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // Output accumulators (re, im).
+      __m512 rs[4], is[4];  // [0]: loaded block; [1..3]: the same block permuted by idx[0..2].
+
+      auto p0 = rstate + 32 * i;  // Block i: 16 real floats followed by 16 imaginary floats.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);
+        is[4 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 per (re, im) register pair.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // re = rs * wr - is * wi
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // im = rs * wi + is * wr
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // This n shadows the lambda parameter n.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;  // Each call of f covers 2^4 amplitudes (one register of 16).
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, idx, rstate);  // presumably calls f(n, m, i, ...) for each i < size — confirm in For.
+  }
+
+  void ApplyGate3HHH(const std::vector<unsigned>& qs,  // Three-qubit gate kernel; ApplyGate selects it when qs[0] > 3.
+                     const fp_type* matrix, State& state) const {
+    uint64_t xs[3];  // Offsets (in fp_type elements) contributed by each target bit.
+    uint64_t ms[4];  // Bit masks used in f to build the base amplitude index k.
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 * 2^qs[0]; the factor 2 matches the 2 * k pointer scaling in f.
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // Index bits below qs[0].
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // Index bits strictly between qs[i - 1] and qs[i].
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);  // Index bits above qs[2].
+
+    uint64_t xss[8];  // xss[i]: sum of the xs offsets for the bits set in i.
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // Parameters n and m are not read in this body.
+                const uint64_t* ms, const uint64_t* xss,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // Broadcast matrix element (re, im) and output accumulators (re, im).
+      __m512 rs[8], is[8];  // Input real / imaginary registers, one pair per target-bit pattern.
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])  // All three target bits stay 0 in k.
+          | (128 * i & ms[3]);
+
+      auto p0 = rstate + 2 * k;
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts ...
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // ... followed by the matching 16 imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into v; advances by 2 per (re, im) matrix element, row by row.
+
+      for (unsigned l = 0; l < 8; ++l) {  // One output row of the 8x8 matrix per iteration.
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // re = rs * ru - is * iu
+        in = _mm512_fmadd_ps(is[0], ru, in);  // im = rs * iu + is * ru
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // This n shadows the lambda parameter n.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Safe in place: all inputs were loaded into rs/is first.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7;  // Each call of f covers 2^7 amplitudes (8 registers of 16).
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, rstate);  // presumably calls f(n, m, i, ...) for each i < size — confirm in For.
+  }
+
+  void ApplyGate3HHL(const std::vector<unsigned>& qs,  // 3-qubit gate kernel: qs[0] is handled by in-register lane permutation, qs[1] and qs[2] by memory offsets.
+                     const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[2];  // fp_type offsets 2 * 2^q for q = qs[1], qs[2]
+    uint64_t ms[3];  // masks used to scatter the loop index around bits qs[1] and qs[2]
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // bits below qs[1]
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // bits above qs[2], up to num_qubits
+
+    uint64_t xss[4];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[1];  // index vector(s) later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(9);  // backing storage for w; the fill loop below writes 1024 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // bit of the 16-lane index selected by qs[0] (presumably qs[0] < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 4; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[8], is[8];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);  // spreads the bits of 16 * i, leaving bits qs[1] and qs[2] zero
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // lane-permuted copy of the loaded vector
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;  // each kernel call covers 4 blocks of 16 lanes = 2^6 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  void ApplyGate3HLL(const std::vector<unsigned>& qs,  // 3-qubit gate kernel: qs[0] and qs[1] are handled by in-register lane permutation, qs[2] by a memory offset.
+                     const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[1];  // fp_type offset 2 * 2^qs[2]
+    uint64_t ms[2];  // masks used to scatter the loop index around bit qs[2]
+
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;  // bits below qs[2]
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // bits above qs[2], up to num_qubits
+
+    uint64_t xss[2];  // xss[i] = sum of xs[k] over the set bits k of i, i.e. {0, xs[0]}
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[3];  // index vectors later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(8);  // backing storage for w; the fill loop below writes 512 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // bits of the 16-lane index selected by qs[0], qs[1] (presumably both < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[8], is[8];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);  // spreads the bits of 16 * i, leaving bit qs[2] zero
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // lane-permuted copies of the loaded vector
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // each kernel call covers 2 blocks of 16 lanes = 2^5 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  void ApplyGate3LLL(const std::vector<unsigned>& qs,  // 3-qubit gate kernel: all three qubits are handled by in-register lane permutation.
+                     const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[7];  // index vectors later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(7);  // backing storage for w; the fill loop below writes 256 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);  // bits of the 16-lane index selected by the gate qubits (presumably all < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of the output block
+      __m512 rs[8], is[8];  // real / imaginary input vectors
+
+      auto p0 = rstate + 32 * i;  // call i works on one 32-float block: 16 real parts then 16 imaginary parts
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[8 * l] = _mm512_load_ps(p0 + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);  // lane-permuted copies of the loaded vector
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per input vector
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // results overwrite the block that was loaded
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;  // each kernel call covers one block of 16 lanes = 2^4 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, idx, rstate);
+  }
+
+  void ApplyGate4HHHH(const std::vector<unsigned>& qs,  // 4-qubit gate kernel: all four qubits are handled by memory offsets, with no lane permutation.
+                      const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[4];  // fp_type offsets 2 * 2^q for q = qs[0..3]
+    uint64_t ms[5];  // masks used to scatter the loop index around the four qubit bits
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // bits below qs[0]
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);  // bits above qs[3], up to num_qubits
+
+    uint64_t xss[16];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // kernel handed to for_.Run; v is the gate matrix; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // ru/iu: broadcast matrix entry (re/im); rn/in: output accumulators
+      __m512 rs[16], is[16];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])  // spreads the bits of 16 * i, leaving the four qubit bits zero
+          | (128 * i & ms[3]) | (256 * i & ms[4]);
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+      }
+
+      uint64_t j = 0;  // running index into v, advanced by 2 per matrix entry
+
+      for (unsigned l = 0; l < 16; ++l) {
+        ru = _mm512_set1_ps(v[j]);  // broadcast the real part of the matrix entry to all lanes
+        iu = _mm512_set1_ps(v[j + 1]);  // broadcast the imaginary part
+        rn = _mm512_mul_ps(rs[0], ru);  // complex multiply-accumulate: rn = sum(rs * ru - is * iu)
+        in = _mm512_mul_ps(rs[0], iu);  // in = sum(rs * iu + is * ru)
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 8;  // each kernel call covers 16 blocks of 16 lanes = 2^8 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, matrix, ms, xss, rstate);
+  }
+
+  void ApplyGate4HHHL(const std::vector<unsigned>& qs,  // 4-qubit gate kernel: qs[0] is handled by in-register lane permutation, qs[1..3] by memory offsets.
+                      const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[3];  // fp_type offsets 2 * 2^q for q = qs[1..3]
+    uint64_t ms[4];  // masks used to scatter the loop index around bits qs[1..3]
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // bits below qs[1]
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);  // bits above qs[3], up to num_qubits
+
+    uint64_t xss[8];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[1];  // index vector(s) later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(11);  // backing storage for w; the fill loop below writes 4096 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // bit of the 16-lane index selected by qs[0] (presumably qs[0] < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 8; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (16 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[16], is[16];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])  // spreads the bits of 16 * i, leaving bits qs[1..3] zero
+          | (128 * i & ms[3]);
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // lane-permuted copy of the loaded vector
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7;  // each kernel call covers 8 blocks of 16 lanes = 2^7 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  void ApplyGate4HHLL(const std::vector<unsigned>& qs,  // 4-qubit gate kernel: qs[0] and qs[1] are handled by in-register lane permutation, qs[2] and qs[3] by memory offsets.
+                      const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[2];  // fp_type offsets 2 * 2^q for q = qs[2], qs[3]
+    uint64_t ms[3];  // masks used to scatter the loop index around bits qs[2] and qs[3]
+
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;  // bits below qs[2]
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // bits above qs[3], up to num_qubits
+
+    uint64_t xss[4];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[3];  // index vectors later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(10);  // backing storage for w; the fill loop below writes 2048 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // bits of the 16-lane index selected by qs[0], qs[1] (presumably both < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 4; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (16 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[16], is[16];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);  // spreads the bits of 16 * i, leaving bits qs[2] and qs[3] zero
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // lane-permuted copies of the loaded vector
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;  // each kernel call covers 4 blocks of 16 lanes = 2^6 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  void ApplyGate4HLLL(const std::vector<unsigned>& qs,  // 4-qubit gate kernel: qs[0..2] are handled by in-register lane permutation, qs[3] by a memory offset.
+                      const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[1];  // fp_type offset 2 * 2^qs[3]
+    uint64_t ms[2];  // masks used to scatter the loop index around bit qs[3]
+
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;  // bits below qs[3]
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // bits above qs[3], up to num_qubits
+
+    uint64_t xss[2];  // xss[i] = sum of xs[k] over the set bits k of i, i.e. {0, xs[0]}
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[7];  // index vectors later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(9);  // backing storage for w; the fill loop below writes 1024 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);  // bits of the 16-lane index selected by qs[0..2] (presumably all < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (16 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[16], is[16];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);  // spreads the bits of 16 * i, leaving bit qs[3] zero
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);  // lane-permuted copies of the loaded vector
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // each kernel call covers 2 blocks of 16 lanes = 2^5 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  void ApplyGate4LLLL(const std::vector<unsigned>& qs,  // 4-qubit gate kernel: all four qubits are handled by in-register lane permutation.
+                      const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[15];  // index vectors later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(8);  // backing storage for w; the fill loop below writes 512 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);  // bits of the 16-lane index selected by the gate qubits (presumably all < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (16 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of the output block
+      __m512 rs[16], is[16];  // real / imaginary input vectors
+
+      auto p0 = rstate + 32 * i;  // call i works on one 32-float block: 16 real parts then 16 imaginary parts
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[16 * l] = _mm512_load_ps(p0 + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);  // lane-permuted copies of the loaded vector
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per input vector
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // results overwrite the block that was loaded
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;  // each kernel call covers one block of 16 lanes = 2^4 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, idx, rstate);
+  }
+
+  void ApplyGate5HHHHH(const std::vector<unsigned>& qs,  // 5-qubit gate kernel: all five qubits are handled by memory offsets, with no lane permutation.
+                       const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[5];  // fp_type offsets 2 * 2^q for q = qs[0..4]
+    uint64_t ms[6];  // masks used to scatter the loop index around the five qubit bits
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // bits below qs[0]
+    for (unsigned i = 1; i < 5; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1);  // bits above qs[4], up to num_qubits
+
+    uint64_t xss[32];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 32; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 5; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // kernel handed to for_.Run; v is the gate matrix; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // ru/iu: broadcast matrix entry (re/im); rn/in: output accumulators
+      __m512 rs[32], is[32];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])  // spreads the bits of 16 * i, leaving the five qubit bits zero
+          | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]);
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 32; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+      }
+
+      uint64_t j = 0;  // running index into v, advanced by 2 per matrix entry
+
+      for (unsigned l = 0; l < 32; ++l) {
+        ru = _mm512_set1_ps(v[j]);  // broadcast the real part of the matrix entry to all lanes
+        iu = _mm512_set1_ps(v[j + 1]);  // broadcast the imaginary part
+        rn = _mm512_mul_ps(rs[0], ru);  // complex multiply-accumulate: rn = sum(rs * ru - is * iu)
+        in = _mm512_mul_ps(rs[0], iu);  // in = sum(rs * iu + is * ru)
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 9;  // each kernel call covers 32 blocks of 16 lanes = 2^9 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, matrix, ms, xss, rstate);
+  }
+
+  void ApplyGate5HHHHL(const std::vector<unsigned>& qs,  // 5-qubit gate kernel: qs[0] is handled by in-register lane permutation, qs[1..4] by memory offsets.
+                       const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[4];  // fp_type offsets 2 * 2^q for q = qs[1..4]
+    uint64_t ms[5];  // masks used to scatter the loop index around bits qs[1..4]
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // bits below qs[1]
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);  // bits above qs[4], up to num_qubits
+
+    uint64_t xss[16];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[1];  // index vector(s) later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(13);  // backing storage for w; the fill loop below writes 16384 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // bit of the 16-lane index selected by qs[0] (presumably qs[0] < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 16; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (32 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[32], is[32];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])  // spreads the bits of 16 * i, leaving bits qs[1..4] zero
+          | (128 * i & ms[3]) | (256 * i & ms[4]);
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // lane-permuted copy of the loaded vector
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 16; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 8;  // each kernel call covers 16 blocks of 16 lanes = 2^8 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  void ApplyGate5HHHLL(const std::vector<unsigned>& qs,  // 5-qubit gate kernel: qs[0] and qs[1] are handled by in-register lane permutation, qs[2..4] by memory offsets.
+                       const fp_type* matrix, State& state) const {  // matrix is read as interleaved (re, im) values; state is updated in place.
+    uint64_t xs[3];  // fp_type offsets 2 * 2^q for q = qs[2..4]
+    uint64_t ms[4];  // masks used to scatter the loop index around bits qs[2..4]
+
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;  // bits below qs[2]
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);  // bits between the previous qubit and this one (assumes ascending qs -- TODO confirm)
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);  // bits above qs[4], up to num_qubits
+
+    uint64_t xss[8];  // xss[i] = sum of xs[k] over the set bits k of i
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // per-lane scratch: first permutation indices, then matrix indices
+    __m512i idx[3];  // index vectors later passed to _mm512_permutexvar_ps
+
+    auto s = StateSpace::Create(12);  // backing storage for w; the fill loop below writes 8192 fp_type values into it
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // scalar view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // bits of the 16-lane index selected by qs[0], qs[1] (presumably both < 4 -- verify against caller)
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // qmask bits come from MaskedAdd (defined elsewhere); other bits of j are kept
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 8; ++i) {  // re-lay the matrix out as per-lane vectors in w
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // defined elsewhere; presumably packs the qmask bits of lane j -- TODO confirm
+          p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4);  // even index: position of the real part in matrix
+        }
+
+        unsigned l = 2 * (32 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // kernel handed to for_.Run; the n and m parameters are not used in the body
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real and imaginary parts of one output block
+      __m512 rs[32], is[32];  // real / imaginary input vectors
+
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])  // spreads the bits of 16 * i, leaving bits qs[2..4] zero
+          | (128 * i & ms[3]);
+
+      auto p0 = rstate + 2 * k;  // two fp_type values per unit of k
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts that follow them
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // lane-permuted copies of the loaded vector
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w, advanced by 2 per (output, input) pair
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex multiply-accumulate: rn = sum(rs * w_re - is * w_im)
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = sum(rs * w_im + is * w_re)
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {  // this n is a new loop variable that shadows the lambda parameter
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the locations that were loaded
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7;  // each kernel call covers 8 blocks of 16 lanes = 2^7 amplitudes
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of kernel calls requested (1 when num_qubits <= k)
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a 5-qubit gate given as a 32x32 matrix of interleaved
+  // (real, imaginary) fp_type pairs. Target qubits qs[3] and qs[4] select
+  // between separately loaded vectors; target qubits qs[0], qs[1] and qs[2]
+  // are handled by permuting lanes within a vector.
+  void ApplyGate5HHLLL(const std::vector<unsigned>& qs,
+                       const fp_type* matrix, State& state) const {
+    uint64_t xs[2];
+    uint64_t ms[3];
+
+    // xs[t]: offset (in fp_type units) contributed by target qubit qs[t + 3].
+    // ms[t]: index bits strictly between consecutive such qubits (below the
+    // first one for t = 0, above the last one for the final entry).
+    uint64_t below = 0;
+    for (unsigned t = 0; t < 2; ++t) {
+      const unsigned q = qs[t + 3];
+      xs[t] = uint64_t{1} << (q + 1);
+      ms[t] = ((uint64_t{1} << q) - 1) ^ below;
+      below = xs[t] - 1;
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ below;
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[4];
+    for (unsigned c = 0; c < 4; ++c) {
+      uint64_t offset = 0;
+      for (unsigned t = 0; t < 2; ++t) {
+        if ((c >> t) & 1) {
+          offset += xs[t];
+        }
+      }
+      xss[c] = offset;
+    }
+
+    unsigned perm[16];
+    __m512i idx[7];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(11);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bits that correspond to qs[0], qs[1] and qs[2].
+    unsigned qmask = 0;
+    for (unsigned t = 0; t < 3; ++t) {
+      qmask |= 1 << qs[t];
+    }
+
+    // idx[r]: source lane for every destination lane, built from
+    // MaskedAdd(lane, r + 1, qmask, 8) with the non-qmask lane bits kept.
+    for (unsigned r = 0; r < 7; ++r) {
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 8) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(
+          perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+          perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+          perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Fill w: for every (i, m) pair one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        fp_type* re = wf + 16 * (2 * (32 * i + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned src = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8);
+
+          re[lane] = matrix[src];
+          im[lane] = matrix[src + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[32], is[32];
+
+      // Spread the loop index i over the bits selected by ms[0..2].
+      uint64_t base = 0;
+      for (unsigned t = 0; t < 3; ++t) {
+        base |= (i << (t + 4)) & ms[t];
+      }
+
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load one (real, imaginary) vector pair per offset and derive the
+      // remaining seven pairs of each group by lane permutation.
+      for (unsigned g = 0; g < 4; ++g) {
+        const unsigned first = 8 * g;
+        const fp_type* src = p0 + xss[g];
+
+        rs[first] = _mm512_load_ps(src);
+        is[first] = _mm512_load_ps(src + 16);
+
+        for (unsigned r = 1; r < 8; ++r) {
+          rs[first + r] = _mm512_permutexvar_ps(idx[r - 1], rs[first]);
+          is[first + r] = _mm512_permutexvar_ps(idx[r - 1], is[first]);
+        }
+      }
+
+      // wp walks through w two vectors (real, imaginary) at a time.
+      const __m512* wp = w;
+
+      for (unsigned row = 0; row < 4; ++row) {
+        __m512 rn = _mm512_mul_ps(rs[0], wp[0]);
+        __m512 in = _mm512_mul_ps(rs[0], wp[1]);
+        rn = _mm512_fnmadd_ps(is[0], wp[1], rn);
+        in = _mm512_fmadd_ps(is[0], wp[0], in);
+        wp += 2;
+
+        for (unsigned c = 1; c < 32; ++c, wp += 2) {
+          rn = _mm512_fmadd_ps(rs[c], wp[0], rn);
+          in = _mm512_fmadd_ps(rs[c], wp[1], in);
+          rn = _mm512_fnmadd_ps(is[c], wp[1], rn);
+          in = _mm512_fmadd_ps(is[c], wp[0], in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 6) indices (a single one if the
+    // state has no more than 6 qubits).
+    const unsigned consumed = 6;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a 5-qubit gate given as a 32x32 matrix of interleaved
+  // (real, imaginary) fp_type pairs. Target qubit qs[4] selects between two
+  // separately loaded vectors; target qubits qs[0..3] are handled by
+  // permuting lanes within a vector.
+  void ApplyGate5HLLLL(const std::vector<unsigned>& qs,
+                       const fp_type* matrix, State& state) const {
+    // Offset (in fp_type units) contributed by target qubit qs[4].
+    const uint64_t stride = uint64_t{1} << (qs[4] + 1);
+
+    // ms[0]: index bits below qs[4]; ms[1]: index bits above qs[4].
+    uint64_t ms[2];
+    ms[0] = (uint64_t{1} << qs[4]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (stride - 1);
+
+    // Offsets of the two loaded vector pairs.
+    uint64_t xss[2] = {0, stride};
+
+    unsigned perm[16];
+    __m512i idx[15];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(10);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bits that correspond to qs[0..3].
+    unsigned qmask = 0;
+    for (unsigned t = 0; t < 4; ++t) {
+      qmask |= 1 << qs[t];
+    }
+
+    // idx[r]: source lane for every destination lane, built from
+    // MaskedAdd(lane, r + 1, qmask, 16) with the non-qmask lane bits kept.
+    for (unsigned r = 0; r < 15; ++r) {
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 16) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(
+          perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+          perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+          perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Fill w: for every (i, m) pair one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        fp_type* re = wf + 16 * (2 * (32 * i + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned src =
+              2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16);
+
+          re[lane] = matrix[src];
+          im[lane] = matrix[src + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[32], is[32];
+
+      // Spread the loop index i over the bits selected by ms[0] and ms[1].
+      const uint64_t base = ((i << 4) & ms[0]) | ((i << 5) & ms[1]);
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load one (real, imaginary) vector pair per offset and derive the
+      // remaining fifteen pairs of each group by lane permutation.
+      for (unsigned g = 0; g < 2; ++g) {
+        const unsigned first = 16 * g;
+        const fp_type* src = p0 + xss[g];
+
+        rs[first] = _mm512_load_ps(src);
+        is[first] = _mm512_load_ps(src + 16);
+
+        for (unsigned r = 1; r < 16; ++r) {
+          rs[first + r] = _mm512_permutexvar_ps(idx[r - 1], rs[first]);
+          is[first + r] = _mm512_permutexvar_ps(idx[r - 1], is[first]);
+        }
+      }
+
+      // wp walks through w two vectors (real, imaginary) at a time.
+      const __m512* wp = w;
+
+      for (unsigned row = 0; row < 2; ++row) {
+        __m512 rn = _mm512_mul_ps(rs[0], wp[0]);
+        __m512 in = _mm512_mul_ps(rs[0], wp[1]);
+        rn = _mm512_fnmadd_ps(is[0], wp[1], rn);
+        in = _mm512_fmadd_ps(is[0], wp[0], in);
+        wp += 2;
+
+        for (unsigned c = 1; c < 32; ++c, wp += 2) {
+          rn = _mm512_fmadd_ps(rs[c], wp[0], rn);
+          in = _mm512_fmadd_ps(rs[c], wp[1], in);
+          rn = _mm512_fnmadd_ps(is[c], wp[1], rn);
+          in = _mm512_fmadd_ps(is[c], wp[0], in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 5) indices (a single one if the
+    // state has no more than 5 qubits).
+    const unsigned consumed = 5;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a 6-qubit gate given as a 64x64 matrix of interleaved
+  // (real, imaginary) fp_type pairs. All six target qubits qs[0..5] select
+  // between separately loaded vectors, so no lane permutation is needed and
+  // every matrix entry is broadcast to all lanes.
+  void ApplyGate6HHHHHH(const std::vector<unsigned>& qs,
+                        const fp_type* matrix, State& state) const {
+    uint64_t xs[6];
+    uint64_t ms[7];
+
+    // xs[t]: offset (in fp_type units) contributed by target qubit qs[t].
+    // ms[t]: index bits strictly between consecutive target qubits (below
+    // the first one for t = 0, above the last one for the final entry).
+    uint64_t below = 0;
+    for (unsigned t = 0; t < 6; ++t) {
+      const unsigned q = qs[t];
+      xs[t] = uint64_t{1} << (q + 1);
+      ms[t] = ((uint64_t{1} << q) - 1) ^ below;
+      below = xs[t] - 1;
+    }
+    ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ below;
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[64];
+    for (unsigned c = 0; c < 64; ++c) {
+      uint64_t offset = 0;
+      for (unsigned t = 0; t < 6; ++t) {
+        if ((c >> t) & 1) {
+          offset += xs[t];
+        }
+      }
+      xss[c] = offset;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                fp_type* rstate) {
+      __m512 rs[64], is[64];
+
+      // Spread the loop index i over the bits selected by ms[0..6].
+      uint64_t base = 0;
+      for (unsigned t = 0; t < 7; ++t) {
+        base |= (i << (t + 4)) & ms[t];
+      }
+
+      fp_type* p0 = rstate + 2 * base;
+
+      // Read all 64 (real, imaginary) vector pairs before any is overwritten.
+      for (unsigned c = 0; c < 64; ++c) {
+        const fp_type* src = p0 + xss[c];
+        rs[c] = _mm512_load_ps(src);
+        is[c] = _mm512_load_ps(src + 16);
+      }
+
+      // e points at the current matrix entry: e[0] real, e[1] imaginary.
+      const fp_type* e = v;
+
+      for (unsigned row = 0; row < 64; ++row) {
+        __m512 ru = _mm512_set1_ps(e[0]);
+        __m512 iu = _mm512_set1_ps(e[1]);
+        __m512 rn = _mm512_mul_ps(rs[0], ru);
+        __m512 in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+        e += 2;
+
+        for (unsigned c = 1; c < 64; ++c, e += 2) {
+          ru = _mm512_set1_ps(e[0]);
+          iu = _mm512_set1_ps(e[1]);
+          rn = _mm512_fmadd_ps(rs[c], ru, rn);
+          in = _mm512_fmadd_ps(rs[c], iu, in);
+          rn = _mm512_fnmadd_ps(is[c], iu, rn);
+          in = _mm512_fmadd_ps(is[c], ru, in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 10) indices (a single one if the
+    // state has no more than 10 qubits).
+    const unsigned consumed = 10;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, matrix, ms, xss, rstate);
+  }
+
+  // Applies a 6-qubit gate given as a 64x64 matrix of interleaved
+  // (real, imaginary) fp_type pairs. Target qubits qs[1..5] select between
+  // separately loaded vectors; target qubit qs[0] is handled by permuting
+  // lanes within a vector.
+  void ApplyGate6HHHHHL(const std::vector<unsigned>& qs,
+                        const fp_type* matrix, State& state) const {
+    uint64_t xs[5];
+    uint64_t ms[6];
+
+    // xs[t]: offset (in fp_type units) contributed by target qubit qs[t + 1].
+    // ms[t]: index bits strictly between consecutive such qubits (below the
+    // first one for t = 0, above the last one for the final entry).
+    uint64_t below = 0;
+    for (unsigned t = 0; t < 5; ++t) {
+      const unsigned q = qs[t + 1];
+      xs[t] = uint64_t{1} << (q + 1);
+      ms[t] = ((uint64_t{1} << q) - 1) ^ below;
+      below = xs[t] - 1;
+    }
+    ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ below;
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[32];
+    for (unsigned c = 0; c < 32; ++c) {
+      uint64_t offset = 0;
+      for (unsigned t = 0; t < 5; ++t) {
+        if ((c >> t) & 1) {
+          offset += xs[t];
+        }
+      }
+      xss[c] = offset;
+    }
+
+    unsigned perm[16];
+    __m512i idx[1];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(15);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bit that corresponds to qs[0].
+    unsigned qmask = (1 << qs[0]);
+
+    // idx[0]: source lane for every destination lane, built from
+    // MaskedAdd(lane, 1, qmask, 2) with the non-qmask lane bits kept.
+    for (unsigned lane = 0; lane < 16; ++lane) {
+      perm[lane] = MaskedAdd(lane, 1u, qmask, 2) | (lane & ~qmask);
+    }
+
+    idx[0] = _mm512_set_epi32(
+        perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+        perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+        perm[3], perm[2], perm[1], perm[0]);
+
+    // Fill w: for every (i, m) pair one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned i = 0; i < 32; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        fp_type* re = wf + 16 * (2 * (64 * i + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned src = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2);
+
+          re[lane] = matrix[src];
+          im[lane] = matrix[src + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[64], is[64];
+
+      // Spread the loop index i over the bits selected by ms[0..5].
+      uint64_t base = 0;
+      for (unsigned t = 0; t < 6; ++t) {
+        base |= (i << (t + 4)) & ms[t];
+      }
+
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load one (real, imaginary) vector pair per offset and derive its
+      // partner pair by lane permutation.
+      for (unsigned g = 0; g < 32; ++g) {
+        const unsigned first = 2 * g;
+        const fp_type* src = p0 + xss[g];
+
+        rs[first] = _mm512_load_ps(src);
+        is[first] = _mm512_load_ps(src + 16);
+
+        rs[first + 1] = _mm512_permutexvar_ps(idx[0], rs[first]);
+        is[first + 1] = _mm512_permutexvar_ps(idx[0], is[first]);
+      }
+
+      // wp walks through w two vectors (real, imaginary) at a time.
+      const __m512* wp = w;
+
+      for (unsigned row = 0; row < 32; ++row) {
+        __m512 rn = _mm512_mul_ps(rs[0], wp[0]);
+        __m512 in = _mm512_mul_ps(rs[0], wp[1]);
+        rn = _mm512_fnmadd_ps(is[0], wp[1], rn);
+        in = _mm512_fmadd_ps(is[0], wp[0], in);
+        wp += 2;
+
+        for (unsigned c = 1; c < 64; ++c, wp += 2) {
+          rn = _mm512_fmadd_ps(rs[c], wp[0], rn);
+          in = _mm512_fmadd_ps(rs[c], wp[1], in);
+          rn = _mm512_fnmadd_ps(is[c], wp[1], rn);
+          in = _mm512_fmadd_ps(is[c], wp[0], in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 9) indices (a single one if the
+    // state has no more than 9 qubits).
+    const unsigned consumed = 9;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a 6-qubit gate given as a 64x64 matrix of interleaved
+  // (real, imaginary) fp_type pairs. Target qubits qs[2..5] select between
+  // separately loaded vectors; target qubits qs[0] and qs[1] are handled by
+  // permuting lanes within a vector.
+  void ApplyGate6HHHHLL(const std::vector<unsigned>& qs,
+                        const fp_type* matrix, State& state) const {
+    uint64_t xs[4];
+    uint64_t ms[5];
+
+    // xs[t]: offset (in fp_type units) contributed by target qubit qs[t + 2].
+    // ms[t]: index bits strictly between consecutive such qubits (below the
+    // first one for t = 0, above the last one for the final entry).
+    uint64_t below = 0;
+    for (unsigned t = 0; t < 4; ++t) {
+      const unsigned q = qs[t + 2];
+      xs[t] = uint64_t{1} << (q + 1);
+      ms[t] = ((uint64_t{1} << q) - 1) ^ below;
+      below = xs[t] - 1;
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ below;
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[16];
+    for (unsigned c = 0; c < 16; ++c) {
+      uint64_t offset = 0;
+      for (unsigned t = 0; t < 4; ++t) {
+        if ((c >> t) & 1) {
+          offset += xs[t];
+        }
+      }
+      xss[c] = offset;
+    }
+
+    unsigned perm[16];
+    __m512i idx[3];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(14);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bits that correspond to qs[0] and qs[1].
+    unsigned qmask = 0;
+    for (unsigned t = 0; t < 2; ++t) {
+      qmask |= 1 << qs[t];
+    }
+
+    // idx[r]: source lane for every destination lane, built from
+    // MaskedAdd(lane, r + 1, qmask, 4) with the non-qmask lane bits kept.
+    for (unsigned r = 0; r < 3; ++r) {
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 4) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(
+          perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+          perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+          perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Fill w: for every (i, m) pair one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned i = 0; i < 16; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        fp_type* re = wf + 16 * (2 * (64 * i + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned src = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4);
+
+          re[lane] = matrix[src];
+          im[lane] = matrix[src + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[64], is[64];
+
+      // Spread the loop index i over the bits selected by ms[0..4].
+      uint64_t base = 0;
+      for (unsigned t = 0; t < 5; ++t) {
+        base |= (i << (t + 4)) & ms[t];
+      }
+
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load one (real, imaginary) vector pair per offset and derive the
+      // remaining three pairs of each group by lane permutation.
+      for (unsigned g = 0; g < 16; ++g) {
+        const unsigned first = 4 * g;
+        const fp_type* src = p0 + xss[g];
+
+        rs[first] = _mm512_load_ps(src);
+        is[first] = _mm512_load_ps(src + 16);
+
+        for (unsigned r = 1; r < 4; ++r) {
+          rs[first + r] = _mm512_permutexvar_ps(idx[r - 1], rs[first]);
+          is[first + r] = _mm512_permutexvar_ps(idx[r - 1], is[first]);
+        }
+      }
+
+      // wp walks through w two vectors (real, imaginary) at a time.
+      const __m512* wp = w;
+
+      for (unsigned row = 0; row < 16; ++row) {
+        __m512 rn = _mm512_mul_ps(rs[0], wp[0]);
+        __m512 in = _mm512_mul_ps(rs[0], wp[1]);
+        rn = _mm512_fnmadd_ps(is[0], wp[1], rn);
+        in = _mm512_fmadd_ps(is[0], wp[0], in);
+        wp += 2;
+
+        for (unsigned c = 1; c < 64; ++c, wp += 2) {
+          rn = _mm512_fmadd_ps(rs[c], wp[0], rn);
+          in = _mm512_fmadd_ps(rs[c], wp[1], in);
+          rn = _mm512_fnmadd_ps(is[c], wp[1], rn);
+          in = _mm512_fmadd_ps(is[c], wp[0], in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 8) indices (a single one if the
+    // state has no more than 8 qubits).
+    const unsigned consumed = 8;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a 6-qubit gate given as a 64x64 matrix of interleaved
+  // (real, imaginary) fp_type pairs. Target qubits qs[3..5] select between
+  // separately loaded vectors; target qubits qs[0..2] are handled by
+  // permuting lanes within a vector.
+  void ApplyGate6HHHLLL(const std::vector<unsigned>& qs,
+                        const fp_type* matrix, State& state) const {
+    uint64_t xs[3];
+    uint64_t ms[4];
+
+    // xs[t]: offset (in fp_type units) contributed by target qubit qs[t + 3].
+    // ms[t]: index bits strictly between consecutive such qubits (below the
+    // first one for t = 0, above the last one for the final entry).
+    uint64_t below = 0;
+    for (unsigned t = 0; t < 3; ++t) {
+      const unsigned q = qs[t + 3];
+      xs[t] = uint64_t{1} << (q + 1);
+      ms[t] = ((uint64_t{1} << q) - 1) ^ below;
+      below = xs[t] - 1;
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ below;
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[8];
+    for (unsigned c = 0; c < 8; ++c) {
+      uint64_t offset = 0;
+      for (unsigned t = 0; t < 3; ++t) {
+        if ((c >> t) & 1) {
+          offset += xs[t];
+        }
+      }
+      xss[c] = offset;
+    }
+
+    unsigned perm[16];
+    __m512i idx[7];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(13);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bits that correspond to qs[0], qs[1] and qs[2].
+    unsigned qmask = 0;
+    for (unsigned t = 0; t < 3; ++t) {
+      qmask |= 1 << qs[t];
+    }
+
+    // idx[r]: source lane for every destination lane, built from
+    // MaskedAdd(lane, r + 1, qmask, 8) with the non-qmask lane bits kept.
+    for (unsigned r = 0; r < 7; ++r) {
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 8) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(
+          perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+          perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+          perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Fill w: for every (i, m) pair one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        fp_type* re = wf + 16 * (2 * (64 * i + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned src = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8);
+
+          re[lane] = matrix[src];
+          im[lane] = matrix[src + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[64], is[64];
+
+      // Spread the loop index i over the bits selected by ms[0..3].
+      uint64_t base = 0;
+      for (unsigned t = 0; t < 4; ++t) {
+        base |= (i << (t + 4)) & ms[t];
+      }
+
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load one (real, imaginary) vector pair per offset and derive the
+      // remaining seven pairs of each group by lane permutation.
+      for (unsigned g = 0; g < 8; ++g) {
+        const unsigned first = 8 * g;
+        const fp_type* src = p0 + xss[g];
+
+        rs[first] = _mm512_load_ps(src);
+        is[first] = _mm512_load_ps(src + 16);
+
+        for (unsigned r = 1; r < 8; ++r) {
+          rs[first + r] = _mm512_permutexvar_ps(idx[r - 1], rs[first]);
+          is[first + r] = _mm512_permutexvar_ps(idx[r - 1], is[first]);
+        }
+      }
+
+      // wp walks through w two vectors (real, imaginary) at a time.
+      const __m512* wp = w;
+
+      for (unsigned row = 0; row < 8; ++row) {
+        __m512 rn = _mm512_mul_ps(rs[0], wp[0]);
+        __m512 in = _mm512_mul_ps(rs[0], wp[1]);
+        rn = _mm512_fnmadd_ps(is[0], wp[1], rn);
+        in = _mm512_fmadd_ps(is[0], wp[0], in);
+        wp += 2;
+
+        for (unsigned c = 1; c < 64; ++c, wp += 2) {
+          rn = _mm512_fmadd_ps(rs[c], wp[0], rn);
+          in = _mm512_fmadd_ps(rs[c], wp[1], in);
+          rn = _mm512_fnmadd_ps(is[c], wp[1], rn);
+          in = _mm512_fmadd_ps(is[c], wp[0], in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 7) indices (a single one if the
+    // state has no more than 7 qubits).
+    const unsigned consumed = 7;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a 6-qubit gate given as a 64x64 matrix of interleaved
+  // (real, imaginary) fp_type pairs. Target qubits qs[4] and qs[5] select
+  // between separately loaded vectors; target qubits qs[0..3] are handled by
+  // permuting lanes within a vector.
+  void ApplyGate6HHLLLL(const std::vector<unsigned>& qs,
+                        const fp_type* matrix, State& state) const {
+    uint64_t xs[2];
+    uint64_t ms[3];
+
+    // xs[t]: offset (in fp_type units) contributed by target qubit qs[t + 4].
+    // ms[t]: index bits strictly between consecutive such qubits (below the
+    // first one for t = 0, above the last one for the final entry).
+    uint64_t below = 0;
+    for (unsigned t = 0; t < 2; ++t) {
+      const unsigned q = qs[t + 4];
+      xs[t] = uint64_t{1} << (q + 1);
+      ms[t] = ((uint64_t{1} << q) - 1) ^ below;
+      below = xs[t] - 1;
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ below;
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[4];
+    for (unsigned c = 0; c < 4; ++c) {
+      uint64_t offset = 0;
+      for (unsigned t = 0; t < 2; ++t) {
+        if ((c >> t) & 1) {
+          offset += xs[t];
+        }
+      }
+      xss[c] = offset;
+    }
+
+    unsigned perm[16];
+    __m512i idx[15];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(12);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bits that correspond to qs[0..3].
+    unsigned qmask = 0;
+    for (unsigned t = 0; t < 4; ++t) {
+      qmask |= 1 << qs[t];
+    }
+
+    // idx[r]: source lane for every destination lane, built from
+    // MaskedAdd(lane, r + 1, qmask, 16) with the non-qmask lane bits kept.
+    for (unsigned r = 0; r < 15; ++r) {
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 16) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(
+          perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+          perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+          perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Fill w: for every (i, m) pair one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        fp_type* re = wf + 16 * (2 * (64 * i + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned src =
+              2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16);
+
+          re[lane] = matrix[src];
+          im[lane] = matrix[src + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[64], is[64];
+
+      // Spread the loop index i over the bits selected by ms[0..2].
+      uint64_t base = 0;
+      for (unsigned t = 0; t < 3; ++t) {
+        base |= (i << (t + 4)) & ms[t];
+      }
+
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load one (real, imaginary) vector pair per offset and derive the
+      // remaining fifteen pairs of each group by lane permutation.
+      for (unsigned g = 0; g < 4; ++g) {
+        const unsigned first = 16 * g;
+        const fp_type* src = p0 + xss[g];
+
+        rs[first] = _mm512_load_ps(src);
+        is[first] = _mm512_load_ps(src + 16);
+
+        for (unsigned r = 1; r < 16; ++r) {
+          rs[first + r] = _mm512_permutexvar_ps(idx[r - 1], rs[first]);
+          is[first + r] = _mm512_permutexvar_ps(idx[r - 1], is[first]);
+        }
+      }
+
+      // wp walks through w two vectors (real, imaginary) at a time.
+      const __m512* wp = w;
+
+      for (unsigned row = 0; row < 4; ++row) {
+        __m512 rn = _mm512_mul_ps(rs[0], wp[0]);
+        __m512 in = _mm512_mul_ps(rs[0], wp[1]);
+        rn = _mm512_fnmadd_ps(is[0], wp[1], rn);
+        in = _mm512_fmadd_ps(is[0], wp[0], in);
+        wp += 2;
+
+        for (unsigned c = 1; c < 64; ++c, wp += 2) {
+          rn = _mm512_fmadd_ps(rs[c], wp[0], rn);
+          in = _mm512_fmadd_ps(rs[c], wp[1], in);
+          rn = _mm512_fnmadd_ps(is[c], wp[1], rn);
+          in = _mm512_fmadd_ps(is[c], wp[0], in);
+        }
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 6) indices (a single one if the
+    // state has no more than 6 qubits).
+    const unsigned consumed = 6;
+    const unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w, ms, xss, idx, rstate);
+  }
+
+  // Applies a controlled single-qubit gate. `matrix` is a 2x2 matrix of
+  // interleaved (real, imaginary) fp_type pairs acting on target qubit
+  // qs[0]; only indices whose control-qubit bits equal the expanded cmask
+  // are visited. Every matrix entry is broadcast to all lanes.
+  void ApplyControlledGate1H_H(const std::vector<unsigned>& qs,
+                               const std::vector<unsigned>& cqs,
+                               uint64_t cmask, const fp_type* matrix,
+                               State& state) const {
+    // Offset (in fp_type units) contributed by the target qubit.
+    const uint64_t stride = uint64_t{1} << (qs[0] + 1);
+
+    // ms is forwarded to the kernel below, which does not read it.
+    uint64_t ms[2];
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (stride - 1);
+
+    // Offsets of the two loaded vector pairs.
+    uint64_t xss[2] = {0, stride};
+
+    // Bit mask of all control qubits.
+    uint64_t emaskh = 0;
+    for (auto cq : cqs) {
+      emaskh |= uint64_t{1} << cq;
+    }
+
+    // Control values placed via bits::ExpandBits under the control mask
+    // (presumably onto the control-qubit bit positions - confirm against
+    // bits::ExpandBits).
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+
+    for (auto tq : qs) {
+      emaskh |= uint64_t{1} << tq;
+    }
+
+    // Complement of the control/target mask with the low four bits flipped.
+    emaskh = ~(emaskh ^ 15);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 rs[2], is[2];
+
+      const uint64_t base = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      fp_type* p0 = rstate + 2 * base;
+
+      // Read both (real, imaginary) vector pairs before any is overwritten.
+      for (unsigned c = 0; c < 2; ++c) {
+        const fp_type* src = p0 + xss[c];
+        rs[c] = _mm512_load_ps(src);
+        is[c] = _mm512_load_ps(src + 16);
+      }
+
+      // e points at the current matrix row: (e[0], e[1]) and (e[2], e[3])
+      // are its two (real, imaginary) entries.
+      const fp_type* e = v;
+
+      for (unsigned row = 0; row < 2; ++row, e += 4) {
+        __m512 ru = _mm512_set1_ps(e[0]);
+        __m512 iu = _mm512_set1_ps(e[1]);
+        __m512 rn = _mm512_mul_ps(rs[0], ru);
+        __m512 in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        ru = _mm512_set1_ps(e[2]);
+        iu = _mm512_set1_ps(e[3]);
+        rn = _mm512_fmadd_ps(rs[1], ru, rn);
+        in = _mm512_fmadd_ps(rs[1], iu, in);
+        rn = _mm512_fnmadd_ps(is[1], iu, rn);
+        in = _mm512_fmadd_ps(is[1], ru, in);
+
+        fp_type* dst = p0 + xss[row];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 5 - cqs.size()) indices (a single
+    // one if the state has no more qubits than that).
+    unsigned consumed = 5 + cqs.size();
+    unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, matrix, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  // Applies a controlled single-qubit gate. `matrix` is a 2x2 matrix of
+  // interleaved (real, imaginary) fp_type pairs acting on target qubit
+  // qs[0]. Control qubits with index > 3 restrict which indices are visited;
+  // control qubits with index <= 3 are folded into the per-lane matrix: lanes
+  // that do not match the low control values get identity-matrix entries.
+  void ApplyControlledGate1H_L(const std::vector<unsigned>& qs,
+                               const std::vector<unsigned>& cqs,
+                               uint64_t cmask, const fp_type* matrix,
+                               State& state) const {
+    uint64_t xs[1];
+    uint64_t ms[2];
+
+    // xs[0]: offset (in fp_type units) contributed by the target qubit.
+    // ms is forwarded to the kernel below, which does not read it.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+
+    // xss[c]: sum of the offsets xs[t] for every bit t that is set in c.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    // Split the control qubits: cl counts those with index <= 3 (mask
+    // emaskl), the others are collected in emaskh.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    // The low cl bits of cmask go with the low controls, the remaining bits
+    // with the high controls.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    // Complement of the high-control/target mask with the low four bits
+    // flipped.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+
+    // Scratch storage for the per-lane matrix.
+    auto s = StateSpace::Create(6);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // qs[0] is shifted as a 64-bit quantity everywhere else in this function,
+    // so it is not assumed to be below 32. Shifting the int literal 1 by 32
+    // or more is undefined behaviour and could yield a mask with a low bit
+    // set, making p[j] below index past the 2x2 matrix. Shift in 64 bits and
+    // truncate instead: identical value for qs[0] < 32, zero otherwise.
+    unsigned qmask = static_cast<unsigned>(uint64_t{1} << qs[0]);
+
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (2 * i + 2 * k + m);
+        }
+
+        unsigned l = 2 * (2 * i + m);
+
+        // Real parts: lanes matching the low control values take the gate
+        // entry, all other lanes take the identity entry (1 on the diagonal).
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0;
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        // Imaginary parts: zero for lanes that do not match.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[2], is[2];
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+
+      // Read both (real, imaginary) vector pairs before any is overwritten.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      // j indexes w in (real, imaginary) vector pairs.
+      uint64_t j = 0;
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned c = 1; c < 2; ++c) {
+          rn = _mm512_fmadd_ps(rs[c], w[j], rn);
+          in = _mm512_fmadd_ps(rs[c], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[c], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[c], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Only the high controls reduce the number of loop indices.
+    unsigned k = 5 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  // Applies a controlled single-qubit gate. `matrix` is a 2x2 matrix of
+  // interleaved (real, imaginary) fp_type pairs acting on target qubit
+  // qs[0], which is handled by permuting lanes within a single loaded
+  // vector pair; only indices whose control-qubit bits equal the expanded
+  // cmask are visited.
+  void ApplyControlledGate1L_H(const std::vector<unsigned>& qs,
+                               const std::vector<unsigned>& cqs,
+                               uint64_t cmask, const fp_type* matrix,
+                               State& state) const {
+    // Bit mask of all control qubits.
+    uint64_t emaskh = 0;
+    for (auto cq : cqs) {
+      emaskh |= uint64_t{1} << cq;
+    }
+
+    // Control values placed via bits::ExpandBits under the control mask
+    // (presumably onto the control-qubit bit positions - confirm against
+    // bits::ExpandBits).
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+
+    // Only target qubits with index > 3 are added to the mask.
+    for (auto tq : qs) {
+      if (tq > 3) {
+        emaskh |= uint64_t{1} << tq;
+      }
+    }
+
+    // Complement of the resulting mask with the low four bits flipped.
+    emaskh = ~(emaskh ^ 15);
+
+    unsigned perm[16];
+    __m512i idx[1];
+
+    // Scratch storage for the per-lane rearranged matrix.
+    auto s = StateSpace::Create(5);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    // Lane-index bit that corresponds to qs[0].
+    unsigned qmask = (1 << qs[0]);
+
+    // idx[0]: source lane for every destination lane, built from
+    // MaskedAdd(lane, 1, qmask, 2) with the non-qmask lane bits kept.
+    for (unsigned lane = 0; lane < 16; ++lane) {
+      perm[lane] = MaskedAdd(lane, 1u, qmask, 2) | (lane & ~qmask);
+    }
+
+    idx[0] = _mm512_set_epi32(
+        perm[15], perm[14], perm[13], perm[12], perm[11], perm[10],
+        perm[9], perm[8], perm[7], perm[6], perm[5], perm[4],
+        perm[3], perm[2], perm[1], perm[0]);
+
+    // Fill w: for m = 0 and m = 1 one vector of real parts followed by one
+    // vector of imaginary parts.
+    for (unsigned m = 0; m < 2; ++m) {
+      fp_type* re = wf + 16 * (2 * m);
+      fp_type* im = re + 16;
+
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        unsigned k = bits::CompressBits(lane, 4, qmask);
+        unsigned src = 2 * (2 * k + 2 * (m / 2) + (k + m) % 2);
+
+        re[lane] = matrix[src];
+        im[lane] = matrix[src + 1];
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rs[2], is[2];
+
+      const uint64_t base = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      fp_type* p0 = rstate + 2 * base;
+
+      // Load the single (real, imaginary) vector pair and derive its
+      // partner pair by lane permutation.
+      rs[0] = _mm512_load_ps(p0);
+      is[0] = _mm512_load_ps(p0 + 16);
+      rs[1] = _mm512_permutexvar_ps(idx[0], rs[0]);
+      is[1] = _mm512_permutexvar_ps(idx[0], is[0]);
+
+      // w[0], w[1]: real/imaginary factors for rs[0], is[0];
+      // w[2], w[3]: real/imaginary factors for rs[1], is[1].
+      __m512 rn = _mm512_mul_ps(rs[0], w[0]);
+      __m512 in = _mm512_mul_ps(rs[0], w[1]);
+      rn = _mm512_fnmadd_ps(is[0], w[1], rn);
+      in = _mm512_fmadd_ps(is[0], w[0], in);
+
+      rn = _mm512_fmadd_ps(rs[1], w[2], rn);
+      in = _mm512_fmadd_ps(rs[1], w[3], in);
+      rn = _mm512_fnmadd_ps(is[1], w[3], rn);
+      in = _mm512_fmadd_ps(is[1], w[2], in);
+
+      _mm512_store_ps(p0, rn);
+      _mm512_store_ps(p0 + 16, in);
+    };
+
+    fp_type* rstate = state.get();
+
+    // The loop runs over 2^(num_qubits - 4 - cqs.size()) indices (a single
+    // one if the state has no more qubits than that).
+    unsigned consumed = 4 + cqs.size();
+    unsigned rest =
+        state.num_qubits() > consumed ? state.num_qubits() - consumed : 0;
+    uint64_t size = uint64_t{1} << rest;
+
+    for_.Run(size, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate1L_L(const std::vector<unsigned>& qs,  // Applies a controlled 1-qubit gate in place; target bit qs[0] and the controls with q <= 3 are handled per lane.
+                               const std::vector<unsigned>& cqs,  // cqs: control qubit indices, split below into low (q <= 3) and high (q > 3).
+                               uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                               State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    unsigned cl = 0;  // number of low controls
+    uint64_t emaskl = 0;  // positions of low controls (lane bits)
+    uint64_t emaskh = 0;  // positions of high controls
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // bits of cmask above the lowest cl go to the high controls; assumes cmask lists low controls first — TODO confirm
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // lowest cl bits of cmask go to the low controls
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;  // only targets above bit 3 are added; bits 0-3 index the 16 lanes
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[1];  // lane permutation used to build rs[1] / is[1]
+
+    auto s = StateSpace::Create(5);  // scratch buffer for w; NOTE(review): presumably 2^5 amplitudes = 64 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // lane bit of the target qubit
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // keep the non-target bits of j; target bits come from MaskedAdd (presumably (bits + i + 1) mod 2 — confirm)
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0, so lane j receives p[j]
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {  // m selects which input vector (rs[m] / is[m]) these coefficients multiply
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the value of lane j's target bit — confirm in bits::CompressBits
+          p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2);  // float index of matrix entry 2 * k + (k + m) % 2 (the i and m / 2 terms are 0 here)
+        }
+
+        unsigned l = 2 * (2 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0;  // identity entry: 1 where entry / 2 == entry % 2, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // lanes whose low control bits match get the gate; other lanes get the identity
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // the identity has zero imaginary part
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[2], is[2];  // inputs: [0] as loaded, [1] lane-permuted
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the high control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + 16);  // the 16 imaginary parts follow them
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // lane x of the result = lane idx[x] of the source
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {  // accumulate the remaining input vector
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // write the result back over the inputs
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size() - cl;  // index bits not iterated over: 4 lane bits plus the high controls only
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate2HH_H(const std::vector<unsigned>& qs,  // Applies a controlled 2-qubit gate in place; both targets are addressed by memory offsets, not lane permutations.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubit indices (all are put into the high mask here).
+                                uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) entries, 4 x 4 as read by f below.
+                                State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t xs[2];  // float offset for setting each target bit
+    uint64_t ms[3];  // bit-range masks; passed to f below but not read there
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 floats per amplitude, hence the + 1
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+
+    uint64_t xss[4];  // offsets of the 4 target-bit combinations
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];  // bit k of i selects target k
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;  // mark every control qubit position
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // presumably scatters the bits of cmask onto the control positions — confirm in bits::ExpandBits
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;  // every target is added, with no q > 3 test in this variant
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // n and m are not read (the inner loop below redeclares n); v is the gate matrix
+                const uint64_t* ms, const uint64_t* xss,  // ms is not read in this body
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // ru / iu: broadcast matrix entry; rn / in: accumulators
+      __m512 rs[4], is[4];  // real / imaginary inputs for the 4 target combinations
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts follow them
+      }
+
+      uint64_t j = 0;  // running float index into v
+
+      for (unsigned l = 0; l < 4; ++l) {  // one output block per l, consuming 4 consecutive entries of v
+        ru = _mm512_set1_ps(v[j]);  // same entry in all 16 lanes
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);  // complex product: rn = rs * ru - is * iu
+        in = _mm512_mul_ps(rs[0], iu);  // in = rs * iu + is * ru
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // accumulate the remaining inputs
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // safe to overwrite: all inputs are already held in rs / is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6 + cqs.size();  // index bits not iterated over: 4 lane bits, 2 targets, and the controls
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, matrix, ms, xss,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  void ApplyControlledGate2HH_L(const std::vector<unsigned>& qs,  // Applies a controlled 2-qubit gate in place; targets use memory offsets, controls with q <= 3 are handled per lane.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubit indices, split below into low (q <= 3) and high (q > 3).
+                                uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                                State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t xs[2];  // float offset for setting each target bit
+    uint64_t ms[3];  // bit-range masks; passed to f below but not read there
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 floats per amplitude, hence the + 1
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+
+    uint64_t xss[4];  // offsets of the 4 target-bit combinations
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];  // bit k of i selects target k
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;  // number of low controls
+    uint64_t emaskl = 0;  // positions of low controls (lane bits)
+    uint64_t emaskh = 0;  // positions of high controls
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // bits of cmask above the lowest cl go to the high controls; assumes cmask lists low controls first — TODO confirm
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // lowest cl bits of cmask go to the low controls
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;  // every target is added, with no q > 3 test in this variant
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one matrix index per lane
+
+    auto s = StateSpace::Create(8);  // scratch buffer for w; NOTE(review): presumably 2^8 amplitudes = 512 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // NOTE(review): 1 is an int, so this shift is undefined for qs[0] >= 32; qmask only feeds CompressBits(j, 4, ...) below — confirm the range of qs[0]
+
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably 0 whenever qmask has no bit below 4 — confirm in bits::CompressBits
+          p[j] = 2 * (4 * i + 4 * k + m);  // float index of matrix entry 4 * (i + k) + m
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0;  // identity entry: 1 where entry / 4 == entry % 4, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // lanes whose low control bits match get the gate; other lanes get the identity
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // the identity has zero imaginary part
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                const uint64_t* ms, const uint64_t* xss,  // ms is not read in this body
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[4], is[4];  // real / imaginary inputs for the 4 target combinations
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the high control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts follow them
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 4; ++l) {  // one output block per l
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // accumulate the remaining inputs
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // safe to overwrite: all inputs are already held in rs / is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6 + cqs.size() - cl;  // index bits not iterated over: 4 lane bits, 2 targets, and the high controls only
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w, ms, xss,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  void ApplyControlledGate2HL_H(const std::vector<unsigned>& qs,  // Applies a controlled 2-qubit gate in place; qs[0] is handled by lane permutation, qs[1] by a memory offset.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubit indices (all are put into the high mask here).
+                                uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                                State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t xs[1];  // float offset for setting target bit qs[1]
+    uint64_t ms[2];  // bit-range masks; passed to f below but not read there
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // 2 floats per amplitude, hence the + 1
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+
+    uint64_t xss[2];  // offsets for qs[1] = 0 and qs[1] = 1
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;  // mark every control qubit position
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // presumably scatters the bits of cmask onto the control positions — confirm in bits::ExpandBits
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;  // only targets above bit 3 are added; bits 0-3 index the 16 lanes
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[1];  // lane permutation used to build the odd entries of rs / is
+
+    auto s = StateSpace::Create(7);  // scratch buffer for w; NOTE(review): presumably 2^7 amplitudes = 256 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // lane bit of the low target qubit
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // keep the non-target bits of j; target bits come from MaskedAdd (presumably (bits + i + 1) mod 2 — confirm)
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0, so lane j receives p[j]
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {  // i: output block, i.e. the value of the high target bit
+      for (unsigned m = 0; m < 4; ++m) {  // m selects which input vector (rs[m] / is[m]) these coefficients multiply
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the value of lane j's low target bit — confirm in bits::CompressBits
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);  // float index of matrix entry 4 * (2 * i + k) + 2 * (m / 2) + (k + m) % 2
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];  // real part for lane j
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];  // imaginary part for lane j
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                const uint64_t* ms, const uint64_t* xss,  // ms is not read in this body
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[4], is[4];  // inputs: even entries as loaded, odd entries lane-permuted
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts follow them
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // lane x of the result = lane idx[x] of the source
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 2; ++l) {  // one output block per l
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // accumulate the remaining inputs
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // safe to overwrite: all inputs are already held in rs / is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5 + cqs.size();  // index bits not iterated over: 4 lane bits, 1 high target, and the controls
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w, ms, xss,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate2HL_L(const std::vector<unsigned>& qs,  // Applies a controlled 2-qubit gate in place; qs[0] by lane permutation, qs[1] by memory offset, controls with q <= 3 per lane.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubit indices, split below into low (q <= 3) and high (q > 3).
+                                uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                                State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t xs[1];  // float offset for setting target bit qs[1]
+    uint64_t ms[2];  // bit-range masks; passed to f below but not read there
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // 2 floats per amplitude, hence the + 1
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+
+    uint64_t xss[2];  // offsets for qs[1] = 0 and qs[1] = 1
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;  // number of low controls
+    uint64_t emaskl = 0;  // positions of low controls (lane bits)
+    uint64_t emaskh = 0;  // positions of high controls
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // bits of cmask above the lowest cl go to the high controls; assumes cmask lists low controls first — TODO confirm
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // lowest cl bits of cmask go to the low controls
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;  // only targets above bit 3 are added; bits 0-3 index the 16 lanes
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[1];  // lane permutation used to build the odd entries of rs / is
+
+    auto s = StateSpace::Create(7);  // scratch buffer for w; NOTE(review): presumably 2^7 amplitudes = 256 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // lane bit of the low target qubit
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // keep the non-target bits of j; target bits come from MaskedAdd (presumably (bits + i + 1) mod 2 — confirm)
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0, so lane j receives p[j]
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {  // i: output block, i.e. the value of the high target bit
+      for (unsigned m = 0; m < 4; ++m) {  // m selects which input vector (rs[m] / is[m]) these coefficients multiply
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the value of lane j's low target bit — confirm in bits::CompressBits
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);  // float index of matrix entry 4 * (2 * i + k) + 2 * (m / 2) + (k + m) % 2
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0;  // identity entry: 1 where entry / 4 == entry % 4, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // lanes whose low control bits match get the gate; other lanes get the identity
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // the identity has zero imaginary part
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                const uint64_t* ms, const uint64_t* xss,  // ms is not read in this body
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[4], is[4];  // inputs: even entries as loaded, odd entries lane-permuted
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the high control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts follow them
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // lane x of the result = lane idx[x] of the source
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 2; ++l) {  // one output block per l
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // accumulate the remaining inputs
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // safe to overwrite: all inputs are already held in rs / is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5 + cqs.size() - cl;  // index bits not iterated over: 4 lane bits, 1 high target, and the high controls only
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w, ms, xss,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate2LL_H(const std::vector<unsigned>& qs,  // Applies a controlled 2-qubit gate in place; both target bits (qs[0], qs[1]) are handled by lane permutations.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubit indices (all are put into the high mask here).
+                                uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                                State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;  // mark every control qubit position
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // presumably scatters the bits of cmask onto the control positions — confirm in bits::ExpandBits
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;  // only targets above bit 3 are added; bits 0-3 index the 16 lanes
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[3];  // three lane permutations, used to build rs[1..3] / is[1..3]
+
+    auto s = StateSpace::Create(6);  // scratch buffer for w; NOTE(review): presumably 2^6 amplitudes = 128 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // lane bits of the two target qubits
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // keep the non-target bits of j; target bits come from MaskedAdd (presumably (bits + i + 1) mod 4 — confirm)
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0, so lane j receives p[j]
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {  // m selects which input vector (rs[m] / is[m]) these coefficients multiply
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the 2-bit value of lane j's target bits — confirm in bits::CompressBits
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);  // float index of matrix entry 4 * k + (k + m) % 4 (the i and m / 4 terms are 0 here)
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];  // real part for lane j
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];  // imaginary part for lane j
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[4], is[4];  // inputs: [0] as loaded, [1..3] lane-permuted
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[4 * l] = _mm512_load_ps(p0 + 16);  // the 16 imaginary parts follow them
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // lane x of the result = lane idx[x] of the source
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // accumulate the remaining inputs
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // write the result back over the inputs
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size();  // index bits not iterated over: 4 lane bits plus the control bits
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate2LL_L(const std::vector<unsigned>& qs,  // Applies a controlled 2-qubit gate in place; both target bits and the controls with q <= 3 are handled per lane.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubit indices, split below into low (q <= 3) and high (q > 3).
+                                uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                                State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    unsigned cl = 0;  // number of low controls
+    uint64_t emaskl = 0;  // positions of low controls (lane bits)
+    uint64_t emaskh = 0;  // positions of high controls
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // bits of cmask above the lowest cl go to the high controls; assumes cmask lists low controls first — TODO confirm
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // lowest cl bits of cmask go to the low controls
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;  // only targets above bit 3 are added; bits 0-3 index the 16 lanes
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[3];  // three lane permutations, used to build rs[1..3] / is[1..3]
+
+    auto s = StateSpace::Create(6);  // scratch buffer for w; NOTE(review): presumably 2^6 amplitudes = 128 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // lane bits of the two target qubits
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // keep the non-target bits of j; target bits come from MaskedAdd (presumably (bits + i + 1) mod 4 — confirm)
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // arguments run from lane 15 down to lane 0, so lane j receives p[j]
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {  // m selects which input vector (rs[m] / is[m]) these coefficients multiply
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the 2-bit value of lane j's target bits — confirm in bits::CompressBits
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);  // float index of matrix entry 4 * k + (k + m) % 4 (the i and m / 4 terms are 0 here)
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0;  // identity entry: 1 where entry / 4 == entry % 4, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // lanes whose low control bits match get the gate; other lanes get the identity
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // the identity has zero imaginary part
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[4], is[4];  // inputs: [0] as loaded, [1..3] lane-permuted
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the high control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[4 * l] = _mm512_load_ps(p0 + 16);  // the 16 imaginary parts follow them
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // lane x of the result = lane idx[x] of the source
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // accumulate the remaining inputs
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // write the result back over the inputs
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size() - cl;  // index bits not iterated over: 4 lane bits plus the high controls only
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate3HHH_H(const std::vector<unsigned>& qs,  // Applies a controlled 3-qubit gate in place; all three targets are addressed by memory offsets, not lane permutations.
+                                 const std::vector<unsigned>& cqs,  // cqs: control qubit indices (all are put into the high mask here).
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) entries, 8 x 8 as read by f below.
+                                 State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t xs[3];  // float offset for setting each target bit
+    uint64_t ms[4];  // bit-range masks; passed to f below but not read there
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 floats per amplitude, hence the + 1
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+
+    uint64_t xss[8];  // offsets of the 8 target-bit combinations
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];  // bit k of i selects target k
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;  // mark every control qubit position
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // presumably scatters the bits of cmask onto the control positions — confirm in bits::ExpandBits
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;  // every target is added, with no q > 3 test in this variant
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // n and m are not read (the inner loop below redeclares n); v is the gate matrix
+                const uint64_t* ms, const uint64_t* xss,  // ms is not read in this body
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // ru / iu: broadcast matrix entry; rn / in: accumulators
+      __m512 rs[8], is[8];  // real / imaginary inputs for the 8 target combinations
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts follow them
+      }
+
+      uint64_t j = 0;  // running float index into v
+
+      for (unsigned l = 0; l < 8; ++l) {  // one output block per l, consuming 8 consecutive entries of v
+        ru = _mm512_set1_ps(v[j]);  // same entry in all 16 lanes
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);  // complex product: rn = rs * ru - is * iu
+        in = _mm512_mul_ps(rs[0], iu);  // in = rs * iu + is * ru
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // accumulate the remaining inputs
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // safe to overwrite: all inputs are already held in rs / is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7 + cqs.size();  // index bits not iterated over: 4 lane bits, 3 targets, and the controls
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, matrix, ms, xss,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  void ApplyControlledGate3HHH_L(const std::vector<unsigned>& qs,  // Applies a controlled 3-qubit gate in place; targets use memory offsets, controls with q <= 3 are handled per lane.
+                                 const std::vector<unsigned>& cqs,  // cqs: control qubit indices, split below into low (q <= 3) and high (q > 3).
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: interleaved (re, im) gate entries.
+                                 State& state) const {  // state: amplitudes, read and overwritten through state.get().
+    uint64_t xs[3];  // float offset for setting each target bit
+    uint64_t ms[4];  // bit-range masks; passed to f below but not read there
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 floats per amplitude, hence the + 1
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+
+    uint64_t xss[8];  // offsets of the 8 target-bit combinations
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];  // bit k of i selects target k
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;  // number of low controls
+    uint64_t emaskl = 0;  // positions of low controls (lane bits)
+    uint64_t emaskh = 0;  // positions of high controls
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // bits of cmask above the lowest cl go to the high controls; assumes cmask lists low controls first — TODO confirm
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // lowest cl bits of cmask go to the low controls
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;  // every target is added, with no q > 3 test in this variant
+    }
+
+    emaskh = ~emaskh ^ 15;  // complement the mask; the XOR with 15 restores bits 0-3 to their pre-complement values
+
+    unsigned p[16];  // scratch: one matrix index per lane
+
+    auto s = StateSpace::Create(10);  // scratch buffer for w; NOTE(review): presumably 2^10 amplitudes = 2048 floats, which is what the fill below writes — confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // float view of the same buffer
+
+    unsigned qmask = (1 << qs[0]);  // NOTE(review): 1 is an int, so this shift is undefined for qs[0] >= 32; qmask only feeds CompressBits(j, 4, ...) below — confirm the range of qs[0]
+
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably 0 whenever qmask has no bit below 4 — confirm in bits::CompressBits
+          p[j] = 2 * (8 * i + 8 * k + m);  // float index of matrix entry 8 * (i + k) + m
+        }
+
+        unsigned l = 2 * (8 * i + m);  // w[l] holds real parts, w[l + 1] imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // identity entry: 1 where entry / 8 == entry % 8, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // lanes whose low control bits match get the gate; other lanes get the identity
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // the identity has zero imaginary part
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (the inner loop below redeclares n); i is the iteration index
+                const uint64_t* ms, const uint64_t* xss,  // ms is not read in this body
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the new real / imaginary parts
+      __m512 rs[8], is[8];  // real / imaginary inputs for the 8 target combinations
+
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;  // presumably spreads i over the bits set in emaskh, then ORs in the high control values
+      auto p0 = rstate + 2 * k;  // two floats per amplitude
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 real parts
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // the 16 imaginary parts follow them
+      }
+
+      uint64_t j = 0;  // running index into w
+
+      for (unsigned l = 0; l < 8; ++l) {  // one output block per l
+        rn = _mm512_mul_ps(rs[0], w[j]);  // complex product, first term: rn = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // in = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // fnmadd(a, b, c) = c - a * b
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // accumulate the remaining inputs
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // safe to overwrite: all inputs are already held in rs / is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7 + cqs.size() - cl;  // index bits not iterated over: 4 lane bits, 3 targets, and the high controls only
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // clamp so the shift below is never negative
+    uint64_t size = uint64_t{1} << n;  // number of iterations
+
+    for_.Run(size, f, w, ms, xss,  // NOTE(review): presumably calls f once per i in [0, size) with these trailing arguments — confirm in for_'s type
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  void ApplyControlledGate3HHL_H(const std::vector<unsigned>& qs,  // Applies the 8x8 complex gate `matrix` to `state` in place.
+                                 const std::vector<unsigned>& cqs,  // qs: gate qubits (qs[0] handled inside a register, qs[1] and qs[2] through offsets); cqs: control qubits.
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                 State& state) const {
+    uint64_t xs[2];
+    uint64_t ms[3];
+    // xs[i]: offset in fp_type units for gate qubit qs[i + 1] (amplitude index times 2, cf. p0 below).
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // ms is handed to the kernel below but never read there.
+    // xss[i]: sum of xs[k] over the bits k set in i (one offset per bit combination).
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+    // Collect the bit positions of all control qubits.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Presumably spreads the bits of cmask onto the control positions -- confirm against bits::ExpandBits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add the gate qubits above bit 3.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[1];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(9);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bit of qs[0] within the 16-lane index.
+    unsigned qmask = (1 << qs[0]);
+    // Lane permutation for _mm512_permutexvar_ps; MaskedAdd is defined elsewhere (presumably adds i + 1 inside the qmask bits).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // lane bits outside qmask pass through unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for every (i, m) pair, 16 real parts followed by 16 imaginary parts, one matrix entry per lane.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Base pointer: expand i over emaskh, OR in the control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Even slots: 16 real and 16 imaginary floats per offset; odd slots: the lane-permuted copy.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Work size is 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 6 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate3HHL_L(const std::vector<unsigned>& qs,  // Applies the 8x8 complex gate `matrix` to `state` in place; controls at bits <= 3 are folded into the weight table.
+                                 const std::vector<unsigned>& cqs,  // qs: gate qubits (qs[0] handled inside a register, qs[1] and qs[2] through offsets); cqs: control qubits.
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                 State& state) const {
+    uint64_t xs[2];
+    uint64_t ms[3];
+    // xs[i]: offset in fp_type units for gate qubit qs[i + 1] (amplitude index times 2, cf. p0 below).
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // ms is handed to the kernel below but never read there.
+    // xss[i]: sum of xs[k] over the bits k set in i (one offset per bit combination).
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+    // Split the controls: q > 3 go to emaskh, the rest to emaskl; cl counts the latter.
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are paired with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add the gate qubits above bit 3.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[1];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(9);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bit of qs[0] within the 16-lane index.
+    unsigned qmask = (1 << qs[0]);
+    // Lane permutation for _mm512_permutexvar_ps; MaskedAdd is defined elsewhere (presumably adds i + 1 inside the qmask bits).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // lane bits outside qmask pass through unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: lanes whose emaskl bits equal cmaskl get the matrix entry, all other lanes get the identity entry.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Base pointer: expand i over emaskh, OR in the high control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Even slots: 16 real and 16 imaginary floats per offset; odd slots: the lane-permuted copy.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Only controls above bit 3 shrink the work size: 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 6 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate3HLL_H(const std::vector<unsigned>& qs,  // Applies the 8x8 complex gate `matrix` to `state` in place.
+                                 const std::vector<unsigned>& cqs,  // qs: gate qubits (qs[0] and qs[1] handled inside a register, qs[2] through an offset); cqs: control qubits.
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                 State& state) const {
+    uint64_t xs[1];
+    uint64_t ms[2];
+    // xs[0]: offset in fp_type units for gate qubit qs[2] (amplitude index times 2, cf. p0 below).
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // ms is handed to the kernel below but never read there.
+    // xss[i]: sum of xs[k] over the bits k set in i, i.e. {0, xs[0]}.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+    // Collect the bit positions of all control qubits.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Presumably spreads the bits of cmask onto the control positions -- confirm against bits::ExpandBits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add the gate qubits above bit 3.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[3];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(8);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of qs[0] and qs[1] within the 16-lane index.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // Three lane permutations for _mm512_permutexvar_ps; MaskedAdd is defined elsewhere (presumably adds i + 1 inside the qmask bits).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // lane bits outside qmask pass through unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for every (i, m) pair, 16 real parts followed by 16 imaginary parts, one matrix entry per lane.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Base pointer: expand i over emaskh, OR in the control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Slots 0 and 4: 16 real and 16 imaginary floats per offset; the following three slots: lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Work size is 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 5 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate3HLL_L(const std::vector<unsigned>& qs,  // Applies the 8x8 complex gate `matrix` to `state` in place; controls at bits <= 3 are folded into the weight table.
+                                 const std::vector<unsigned>& cqs,  // qs: gate qubits (qs[0] and qs[1] handled inside a register, qs[2] through an offset); cqs: control qubits.
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                 State& state) const {
+    uint64_t xs[1];
+    uint64_t ms[2];
+    // xs[0]: offset in fp_type units for gate qubit qs[2] (amplitude index times 2, cf. p0 below).
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // ms is handed to the kernel below but never read there.
+    // xss[i]: sum of xs[k] over the bits k set in i, i.e. {0, xs[0]}.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+    // Split the controls: q > 3 go to emaskh, the rest to emaskl; cl counts the latter.
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are paired with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add the gate qubits above bit 3.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[3];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(8);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of qs[0] and qs[1] within the 16-lane index.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // Three lane permutations for _mm512_permutexvar_ps; MaskedAdd is defined elsewhere (presumably adds i + 1 inside the qmask bits).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // lane bits outside qmask pass through unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: lanes whose emaskl bits equal cmaskl get the matrix entry, all other lanes get the identity entry.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Base pointer: expand i over emaskh, OR in the high control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Slots 0 and 4: 16 real and 16 imaginary floats per offset; the following three slots: lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Only controls above bit 3 shrink the work size: 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 5 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate3LLL_H(const std::vector<unsigned>& qs,  // Applies the 8x8 complex gate `matrix` to `state` in place.
+                                 const std::vector<unsigned>& cqs,  // qs: gate qubits, all three handled inside a register via lane permutations; cqs: control qubits.
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                 State& state) const {
+    uint64_t emaskh = 0;
+    // Collect the bit positions of all control qubits.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Presumably spreads the bits of cmask onto the control positions -- confirm against bits::ExpandBits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add any gate qubit above bit 3.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[7];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(7);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the three gate qubits within the 16-lane index.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // Seven lane permutations for _mm512_permutexvar_ps; MaskedAdd is defined elsewhere (presumably adds i + 1 inside the qmask bits).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));  // lane bits outside qmask pass through unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for every m, 16 real parts followed by 16 imaginary parts, one matrix entry per lane.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Base pointer: expand i over emaskh, OR in the control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Slot 0: 16 real and 16 imaginary floats at p0; slots 1..7: lane-permuted copies of slot 0.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0);
+        is[8 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Work size is 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 4 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate3LLL_L(const std::vector<unsigned>& qs,  // Applies the 8x8 complex gate `matrix` to `state` in place; controls at bits <= 3 are folded into the weight table.
+                                 const std::vector<unsigned>& cqs,  // qs: gate qubits, all three handled inside a register via lane permutations; cqs: control qubits.
+                                 uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                 State& state) const {
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+    // Split the controls: q > 3 go to emaskh, the rest to emaskl; cl counts the latter.
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are paired with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add any gate qubit above bit 3.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[7];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(7);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the three gate qubits within the 16-lane index.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // Seven lane permutations for _mm512_permutexvar_ps; MaskedAdd is defined elsewhere (presumably adds i + 1 inside the qmask bits).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));  // lane bits outside qmask pass through unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: lanes whose emaskl bits equal cmaskl get the matrix entry, all other lanes get the identity entry.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Base pointer: expand i over emaskh, OR in the high control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Slot 0: 16 real and 16 imaginary floats at p0; slots 1..7: lane-permuted copies of slot 0.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0);
+        is[8 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Only controls above bit 3 shrink the work size: 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 4 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4HHHH_H(const std::vector<unsigned>& qs,  // Applies the 16x16 complex gate `matrix` to `state` in place.
+                                  const std::vector<unsigned>& cqs,  // qs: four gate qubits, each reached through a memory offset; cqs: control qubits.
+                                  uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                  State& state) const {
+    uint64_t xs[4];
+    uint64_t ms[5];
+    // xs[i]: offset in fp_type units for gate qubit qs[i] (amplitude index times 2, cf. p0 below).
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);  // ms is handed to the kernel below but never read there.
+    // xss[i]: sum of xs[k] over the bits k set in i (one offset per bit combination).
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+    // Collect the bit positions of all control qubits.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Presumably spreads the bits of cmask onto the control positions -- confirm against bits::ExpandBits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add every gate qubit.
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // n and m are not read (n is shadowed below); i is the work index; v is the gate matrix.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[16], is[16];
+      // Base pointer: expand i over emaskh, OR in the control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 16 real and 16 imaginary floats for each of the 16 offsets.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate with each matrix entry broadcast to all lanes: rn = sum(rs * ru - is * iu), in = sum(rs * iu + is * ru).
+      for (unsigned l = 0; l < 16; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Work size is 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 8 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, matrix, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  void ApplyControlledGate4HHHH_L(const std::vector<unsigned>& qs,  // Applies the 16x16 complex gate `matrix` to `state` in place; controls at bits <= 3 are folded into the weight table.
+                                  const std::vector<unsigned>& cqs,  // qs: four gate qubits, each reached through a memory offset; cqs: control qubits.
+                                  uint64_t cmask, const fp_type* matrix,  // cmask: packed control-bit values; matrix: interleaved (re, im) entries.
+                                  State& state) const {
+    uint64_t xs[4];
+    uint64_t ms[5];
+    // xs[i]: offset in fp_type units for gate qubit qs[i] (amplitude index times 2, cf. p0 below).
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);  // ms is handed to the kernel below but never read there.
+    // xss[i]: sum of xs[k] over the bits k set in i (one offset per bit combination).
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+    // Split the controls: q > 3 go to emaskh, the rest to emaskl; cl counts the latter.
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are paired with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add every gate qubit.
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Complement the mask and flip its four lowest bits; the kernel expands the work index over the result.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    // Buffer for the per-lane weight table w, obtained from StateSpace::Create.
+    auto s = StateSpace::Create(12);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): qs[0] presumably lies above the 4 lane bits here, which would make k below always 0 -- confirm against bits::CompressBits.
+    unsigned qmask = (1 << qs[0]);
+    // Fill w: lanes whose emaskl bits equal cmaskl get the matrix entry, all other lanes get the identity entry.
+    for (unsigned i = 0; i < 16; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 16 * k + m);  // position of the real part inside matrix
+        }
+
+        unsigned l = 2 * (16 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;  // 1 when row == column of the 16x16 matrix, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // n and m are not read (n is shadowed below); i is the work index.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // Base pointer: expand i over emaskh, OR in the high control bits, scale by 2.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 16 real and 16 imaginary floats for each of the 16 offsets.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate: rn = sum(rs * wr - is * wi), in = sum(rs * wi + is * wr), with wr = w[j], wi = w[j + 1].
+      for (unsigned l = 0; l < 16; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Only controls above bit 3 shrink the work size: 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 8 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // for_ is expected to call f(n, m, i, <remaining args>) for i in [0, size) -- confirm against its definition.
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, rstate);
+  }
+
+  void ApplyControlledGate4HHHL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t xs[3];  // fp_type offsets for qs[1..3]: index bit q -> 2^(q + 1)
+    uint64_t ms[4];  // masks of the bits between those qubits; f never reads ms
+    // NOTE(review): the name suggests qs[1..3] > 3 >= qs[0]; confirm with caller.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i]: combined offset for the subset of xs selected by the bits of i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // emaskh starts as the set of control-qubit bits.
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // NOTE(review): ExpandBits presumably deposits cmask onto the emaskh bits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3 (they end up clear if no control is < 4).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[1];  // lane permutation used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 8 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(11);  // presumably 2^11 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bit of the lane index (0..15) that corresponds to qs[0].
+    unsigned qmask = (1 << qs[0]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bit
+          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        // p[j] / 2 == 16 * (2 * i + k) + (2 * (m / 2) + (k + m) % 2).
+        unsigned l = 2 * (16 * i + m);
+        // Real parts fill w[l] (16 floats) ...
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // ... and imaginary parts fill w[l + 1].
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 8 (re, im) vector pairs; odd slots hold lane-permuted copies.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // Each output l: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs/is[2 * l].
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Index bits not iterated: 4 (16 lanes) + 3 (8 offsets) + one per control.
+    unsigned k = 7 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4HHHL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t xs[3];  // fp_type offsets for qs[1..3]: index bit q -> 2^(q + 1)
+    uint64_t ms[4];  // masks of the bits between those qubits; f never reads ms
+    // NOTE(review): the name suggests qs[1..3] > 3 >= qs[0]; confirm with caller.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i]: combined offset for the subset of xs selected by the bits of i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Controls are split: q > 3 goes to emaskh, the rest (counted in cl) to emaskl.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // cmask's low cl bits pair with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3; they end up clear (emaskh only got q > 3).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[1];  // lane permutation used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 8 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(11);  // presumably 2^11 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bit of the lane index (0..15) that corresponds to qs[0].
+    unsigned qmask = (1 << qs[0]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bit
+          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        // p[j] / 2 == 16 * (2 * i + k) + (2 * (m / 2) + (k + m) % 2).
+        unsigned l = 2 * (16 * i + m);
+        // Lanes with (j & emaskl) != cmaskl get identity entries (v: 1 on the diagonal)
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // ... with a zero imaginary part; matching lanes get the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the high control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 8 (re, im) vector pairs; odd slots hold lane-permuted copies.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // Each output l: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs/is[2 * l].
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Not iterated: 4 lane bits + 3 offset bits + one per control above 3.
+    unsigned k = 7 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4HHLL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t xs[2];  // fp_type offsets for qs[2], qs[3]: index bit q -> 2^(q + 1)
+    uint64_t ms[3];  // masks of the bits between those qubits; f never reads ms
+    // NOTE(review): the name suggests qs[2..3] > 3 >= qs[0..1]; confirm with caller.
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: combined offset for the subset of xs selected by the bits of i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // emaskh starts as the set of control-qubit bits.
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // NOTE(review): ExpandBits presumably deposits cmask onto the emaskh bits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3 (they end up clear if no control is < 4).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[3];  // lane permutations used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 4 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(10);  // presumably 2^10 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the lane index (0..15) that correspond to qs[0] and qs[1].
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bits
+          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+        // p[j] / 2 == 16 * (4 * i + k) + (4 * (m / 4) + (k + m) % 4).
+        unsigned l = 2 * (16 * i + m);
+        // Real parts fill w[l] (16 floats) ...
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // ... and imaginary parts fill w[l + 1].
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 4 (re, im) vector pairs; slots 4*l+1..4*l+3 hold lane-permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // Each output l: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs/is[4 * l].
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Index bits not iterated: 4 (16 lanes) + 2 (4 offsets) + one per control.
+    unsigned k = 6 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4HHLL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t xs[2];  // fp_type offsets for qs[2], qs[3]: index bit q -> 2^(q + 1)
+    uint64_t ms[3];  // masks of the bits between those qubits; f never reads ms
+    // NOTE(review): the name suggests qs[2..3] > 3 >= qs[0..1]; confirm with caller.
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: combined offset for the subset of xs selected by the bits of i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Controls are split: q > 3 goes to emaskh, the rest (counted in cl) to emaskl.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // cmask's low cl bits pair with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3; they end up clear (emaskh only got q > 3).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[3];  // lane permutations used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 4 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(10);  // presumably 2^10 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the lane index (0..15) that correspond to qs[0] and qs[1].
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bits
+          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+        // p[j] / 2 == 16 * (4 * i + k) + (4 * (m / 4) + (k + m) % 4).
+        unsigned l = 2 * (16 * i + m);
+        // Lanes with (j & emaskl) != cmaskl get identity entries (v: 1 on the diagonal)
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // ... with a zero imaginary part; matching lanes get the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the high control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 4 (re, im) vector pairs; slots 4*l+1..4*l+3 hold lane-permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // Each output l: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs/is[4 * l].
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Not iterated: 4 lane bits + 2 offset bits + one per control above 3.
+    unsigned k = 6 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4HLLL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t xs[1];  // fp_type offset for qs[3]: index bit q -> 2^(q + 1)
+    uint64_t ms[2];  // masks of the bits below/above qs[3]; f never reads ms
+    // NOTE(review): the name suggests qs[3] > 3 >= qs[0..2]; confirm with caller.
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss = {0, xs[0]}: the two offsets selected by the single high target bit.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // emaskh starts as the set of control-qubit bits.
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // NOTE(review): ExpandBits presumably deposits cmask onto the emaskh bits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3 (they end up clear if no control is < 4).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[7];  // lane permutations used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 2 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(9);  // presumably 2^9 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the lane index (0..15) that correspond to qs[0], qs[1] and qs[2].
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bits
+          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+        // p[j] / 2 == 16 * (8 * i + k) + (8 * (m / 8) + (k + m) % 8).
+        unsigned l = 2 * (16 * i + m);
+        // Real parts fill w[l] (16 floats) ...
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // ... and imaginary parts fill w[l + 1].
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 2 (re, im) vector pairs; slots 8*l+1..8*l+7 hold lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // Each output l: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs/is[8 * l].
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Index bits not iterated: 4 (16 lanes) + 1 (2 offsets) + one per control.
+    unsigned k = 5 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4HLLL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t xs[1];  // fp_type offset for qs[3]: index bit q -> 2^(q + 1)
+    uint64_t ms[2];  // masks of the bits below/above qs[3]; f never reads ms
+    // NOTE(review): the name suggests qs[3] > 3 >= qs[0..2]; confirm with caller.
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss = {0, xs[0]}: the two offsets selected by the single high target bit.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Controls are split: q > 3 goes to emaskh, the rest (counted in cl) to emaskl.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // cmask's low cl bits pair with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3; they end up clear (emaskh only got q > 3).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[7];  // lane permutations used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 2 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(9);  // presumably 2^9 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the lane index (0..15) that correspond to qs[0], qs[1] and qs[2].
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bits
+          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+        // p[j] / 2 == 16 * (8 * i + k) + (8 * (m / 8) + (k + m) % 8).
+        unsigned l = 2 * (16 * i + m);
+        // Lanes with (j & emaskl) != cmaskl get identity entries (v: 1 on the diagonal)
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // ... with a zero imaginary part; matching lanes get the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the high control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load 2 (re, im) vector pairs; slots 8*l+1..8*l+7 hold lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // Each output l: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs/is[8 * l].
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Not iterated: 4 lane bits + 1 offset bit + one per control above 3.
+    unsigned k = 5 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4LLLL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    uint64_t emaskh = 0;  // starts as the set of control-qubit bits
+    // NOTE(review): the name suggests all of qs[0..3] are <= 3; confirm with caller.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // NOTE(review): ExpandBits presumably deposits cmask onto the emaskh bits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3 (they end up clear if no control is < 4).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[15];  // lane permutations used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 1 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(8);  // presumably 2^8 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the lane index (0..15) that correspond to the four target qubits.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bits
+          p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16);
+        }
+        // p[j] / 2 == 16 * (16 * i + k) + (16 * (m / 16) + (k + m) % 16).
+        unsigned l = 2 * (16 * i + m);
+        // Real parts fill w[l] (16 floats) ...
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // ... and imaginary parts fill w[l + 1].
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load one (re, im) vector pair; slots 1..15 hold lane-permuted copies.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0);
+        is[16 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // The output: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs[0]/is[0].
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Index bits not iterated: 4 (16 lanes) + one per control.
+    unsigned k = 4 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  void ApplyControlledGate4LLLL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  State& state) const {  // Updates state in place.
+    unsigned cl = 0;  // number of control qubits with q <= 3
+    uint64_t emaskl = 0;  // bits of the controls with q <= 3
+    uint64_t emaskh = 0;  // bits of the controls with q > 3 (targets added later)
+    // NOTE(review): the name suggests all of qs[0..3] are <= 3; confirm with caller.
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // cmask's low cl bits pair with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Add every target qubit above 3 to the mask.
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then toggle bits 0..3; they end up clear (emaskh only got q > 3).
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];  // scratch: one index per lane
+    __m512i idx[15];  // lane permutations used with _mm512_permutexvar_ps
+    // Scratch buffer for the per-lane weights; 1 * 16 * 2 * 16 floats are written.
+    auto s = StateSpace::Create(8);  // presumably 2^8 amplitudes - TODO confirm
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Bits of the lane index (0..15) that correspond to the four target qubits.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // p[j]: source lane for output lane j. NOTE(review): MaskedAdd is defined elsewhere.
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes elements 15..0, so p[j] lands in element j.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Expand the 16x16 matrix (interleaved re/im) into 16-lane weight vectors.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably j's qmask bits
+          p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16);
+        }
+        // p[j] / 2 == 16 * (16 * i + k) + (16 * (m / 16) + (k + m) % 16).
+        unsigned l = 2 * (16 * i + m);
+        // Lanes with (j & emaskl) != cmaskl get identity entries (v: 1 on the diagonal)
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // ... with a zero imaginary part; matching lanes get the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel for one value of i; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // k: base amplitude index, with the high control bits ORed in via cmaskh.
+      uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh;
+      auto p0 = rstate + 2 * k;
+      // Load one (re, im) vector pair; slots 1..15 hold lane-permuted copies.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0);
+        is[16 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+      // j indexes w, advancing by one (re, im) weight pair per product.
+      uint64_t j = 0;
+      // The output: sum over 16 inputs of (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Store over the location that was loaded into rs[0]/is[0].
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // Not iterated: 4 lane bits + one per control above 3.
+    unsigned k = 4 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue1H(const std::vector<unsigned>& qs,
+                                          const fp_type* matrix,
+                                          const State& state) const {  // state is only read
+    uint64_t xs[1];  // fp_type offset for qs[0]: index bit q -> 2^(q + 1)
+    uint64_t ms[2];  // ms[0]: index bits below qs[0]; ms[1]: bits above it
+    // NOTE(review): the name suggests qs[0] > 3 (a "high" qubit); confirm with caller.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss = {0, xs[0]}: offsets of the two vectors combined by the 2x2 matrix.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Kernel returning one partial sum; parameters n and m are not read (n is shadowed).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[2], is[2];
+      // Spread 16 * i around bit qs[0], leaving that bit zero in k.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);
+
+      auto p0 = rstate + 2 * k;
+      // rs/is[l]: real and imaginary vectors at p0 + xss[l].
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      double re = 0;  // partial sums are accumulated in double
+      double im = 0;
+      // v is read as 2x2 complex entries, interleaved (re, im), row by row.
+      uint64_t j = 0;
+      // For each row l: (rn, in) = sum over n of v[l][n] * (rs[n] + i*is[n]).
+      for (unsigned l = 0; l < 2; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // conj(rs[l] + i*is[l]) * (rn + i*in): re = rs*rn + is*in, im = rs*in - is*rn.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // NOTE(review): HorizontalSumAVX512 presumably adds up the 16 lanes.
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // Index bits not iterated: 4 (16 lanes) + 1 (the qubit qs[0]).
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Partial results from f are combined with std::plus by for_.RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate);
+  }
+
+  std::complex<double> ExpectationValue1L(const std::vector<unsigned>& qs,
+                                          const fp_type* matrix,
+                                          const State& state) const {
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[1];  // lane permutation applied to the loaded vectors
+    // One-qubit kernel: qs[0] is handled by permuting lanes in registers.
+    auto s = StateSpace::Create(5);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(5) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (2 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used by the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[2], is[2];
+      // Iteration i reads floats [32 * i, 32 * i + 32): 16 as re, 16 as im.
+      auto p0 = rstate + 32 * i;
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0);
+        is[2 * l] = _mm512_load_ps(p0 + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Multiply by the conjugate of (rs[l], is[l]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 4) iterations, or a single one for 4 qubits or fewer.
+    unsigned k = 4;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue2HH(const std::vector<unsigned>& qs,
+                                           const fp_type* matrix,
+                                           const State& state) const {
+    uint64_t xs[2];  // per-qubit float offsets, combined into xss below
+    uint64_t ms[3];  // masks for the index bit ranges between the qubits
+    // Two-qubit kernel: qs[0] and qs[1] are reached by pointer offsets.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Per-iteration body for RunReduce; n and m are not used by the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[4], is[4];
+      // Spread i so the four lowest bits and bits qs[0], qs[1] of k are zero
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);
+      // (this relies on qs[0] < qs[1]; presumably guaranteed by the caller).
+      auto p0 = rstate + 2 * k;
+      // Each block: 16 floats used as real parts, then 16 as imaginary.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running float index into v, two floats per entry
+      // Row l: (rn, in) = sum over n of (rs[n], is[n]) * (v[j], v[j + 1]).
+      for (unsigned l = 0; l < 4; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // shadows parameter n
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // Multiply by the conjugate of (rs[l], is[l]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 6) iterations, or a single one for 6 qubits or fewer.
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate);
+  }
+
+  std::complex<double> ExpectationValue2HL(const std::vector<unsigned>& qs,
+                                           const fp_type* matrix,
+                                           const State& state) const {
+    uint64_t xs[1];  // float distance between the two blocks read per step
+    uint64_t ms[2];  // masks for the index bits below and above qs[1]
+    // Two-qubit kernel: qs[1] by pointer offset, qs[0] by lane permutation.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[1];  // lane permutation applied to the loaded vectors
+
+    auto s = StateSpace::Create(7);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(7) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (4 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used as parameters.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[4], is[4];
+      // Spread i so that bit qs[1] and the four lowest bits of k are zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);
+
+      auto p0 = rstate + 2 * k;
+      // rs/is[2 * l] are loaded; rs/is[2 * l + 1] are lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 2 * l;  // unpermuted vector of block l; shadows m
+        // Multiply by the conjugate of (rs[m], is[m]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 5) iterations, or a single one for 5 qubits or fewer.
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue2LL(const std::vector<unsigned>& qs,
+                                           const fp_type* matrix,
+                                           const State& state) const {
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[3];  // lane permutations applied to the loaded vectors
+    // Two-qubit kernel: qs[0], qs[1] are handled by permuting lanes.
+    auto s = StateSpace::Create(6);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(6) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (4 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used by the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[4], is[4];
+      // Iteration i reads floats [32 * i, 32 * i + 32): 16 as re, 16 as im.
+      auto p0 = rstate + 32 * i;
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);
+        is[4 * l] = _mm512_load_ps(p0 + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Multiply by the conjugate of (rs[l], is[l]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 4) iterations, or a single one for 4 qubits or fewer.
+    unsigned k = 4;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue3HHH(const std::vector<unsigned>& qs,
+                                            const fp_type* matrix,
+                                            const State& state) const {
+    uint64_t xs[3];  // per-qubit float offsets, combined into xss below
+    uint64_t ms[4];  // masks for the index bit ranges between the qubits
+    // Three-qubit kernel: qs[0..2] are all reached by pointer offsets.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Per-iteration body for RunReduce; n and m are not used by the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[8], is[8];
+      // Spread i so the four lowest bits and bits qs[0..2] of k are zero
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]);
+      // (this relies on ascending qs; presumably guaranteed by the caller).
+      auto p0 = rstate + 2 * k;
+      // Each block: 16 floats used as real parts, then 16 as imaginary.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running float index into v, two floats per entry
+      // Row l: (rn, in) = sum over n of (rs[n], is[n]) * (v[j], v[j + 1]).
+      for (unsigned l = 0; l < 8; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // shadows parameter n
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // Multiply by the conjugate of (rs[l], is[l]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 7) iterations, or a single one for 7 qubits or fewer.
+    unsigned k = 7;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate);
+  }
+
+  std::complex<double> ExpectationValue3HHL(const std::vector<unsigned>& qs,
+                                            const fp_type* matrix,
+                                            const State& state) const {
+    uint64_t xs[2];  // float offsets for qs[1] and qs[2], combined in xss
+    uint64_t ms[3];  // masks for the index bit ranges around qs[1], qs[2]
+    // Three-qubit kernel: qs[1], qs[2] by pointer offset, qs[0] by lanes.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[1];  // lane permutation applied to the loaded vectors
+
+    auto s = StateSpace::Create(9);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(9) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used as parameters.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Spread i so the four lowest bits and bits qs[1], qs[2] of k are zero
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);
+      // (this relies on qs[1] < qs[2]; presumably guaranteed by the caller).
+      auto p0 = rstate + 2 * k;
+      // rs/is[2 * l] are loaded; rs/is[2 * l + 1] are lane-permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 2 * l;  // unpermuted vector of block l; shadows m
+        // Multiply by the conjugate of (rs[m], is[m]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 6) iterations, or a single one for 6 qubits or fewer.
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue3HLL(const std::vector<unsigned>& qs,
+                                            const fp_type* matrix,
+                                            const State& state) const {
+    uint64_t xs[1];  // float distance between the two blocks read per step
+    uint64_t ms[2];  // masks for the index bits below and above qs[2]
+    // Three-qubit kernel: qs[2] by pointer offset, qs[0], qs[1] by lanes.
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[3];  // lane permutations applied to the loaded vectors
+
+    auto s = StateSpace::Create(8);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(8) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used as parameters.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Spread i so that bit qs[2] and the four lowest bits of k are zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);
+
+      auto p0 = rstate + 2 * k;
+      // rs/is[4 * l] are loaded; the next three are lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 4 * l;  // unpermuted vector of block l; shadows m
+        // Multiply by the conjugate of (rs[m], is[m]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 5) iterations, or a single one for 5 qubits or fewer.
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue3LLL(const std::vector<unsigned>& qs,
+                                            const fp_type* matrix,
+                                            const State& state) const {
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[7];  // lane permutations applied to the loaded vectors
+    // Three-qubit kernel: qs[0..2] are all handled by permuting lanes.
+    auto s = StateSpace::Create(7);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(7) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used by the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Iteration i reads floats [32 * i, 32 * i + 32): 16 as re, 16 as im.
+      auto p0 = rstate + 32 * i;
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0);
+        is[8 * l] = _mm512_load_ps(p0 + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Multiply by the conjugate of (rs[l], is[l]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 4) iterations, or a single one for 4 qubits or fewer.
+    unsigned k = 4;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue4HHHH(const std::vector<unsigned>& qs,
+                                             const fp_type* matrix,
+                                             const State& state) const {
+    uint64_t xs[4];  // per-qubit float offsets, combined into xss below
+    uint64_t ms[5];  // masks for the index bit ranges between the qubits
+    // Four-qubit kernel: qs[0..3] are all reached by pointer offsets.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Per-iteration body for RunReduce; n and m are not used by the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[16], is[16];
+      // Spread i so the four lowest bits and bits qs[0..3] of k are zero
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]) | (256 * i & ms[4]);
+      // (this relies on ascending qs; presumably guaranteed by the caller).
+      auto p0 = rstate + 2 * k;
+      // Each block: 16 floats used as real parts, then 16 as imaginary.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running float index into v, two floats per entry
+      // Row l: (rn, in) = sum over n of (rs[n], is[n]) * (v[j], v[j + 1]).
+      for (unsigned l = 0; l < 16; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // shadows parameter n
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // Multiply by the conjugate of (rs[l], is[l]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 8) iterations, or a single one for 8 qubits or fewer.
+    unsigned k = 8;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate);
+  }
+
+  std::complex<double> ExpectationValue4HHHL(const std::vector<unsigned>& qs,
+                                             const fp_type* matrix,
+                                             const State& state) const {
+    uint64_t xs[3];  // float offsets for qs[1..3], combined into xss below
+    uint64_t ms[4];  // masks for the index bit ranges around qs[1..3]
+    // Four-qubit kernel: qs[1..3] by pointer offset, qs[0] by lanes.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i] is the sum of xs[k] over the bits k that are set in i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per vector lane
+    __m512i idx[1];  // lane permutation applied to the loaded vectors
+
+    auto s = StateSpace::Create(11);  // w and wf below point into s
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): assumes Create(11) is large enough for w (to confirm).
+    unsigned qmask = (1 << qs[0]);
+    // idx[i] lane j = p[j]; MaskedAdd is defined elsewhere (not shown here).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w with per-lane matrix entries; CompressBits is defined elsewhere.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        // w[l] takes matrix[p[j]] and w[l + 1] takes matrix[p[j] + 1].
+        unsigned l = 2 * (16 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Per-iteration body for RunReduce; n and m are not used as parameters.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // Spread i so the four lowest bits and bits qs[1..3] of k are zero
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]);
+      // (this relies on ascending qs; presumably guaranteed by the caller).
+      auto p0 = rstate + 2 * k;
+      // rs/is[2 * l] are loaded; rs/is[2 * l + 1] are lane-permuted copies.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        // Lane j of each copy is lane idx[.][j] of the loaded vector.
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // index of the next (re, im) vector pair in w
+      // (rn, in) = sum over n of (rs[n], is[n]) * (w[j], w[j + 1]), per lane.
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // shadows parameter n
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 2 * l;  // unpermuted vector of block l; shadows m
+        // Multiply by the conjugate of (rs[m], is[m]), lane by lane.
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+        // HorizontalSumAVX512 presumably adds the 16 lanes (to confirm).
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // 2^(num_qubits - 7) iterations, or a single one for 7 qubits or fewer.
+    unsigned k = 7;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-iteration results are combined with std::plus by RunReduce.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue4HHLL(const std::vector<unsigned>& qs,
+                                             const fp_type* matrix,
+                                             const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[2];  // offsets (in fp_type elements) 2^(qs[2]+1) and 2^(qs[3]+1)
+    uint64_t ms[3];  // bit masks that leave out positions qs[2] and qs[3] (assumes qs[2] < qs[3] - confirm)
+    // Only qs[2] and qs[3] feed xs/ms; qs[0] and qs[1] are used for qmask further down.
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i] is the sum of xs[k] over the bits set in i, i.e. all 4 combinations of the two offsets.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[3];  // lane permutations consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 128 __m512 are written below (assumes Create(10) is large enough - confirm).
+    auto s = StateSpace::Create(10);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i]: lane j reads lane p[j]; bits of j outside qmask are kept, qmask bits come from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each (i, m), one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1].
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+
+        unsigned l = 2 * (16 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read, inner scopes shadow them.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 4 vector pairs (16 floats at p0 + xss[l], next 16 floats right after) and derive 3 permuted copies of each.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[m] + i*is[m]) * (rn + i*in); the lane sums are added to re and im in double.
+        unsigned m = 4 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 6), or 1 when num_qubits <= 6.
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue4HLLL(const std::vector<unsigned>& qs,
+                                             const fp_type* matrix,
+                                             const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[1];  // offset (in fp_type elements) 2^(qs[3]+1)
+    uint64_t ms[2];  // bit masks that leave out position qs[3]
+    // Only qs[3] feeds xs/ms; qs[0], qs[1] and qs[2] are used for qmask further down.
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[0] = 0 and xss[1] = xs[0].
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[7];  // lane permutations consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 64 __m512 are written below (assumes Create(9) is large enough - confirm).
+    auto s = StateSpace::Create(9);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i]: lane j reads lane p[j]; bits of j outside qmask are kept, qmask bits come from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each (i, m), one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1].
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+
+        unsigned l = 2 * (16 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read, inner scopes shadow them.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 2 vector pairs (16 floats at p0 + xss[l], next 16 floats right after) and derive 7 permuted copies of each.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[m] + i*is[m]) * (rn + i*in); the lane sums are added to re and im in double.
+        unsigned m = 8 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 5), or 1 when num_qubits <= 5.
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue4LLLL(const std::vector<unsigned>& qs,
+                                             const fp_type* matrix,
+                                             const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[15];  // lane permutations consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 32 __m512 are written below (assumes Create(8) is large enough - confirm).
+    auto s = StateSpace::Create(8);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // idx[i]: lane j reads lane p[j]; bits of j outside qmask are kept, qmask bits come from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each m, one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1] (the i loop runs once).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16);
+        }
+
+        unsigned l = 2 * (16 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+      // Each i addresses its own run of 32 consecutive fp_type values.
+      auto p0 = rstate + 32 * i;
+      // Load one vector pair (16 floats at p0, next 16 floats right after) and derive 15 permuted copies of each.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0);
+        is[16 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane; the l loop runs once.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[l] + i*is[l]) * (rn + i*in); the lane sums are added to re and im in double.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 4), or 1 when num_qubits <= 4.
+    unsigned k = 4;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue5HHHHH(const std::vector<unsigned>& qs,
+                                              const fp_type* matrix,
+                                              const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[5];  // offsets (in fp_type elements) 2^(qs[i]+1) for i = 0..4
+    uint64_t ms[6];  // bit masks that leave out positions qs[0..4] (assumes qs is ascending - confirm)
+    // All five entries of qs feed xs/ms; no lane permutation is built in this variant.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 5; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1);
+    // xss[i] is the sum of xs[k] over the bits set in i, i.e. all 32 combinations of the five offsets.
+    uint64_t xss[32];
+    for (unsigned i = 0; i < 32; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 5; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); v is the matrix pointer, n and m are never read.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[32], is[32];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 32 vector pairs: 16 floats at p0 + xss[l] and the next 16 floats right after.
+      for (unsigned l = 0; l < 32; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into v; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (v[j] + i*v[j + 1]), v values broadcast to all lanes.
+      for (unsigned l = 0; l < 32; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[l] + i*is[l]) * (rn + i*in); the lane sums are added to re and im in double.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 9), or 1 when num_qubits <= 9.
+    unsigned k = 9;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus; matrix is passed through unmodified as v.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate);
+  }
+
+  std::complex<double> ExpectationValue5HHHHL(const std::vector<unsigned>& qs,
+                                              const fp_type* matrix,
+                                              const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[4];  // offsets (in fp_type elements) 2^(qs[i]+1) for i = 1..4
+    uint64_t ms[5];  // bit masks that leave out positions qs[1..4] (assumes qs is ascending - confirm)
+    // Only qs[1]..qs[4] feed xs/ms; qs[0] is used for qmask further down.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+    // xss[i] is the sum of xs[k] over the bits set in i, i.e. all 16 combinations of the four offsets.
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[1];  // lane permutation consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 1024 __m512 are written below (assumes Create(13) is large enough - confirm).
+    auto s = StateSpace::Create(13);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+    // idx[0]: lane j reads lane p[j]; bits of j outside qmask are kept, the qmask bit comes from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each (i, m), one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1].
+    for (unsigned i = 0; i < 16; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+
+        unsigned l = 2 * (32 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read, inner scopes shadow them.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]) | (256 * i & ms[4]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 16 vector pairs (16 floats at p0 + xss[l], next 16 floats right after) and derive 1 permuted copy of each.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane.
+      for (unsigned l = 0; l < 16; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[m] + i*is[m]) * (rn + i*in); the lane sums are added to re and im in double.
+        unsigned m = 2 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 8), or 1 when num_qubits <= 8.
+    unsigned k = 8;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue5HHHLL(const std::vector<unsigned>& qs,
+                                              const fp_type* matrix,
+                                              const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[3];  // offsets (in fp_type elements) 2^(qs[i]+1) for i = 2..4
+    uint64_t ms[4];  // bit masks that leave out positions qs[2..4] (assumes qs is ascending - confirm)
+    // Only qs[2]..qs[4] feed xs/ms; qs[0] and qs[1] are used for qmask further down.
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i] is the sum of xs[k] over the bits set in i, i.e. all 8 combinations of the three offsets.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[3];  // lane permutations consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 512 __m512 are written below (assumes Create(12) is large enough - confirm).
+    auto s = StateSpace::Create(12);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i]: lane j reads lane p[j]; bits of j outside qmask are kept, qmask bits come from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each (i, m), one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1].
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+
+        unsigned l = 2 * (32 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read, inner scopes shadow them.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 8 vector pairs (16 floats at p0 + xss[l], next 16 floats right after) and derive 3 permuted copies of each.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane.
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[m] + i*is[m]) * (rn + i*in); the lane sums are added to re and im in double.
+        unsigned m = 4 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 7), or 1 when num_qubits <= 7.
+    unsigned k = 7;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue5HHLLL(const std::vector<unsigned>& qs,
+                                              const fp_type* matrix,
+                                              const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[2];  // offsets (in fp_type elements) 2^(qs[3]+1) and 2^(qs[4]+1)
+    uint64_t ms[3];  // bit masks that leave out positions qs[3] and qs[4] (assumes qs[3] < qs[4] - confirm)
+    // Only qs[3] and qs[4] feed xs/ms; qs[0], qs[1] and qs[2] are used for qmask further down.
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 3] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i] is the sum of xs[k] over the bits set in i, i.e. all 4 combinations of the two offsets.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[7];  // lane permutations consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 256 __m512 are written below (assumes Create(11) is large enough - confirm).
+    auto s = StateSpace::Create(11);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i]: lane j reads lane p[j]; bits of j outside qmask are kept, qmask bits come from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each (i, m), one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1].
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+
+        unsigned l = 2 * (32 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read, inner scopes shadow them.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 4 vector pairs (16 floats at p0 + xss[l], next 16 floats right after) and derive 7 permuted copies of each.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[m] + i*is[m]) * (rn + i*in); the lane sums are added to re and im in double.
+        unsigned m = 8 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 6), or 1 when num_qubits <= 6.
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue5HLLLL(const std::vector<unsigned>& qs,
+                                              const fp_type* matrix,
+                                              const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[1];  // offset (in fp_type elements) 2^(qs[4]+1)
+    uint64_t ms[2];  // bit masks that leave out position qs[4]
+    // Only qs[4] feeds xs/ms; qs[0]..qs[3] are used for qmask further down.
+    xs[0] = uint64_t{1} << (qs[4] + 1);
+    ms[0] = (uint64_t{1} << qs[4]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[0] = 0 and xss[1] = xs[0].
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch: one index per 32-bit lane
+    __m512i idx[15];  // lane permutations consumed by _mm512_permutexvar_ps in f
+    // Scratch buffer viewed as __m512 (w) and as fp_type (wf); 128 __m512 are written below (assumes Create(10) is large enough - confirm).
+    auto s = StateSpace::Create(10);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // idx[i]: lane j reads lane p[j]; bits of j outside qmask are kept, qmask bits come from MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: for each (i, m), one vector of matrix[p[j]] followed by one vector of matrix[p[j] + 1].
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16);
+        }
+
+        unsigned l = 2 * (32 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); its n and m parameters are never read, inner scopes shadow them.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 2 vector pairs (16 floats at p0 + xss[l], next 16 floats right after) and derive 15 permuted copies of each.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into w; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), per lane.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[m] + i*is[m]) * (rn + i*in); the lane sums are added to re and im in double.
+        unsigned m = 16 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 5), or 1 when num_qubits <= 5.
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue6HHHHHH(const std::vector<unsigned>& qs,
+                                               const fp_type* matrix,
+                                               const State& state) const {  // AVX-512 path: returns the for_.RunReduce sum of kernel f below.
+    uint64_t xs[6];  // offsets (in fp_type elements) 2^(qs[i]+1) for i = 0..5
+    uint64_t ms[7];  // bit masks that leave out positions qs[0..5] (assumes qs is ascending - confirm)
+    // All six entries of qs feed xs/ms; no lane permutation is built in this variant.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 6; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1);
+    // xss[i] is the sum of xs[k] over the bits set in i, i.e. all 64 combinations of the six offsets.
+    uint64_t xss[64];
+    for (unsigned i = 0; i < 64; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 6; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Kernel handed to RunReduce (presumably called once per i in [0, size)); v is the matrix pointer, n and m are never read.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[64], is[64];
+      // Spread the bits of i over the positions allowed by ms[]; bits 0..3 of k stay zero.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5])
+          | (1024 * i & ms[6]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 64 vector pairs: 16 floats at p0 + xss[l] and the next 16 floats right after.
+      for (unsigned l = 0; l < 64; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;  // running index into v; advances by 2 per term
+      // For each l: (rn, in) = sum over n of (rs[n] + i*is[n]) * (v[j] + i*v[j + 1]), v values broadcast to all lanes.
+      for (unsigned l = 0; l < 64; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[l] + i*is[l]) * (rn + i*in); the lane sums are added to re and im in double.
+        __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+    // size = 2^(num_qubits - 10), or 1 when num_qubits <= 10.
+    unsigned k = 10;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-index results are combined with std::plus; matrix is passed through unmodified as v.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate);
+  }
+
+  std::complex<double> ExpectationValue6HHHHHL(const std::vector<unsigned>& qs,
+                                               const fp_type* matrix,
+                                               const State& state) const {
+    uint64_t xs[5];
+    uint64_t ms[6];
+    // Reduces conj(psi) * (matrix * psi); qs[0] is handled within lanes.
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // float stride of qubit qs[1]
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // index bits below qs[1]
+    for (unsigned i = 1; i < 5; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1);
+    // xss[i]: sum of the strides xs[k] for the bits k that are set in i.
+    uint64_t xss[32];
+    for (unsigned i = 0; i < 32; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 5; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];
+    __m512i idx[1];
+    // Scratch buffer from StateSpace::Create, filled below with the matrix.
+    auto s = StateSpace::Create(15);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bit of the low qubit qs[0] (handled inside a register).
+    unsigned qmask = (1 << qs[0]);
+    // idx[i]: lane permutation; MaskedAdd adds i + 1 to the masked lane bits.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Rearrange the matrix into w: 16 real lanes, then 16 imaginary lanes.
+    for (unsigned i = 0; i < 32; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+
+        unsigned l = 2 * (64 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for loop index i; the arguments n and m are not used here.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[64], is[64];
+      // Spread the bits of i, leaving zeros at the strided qubit positions.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 32 (re, im) pairs and store their lane-permuted copies via idx.
+      for (unsigned l = 0; l < 32; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;
+      // Per l: (rn, in) = sum_n w[l][n] * psi[n], then conj(psi[m]) * (rn, in).
+      for (unsigned l = 0; l < 32; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 2 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+
+    unsigned k = 9;  // each index i covers 32 blocks of 16 amplitudes (2^9)
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue6HHHHLL(const std::vector<unsigned>& qs,
+                                               const fp_type* matrix,
+                                               const State& state) const {
+    uint64_t xs[4];
+    uint64_t ms[5];
+    // Reduces conj(psi) * (matrix * psi); qs[0..1] are handled within lanes.
+    xs[0] = uint64_t{1} << (qs[2] + 1);  // float stride of qubit qs[2]
+    ms[0] = (uint64_t{1} << qs[2]) - 1;  // index bits below qs[2]
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+    // xss[i]: sum of the strides xs[k] for the bits k that are set in i.
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];
+    __m512i idx[3];
+    // Scratch buffer from StateSpace::Create, filled below with the matrix.
+    auto s = StateSpace::Create(14);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits of the low qubits (handled inside a register).
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i]: lane permutation; MaskedAdd adds i + 1 to the masked lane bits.
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Rearrange the matrix into w: 16 real lanes, then 16 imaginary lanes.
+    for (unsigned i = 0; i < 16; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+
+        unsigned l = 2 * (64 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for loop index i; the arguments n and m are not used here.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[64], is[64];
+      // Spread the bits of i, leaving zeros at the strided qubit positions.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]) | (256 * i & ms[4]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 16 (re, im) pairs and store their lane-permuted copies via idx.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;
+      // Per l: (rn, in) = sum_n w[l][n] * psi[n], then conj(psi[m]) * (rn, in).
+      for (unsigned l = 0; l < 16; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 4 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+
+    unsigned k = 8;  // each index i covers 16 blocks of 16 amplitudes (2^8)
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue6HHHLLL(const std::vector<unsigned>& qs,
+                                               const fp_type* matrix,
+                                               const State& state) const {
+    uint64_t xs[3];
+    uint64_t ms[4];
+    // Reduces conj(psi) * (matrix * psi); qs[0..2] are handled within lanes.
+    xs[0] = uint64_t{1} << (qs[3] + 1);  // float stride of qubit qs[3]
+    ms[0] = (uint64_t{1} << qs[3]) - 1;  // index bits below qs[3]
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 3] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i]: sum of the strides xs[k] for the bits k that are set in i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];
+    __m512i idx[7];
+    // Scratch buffer from StateSpace::Create, filled below with the matrix.
+    auto s = StateSpace::Create(13);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits of the low qubits (handled inside a register).
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i]: lane permutation; MaskedAdd adds i + 1 to the masked lane bits.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Rearrange the matrix into w: 16 real lanes, then 16 imaginary lanes.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+
+        unsigned l = 2 * (64 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for loop index i; the arguments n and m are not used here.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[64], is[64];
+      // Spread the bits of i, leaving zeros at the strided qubit positions.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2])
+          | (128 * i & ms[3]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 8 (re, im) pairs and store their lane-permuted copies via idx.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;
+      // Per l: (rn, in) = sum_n w[l][n] * psi[n], then conj(psi[m]) * (rn, in).
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 8 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+
+    unsigned k = 7;  // each index i covers 8 blocks of 16 amplitudes (2^7)
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  std::complex<double> ExpectationValue6HHLLLL(const std::vector<unsigned>& qs,
+                                               const fp_type* matrix,
+                                               const State& state) const {
+    uint64_t xs[2];
+    uint64_t ms[3];
+    // Reduces conj(psi) * (matrix * psi); qs[0..3] are handled within lanes.
+    xs[0] = uint64_t{1} << (qs[4] + 1);  // float stride of qubit qs[4]
+    ms[0] = (uint64_t{1} << qs[4]) - 1;  // index bits below qs[4]
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 4] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: sum of the strides xs[k] for the bits k that are set in i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];
+    __m512i idx[15];
+    // Scratch buffer from StateSpace::Create, filled below with the matrix.
+    auto s = StateSpace::Create(12);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits of the low qubits (handled inside a register).
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // idx[i]: lane permutation; MaskedAdd adds i + 1 to the masked lane bits.
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Rearrange the matrix into w: 16 real lanes, then 16 imaginary lanes.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16);
+        }
+
+        unsigned l = 2 * (64 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for loop index i; the arguments n and m are not used here.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, const fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[64], is[64];
+      // Spread the bits of i, leaving zeros at the strided qubit positions.
+      uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]);
+
+      auto p0 = rstate + 2 * k;
+      // Load 4 (re, im) pairs and store their lane-permuted copies via idx.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;
+      // Per l: (rn, in) = sum_n w[l][n] * psi[n], then conj(psi[m]) * (rn, in).
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = 16 * l;
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    const fp_type* rstate = state.get();
+
+    unsigned k = 6;  // each index i covers 4 blocks of 16 amplitudes (2^6)
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate);
+  }
+
+  static unsigned MaskedAdd(unsigned a, unsigned b, unsigned mask,
+                            unsigned lsize) {
+    const unsigned packed = bits::CompressBits(a, 4, mask);  // 4-bit lane id
+    return bits::ExpandBits((packed + b) % lsize, 4, mask);  // wraps at lsize
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_AVX512_H_
diff --git a/lib/statespace_avx512.h b/lib/statespace_avx512.h
new file mode 100644
index 000000000..3b98250be
--- /dev/null
+++ b/lib/statespace_avx512.h
@@ -0,0 +1,423 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_AVX512_H_
+#define STATESPACE_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) {
+  const __m512i mask_v = _mm512_set1_epi64(mask);
+  const __m512i bits_v = _mm512_set1_epi64(bits);
+
+  // Indices i .. i + 7 and i + 8 .. i + 15, each ANDed with the mask.
+  const __m512i lo = _mm512_and_si512(mask_v, _mm512_setr_epi64(
+      i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7));
+  const __m512i hi = _mm512_and_si512(mask_v, _mm512_setr_epi64(
+      i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15));
+
+  // Bit j of the result is set when ((i + j) & mask) == bits.
+  const unsigned lo_eq = _mm512_cmpeq_epu64_mask(lo, bits_v);
+  const unsigned hi_eq = _mm512_cmpeq_epu64_mask(hi, bits_v);
+
+  return lo_eq | (hi_eq << 8);
+}
+
+inline double HorizontalSumAVX512(__m512 s) {
+  return static_cast<double>(_mm512_reduce_add_ps(s));  // float sum, widened
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for AVX512 state-vector
+ * manipulations. State is a vectorized sequence of sixteen real components
+ * followed by sixteen imaginary components. Sixteen single-precision
+ * floating-point numbers can be loaded into an AVX512 register.
+ */
+template <typename For>
+struct StateSpaceAVX512 : public StateSpace<StateSpaceAVX512<For>, For, float> {
+  using Base = StateSpace<StateSpaceAVX512<For>, For, float>;
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));  // floats
+  }
+
+  void InternalToNormalOrder(State& state) const {
+    __m512i idx1 = _mm512_setr_epi32(
+        0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+    __m512i idx2 = _mm512_setr_epi32(
+        8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+    // Interleave each block of 16 re + 16 im into 16 (re, im) pairs, in place.
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m512i idx1, __m512i idx2, fp_type* p) {
+      __m512 v1 = _mm512_load_ps(p + 32 * i);
+      __m512 v2 = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(v1, idx1, v2));
+      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(v1, idx2, v2));
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    __m512i idx1 = _mm512_setr_epi32(
+        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+    __m512i idx2 = _mm512_setr_epi32(
+        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+    // De-interleave 16 (re, im) pairs into 16 re followed by 16 im, in place.
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m512i idx1, __m512i idx2, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(re, idx1, im));
+      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(re, idx2, im));
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
+  }
+
+  void SetAllZeros(State& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, val0);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+    __m512 valu;
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
+    // With fewer than 4 qubits only the first 2^n lanes hold amplitudes.
+    switch (state.num_qubits()) {
+    case 1:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v);
+      break;
+    case 2:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v);
+      break;
+    case 3:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v);
+      break;
+    default:
+      valu = _mm512_set1_ps(v);
+      break;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const __m512& val0, const __m512& valu, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, valu);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, val0, valu, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    return std::complex<fp_type>(state.get()[p], state.get()[p + 16]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = std::real(ampl);
+    state.get()[p + 16] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = re;
+    state.get()[p + 16] = im;
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m512 re_reg = _mm512_set1_ps(re);
+    __m512 im_reg = _mm512_set1_ps(im);
+
+    __mmask16 exclude_n(-exclude);  // all ones when exclude is set (flips ml)
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n,
+                fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      __mmask16 ml =
+          detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n;
+
+      re = _mm512_mask_blend_ps(ml, re, re_n);
+      im = _mm512_mask_blend_ps(ml, im, im_n);
+
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits,
+                   re_reg, im_reg, exclude_n, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise; false on size mismatch.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, fp_type* p2) {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2));
+      _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2));
+    };
+
+    Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m512 r = _mm512_set1_ps(a);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r));
+      _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r));
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+    // Per block: sum of conj(state1) * state2 over 16 amplitudes.
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+      __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2));
+
+      double re = detail::HorizontalSumAVX512(ip_re);
+      double im = detail::HorizontalSumAVX512(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+    // Per block: real part of conj(state1) * state2 over 16 amplitudes.
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+
+      return detail::HorizontalSumAVX512(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 32;
+      const fp_type* p = state.get();
+
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          auto re = p[32 * k + j];
+          auto im = p[32 * k + 16 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+      // NOTE: assumes rs is sorted ascending (see GenerateRandomValues).
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+      // m is bounds-checked first so rs[m] is only read while m < num_samples.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          auto re = p[32 * k + j];
+          auto im = p[32 * k + 16 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {
+            bitstrings.emplace_back(16 * k + j);
+            ++m;
+          }
+        }
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    auto f1 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1,
+                                       Op(), mr.mask, mr.bits, state.get());
+    // NOTE: norm == 0 (no amplitude matches mr) is not handled here.
+    __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm));
+    // Zero the amplitudes that do not match and rescale the ones that do.
+    auto f2 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+
+      re = _mm512_mul_ps(re, renorm);
+      im = _mm512_mul_ps(im, renorm);
+
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f2,
+                   mr.mask, mr.bits, renorm, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 32, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    double csum = 0;
+
+    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m);
+    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m);
+
+    const fp_type* p = state.get();
+
+    for (uint64_t k = k0; k < k1; ++k) {
+      for (uint64_t j = 0; j < 16; ++j) {
+        auto re = p[32 * k + j];
+        auto im = p[32 * k + j + 16];
+        csum += re * re + im * im;
+        if (r < csum) {
+          return (16 * k + j) & mask;
+        }
+      }
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (16 * k1 - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_AVX512_H_
diff --git a/lib/umux.h b/lib/umux.h
index 31de85118..83b951b86 100644
--- a/lib/umux.h
+++ b/lib/umux.h
@@ -15,7 +15,15 @@
 #ifndef UMUX_H_
 #define UMUX_H_
 
-#ifdef __AVX2__
+#ifdef __AVX512F__
+# include "unitary_calculator_avx512.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorAVX512<For>;
+  }
+  }
+#elif __AVX2__
 # include "unitary_calculator_avx.h"
   namespace qsim {
   namespace unitary {
diff --git a/lib/unitary_calculator_avx512.h b/lib/unitary_calculator_avx512.h
new file mode 100644
index 000000000..174f44af1
--- /dev/null
+++ b/lib/unitary_calculator_avx512.h
@@ -0,0 +1,6385 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_AVX512_H_
+#define UNITARY_CALCULATOR_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+
+#include "bits.h"
+#include "unitaryspace_avx512.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with AVX512 vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorAVX512 final {
+ public:
+  using UnitarySpace = UnitarySpaceAVX512<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>  // args initialize the for_ member
+  explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate on 1 to 6 qubits using AVX512 instructions.
+   * @param qs Indices of the qubits affected by this gate (ascending).
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, Unitary& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Kernel suffix: one H per qubit > 3, then one L per qubit <= 3.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 3) {
+        ApplyGate1H(qs, matrix, state);
+      } else {
+        ApplyGate1L(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        ApplyGate2HH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate2HL(qs, matrix, state);
+      } else {
+        ApplyGate2LL(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        ApplyGate3HHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate3HHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate3HLL(qs, matrix, state);
+      } else {
+        ApplyGate3LLL(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        ApplyGate4HHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate4HHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate4HHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGate4HLLL(qs, matrix, state);
+      } else {
+        ApplyGate4LLLL(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        ApplyGate5HHHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate5HHHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate5HHHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGate5HHLLL(qs, matrix, state);
+      } else {  // qs[0..3] <= 3; qs[4] > 3 since only 0..3 are <= 3
+        ApplyGate5HLLLL(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        ApplyGate6HHHHHH(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGate6HHHHHL(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGate6HHHHLL(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGate6HHHLLL(qs, matrix, state);
+      } else {  // qs[0..3] <= 3; qs[4] and qs[5] must then be > 3
+        ApplyGate6HHLLLL(qs, matrix, state);
+      }
+      break;
+    default:  // any other number of qubits leaves state untouched
+      // Not implemented.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate (1 to 4 target qubits) using AVX512.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cmask Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The unitary to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, Unitary& state) const {
+    if (cqs.size() == 0) {  // no controls: apply as a plain gate
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // Kernel name: <targets as in ApplyGate>_<H if cqs[0] > 3, else L>.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {  // presumably cqs is sorted too - TODO confirm
+          ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else if (qs[3] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state);
+        } else {
+          ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state);
+        }
+      }
+      break;
+    default:  // more than 4 targets with controls: state is left untouched
+      // Not implemented.
+      break;
+    }
+  }
+
+  /**
+   * SIMD width used by this calculator.
+   * @return The number of values per SIMD register; always 16 here (the
+   *   kernels of this class load 16 values with each _mm512_load_ps).
+   */
+  static unsigned SIMDRegisterSize() { return 16; }
+
+ private:
+  void ApplyGate1H(const std::vector<unsigned>& qs,  // 1 qubit, qs[0] > 3
+                   const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[1];  // offset (in floats) that sets the gate qubit's bit
+    uint64_t ms[2];  // masks of the column bits below / above qs[0]
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // +1: two floats per column
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: sum of the offsets xs[k] over the bits k that are set in i.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[2], is[2];
+      // i encodes a row r and a 16-column group ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]);  // bit qs[0] = 0
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load real (rs) and imaginary (is) parts for both gate-qubit values.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // walks v as consecutive (real, imaginary) pairs
+      // Output l = sum over n of v(l, n) * input n, as complex products.
+      for (unsigned l = 0; l < 2; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // inputs are kept in rs/is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // one call covers 2^5 columns: 16 lanes x 2 values
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // column groups per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate);
+  }
+
+  void ApplyGate1L(const std::vector<unsigned>& qs,  // 1 qubit, qs[0] <= 3
+                   const fp_type* matrix, Unitary& state) const {
+    unsigned p[16];  // scratch, one entry per SIMD lane
+    __m512i idx[1];  // lane permutations for _mm512_permutexvar_ps
+
+    auto s = UnitarySpace::Create(5);  // its storage backs w below
+    __m512* w = (__m512*) s.get();  // per-lane matrix coefficients
+    fp_type* wf = (fp_type*) w;  // scalar view of w, used to fill it
+
+    unsigned qmask = (1 << qs[0]);  // lane-index bit of the gate qubit
+    // idx[i]: lane permutation built via MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes lane 15 first, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: w[l] gets per-lane real parts, w[l + 1] the imaginary parts.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+
+        unsigned l = 2 * (2 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[2], is[2];
+      // i encodes a row r and a 16-column block ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      auto p0 = rstate + row_size * r + 32 * ii;
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[2 * l] = _mm512_load_ps(p0 + 16);  // 16 imaginary parts
+        // The other inputs are lane-permuted copies of the loaded block.
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+      // Accumulate the complex products input n * w pair into (rn, in).
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;  // one call covers 2^4 = 16 columns (one block)
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // blocks per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate2HH(const std::vector<unsigned>& qs,  // 2 qubits, both > 3
+                    const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[2];  // offsets (in floats) that set each gate qubit's bit
+    uint64_t ms[3];  // masks of the column bits between the gate qubits
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // +1: two floats per column
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: sum of the offsets xs[k] over the bits k that are set in i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[4], is[4];
+      // i encodes a row r and a 16-column group ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]);
+      // c is the group's first column, with the qs[0] and qs[1] bits zero.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load real (rs) and imaginary (is) parts for all 4 gate-qubit values.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // walks v as consecutive (real, imaginary) pairs
+      // Output l = sum over n of v(l, n) * input n, as complex products.
+      for (unsigned l = 0; l < 4; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // inputs are kept in rs/is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;  // one call covers 2^6 columns: 16 lanes x 4 values
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // column groups per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate);
+  }
+
+  void ApplyGate2HL(const std::vector<unsigned>& qs,  // qs[0] <= 3 < qs[1]
+                    const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[1];  // offset (in floats) that sets the qs[1] bit
+    uint64_t ms[2];  // masks of the column bits below / above qs[1]
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // +1: two floats per column
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: sum of the offsets xs[k] over the bits k that are set in i.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch, one entry per SIMD lane
+    __m512i idx[1];  // lane permutations for _mm512_permutexvar_ps
+
+    auto s = UnitarySpace::Create(7);  // its storage backs w below
+    __m512* w = (__m512*) s.get();  // per-lane matrix coefficients
+    fp_type* wf = (fp_type*) w;  // scalar view of w, used to fill it
+
+    unsigned qmask = (1 << qs[0]);  // lane-index bit of the low qubit
+    // idx[i]: lane permutation built via MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes lane 15 first, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: w[l] gets per-lane real parts, w[l + 1] the imaginary parts.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+
+        unsigned l = 2 * (4 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[4], is[4];
+      // i encodes a row r and a 16-column group ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]);  // bit qs[1] = 0
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load the block for each qs[1] value, then its lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+      // Block l of the result: complex sum over the 4 inputs times w.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // inputs are kept in rs/is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // one call covers 2^5 columns: 16 lanes x 2 blocks
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // column groups per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate2LL(const std::vector<unsigned>& qs,  // 2 qubits, both <= 3
+                    const fp_type* matrix, Unitary& state) const {
+    unsigned p[16];  // scratch, one entry per SIMD lane
+    __m512i idx[3];  // lane permutations for _mm512_permutexvar_ps
+
+    auto s = UnitarySpace::Create(6);  // its storage backs w below
+    __m512* w = (__m512*) s.get();  // per-lane matrix coefficients
+    fp_type* wf = (fp_type*) w;  // scalar view of w, used to fill it
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // gate-qubit lane bits
+    // idx[i]: lane permutation built via MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes lane 15 first, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: w[l] gets per-lane real parts, w[l + 1] the imaginary parts.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+
+        unsigned l = 2 * (4 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[4], is[4];
+      // i encodes a row r and a 16-column block ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      auto p0 = rstate + row_size * r + 32 * ii;
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);  // 16 real parts
+        is[4 * l] = _mm512_load_ps(p0 + 16);  // 16 imaginary parts
+        // The other inputs are lane-permuted copies of the loaded block.
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+      // Accumulate the complex products input n * w pair into (rn, in).
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;  // one call covers 2^4 = 16 columns (one block)
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // blocks per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate3HHH(const std::vector<unsigned>& qs,  // 3 qubits, all > 3
+                     const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[3];  // offsets (in floats) that set each gate qubit's bit
+    uint64_t ms[4];  // masks of the column bits between the gate qubits
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // +1: two floats per column
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i]: sum of the offsets xs[k] over the bits k that are set in i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[8], is[8];
+      // i encodes a row r and a 16-column group ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]);
+      // c is the group's first column, with all three gate-qubit bits zero.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load real (rs) and imaginary (is) parts for all 8 gate-qubit values.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // walks v as consecutive (real, imaginary) pairs
+      // Output l = sum over n of v(l, n) * input n, as complex products.
+      for (unsigned l = 0; l < 8; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // inputs are kept in rs/is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7;  // one call covers 2^7 columns: 16 lanes x 8 values
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // column groups per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate);
+  }
+
+  void ApplyGate3HHL(const std::vector<unsigned>& qs,  // qs[0] <= 3 < qs[1]
+                     const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[2];  // offsets (in floats) that set the qs[1], qs[2] bits
+    uint64_t ms[3];  // masks of the column bits around qs[1] and qs[2]
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // +1: two floats per column
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: sum of the offsets xs[k] over the bits k that are set in i.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch, one entry per SIMD lane
+    __m512i idx[1];  // lane permutations for _mm512_permutexvar_ps
+
+    auto s = UnitarySpace::Create(9);  // its storage backs w below
+    __m512* w = (__m512*) s.get();  // per-lane matrix coefficients
+    fp_type* wf = (fp_type*) w;  // scalar view of w, used to fill it
+
+    unsigned qmask = (1 << qs[0]);  // lane-index bit of the low qubit
+    // idx[i]: lane permutation built via MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes lane 15 first, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: w[l] gets per-lane real parts, w[l + 1] the imaginary parts.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // i encodes a row r and a 16-column group ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]);
+      // c is the group's first column, with the qs[1] and qs[2] bits zero.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load the block for each (qs[1], qs[2]) value, then permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+      // Block l of the result: complex sum over the 8 inputs times w.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // inputs are kept in rs/is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;  // one call covers 2^6 columns: 16 lanes x 4 blocks
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // column groups per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate3HLL(const std::vector<unsigned>& qs,  // qs[1] <= 3 < qs[2]
+                     const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[1];  // offset (in floats) that sets the qs[2] bit
+    uint64_t ms[2];  // masks of the column bits below / above qs[2]
+
+    xs[0] = uint64_t{1} << (qs[2] + 1);  // +1: two floats per column
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: sum of the offsets xs[k] over the bits k that are set in i.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned p[16];  // scratch, one entry per SIMD lane
+    __m512i idx[3];  // lane permutations for _mm512_permutexvar_ps
+
+    auto s = UnitarySpace::Create(8);  // its storage backs w below
+    __m512* w = (__m512*) s.get();  // per-lane matrix coefficients
+    fp_type* wf = (fp_type*) w;  // scalar view of w, used to fill it
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // low-qubit lane bits
+    // idx[i]: lane permutation built via MaskedAdd (defined elsewhere).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes lane 15 first, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Fill w: w[l] gets per-lane real parts, w[l + 1] the imaginary parts.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+
+        unsigned l = 2 * (8 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // i encodes a row r and a 16-column group ii within that row.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]);  // bit qs[2] = 0
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load the block for each qs[2] value, then its lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+      // Block l of the result: complex sum over the 8 inputs times w.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // inputs are kept in rs/is
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;  // one call covers 2^5 columns: 16 lanes x 2 blocks
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // column groups per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 3-qubit gate to `state` in place for the case where all three
+  // target qubits are "low": they are handled by permuting lanes inside one
+  // 16-amplitude AVX-512 register pair instead of by address strides.
+  //   qs     - the three target qubits (assumes each is below 4 so that it
+  //            addresses lanes -- TODO confirm against the caller).
+  //   matrix - gate matrix, read as a row-major 8x8 table of interleaved
+  //            (real, imaginary) values.
+  //   state  - unitary updated in place, one 16-amplitude chunk at a time.
+  void ApplyGate3LLL(const std::vector<unsigned>& qs,
+                     const fp_type* matrix, Unitary& state) const {
+    unsigned p[16];
+    __m512i idx[7];
+
+    // Lane-expanded gate matrix: 8 terms x (real, imaginary) = 16 vectors.
+    // Kept in a local array (64-byte aligned by its type) instead of a
+    // heap-allocated scratch Unitary.
+    __m512 w[16];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+
+    // idx[i] is the lane permutation used for term i + 1: the gate bits of
+    // the lane number come from MaskedAdd, all other bits are kept.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    // Term m, lane j takes matrix entry (k, (k + m) % 8), where k is the
+    // value bits::CompressBits derives from lane j and qmask.
+    for (unsigned m = 0; m < 8; ++m) {
+      for (unsigned j = 0; j < 16; ++j) {
+        unsigned k = bits::CompressBits(j, 4, qmask);
+        p[j] = 2 * (8 * k + (k + m) % 8);
+      }
+
+      for (unsigned j = 0; j < 16; ++j) {
+        wf[32 * m + j] = matrix[p[j]];           // real parts: w[2 * m]
+        wf[32 * m + j + 16] = matrix[p[j] + 1];  // imaginary: w[2 * m + 1]
+      }
+    }
+
+    // Kernel for work item i = (row r, chunk ii); n and m are not used.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      auto p0 = rstate + row_size * r + 32 * ii;
+
+      // A chunk is 16 real parts followed by 16 imaginary parts.
+      rs[0] = _mm512_load_ps(p0);
+      is[0] = _mm512_load_ps(p0 + 16);
+
+      for (unsigned t = 1; t < 8; ++t) {
+        rs[t] = _mm512_permutexvar_ps(idx[t - 1], rs[0]);
+        is[t] = _mm512_permutexvar_ps(idx[t - 1], is[0]);
+      }
+
+      // Complex multiply-accumulate over the 8 terms.
+      rn = _mm512_mul_ps(rs[0], w[0]);
+      in = _mm512_mul_ps(rs[0], w[1]);
+      rn = _mm512_fnmadd_ps(is[0], w[1], rn);
+      in = _mm512_fmadd_ps(is[0], w[0], in);
+
+      for (unsigned t = 1; t < 8; ++t) {
+        rn = _mm512_fmadd_ps(rs[t], w[2 * t], rn);
+        in = _mm512_fmadd_ps(rs[t], w[2 * t + 1], in);
+        rn = _mm512_fnmadd_ps(is[t], w[2 * t + 1], rn);
+        in = _mm512_fmadd_ps(is[t], w[2 * t], in);
+      }
+
+      _mm512_store_ps(p0, rn);
+      _mm512_store_ps(p0 + 16, in);
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 4-qubit gate to `state` in place for the case where all four
+  // target qubits are "high": each is reached through an address stride
+  // between whole AVX-512 registers, so no lane permutation is involved.
+  //   qs     - the four target qubits (assumed ascending -- TODO confirm).
+  //   matrix - row-major 16x16 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate4HHHH(const std::vector<unsigned>& qs,
+                      const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[4];  // xs[i]: float offset selected by target qubit i
+    uint64_t ms[5];  // ms[i]: index bits lying between consecutive targets
+
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t below = i == 0 ? 0 : xs[i - 1] - 1;
+      xs[i] = uint64_t{1} << (qs[i] + 1);
+      ms[i] = ((uint64_t{1} << qs[i]) - 1) ^ below;
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      xss[i] = 0;
+      for (unsigned b = 0; b < 4; ++b) {
+        xss[i] += ((i >> b) & 1) * xs[b];
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 rs[16], is[16];
+
+      // Work item i = (row, column group); the group index is spread around
+      // the target-qubit bit positions to get the base column c.
+      uint64_t row = i / size;
+      uint64_t ii = i % size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]) | (256 * ii & ms[4]);
+
+      fp_type* p0 = rstate + row_size * row + 2 * c;
+
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      // Output l is the complex dot product of 16 consecutive matrix entries
+      // with the loaded inputs; v walks the matrix in storage order.
+      for (unsigned l = 0; l < 16; ++l) {
+        __m512 ru = _mm512_set1_ps(v[0]);
+        __m512 iu = _mm512_set1_ps(v[1]);
+        __m512 rn = _mm512_mul_ps(rs[0], ru);
+        __m512 in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+        v += 2;
+        for (unsigned t = 1; t < 16; ++t, v += 2) {
+          ru = _mm512_set1_ps(v[0]);
+          iu = _mm512_set1_ps(v[1]);
+          rn = _mm512_fmadd_ps(rs[t], ru, rn);
+          in = _mm512_fmadd_ps(rs[t], iu, in);
+          rn = _mm512_fnmadd_ps(is[t], iu, rn);
+          in = _mm512_fmadd_ps(is[t], ru, in);
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 8;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate);
+  }
+
+  // Applies a 4-qubit gate to `state` in place. qs[0] is a "low" target,
+  // handled by permuting lanes inside an AVX-512 register; qs[1..3] are
+  // "high" targets, handled by address strides between registers.
+  //   qs     - target qubits (assumes qs[0] < 4 <= qs[1] < qs[2] < qs[3];
+  //            TODO confirm against the dispatching caller).
+  //   matrix - row-major 16x16 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate4HHHL(const std::vector<unsigned>& qs,
+                      const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[3];  // xs[i]: float offset selected by high target i
+    uint64_t ms[4];  // ms[i]: index bits lying between consecutive targets
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      xss[i] = 0;
+      for (unsigned b = 0; b < 3; ++b) {
+        xss[i] += ((i >> b) & 1) * xs[b];
+      }
+    }
+
+    unsigned p[16];
+    __m512i idx[1];
+
+    // Lane-expanded gate matrix: 8 outputs x 16 terms x (real, imaginary) =
+    // 256 vectors (16 KiB). Kept in a local array (64-byte aligned by its
+    // type) instead of a heap-allocated scratch Unitary.
+    __m512 w[256];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+
+    // idx[0]: lane permutation for the second term of each register; the
+    // gate bit of the lane number comes from MaskedAdd, other bits are kept.
+    for (unsigned j = 0; j < 16; ++j) {
+      p[j] = MaskedAdd(j, 1, qmask, 2) | (j & (-1 ^ qmask));
+    }
+    idx[0] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                              p[9], p[8], p[7], p[6], p[5], p[4],
+                              p[3], p[2], p[1], p[0]);
+
+    // w[l] / w[l + 1] with l = 2 * (16 * i + m): real / imaginary factors
+    // applied to term m when producing output register i.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        unsigned l = 2 * (16 * i + m);
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    // Kernel for work item i = (row r, column group ii); n, m are unused.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]);
+
+      auto p0 = rstate + row_size * r + 2 * c;
+
+      // Even slots hold loaded registers, odd slots their permuted copies.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        rs[2 * l + 1] = _mm512_permutexvar_ps(idx[0], rs[2 * l]);
+        is[2 * l + 1] = _mm512_permutexvar_ps(idx[0], is[2 * l]);
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate of the 16 terms for each output register.
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+        j += 2;
+
+        for (unsigned t = 1; t < 16; ++t) {
+          rn = _mm512_fmadd_ps(rs[t], w[j], rn);
+          in = _mm512_fmadd_ps(rs[t], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[t], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[t], w[j], in);
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 4-qubit gate to `state` in place. qs[0..1] are "low" targets,
+  // handled by permuting lanes inside an AVX-512 register; qs[2..3] are
+  // "high" targets, handled by address strides between registers.
+  //   qs     - target qubits (assumes qs[0] < qs[1] < 4 <= qs[2] < qs[3];
+  //            TODO confirm against the dispatching caller).
+  //   matrix - row-major 16x16 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate4HHLL(const std::vector<unsigned>& qs,
+                      const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[2];  // xs[i]: float offset selected by high target i
+    uint64_t ms[3];  // ms[i]: index bits lying between consecutive targets
+
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    xs[1] = uint64_t{1} << (qs[3] + 1);
+    ms[1] = ((uint64_t{1} << qs[3]) - 1) ^ (xs[0] - 1);
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[4] = {0, xs[0], xs[1], xs[0] + xs[1]};
+
+    unsigned p[16];
+    __m512i idx[3];
+
+    // Lane-expanded gate matrix: 4 outputs x 16 terms x (real, imaginary) =
+    // 128 vectors (8 KiB). Kept in a local array (64-byte aligned by its
+    // type) instead of a heap-allocated scratch Unitary.
+    __m512 w[128];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+
+    // idx[i]: lane permutation for permuted copy i + 1 of each register; the
+    // gate bits of the lane number come from MaskedAdd, other bits are kept.
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    // w[l] / w[l + 1] with l = 2 * (16 * i + m): real / imaginary factors
+    // applied to term m when producing output register i.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+
+        unsigned l = 2 * (16 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    // Kernel for work item i = (row r, column group ii); n, m are unused.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]);
+
+      auto p0 = rstate + row_size * r + 2 * c;
+
+      // Slot 4 * l: loaded register; slots 4 * l + 1..3: its permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned t = 1; t < 4; ++t) {
+          rs[4 * l + t] = _mm512_permutexvar_ps(idx[t - 1], rs[4 * l]);
+          is[4 * l + t] = _mm512_permutexvar_ps(idx[t - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate of the 16 terms for each output register.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+        j += 2;
+
+        for (unsigned t = 1; t < 16; ++t) {
+          rn = _mm512_fmadd_ps(rs[t], w[j], rn);
+          in = _mm512_fmadd_ps(rs[t], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[t], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[t], w[j], in);
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 4-qubit gate to `state` in place. qs[0..2] are "low" targets,
+  // handled by permuting lanes inside an AVX-512 register; qs[3] is a
+  // "high" target, handled by an address stride between registers.
+  //   qs     - target qubits (assumes qs[0] < qs[1] < qs[2] < 4 <= qs[3];
+  //            TODO confirm against the dispatching caller).
+  //   matrix - row-major 16x16 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate4HLLL(const std::vector<unsigned>& qs,
+                      const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[1];  // xs[0]: float offset selected by the high target
+    uint64_t ms[2];  // ms[i]: index bits below / above the high target
+
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+
+    // xss[i]: float offset of the i-th register that is loaded and stored.
+    uint64_t xss[2] = {0, xs[0]};
+
+    unsigned p[16];
+    __m512i idx[7];
+
+    // Lane-expanded gate matrix: 2 outputs x 16 terms x (real, imaginary) =
+    // 64 vectors (4 KiB). Kept in a local array (64-byte aligned by its
+    // type) instead of a heap-allocated scratch Unitary.
+    __m512 w[64];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+
+    // idx[i]: lane permutation for permuted copy i + 1 of each register; the
+    // gate bits of the lane number come from MaskedAdd, other bits are kept.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    // w[l] / w[l + 1] with l = 2 * (16 * i + m): real / imaginary factors
+    // applied to term m when producing output register i.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+        unsigned l = 2 * (16 * i + m);
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    // Kernel for work item i = (row r, column group ii); n, m are unused.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]);
+
+      auto p0 = rstate + row_size * r + 2 * c;
+
+      // Slot 8 * l: loaded register; slots 8 * l + 1..7: its permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned t = 1; t < 8; ++t) {
+          rs[8 * l + t] = _mm512_permutexvar_ps(idx[t - 1], rs[8 * l]);
+          is[8 * l + t] = _mm512_permutexvar_ps(idx[t - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate of the 16 terms for each output register.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+        j += 2;
+
+        for (unsigned t = 1; t < 16; ++t) {
+          rn = _mm512_fmadd_ps(rs[t], w[j], rn);
+          in = _mm512_fmadd_ps(rs[t], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[t], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[t], w[j], in);
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 4-qubit gate to `state` in place for the case where all four
+  // target qubits are "low": they are handled by permuting lanes inside one
+  // 16-amplitude AVX-512 register pair instead of by address strides.
+  //   qs     - the four target qubits (assumes each is below 4 so that it
+  //            addresses lanes -- TODO confirm against the caller).
+  //   matrix - gate matrix, read as a row-major 16x16 table of interleaved
+  //            (real, imaginary) values.
+  //   state  - unitary updated in place, one 16-amplitude chunk at a time.
+  void ApplyGate4LLLL(const std::vector<unsigned>& qs,
+                      const fp_type* matrix, Unitary& state) const {
+    unsigned p[16];
+    __m512i idx[15];
+
+    // Lane-expanded gate matrix: 16 terms x (real, imaginary) = 32 vectors.
+    // Kept in a local array (64-byte aligned by its type) instead of a
+    // heap-allocated scratch Unitary.
+    __m512 w[32];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+
+    // idx[i] is the lane permutation used for term i + 1: the gate bits of
+    // the lane number come from MaskedAdd, all other bits are kept.
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    // Term m, lane j takes matrix entry (k, (k + m) % 16), where k is the
+    // value bits::CompressBits derives from lane j and qmask.
+    for (unsigned m = 0; m < 16; ++m) {
+      for (unsigned j = 0; j < 16; ++j) {
+        unsigned k = bits::CompressBits(j, 4, qmask);
+        p[j] = 2 * (16 * k + (k + m) % 16);
+      }
+
+      for (unsigned j = 0; j < 16; ++j) {
+        wf[32 * m + j] = matrix[p[j]];           // real parts: w[2 * m]
+        wf[32 * m + j + 16] = matrix[p[j] + 1];  // imaginary: w[2 * m + 1]
+      }
+    }
+
+    // Kernel for work item i = (row r, chunk ii); n and m are not used.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[16], is[16];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      auto p0 = rstate + row_size * r + 32 * ii;
+
+      // A chunk is 16 real parts followed by 16 imaginary parts.
+      rs[0] = _mm512_load_ps(p0);
+      is[0] = _mm512_load_ps(p0 + 16);
+
+      for (unsigned t = 1; t < 16; ++t) {
+        rs[t] = _mm512_permutexvar_ps(idx[t - 1], rs[0]);
+        is[t] = _mm512_permutexvar_ps(idx[t - 1], is[0]);
+      }
+
+      // Complex multiply-accumulate over the 16 terms.
+      rn = _mm512_mul_ps(rs[0], w[0]);
+      in = _mm512_mul_ps(rs[0], w[1]);
+      rn = _mm512_fnmadd_ps(is[0], w[1], rn);
+      in = _mm512_fmadd_ps(is[0], w[0], in);
+
+      for (unsigned t = 1; t < 16; ++t) {
+        rn = _mm512_fmadd_ps(rs[t], w[2 * t], rn);
+        in = _mm512_fmadd_ps(rs[t], w[2 * t + 1], in);
+        rn = _mm512_fnmadd_ps(is[t], w[2 * t + 1], rn);
+        in = _mm512_fmadd_ps(is[t], w[2 * t], in);
+      }
+
+      _mm512_store_ps(p0, rn);
+      _mm512_store_ps(p0 + 16, in);
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 5-qubit gate to `state` in place for the case where all five
+  // target qubits are "high": each is reached through an address stride
+  // between whole AVX-512 registers, so no lane permutation is involved.
+  //   qs     - the five target qubits (assumed ascending -- TODO confirm).
+  //   matrix - row-major 32x32 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate5HHHHH(const std::vector<unsigned>& qs,
+                       const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[5];  // xs[i]: float offset selected by target qubit i
+    uint64_t ms[6];  // ms[i]: index bits lying between consecutive targets
+
+    for (unsigned i = 0; i < 5; ++i) {
+      uint64_t below = i == 0 ? 0 : xs[i - 1] - 1;
+      xs[i] = uint64_t{1} << (qs[i] + 1);
+      ms[i] = ((uint64_t{1} << qs[i]) - 1) ^ below;
+    }
+    ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[32];
+    for (unsigned i = 0; i < 32; ++i) {
+      xss[i] = 0;
+      for (unsigned b = 0; b < 5; ++b) {
+        xss[i] += ((i >> b) & 1) * xs[b];
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 rs[32], is[32];
+
+      // Work item i = (row, column group); the group index is spread around
+      // the target-qubit bit positions to get the base column c.
+      uint64_t row = i / size;
+      uint64_t ii = i % size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]);
+
+      fp_type* p0 = rstate + row_size * row + 2 * c;
+
+      for (unsigned l = 0; l < 32; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      // Output l is the complex dot product of 32 consecutive matrix entries
+      // with the loaded inputs; v walks the matrix in storage order.
+      for (unsigned l = 0; l < 32; ++l) {
+        __m512 ru = _mm512_set1_ps(v[0]);
+        __m512 iu = _mm512_set1_ps(v[1]);
+        __m512 rn = _mm512_mul_ps(rs[0], ru);
+        __m512 in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+        v += 2;
+        for (unsigned t = 1; t < 32; ++t, v += 2) {
+          ru = _mm512_set1_ps(v[0]);
+          iu = _mm512_set1_ps(v[1]);
+          rn = _mm512_fmadd_ps(rs[t], ru, rn);
+          in = _mm512_fmadd_ps(rs[t], iu, in);
+          rn = _mm512_fnmadd_ps(is[t], iu, rn);
+          in = _mm512_fmadd_ps(is[t], ru, in);
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 9;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate);
+  }
+
+  // Applies a 5-qubit gate to `state` in place. qs[0] is a "low" target,
+  // handled by permuting lanes inside an AVX-512 register; qs[1..4] are
+  // "high" targets, handled by address strides between registers.
+  //   qs     - target qubits (assumes qs[0] < 4 <= qs[1] < ... < qs[4];
+  //            TODO confirm against the dispatching caller).
+  //   matrix - row-major 32x32 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate5HHHHL(const std::vector<unsigned>& qs,
+                       const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[4];  // xs[i]: float offset selected by high target i
+    uint64_t ms[5];  // ms[i]: index bits lying between consecutive targets
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      xss[i] = 0;
+      for (unsigned b = 0; b < 4; ++b) {
+        xss[i] += ((i >> b) & 1) * xs[b];
+      }
+    }
+
+    unsigned p[16];
+    __m512i idx[1];
+
+    // Lane-expanded gate matrix: 16 outputs x 32 terms x (real, imaginary) =
+    // 1024 vectors (64 KiB). Kept in a local array (64-byte aligned by its
+    // type) instead of a heap-allocated scratch Unitary.
+    __m512 w[1024];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+
+    // idx[0]: lane permutation for the second term of each register; the
+    // gate bit of the lane number comes from MaskedAdd, other bits are kept.
+    for (unsigned j = 0; j < 16; ++j) {
+      p[j] = MaskedAdd(j, 1, qmask, 2) | (j & (-1 ^ qmask));
+    }
+    idx[0] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                              p[9], p[8], p[7], p[6], p[5], p[4],
+                              p[3], p[2], p[1], p[0]);
+
+    // w[l] / w[l + 1] with l = 2 * (32 * i + m): real / imaginary factors
+    // applied to term m when producing output register i.
+    for (unsigned i = 0; i < 16; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2);
+        }
+        unsigned l = 2 * (32 * i + m);
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    // Kernel for work item i = (row r, column group ii); n, m are unused.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]) | (256 * ii & ms[4]);
+
+      auto p0 = rstate + row_size * r + 2 * c;
+
+      // Even slots hold loaded registers, odd slots their permuted copies.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        rs[2 * l + 1] = _mm512_permutexvar_ps(idx[0], rs[2 * l]);
+        is[2 * l + 1] = _mm512_permutexvar_ps(idx[0], is[2 * l]);
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate of the 32 terms for each output register.
+      for (unsigned l = 0; l < 16; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+        j += 2;
+
+        for (unsigned t = 1; t < 32; ++t) {
+          rn = _mm512_fmadd_ps(rs[t], w[j], rn);
+          in = _mm512_fmadd_ps(rs[t], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[t], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[t], w[j], in);
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 8;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 5-qubit gate to `state` in place. qs[0..1] are "low" targets,
+  // handled by permuting lanes inside an AVX-512 register; qs[2..4] are
+  // "high" targets, handled by address strides between registers.
+  //   qs     - target qubits (assumes qs[0] < qs[1] < 4 <= qs[2] < qs[3] <
+  //            qs[4]; TODO confirm against the dispatching caller).
+  //   matrix - row-major 32x32 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate5HHHLL(const std::vector<unsigned>& qs,
+                       const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[3];  // xs[i]: float offset selected by high target i
+    uint64_t ms[4];  // ms[i]: index bits lying between consecutive targets
+
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      xss[i] = 0;
+      for (unsigned b = 0; b < 3; ++b) {
+        xss[i] += ((i >> b) & 1) * xs[b];
+      }
+    }
+
+    unsigned p[16];
+    __m512i idx[3];
+
+    // Lane-expanded gate matrix (8 outputs x 32 terms x 2 = 512 vectors, 32
+    // KiB), kept in a local, 64-byte aligned array rather than on the heap.
+    __m512 w[512];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+
+    // idx[i]: lane permutation for permuted copy i + 1 of each register; the
+    // gate bits of the lane number come from MaskedAdd, other bits are kept.
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    // w[l] / w[l + 1] with l = 2 * (32 * i + m): real / imaginary factors
+    // applied to term m when producing output register i.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4);
+        }
+        unsigned l = 2 * (32 * i + m);
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    // Kernel for work item i = (row r, column group ii); n, m are unused.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]);
+      auto p0 = rstate + row_size * r + 2 * c;
+
+      // Slot 4 * l: loaded register; slots 4 * l + 1..3: its permuted copies.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+        for (unsigned t = 1; t < 4; ++t) {
+          rs[4 * l + t] = _mm512_permutexvar_ps(idx[t - 1], rs[4 * l]);
+          is[4 * l + t] = _mm512_permutexvar_ps(idx[t - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate of the 32 terms for each output register.
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+        j += 2;
+        for (unsigned t = 1; t < 32; ++t) {
+          rn = _mm512_fmadd_ps(rs[t], w[j], rn);
+          in = _mm512_fmadd_ps(rs[t], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[t], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[t], w[j], in);
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    unsigned k = 7;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  // Applies a 5-qubit gate to `state` in place. qs[0..2] are "low" targets,
+  // handled by permuting lanes inside an AVX-512 register; qs[3..4] are
+  // "high" targets, handled by address strides between registers.
+  //   qs     - target qubits (assumes qs[0] < qs[1] < qs[2] < 4 <= qs[3] <
+  //            qs[4]; TODO confirm against the dispatching caller).
+  //   matrix - row-major 32x32 gate matrix of interleaved (real, imaginary).
+  //   state  - unitary updated in place.
+  void ApplyGate5HHLLL(const std::vector<unsigned>& qs,
+                       const fp_type* matrix, Unitary& state) const {
+    uint64_t xs[2];  // xs[i]: float offset selected by high target i
+    uint64_t ms[3];  // ms[i]: index bits lying between consecutive targets
+
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    xs[1] = uint64_t{1} << (qs[4] + 1);
+    ms[1] = ((uint64_t{1} << qs[4]) - 1) ^ (xs[0] - 1);
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+
+    // xss[i]: sum of the strides whose bit is set in i.
+    uint64_t xss[4] = {0, xs[0], xs[1], xs[0] + xs[1]};
+
+    unsigned p[16];
+    __m512i idx[7];
+
+    // Lane-expanded gate matrix: 4 outputs x 32 terms x (real, imaginary) =
+    // 256 vectors (16 KiB). Kept in a local array (64-byte aligned by its
+    // type) instead of a heap-allocated scratch Unitary.
+    __m512 w[256];
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+
+    // idx[i]: lane permutation for permuted copy i + 1 of each register; the
+    // gate bits of the lane number come from MaskedAdd, other bits are kept.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    // w[l] / w[l + 1] with l = 2 * (32 * i + m): real / imaginary factors
+    // applied to term m when producing output register i.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8);
+        }
+
+        unsigned l = 2 * (32 * i + m);
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+
+    // Kernel for work item i = (row r, column group ii); n, m are unused.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[32], is[32];
+
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]);
+
+      auto p0 = rstate + row_size * r + 2 * c;
+
+      // Slot 8 * l: loaded register; slots 8 * l + 1..7: its permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned t = 1; t < 8; ++t) {
+          rs[8 * l + t] = _mm512_permutexvar_ps(idx[t - 1], rs[8 * l]);
+          is[8 * l + t] = _mm512_permutexvar_ps(idx[t - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Complex multiply-accumulate of the 32 terms for each output register.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+        j += 2;
+
+        for (unsigned t = 1; t < 32; ++t) {
+          rn = _mm512_fmadd_ps(rs[t], w[j], rn);
+          in = _mm512_fmadd_ps(rs[t], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[t], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[t], w[j], in);
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate5HLLLL(const std::vector<unsigned>& qs,  // 5-qubit gate: qs[0..3] act on register lanes, qs[4] on a memory stride.
+                       const fp_type* matrix, Unitary& state) const {  // matrix: interleaved (re, im) floats; state is updated in place.
+    uint64_t xs[1];  // float offset selected by the strided qubit qs[4]
+    uint64_t ms[2];  // bit masks below / above the strided qubit position
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[4] + 1);
+    ms[0] = (uint64_t{1} << qs[4]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: float offset for strided-qubit value i (0 or xs[0]).
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Scratch for the lane permutations (p, idx) and the lane-expanded matrix (w).
+    unsigned p[16];
+    __m512i idx[15];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 2 * 32 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(10);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits occupied by the in-register qubits.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // idx[i]: lane permutation; MaskedAdd presumably adds i + 1 to the qmask bits of lane j (mod 16) - confirm against its definition.
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));  // bits of j outside qmask are kept unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // arguments run from lane 15 down to lane 0
+    }
+    // Expand the matrix so that every lane holds the element it multiplies: i = strided-qubit value, m = input slot.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 32; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the qmask bits of lane j packed together - confirm
+          p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16);  // float index of element (row 16 * i + k, column 16 * (m / 16) + (k + m) % 16)
+        }
+
+        unsigned l = 2 * (32 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of one output register
+      __m512 rs[32], is[32];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]);  // 16 * ii spread around the strided qubit bit, which stays 0
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // One register pair per strided-qubit value, followed by its 15 lane-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // Each output register pair is the complex sum over all 32 inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 32; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 1 strided qubit: one work item covers 2^k columns of a row.
+    unsigned k = 5;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate6HHHHHH(const std::vector<unsigned>& qs,  // 6-qubit gate where all six qubits qs[0..5] are addressed through memory strides.
+                        const fp_type* matrix, Unitary& state) const {  // matrix: interleaved (re, im) floats, 64 x 64 elements; state is updated in place.
+    uint64_t xs[6];  // float offset selected by each target qubit
+    uint64_t ms[7];  // bit masks used to skip the target qubit positions in the column index
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 6; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // bits between two consecutive target qubits (assumes qs ascending - confirm)
+    }
+    ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1);
+    // xss[i]: combined float offset for every 0/1 assignment i of the six target qubits.
+    uint64_t xss[64];
+    for (unsigned i = 0; i < 64; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 6; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // ru / iu: one matrix element broadcast to all lanes; rn / in: output accumulators
+      __m512 rs[64], is[64];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5])
+          | (1024 * ii & ms[6]);  // 16 * ii spread around the target qubit bits, which stay 0
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load one register pair per assignment of the target qubits.
+      for (unsigned l = 0; l < 64; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // running float index into v: element (l, n) is read at 2 * (64 * l + n)
+      // Output l is the complex sum over n of v(l, n) * input n.
+      for (unsigned l = 0; l < 64; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);  // re = rs * ru - is * iu
+        in = _mm512_mul_ps(rs[0], iu);  // im = rs * iu + is * ru
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 6 target qubits: one work item covers 2^k columns of a row.
+    unsigned k = 10;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate);
+  }
+
+  void ApplyGate6HHHHHL(const std::vector<unsigned>& qs,  // 6-qubit gate: qs[0] acts on register lanes, qs[1..5] on memory strides.
+                        const fp_type* matrix, Unitary& state) const {  // matrix: interleaved (re, im) floats; state is updated in place.
+    uint64_t xs[5];  // float offset selected by each strided qubit (qs[1..5])
+    uint64_t ms[6];  // bit masks used to skip the strided qubit positions in the column index
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 5; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);  // bits between two consecutive strided qubits (assumes qs ascending - confirm)
+    }
+    ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1);
+    // xss[i]: combined float offset for every 0/1 assignment i of the strided qubits.
+    uint64_t xss[32];
+    for (unsigned i = 0; i < 32; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 5; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Scratch for the lane permutation (p, idx) and the lane-expanded matrix (w).
+    unsigned p[16];
+    __m512i idx[1];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 32 * 64 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(15);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bit occupied by the in-register qubit.
+    unsigned qmask = (1 << qs[0]);
+    // idx[0]: lane permutation; MaskedAdd presumably adds 1 to the qmask bit of lane j (mod 2) - confirm against its definition.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // bits of j outside qmask are kept unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // arguments run from lane 15 down to lane 0
+    }
+    // Expand the matrix so that every lane holds the element it multiplies: i = strided-qubit combination, m = input slot.
+    for (unsigned i = 0; i < 32; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the qmask bit of lane j - confirm
+          p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2);  // float index of element (row 2 * i + k, column 2 * (m / 2) + (k + m) % 2)
+        }
+
+        unsigned l = 2 * (64 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of one output register
+      __m512 rs[64], is[64];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]);  // 16 * ii spread around the strided qubit bits, which stay 0
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // One register pair per strided-qubit combination, followed by its lane-permuted copy.
+      for (unsigned l = 0; l < 32; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // Each output register pair is the complex sum over all 64 inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 32; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 5 strided qubits: one work item covers 2^k columns of a row.
+    unsigned k = 9;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate6HHHHLL(const std::vector<unsigned>& qs,  // 6-qubit gate: qs[0..1] act on register lanes, qs[2..5] on memory strides.
+                        const fp_type* matrix, Unitary& state) const {  // matrix: interleaved (re, im) floats; state is updated in place.
+    uint64_t xs[4];  // float offset selected by each strided qubit (qs[2..5])
+    uint64_t ms[5];  // bit masks used to skip the strided qubit positions in the column index
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 2] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1);  // bits between two consecutive strided qubits (assumes qs ascending - confirm)
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+    // xss[i]: combined float offset for every 0/1 assignment i of the strided qubits.
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Scratch for the lane permutations (p, idx) and the lane-expanded matrix (w).
+    unsigned p[16];
+    __m512i idx[3];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 16 * 64 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(14);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits occupied by the in-register qubits.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i]: lane permutation; MaskedAdd presumably adds i + 1 to the qmask bits of lane j (mod 4) - confirm against its definition.
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // bits of j outside qmask are kept unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // arguments run from lane 15 down to lane 0
+    }
+    // Expand the matrix so that every lane holds the element it multiplies: i = strided-qubit combination, m = input slot.
+    for (unsigned i = 0; i < 16; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the qmask bits of lane j packed together - confirm
+          p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4);  // float index of element (row 4 * i + k, column 4 * (m / 4) + (k + m) % 4)
+        }
+
+        unsigned l = 2 * (64 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of one output register
+      __m512 rs[64], is[64];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]) | (256 * ii & ms[4]);  // 16 * ii spread around the strided qubit bits, which stay 0
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // One register pair per strided-qubit combination, followed by its 3 lane-permuted copies.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // Each output register pair is the complex sum over all 64 inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 16; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 4 strided qubits: one work item covers 2^k columns of a row.
+    unsigned k = 8;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate6HHHLLL(const std::vector<unsigned>& qs,  // 6-qubit gate: qs[0..2] act on register lanes, qs[3..5] on memory strides.
+                        const fp_type* matrix, Unitary& state) const {  // matrix: interleaved (re, im) floats; state is updated in place.
+    uint64_t xs[3];  // float offset selected by each strided qubit (qs[3..5])
+    uint64_t ms[4];  // bit masks used to skip the strided qubit positions in the column index
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[3] + 1);
+    ms[0] = (uint64_t{1} << qs[3]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 3] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1);  // bits between two consecutive strided qubits (assumes qs ascending - confirm)
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i]: combined float offset for every 0/1 assignment i of the strided qubits.
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Scratch for the lane permutations (p, idx) and the lane-expanded matrix (w).
+    unsigned p[16];
+    __m512i idx[7];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 8 * 64 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(13);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits occupied by the in-register qubits.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i]: lane permutation; MaskedAdd presumably adds i + 1 to the qmask bits of lane j (mod 8) - confirm against its definition.
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));  // bits of j outside qmask are kept unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // arguments run from lane 15 down to lane 0
+    }
+    // Expand the matrix so that every lane holds the element it multiplies: i = strided-qubit combination, m = input slot.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the qmask bits of lane j packed together - confirm
+          p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8);  // float index of element (row 8 * i + k, column 8 * (m / 8) + (k + m) % 8)
+        }
+
+        unsigned l = 2 * (64 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of one output register
+      __m512 rs[64], is[64];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2])
+          | (128 * ii & ms[3]);  // 16 * ii spread around the strided qubit bits, which stay 0
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // One register pair per strided-qubit combination, followed by its 7 lane-permuted copies.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // Each output register pair is the complex sum over all 64 inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 3 strided qubits: one work item covers 2^k columns of a row.
+    unsigned k = 7;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyGate6HHLLLL(const std::vector<unsigned>& qs,  // 6-qubit gate: qs[0..3] act on register lanes, qs[4..5] on memory strides.
+                        const fp_type* matrix, Unitary& state) const {  // matrix: interleaved (re, im) floats; state is updated in place.
+    uint64_t xs[2];  // float offset selected by each strided qubit (qs[4], qs[5])
+    uint64_t ms[3];  // bit masks used to skip the strided qubit positions in the column index
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[4] + 1);
+    ms[0] = (uint64_t{1} << qs[4]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 4] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1);  // bits between two consecutive strided qubits (assumes qs ascending - confirm)
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: combined float offset for every 0/1 assignment i of the strided qubits.
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Scratch for the lane permutations (p, idx) and the lane-expanded matrix (w).
+    unsigned p[16];
+    __m512i idx[15];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 4 * 64 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(12);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bits occupied by the in-register qubits.
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+    // idx[i]: lane permutation; MaskedAdd presumably adds i + 1 to the qmask bits of lane j (mod 16) - confirm against its definition.
+    for (unsigned i = 0; i < 15; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));  // bits of j outside qmask are kept unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // arguments run from lane 15 down to lane 0
+    }
+    // Expand the matrix so that every lane holds the element it multiplies: i = strided-qubit combination, m = input slot.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 64; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the qmask bits of lane j packed together - confirm
+          p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16);  // float index of element (row 16 * i + k, column 16 * (m / 16) + (k + m) % 16)
+        }
+
+        unsigned l = 2 * (64 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of one output register
+      __m512 rs[64], is[64];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]);  // 16 * ii spread around the strided qubit bits, which stay 0
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // One register pair per strided-qubit combination, followed by its 15 lane-permuted copies.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 16; ++j) {
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // Each output register pair is the complex sum over all 64 inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 64; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 2 strided qubits: one work item covers 2^k columns of a row.
+    unsigned k = 6;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate1H_H(const std::vector<unsigned>& qs,  // Controlled 1-qubit gate; target qs[0] is addressed through a memory stride.
+                               const std::vector<unsigned>& cqs,  // control qubits; all of them enter the column index (none is treated as a lane bit)
+                               uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: 2 x 2, interleaved (re, im) floats
+                               Unitary& state) const {  // state is updated in place
+    uint64_t xs[1];  // float offset selected by the target qubit
+    uint64_t ms[2];  // filled in here and passed on, but the kernel below does not read it
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: float offset for target value i (0 or xs[0]).
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;
+    // First pass: emaskh marks the control qubit positions.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // ExpandBits presumably deposits the bits of cmask into the positions set in emaskh - confirm against bits::ExpandBits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Second pass: also mark the target qubit.
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Complement, then flip the low 4 bits: the set bits now mark the positions that take bits of the loop index.
+    emaskh = ~emaskh ^ 15;
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // ru / iu: one matrix element broadcast to all lanes; rn / in: output accumulators
+      __m512 rs[2], is[2];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // free bits come from ii, control bits from cmaskh
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load one register pair per target value.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // running float index into v: element (l, n) is read at 2 * (2 * l + n)
+      // Output l is the complex sum over n of v(l, n) * input n.
+      for (unsigned l = 0; l < 2; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);  // re = rs * ru - is * iu
+        in = _mm512_mul_ps(rs[0], iu);  // im = rs * iu + is * ru
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 1 target qubit + one bit per control: these bits are not enumerated by the loop index.
+    unsigned k = 5 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, matrix, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate1H_L(const std::vector<unsigned>& qs,  // Controlled 1-qubit gate; target qs[0] is addressed through a memory stride.
+                               const std::vector<unsigned>& cqs,  // control qubits; those with q <= 3 are handled per register lane
+                               uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: 2 x 2, interleaved (re, im) floats
+                               Unitary& state) const {  // state is updated in place
+    uint64_t xs[1];  // float offset selected by the target qubit
+    uint64_t ms[2];  // filled in here and passed on, but the kernel below does not read it
+    // Data is read as 16 real floats followed by 16 imaginary floats (see the loads below), hence the shift by q + 1.
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: float offset for target value i (0 or xs[0]).
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;  // number of controls with q <= 3
+    uint64_t emaskl = 0;  // positions of the controls with q <= 3
+    uint64_t emaskh = 0;  // positions of the controls with q > 3
+    // Split the controls into lane-index controls (q <= 3) and column-index controls (q > 3).
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are taken as the lane-control values (assumes cqs is sorted ascending - confirm); ExpandBits presumably deposits them at the mask positions.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+    // Also mark the target qubit.
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Complement, then flip the low 4 bits: the set bits now mark the positions that take bits of the loop index.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 2 * 2 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(6);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // NOTE(review): if qs[0] > 3 this bit lies outside the 4 lane bits, so k below is presumably always 0 - confirm against bits::CompressBits.
+    unsigned qmask = (1 << qs[0]);
+    // Build per-lane matrix registers: lanes whose control bits match get the gate, all other lanes get the identity.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (2 * i + 2 * k + m);
+        }
+
+        unsigned l = 2 * (2 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0;  // identity entry: 1 when row == column of the 2 x 2 element, else 0
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // the identity has no imaginary part
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of one output register
+      __m512 rs[2], is[2];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // free bits come from ii, control bits from cmaskh
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load one register pair per target value.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // Each output register pair is the complex sum over both inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // results overwrite the inputs, which were all loaded above
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + 1 target qubit + one bit per control with q > 3 (lane controls are already inside the 4 lane bits).
+    unsigned k = 5 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate1L_H(const std::vector<unsigned>& qs,  // Controlled 1-qubit gate; target qs[0] is handled as a register-lane bit.
+                               const std::vector<unsigned>& cqs,  // control qubits; all of them enter the column index (none is treated as a lane bit)
+                               uint64_t cmask, const fp_type* matrix,  // cmask: packed control values; matrix: 2 x 2, interleaved (re, im) floats
+                               Unitary& state) const {  // state is updated in place
+    uint64_t emaskh = 0;
+    // First pass: emaskh marks the control qubit positions.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // ExpandBits presumably deposits the bits of cmask into the positions set in emaskh - confirm against bits::ExpandBits.
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+    // Second pass: mark target qubits above the lane bits only (q > 3).
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement, then flip the low 4 bits: the set bits now mark the positions that take bits of the loop index.
+    emaskh = ~emaskh ^ 15;
+    // Scratch for the lane permutation (p, idx) and the lane-expanded matrix (w).
+    unsigned p[16];
+    __m512i idx[1];
+    // NOTE(review): the argument of UnitarySpace::Create presumably sizes the scratch buffer; the loops below write 2 * 32 floats - confirm it is large enough.
+    auto s = UnitarySpace::Create(5);
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+    // Lane-index bit occupied by the target qubit.
+    unsigned qmask = (1 << qs[0]);
+    // idx[0]: lane permutation; MaskedAdd presumably adds 1 to the qmask bit of lane j (mod 2) - confirm against its definition.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // bits of j outside qmask are kept unchanged
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // arguments run from lane 15 down to lane 0
+    }
+    // Expand the matrix so that every lane holds the element it multiplies: m = input slot.
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // presumably the qmask bit of lane j - confirm
+          p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2);  // float index of element (row k, column (k + m) % 2)
+        }
+
+        unsigned l = 2 * (2 * i + m);  // w[l] receives the real parts, w[l + 1] the imaginary parts
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel for one work item i; its first two parameters are not read (the inner loop declares its own n).
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // accumulators for the real / imaginary parts of the output register
+      __m512 rs[2], is[2];  // real / imaginary input registers
+      // Split the work index into a column-group index ii and a row index r.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // free bits come from ii, control bits from cmaskh
+      // row_size is the row stride in floats; column c starts 2 * c floats into its row.
+      auto p0 = rstate + row_size * r + 2 * c;
+      // A single register pair is loaded; its lane-permuted copy supplies the second input.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0);
+        is[2 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // running index into w; advances by one (re, im) register pair per term
+      // The output register pair is the complex sum over both inputs times the matching matrix registers.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm512_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // results overwrite the inputs, which were loaded above
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // k = 4 lane bits + one bit per control: these bits are not enumerated by the loop index.
+    unsigned k = 4 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // handed to the kernel as row_size
+    // for_ is expected to call f for every index in [0, size * size2) - confirm against the For implementation in use.
+    for_.Run(size * size2, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate1L_L(const std::vector<unsigned>& qs,  // Applies a controlled gate on one target qubit, qs[0], to `state` in place.
+                               const std::vector<unsigned>& cqs,  // cqs: control qubits; split below at q > 3.
+                               uint64_t cmask, const fp_type* matrix,  // matrix is read as interleaved (re, im) floats.
+                               Unitary& state) const {
+    unsigned cl = 0;  // Number of control qubits with index <= 3.
+    uint64_t emaskl = 0;  // Bit mask of control qubits with index <= 3.
+    uint64_t emaskh = 0;  // Bit mask of control qubits with index > 3 (targets > 3 are added further down).
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // cmask minus its low cl bits, presumably deposited onto the emaskh positions (ExpandBits is defined elsewhere).
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // Low cl bits of cmask, presumably deposited onto the emaskl positions.
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // Only bits above 3 were set, so this inverts bits 4 and up and leaves bits 0..3 clear.
+
+    unsigned p[16];  // Per-lane scratch: permutation indices first, matrix offsets afterwards.
+    __m512i idx[1];  // Lane permutation handed to the kernel.
+
+    auto s = UnitarySpace::Create(5);  // Scratch buffer for the per-lane weights; the loops below fill wf[0..63].
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // Float view of the same buffer.
+
+    unsigned qmask = (1 << qs[0]);  // Bit of qs[0]; applied to lane indices j in 0..15 below.
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // (-1 ^ qmask) == ~qmask keeps the non-target bits of j; MaskedAdd (defined elsewhere) presumably advances the target bit by i + 1 modulo 2.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from element 15 down to element 0, so element j holds p[j].
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 2; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the target-bit value of lane j (CompressBits is defined elsewhere).
+          p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2);  // Float offset of the real part of matrix entry 2 * k + (k + m) % 2 (i == 0 and m / 2 == 0 here).
+        }
+
+        unsigned l = 2 * (2 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0;  // Identity-matrix entry: 1 when the entry's two 2-way indices are equal.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // Lanes whose low control bits differ from cmaskl get the identity instead of the gate.
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // Imaginary part; zero for the identity lanes.
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Kernel for one index i; the n and m parameters are not read by the body.
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // Accumulators for the new real and imaginary parts.
+      __m512 rs[2], is[2];  // Loaded and permuted real / imaginary vectors.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required high control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0);  // 16 floats used as real parts.
+        is[2 * l] = _mm512_load_ps(p0 + 16);  // The next 16 floats used as imaginary parts.
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // Lane-permuted copy of the loaded vector.
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 (real, imaginary) per term.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Complex multiply: re = rs * wr - is * wi, im = rs * wi + is * wr.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 2; ++n) {  // Accumulate the remaining term the same way.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // Results overwrite the locations that were loaded.
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size() - cl;  // 4 bits for the 16 lanes plus the control qubits above bit 3.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, w,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate2HH_H(const std::vector<unsigned>& qs,  // Applies a controlled gate on two target qubits, qs[0] and qs[1], to `state` in place.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubits; all of them go into emaskh here.
+                                uint64_t cmask, const fp_type* matrix,  // matrix is read by the kernel as 4 rows of 4 interleaved (re, im) entries.
+                                Unitary& state) const {
+    uint64_t xs[2];  // Float offset contributed by each target qubit.
+    uint64_t ms[3];  // Index-range masks around the targets; passed to the kernel, which does not read them.
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 * 2^qs[0], matching the two-floats-per-column scaling of p0 in the kernel.
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // Bits below qs[0].
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // Bits strictly between the two targets (assuming qs is ascending).
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // Bits above qs[1], up to num_qubits.
+
+    uint64_t xss[4];  // xss[i]: sum of xs[k] over the bits k set in i.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;  // Collects control bits first, then target bits.
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // cmask presumably deposited onto the control-bit positions (ExpandBits is defined elsewhere).
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    emaskh = ~emaskh ^ 15;  // Inverts the mask, then flips bits 0..3; they end up clear if no qubit below 4 was added (NOTE(review): presumably ensured by the caller).
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // Kernel for one index i; v is the gate matrix; the n and m parameters are not read.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // Broadcast matrix entry (re, im) and result accumulators (re, im).
+      __m512 rs[4], is[4];  // Real / imaginary vectors for the four target combinations.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 floats used as real parts.
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // The next 16 floats used as imaginary parts.
+      }
+
+      uint64_t j = 0;  // Walks v in storage order, two floats per entry.
+
+      for (unsigned l = 0; l < 4; ++l) {  // One output vector pair per group of four matrix entries.
+        ru = _mm512_set1_ps(v[j]);  // Broadcast real part of the entry.
+        iu = _mm512_set1_ps(v[j + 1]);  // Broadcast imaginary part of the entry.
+        rn = _mm512_mul_ps(rs[0], ru);  // Complex multiply: re = rs * ru - is * iu, im = rs * iu + is * ru.
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // Accumulate the remaining three terms.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Inputs were already copied into rs/is, so storing in place is safe.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6 + cqs.size();  // 4 bits for the 16 lanes, 2 target qubits, plus every control qubit.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, matrix, ms, xss,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate2HH_L(const std::vector<unsigned>& qs,  // Applies a controlled gate on two target qubits, qs[0] and qs[1], to `state` in place.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubits; split below at q > 3.
+                                uint64_t cmask, const fp_type* matrix,  // matrix is read as interleaved (re, im) floats.
+                                Unitary& state) const {
+    uint64_t xs[2];  // Float offset contributed by each target qubit.
+    uint64_t ms[3];  // Index-range masks around the targets; passed to the kernel, which does not read them.
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 * 2^qs[0], matching the two-floats-per-column scaling of p0 in the kernel.
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // Bits below qs[0].
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // Bits strictly between the two targets (assuming qs is ascending).
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);  // Bits above qs[1], up to num_qubits.
+
+    uint64_t xss[4];  // xss[i]: sum of xs[k] over the bits k set in i.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;  // Number of control qubits with index <= 3.
+    uint64_t emaskl = 0;  // Bit mask of control qubits with index <= 3.
+    uint64_t emaskh = 0;  // Bit mask of control qubits with index > 3 (targets are added further down).
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // cmask minus its low cl bits, presumably deposited onto the emaskh positions (ExpandBits is defined elsewhere).
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // Low cl bits of cmask, presumably deposited onto the emaskl positions.
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    emaskh = ~emaskh ^ 15;  // Inverts the mask, then flips bits 0..3; they end up clear if no target below 4 was added (NOTE(review): presumably ensured by the caller).
+
+    unsigned p[16];  // Per-lane matrix offsets.
+
+    auto s = UnitarySpace::Create(8);  // Scratch buffer for the per-lane weights; the loops below fill wf[0..511].
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // Float view of the same buffer.
+
+    unsigned qmask = (1 << qs[0]);  // Bit of qs[0]; only used for CompressBits below.
+
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // NOTE(review): if qs[0] > 3, qmask has no bit in 0..15 and k is presumably always 0 - confirm.
+          p[j] = 2 * (4 * i + 4 * k + m);  // Float offset of the real part of matrix entry 4 * (i + k) + m.
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0;  // Identity-matrix entry: 1 when the entry's two 4-way indices are equal.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // Lanes whose low control bits differ from cmaskl get the identity instead of the gate.
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // Imaginary part; zero for the identity lanes.
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Kernel for one index i; the n and m parameters are not read by the body.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 rn, in;  // Accumulators for the new real and imaginary parts.
+      __m512 rs[4], is[4];  // Real / imaginary vectors for the four target combinations.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required high control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 floats used as real parts.
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // The next 16 floats used as imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 (real, imaginary) per term.
+
+      for (unsigned l = 0; l < 4; ++l) {  // One output vector pair per group of four weight pairs.
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Complex multiply: re = rs * wr - is * wi, im = rs * wi + is * wr.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // Accumulate the remaining three terms.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Inputs were already copied into rs/is, so storing in place is safe.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 6 + cqs.size() - cl;  // 4 bits for the 16 lanes, 2 target qubits, plus the control qubits above bit 3.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, w, ms, xss,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate2HL_H(const std::vector<unsigned>& qs,  // Applies a controlled gate on two target qubits to `state` in place: qs[0] via lane permutation, qs[1] via memory offset.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubits; all of them go into emaskh here.
+                                uint64_t cmask, const fp_type* matrix,  // matrix is read as interleaved (re, im) floats.
+                                Unitary& state) const {
+    uint64_t xs[1];  // Float offset contributed by target qs[1].
+    uint64_t ms[2];  // Index-range masks around qs[1]; passed to the kernel, which does not read them.
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // 2 * 2^qs[1], matching the two-floats-per-column scaling of p0 in the kernel.
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // Bits below qs[1].
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // Bits above qs[1], up to num_qubits.
+
+    uint64_t xss[2];  // xss[0] == 0, xss[1] == xs[0].
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;  // Collects control bits first, then target bits above 3.
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // cmask presumably deposited onto the control-bit positions (ExpandBits is defined elsewhere).
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // Inverts the mask, then flips bits 0..3; they end up clear if no control below 4 was added (NOTE(review): presumably ensured by the caller).
+
+    unsigned p[16];  // Per-lane scratch: permutation indices first, matrix offsets afterwards.
+    __m512i idx[1];  // Lane permutation handed to the kernel.
+
+    auto s = UnitarySpace::Create(7);  // Scratch buffer for the per-lane weights; the loops below fill wf[0..255].
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // Float view of the same buffer.
+
+    unsigned qmask = (1 << qs[0]);  // Bit of qs[0]; applied to lane indices j in 0..15 below.
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // (-1 ^ qmask) == ~qmask keeps the non-target bits of j; MaskedAdd (defined elsewhere) presumably advances the target bit by i + 1 modulo 2.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from element 15 down to element 0, so element j holds p[j].
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qs[0]-bit value of lane j (CompressBits is defined elsewhere).
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);  // Float offset of the real part of matrix entry 4 * (2 * i + k) + 2 * (m / 2) + (k + m) % 2.
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];  // Real part for lane j.
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];  // Imaginary part for lane j.
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Kernel for one index i; the n and m parameters are not read by the body.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // Accumulators for the new real and imaginary parts.
+      __m512 rs[4], is[4];  // Even slots: loaded vectors; odd slots: their lane-permuted copies.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 floats used as real parts.
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // The next 16 floats used as imaginary parts.
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // Lane-permuted copy of the loaded vector.
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 (real, imaginary) per term.
+
+      for (unsigned l = 0; l < 2; ++l) {  // One output vector pair per memory offset xss[l].
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Complex multiply: re = rs * wr - is * wi, im = rs * wi + is * wr.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // Accumulate the remaining three terms.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Inputs were already copied into rs/is, so storing in place is safe.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5 + cqs.size();  // 4 bits for the 16 lanes, 1 offset-addressed target, plus every control qubit.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, w, ms, xss,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate2HL_L(const std::vector<unsigned>& qs,  // Applies a controlled gate on two target qubits to `state` in place: qs[0] via lane permutation, qs[1] via memory offset.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubits; split below at q > 3.
+                                uint64_t cmask, const fp_type* matrix,  // matrix is read as interleaved (re, im) floats.
+                                Unitary& state) const {
+    uint64_t xs[1];  // Float offset contributed by target qs[1].
+    uint64_t ms[2];  // Index-range masks around qs[1]; passed to the kernel, which does not read them.
+
+    xs[0] = uint64_t{1} << (qs[1] + 1);  // 2 * 2^qs[1], matching the two-floats-per-column scaling of p0 in the kernel.
+    ms[0] = (uint64_t{1} << qs[1]) - 1;  // Bits below qs[1].
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);  // Bits above qs[1], up to num_qubits.
+
+    uint64_t xss[2];  // xss[0] == 0, xss[1] == xs[0].
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    unsigned cl = 0;  // Number of control qubits with index <= 3.
+    uint64_t emaskl = 0;  // Bit mask of control qubits with index <= 3.
+    uint64_t emaskh = 0;  // Bit mask of control qubits with index > 3 (targets > 3 are added further down).
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // cmask minus its low cl bits, presumably deposited onto the emaskh positions (ExpandBits is defined elsewhere).
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // Low cl bits of cmask, presumably deposited onto the emaskl positions.
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // Only bits above 3 were set, so this inverts bits 4 and up and leaves bits 0..3 clear.
+
+    unsigned p[16];  // Per-lane scratch: permutation indices first, matrix offsets afterwards.
+    __m512i idx[1];  // Lane permutation handed to the kernel.
+
+    auto s = UnitarySpace::Create(7);  // Scratch buffer for the per-lane weights; the loops below fill wf[0..255].
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // Float view of the same buffer.
+
+    unsigned qmask = (1 << qs[0]);  // Bit of qs[0]; applied to lane indices j in 0..15 below.
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));  // (-1 ^ qmask) == ~qmask keeps the non-target bits of j; MaskedAdd (defined elsewhere) presumably advances the target bit by i + 1 modulo 2.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from element 15 down to element 0, so element j holds p[j].
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qs[0]-bit value of lane j (CompressBits is defined elsewhere).
+          p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2);  // Float offset of the real part of matrix entry 4 * (2 * i + k) + 2 * (m / 2) + (k + m) % 2.
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0;  // Identity-matrix entry: 1 when the entry's two 4-way indices are equal.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // Lanes whose low control bits differ from cmaskl get the identity instead of the gate.
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // Imaginary part; zero for the identity lanes.
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Kernel for one index i; the n and m parameters are not read by the body.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // Accumulators for the new real and imaginary parts.
+      __m512 rs[4], is[4];  // Even slots: loaded vectors; odd slots: their lane-permuted copies.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required high control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);  // 16 floats used as real parts.
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);  // The next 16 floats used as imaginary parts.
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);  // Lane-permuted copy of the loaded vector.
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 (real, imaginary) per term.
+
+      for (unsigned l = 0; l < 2; ++l) {  // One output vector pair per memory offset xss[l].
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Complex multiply: re = rs * wr - is * wi, im = rs * wi + is * wr.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // Accumulate the remaining three terms.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Inputs were already copied into rs/is, so storing in place is safe.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 5 + cqs.size() - cl;  // 4 bits for the 16 lanes, 1 offset-addressed target, plus the control qubits above bit 3.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, w, ms, xss,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate2LL_H(const std::vector<unsigned>& qs,  // Applies a controlled gate on two target qubits, qs[0] and qs[1], to `state` in place using lane permutations.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubits; all of them go into emaskh here.
+                                uint64_t cmask, const fp_type* matrix,  // matrix is read as interleaved (re, im) floats.
+                                Unitary& state) const {
+    uint64_t emaskh = 0;  // Collects control bits first, then target bits above 3.
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // cmask presumably deposited onto the control-bit positions (ExpandBits is defined elsewhere).
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // Inverts the mask, then flips bits 0..3; they end up clear if no control below 4 was added (NOTE(review): presumably ensured by the caller).
+
+    unsigned p[16];  // Per-lane scratch: permutation indices first, matrix offsets afterwards.
+    __m512i idx[3];  // Three lane permutations handed to the kernel.
+
+    auto s = UnitarySpace::Create(6);  // Scratch buffer for the per-lane weights; the loops below fill wf[0..127].
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // Float view of the same buffer.
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // Bits of both targets; applied to lane indices j in 0..15 below.
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // (-1 ^ qmask) == ~qmask keeps the non-target bits of j; MaskedAdd (defined elsewhere) presumably advances the two-bit target field by i + 1 modulo 4.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from element 15 down to element 0, so element j holds p[j].
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the two-bit target value of lane j (CompressBits is defined elsewhere).
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);  // Float offset of the real part of matrix entry 4 * k + (k + m) % 4 (i == 0 and m / 4 == 0 here).
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];  // Real part for lane j.
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];  // Imaginary part for lane j.
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Kernel for one index i; the n and m parameters are not read by the body.
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // Accumulators for the new real and imaginary parts.
+      __m512 rs[4], is[4];  // Slot 0: loaded vectors; slots 1..3: lane-permuted copies.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);  // 16 floats used as real parts.
+        is[4 * l] = _mm512_load_ps(p0 + 16);  // The next 16 floats used as imaginary parts.
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // Lane-permuted copy of the loaded vector.
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 (real, imaginary) per term.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Complex multiply: re = rs * wr - is * wi, im = rs * wi + is * wr.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // Accumulate the remaining three terms.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // Results overwrite the locations that were loaded.
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size();  // 4 bits for the 16 lanes plus every control qubit.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, w,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate2LL_L(const std::vector<unsigned>& qs,  // Applies a controlled gate on two target qubits, qs[0] and qs[1], to `state` in place using lane permutations.
+                                const std::vector<unsigned>& cqs,  // cqs: control qubits; split below at q > 3.
+                                uint64_t cmask, const fp_type* matrix,  // matrix is read as interleaved (re, im) floats.
+                                Unitary& state) const {
+    unsigned cl = 0;  // Number of control qubits with index <= 3.
+    uint64_t emaskl = 0;  // Bit mask of control qubits with index <= 3.
+    uint64_t emaskh = 0;  // Bit mask of control qubits with index > 3 (targets > 3 are added further down).
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);  // cmask minus its low cl bits, presumably deposited onto the emaskh positions (ExpandBits is defined elsewhere).
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);  // Low cl bits of cmask, presumably deposited onto the emaskl positions.
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // Only bits above 3 were set, so this inverts bits 4 and up and leaves bits 0..3 clear.
+
+    unsigned p[16];  // Per-lane scratch: permutation indices first, matrix offsets afterwards.
+    __m512i idx[3];  // Three lane permutations handed to the kernel.
+
+    auto s = UnitarySpace::Create(6);  // Scratch buffer for the per-lane weights; the loops below fill wf[0..127].
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;  // Float view of the same buffer.
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);  // Bits of both targets; applied to lane indices j in 0..15 below.
+
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));  // (-1 ^ qmask) == ~qmask keeps the non-target bits of j; MaskedAdd (defined elsewhere) presumably advances the two-bit target field by i + 1 modulo 4.
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],  // Arguments run from element 15 down to element 0, so element j holds p[j].
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 4; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the two-bit target value of lane j (CompressBits is defined elsewhere).
+          p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4);  // Float offset of the real part of matrix entry 4 * k + (k + m) % 4 (i == 0 and m / 4 == 0 here).
+        }
+
+        unsigned l = 2 * (4 * i + m);  // w[l] receives real parts, w[l + 1] imaginary parts.
+
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0;  // Identity-matrix entry: 1 when the entry's two 4-way indices are equal.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;  // Lanes whose low control bits differ from cmaskl get the identity instead of the gate.
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;  // Imaginary part; zero for the identity lanes.
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,  // Kernel for one index i; the n and m parameters are not read by the body.
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // Accumulators for the new real and imaginary parts.
+      __m512 rs[4], is[4];  // Slot 0: loaded vectors; slots 1..3: lane-permuted copies.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required high control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0);  // 16 floats used as real parts.
+        is[4 * l] = _mm512_load_ps(p0 + 16);  // The next 16 floats used as imaginary parts.
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);  // Lane-permuted copy of the loaded vector.
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 (real, imaginary) per term.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Complex multiply: re = rs * wr - is * wi, im = rs * wi + is * wr.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 4; ++n) {  // Accumulate the remaining three terms.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // Results overwrite the locations that were loaded.
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size() - cl;  // 4 bits for the 16 lanes plus the control qubits above bit 3.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, w,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3HHH_H(const std::vector<unsigned>& qs,  // Applies a controlled gate on three target qubits, qs[0..2], to `state` in place.
+                                 const std::vector<unsigned>& cqs,  // cqs: control qubits; all of them go into emaskh here.
+                                 uint64_t cmask, const fp_type* matrix,  // matrix is read by the kernel as 8 rows of 8 interleaved (re, im) entries.
+                                 Unitary& state) const {
+    uint64_t xs[3];  // Float offset contributed by each target qubit.
+    uint64_t ms[4];  // Index-range masks around the targets; passed to the kernel, which does not read them.
+
+    xs[0] = uint64_t{1} << (qs[0] + 1);  // 2 * 2^qs[0], matching the two-floats-per-column scaling of p0 in the kernel.
+    ms[0] = (uint64_t{1} << qs[0]) - 1;  // Bits below qs[0].
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);  // Bits strictly between targets i - 1 and i (assuming qs is ascending).
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);  // Bits above qs[2], up to num_qubits.
+
+    uint64_t xss[8];  // xss[i]: sum of xs[k] over the bits k set in i.
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+
+    uint64_t emaskh = 0;  // Collects control bits first, then target bits.
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);  // cmask presumably deposited onto the control-bit positions (ExpandBits is defined elsewhere).
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    emaskh = ~emaskh ^ 15;  // Inverts the mask, then flips bits 0..3; they end up clear if no qubit below 4 was added (NOTE(review): presumably ensured by the caller).
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // Kernel for one index i; v is the gate matrix; the n and m parameters are not read.
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;  // Broadcast matrix entry (re, im) and result accumulators (re, im).
+      __m512 rs[8], is[8];  // Real / imaginary vectors for the eight target combinations.
+
+      uint64_t ii = i % size;  // Position within one row.
+      uint64_t r = i / size;  // Row index (multiplied by row_size below).
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;  // ii presumably spread over the bits set in emaskh, then OR-ed with the required control bits.
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Two floats per column index.
+
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);  // 16 floats used as real parts.
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);  // The next 16 floats used as imaginary parts.
+      }
+
+      uint64_t j = 0;  // Walks v in storage order, two floats per entry.
+
+      for (unsigned l = 0; l < 8; ++l) {  // One output vector pair per group of eight matrix entries.
+        ru = _mm512_set1_ps(v[j]);  // Broadcast real part of the entry.
+        iu = _mm512_set1_ps(v[j + 1]);  // Broadcast imaginary part of the entry.
+        rn = _mm512_mul_ps(rs[0], ru);  // Complex multiply: re = rs * ru - is * iu, im = rs * iu + is * ru.
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 8; ++n) {  // Accumulate the remaining seven terms.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0 + xss[l], rn);  // Inputs were already copied into rs/is, so storing in place is safe.
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 7 + cqs.size();  // 4 bits for the 16 lanes, 3 target qubits, plus every control qubit.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;  // Clamped to 0 when num_qubits <= k.
+    uint64_t size = uint64_t{1} << n;  // Kernel indices per row (the kernel uses i % size).
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows covered (the kernel uses i / size).
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());  // Passed to the kernel as row_size.
+
+    for_.Run(size * size2, f, matrix, ms, xss,  // Presumably calls f for each i in [0, size * size2) with the trailing arguments (Run is defined elsewhere).
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3HHH_L(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0..2] all become memory offsets.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; those with q <= 3 are folded into w below.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t xs[3];
+    uint64_t ms[4];  // Passed to the kernel below, whose body does not read it.
+    // xs[i] = 2^(qs[i] + 1): offset in fp_type elements tied to gate qubit qs[i].
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 3; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1);
+    // xss[i]: sum of xs[k] over the set bits k of i (offset of the i-th load/store).
+    uint64_t xss[8];
+    for (unsigned i = 0; i < 8; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 3; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Split the controls: q > 3 goes to emaskh; q <= 3 goes to emaskl and is counted in cl.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are expanded with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+
+    auto s = UnitarySpace::Create(10);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+    // Build w: for output index i and input index m, 16 real lanes followed by 16 imaginary lanes.
+    for (unsigned i = 0; i < 8; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (8 * i + 8 * k + m);  // Index of the real part of the matrix entry used by lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Lanes whose emaskl bits differ from cmaskl get the identity entry v instead of the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // Imaginary parts: 0 for lanes that fail the low-control check.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n, m and ms are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load 8 vectors used as real parts and, 16 elements further, 8 used as imaginary parts.
+      for (unsigned l = 0; l < 8; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per (l, n) pair.
+      // Output l = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 8; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 7 + number of controls with q > 3.
+    unsigned k = 7 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3HHL_H(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0] uses lane permutation, qs[1..2] memory offsets.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; all of them go into emaskh.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t xs[2];
+    uint64_t ms[3];  // Passed to the kernel below, whose body does not read it.
+    // xs[i] = 2^(qs[i + 1] + 1): offset in fp_type elements tied to gate qubit qs[i + 1].
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: sum of xs[k] over the set bits k of i (offset of the i-th load/store).
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Collect every control qubit in emaskh and expand cmask onto those bit positions.
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[1];
+
+    auto s = UnitarySpace::Create(9);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+    // idx[i]: lane permutation; bits of j outside qmask are kept, the rest comes from MaskedAdd (not shown here).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Build w: for each (i, m), 16 real lanes followed by 16 imaginary lanes, in the order the kernel reads them.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);  // Index of the real part of the matrix entry for lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Real parts for all 16 lanes.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // Imaginary parts for all 16 lanes.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n, m and ms are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load 4 real/imaginary vector pairs; each is followed by its idx[0]-permuted copy.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per (l, n) pair.
+      // Output l = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 6 + number of controls.
+    unsigned k = 6 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3HHL_L(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0] uses lane permutation, qs[1..2] memory offsets.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; those with q <= 3 are folded into w below.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t xs[2];
+    uint64_t ms[3];  // Passed to the kernel below, whose body does not read it.
+    // xs[i] = 2^(qs[i + 1] + 1): offset in fp_type elements tied to gate qubit qs[i + 1].
+    xs[0] = uint64_t{1} << (qs[1] + 1);
+    ms[0] = (uint64_t{1} << qs[1]) - 1;
+    for (unsigned i = 1; i < 2; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 1] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1);
+    // xss[i]: sum of xs[k] over the set bits k of i (offset of the i-th load/store).
+    uint64_t xss[4];
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 2; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Split the controls: q > 3 goes to emaskh; q <= 3 goes to emaskl and is counted in cl.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are expanded with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[1];
+
+    auto s = UnitarySpace::Create(9);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+    // idx[i]: lane permutation; bits of j outside qmask are kept, the rest comes from MaskedAdd (not shown here).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Build w: for each (i, m), 16 real lanes followed by 16 imaginary lanes, in the order the kernel reads them.
+    for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2);  // Index of the real part of the matrix entry for lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Lanes whose emaskl bits differ from cmaskl get the identity entry v instead of the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // Imaginary parts: 0 for lanes that fail the low-control check.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n, m and ms are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load 4 real/imaginary vector pairs; each is followed by its idx[0]-permuted copy.
+      for (unsigned l = 0; l < 4; ++l) {
+        rs[2 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 2; ++j) {
+          rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]);
+          is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per (l, n) pair.
+      // Output l = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 4; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 6 + number of controls with q > 3.
+    unsigned k = 6 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3HLL_H(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0..1] use lane permutation, qs[2] a memory offset.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; all of them go into emaskh.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t xs[1];
+    uint64_t ms[2];  // Passed to the kernel below, whose body does not read it.
+    // xs[0] = 2^(qs[2] + 1): offset in fp_type elements tied to gate qubit qs[2].
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: sum of xs[k] over the set bits k of i, i.e. {0, xs[0]}.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Collect every control qubit in emaskh and expand cmask onto those bit positions.
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[3];
+
+    auto s = UnitarySpace::Create(8);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i]: lane permutation; bits of j outside qmask are kept, the rest comes from MaskedAdd (not shown here).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Build w: for each (i, m), 16 real lanes followed by 16 imaginary lanes, in the order the kernel reads them.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);  // Index of the real part of the matrix entry for lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Real parts for all 16 lanes.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // Imaginary parts for all 16 lanes.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n, m and ms are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load 2 real/imaginary vector pairs; each is followed by its idx[0..2]-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per (l, n) pair.
+      // Output l = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 5 + number of controls.
+    unsigned k = 5 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3HLL_L(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0..1] use lane permutation, qs[2] a memory offset.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; those with q <= 3 are folded into w below.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t xs[1];
+    uint64_t ms[2];  // Passed to the kernel below, whose body does not read it.
+    // xs[0] = 2^(qs[2] + 1): offset in fp_type elements tied to gate qubit qs[2].
+    xs[0] = uint64_t{1} << (qs[2] + 1);
+    ms[0] = (uint64_t{1} << qs[2]) - 1;
+    ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1);
+    // xss[i]: sum of xs[k] over the set bits k of i, i.e. {0, xs[0]}.
+    uint64_t xss[2];
+    for (unsigned i = 0; i < 2; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 1; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Split the controls: q > 3 goes to emaskh; q <= 3 goes to emaskl and is counted in cl.
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are expanded with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[3];
+
+    auto s = UnitarySpace::Create(8);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+    // idx[i]: lane permutation; bits of j outside qmask are kept, the rest comes from MaskedAdd (not shown here).
+    for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Build w: for each (i, m), 16 real lanes followed by 16 imaginary lanes, in the order the kernel reads them.
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4);  // Index of the real part of the matrix entry for lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Lanes whose emaskl bits differ from cmaskl get the identity entry v instead of the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // Imaginary parts: 0 for lanes that fail the low-control check.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n, m and ms are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load 2 real/imaginary vector pairs; each is followed by its idx[0..2]-permuted copies.
+      for (unsigned l = 0; l < 2; ++l) {
+        rs[4 * l] = _mm512_load_ps(p0 + xss[l]);
+        is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16);
+
+        for (unsigned j = 1; j < 4; ++j) {
+          rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]);
+          is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per (l, n) pair.
+      // Output l = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 2; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 5 + number of controls with q > 3.
+    unsigned k = 5 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3LLL_H(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0..2] all use lane permutation.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; all of them go into emaskh.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t emaskh = 0;
+    // Collect every control qubit in emaskh and expand cmask onto those bit positions.
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[7];
+
+    auto s = UnitarySpace::Create(7);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i]: lane permutation; bits of j outside qmask are kept, the rest comes from MaskedAdd (not shown here).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Build w: for each m, 16 real lanes followed by 16 imaginary lanes (the i loop runs once).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);  // Index of the real part of the matrix entry for lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Real parts for all 16 lanes.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j] = matrix[p[j]];
+        }
+        // Imaginary parts for all 16 lanes.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = matrix[p[j] + 1];
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n and m are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load one real/imaginary vector pair and derive the other 7 inputs as idx-permuted copies.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0);
+        is[8 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per n.
+      // Output = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 4 + number of controls.
+    unsigned k = 4 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate3LLL_L(const std::vector<unsigned>& qs,  // Controlled 3-qubit gate; qs[0..2] all use lane permutation.
+                                 const std::vector<unsigned>& cqs,  // Control qubits; those with q <= 3 are folded into w below.
+                                 uint64_t cmask, const fp_type* matrix,  // Required control bit values; 8x8 matrix as (re, im) pairs.
+                                 Unitary& state) const {  // state is updated in place through state.get().
+    unsigned cl = 0;
+    uint64_t emaskl = 0;
+    uint64_t emaskh = 0;
+    // Split the controls: q > 3 goes to emaskh; q <= 3 goes to emaskl and is counted in cl.
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+    // The low cl bits of cmask are expanded with emaskl, the remaining bits with emaskh.
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+
+    unsigned p[16];
+    __m512i idx[7];
+
+    auto s = UnitarySpace::Create(7);  // Storage that w and wf alias through s.get().
+    __m512* w = (__m512*) s.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+    // idx[i]: lane permutation; bits of j outside qmask are kept, the rest comes from MaskedAdd (not shown here).
+    for (unsigned i = 0; i < 7; ++i) {
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask));
+      }
+      // _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so p[0] lands in lane 0.
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+    // Build w: for each m, 16 real lanes followed by 16 imaginary lanes (the i loop runs once).
+    for (unsigned i = 0; i < 1; ++i) {
+      for (unsigned m = 0; m < 8; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);  // Presumably the qmask bits of lane j (helper not shown here).
+          p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8);  // Index of the real part of the matrix entry for lane j.
+        }
+
+        unsigned l = 2 * (8 * i + m);
+        // Lanes whose emaskl bits differ from cmaskl get the identity entry v instead of the matrix entry.
+        for (unsigned j = 0; j < 16; ++j) {
+          fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0;  // 1 when row == column of the 8x8 matrix.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+        // Imaginary parts: 0 for lanes that fail the low-control check.
+        for (unsigned j = 0; j < 16; ++j) {
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+    // Kernel passed to for_.Run; the parameters n and m are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;
+      __m512 rs[8], is[8];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load one real/imaginary vector pair and derive the other 7 inputs as idx-permuted copies.
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[8 * l] = _mm512_load_ps(p0);
+        is[8 * l] = _mm512_load_ps(p0 + 16);
+
+        for (unsigned j = 1; j < 8; ++j) {
+          rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]);
+          is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w sequentially: two vectors (re, im) per n.
+      // Output = sum over n of (w[j] + i*w[j+1]) * (rs[n] + i*is[n]), with the n = 0 term first.
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 8; ++n) {
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0, rn);
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 4 + number of controls with q > 3.
+    unsigned k = 4 + cqs.size() - cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate4HHHH_H(const std::vector<unsigned>& qs,  // Controlled 4-qubit gate; qs[0..3] all become memory offsets.
+                                  const std::vector<unsigned>& cqs,  // Control qubits; all of them go into emaskh.
+                                  uint64_t cmask, const fp_type* matrix,  // Required control bit values; 16x16 matrix as (re, im) pairs.
+                                  Unitary& state) const {  // state is updated in place through state.get().
+    uint64_t xs[4];
+    uint64_t ms[5];  // Passed to the kernel below, whose body does not read it.
+    // xs[i] = 2^(qs[i] + 1): offset in fp_type elements tied to gate qubit qs[i].
+    xs[0] = uint64_t{1} << (qs[0] + 1);
+    ms[0] = (uint64_t{1} << qs[0]) - 1;
+    for (unsigned i = 1; i < 4; ++i) {
+      xs[i] = uint64_t{1} << (qs[i + 0] + 1);
+      ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1);
+    }
+    ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1);
+    // xss[i]: sum of xs[k] over the set bits k of i (offset of the i-th load/store).
+    uint64_t xss[16];
+    for (unsigned i = 0; i < 16; ++i) {
+      uint64_t a = 0;
+      for (uint64_t k = 0; k < 4; ++k) {
+        if (((i >> k) & 1) == 1) {
+          a += xs[k];
+        }
+      }
+      xss[i] = a;
+    }
+    // Collect every control qubit in emaskh and expand cmask onto those bit positions.
+    uint64_t emaskh = 0;
+
+    for (auto q : cqs) {
+      emaskh |= uint64_t{1} << q;
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh);
+
+    for (auto q : qs) {
+      emaskh |= uint64_t{1} << q;
+    }
+    // Complement the control/gate-qubit mask and flip bits 0-3; the kernel expands its index with it.
+    emaskh = ~emaskh ^ 15;
+    // Kernel passed to for_.Run; the parameters n, m and ms are not used in its body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      __m512 ru, iu, rn, in;
+      __m512 rs[16], is[16];
+      // Split i into ii = i % size (expanded into the column index c) and r = i / size.
+      uint64_t ii = i % size;
+      uint64_t r = i / size;
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;
+      // Load 16 vectors used as real parts and, 16 elements further, 16 used as imaginary parts.
+      for (unsigned l = 0; l < 16; ++l) {
+        rs[l] = _mm512_load_ps(p0 + xss[l]);
+        is[l] = _mm512_load_ps(p0 + xss[l] + 16);
+      }
+
+      uint64_t j = 0;  // Walks v sequentially: two scalars (re, im) per (l, n) pair.
+      // Output l = sum over n of (v[j] + i*v[j+1]) * (rs[n] + i*is[n]); each scalar is broadcast to all lanes.
+      for (unsigned l = 0; l < 16; ++l) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+        // This n is a new loop variable that shadows the lambda parameter n.
+        for (unsigned n = 1; n < 16; ++n) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[n], ru, rn);
+          in = _mm512_fmadd_ps(rs[n], iu, in);
+          rn = _mm512_fnmadd_ps(is[n], iu, rn);
+          in = _mm512_fmadd_ps(is[n], ru, in);
+
+          j += 2;
+        }
+        // Results overwrite the locations the inputs were loaded from (inputs are already in rs/is).
+        _mm512_store_ps(p0 + xss[l], rn);
+        _mm512_store_ps(p0 + xss[l] + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+    // size = 2^(num_qubits - k), or 1 if num_qubits <= k; k = 8 + number of controls.
+    unsigned k = 8 + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // size * size2 indices in total; the kernel maps each i to (i % size, i / size).
+    for_.Run(size * size2, f, matrix, ms, xss,
+             state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where all four target
+  // qubits are reached through memory strides and some control qubits may
+  // fall on the four lane bits of a 16-float register (q <= 3).
+  //
+  //   qs     - the four target qubits; each one contributes a stride.
+  //   cqs    - control qubits, split below into q <= 3 and q > 3.
+  //   cmask  - required control values; its lowest bits pair with the
+  //            controls <= 3, the remaining bits with the controls > 3.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HHHH_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of every target qubit and the
+    // bit ranges lying between consecutive targets.
+    uint64_t strides[4];
+    uint64_t gaps[5];
+
+    for (unsigned t = 0; t < 4; ++t) {
+      const unsigned q = qs[t];
+      strides[t] = uint64_t{1} << (q + 1);
+      gaps[t] = (uint64_t{1} << q) - 1;
+      if (t > 0) {
+        gaps[t] ^= strides[t - 1] - 1;
+      }
+    }
+    gaps[4] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[3] - 1);
+
+    // Offset of each of the 16 target-bit combinations from the base.
+    uint64_t offsets[16];
+    for (unsigned combo = 0; combo < 16; ++combo) {
+      uint64_t sum = 0;
+      for (unsigned t = 0; t < 4; ++t) {
+        sum += ((combo >> t) & 1) == 1 ? strides[t] : 0;
+      }
+      offsets[combo] = sum;
+    }
+
+    // Separate the controls living on lane bits from the ones above them.
+    unsigned num_low = 0;
+    uint64_t low_ctrl_bits = 0;
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      const uint64_t bit = uint64_t{1} << q;
+      if (q <= 3) {
+        ++num_low;
+        low_ctrl_bits |= bit;
+      } else {
+        addr_mask |= bit;
+      }
+    }
+
+    uint64_t ctrl_high =
+        bits::ExpandBits(cmask >> num_low, num_qubits, addr_mask);
+    uint64_t ctrl_low =
+        bits::ExpandBits(cmask & ((1 << num_low) - 1), 4, low_ctrl_bits);
+
+    for (const unsigned q : qs) {
+      addr_mask |= uint64_t{1} << q;
+    }
+
+    // Invert: what remains set are the bits that are neither targets,
+    // high controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(12);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    // NOTE(review): with qs[0] > 3 this mask presumably has no bit among the
+    // four lane bits, which would make k below always zero - confirm.
+    unsigned qmask = (1 << qs[0]);
+
+    // Expand every matrix element into one register of real parts followed
+    // by one register of imaginary parts. Lanes whose low control bits do
+    // not match receive the identity element instead.
+    for (unsigned blk = 0; blk < 16; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (16 * blk + 16 * k + m);
+          bool gated = ctrl_low == (lane & low_ctrl_bits);
+          fp_type unit = (e / 2) / 16 == (e / 2) % 16 ? 1 : 0;
+
+          re[lane] = gated ? matrix[e] : unit;
+          im[lane] = gated ? matrix[e + 1] : 0;
+        }
+      }
+    }
+
+    // Per-index kernel: gathers 16 complex registers, multiplies them by the
+    // expanded matrix and writes the 16 results back to the same places.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, uint64_t size,
+                     uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 16; ++g) {
+        auto src = p0 + xss[g];
+        rs[g] = _mm512_load_ps(src);
+        is[g] = _mm512_load_ps(src + 16);
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 16; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 4 targets and the high controls.
+    unsigned fixed = 8 + cqs.size() - num_low;
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where qs[0] is handled by
+  // permuting lanes inside a 16-float register and qs[1..3] are reached
+  // through memory strides. Every control qubit is folded into the address.
+  //
+  //   qs     - target qubits; qs[0] feeds the lane mask, qs[1..3] strides.
+  //   cqs    - control qubits, all merged into the address mask.
+  //   cmask  - required control values, expanded onto the control bits.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HHHL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of each strided target and the
+    // bit ranges lying between consecutive strided targets.
+    uint64_t strides[3];
+    uint64_t gaps[4];
+
+    for (unsigned t = 0; t < 3; ++t) {
+      const unsigned q = qs[t + 1];
+      strides[t] = uint64_t{1} << (q + 1);
+      gaps[t] = (uint64_t{1} << q) - 1;
+      if (t > 0) {
+        gaps[t] ^= strides[t - 1] - 1;
+      }
+    }
+    gaps[3] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[2] - 1);
+
+    // Offset of each of the 8 strided target-bit combinations.
+    uint64_t offsets[8];
+    for (unsigned combo = 0; combo < 8; ++combo) {
+      uint64_t sum = 0;
+      for (unsigned t = 0; t < 3; ++t) {
+        sum += ((combo >> t) & 1) == 1 ? strides[t] : 0;
+      }
+      offsets[combo] = sum;
+    }
+
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      addr_mask |= uint64_t{1} << q;
+    }
+
+    uint64_t ctrl_high = bits::ExpandBits(cmask, num_qubits, addr_mask);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither strided
+    // targets, controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(11);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+
+    // Lane permutation that steps the qmask bits of the lane index.
+    __m512i idx[1];
+
+    for (unsigned r = 0; r < 1; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 2) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers: 16 floats of real parts
+    // followed by 16 floats of imaginary parts for every table slot.
+    for (unsigned blk = 0; blk < 8; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (32 * blk + 16 * k + 2 * (m / 2) + (k + m) % 2);
+
+          re[lane] = matrix[e];
+          im[lane] = matrix[e + 1];
+        }
+      }
+    }
+
+    // Per-index kernel: loads 8 complex registers, derives the permuted
+    // copies, multiplies by the expanded matrix and stores 8 results.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 8; ++g) {
+        auto src = p0 + xss[g];
+        __m512* rv = rs + 2 * g;
+        __m512* iv = is + 2 * g;
+
+        rv[0] = _mm512_load_ps(src);
+        iv[0] = _mm512_load_ps(src + 16);
+
+        for (unsigned s = 1; s < 2; ++s) {
+          rv[s] = _mm512_permutexvar_ps(idx[s - 1], rv[0]);
+          iv[s] = _mm512_permutexvar_ps(idx[s - 1], iv[0]);
+        }
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 8; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 3 strided targets, all controls.
+    unsigned fixed = 7 + cqs.size();
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where qs[0] is handled by
+  // permuting lanes inside a 16-float register, qs[1..3] are reached
+  // through memory strides, and some control qubits may fall on the four
+  // lane bits (q <= 3).
+  //
+  //   qs     - target qubits; qs[0] feeds the lane mask, qs[1..3] strides.
+  //   cqs    - control qubits, split below into q <= 3 and q > 3.
+  //   cmask  - required control values; its lowest bits pair with the
+  //            controls <= 3, the remaining bits with the controls > 3.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HHHL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of each strided target and the
+    // bit ranges lying between consecutive strided targets.
+    uint64_t strides[3];
+    uint64_t gaps[4];
+
+    for (unsigned t = 0; t < 3; ++t) {
+      const unsigned q = qs[t + 1];
+      strides[t] = uint64_t{1} << (q + 1);
+      gaps[t] = (uint64_t{1} << q) - 1;
+      if (t > 0) {
+        gaps[t] ^= strides[t - 1] - 1;
+      }
+    }
+    gaps[3] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[2] - 1);
+
+    // Offset of each of the 8 strided target-bit combinations.
+    uint64_t offsets[8];
+    for (unsigned combo = 0; combo < 8; ++combo) {
+      uint64_t sum = 0;
+      for (unsigned t = 0; t < 3; ++t) {
+        sum += ((combo >> t) & 1) == 1 ? strides[t] : 0;
+      }
+      offsets[combo] = sum;
+    }
+
+    // Separate the controls living on lane bits from the ones above them.
+    unsigned num_low = 0;
+    uint64_t low_ctrl_bits = 0;
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      const uint64_t bit = uint64_t{1} << q;
+      if (q <= 3) {
+        ++num_low;
+        low_ctrl_bits |= bit;
+      } else {
+        addr_mask |= bit;
+      }
+    }
+
+    uint64_t ctrl_high =
+        bits::ExpandBits(cmask >> num_low, num_qubits, addr_mask);
+    uint64_t ctrl_low =
+        bits::ExpandBits(cmask & ((1 << num_low) - 1), 4, low_ctrl_bits);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither strided
+    // targets, high controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(11);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]);
+
+    // Lane permutation that steps the qmask bits of the lane index.
+    __m512i idx[1];
+
+    for (unsigned r = 0; r < 1; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 2) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers (real parts, then imaginary
+    // parts). Lanes whose low control bits do not match receive the
+    // identity element instead.
+    for (unsigned blk = 0; blk < 8; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (32 * blk + 16 * k + 2 * (m / 2) + (k + m) % 2);
+          bool gated = ctrl_low == (lane & low_ctrl_bits);
+          fp_type unit = (e / 2) / 16 == (e / 2) % 16 ? 1 : 0;
+
+          re[lane] = gated ? matrix[e] : unit;
+          im[lane] = gated ? matrix[e + 1] : 0;
+        }
+      }
+    }
+
+    // Per-index kernel: loads 8 complex registers, derives the permuted
+    // copies, multiplies by the expanded matrix and stores 8 results.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 8; ++g) {
+        auto src = p0 + xss[g];
+        __m512* rv = rs + 2 * g;
+        __m512* iv = is + 2 * g;
+
+        rv[0] = _mm512_load_ps(src);
+        iv[0] = _mm512_load_ps(src + 16);
+
+        for (unsigned s = 1; s < 2; ++s) {
+          rv[s] = _mm512_permutexvar_ps(idx[s - 1], rv[0]);
+          iv[s] = _mm512_permutexvar_ps(idx[s - 1], iv[0]);
+        }
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 8; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 3 strided targets, high controls.
+    unsigned fixed = 7 + cqs.size() - num_low;
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where qs[0] and qs[1] are
+  // handled by permuting lanes inside a 16-float register and qs[2..3] are
+  // reached through memory strides. Every control qubit is folded into the
+  // address.
+  //
+  //   qs     - target qubits; qs[0..1] feed the lane mask, qs[2..3] strides.
+  //   cqs    - control qubits, all merged into the address mask.
+  //   cmask  - required control values, expanded onto the control bits.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HHLL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of each strided target and the
+    // bit ranges lying between consecutive strided targets.
+    uint64_t strides[2];
+    uint64_t gaps[3];
+
+    for (unsigned t = 0; t < 2; ++t) {
+      const unsigned q = qs[t + 2];
+      strides[t] = uint64_t{1} << (q + 1);
+      gaps[t] = (uint64_t{1} << q) - 1;
+      if (t > 0) {
+        gaps[t] ^= strides[t - 1] - 1;
+      }
+    }
+    gaps[2] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[1] - 1);
+
+    // Offset of each of the 4 strided target-bit combinations.
+    uint64_t offsets[4];
+    for (unsigned combo = 0; combo < 4; ++combo) {
+      uint64_t sum = 0;
+      for (unsigned t = 0; t < 2; ++t) {
+        sum += ((combo >> t) & 1) == 1 ? strides[t] : 0;
+      }
+      offsets[combo] = sum;
+    }
+
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      addr_mask |= uint64_t{1} << q;
+    }
+
+    uint64_t ctrl_high = bits::ExpandBits(cmask, num_qubits, addr_mask);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither strided
+    // targets, controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(10);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+
+    // Lane permutations that step the qmask bits of the lane index by
+    // 1, 2 and 3.
+    __m512i idx[3];
+
+    for (unsigned r = 0; r < 3; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 4) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers: 16 floats of real parts
+    // followed by 16 floats of imaginary parts for every table slot.
+    for (unsigned blk = 0; blk < 4; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (64 * blk + 16 * k + 4 * (m / 4) + (k + m) % 4);
+
+          re[lane] = matrix[e];
+          im[lane] = matrix[e + 1];
+        }
+      }
+    }
+
+    // Per-index kernel: loads 4 complex registers, derives the permuted
+    // copies, multiplies by the expanded matrix and stores 4 results.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 4; ++g) {
+        auto src = p0 + xss[g];
+        __m512* rv = rs + 4 * g;
+        __m512* iv = is + 4 * g;
+
+        rv[0] = _mm512_load_ps(src);
+        iv[0] = _mm512_load_ps(src + 16);
+
+        for (unsigned s = 1; s < 4; ++s) {
+          rv[s] = _mm512_permutexvar_ps(idx[s - 1], rv[0]);
+          iv[s] = _mm512_permutexvar_ps(idx[s - 1], iv[0]);
+        }
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 4; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 2 strided targets, all controls.
+    unsigned fixed = 6 + cqs.size();
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where qs[0] and qs[1] are
+  // handled by permuting lanes inside a 16-float register, qs[2..3] are
+  // reached through memory strides, and some control qubits may fall on
+  // the four lane bits (q <= 3).
+  //
+  //   qs     - target qubits; qs[0..1] feed the lane mask, qs[2..3] strides.
+  //   cqs    - control qubits, split below into q <= 3 and q > 3.
+  //   cmask  - required control values; its lowest bits pair with the
+  //            controls <= 3, the remaining bits with the controls > 3.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HHLL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of each strided target and the
+    // bit ranges lying between consecutive strided targets.
+    uint64_t strides[2];
+    uint64_t gaps[3];
+
+    for (unsigned t = 0; t < 2; ++t) {
+      const unsigned q = qs[t + 2];
+      strides[t] = uint64_t{1} << (q + 1);
+      gaps[t] = (uint64_t{1} << q) - 1;
+      if (t > 0) {
+        gaps[t] ^= strides[t - 1] - 1;
+      }
+    }
+    gaps[2] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[1] - 1);
+
+    // Offset of each of the 4 strided target-bit combinations.
+    uint64_t offsets[4];
+    for (unsigned combo = 0; combo < 4; ++combo) {
+      uint64_t sum = 0;
+      for (unsigned t = 0; t < 2; ++t) {
+        sum += ((combo >> t) & 1) == 1 ? strides[t] : 0;
+      }
+      offsets[combo] = sum;
+    }
+
+    // Separate the controls living on lane bits from the ones above them.
+    unsigned num_low = 0;
+    uint64_t low_ctrl_bits = 0;
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      const uint64_t bit = uint64_t{1} << q;
+      if (q <= 3) {
+        ++num_low;
+        low_ctrl_bits |= bit;
+      } else {
+        addr_mask |= bit;
+      }
+    }
+
+    uint64_t ctrl_high =
+        bits::ExpandBits(cmask >> num_low, num_qubits, addr_mask);
+    uint64_t ctrl_low =
+        bits::ExpandBits(cmask & ((1 << num_low) - 1), 4, low_ctrl_bits);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither strided
+    // targets, high controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(10);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]);
+
+    // Lane permutations that step the qmask bits of the lane index by
+    // 1, 2 and 3.
+    __m512i idx[3];
+
+    for (unsigned r = 0; r < 3; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 4) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers (real parts, then imaginary
+    // parts). Lanes whose low control bits do not match receive the
+    // identity element instead.
+    for (unsigned blk = 0; blk < 4; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (64 * blk + 16 * k + 4 * (m / 4) + (k + m) % 4);
+          bool gated = ctrl_low == (lane & low_ctrl_bits);
+          fp_type unit = (e / 2) / 16 == (e / 2) % 16 ? 1 : 0;
+
+          re[lane] = gated ? matrix[e] : unit;
+          im[lane] = gated ? matrix[e + 1] : 0;
+        }
+      }
+    }
+
+    // Per-index kernel: loads 4 complex registers, derives the permuted
+    // copies, multiplies by the expanded matrix and stores 4 results.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 4; ++g) {
+        auto src = p0 + xss[g];
+        __m512* rv = rs + 4 * g;
+        __m512* iv = is + 4 * g;
+
+        rv[0] = _mm512_load_ps(src);
+        iv[0] = _mm512_load_ps(src + 16);
+
+        for (unsigned s = 1; s < 4; ++s) {
+          rv[s] = _mm512_permutexvar_ps(idx[s - 1], rv[0]);
+          iv[s] = _mm512_permutexvar_ps(idx[s - 1], iv[0]);
+        }
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 4; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 2 strided targets, high controls.
+    unsigned fixed = 6 + cqs.size() - num_low;
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where qs[0..2] are handled
+  // by permuting lanes inside a 16-float register and qs[3] is reached
+  // through a memory stride. Every control qubit is folded into the
+  // address.
+  //
+  //   qs     - target qubits; qs[0..2] feed the lane mask, qs[3] a stride.
+  //   cqs    - control qubits, all merged into the address mask.
+  //   cmask  - required control values, expanded onto the control bits.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HLLL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of the strided target and the
+    // bit ranges below and above it.
+    uint64_t strides[1];
+    uint64_t gaps[2];
+
+    strides[0] = uint64_t{1} << (qs[3] + 1);
+    gaps[0] = (uint64_t{1} << qs[3]) - 1;
+    gaps[1] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[0] - 1);
+
+    // Offsets for the strided target bit being 0 or 1.
+    uint64_t offsets[2] = {0, strides[0]};
+
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      addr_mask |= uint64_t{1} << q;
+    }
+
+    uint64_t ctrl_high = bits::ExpandBits(cmask, num_qubits, addr_mask);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither the strided
+    // target, controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(9);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+
+    // Lane permutations that step the qmask bits of the lane index by
+    // 1 through 7.
+    __m512i idx[7];
+
+    for (unsigned r = 0; r < 7; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 8) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers: 16 floats of real parts
+    // followed by 16 floats of imaginary parts for every table slot.
+    for (unsigned blk = 0; blk < 2; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (128 * blk + 16 * k + 8 * (m / 8) + (k + m) % 8);
+
+          re[lane] = matrix[e];
+          im[lane] = matrix[e + 1];
+        }
+      }
+    }
+
+    // Per-index kernel: loads 2 complex registers, derives the permuted
+    // copies, multiplies by the expanded matrix and stores 2 results.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 2; ++g) {
+        auto src = p0 + xss[g];
+        __m512* rv = rs + 8 * g;
+        __m512* iv = is + 8 * g;
+
+        rv[0] = _mm512_load_ps(src);
+        iv[0] = _mm512_load_ps(src + 16);
+
+        for (unsigned s = 1; s < 8; ++s) {
+          rv[s] = _mm512_permutexvar_ps(idx[s - 1], rv[0]);
+          iv[s] = _mm512_permutexvar_ps(idx[s - 1], iv[0]);
+        }
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 2; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 1 strided target, all controls.
+    unsigned fixed = 5 + cqs.size();
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where qs[0..2] are handled
+  // by permuting lanes inside a 16-float register, qs[3] is reached through
+  // a memory stride, and some control qubits may fall on the four lane
+  // bits (q <= 3).
+  //
+  //   qs     - target qubits; qs[0..2] feed the lane mask, qs[3] a stride.
+  //   cqs    - control qubits, split below into q <= 3 and q > 3.
+  //   cmask  - required control values; its lowest bits pair with the
+  //            controls <= 3, the remaining bits with the controls > 3.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4HLLL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    // Stride (in floats, two per amplitude) of the strided target and the
+    // bit ranges below and above it.
+    uint64_t strides[1];
+    uint64_t gaps[2];
+
+    strides[0] = uint64_t{1} << (qs[3] + 1);
+    gaps[0] = (uint64_t{1} << qs[3]) - 1;
+    gaps[1] = ((uint64_t{1} << num_qubits) - 1) ^ (strides[0] - 1);
+
+    // Offsets for the strided target bit being 0 or 1.
+    uint64_t offsets[2] = {0, strides[0]};
+
+    // Separate the controls living on lane bits from the ones above them.
+    unsigned num_low = 0;
+    uint64_t low_ctrl_bits = 0;
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      const uint64_t bit = uint64_t{1} << q;
+      if (q <= 3) {
+        ++num_low;
+        low_ctrl_bits |= bit;
+      } else {
+        addr_mask |= bit;
+      }
+    }
+
+    uint64_t ctrl_high =
+        bits::ExpandBits(cmask >> num_low, num_qubits, addr_mask);
+    uint64_t ctrl_low =
+        bits::ExpandBits(cmask & ((1 << num_low) - 1), 4, low_ctrl_bits);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither the strided
+    // target, high controls, nor one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(9);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]);
+
+    // Lane permutations that step the qmask bits of the lane index by
+    // 1 through 7.
+    __m512i idx[7];
+
+    for (unsigned r = 0; r < 7; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 8) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers (real parts, then imaginary
+    // parts). Lanes whose low control bits do not match receive the
+    // identity element instead.
+    for (unsigned blk = 0; blk < 2; ++blk) {
+      for (unsigned m = 0; m < 16; ++m) {
+        fp_type* re = wf + 16 * (2 * (16 * blk + m));
+        fp_type* im = re + 16;
+
+        for (unsigned lane = 0; lane < 16; ++lane) {
+          unsigned k = bits::CompressBits(lane, 4, qmask);
+          unsigned e = 2 * (128 * blk + 16 * k + 8 * (m / 8) + (k + m) % 8);
+          bool gated = ctrl_low == (lane & low_ctrl_bits);
+          fp_type unit = (e / 2) / 16 == (e / 2) % 16 ? 1 : 0;
+
+          re[lane] = gated ? matrix[e] : unit;
+          im[lane] = gated ? matrix[e + 1] : 0;
+        }
+      }
+    }
+
+    // Per-index kernel: loads 2 complex registers, derives the permuted
+    // copies, multiplies by the expanded matrix and stores 2 results.
+    // The first two arguments and the gap table are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, const uint64_t* /*ms*/,
+                     const uint64_t* xss, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+      for (unsigned g = 0; g < 2; ++g) {
+        auto src = p0 + xss[g];
+        __m512* rv = rs + 8 * g;
+        __m512* iv = is + 8 * g;
+
+        rv[0] = _mm512_load_ps(src);
+        iv[0] = _mm512_load_ps(src + 16);
+
+        for (unsigned s = 1; s < 8; ++s) {
+          rv[s] = _mm512_permutexvar_ps(idx[s - 1], rv[0]);
+          iv[s] = _mm512_permutexvar_ps(idx[s - 1], iv[0]);
+        }
+      }
+
+      const __m512* u = w;
+
+      for (unsigned g = 0; g < 2; ++g) {
+        __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+        __m512 in = _mm512_mul_ps(rs[0], u[1]);
+        rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+        in = _mm512_fmadd_ps(is[0], u[0], in);
+        u += 2;
+
+        for (unsigned s = 1; s < 16; ++s, u += 2) {
+          rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+          in = _mm512_fmadd_ps(rs[s], u[1], in);
+          rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+          in = _mm512_fmadd_ps(is[s], u[0], in);
+        }
+
+        auto dst = p0 + xss[g];
+        _mm512_store_ps(dst, rn);
+        _mm512_store_ps(dst + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: 4 lane bits, 1 strided target, high controls.
+    unsigned fixed = 5 + cqs.size() - num_low;
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w, gaps, offsets,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  // Controlled 4-qubit gate kernel for the case where all four target
+  // qubits are handled by permuting lanes inside a single 16-float
+  // register, so no strided loads are needed. Every control qubit is
+  // folded into the address.
+  //
+  //   qs     - target qubits; all four feed the lane mask.
+  //   cqs    - control qubits, all merged into the address mask.
+  //   cmask  - required control values, expanded onto the control bits.
+  //   matrix - 16x16 gate matrix stored as interleaved (re, im) values.
+  //   state  - unitary updated in place, one row per outer index.
+  void ApplyControlledGate4LLLL_H(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    uint64_t addr_mask = 0;
+
+    for (const unsigned q : cqs) {
+      addr_mask |= uint64_t{1} << q;
+    }
+
+    uint64_t ctrl_high = bits::ExpandBits(cmask, num_qubits, addr_mask);
+
+    for (const unsigned q : qs) {
+      if (q > 3) {
+        addr_mask |= uint64_t{1} << q;
+      }
+    }
+
+    // Invert: what remains set are the bits that are neither controls nor
+    // one of the four lane bits.
+    addr_mask = ~addr_mask ^ 15;
+
+    auto scratch = UnitarySpace::Create(8);
+    __m512* w = (__m512*) scratch.get();
+    fp_type* wf = (fp_type*) w;
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+
+    // Lane permutations that step the qmask bits of the lane index by
+    // 1 through 15.
+    __m512i idx[15];
+
+    for (unsigned r = 0; r < 15; ++r) {
+      unsigned perm[16];
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        perm[lane] = MaskedAdd(lane, r + 1, qmask, 16) | (lane & ~qmask);
+      }
+
+      idx[r] = _mm512_set_epi32(perm[15], perm[14], perm[13], perm[12],
+                                perm[11], perm[10], perm[9], perm[8],
+                                perm[7], perm[6], perm[5], perm[4],
+                                perm[3], perm[2], perm[1], perm[0]);
+    }
+
+    // Expand the matrix into per-lane registers: 16 floats of real parts
+    // followed by 16 floats of imaginary parts for each of the 16 slots.
+    for (unsigned m = 0; m < 16; ++m) {
+      fp_type* re = wf + 16 * (2 * m);
+      fp_type* im = re + 16;
+
+      for (unsigned lane = 0; lane < 16; ++lane) {
+        unsigned k = bits::CompressBits(lane, 4, qmask);
+        unsigned e = 2 * (16 * k + 16 * (m / 16) + (k + m) % 16);
+
+        re[lane] = matrix[e];
+        im[lane] = matrix[e + 1];
+      }
+    }
+
+    // Per-index kernel: loads one complex register, derives its 15 permuted
+    // copies, multiplies by the expanded matrix and stores one result.
+    // The first two arguments are accepted but not read.
+    auto kernel = [](unsigned /*n*/, unsigned /*m*/, uint64_t i,
+                     const __m512* w, unsigned num_qubits,
+                     uint64_t cmaskh, uint64_t emaskh, const __m512i* idx,
+                     uint64_t size, uint64_t row_size, fp_type* rstate) {
+      uint64_t packed = i % size;
+      uint64_t row = i / size;
+      uint64_t base = bits::ExpandBits(packed, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * row + 2 * base;
+
+      __m512 rs[16], is[16];
+
+      rs[0] = _mm512_load_ps(p0);
+      is[0] = _mm512_load_ps(p0 + 16);
+
+      for (unsigned s = 1; s < 16; ++s) {
+        rs[s] = _mm512_permutexvar_ps(idx[s - 1], rs[0]);
+        is[s] = _mm512_permutexvar_ps(idx[s - 1], is[0]);
+      }
+
+      const __m512* u = w;
+
+      __m512 rn = _mm512_mul_ps(rs[0], u[0]);
+      __m512 in = _mm512_mul_ps(rs[0], u[1]);
+      rn = _mm512_fnmadd_ps(is[0], u[1], rn);
+      in = _mm512_fmadd_ps(is[0], u[0], in);
+      u += 2;
+
+      for (unsigned s = 1; s < 16; ++s, u += 2) {
+        rn = _mm512_fmadd_ps(rs[s], u[0], rn);
+        in = _mm512_fmadd_ps(rs[s], u[1], in);
+        rn = _mm512_fnmadd_ps(is[s], u[1], rn);
+        in = _mm512_fmadd_ps(is[s], u[0], in);
+      }
+
+      _mm512_store_ps(p0, rn);
+      _mm512_store_ps(p0 + 16, in);
+    };
+
+    fp_type* rstate = state.get();
+
+    // Bits fixed per column: the 4 lane bits and all controls.
+    unsigned fixed = 4 + cqs.size();
+    unsigned free_count = num_qubits > fixed ? num_qubits - fixed : 0;
+    uint64_t size = uint64_t{1} << free_count;
+    uint64_t rows = uint64_t{1} << num_qubits;
+    uint64_t raw_size = UnitarySpace::MinRowSize(num_qubits);
+
+    for_.Run(size * rows, kernel, w,
+             num_qubits, ctrl_high, addr_mask, idx, size, raw_size, rstate);
+  }
+
+  void ApplyControlledGate4LLLL_L(const std::vector<unsigned>& qs,
+                                  const std::vector<unsigned>& cqs,
+                                  uint64_t cmask, const fp_type* matrix,
+                                  Unitary& state) const {  // Controlled 4-qubit gate; controls may be < 4.
+    unsigned cl = 0;  // Number of control qubits below 4.
+    uint64_t emaskl = 0;  // Bits of those low control qubits.
+    uint64_t emaskh = 0;  // Bits of controls above 3; high targets added below.
+
+    for (auto q : cqs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      } else {
+        ++cl;
+        emaskl |= uint64_t{1} << q;
+      }
+    }
+
+    uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh);
+    uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl);
+
+    for (auto q : qs) {
+      if (q > 3) {
+        emaskh |= uint64_t{1} << q;
+      }
+    }
+
+    emaskh = ~emaskh ^ 15;  // Complement with bits 0-3 cleared.
+
+    unsigned p[16];  // Scratch: lane permutations, then matrix offsets.
+    __m512i idx[15];  // idx[i] is built for a shift of i + 1.
+
+    auto s = UnitarySpace::Create(8);
+    __m512* w = (__m512*) s.get();  // Rearranged gate matrix, filled below.
+    fp_type* wf = (fp_type*) w;  // Scalar view of w.
+
+    unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]);
+
+    for (unsigned i = 0; i < 15; ++i) {  // One permutation per shift 1..15.
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);  // p[0] is element 0.
+    }
+
+    for (unsigned i = 0; i < 1; ++i) {  // Single pass; i is always 0.
+      for (unsigned m = 0; m < 16; ++m) {
+        for (unsigned j = 0; j < 16; ++j) {
+          unsigned k = bits::CompressBits(j, 4, qmask);
+          p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16);
+        }
+
+        unsigned l = 2 * (16 * i + m);  // w[l]: real parts, w[l + 1]: imaginary.
+
+        for (unsigned j = 0; j < 16; ++j) {  // Lanes failing the low-control test get v.
+          fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0;  // Identity.
+          wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v;
+        }
+
+        for (unsigned j = 0; j < 16; ++j) {  // Imaginary parts; 0 where the test fails.
+          wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0;
+        }
+      }
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      __m512 rn, in;  // Accumulators: real and imaginary results.
+      __m512 rs[16], is[16];  // Input block and its 15 permuted copies.
+
+      uint64_t ii = i % size;  // Work item index within the row.
+      uint64_t r = i / size;  // Row index.
+      uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh;
+
+      auto p0 = rstate + row_size * r + 2 * c;  // Block address in row r.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rs[16 * l] = _mm512_load_ps(p0);  // 16 real parts.
+        is[16 * l] = _mm512_load_ps(p0 + 16);  // 16 imaginary parts.
+
+        for (unsigned j = 1; j < 16; ++j) {  // Lane-permuted copies.
+          rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]);
+          is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; advances by 2 per term.
+
+      for (unsigned l = 0; l < 1; ++l) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // rn + i*in = sum (rs + i*is) * w.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned n = 1; n < 16; ++n) {  // Remaining 15 terms.
+          rn = _mm512_fmadd_ps(rs[n], w[j], rn);
+          in = _mm512_fmadd_ps(rs[n], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[n], w[j], in);
+
+          j += 2;
+        }
+
+        _mm512_store_ps(p0, rn);  // Results overwrite the loaded block.
+        _mm512_store_ps(p0 + 16, in);
+      }
+    };
+
+    fp_type* rstate = state.get();
+
+    unsigned k = 4 + cqs.size() - cl;  // Targets plus high controls only.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Work items per row.
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows.
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w,
+             state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate);
+  }
+
+  static unsigned MaskedAdd(
+      unsigned a, unsigned b, unsigned mask, unsigned lsize) {
+    unsigned c = bits::CompressBits(a, 4, mask);  // Presumably a's mask bits.
+    return bits::ExpandBits((c + b) % lsize, 4, mask);  // Add modulo lsize.
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_AVX512_H_
diff --git a/lib/unitaryspace_avx512.h b/lib/unitaryspace_avx512.h
new file mode 100644
index 000000000..ec1a6b5d1
--- /dev/null
+++ b/lib/unitaryspace_avx512.h
@@ -0,0 +1,110 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_AVX512_H_
+#define UNITARYSPACE_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * State is a vectorized sequence of sixteen real components followed by
+ * sixteen imaginary components. Sixteen single-precision floating-point
+ * numbers can be loaded into an AVX512 register.
+ */
+template <typename For>
+struct UnitarySpaceAVX512 :
+    public UnitarySpace<UnitarySpaceAVX512<For>, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceAVX512<For>, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {  // fp_type values per row.
+    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  static uint64_t MinSize(unsigned num_qubits) {  // fp_type values in total.
+    return Base::Size(num_qubits) * MinRowSize(num_qubits);
+  };
+
+  void SetAllZeros(Unitary& state) const {  // Zeroes the whole buffer.
+    __m512 val0 = _mm512_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, val0);  // First 16 values of block i.
+      _mm512_store_ps(p + 32 * i + 16, val0);  // Second 16 values.
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {  // Zeroes, then sets the diagonal to 1.
+    SetAllZeros(state);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                uint64_t row_size, fp_type* p) {
+      p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1;  // Re of entry (i, i).
+    };
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    Base::for_.Run(size, f, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (32 * (j / 16)) + (j % 16);  // Re offset; Im is 16 further.
+    return std::complex<fp_type>(state.get()[row_size * i + k],
+                                 state.get()[row_size * i + k + 16]);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (32 * (j / 16)) + (j % 16);  // Same layout as GetEntry.
+    state.get()[row_size * i + k] = std::real(ampl);
+    state.get()[row_size * i + k + 16] = std::imag(ampl);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (32 * (j / 16)) + (j % 16);  // Same layout as GetEntry.
+    state.get()[row_size * i + k] = re;
+    state.get()[row_size * i + k + 16] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_AVX512_H_
diff --git a/tests/BUILD b/tests/BUILD
index aac574b73..015464670 100644
--- a/tests/BUILD
+++ b/tests/BUILD
@@ -1,5 +1,6 @@
 # Options for testing different simulator types.
 avx_copts = ['-mavx2', '-mfma']
+avx512_copts = ['-mavx512f']  # Default (non-Windows) flags for AVX512 tests.
 sse_copts = ['-msse4']
 
 windows_copts = [
@@ -7,6 +8,11 @@ windows_copts = [
     "/std:c++14",
 ]
 
+windows_avx512_copts = [  # Used instead of avx512_copts under ":windows".
+    "/arch:AVX512",
+    "/std:c++14",
+]
+
 config_setting(
     name = "windows",
     constraint_values = ["@bazel_tools//platforms:windows"],
@@ -250,6 +256,22 @@ cc_test(
     }),
 )
 
+cc_test(
+    name = "simulator_avx512_test",
+    srcs = ["simulator_avx512_test.cc"],
+    deps = [
+        ":simulator_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:parfor",
+        "//lib:seqfor",
+        "//lib:simulator_avx512",
+    ],
+    copts = select({  # AVX512 flags; Windows uses its own list.
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+)
+
 cc_test(
     name = "simulator_basic_test",
     srcs = ["simulator_basic_test.cc"],
@@ -317,6 +339,23 @@ cc_test(
     }),
 )
 
+cc_test(
+    name = "statespace_avx512_test",
+    srcs = ["statespace_avx512_test.cc"],
+    deps = [
+        ":statespace_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:parfor",
+        "//lib:seqfor",
+        "//lib:simulator_avx512",  # The test source includes simulator_avx512.h.
+        "//lib:statespace_avx512",
+    ],
+    copts = select({  # AVX512 flags; Windows uses its own list.
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+)
+
 cc_test(
     name = "statespace_basic_test",
     srcs = ["statespace_basic_test.cc"],
@@ -379,6 +418,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "unitaryspace_avx512_test",
+    srcs = ["unitaryspace_avx512_test.cc"],
+    copts = select({  # AVX512 flags; Windows uses its own list.
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+    deps = [
+        ":unitaryspace_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:formux",
+        "//lib:unitaryspace_avx512",
+    ],
+)
+
 cc_test(
     name = "unitaryspace_basic_test",
     srcs = ["unitaryspace_basic_test.cc"],
@@ -418,7 +472,6 @@ cc_library(
     }),
     deps = [
         "@com_google_googletest//:gtest_main",
-        "//lib:bits",
         "//lib:fuser",
         "//lib:gate_appl",
         "//lib:gates_cirq",
@@ -442,6 +495,22 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "unitary_calculator_avx512_test",
+    srcs = ["unitary_calculator_avx512_test.cc"],
+    copts = select({  # AVX512 flags; Windows uses its own list.
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+    deps = [
+        ":unitary_calculator_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:formux",
+        "//lib:unitaryspace_avx512",
+        "//lib:unitary_calculator_avx512",
+    ],
+)
+
 cc_test(
     name = "unitary_calculator_basic_test",
     srcs = ["unitary_calculator_basic_test.cc"],
diff --git a/tests/make.sh b/tests/make.sh
index 63d4fcfde..6abf59860 100755
--- a/tests/make.sh
+++ b/tests/make.sh
@@ -33,15 +33,19 @@ g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o qtrajectory_t
 g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o run_qsim_test.x run_qsim_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o run_qsimh_test.x run_qsimh_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o simulator_avx_test.x simulator_avx_test.cc -lgtest -lpthread
+g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -fopenmp -o simulator_avx512_test.x simulator_avx512_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o simulator_basic_test.x simulator_basic_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o simulator_sse_test.x simulator_sse_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o statespace_avx_test.x statespace_avx_test.cc -lgtest -lpthread
+g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -fopenmp -o statespace_avx512_test.x statespace_avx512_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o statespace_basic_test.x statespace_basic_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o statespace_sse_test.x statespace_sse_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitary_calculator_avx_test.x unitary_calculator_avx_test.cc -lgtest -lpthread
+g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mfma -fopenmp -o unitary_calculator_avx512_test.x unitary_calculator_avx512_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitary_calculator_basic_test.x unitary_calculator_basic_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitary_calculator_sse_test.x unitary_calculator_sse_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitaryspace_avx_test.x unitaryspace_avx_test.cc -lgtest -lpthread
+g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mfma -fopenmp -o unitaryspace_avx512_test.x unitaryspace_avx512_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitaryspace_basic_test.x unitaryspace_basic_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitaryspace_sse_test.x unitaryspace_sse_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -o vectorspace_test.x vectorspace_test.cc -lgtest -lpthread
diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc
new file mode 100644
index 000000000..ec3b7b2ab
--- /dev/null
+++ b/tests/simulator_avx512_test.cc
@@ -0,0 +1,84 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "simulator_testfixture.h"
+
+#include "gtest/gtest.h"
+
+#ifdef _OPENMP
+#include "../lib/parfor.h"
+#endif
+#include "../lib/seqfor.h"
+#include "../lib/simulator_avx512.h"
+
+namespace qsim {
+
+template <class T>
+class SimulatorAVX512Test : public testing::Test {};  // Empty typed fixture.
+
+using ::testing::Types;
+#ifdef _OPENMP
+typedef Types<ParallelFor, SequentialFor> for_impl;  // Both loop runners.
+#else
+typedef Types<SequentialFor> for_impl;  // parfor.h is only included with OpenMP.
+#endif
+
+TYPED_TEST_SUITE(SimulatorAVX512Test, for_impl);  // TypeParam is a For type.
+
+TYPED_TEST(SimulatorAVX512Test, ApplyGate1) {
+  TestApplyGate1<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, ApplyGate2) {
+  TestApplyGate2<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, ApplyGate3) {
+  TestApplyGate3<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, ApplyGate5) {
+  TestApplyGate5<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, CircuitWithControlledGates) {
+  TestCircuitWithControlledGates<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, CircuitWithControlledGatesDagger) {
+  TestCircuitWithControlledGatesDagger<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, MultiQubitGates) {
+  TestMultiQubitGates<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, ControlledGates) {
+  TestControlledGates<SimulatorAVX512<TypeParam>>(false);  // NOTE(review): flag meaning is set by the fixture.
+}
+
+TYPED_TEST(SimulatorAVX512Test, ExpectationValue1) {
+  TestExpectationValue1<SimulatorAVX512<TypeParam>>();
+}
+
+TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) {
+  TestExpectationValue2<SimulatorAVX512<TypeParam>>();
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);  // Let gtest consume its flags.
+  return RUN_ALL_TESTS();  // Exit status reflects the test outcome.
+}
diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc
new file mode 100644
index 000000000..a7333f9e3
--- /dev/null
+++ b/tests/statespace_avx512_test.cc
@@ -0,0 +1,105 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "statespace_testfixture.h"
+
+#include "gtest/gtest.h"
+
+#ifdef _OPENMP
+#include "../lib/parfor.h"
+#endif
+#include "../lib/seqfor.h"
+#include "../lib/simulator_avx512.h"
+#include "../lib/statespace_avx512.h"
+
+namespace qsim {
+
+template <class T>
+class StateSpaceAVX512Test : public testing::Test {};  // Empty typed fixture.
+
+using ::testing::Types;
+#ifdef _OPENMP
+typedef Types<ParallelFor, SequentialFor> for_impl;  // Both loop runners.
+#else
+typedef Types<SequentialFor> for_impl;  // parfor.h is only included with OpenMP.
+#endif
+
+TYPED_TEST_SUITE(StateSpaceAVX512Test, for_impl);  // TypeParam is a For type.
+
+TYPED_TEST(StateSpaceAVX512Test, Add) {
+  TestAdd<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, NormSmall) {
+  TestNormSmall<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, NormAndInnerProductSmall) {
+  TestNormAndInnerProductSmall<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, NormAndInnerProduct) {
+  TestNormAndInnerProduct<SimulatorAVX512<TypeParam>>();  // Takes a simulator.
+}
+
+TYPED_TEST(StateSpaceAVX512Test, SamplingSmall) {
+  TestSamplingSmall<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, SamplingCrossEntropyDifference) {
+  TestSamplingCrossEntropyDifference<SimulatorAVX512<TypeParam>>();  // Takes a simulator.
+}
+
+TYPED_TEST(StateSpaceAVX512Test, Ordering) {
+  TestOrdering<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, MeasurementSmall) {
+  TestMeasurementSmall<StateSpaceAVX512<TypeParam>, TypeParam>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, MeasurementLarge) {
+  TestMeasurementLarge<SimulatorAVX512<TypeParam>>();  // Takes a simulator.
+}
+
+TYPED_TEST(StateSpaceAVX512Test, Collapse) {
+  TestCollapse<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, InvalidStateSize) {
+  TestInvalidStateSize<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, BulkSetAmpl) {
+  TestBulkSetAmplitude<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, BulkSetAmplExclude) {
+  TestBulkSetAmplitudeExclusion<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, BulkSetAmplDefault) {
+  TestBulkSetAmplitudeDefault<StateSpaceAVX512<TypeParam>>();
+}
+
+TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) {
+  TestThreadThrashing<StateSpaceAVX512<TypeParam>>();
+}
+
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);  // Let gtest consume its flags.
+  return RUN_ALL_TESTS();  // Exit status reflects the test outcome.
+}
diff --git a/tests/statespace_avx_test.cc b/tests/statespace_avx_test.cc
index f2ae4e210..5bc1128ea 100644
--- a/tests/statespace_avx_test.cc
+++ b/tests/statespace_avx_test.cc
@@ -73,6 +73,10 @@ TYPED_TEST(StateSpaceAVXTest, MeasurementLarge) {
   TestMeasurementLarge<SimulatorAVX<TypeParam>>();
 }
 
+TYPED_TEST(StateSpaceAVXTest, Collapse) {
+  TestCollapse<StateSpaceAVX<TypeParam>>();
+}
+
 TYPED_TEST(StateSpaceAVXTest, InvalidStateSize) {
   TestInvalidStateSize<StateSpaceAVX<TypeParam>>();
 }
diff --git a/tests/statespace_basic_test.cc b/tests/statespace_basic_test.cc
index eb5c53069..6eaf6d38e 100644
--- a/tests/statespace_basic_test.cc
+++ b/tests/statespace_basic_test.cc
@@ -73,6 +73,10 @@ TYPED_TEST(StateSpaceBasicTest, MeasurementLarge) {
   TestMeasurementLarge<SimulatorBasic<TypeParam, float>>();
 }
 
+TYPED_TEST(StateSpaceBasicTest, Collapse) {
+  TestCollapse<StateSpaceBasic<TypeParam, float>>();
+}
+
 TYPED_TEST(StateSpaceBasicTest, InvalidStateSize) {
   TestInvalidStateSize<StateSpaceBasic<TypeParam, float>>();
 }
diff --git a/tests/statespace_sse_test.cc b/tests/statespace_sse_test.cc
index b1e0810bd..419976d0c 100644
--- a/tests/statespace_sse_test.cc
+++ b/tests/statespace_sse_test.cc
@@ -73,6 +73,10 @@ TYPED_TEST(StateSpaceSSETest, MeasurementLarge) {
   TestMeasurementLarge<SimulatorSSE<TypeParam>>();
 }
 
+TYPED_TEST(StateSpaceSSETest, Collapse) {
+  TestCollapse<StateSpaceSSE<TypeParam>>();
+}
+
 TYPED_TEST(StateSpaceSSETest, InvalidStateSize) {
   TestInvalidStateSize<StateSpaceSSE<TypeParam>>();
 }
diff --git a/tests/statespace_testfixture.h b/tests/statespace_testfixture.h
index 6dff358cd..265afe20c 100644
--- a/tests/statespace_testfixture.h
+++ b/tests/statespace_testfixture.h
@@ -359,13 +359,16 @@ void TestAdd() {
 
   constexpr unsigned num_qubits = 2;
   StateSpace state_space(1);
+
   State state1 = state_space.Create(num_qubits);
+  state_space.SetAllZeros(state1);
   state_space.SetAmpl(state1, 0, 1, 2);
   state_space.SetAmpl(state1, 1, 3, 4);
   state_space.SetAmpl(state1, 2, 5, 6);
   state_space.SetAmpl(state1, 3, 7, 8);
 
   State state2 = state_space.Create(num_qubits);
+  state_space.SetAllZeros(state2);
   state_space.SetAmpl(state2, 0, 1, 2);
   state_space.SetAmpl(state2, 1, 3, 4);
   state_space.SetAmpl(state2, 2, 5, 6);
@@ -528,6 +531,8 @@ void TestSamplingSmall() {
 
   EXPECT_FALSE(state_space.IsNull(state));
 
+  state_space.SetAllZeros(state);
+
   std::array<float, size> ps = {0.1, 0.2, 0.13, 0.12, 0.18, 0.15, 0.07, 0.05};
 
   for (uint64_t i = 0; i < size; ++i) {
@@ -597,31 +602,33 @@ void TestOrdering() {
   using fp_type = typename StateSpace::fp_type;
   using State = typename StateSpace::State;
 
-  for (unsigned num_qubits : {1, 2, 5}) {
+  for (unsigned num_qubits : {1, 2, 3, 4, 5, 6, 7, 8, 9}) {
     uint64_t size = uint64_t{1} << num_qubits;
 
     StateSpace state_space(1);
-    uint64_t raw_size = state_space.MinSize(num_qubits);
-    std::vector<fp_type> vec(raw_size, 0);
-    State state = state_space.Create(vec.data(), num_qubits);
+    State state = state_space.Create(num_qubits);
+
+    state_space.SetAllZeros(state);
 
     for (uint64_t i = 0; i < size; ++i) {
-      state_space.SetAmpl(state, i, std::complex<fp_type>(i, size + i));
+      state_space.SetAmpl(state, i, std::complex<fp_type>(i + 1, size + i + 1));
     }
 
     state_space.InternalToNormalOrder(state);
 
+    const fp_type* vec = state.get();
+
     for (uint64_t i = 0; i < size; ++i) {
-      EXPECT_NEAR(vec[2 * i], fp_type(i), 1e-8);
-      EXPECT_NEAR(vec[2 * i + 1], fp_type(size + i), 1e-8);
+      EXPECT_NEAR(vec[2 * i], fp_type(i + 1), 1e-8);
+      EXPECT_NEAR(vec[2 * i + 1], fp_type(size + i + 1), 1e-8);
     }
 
     state_space.NormalToInternalOrder(state);
 
     for (uint64_t i = 0; i < size; ++i) {
       auto a = state_space.GetAmpl(state, i);
-      EXPECT_NEAR(std::real(a), fp_type(i), 1e-8);
-      EXPECT_NEAR(std::imag(a), fp_type(size + i), 1e-8);
+      EXPECT_NEAR(std::real(a), fp_type(i + 1), 1e-8);
+      EXPECT_NEAR(std::imag(a), fp_type(size + i + 1), 1e-8);
     }
   }
 }
@@ -716,6 +723,12 @@ void TestMeasurementSmall() {
   MeasureSmall<S>(num_measurements, 1, 3, {2}, ps3, rgen);
   MeasureSmall<S>(num_measurements, 1, 3, {0, 1, 2}, ps3, rgen);
 
+  std::vector<float> ps4 = {
+    0.06, 0.10, 0.07, 0.06, 0.09, 0.08, 0.03, 0.03,
+    0.03, 0.07, 0.11, 0.04, 0.05, 0.06, 0.07, 0.05
+  };
+  MeasureSmall<S>(num_measurements, 1, 4, {0, 3}, ps4, rgen);
+
   std::vector<float> ps5 = {
     0.041, 0.043, 0.028, 0.042, 0.002, 0.008, 0.039, 0.020,
     0.017, 0.030, 0.020, 0.048, 0.020, 0.044, 0.032, 0.048,
@@ -788,6 +801,53 @@ void TestMeasurementLarge() {
   }
 }
 
+template <typename StateSpace>
+void TestCollapse() {  // Checks Collapse() on uniform states of 2-6 qubits.
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+  using MeasurementResult = typename StateSpace::MeasurementResult;
+
+  MeasurementResult mr;
+  StateSpace state_space(1);
+
+  for (unsigned num_qubits = 2; num_qubits <= 6; ++num_qubits) {
+    State state = state_space.Create(num_qubits);
+
+    unsigned size = 1 << num_qubits;  // Number of amplitudes.
+
+    for (unsigned mask = 1; mask < size; ++mask) {  // Every non-empty qubit set.
+      for (unsigned bits = 0; bits < size; ++bits) {
+        if ((mask | bits) != mask) continue;  // bits must lie within mask.
+
+        mr.mask = mask;  // Measured qubits.
+        mr.bits = bits;  // Outcome on those qubits.
+
+        unsigned num_measured_qubits = 0;  // Population count of mask.
+        for (unsigned i = 0; i < num_qubits; ++i) {
+          if (((mask >> i) & 1) == 1) {
+            ++num_measured_qubits;
+          }
+        }
+
+        fp_type r = 1.0 / std::sqrt(1 << (num_qubits - num_measured_qubits));
+
+        state_space.SetStateUniform(state);  // Fresh input for each case.
+        state_space.Collapse(mr, state);
+
+        EXPECT_NEAR(state_space.Norm(state), 1.0, 1e-6);
+
+        for (unsigned i = 0; i < size; ++i) {
+          if ((i & mask) != bits) {  // Inconsistent with the outcome.
+            EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(0, 0));
+          } else {  // Consistent: expected amplitude is r.
+            EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(r, 0));
+          }
+        }
+      }
+    }
+  }
+}
+
 template <typename StateSpace>
 void TestInvalidStateSize() {
   using State = typename StateSpace::State;
@@ -812,123 +872,69 @@ void TestInvalidStateSize() {
 template <typename StateSpace>
 void TestBulkSetAmplitude() {
   using State = typename StateSpace::State;
-  unsigned num_qubits = 3;
 
   StateSpace state_space(1);
 
-  State state = state_space.Create(num_qubits);
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
-  }
-  state_space.BulkSetAmpl(state, 1, 0, 0, 0, false);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(1, 1));
+  for (unsigned num_qubits = 2; num_qubits <= 6; ++num_qubits) {
+    State state = state_space.Create(num_qubits);
+    state_space.SetAllZeros(state);
 
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
-  }
-  state_space.BulkSetAmpl(state, 2, 0, 0, 0, false);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(1, 1));
+    unsigned size = 1 << num_qubits;
 
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
-  }
-  state_space.BulkSetAmpl(state, 4, 0, 0, 0, false);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(1, 1));
+    for (unsigned mask = 1; mask < size; ++mask) {
+      for (unsigned bits = 0; bits < size; ++bits) {
+        if ((mask | bits) != mask) continue;
 
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
+        for (unsigned i = 0; i < size; ++i) {
+          state_space.SetAmpl(state, i, 1, 1);
+        }
+
+        state_space.BulkSetAmpl(state, mask, bits, 0, 0, false);
+
+        for (unsigned i = 0; i < size; ++i) {
+          if ((i & mask) == bits) {
+            EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(0, 0));
+          } else {
+            EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(1, 1));
+          }
+        }
+      }
+    }
   }
-  state_space.BulkSetAmpl(state, 4 | 1, 4, 0, 0, false);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(1, 1));
 }
 
 template <typename StateSpace>
 void TestBulkSetAmplitudeExclusion() {
   using State = typename StateSpace::State;
-  unsigned num_qubits = 3;
 
   StateSpace state_space(1);
 
-  State state = state_space.Create(num_qubits);
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
-  }
-  state_space.BulkSetAmpl(state, 1, 0, 0, 0, true);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(0, 0));
+  for (unsigned num_qubits = 2; num_qubits <= 6; ++num_qubits) {
+    State state = state_space.Create(num_qubits);
+    state_space.SetAllZeros(state);
 
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
-  }
-  state_space.BulkSetAmpl(state, 2, 0, 0, 0, true);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(0, 0));
+    unsigned size = 1 << num_qubits;
 
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
-  }
-  state_space.BulkSetAmpl(state, 4, 0, 0, 0, true);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(0, 0));
+    for (unsigned mask = 1; mask < size; ++mask) {
+      for (unsigned bits = 0; bits < size; ++bits) {
+        if ((mask | bits) != mask) continue;
 
-  for(int i = 0; i < 8; i++) {
-    state_space.SetAmpl(state, i, 1, 1);
+        for (unsigned i = 0; i < size; ++i) {
+          state_space.SetAmpl(state, i, 1, 1);
+        }
+
+        state_space.BulkSetAmpl(state, mask, bits, 0, 0, true);
+
+        for (unsigned i = 0; i < size; ++i) {
+          if ((i & mask) != bits) {
+            EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(0, 0));
+          } else {
+            EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(1, 1));
+          }
+        }
+      }
+    }
   }
-  state_space.BulkSetAmpl(state, 4 | 1, 4, 0, 0, true);
-  EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex<float>(0, 0));
-  EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex<float>(1, 1));
-  EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex<float>(0, 0));
 }
 
 template <typename StateSpace>
@@ -939,7 +945,9 @@ void TestBulkSetAmplitudeDefault() {
   StateSpace state_space(1);
 
   State state = state_space.Create(num_qubits);
-  for(int i = 0; i < 8; i++) {
+  state_space.SetAllZeros(state);
+
+  for (unsigned i = 0; i < 8; ++i) {
     state_space.SetAmpl(state, i, 1, 1);
   }
   state_space.BulkSetAmpl(state, 4 | 1, 4, 0, 0);
@@ -957,13 +965,17 @@ template <typename StateSpace>
 void TestThreadThrashing() {
   using State = typename StateSpace::State;
   StateSpace state_space(1024);
-  unsigned n_qubits = 13;
-  State state = state_space.Create(n_qubits);  // must be larger than MIN_SIZE.
-  for (int i = 0; i < 100; i++) {
+  unsigned num_qubits = 13;
+
+  State state = state_space.Create(num_qubits);  // must be larger than MIN_SIZE.
+  for (unsigned i = 0; i < 100; ++i) {
     state_space.SetStateZero(state);
   }
+
   EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex<float>(1, 0));
-  for (int i = 1; i < (1 << n_qubits); i++) {
+
+  unsigned size = 1 << num_qubits;
+  for (unsigned i = 1; i < size; ++i) {
     EXPECT_EQ(state_space.GetAmpl(state, i), std::complex<float>(0, 0));
   }
 }
diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc
new file mode 100644
index 000000000..24381b69a
--- /dev/null
+++ b/tests/unitary_calculator_avx512_test.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "unitary_calculator_testfixture.h"
+
+#include "gtest/gtest.h"
+
+#include "../lib/formux.h"
+#include "../lib/unitary_calculator_avx512.h"
+
+namespace qsim {
+namespace unitary {
+namespace {
+
+TEST(UnitaryCalculatorAVX512Test, ApplyGate1) {
+  TestApplyGate1<UnitaryCalculatorAVX512<For>>();
+}
+
+TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate1) {
+  TestApplyControlledGate1<UnitaryCalculatorAVX512<For>>();
+}
+
+TEST(UnitaryCalculatorAVX512Test, ApplyGate2) {
+  TestApplyGate2<UnitaryCalculatorAVX512<For>>();
+}
+
+TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate2) {
+  TestApplyControlledGate2<UnitaryCalculatorAVX512<For>>();
+}
+
+TEST(UnitaryCalculatorAVX512Test, ApplyFusedGate) {
+  TestApplyFusedGate<UnitaryCalculatorAVX512<For>>();
+}
+
+TEST(UnitaryCalculatorAVX512Test, ApplyGates) {
+  TestApplyGates<UnitaryCalculatorAVX512<For>>(false);
+}
+
+TEST(UnitaryCalculatorAVX512Test, ApplyControlledGates) {
+  TestApplyControlledGates<UnitaryCalculatorAVX512<For>>(false);
+}
+
+TEST(UnitaryCalculatorAVX512Test, SmallCircuits) {
+  TestSmallCircuits<UnitaryCalculatorAVX512<For>>();
+}
+
+}  // namespace
+}  // namespace unitary
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/unitary_calculator_testfixture.h b/tests/unitary_calculator_testfixture.h
index b6a5880e3..3b7707c38 100644
--- a/tests/unitary_calculator_testfixture.h
+++ b/tests/unitary_calculator_testfixture.h
@@ -20,7 +20,6 @@
 #include <cmath>
 #include <vector>
 
-#include "../lib/bits.h"
 #include "../lib/fuser.h"
 #include "../lib/gate_appl.h"
 #include "../lib/gates_cirq.h"
@@ -530,7 +529,7 @@ void TestApplyControlledGates(bool test_double) {
   using UnitarySpace = typename UnitaryCalculator::UnitarySpace;
   using fp_type = typename UnitaryCalculator::fp_type;
 
-  unsigned max_qubits = 4 + std::log2(UnitaryCalculator::SIMDRegisterSize());
+  unsigned max_qubits = 3 + std::log2(UnitaryCalculator::SIMDRegisterSize());
   unsigned max_target_qubits = 3;
   unsigned max_control_qubits = 2;
 
diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc
new file mode 100644
index 000000000..61260d678
--- /dev/null
+++ b/tests/unitaryspace_avx512_test.cc
@@ -0,0 +1,46 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "unitaryspace_testfixture.h"
+
+#include "gtest/gtest.h"
+
+#include "../lib/formux.h"
+#include "../lib/unitaryspace_avx512.h"
+
+namespace qsim {
+
+namespace unitary {
+namespace {
+
+TEST(UnitarySpaceAVX512Test, SetZero) {
+  TestSetZeros<UnitarySpaceAVX512<For>>();
+}
+
+TEST(UnitarySpaceAVX512Test, SetIdentity) {
+  TestSetIdentity<UnitarySpaceAVX512<For>>();
+}
+
+TEST(UnitarySpaceAVX512Test, GetEntry) {
+  TestSetEntry<UnitarySpaceAVX512<For>>();
+}
+
+}  // namespace
+}  // namespace unitary
+}  // namespace qsim
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

From 442b6ba323e701b5387cacade24abfca9a80b363 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Tue, 20 Apr 2021 18:35:33 +0200
Subject: [PATCH 02/10] Add support for avx512.

---
 .bazelrc                        | 4 ++++
 .github/workflows/bazeltest.yml | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 79fe3007b..a2895b8bc 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -29,6 +29,10 @@ build:avx --copt -O3
 build:avx --copt -mavx2
 build:avx --copt -mfma
 
+# Build with AVX512
+build:avx512 --copt -O3
+build:avx512 --copt -mavx512f
+
 # Build with SSE
 build:sse --copt -O3
 build:sse --copt -msse4
diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml
index 1bbfa1592..018da315b 100644
--- a/.github/workflows/bazeltest.yml
+++ b/.github/workflows/bazeltest.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         # Hardware optimizers.
-        hardware_opt: [avx,sse,basic]
+        hardware_opt: [avx,avx512,sse,basic]
         # Optimizers for parallelism.
         parallel_opt: [openmp,nopenmp]
 
@@ -58,7 +58,7 @@ jobs:
         run: |
           bazel test --config=avx --config=openmp \
           --config=${{ matrix.sanitizer_opt }} tests:all
-  
+
   test-mem:
     name: Test with tcmalloc
     runs-on: ubuntu-16.04

From f87657d421afe0d37c5778194eae739d591524f5 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Tue, 20 Apr 2021 18:42:10 +0200
Subject: [PATCH 03/10] Add support for avx512.

---
 lib/statespace_avx512.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lib/statespace_avx512.h b/lib/statespace_avx512.h
index 3b98250be..07b7485fc 100644
--- a/lib/statespace_avx512.h
+++ b/lib/statespace_avx512.h
@@ -47,8 +47,24 @@ inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) {
   return (m2 << 8) | m1;
 }
 
+inline double HorizontalSumAVX(__m256 s) {
+  __m128 l = _mm256_castps256_ps128(s);
+  __m128 h = _mm256_extractf128_ps(s, 1);
+  __m128 s1  = _mm_add_ps(h, l);
+  __m128 s1s = _mm_movehdup_ps(s1);
+  __m128 s2 = _mm_add_ps(s1, s1s);
+
+  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
+}
+
 inline double HorizontalSumAVX512(__m512 s) {
-  return _mm512_reduce_add_ps(s);
+  __m256 l = _mm512_castps512_ps256(s);
+  __m512d sd = _mm512_castps_pd(s);
+  __m256d hd = _mm512_extractf64x4_pd(sd, 1);
+  __m256 h = _mm256_castpd_ps(hd);
+  __m256 p = _mm256_add_ps(h, l);
+
+  return HorizontalSumAVX(p);
 }
 
 }  // namespace detail

From 34bbd5666b579a68eba1c12d27df406586eae836 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Tue, 20 Apr 2021 19:17:41 +0200
Subject: [PATCH 04/10] Add support for avx512.

---
 .github/workflows/bazeltest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml
index 018da315b..6064db68a 100644
--- a/.github/workflows/bazeltest.yml
+++ b/.github/workflows/bazeltest.yml
@@ -56,7 +56,7 @@ jobs:
           sudo dpkg -i bazel_0.26.0-linux-x86_64.deb
       - name: Run C++ tests
         run: |
-          bazel test --config=avx --config=openmp \
+          bazel test --config=avx512 --config=openmp \
           --config=${{ matrix.sanitizer_opt }} tests:all
 
   test-mem:
@@ -75,5 +75,5 @@ jobs:
         run: sudo apt-get install libgoogle-perftools-dev
       - name: Run C++ tests
         run: |
-          bazel test --config=avx --config=openmp \
+          bazel test --config=avx512 --config=openmp \
           --config=tcmalloc tests:all

From f4bda1a80e2e4cfcb66ca3b19e2b5d94fe8d1171 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Wed, 21 Apr 2021 13:01:47 +0200
Subject: [PATCH 05/10] Remove avx512 tests from BUILD and config files.

---
 .bazelrc                        |  4 --
 .github/workflows/bazeltest.yml |  8 ++--
 tests/BUILD                     | 70 ---------------------------------
 3 files changed, 4 insertions(+), 78 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index a2895b8bc..79fe3007b 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -29,10 +29,6 @@ build:avx --copt -O3
 build:avx --copt -mavx2
 build:avx --copt -mfma
 
-# Build with AVX512
-build:avx512 --copt -O3
-build:avx512 --copt -mavx512f
-
 # Build with SSE
 build:sse --copt -O3
 build:sse --copt -msse4
diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml
index 6064db68a..1bbfa1592 100644
--- a/.github/workflows/bazeltest.yml
+++ b/.github/workflows/bazeltest.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         # Hardware optimizers.
-        hardware_opt: [avx,avx512,sse,basic]
+        hardware_opt: [avx,sse,basic]
         # Optimizers for parallelism.
         parallel_opt: [openmp,nopenmp]
 
@@ -56,9 +56,9 @@ jobs:
           sudo dpkg -i bazel_0.26.0-linux-x86_64.deb
       - name: Run C++ tests
         run: |
-          bazel test --config=avx512 --config=openmp \
+          bazel test --config=avx --config=openmp \
           --config=${{ matrix.sanitizer_opt }} tests:all
-
+  
   test-mem:
     name: Test with tcmalloc
     runs-on: ubuntu-16.04
@@ -75,5 +75,5 @@ jobs:
         run: sudo apt-get install libgoogle-perftools-dev
       - name: Run C++ tests
         run: |
-          bazel test --config=avx512 --config=openmp \
+          bazel test --config=avx --config=openmp \
           --config=tcmalloc tests:all
diff --git a/tests/BUILD b/tests/BUILD
index 015464670..d15286f96 100644
--- a/tests/BUILD
+++ b/tests/BUILD
@@ -1,6 +1,5 @@
 # Options for testing different simulator types.
 avx_copts = ['-mavx2', '-mfma']
-avx512_copts = ['-mavx512f']
 sse_copts = ['-msse4']
 
 windows_copts = [
@@ -8,11 +7,6 @@ windows_copts = [
     "/std:c++14",
 ]
 
-windows_avx512_copts = [
-    "/arch:AVX512",
-    "/std:c++14",
-]
-
 config_setting(
     name = "windows",
     constraint_values = ["@bazel_tools//platforms:windows"],
@@ -256,22 +250,6 @@ cc_test(
     }),
 )
 
-cc_test(
-    name = "simulator_avx512_test",
-    srcs = ["simulator_avx512_test.cc"],
-    deps = [
-        ":simulator_testfixture",
-        "@com_google_googletest//:gtest_main",
-        "//lib:parfor",
-        "//lib:seqfor",
-        "//lib:simulator_avx512",
-    ],
-    copts = select({
-        ":windows": windows_avx512_copts,
-        "//conditions:default": avx512_copts,
-    }),
-)
-
 cc_test(
     name = "simulator_basic_test",
     srcs = ["simulator_basic_test.cc"],
@@ -339,23 +317,6 @@ cc_test(
     }),
 )
 
-cc_test(
-    name = "statespace_avx512_test",
-    srcs = ["statespace_avx512_test.cc"],
-    deps = [
-        ":statespace_testfixture",
-        "@com_google_googletest//:gtest_main",
-        "//lib:parfor",
-        "//lib:seqfor",
-        "//lib:simulator_avx512",
-        "//lib:statespace_avx512",
-    ],
-    copts = select({
-        ":windows": windows_avx512_copts,
-        "//conditions:default": avx512_copts,
-    }),
-)
-
 cc_test(
     name = "statespace_basic_test",
     srcs = ["statespace_basic_test.cc"],
@@ -418,21 +379,6 @@ cc_test(
     ],
 )
 
-cc_test(
-    name = "unitaryspace_avx512_test",
-    srcs = ["unitaryspace_avx512_test.cc"],
-    copts = select({
-        ":windows": windows_avx512_copts,
-        "//conditions:default": avx512_copts,
-    }),
-    deps = [
-        ":unitaryspace_testfixture",
-        "@com_google_googletest//:gtest_main",
-        "//lib:formux",
-        "//lib:unitaryspace_avx512"
-    ],
-)
-
 cc_test(
     name = "unitaryspace_basic_test",
     srcs = ["unitaryspace_basic_test.cc"],
@@ -495,22 +441,6 @@ cc_test(
     ],
 )
 
-cc_test(
-    name = "unitary_calculator_avx512_test",
-    srcs = ["unitary_calculator_avx512_test.cc"],
-    copts = select({
-        ":windows": windows_avx512_copts,
-        "//conditions:default": avx512_copts,
-    }),
-    deps = [
-        ":unitary_calculator_testfixture",
-        "@com_google_googletest//:gtest_main",
-        "//lib:formux",
-        "//lib:unitaryspace_avx512",
-        "//lib:unitary_calculator_avx512",
-    ],
-)
-
 cc_test(
     name = "unitary_calculator_basic_test",
     srcs = ["unitary_calculator_basic_test.cc"],

From 3c9e463dc9508ef6a6f9ff31b376550f07bfb5e2 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Wed, 21 Apr 2021 13:20:47 +0200
Subject: [PATCH 06/10] Remove avx512 tests from tests/Makefile.

---
 tests/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/Makefile b/tests/Makefile
index cc2c8299b..087af58be 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,4 +1,4 @@
-TARGETS = $(shell find . -maxdepth 1 -name '*_test.cc')
+TARGETS = $(shell find . -maxdepth 1 -name '*_test.cc' ! -name '*512*')
 TARGETS := $(TARGETS:%.cc=%.x)
 
 GTEST_DIR = $(CURDIR)/googletest/googletest

From d3273ba97003b85463ae606f02ccd0e130799c01 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Thu, 22 Apr 2021 15:13:05 +0200
Subject: [PATCH 07/10] Fix excessive memory usage in unitary_calculator_*.h.

---
 lib/unitary_calculator_avx.h    | 74 ++++++++++++++---------------
 lib/unitary_calculator_avx512.h | 84 ++++++++++++++++-----------------
 lib/unitary_calculator_sse.h    | 58 +++++++++++------------
 3 files changed, 108 insertions(+), 108 deletions(-)

diff --git a/lib/unitary_calculator_avx.h b/lib/unitary_calculator_avx.h
index a01612d83..519ff26c1 100644
--- a/lib/unitary_calculator_avx.h
+++ b/lib/unitary_calculator_avx.h
@@ -322,7 +322,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -513,7 +513,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -609,7 +609,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -805,7 +805,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -919,7 +919,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1015,7 +1015,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1211,7 +1211,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1330,7 +1330,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1444,7 +1444,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1643,7 +1643,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(12);
+    auto s = UnitarySpace::Create(6);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1762,7 +1762,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1881,7 +1881,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2081,7 +2081,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(14);
+    auto s = UnitarySpace::Create(7);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2200,7 +2200,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(13);
+    auto s = UnitarySpace::Create(7);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2319,7 +2319,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(12);
+    auto s = UnitarySpace::Create(6);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2551,7 +2551,7 @@ class UnitaryCalculatorAVX final {
 
     unsigned p[8];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2654,7 +2654,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2777,7 +2777,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3018,7 +3018,7 @@ class UnitaryCalculatorAVX final {
 
     unsigned p[8];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3139,7 +3139,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3281,7 +3281,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3398,7 +3398,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3521,7 +3521,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3762,7 +3762,7 @@ class UnitaryCalculatorAVX final {
 
     unsigned p[8];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3887,7 +3887,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4033,7 +4033,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4168,7 +4168,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4310,7 +4310,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4427,7 +4427,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4550,7 +4550,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4791,7 +4791,7 @@ class UnitaryCalculatorAVX final {
 
     unsigned p[8];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4916,7 +4916,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5062,7 +5062,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[1];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5201,7 +5201,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5347,7 +5347,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[3];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5482,7 +5482,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5624,7 +5624,7 @@ class UnitaryCalculatorAVX final {
     unsigned p[8];
     __m256i idx[7];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m256* w = (__m256*) s.get();
     fp_type* wf = (fp_type*) w;
 
diff --git a/lib/unitary_calculator_avx512.h b/lib/unitary_calculator_avx512.h
index 174f44af1..57ba64848 100644
--- a/lib/unitary_calculator_avx512.h
+++ b/lib/unitary_calculator_avx512.h
@@ -334,7 +334,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -527,7 +527,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -625,7 +625,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -823,7 +823,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -939,7 +939,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1037,7 +1037,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1235,7 +1235,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1356,7 +1356,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1472,7 +1472,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1570,7 +1570,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[15];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1768,7 +1768,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(13);
+    auto s = UnitarySpace::Create(7);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1889,7 +1889,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(12);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2010,7 +2010,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2126,7 +2126,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[15];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2328,7 +2328,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(15);
+    auto s = UnitarySpace::Create(8);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2449,7 +2449,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(14);
+    auto s = UnitarySpace::Create(7);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2570,7 +2570,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(13);
+    auto s = UnitarySpace::Create(7);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2691,7 +2691,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[15];
 
-    auto s = UnitarySpace::Create(12);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2924,7 +2924,7 @@ class UnitaryCalculatorAVX512 final {
 
     unsigned p[16];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3027,7 +3027,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3152,7 +3152,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3395,7 +3395,7 @@ class UnitaryCalculatorAVX512 final {
 
     unsigned p[16];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3516,7 +3516,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3660,7 +3660,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3779,7 +3779,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3904,7 +3904,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4147,7 +4147,7 @@ class UnitaryCalculatorAVX512 final {
 
     unsigned p[16];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4272,7 +4272,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4420,7 +4420,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4557,7 +4557,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4701,7 +4701,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4820,7 +4820,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4945,7 +4945,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5188,7 +5188,7 @@ class UnitaryCalculatorAVX512 final {
 
     unsigned p[16];
 
-    auto s = UnitarySpace::Create(12);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5313,7 +5313,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5461,7 +5461,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[1];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5602,7 +5602,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5750,7 +5750,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[3];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -5887,7 +5887,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -6031,7 +6031,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[7];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -6150,7 +6150,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[15];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -6275,7 +6275,7 @@ class UnitaryCalculatorAVX512 final {
     unsigned p[16];
     __m512i idx[15];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m512* w = (__m512*) s.get();
     fp_type* wf = (fp_type*) w;
 
diff --git a/lib/unitary_calculator_sse.h b/lib/unitary_calculator_sse.h
index 89bb21d20..2e458526e 100644
--- a/lib/unitary_calculator_sse.h
+++ b/lib/unitary_calculator_sse.h
@@ -301,7 +301,7 @@ class UnitaryCalculatorSSE final {
                    const fp_type* matrix, Unitary& state) const {
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(3);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -483,7 +483,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -570,7 +570,7 @@ class UnitaryCalculatorSSE final {
                     const fp_type* matrix, Unitary& state) const {
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -758,7 +758,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -863,7 +863,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1054,7 +1054,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1164,7 +1164,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1355,7 +1355,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(11);
+    auto s = UnitarySpace::Create(6);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1465,7 +1465,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1658,7 +1658,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(13);
+    auto s = UnitarySpace::Create(7);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1768,7 +1768,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(12);
+    auto s = UnitarySpace::Create(6);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -1993,7 +1993,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2095,7 +2095,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(3);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2209,7 +2209,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(3);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2442,7 +2442,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2562,7 +2562,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2695,7 +2695,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(5);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2803,7 +2803,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -2918,7 +2918,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(4);
+    auto s = UnitarySpace::Create(2);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3152,7 +3152,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3276,7 +3276,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3413,7 +3413,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(7);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3539,7 +3539,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3673,7 +3673,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(6);
+    auto s = UnitarySpace::Create(3);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -3908,7 +3908,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(10);
+    auto s = UnitarySpace::Create(5);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4032,7 +4032,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4169,7 +4169,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(9);
+    auto s = UnitarySpace::Create(5);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4299,7 +4299,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 
@@ -4437,7 +4437,7 @@ class UnitaryCalculatorSSE final {
 
     unsigned p[4];
 
-    auto s = UnitarySpace::Create(8);
+    auto s = UnitarySpace::Create(4);
     __m128* w = (__m128*) s.get();
     fp_type* wf = (fp_type*) w;
 

From 09e9607cc428f4df13cd6a3ee9846948cc729fd4 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Fri, 23 Apr 2021 19:24:12 +0200
Subject: [PATCH 08/10] Add runtime detection of AVX512 in tests.

---
 tests/BUILD                             | 79 +++++++++++++++++++++++++
 tests/Makefile                          |  9 ++-
 tests/cpuinfo.h                         | 42 +++++++++++++
 tests/simulator_avx512_test.cc          | 10 +++-
 tests/statespace_avx512_test.cc         | 10 +++-
 tests/unitary_calculator_avx512_test.cc | 26 +++++---
 tests/unitaryspace_avx512_test.cc       | 17 ++++--
 7 files changed, 178 insertions(+), 15 deletions(-)
 create mode 100644 tests/cpuinfo.h

diff --git a/tests/BUILD b/tests/BUILD
index d15286f96..531a4f255 100644
--- a/tests/BUILD
+++ b/tests/BUILD
@@ -1,5 +1,6 @@
 # Options for testing different simulator types.
 avx_copts = ['-mavx2', '-mfma']
+avx512_copts = ['-mavx512f']
 sse_copts = ['-msse4']
 
 windows_copts = [
@@ -7,6 +8,11 @@ windows_copts = [
     "/std:c++14",
 ]
 
+windows_avx512_copts = [
+    "/arch:AVX512",
+    "/std:c++14",
+]
+
 config_setting(
     name = "windows",
     constraint_values = ["@bazel_tools//platforms:windows"],
@@ -250,6 +256,23 @@ cc_test(
     }),
 )
 
+cc_test(
+    name = "simulator_avx512_test",
+    srcs = ["simulator_avx512_test.cc"],
+    deps = [
+        ":cpuinfo",
+        ":simulator_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:parfor",
+        "//lib:seqfor",
+        "//lib:simulator_avx512",
+    ],
+    copts = select({
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+)
+
 cc_test(
     name = "simulator_basic_test",
     srcs = ["simulator_basic_test.cc"],
@@ -317,6 +340,24 @@ cc_test(
     }),
 )
 
+cc_test(
+    name = "statespace_avx512_test",
+    srcs = ["statespace_avx512_test.cc"],
+    deps = [
+        ":cpuinfo",
+        ":statespace_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:parfor",
+        "//lib:seqfor",
+        "//lib:simulator_avx512",
+        "//lib:statespace_avx512",
+    ],
+    copts = select({
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+)
+
 cc_test(
     name = "statespace_basic_test",
     srcs = ["statespace_basic_test.cc"],
@@ -379,6 +420,22 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "unitaryspace_avx512_test",
+    srcs = ["unitaryspace_avx512_test.cc"],
+    copts = select({
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+    deps = [
+        ":cpuinfo",
+        ":unitaryspace_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:formux",
+        "//lib:unitaryspace_avx512"
+    ],
+)
+
 cc_test(
     name = "unitaryspace_basic_test",
     srcs = ["unitaryspace_basic_test.cc"],
@@ -441,6 +498,23 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "unitary_calculator_avx512_test",
+    srcs = ["unitary_calculator_avx512_test.cc"],
+    copts = select({
+        ":windows": windows_avx512_copts,
+        "//conditions:default": avx512_copts,
+    }),
+    deps = [
+        ":cpuinfo",
+        ":unitary_calculator_testfixture",
+        "@com_google_googletest//:gtest_main",
+        "//lib:formux",
+        "//lib:unitaryspace_avx512",
+        "//lib:unitary_calculator_avx512",
+    ],
+)
+
 cc_test(
     name = "unitary_calculator_basic_test",
     srcs = ["unitary_calculator_basic_test.cc"],
@@ -486,3 +560,8 @@ cc_test(
         "//lib:vectorspace",
     ],
 )
+
+cc_library(
+    name = "cpuinfo",
+    hdrs = ["cpuinfo.h"],
+)
diff --git a/tests/Makefile b/tests/Makefile
index 087af58be..016771e48 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,4 +1,11 @@
-TARGETS = $(shell find . -maxdepth 1 -name '*_test.cc' ! -name '*512*')
+HAVE_AVX512 := $(shell grep -m1 -o avx512f /proc/cpuinfo 2>/dev/null)
+TARGETS = $(shell\
+  if [ -z "$(HAVE_AVX512)" ] ; then\
+    find . -maxdepth 1 -name "*_test.cc" ! -name "*512*";\
+  else\
+    find . -maxdepth 1 -name "*_test.cc";\
+  fi\
+)
 TARGETS := $(TARGETS:%.cc=%.x)
 
 GTEST_DIR = $(CURDIR)/googletest/googletest
diff --git a/tests/cpuinfo.h b/tests/cpuinfo.h
new file mode 100644
index 000000000..3f1f68689
--- /dev/null
+++ b/tests/cpuinfo.h
@@ -0,0 +1,42 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CPUINFO_H_
+#define CPUINFO_H_
+
+#ifdef _WIN32
+#include <intrin.h>
+#else
+#include <cpuid.h>
+#endif
+
+namespace qsim {
+
+inline bool HaveAVX512() {
+  unsigned info[4];
+
+#ifdef _WIN32
+  __cpuidex(info, 7, 0);
+#else
+  if (__get_cpuid_count(7, 0, info, info + 1, info + 2, info + 3) == 0) {
+    info[1] = 0;
+  }
+#endif
+
+  return (info[1] & (unsigned{1} << 16)) != 0;
+}
+
+}  // namespace qsim
+
+#endif  // CPUINFO_H_
diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc
index ec3b7b2ab..e5c57f854 100644
--- a/tests/simulator_avx512_test.cc
+++ b/tests/simulator_avx512_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "simulator_testfixture.h"
+#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
@@ -25,7 +26,14 @@
 namespace qsim {
 
 template <class T>
-class SimulatorAVX512Test : public testing::Test {};
+class SimulatorAVX512Test : public testing::Test {
+ protected:
+  void SetUp() override {
+    if (!HaveAVX512()) {
+      GTEST_SKIP() << "Skipping all AVX512 tests.";
+    }
+  }
+};
 
 using ::testing::Types;
 #ifdef _OPENMP
diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc
index a7333f9e3..3b8cd34ed 100644
--- a/tests/statespace_avx512_test.cc
+++ b/tests/statespace_avx512_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "statespace_testfixture.h"
+#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
@@ -26,7 +27,14 @@
 namespace qsim {
 
 template <class T>
-class StateSpaceAVX512Test : public testing::Test {};
+class StateSpaceAVX512Test : public testing::Test {
+ protected:
+  void SetUp() override {
+    if (!HaveAVX512()) {
+      GTEST_SKIP() << "Skipping all AVX512 tests.";
+    }
+  }
+};
 
 using ::testing::Types;
 #ifdef _OPENMP
diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc
index 24381b69a..4c9314171 100644
--- a/tests/unitary_calculator_avx512_test.cc
+++ b/tests/unitary_calculator_avx512_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "unitary_calculator_testfixture.h"
+#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
@@ -23,35 +24,44 @@ namespace qsim {
 namespace unitary {
 namespace {
 
-TEST(UnitaryCalculatorAVX512Test, ApplyGate1) {
+class UnitaryCalculatorAVX512Test : public testing::Test {
+ protected:
+  void SetUp() override {
+    if (!HaveAVX512()) {
+      GTEST_SKIP() << "Skipping all AVX512 tests.";
+    }
+  }
+};
+
+TEST_F(UnitaryCalculatorAVX512Test, ApplyGate1) {
   TestApplyGate1<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate1) {
+TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate1) {
   TestApplyControlledGate1<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST(UnitaryCalculatorAVX512Test, ApplyGate2) {
+TEST_F(UnitaryCalculatorAVX512Test, ApplyGate2) {
   TestApplyGate2<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate2) {
+TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate2) {
   TestApplyControlledGate2<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST(UnitaryCalculatorAVX512Test, ApplyFusedGate) {
+TEST_F(UnitaryCalculatorAVX512Test, ApplyFusedGate) {
   TestApplyFusedGate<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST(UnitaryCalculatorAVX512Test, ApplyGates) {
+TEST_F(UnitaryCalculatorAVX512Test, ApplyGates) {
   TestApplyGates<UnitaryCalculatorAVX512<For>>(false);
 }
 
-TEST(UnitaryCalculatorAVX512Test, ApplyControlledGates) {
+TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGates) {
   TestApplyControlledGates<UnitaryCalculatorAVX512<For>>(false);
 }
 
-TEST(UnitaryCalculatorAVX512Test, SmallCircuits) {
+TEST_F(UnitaryCalculatorAVX512Test, SmallCircuits) {
   TestSmallCircuits<UnitaryCalculatorAVX512<For>>();
 }
 
diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc
index 61260d678..460c8f1ed 100644
--- a/tests/unitaryspace_avx512_test.cc
+++ b/tests/unitaryspace_avx512_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "unitaryspace_testfixture.h"
+#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
@@ -20,19 +21,27 @@
 #include "../lib/unitaryspace_avx512.h"
 
 namespace qsim {
-
 namespace unitary {
 namespace {
 
-TEST(UnitarySpaceAVX512Test, SetZero) {
+class UnitarySpaceAVX512Test : public testing::Test {
+ protected:
+  void SetUp() override {
+    if (!HaveAVX512()) {
+      GTEST_SKIP() << "Skipping all AVX512 tests.";
+    }
+  }
+};
+
+TEST_F(UnitarySpaceAVX512Test, SetZero) {
   TestSetZeros<UnitarySpaceAVX512<For>>();
 }
 
-TEST(UnitarySpaceAVX512Test, SetIdentity) {
+TEST_F(UnitarySpaceAVX512Test, SetIdentity) {
   TestSetIdentity<UnitarySpaceAVX512<For>>();
 }
 
-TEST(UnitarySpaceAVX512Test, GetEntry) {
+TEST_F(UnitarySpaceAVX512Test, GetEntry) {
   TestSetEntry<UnitarySpaceAVX512<For>>();
 }
 

From b11741500dd5c1ca768c1c10e1d1562f75edd5b1 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Mon, 26 Apr 2021 16:26:29 +0200
Subject: [PATCH 09/10] Remove runtime detection of AVX512 in tests.

---
 tests/BUILD                             | 11 +------
 tests/cpuinfo.h                         | 42 -------------------------
 tests/simulator_avx512_test.cc          | 14 +++------
 tests/statespace_avx512_test.cc         | 14 +++------
 tests/unitary_calculator_avx512_test.cc | 30 +++++++-----------
 tests/unitaryspace_avx512_test.cc       | 20 +++++-------
 6 files changed, 30 insertions(+), 101 deletions(-)
 delete mode 100644 tests/cpuinfo.h

diff --git a/tests/BUILD b/tests/BUILD
index 531a4f255..f60b952e2 100644
--- a/tests/BUILD
+++ b/tests/BUILD
@@ -1,6 +1,6 @@
 # Options for testing different simulator types.
 avx_copts = ['-mavx2', '-mfma']
-avx512_copts = ['-mavx512f']
+avx512_copts = ['-march=native']
 sse_copts = ['-msse4']
 
 windows_copts = [
@@ -260,7 +260,6 @@ cc_test(
     name = "simulator_avx512_test",
     srcs = ["simulator_avx512_test.cc"],
     deps = [
-        ":cpuinfo",
         ":simulator_testfixture",
         "@com_google_googletest//:gtest_main",
         "//lib:parfor",
@@ -344,7 +343,6 @@ cc_test(
     name = "statespace_avx512_test",
     srcs = ["statespace_avx512_test.cc"],
     deps = [
-        ":cpuinfo",
         ":statespace_testfixture",
         "@com_google_googletest//:gtest_main",
         "//lib:parfor",
@@ -428,7 +426,6 @@ cc_test(
         "//conditions:default": avx512_copts,
     }),
     deps = [
-        ":cpuinfo",
         ":unitaryspace_testfixture",
         "@com_google_googletest//:gtest_main",
         "//lib:formux",
@@ -506,7 +503,6 @@ cc_test(
         "//conditions:default": avx512_copts,
     }),
     deps = [
-        ":cpuinfo",
         ":unitary_calculator_testfixture",
         "@com_google_googletest//:gtest_main",
         "//lib:formux",
@@ -560,8 +556,3 @@ cc_test(
         "//lib:vectorspace",
     ],
 )
-
-cc_library(
-    name = "cpuinfo",
-    hdrs = ["cpuinfo.h"],
-)
diff --git a/tests/cpuinfo.h b/tests/cpuinfo.h
deleted file mode 100644
index 3f1f68689..000000000
--- a/tests/cpuinfo.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CPUINFO_H_
-#define CPUINFO_H_
-
-#ifdef _WIN32
-#include <intrin.h>
-#else
-#include <cpuid.h>
-#endif
-
-namespace qsim {
-
-inline bool HaveAVX512() {
-  unsigned info[4];
-
-#ifdef _WIN32
-  __cpuidex(info, 7, 0);
-#else
-  if (__get_cpuid_count(7, 0, info, info + 1, info + 2, info + 3) == 0) {
-    info[1] = 0;
-  }
-#endif
-
-  return (info[1] & (unsigned{1} << 16)) != 0;
-}
-
-}  // namespace qsim
-
-#endif  // CPUINFO_H_
diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc
index e5c57f854..448666447 100644
--- a/tests/simulator_avx512_test.cc
+++ b/tests/simulator_avx512_test.cc
@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "simulator_testfixture.h"
-#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
+#ifdef __AVX512F__
+
 #ifdef _OPENMP
 #include "../lib/parfor.h"
 #endif
@@ -26,14 +27,7 @@
 namespace qsim {
 
 template <class T>
-class SimulatorAVX512Test : public testing::Test {
- protected:
-  void SetUp() override {
-    if (!HaveAVX512()) {
-      GTEST_SKIP() << "Skipping all AVX512 tests.";
-    }
-  }
-};
+class SimulatorAVX512Test : public testing::Test {};
 
 using ::testing::Types;
 #ifdef _OPENMP
@@ -86,6 +80,8 @@ TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) {
 
 }  // namespace qsim
 
+#endif  // __AVX512F__
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc
index 3b8cd34ed..53c751168 100644
--- a/tests/statespace_avx512_test.cc
+++ b/tests/statespace_avx512_test.cc
@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "statespace_testfixture.h"
-#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
+#ifdef __AVX512F__
+
 #ifdef _OPENMP
 #include "../lib/parfor.h"
 #endif
@@ -27,14 +28,7 @@
 namespace qsim {
 
 template <class T>
-class StateSpaceAVX512Test : public testing::Test {
- protected:
-  void SetUp() override {
-    if (!HaveAVX512()) {
-      GTEST_SKIP() << "Skipping all AVX512 tests.";
-    }
-  }
-};
+class StateSpaceAVX512Test : public testing::Test {};
 
 using ::testing::Types;
 #ifdef _OPENMP
@@ -107,6 +101,8 @@ TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) {
 
 }  // namespace qsim
 
+#endif  // __AVX512F__
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc
index 4c9314171..10de2db07 100644
--- a/tests/unitary_calculator_avx512_test.cc
+++ b/tests/unitary_calculator_avx512_test.cc
@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "unitary_calculator_testfixture.h"
-#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
+#ifdef __AVX512F__
+
 #include "../lib/formux.h"
 #include "../lib/unitary_calculator_avx512.h"
 
@@ -24,44 +25,35 @@ namespace qsim {
 namespace unitary {
 namespace {
 
-class UnitaryCalculatorAVX512Test : public testing::Test {
- protected:
-  void SetUp() override {
-    if (!HaveAVX512()) {
-      GTEST_SKIP() << "Skipping all AVX512 tests.";
-    }
-  }
-};
-
-TEST_F(UnitaryCalculatorAVX512Test, ApplyGate1) {
+TEST(UnitaryCalculatorAVX512Test, ApplyGate1) {
   TestApplyGate1<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate1) {
+TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate1) {
   TestApplyControlledGate1<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, ApplyGate2) {
+TEST(UnitaryCalculatorAVX512Test, ApplyGate2) {
   TestApplyGate2<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate2) {
+TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate2) {
   TestApplyControlledGate2<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, ApplyFusedGate) {
+TEST(UnitaryCalculatorAVX512Test, ApplyFusedGate) {
   TestApplyFusedGate<UnitaryCalculatorAVX512<For>>();
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, ApplyGates) {
+TEST(UnitaryCalculatorAVX512Test, ApplyGates) {
   TestApplyGates<UnitaryCalculatorAVX512<For>>(false);
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGates) {
+TEST(UnitaryCalculatorAVX512Test, ApplyControlledGates) {
   TestApplyControlledGates<UnitaryCalculatorAVX512<For>>(false);
 }
 
-TEST_F(UnitaryCalculatorAVX512Test, SmallCircuits) {
+TEST(UnitaryCalculatorAVX512Test, SmallCircuits) {
   TestSmallCircuits<UnitaryCalculatorAVX512<For>>();
 }
 
@@ -69,6 +61,8 @@ TEST_F(UnitaryCalculatorAVX512Test, SmallCircuits) {
 }  // namespace unitary
 }  // namespace qsim
 
+#endif  // __AVX512F__
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc
index 460c8f1ed..5f1001236 100644
--- a/tests/unitaryspace_avx512_test.cc
+++ b/tests/unitaryspace_avx512_test.cc
@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "unitaryspace_testfixture.h"
-#include "cpuinfo.h"
 
 #include "gtest/gtest.h"
 
+#ifdef __AVX512F__
+
 #include "../lib/formux.h"
 #include "../lib/unitaryspace_avx512.h"
 
@@ -24,24 +25,15 @@ namespace qsim {
 namespace unitary {
 namespace {
 
-class UnitarySpaceAVX512Test : public testing::Test {
- protected:
-  void SetUp() override {
-    if (!HaveAVX512()) {
-      GTEST_SKIP() << "Skipping all AVX512 tests.";
-    }
-  }
-};
-
-TEST_F(UnitarySpaceAVX512Test, SetZero) {
+TEST(UnitarySpaceAVX512Test, SetZero) {
   TestSetZeros<UnitarySpaceAVX512<For>>();
 }
 
-TEST_F(UnitarySpaceAVX512Test, SetIdentity) {
+TEST(UnitarySpaceAVX512Test, SetIdentity) {
   TestSetIdentity<UnitarySpaceAVX512<For>>();
 }
 
-TEST_F(UnitarySpaceAVX512Test, GetEntry) {
+TEST(UnitarySpaceAVX512Test, GetEntry) {
   TestSetEntry<UnitarySpaceAVX512<For>>();
 }
 
@@ -49,6 +41,8 @@ TEST_F(UnitarySpaceAVX512Test, GetEntry) {
 }  // namespace unitary
 }  // namespace qsim
 
+#endif  // __AVX512F__
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();

From 9ef5fb12fd4596967014d6a7897c9c7f5791ac6c Mon Sep 17 00:00:00 2001
From: Sergei Isakov <iserge@google.com>
Date: Mon, 26 Apr 2021 20:51:23 +0200
Subject: [PATCH 10/10] Don't run AVX512 tests on Windows.

---
 tests/simulator_avx512_test.cc          | 4 ++--
 tests/statespace_avx512_test.cc         | 4 ++--
 tests/unitary_calculator_avx512_test.cc | 4 ++--
 tests/unitaryspace_avx512_test.cc       | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc
index 448666447..287a61524 100644
--- a/tests/simulator_avx512_test.cc
+++ b/tests/simulator_avx512_test.cc
@@ -16,7 +16,7 @@
 
 #include "gtest/gtest.h"
 
-#ifdef __AVX512F__
+#if defined(__AVX512F__) && !defined(_WIN32)
 
 #ifdef _OPENMP
 #include "../lib/parfor.h"
@@ -80,7 +80,7 @@ TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) {
 
 }  // namespace qsim
 
-#endif  // __AVX512F__
+#endif  // __AVX512F__ && !_WIN32
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc
index 53c751168..d9913aafb 100644
--- a/tests/statespace_avx512_test.cc
+++ b/tests/statespace_avx512_test.cc
@@ -16,7 +16,7 @@
 
 #include "gtest/gtest.h"
 
-#ifdef __AVX512F__
+#if defined(__AVX512F__) && !defined(_WIN32)
 
 #ifdef _OPENMP
 #include "../lib/parfor.h"
@@ -101,7 +101,7 @@ TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) {
 
 }  // namespace qsim
 
-#endif  // __AVX512F__
+#endif  // __AVX512F__ && !_WIN32
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc
index 10de2db07..569826880 100644
--- a/tests/unitary_calculator_avx512_test.cc
+++ b/tests/unitary_calculator_avx512_test.cc
@@ -16,7 +16,7 @@
 
 #include "gtest/gtest.h"
 
-#ifdef __AVX512F__
+#if defined(__AVX512F__) && !defined(_WIN32)
 
 #include "../lib/formux.h"
 #include "../lib/unitary_calculator_avx512.h"
@@ -61,7 +61,7 @@ TEST(UnitaryCalculatorAVX512Test, SmallCircuits) {
 }  // namespace unitary
 }  // namespace qsim
 
-#endif  // __AVX512F__
+#endif  // __AVX512F__ && !_WIN32
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc
index 5f1001236..6301ec4db 100644
--- a/tests/unitaryspace_avx512_test.cc
+++ b/tests/unitaryspace_avx512_test.cc
@@ -16,7 +16,7 @@
 
 #include "gtest/gtest.h"
 
-#ifdef __AVX512F__
+#if defined(__AVX512F__) && !defined(_WIN32)
 
 #include "../lib/formux.h"
 #include "../lib/unitaryspace_avx512.h"
@@ -41,7 +41,7 @@ TEST(UnitarySpaceAVX512Test, GetEntry) {
 }  // namespace unitary
 }  // namespace qsim
 
-#endif  // __AVX512F__
+#endif  // __AVX512F__ && !_WIN32
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);