From 48a9d2ad97698d4aee872c1632f04e8354ef54ea Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Tue, 20 Apr 2021 17:39:21 +0200 Subject: [PATCH 01/10] Add support for avx512. --- lib/BUILD | 43 + lib/simmux.h | 8 +- lib/simulator_avx512.h | 8878 +++++++++++++++++++++++ lib/statespace_avx512.h | 423 ++ lib/umux.h | 10 +- lib/unitary_calculator_avx512.h | 6385 ++++++++++++++++ lib/unitaryspace_avx512.h | 110 + tests/BUILD | 71 +- tests/make.sh | 4 + tests/simulator_avx512_test.cc | 84 + tests/statespace_avx512_test.cc | 105 + tests/statespace_avx_test.cc | 4 + tests/statespace_basic_test.cc | 4 + tests/statespace_sse_test.cc | 4 + tests/statespace_testfixture.h | 236 +- tests/unitary_calculator_avx512_test.cc | 65 + tests/unitary_calculator_testfixture.h | 3 +- tests/unitaryspace_avx512_test.cc | 46 + 18 files changed, 16366 insertions(+), 117 deletions(-) create mode 100644 lib/simulator_avx512.h create mode 100644 lib/statespace_avx512.h create mode 100644 lib/unitary_calculator_avx512.h create mode 100644 lib/unitaryspace_avx512.h create mode 100644 tests/simulator_avx512_test.cc create mode 100644 tests/statespace_avx512_test.cc create mode 100644 tests/unitary_calculator_avx512_test.cc create mode 100644 tests/unitaryspace_avx512_test.cc diff --git a/lib/BUILD b/lib/BUILD index 3bfca5f90..4a960e1a0 100644 --- a/lib/BUILD +++ b/lib/BUILD @@ -33,18 +33,22 @@ cc_library( "seqfor.h", "simmux.h", "simulator_avx.h", + "simulator_avx512.h", "simulator_basic.h", "simulator_sse.h", "statespace_avx.h", + "statespace_avx512.h", "statespace_basic.h", "statespace_sse.h", "statespace.h", "umux.h", "unitaryspace.h", "unitaryspace_avx.h", + "unitaryspace_avx512.h", "unitaryspace_basic.h", "unitaryspace_sse.h", "unitary_calculator_avx.h", + "unitary_calculator_avx512.h", "unitary_calculator_basic.h", "unitary_calculator_sse.h", "util.h", @@ -75,10 +79,12 @@ cc_library( "seqfor.h", "simmux.h", "simulator_avx.h", + "simulator_avx512.h", "simulator_basic.h", "simulator_sse.h", 
"statespace.h", "statespace_avx.h", + "statespace_avx512.h", "statespace_basic.h", "statespace_sse.h", "umux.h", @@ -118,10 +124,12 @@ cc_library( "seqfor.h", "simmux.h", "simulator_avx.h", + "simulator_avx512.h", "simulator_basic.h", "simulator_sse.h", "statespace.h", "statespace_avx.h", + "statespace_avx512.h", "statespace_basic.h", "statespace_sse.h", "util.h", @@ -330,6 +338,15 @@ cc_library( ], ) +cc_library( + name = "statespace_avx512", + hdrs = ["statespace_avx512.h"], + deps = [ + ":statespace", + ":util", + ], +) + cc_library( name = "statespace_basic", hdrs = ["statespace_basic.h"], @@ -359,6 +376,15 @@ cc_library( ], ) +cc_library( + name = "simulator_avx512", + hdrs = ["simulator_avx512.h"], + deps = [ + ":bits", + ":statespace_avx512", + ], +) + cc_library( name = "simulator_basic", hdrs = ["simulator_basic.h"], @@ -383,6 +409,7 @@ cc_library( hdrs = ["simmux.h"], deps = [ ":simulator_avx", + ":simulator_avx512", ":simulator_basic", ":simulator_sse", ], @@ -450,6 +477,12 @@ cc_library( deps = [":unitaryspace"], ) +cc_library( + name = "unitaryspace_avx512", + hdrs = ["unitaryspace_avx512.h"], + deps = [":unitaryspace"], +) + cc_library( name = "unitaryspace_basic", hdrs = ["unitaryspace_basic.h"], @@ -473,6 +506,15 @@ cc_library( ], ) +cc_library( + name = "unitary_calculator_avx512", + hdrs = ["unitary_calculator_avx512.h"], + deps = [ + ":bits", + ":unitaryspace_avx512" + ], +) + cc_library( name = "unitary_calculator_basic", hdrs = ["unitary_calculator_basic.h"], @@ -498,6 +540,7 @@ cc_library( hdrs = ["umux.h"], deps = [ ":unitary_calculator_avx", + ":unitary_calculator_avx512", ":unitary_calculator_basic", ":unitary_calculator_sse", ], diff --git a/lib/simmux.h b/lib/simmux.h index 626eab1cc..d3c4074ef 100644 --- a/lib/simmux.h +++ b/lib/simmux.h @@ -15,7 +15,13 @@ #ifndef SIMMUX_H_ #define SIMMUX_H_ -#ifdef __AVX2__ +#ifdef __AVX512F__ +# include "simulator_avx512.h" + namespace qsim { + template + using Simulator = SimulatorAVX512; + } +#elif 
__AVX2__ # include "simulator_avx.h" namespace qsim { template diff --git a/lib/simulator_avx512.h b/lib/simulator_avx512.h new file mode 100644 index 000000000..ddce44db3 --- /dev/null +++ b/lib/simulator_avx512.h @@ -0,0 +1,8878 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_AVX512_H_ +#define SIMULATOR_AVX512_H_ + +#include + +#include +#include +#include + +#include "bits.h" +#include "statespace_avx512.h" + +namespace qsim { + +/** + * Quantum circuit simulator with AVX512 vectorization. + */ +template +class SimulatorAVX512 final { + public: + using StateSpace = StateSpaceAVX512; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + /* Dispatch first on the number of target qubits, then on how many of the sorted targets have index 3 or less: in a kernel name each H stands for a target with index above 3 and each L for one with index 3 or less. */ switch (qs.size()) { + case 1: + if (qs[0] > 3) { + ApplyGate1H(qs, matrix, state); + } else { + ApplyGate1L(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + ApplyGate2HH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate2HL(qs, matrix, state); + } else { + ApplyGate2LL(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + ApplyGate3HHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate3HHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate3HLL(qs, matrix, state); + } else { + ApplyGate3LLL(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + ApplyGate4HHHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate4HHHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate4HHLL(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGate4HLLL(qs, matrix, state); + } else { + ApplyGate4LLLL(qs, matrix, state); + } + break; + /* From five targets on there is no all-L kernel: the final else assumes that at least one target has index above 3, which holds when the indices are distinct. */ case 5: + if (qs[0] > 3) { + ApplyGate5HHHHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate5HHHHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate5HHHLL(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGate5HHLLL(qs, matrix, state); + } else { + ApplyGate5HLLLL(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + ApplyGate6HHHHHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate6HHHHHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate6HHHHLL(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGate6HHHLLL(qs, matrix, state); + } else { + ApplyGate6HHLLLL(qs, matrix, state); + } + break; + default: + // Not implemented: any other number of target qubits leaves the state untouched. + break; + } + } + + /** + * Applies a controlled gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + /* Without control qubits this is exactly ApplyGate. */ if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + /* Targets are classified as in ApplyGate (H: index above 3, L: index 3 or less). The _H or _L suffix is chosen from cqs[0] alone, which presumably relies on cqs being sorted in ascending order - confirm with callers. Only 1 to 4 targets are handled. */ switch (qs.size()) { + case 1: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + } + } + break; + case 2: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + } + } + break; + case 3: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); + } + } + break; + case 4: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + } else { + 
ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[3] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state); + } + } + break; + default: + // Not implemented: any other number of target qubits leaves the state untouched. + break; + } + } + + /** + * Computes the expectation value of an operator using AVX512 instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value (0 when qs.size() is outside 1..6). + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + /* Same H/L dispatch as in ApplyGate: H marks a qubit with index above 3, L one with index 3 or less. Every supported case returns directly. */ switch (qs.size()) { + case 1: + if (qs[0] > 3) { + return ExpectationValue1H(qs, matrix, state); + } else { + return ExpectationValue1L(qs, matrix, state); + } + /* NOTE(review): this break and the ones after the other cases are unreachable, since both branches return. */ break; + case 2: + if (qs[0] > 3) { + return ExpectationValue2HH(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValue2HL(qs, matrix, state); + } else { + return ExpectationValue2LL(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + return ExpectationValue3HHH(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValue3HHL(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValue3HLL(qs, matrix, state); + } else { + return ExpectationValue3LLL(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + return ExpectationValue4HHHH(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValue4HHHL(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValue4HHLL(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValue4HLLL(qs, matrix, state); + } else { + return ExpectationValue4LLLL(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + return ExpectationValue5HHHHH(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValue5HHHHL(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValue5HHHLL(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValue5HHLLL(qs, matrix, state); + } else { + return ExpectationValue5HLLLL(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + return ExpectationValue6HHHHHH(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValue6HHHHHL(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValue6HHHHLL(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValue6HHHLLL(qs, matrix, state); + } else { + return ExpectationValue6HHLLLL(qs, matrix, state); + } + break; + default: + // Not implemented. 
+ break; + } + + /* Reached only through the default case, i.e. for an unsupported qs.size(). */ return 0; + } + + /** + * @return The size of SIMD register if applicable (16, the number of floats moved by each _mm512_load_ps in the kernels below). + */ + static unsigned SIMDRegisterSize() { + return 16; + } + + private: + /* Kernel for a 1-qubit gate whose target has index above 3 (see ApplyGate). matrix is read as 2x2 complex entries, row by row, each stored as a (real, imaginary) pair. */ void ApplyGate1H(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + /* xs[0]: offset in floats between the two 16-amplitude groups that differ only in the target bit. ms[]: masks used by the lambda to open a zero bit at the target position of the loop index. */ xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i, i.e. the offset of group i from the base pointer p0. */ uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[2], is[2]; + + /* k is 16 * i with a zero bit opened at the target position; p0 then addresses the group whose target bit is 0. */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + /* Each group is read as 16 real parts followed by 16 imaginary parts. */ for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + /* Output group l is row l of the matrix applied to the loaded groups; rn and in accumulate the real and imaginary parts, and j walks the matrix two floats at a time. */ for (unsigned l = 0; l < 2; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n, which is otherwise unused here. */ for (unsigned n = 1; n < 2; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* Each call of f covers 2^5 amplitudes (2 groups of 16), so 2^(num_qubits - 5) calls are requested, or one for 5 qubits or fewer. */ unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + /* for_ is the loop-runner member initialised by the constructor; it presumably invokes f once per index below size (its Run method is not visible here). */ for_.Run(size, f, matrix, ms, xss, rstate); + } + + /* Kernel for a 1-qubit gate whose target has index 3 or less (see ApplyGate). The partner amplitudes sit in other lanes of the same 16-lane register, so they are fetched with a lane permutation (idx) and the matrix is expanded into per-lane coefficient vectors (w). */ void ApplyGate1L(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned p[16]; + __m512i idx[1]; + + /* Scratch storage for w. NOTE(review): assumes StateSpace::Create(5) yields at least the 64 suitably aligned floats written below - confirm in statespace_avx512.h. */ auto s = StateSpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + /* p[j] is the source lane for destination lane j; _mm512_set_epi32 takes its arguments from lane 15 down to lane 0. MaskedAdd is defined outside this view. */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* Fill w: for every lane j, k = CompressBits(j, 4, qmask) (from bits.h; presumably the target-bit value of the lane) selects the matrix entry that lane needs. Real parts go to vector l, imaginary parts to vector l + 1. */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + auto p0 = rstate + 32 * i; + + /* rs[0] and is[0] are the 16 loaded amplitudes (real and imaginary parts); rs[1] and is[1] are the same values with lanes permuted by idx[0]. */ for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + /* Complex multiply-accumulate of the two input slots with their per-lane coefficients; rn and in are the real and imaginary results. */ for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n, which is otherwise unused here. */ for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], 
in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* One call of f per 16 amplitudes (the 32 floats stored above), hence 2^(num_qubits - 4) calls, or one for 4 qubits or fewer. */ unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, idx, rstate); + } + + /* Kernel for a 2-qubit gate whose targets both have index above 3 (see ApplyGate). matrix is read as 4x4 complex entries, row by row, each stored as a (real, imaginary) pair. */ void ApplyGate2HH(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + /* xs[i]: offset in floats selected by target bit i. ms[]: masks used by the lambda to open zero bits at both target positions of the loop index. */ xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + /* xss[i]: offset of group i from the base pointer, the sum of xs[k] over the set bits k of i. */ uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[4], is[4]; + + /* k is 16 * i with zero bits opened at the two target positions; p0 addresses the group whose target bits are both 0. */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + /* Each group is read as 16 real parts followed by 16 imaginary parts. */ for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + /* Output group l is row l of the matrix applied to the four loaded groups; rn and in accumulate the real and imaginary parts. */ for (unsigned l = 0; l < 4; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n, which is otherwise unused here. */ for (unsigned n = 1; n < 4; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* Each call of f covers 2^6 amplitudes (4 groups of 16). */ unsigned k = 6; 
+ unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, rstate); + } + + /* Kernel for a 2-qubit gate with one target of index 3 or less (qs[0]) and one of index above 3 (qs[1]); see ApplyGate. The high target is handled with offset loads (xs, ms, xss), the low one with a lane permutation (idx) and per-lane coefficient vectors (w). */ void ApplyGate2HL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + /* xs[0]: offset in floats between the two groups that differ only in the high target bit. ms[]: masks used by the lambda to open a zero bit at that position of the loop index. */ xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + /* Scratch storage for w. NOTE(review): assumes StateSpace::Create(7) yields at least the 256 suitably aligned floats written below - confirm in statespace_avx512.h. */ auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + /* p[j] is the source lane for destination lane j; _mm512_set_epi32 takes its arguments from lane 15 down to lane 0. MaskedAdd is defined outside this view. */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* Fill w: i runs over the two output groups, m over the four input slots; for every lane j the value k = CompressBits(j, 4, qmask) (from bits.h) selects the matrix entry. Real parts go to vector l, imaginary parts to vector l + 1. */ for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + /* k is 16 * i with a zero bit opened at the high target position. */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + /* For each of the two groups selected by the high target, slot 2 * l holds the loaded lanes and slot 2 * l + 1 the same lanes permuted by idx[0]. */ for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * 
l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + /* Output group l: complex multiply-accumulate of the four input slots with consecutive coefficient vector pairs from w. */ for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n. */ for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* Each call of f covers 2^5 amplitudes (2 groups of 16). */ unsigned k = 5; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Kernel for a 2-qubit gate whose targets both have index 3 or less (see ApplyGate). All four partner amplitudes sit in one 16-lane register, so three lane permutations (idx) and per-lane coefficient vectors (w) are used and no offset loads are needed. */ void ApplyGate2LL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned p[16]; + __m512i idx[3]; + + /* Scratch storage for w. NOTE(review): assumes StateSpace::Create(6) yields at least the 128 suitably aligned floats written below - confirm in statespace_avx512.h. */ auto s = StateSpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + /* idx[i] is one lane permutation per non-zero step i + 1 over the two target bits; p[j] is the source lane for destination lane j. MaskedAdd is defined outside this view. */ for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* Fill w: for every lane j, k = CompressBits(j, 4, qmask) (from bits.h) selects the matrix entry for input slot m. Real parts go to vector l, imaginary parts to vector l + 1. */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + auto p0 = rstate + 
32 * i; + + /* rs[0] and is[0] are the loaded lanes (real and imaginary parts); slots 1 to 3 are the same lanes permuted by idx[0] to idx[2]. */ for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n. */ for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* One call of f per 16 amplitudes (the 32 floats stored above). */ unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, idx, rstate); + } + + /* Kernel for a 3-qubit gate whose targets all have index above 3 (see ApplyGate). matrix is read as 8x8 complex entries, row by row, each stored as a (real, imaginary) pair. */ void ApplyGate3HHH(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + /* xs[i]: offset in floats selected by target bit i. ms[]: masks used by the lambda to open zero bits at the three target positions of the loop index. */ xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + /* xss[i]: offset of group i from the base pointer, the sum of xs[k] over the set bits k of i. */ uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[8], is[8]; + + /* k is 16 * i with zero bits opened at the three target positions; p0 addresses the group whose target bits are all 0. */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]); + + auto p0 = rstate + 2 * k; + + /* Each group is read as 16 real parts followed by 16 imaginary parts. */ for (unsigned l = 0; l < 8; ++l) { + rs[l] = 
_mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + /* Output group l is row l of the matrix applied to the eight loaded groups; rn and in accumulate the real and imaginary parts. */ for (unsigned l = 0; l < 8; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n. */ for (unsigned n = 1; n < 8; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* Each call of f covers 2^7 amplitudes (8 groups of 16). */ unsigned k = 7; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, rstate); + } + + /* Kernel for a 3-qubit gate with one target of index 3 or less (qs[0]) and two of index above 3 (qs[1], qs[2]); see ApplyGate. The high targets are handled with offset loads (xs, ms, xss), the low one with a lane permutation (idx) and per-lane coefficient vectors (w). */ void ApplyGate3HHL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + /* xs and ms are built from the two high targets only, hence the qs[i + 1] indexing. */ xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + /* xss[i]: offset of group i from the base pointer, the sum of xs[k] over the set bits k of i. */ uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + /* Scratch storage for w. NOTE(review): assumes StateSpace::Create(9) yields at least the 1024 suitably aligned floats written below - confirm in statespace_avx512.h. */ auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + /* p[j] is the source lane for destination lane j; _mm512_set_epi32 takes its arguments from lane 15 down to lane 0. MaskedAdd is defined outside this view. */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for 
(unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 8; ++m) { + /* Fill w: i is the output group, m the input slot; for every lane j the value k = CompressBits(j, 4, qmask) (from bits.h) selects the matrix entry. Real parts go to vector l, imaginary parts to vector l + 1. */ for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + /* k is 16 * i with zero bits opened at the two high target positions. */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + /* For each of the four groups selected by the high targets, slot 2 * l holds the loaded lanes and slot 2 * l + 1 the same lanes permuted by idx[0]. */ for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + /* Output group l: complex multiply-accumulate of the eight input slots with consecutive coefficient vector pairs from w. */ for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n. */ for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* Each call of f covers 2^6 amplitudes (4 groups of 16). */ unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Kernel for a 3-qubit gate with two targets of index 3 or less (qs[0], qs[1]) and one of index above 3 (qs[2]); see ApplyGate. The high target is handled with offset loads (xs, ms, xss), the low ones with three lane permutations (idx) and per-lane coefficient vectors (w). */ void ApplyGate3HLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + /* xs[0]: offset in floats between the two groups that differ only in the high target bit. ms[]: masks used by the lambda to open a zero bit at that position of the loop index. */ xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + /* Scratch storage for w. NOTE(review): assumes StateSpace::Create(8) yields at least the 512 suitably aligned floats written below - confirm in statespace_avx512.h. */ auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + /* idx[i] is one lane permutation per non-zero step i + 1 over the two low target bits; p[j] is the source lane for destination lane j. MaskedAdd is defined outside this view. */ for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* Fill w: i is the output group, m the input slot; for every lane j the value k = CompressBits(j, 4, qmask) (from bits.h) selects the matrix entry. Real parts go to vector l, imaginary parts to vector l + 1. */ for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + /* k is 16 * i with a zero bit opened at the high target position. */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + /* For each of the two groups selected by the high target, slot 4 * l holds the loaded lanes and slots 4 * l + 1 to 4 * l + 3 the same lanes permuted by idx[0] to idx[2]. */ for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + /* Output group l: complex multiply-accumulate of the eight input slots with consecutive coefficient vector pairs from w. */ for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + /* NOTE(review): this n shadows the lambda parameter n. */ for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* Each call of f covers 2^5 amplitudes (2 groups of 16). */ unsigned k = 5; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Kernel for a 3-qubit gate whose targets all have index 3 or less (see ApplyGate). All eight partner amplitudes sit in one 16-lane register, so seven lane permutations (idx) and per-lane coefficient vectors (w) are used and no offset loads are needed. */ void ApplyGate3LLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned p[16]; + __m512i idx[7]; + + /* Scratch storage for w. NOTE(review): assumes StateSpace::Create(7) yields at least the 256 suitably aligned floats written below - confirm in statespace_avx512.h. */ auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + /* idx[i] is one lane permutation per non-zero step i + 1 over the three target bits; p[j] is the source lane for destination lane j. MaskedAdd is defined outside this view. */ for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* Fill w: for every lane j, k = CompressBits(j, 4, qmask) (from bits.h) selects the matrix entry for input slot m. Real parts go to vector l, imaginary parts to vector l + 1. */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + /* Every call of f works on one block of 32 floats: 16 real parts followed by 16 imaginary parts. */ auto p0 = rstate + 32 * i; 
+ + /* one load of a vector pair (p0 and p0 + 16); inputs 1..7 are lane permutations of it */ for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, idx, rstate); + } + + /* Applies the 4-qubit gate given by matrix to state. All four gate qubits qs[0..3] are reached through the strided offsets xss; matrix entries are read in order as pairs (v[j], v[j + 1]) and broadcast to all 16 lanes. NOTE(review): 'H' presumably marks high qubits (index >= 4) - confirm against the dispatcher. */ void ApplyGate4HHHH(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[16], is[16]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 16; ++l) { + rs[l]
= _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (ru + i*iu), with (ru, iu) broadcast from (v[j], v[j + 1]) */ for (unsigned l = 0; l < 16; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 8; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, rstate); + } + + /* Applies the 4-qubit gate given by matrix to state. qs[0] is handled inside a 16-float vector by a lane permutation (idx), qs[1..3] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate4HHHL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 1]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for
(unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the second input of each group is a lane permutation of it */ for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 7; + unsigned n = state.num_qubits() > k ?
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 4-qubit gate given by matrix to state. qs[0..1] are handled inside a 16-float vector by lane permutations (idx), qs[2..3] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate4HHLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 2]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 +
xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 6; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 4-qubit gate given by matrix to state. qs[0..2] are handled inside a 16-float vector by lane permutations (idx), qs[3] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate4HLLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + /* xs[0]: offset 2^(qs[3] + 1) in fp_type elements; ms[0] / ms[1]: masks of the index bits below / above qs[3] */ xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m <
16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the other 7 inputs of each group are lane permutations of it */ for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 5; + unsigned n = state.num_qubits() > k ?
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 4-qubit gate given by matrix to state for qubits qs[0..3]. All gate qubits are addressed through the lane index of one 16-float vector: each iteration loads one vector pair and builds the other 15 inputs with lane permutations (idx). NOTE(review): 'L' presumably marks low qubits (qs[i] < 4, so that qmask fits the 4 lane-index bits) - confirm against the dispatcher. */ void ApplyGate4LLLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + /* iteration i starts at fp_type offset 32 * i */ auto p0 = rstate + 32 * i; + + /* one load of a vector pair (p0 and p0 + 16); inputs 1..15 are lane permutations of it */ for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn =
_mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, idx, rstate); + } + + /* Applies the 5-qubit gate given by matrix to state. All five gate qubits qs[0..4] are reached through the strided offsets xss; matrix entries are read in order as pairs (v[j], v[j + 1]) and broadcast to all 16 lanes. NOTE(review): 'H' presumably marks high qubits (index >= 4) - confirm against the dispatcher. */ void ApplyGate5HHHHH(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[5]; + uint64_t ms[6]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 5; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[32]; + for (unsigned i = 0; i < 32; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 5; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[32], is[32]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); + + auto p0 = rstate + 2 * k; + + /* load all 32 input vector pairs before any store, since results overwrite the same addresses */ for (unsigned l = 0; l < 32; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (ru + i*iu), with (ru, iu) broadcast from (v[j], v[j + 1]) */ for (unsigned l = 0; l < 32; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2;
+ } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 9; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, rstate); + } + + /* Applies the 5-qubit gate given by matrix to state. qs[0] is handled inside a 16-float vector by a lane permutation (idx), qs[1..4] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate5HHHHL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 1]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(13); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + +
/* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the second input of each group is a lane permutation of it */ for (unsigned l = 0; l < 16; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 8; + unsigned n = state.num_qubits() > k ?
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 5-qubit gate given by matrix to state. qs[0..1] are handled inside a 16-float vector by lane permutations (idx), qs[2..4] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate5HHHLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 2]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[4 * l] =
_mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 7; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 5-qubit gate given by matrix to state. qs[0..2] are handled inside a 16-float vector by lane permutations (idx), qs[3..4] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate5HHLLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 3]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 3] + 1); + ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15],
p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the other 7 inputs of each group are lane permutations of it */ for (unsigned l = 0; l < 4; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 6; + unsigned n = state.num_qubits() > k ?
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 5-qubit gate given by matrix to state. qs[0..3] are handled inside a 16-float vector by lane permutations (idx), qs[4] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate5HLLLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + /* xs[0]: offset 2^(qs[4] + 1) in fp_type elements; ms[0] / ms[1]: masks of the index bits below / above qs[4] */ xs[0] = uint64_t{1} << (qs[4] + 1); + ms[0] = (uint64_t{1} << qs[4]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the other 15 inputs of each group are lane permutations of it */ for (unsigned l = 0; l < 2; ++l) { + rs[16 * l] = _mm512_load_ps(p0 + xss[l]); + is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] =
_mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 5; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 6-qubit gate given by matrix to state. All six gate qubits qs[0..5] are reached through the strided offsets xss; matrix entries are read in order as pairs (v[j], v[j + 1]) and broadcast to all 16 lanes. NOTE(review): 'H' presumably marks high qubits (index >= 4) - confirm against the dispatcher. */ void ApplyGate6HHHHHH(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[6]; + uint64_t ms[7]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 6; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[64]; + for (unsigned i = 0; i < 64; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 6; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[64], is[64]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]) + | (1024 * i & ms[6]); + + auto p0 = rstate + 2 * k; + + /* load all 64 input vector pairs before any store, since results overwrite the same addresses */ for (unsigned l = 0; l < 64; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t
j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (ru + i*iu), with (ru, iu) broadcast from (v[j], v[j + 1]) */ for (unsigned l = 0; l < 64; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 10; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, rstate); + } + + /* Applies the 6-qubit gate given by matrix to state. qs[0] is handled inside a 16-float vector by a lane permutation (idx), qs[1..5] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate6HHHHHL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[5]; + uint64_t ms[6]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 1]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 5; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[32]; + for (unsigned i = 0; i < 32; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 5; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(15); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 32; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j
= 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the second input of each group is a lane permutation of it */ for (unsigned l = 0; l < 32; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 32; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 9; + unsigned n = state.num_qubits() > k ?
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + /* Applies the 6-qubit gate given by matrix to state. qs[0..1] are handled inside a 16-float vector by lane permutations (idx), qs[2..5] by the strided offsets xss. NOTE(review): 'H'/'L' presumably mark high (>= 4) / low (< 4) qubit indices - confirm against the dispatcher. */ void ApplyGate6HHHHLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + /* xs[i]: offset 2^(q + 1) in fp_type elements for gate qubit q = qs[i + 2]; ms[]: masks of the index bits below, between and above those qubits (assumes ascending qs - TODO confirm) */ xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + /* xss[i]: sum of xs[k] over the set bits k of i */ uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(14); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + /* idx[i]: lane permutation; MaskedAdd (defined elsewhere) produces the qmask bits, the other lane bits are kept */ for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + /* fill w: for every (i, m), 16 lanes from matrix[p[j]] followed by 16 lanes from matrix[p[j] + 1]; the kernel uses them as real and imaginary parts */ for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + /* k: iteration index i deposited into the bit ranges selected by ms[] */ uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]); + + auto p0 = rstate + 2 * k; + + /* load one vector pair per xss offset; the other 3 inputs of each group are lane permutations of it */ for (unsigned l = 0; l <
16; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + /* output l = sum over n of (rs[n] + i*is[n]) * (w[j] + i*w[j + 1]), lane-wise */ for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + /* size = 2^(num_qubits - k), or 1 if num_qubits <= k */ unsigned k = 8; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + void ApplyGate6HHHLLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 3] + 1); + ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(13); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] =
_mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + void ApplyGate6HHLLLL(const std::vector& qs, + const fp_type* matrix, State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[4] + 1); + ms[0] = (uint64_t{1} << qs[4]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 4] + 1); + ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { 
+ rs[16 * l] = _mm512_load_ps(p0 + xss[l]); + is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, rstate); + } + + void ApplyControlledGate1H_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 ru, iu, rn, 
in; + __m512 rs[2], is[2]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + void ApplyControlledGate1H_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, 
emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = StateSpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (2 * i + 2 * k + m); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + void ApplyControlledGate1L_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate1L_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; 
++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate2HH_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[4], is[4]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, 
rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + void ApplyControlledGate2HH_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 4 * k + m); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 
1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + void ApplyControlledGate2HL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, 
fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate2HL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? 
matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate2LL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate2LL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j 
= 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for three target qubits, all reached through strided accesses: xs[i] = 1 << (qs[i] + 1), and xss[] holds the 8 subset sums of xs[] that the lambda adds to p0 for its loads and stores. Every control bit is put into emaskh and cmaskh = bits::ExpandBits(cmask, num_qubits, emaskh); the target bits are then added and the mask becomes ~emaskh ^ 15. The lambda reads matrix as consecutive (real, imaginary) scalar pairs, broadcasts them with _mm512_set1_ps and accumulates the complex products with fmadd/fnmadd. ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (7 + cqs.size())) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3HHH_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu,
rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + /* Controlled-gate kernel for three target qubits reached through strided accesses (xs[i] = 1 << (qs[i] + 1); xss[] holds the 8 subset sums of xs[] that the lambda adds to p0). Control qubits greater than 3 go into emaskh; the others are counted in cl and collected in emaskl, giving cmaskh and cmaskl via bits::ExpandBits. Rather than broadcasting scalars, it fills a table w of 16-lane weights (buffer from StateSpace::Create(10)) from matrix; lanes where (j & emaskl) != cmaskl get identity entries (1 on the diagonal, 0 elsewhere, imaginary parts 0). ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (7 + cqs.size() - cl)) as size, or 1 when num_qubits does not exceed that count. NOTE(review): qmask = (1 << qs[0]) shifts an int by a target-qubit index - confirm qs[0] always stays below 31 here. */ void ApplyControlledGate3HHH_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 8 * k + m); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ?
1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + /* Controlled-gate kernel for three target qubits. qs[1] and qs[2] are reached through strided accesses (xs[] is built from them and xss[] holds the 4 subset sums that the lambda adds to p0), while qs[0] forms qmask and is handled inside the 16-lane vector by the single idx permutation built with MaskedAdd. Every control bit is put into emaskh and cmaskh = bits::ExpandBits(cmask, num_qubits, emaskh); afterwards only target qubits greater than 3 are added to emaskh. The weight table w (buffer from StateSpace::Create(9)) is filled straight from matrix. ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (6 + cqs.size())) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3HHL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m,
uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for three target qubits. qs[1] and qs[2] are reached through strided accesses (xs[] is built from them and xss[] holds the 4 subset sums that the lambda adds to p0), while qs[0] forms qmask and is handled inside the 16-lane vector by the single idx permutation built with MaskedAdd. Control qubits greater than 3 go into emaskh; the others are counted in cl and collected in emaskl, giving cmaskh and cmaskl via bits::ExpandBits. The weight table w (buffer from StateSpace::Create(9)) is filled from matrix, except that lanes where (j & emaskl) != cmaskl get identity entries (1 on the diagonal, 0 elsewhere, imaginary parts 0). ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (6 + cqs.size() - cl)) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3HHL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j =
0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for three target qubits. Only qs[2] is reached through strided accesses (xs[0] = 1 << (qs[2] + 1), so xss[] is {0, xs[0]}), while qs[0] and qs[1] are combined into qmask and handled inside the 16-lane vector by the three idx permutations built with MaskedAdd. Every control bit is put into emaskh and cmaskh = bits::ExpandBits(cmask, num_qubits, emaskh); afterwards only target qubits greater than 3 are added to emaskh. The weight table w (buffer from StateSpace::Create(8)) is filled straight from matrix. ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (5 + cqs.size())) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3HLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, +
const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for three target qubits. Only qs[2] is reached through strided accesses (xs[0] = 1 << (qs[2] + 1), so xss[] is {0, xs[0]}), while qs[0] and qs[1] are combined into qmask and handled inside the 16-lane vector by the three idx permutations built with MaskedAdd. Control qubits greater than 3 go into emaskh; the others are counted in cl and collected in emaskl, giving cmaskh and cmaskl via bits::ExpandBits. The weight table w (buffer from StateSpace::Create(8)) is filled from matrix, except that lanes where (j & emaskl) != cmaskl get identity entries (1 on the diagonal, 0 elsewhere, imaginary parts 0). ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (5 + cqs.size() - cl)) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3HLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ?
matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for three target qubits, all of which are combined into qmask and handled inside one 16-lane vector by the seven idx permutations built with MaskedAdd; the lambda therefore loads and stores a single pair of vectors at p0 and p0 + 16, with no strided offsets. Every control bit is put into emaskh and cmaskh = bits::ExpandBits(cmask, num_qubits, emaskh); afterwards only target qubits greater than 3 are added to emaskh. The weight table w (buffer from StateSpace::Create(7)) is filled straight from matrix. for_.Run is given 2^(num_qubits - (4 + cqs.size())) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3LLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); +
is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for three target qubits, all of which are combined into qmask and handled inside one 16-lane vector by the seven idx permutations built with MaskedAdd, so no strided offsets are used. Control qubits greater than 3 go into emaskh; the others are counted in cl and collected in emaskl, giving cmaskh and cmaskl via bits::ExpandBits. The weight table w (buffer from StateSpace::Create(7)) is filled from matrix, except that lanes where (j & emaskl) != cmaskl get identity entries (1 on the diagonal, 0 elsewhere, imaginary parts 0). for_.Run is given 2^(num_qubits - (4 + cqs.size() - cl)) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate3LLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0;
m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + /* Controlled-gate kernel for four target qubits, all reached through strided accesses: xs[i] = 1 << (qs[i] + 1), and xss[] holds the 16 subset sums of xs[] that the lambda adds to p0 for its loads and stores. Every control bit is put into emaskh and cmaskh = bits::ExpandBits(cmask, num_qubits, emaskh); the target bits are then added and the mask becomes ~emaskh ^ 15. The lambda reads matrix as consecutive (real, imaginary) scalar pairs, broadcasts them with _mm512_set1_ps and accumulates the complex products with fmadd/fnmadd. ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (8 + cqs.size())) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate4HHHH_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 16; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn =
_mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + /* Controlled-gate kernel for four target qubits reached through strided accesses (xs[i] = 1 << (qs[i] + 1); xss[] holds the 16 subset sums of xs[] that the lambda adds to p0). Control qubits greater than 3 go into emaskh; the others are counted in cl and collected in emaskl, giving cmaskh and cmaskl via bits::ExpandBits. Rather than broadcasting scalars, it fills a table w of 16-lane weights (buffer from StateSpace::Create(12)) from matrix; lanes where (j & emaskl) != cmaskl get identity entries (1 on the diagonal, 0 elsewhere, imaginary parts 0). ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (8 + cqs.size() - cl)) as size, or 1 when num_qubits does not exceed that count. NOTE(review): qmask = (1 << qs[0]) shifts an int by a target-qubit index - confirm qs[0] always stays below 31 here. */ void ApplyControlledGate4HHHH_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = StateSpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 16 * k + m); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 ==
(p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 16; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, rstate); + } + + /* Controlled-gate kernel for four target qubits. qs[1], qs[2] and qs[3] are reached through strided accesses (xs[] is built from them and xss[] holds the 8 subset sums that the lambda adds to p0), while qs[0] forms qmask and is handled inside the 16-lane vector by the single idx permutation built with MaskedAdd. Every control bit is put into emaskh and cmaskh = bits::ExpandBits(cmask, num_qubits, emaskh); afterwards only target qubits greater than 3 are added to emaskh. The weight table w (buffer from StateSpace::Create(11)) is filled straight from matrix. ms[] is computed and passed along, but the lambda does not read it. for_.Run is given 2^(num_qubits - (7 + cqs.size())) as size, or 1 when num_qubits does not exceed that count. */ void ApplyControlledGate4HHHL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m,
uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4HHHL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + for 
(unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4HHLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = 
[](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4HHLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + 
+ for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4HLLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t 
cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4HLLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 
1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4LLLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = 
_mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + void ApplyControlledGate4LLLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + State& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], 
p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = bits::ExpandBits(i, num_qubits, emaskh) | cmaskh; + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, + state.num_qubits(), cmaskh, emaskh, idx, rstate); + } + + std::complex ExpectationValue1H(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[2], is[2]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + } + + std::complex ExpectationValue1L(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + auto p0 = rstate + 32 * i; + + for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = 
_mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, idx, rstate); + } + + std::complex ExpectationValue2HH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[4], is[4]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], 
ru, in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + } + + std::complex ExpectationValue2HL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned 
l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 2 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue2LL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + auto p0 = rstate + 32 * i; + + for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in 
= _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, idx, rstate); + } + + std::complex ExpectationValue3HHH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[8], is[8]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in 
= _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + } + + std::complex ExpectationValue3HHL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 
8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 2 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue3HLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * 
l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 4 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue3LLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + auto p0 = rstate + 32 * i; + + for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], 
w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, idx, rstate); + } + + std::complex ExpectationValue4HHHH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[16], is[16]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 16; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + 
rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 8; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + } + + std::complex ExpectationValue4HHHL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i 
= 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 2 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue4HHLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = rstate + 2 * k; + + 
for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 4 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue4HLLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; 
j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 8 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue4LLLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + auto p0 = rstate + 32 * i; + + for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; 
++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, idx, rstate); + } + + std::complex ExpectationValue5HHHHH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[5]; + uint64_t ms[6]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 5; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); + + uint64_t xss[32]; + for (unsigned i = 0; i < 32; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 5; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[32], is[32]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 32; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 32; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = 
_mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, _mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 9; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + } + + std::complex ExpectationValue5HHHHL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(13); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], 
p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 16; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 2 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 8; + unsigned n = 
state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue5HHHLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * 
i & ms[3]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 4 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue5HHLLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 3] + 1); + ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]); + + auto p0 = 
rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 8 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue5HLLLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[4] + 1); + ms[0] = (uint64_t{1} << qs[4]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 2; ++l) { + rs[16 * l] = _mm512_load_ps(p0 + xss[l]); + is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); 
+ + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 16 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue6HHHHHH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[6]; + uint64_t ms[7]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 6; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); + + uint64_t xss[64]; + for (unsigned i = 0; i < 64; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 6; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]) + | (1024 * i & ms[6]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 64; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 64; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[l], in, _mm512_mul_ps(rs[l], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[l], rn, 
_mm512_mul_ps(rs[l], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 10; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, rstate); + } + + std::complex ExpectationValue6HHHHHL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[5]; + uint64_t ms[6]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 5; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); + + uint64_t xss[32]; + for (unsigned i = 0; i < 32; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 5; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = StateSpace::Create(15); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 32; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const 
__m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * i & ms[4]) | (512 * i & ms[5]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 32; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 32; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 2 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 9; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue6HHHHLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = StateSpace::Create(14); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & ms[3]) | (256 * 
i & ms[4]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 16; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 4 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 8; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue6HHHLLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 3] + 1); + ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = StateSpace::Create(13); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & ms[2]) + | (128 * i & 
ms[3]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 8; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 8 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + std::complex ExpectationValue6HHLLLL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[4] + 1); + ms[0] = (uint64_t{1} << qs[4]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 4] + 1); + ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[15]; + + auto s = StateSpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, const fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t k = (16 * i & ms[0]) | (32 * i & ms[1]) | (64 * i & 
ms[2]); + + auto p0 = rstate + 2 * k; + + for (unsigned l = 0; l < 4; ++l) { + rs[16 * l] = _mm512_load_ps(p0 + xss[l]); + is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + unsigned m = 16 * l; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + const fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, rstate); + } + + static unsigned MaskedAdd( + unsigned a, unsigned b, unsigned mask, unsigned lsize) { + unsigned c = bits::CompressBits(a, 4, mask); + return bits::ExpandBits((c + b) % lsize, 4, mask); + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_AVX512_H_ diff --git a/lib/statespace_avx512.h b/lib/statespace_avx512.h new file mode 100644 index 000000000..3b98250be --- /dev/null +++ b/lib/statespace_avx512.h @@ -0,0 +1,423 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_AVX512_H_ +#define STATESPACE_AVX512_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" + +namespace qsim { + +namespace detail { + +inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) { + __m512i s1 = _mm512_setr_epi64( + i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7); + __m512i s2 = _mm512_setr_epi64( + i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15); + __m512i ma = _mm512_set1_epi64(mask); + __m512i bi = _mm512_set1_epi64(bits); + + s1 = _mm512_and_si512(s1, ma); + s2 = _mm512_and_si512(s2, ma); + + unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi); + unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi); + + return (m2 << 8) | m1; +} + +inline double HorizontalSumAVX512(__m512 s) { + return _mm512_reduce_add_ps(s); +} + +} // namespace detail + +/** + * Object containing context and routines for AVX512 state-vector manipulations. + * State is a vectorized sequence of sixteen real components followed by + * sixteen imaginary components. Sixteen single-precision floating-point numbers + * can be loaded into an AVX512 register. + */ +template +struct StateSpaceAVX512 : public StateSpace, For, float> { + using Base = StateSpace, For, float>; + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...)
{} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + __m512i idx1 = _mm512_setr_epi32( + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + __m512i idx2 = _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m512i idx1, __m512i idx2, fp_type* p) { + __m512 v1 = _mm512_load_ps(p + 32 * i); + __m512 v2 = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(v1, idx1, v2)); + _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(v1, idx2, v2)); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); + } + + void NormalToInternalOrder(State& state) const { + __m512i idx1 = _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i idx2 = _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m512i idx1, __m512i idx2, fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(re, idx1, im)); + _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(re, idx2, im)); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); + } + + void SetAllZeros(State& state) const { + __m512 val0 = _mm512_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { + _mm512_store_ps(p + 32 * i, val0); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); + } + + // Uniform superposition. 
+  // Sets every amplitude of `state` to 1 / sqrt(2^num_qubits) with a zero
+  // imaginary part. States with fewer than four qubits occupy only part of a
+  // 16-lane register; the unused (padding) lanes are written as zeros.
+  void SetStateUniform(State& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << num_qubits);
+
+    // Number of amplitudes held by the register: 2^num_qubits for states with
+    // fewer than four qubits, all 16 lanes otherwise.
+    const unsigned lanes = num_qubits < 4 ? 1u << num_qubits : 16u;
+    const __mmask16 live = static_cast<__mmask16>((1u << lanes) - 1u);
+
+    __m512 val0 = _mm512_setzero_ps();
+    // Lanes outside `live` are zeroed. This yields the same registers as the
+    // former explicit cases for 1, 2 and 3 qubits and additionally keeps the
+    // padding lanes of a zero-qubit state at zero.
+    __m512 valu = _mm512_maskz_mov_ps(live, _mm512_set1_ps(v));
+
+    // Each block of 32 floats is 16 real parts followed by 16 imaginary parts.
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const __m512& val0, const __m512& valu, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, valu);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(MinSize(num_qubits) / 32, f, val0, valu, state.get());
+  }
+
+  // |0> state: clears the whole vector, then sets the real part of
+  // amplitude 0 to one.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  // Returns amplitude `i`. Amplitudes are stored in blocks of 32 floats:
+  // 16 real parts followed by the 16 matching imaginary parts.
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    return std::complex<fp_type>(state.get()[p], state.get()[p + 16]);
+  }
+
+  // Overwrites amplitude `i` with `ampl`.
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = std::real(ampl);
+    state.get()[p + 16] = std::imag(ampl);
+  }
+
+  // Overwrites amplitude `i` with complex(re, im).
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = re;
+    state.get()[p + 16] = im;
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  // Forwards to the (re, im) overload.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m512 new_re = _mm512_set1_ps(re);
+    __m512 new_im = _mm512_set1_ps(im);
+
+    // All ones when `exclude` is set, zero otherwise; XOR-ing it into the
+    // lane mask below inverts the selection.
+    __mmask16 flip(-exclude);
+
+    auto overwrite = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                        uint64_t bitsv, __m512 new_re, __m512 new_im,
+                        __mmask16 flip, fp_type* p) {
+      fp_type* block = p + 32 * i;
+
+      __m512 cur_re = _mm512_load_ps(block);
+      __m512 cur_im = _mm512_load_ps(block + 16);
+
+      __mmask16 lanes =
+          detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ flip;
+
+      // Lanes whose mask bit is set take the new value, others keep theirs.
+      _mm512_store_ps(block, _mm512_mask_blend_ps(lanes, cur_re, new_re));
+      _mm512_store_ps(
+          block + 16, _mm512_mask_blend_ps(lanes, cur_im, new_im));
+    };
+
+    uint64_t num_blocks = MinSize(state.num_qubits()) / 32;
+    Base::for_.Run(num_blocks, overwrite, mask, bits,
+                   new_re, new_im, flip, state.get());
+  }
+
+  // Adds src to dest value by value (dest += src). Returns false and leaves
+  // dest untouched when the two states differ in qubit count.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto accumulate = [](unsigned n, unsigned m, uint64_t i,
+                         const fp_type* p1, fp_type* p2) {
+      const fp_type* from = p1 + 32 * i;
+      fp_type* into = p2 + 32 * i;
+
+      // All four loads happen before the first store.
+      __m512 src_re = _mm512_load_ps(from);
+      __m512 src_im = _mm512_load_ps(from + 16);
+      __m512 dst_re = _mm512_load_ps(into);
+      __m512 dst_im = _mm512_load_ps(into + 16);
+
+      _mm512_store_ps(into, _mm512_add_ps(src_re, dst_re));
+      _mm512_store_ps(into + 16, _mm512_add_ps(src_im, dst_im));
+    };
+
+    uint64_t num_blocks = MinSize(src.num_qubits()) / 32;
+    Base::for_.Run(num_blocks, accumulate, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m512 r = _mm512_set1_ps(a);
+
+    // Real and imaginary halves of each block are scaled by the same factor.
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r));
+      _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r));
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get());
+  }
+
+  // Returns the sum over k of conj(state1[k]) * state2[k], accumulated in
+  // double precision. Returns NaN when the qubit counts differ.
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      // re1 * re2 + im1 * im2 and re1 * im2 - im1 * re2, lane by lane.
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+      __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2));
+
+      double re = detail::HorizontalSumAVX512(ip_re);
+      double im = detail::HorizontalSumAVX512(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  // Real part of InnerProduct: the sum over k of
+  // re1[k] * re2[k] + im1[k] * im2[k]. Returns NaN when the qubit counts
+  // differ.
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+
+      return detail::HorizontalSumAVX512(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  // Draws amplitude indices by walking the running sum of |amplitude|^2 and
+  // emitting the current index for every pending random value below that
+  // sum. Returns an empty vector when num_samples is 0.
+  // NOTE(review): GenerateRandomValues is defined outside this view; the walk
+  // presumably relies on it returning ascending values below `norm` - confirm.
+  // NOTE(review): the template parameter list and the explicit argument to
+  // GenerateRandomValues were missing from this text and are restored by
+  // inference - confirm against the declaration of GenerateRandomValues.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 32;
+      const fp_type* p = state.get();
+
+      // First pass: total squared magnitude over all stored values.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          auto re = p[32 * k + j];
+          auto im = p[32 * k + 16 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+
+      // Second pass: cumulative sum against the random values.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          auto re = p[32 * k + j];
+          auto im = p[32 * k + 16 + j];
+          csum += re * re + im * im;
+          // The bound is tested first so that rs[m] is never read once all
+          // samples have been emitted.
+          while (m < num_samples && rs[m] < csum) {
+            bitstrings.emplace_back(16 * k + j);
+            ++m;
+          }
+        }
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  // Keeps only the lanes selected by detail::GetZeroMaskAVX512 for
+  // (mr.mask, mr.bits), zeroes every other lane, and rescales the kept
+  // lanes by 1 / sqrt(sum of their squared magnitudes).
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    // Squared magnitude of the selected lanes of one block.
+    auto f1 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1,
+                                       Op(), mr.mask, mr.bits, state.get());
+
+    __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm));
+
+    // The zero-masked load clears unselected lanes; the full store then
+    // writes those zeros back together with the rescaled selected lanes.
+    auto f2 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+
+      re = _mm512_mul_ps(re, renorm);
+      im = _mm512_mul_ps(im, renorm);
+
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f2,
+                   mr.mask, mr.bits, renorm, state.get());
+  }
+
+  // Returns what Base::for_.RunReduceP produces when summing the squared
+  // magnitudes of each 32-value block.
+  // NOTE(review): presumably one partial sum per worker of `for_` - confirm.
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 32, f, Op(), state.get());
+  }
+
+  // Walks the blocks in [GetIndex0(.., m), GetIndex1(.., m)) and returns the
+  // masked index of the first amplitude at which the running sum of squared
+  // magnitudes exceeds r.
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    double csum = 0;
+
+    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m);
+    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m);
+
+    const fp_type* p = state.get();
+
+    for (uint64_t k = k0; k < k1; ++k) {
+      for (uint64_t j = 0; j < 16; ++j) {
+        auto re = p[32 * k + j];
+        auto im = p[32 * k + j + 16];
+        csum += re * re + im * im;
+        if (r < csum) {
+          return (16 * k + j) & mask;
+        }
+      }
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+ return (16 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_AVX512_H_ diff --git a/lib/umux.h b/lib/umux.h index 31de85118..83b951b86 100644 --- a/lib/umux.h +++ b/lib/umux.h @@ -15,7 +15,15 @@ #ifndef UMUX_H_ #define UMUX_H_ -#ifdef __AVX2__ +#ifdef __AVX512F__ +# include "unitary_calculator_avx512.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorAVX512; + } + } +#elif __AVX2__ # include "unitary_calculator_avx.h" namespace qsim { namespace unitary { diff --git a/lib/unitary_calculator_avx512.h b/lib/unitary_calculator_avx512.h new file mode 100644 index 000000000..174f44af1 --- /dev/null +++ b/lib/unitary_calculator_avx512.h @@ -0,0 +1,6385 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_AVX512_H_ +#define UNITARY_CALCULATOR_AVX512_H_ + +#include + +#include +#include +#include + +#include "bits.h" +#include "unitaryspace_avx512.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with AVX512 vectorization. + */ +template +class UnitaryCalculatorAVX512 final { + public: + using UnitarySpace = UnitarySpaceAVX512; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) 
{} + + /** + * Applies a gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + ApplyGate1H(qs, matrix, state); + } else { + ApplyGate1L(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + ApplyGate2HH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate2HL(qs, matrix, state); + } else { + ApplyGate2LL(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + ApplyGate3HHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate3HHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate3HLL(qs, matrix, state); + } else { + ApplyGate3LLL(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + ApplyGate4HHHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate4HHHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate4HHLL(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGate4HLLL(qs, matrix, state); + } else { + ApplyGate4LLLL(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + ApplyGate5HHHHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate5HHHHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate5HHHLL(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGate5HHLLL(qs, matrix, state); + } else { + ApplyGate5HLLLL(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + ApplyGate6HHHHHH(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGate6HHHHHL(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGate6HHHHLL(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGate6HHHLLL(qs, matrix, state); + } else { + ApplyGate6HHLLLL(qs, matrix, state); + } + break; + default: + // Not implemented. 
+ break; + } + } + + /** + * Applies a controlled gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, Unitary& state) const { + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate1H_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate1H_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate1L_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate1L_L(qs, cqs, cmask, matrix, state); + } + } + break; + case 2: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate2HH_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate2HH_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate2HL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate2HL_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate2LL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate2LL_L(qs, cqs, cmask, matrix, state); + } + } + break; + case 3: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate3HHH_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3HHH_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate3HHL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3HHL_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate3HLL_H(qs, cqs, cmask, matrix, state); + } else { + 
ApplyControlledGate3HLL_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate3LLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate3LLL_L(qs, cqs, cmask, matrix, state); + } + } + break; + case 4: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HHHH_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HHHH_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HHHL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HHHL_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HHLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HHLL_L(qs, cqs, cmask, matrix, state); + } + } else if (qs[3] > 3) { + if (cqs[0] > 3) { + ApplyControlledGate4HLLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4HLLL_L(qs, cqs, cmask, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGate4LLLL_H(qs, cqs, cmask, matrix, state); + } else { + ApplyControlledGate4LLLL_L(qs, cqs, cmask, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 16; + } + + private: + void ApplyGate1H(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[2], is[2]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + } + + void ApplyGate1L(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t ii = i % size; + uint64_t r = i / size; + auto p0 = rstate + row_size * r + 32 * ii; + + for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = 
_mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + } + + void ApplyGate2HH(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn 
= _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + } + + void ApplyGate2HL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 
0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate2LL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + auto p0 = rstate + row_size * r + 32 * ii; + + for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 
1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + } + + void ApplyGate3HHH(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], 
ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + } + + void ApplyGate3HHL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned 
j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate3HLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); + + auto p0 = rstate + row_size * r + 2 * 
c; + + for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate3LLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + 
(k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + auto p0 = rstate + row_size * r + 32 * ii; + + for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + } + + void ApplyGate4HHHH(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]) | (256 * ii & ms[4]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 16; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + 
xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + } + + void ApplyGate4HHHL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const 
uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate4HHLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], 
is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate4HLLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]); + + auto p0 = 
rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate4LLLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + unsigned p[16]; + __m512i idx[15]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, 
qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + auto p0 = rstate + row_size * r + 32 * ii; + + for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, idx, size, raw_size, rstate); + } + + void ApplyGate5HHHHH(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[5]; + uint64_t ms[6]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 5; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); + + uint64_t xss[32]; + for (unsigned i = 0; i < 32; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 5; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[32], is[32]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 32; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 32; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + 
_mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 9; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + } + + void ApplyGate5HHHHL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(13); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 32 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const 
__m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]) | (256 * ii & ms[4]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 16; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate5HHHLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 32 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], 
is[32]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate5HHLLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 3] + 1); + ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 32 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + 
__m512 rs[32], is[32]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate5HLLLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[4] + 1); + ms[0] = (uint64_t{1} << qs[4]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[15]; + + auto s = UnitarySpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 32; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (512 * i + 32 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (32 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[32], is[32]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & 
ms[1]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[16 * l] = _mm512_load_ps(p0 + xss[l]); + is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 32; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate6HHHHHH(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[6]; + uint64_t ms[7]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 6; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[6] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[5] - 1); + + uint64_t xss[64]; + for (unsigned i = 0; i < 64; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 6; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[64], is[64]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]) + | (1024 * ii & ms[6]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 64; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 64; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = 
_mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 10; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, rstate); + } + + void ApplyGate6HHHHHL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[5]; + uint64_t ms[6]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 5; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[5] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[4] - 1); + + uint64_t xss[32]; + for (unsigned i = 0; i < 32; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 5; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(15); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 32; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 64 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + 
auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]) | (256 * ii & ms[4]) | (512 * ii & ms[5]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 32; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 32; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 9; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate6HHHHLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(14); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 64 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 
rs[64], is[64]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]) | (256 * ii & ms[4]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 16; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate6HHHLLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 3] + 1); + ms[i] = ((uint64_t{1} << qs[i + 3]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(13); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (512 * i + 64 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + 
__m512 rs[64], is[64]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]) + | (128 * ii & ms[3]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyGate6HHLLLL(const std::vector& qs, + const fp_type* matrix, Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[4] + 1); + ms[0] = (uint64_t{1} << qs[4]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 4] + 1); + ms[i] = ((uint64_t{1} << qs[i + 4]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned p[16]; + __m512i idx[15]; + + auto s = UnitarySpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 64; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (1024 * i + 64 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (64 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) 
{ + __m512 rn, in; + __m512 rs[64], is[64]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = (16 * ii & ms[0]) | (32 * ii & ms[1]) | (64 * ii & ms[2]); + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[16 * l] = _mm512_load_ps(p0 + xss[l]); + is[16 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 64; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, rstate); + } + + void ApplyControlledGate1H_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[2], is[2]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); 
+ in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate1H_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = UnitarySpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (2 * i + 2 * k + m); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; 
++j) { + fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate1L_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + 
+ for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate1L_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(5); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for 
(unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 2; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 2 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (2 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 2 == (p[j] / 2) % 2 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[2], is[2]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[2 * l] = _mm512_load_ps(p0); + is[2 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 2; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, 
rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate2HH_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + ru = 
_mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate2HH_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for 
(auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (4 * i + 4 * k + m); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate2HL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, 
uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate2HL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 4 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned 
j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate2LL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + 
row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate2LL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(6); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = 
(1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 4; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 4 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (4 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 4 == (p[j] / 2) % 4 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[4], is[4]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[4 * l] = _mm512_load_ps(p0); + is[4 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 4; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j 
+= 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate3HHH_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 
8; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate3HHH_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, 
emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = UnitarySpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (8 * i + 8 * k + m); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate3HHL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = 
matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate3HHL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = 
bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 8 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate3HLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = 
[](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate3HLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 8 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (8 * i + m); + 
+ for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate3LLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto 
p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate3LLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(7); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + 
unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 8; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 8 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (8 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 8 == (p[j] / 2) % 8 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[8], is[8]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[8 * l] = _mm512_load_ps(p0); + is[8 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 8; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = 
_mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4HHHH_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 ru, iu, rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 16; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + 
uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[n], ru, rn); + in = _mm512_fmadd_ps(rs[n], iu, in); + rn = _mm512_fnmadd_ps(is[n], iu, rn); + in = _mm512_fmadd_ps(is[n], ru, in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8 + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate4HHHH_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[4]; + uint64_t ms[5]; + + xs[0] = uint64_t{1} << (qs[0] + 1); + ms[0] = (uint64_t{1} << qs[0]) - 1; + for (unsigned i = 1; i < 4; ++i) { + xs[i] = uint64_t{1} << (qs[i + 0] + 1); + ms[i] = ((uint64_t{1} << qs[i + 0]) - 1) ^ (xs[i - 1] - 1); + } + ms[4] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[3] - 1); + + uint64_t xss[16]; + for (unsigned i = 0; i < 16; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 4; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t 
cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + emaskh |= uint64_t{1} << q; + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + + auto s = UnitarySpace::Create(12); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 16; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (16 * i + 16 * k + m); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 16; ++l) { + rs[l] = _mm512_load_ps(p0 + xss[l]); + is[l] = _mm512_load_ps(p0 + xss[l] + 16); + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 16; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 8 + 
cqs.size() - cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, size, raw_size, rstate); + } + + void ApplyControlledGate4HHHL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + 
for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4HHHL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[3]; + uint64_t ms[4]; + + xs[0] = uint64_t{1} << (qs[1] + 1); + ms[0] = (uint64_t{1} << qs[1]) - 1; + for (unsigned i = 1; i < 3; ++i) { + xs[i] = uint64_t{1} << (qs[i + 1] + 1); + ms[i] = ((uint64_t{1} << qs[i + 1]) - 1) ^ (xs[i - 1] - 1); + } + ms[3] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[2] - 1); + + uint64_t xss[8]; + for (unsigned i = 0; i < 8; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 3; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[1]; + + auto s = UnitarySpace::Create(11); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]); + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 2) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 8; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k 
= bits::CompressBits(j, 4, qmask); + p[j] = 2 * (32 * i + 16 * k + 2 * (m / 2) + (k + m) % 2); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 8; ++l) { + rs[2 * l] = _mm512_load_ps(p0 + xss[l]); + is[2 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 2; ++j) { + rs[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[2 * l]); + is[2 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[2 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 8; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 7 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4HHLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + 
wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4HHLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[2]; + uint64_t ms[3]; + + xs[0] = uint64_t{1} << (qs[2] + 1); + ms[0] = (uint64_t{1} << qs[2]) - 1; + for (unsigned i = 1; i < 2; ++i) { + xs[i] = uint64_t{1} << (qs[i + 2] + 1); + ms[i] = ((uint64_t{1} << qs[i + 2]) - 1) ^ (xs[i - 1] - 1); + } + ms[2] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[1] - 1); + + uint64_t xss[4]; + for (unsigned i = 0; i < 4; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 2; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[3]; + + auto s = UnitarySpace::Create(10); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]); + + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 4) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 4; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) 
{ + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (64 * i + 16 * k + 4 * (m / 4) + (k + m) % 4); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 4; ++l) { + rs[4 * l] = _mm512_load_ps(p0 + xss[l]); + is[4 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 4; ++j) { + rs[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[4 * l]); + is[4 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[4 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 4; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 6 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4HLLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } 
+ + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4HLLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t xs[1]; + uint64_t ms[2]; + + xs[0] = uint64_t{1} << (qs[3] + 1); + ms[0] = (uint64_t{1} << qs[3]) - 1; + ms[1] = ((uint64_t{1} << state.num_qubits()) - 1) ^ (xs[0] - 1); + + uint64_t xss[2]; + for (unsigned i = 0; i < 2; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < 1; ++k) { + if (((i >> k) & 1) == 1) { + a += xs[k]; + } + } + xss[i] = a; + } + + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[7]; + + auto s = UnitarySpace::Create(9); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]); + + for (unsigned i = 0; i < 7; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 8) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 2; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (128 * i + 16 * k + 8 * (m / 8) + (k + m) % 8); + } + + unsigned l = 
2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + const uint64_t* ms, const uint64_t* xss, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 2; ++l) { + rs[8 * l] = _mm512_load_ps(p0 + xss[l]); + is[8 * l] = _mm512_load_ps(p0 + xss[l] + 16); + + for (unsigned j = 1; j < 8; ++j) { + rs[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[8 * l]); + is[8 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[8 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 2; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0 + xss[l], rn); + _mm512_store_ps(p0 + xss[l] + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 5 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4LLLL_H(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + uint64_t emaskh = 0; + + for (auto q : cqs) { + emaskh |= uint64_t{1} << q; + } + + uint64_t cmaskh = bits::ExpandBits(cmask, state.num_qubits(), emaskh); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[15]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j] = matrix[p[j]]; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = matrix[p[j] + 1]; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, 
num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + void ApplyControlledGate4LLLL_L(const std::vector& qs, + const std::vector& cqs, + uint64_t cmask, const fp_type* matrix, + Unitary& state) const { + unsigned cl = 0; + uint64_t emaskl = 0; + uint64_t emaskh = 0; + + for (auto q : cqs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } else { + ++cl; + emaskl |= uint64_t{1} << q; + } + } + + uint64_t cmaskh = bits::ExpandBits(cmask >> cl, state.num_qubits(), emaskh); + uint64_t cmaskl = bits::ExpandBits(cmask & ((1 << cl) - 1), 4, emaskl); + + for (auto q : qs) { + if (q > 3) { + emaskh |= uint64_t{1} << q; + } + } + + emaskh = ~emaskh ^ 15; + + unsigned p[16]; + __m512i idx[15]; + + auto s = UnitarySpace::Create(8); + __m512* w = (__m512*) s.get(); + fp_type* wf = (fp_type*) w; + + unsigned qmask = (1 << qs[0]) | (1 << qs[1]) | (1 << qs[2]) | (1 << qs[3]); + + for (unsigned i = 0; i < 15; ++i) { + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd(j, i + 1, qmask, 16) | (j & (-1 ^ qmask)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + + for (unsigned i = 0; i < 1; ++i) { + for (unsigned m = 0; m < 16; ++m) { + for (unsigned j = 0; j < 16; ++j) { + unsigned k = bits::CompressBits(j, 4, qmask); + p[j] = 2 * (256 * i + 16 * k + 16 * (m / 16) + (k + m) % 16); + } + + unsigned l = 2 * (16 * i + m); + + for (unsigned j = 0; j < 16; ++j) { + fp_type v = (p[j] / 2) / 16 == (p[j] / 2) % 16 ? 1 : 0; + wf[16 * l + j] = cmaskl == (j & emaskl) ? matrix[p[j]] : v; + } + + for (unsigned j = 0; j < 16; ++j) { + wf[16 * l + j + 16] = cmaskl == (j & emaskl) ? 
matrix[p[j] + 1] : 0; + } + } + } + + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + unsigned num_qubits, uint64_t cmaskh, uint64_t emaskh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + __m512 rn, in; + __m512 rs[16], is[16]; + + uint64_t ii = i % size; + uint64_t r = i / size; + uint64_t c = bits::ExpandBits(ii, num_qubits, emaskh) | cmaskh; + + auto p0 = rstate + row_size * r + 2 * c; + + for (unsigned l = 0; l < 1; ++l) { + rs[16 * l] = _mm512_load_ps(p0); + is[16 * l] = _mm512_load_ps(p0 + 16); + + for (unsigned j = 1; j < 16; ++j) { + rs[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], rs[16 * l]); + is[16 * l + j] = _mm512_permutexvar_ps(idx[j - 1], is[16 * l]); + } + } + + uint64_t j = 0; + + for (unsigned l = 0; l < 1; ++l) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned n = 1; n < 16; ++n) { + rn = _mm512_fmadd_ps(rs[n], w[j], rn); + in = _mm512_fmadd_ps(rs[n], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[n], w[j + 1], rn); + in = _mm512_fmadd_ps(is[n], w[j], in); + + j += 2; + } + + _mm512_store_ps(p0, rn); + _mm512_store_ps(p0 + 16, in); + } + }; + + fp_type* rstate = state.get(); + + unsigned k = 4 + cqs.size() - cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, + state.num_qubits(), cmaskh, emaskh, idx, size, raw_size, rstate); + } + + static unsigned MaskedAdd( + unsigned a, unsigned b, unsigned mask, unsigned lsize) { + unsigned c = bits::CompressBits(a, 4, mask); + return bits::ExpandBits((c + b) % lsize, 4, mask); + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_AVX512_H_ diff --git a/lib/unitaryspace_avx512.h b/lib/unitaryspace_avx512.h new file mode 100644 index 000000000..ec1a6b5d1 --- /dev/null +++ b/lib/unitaryspace_avx512.h @@ -0,0 +1,110 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_AVX512_H_ +#define UNITARYSPACE_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * State is a vectorized sequence of sixteen real components followed by + * sixteen imaginary components. Sixteen single-precison floating numbers can + * be loaded into an AVX512 register. 
+ */ +template +struct UnitarySpaceAVX512 : + public UnitarySpace, For, float> { + private: + using Base = UnitarySpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m512 val0 = _mm512_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { + _mm512_store_ps(p + 32 * i, val0); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 16]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 16] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + 
(j % 16); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 16] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_AVX512_H_ diff --git a/tests/BUILD b/tests/BUILD index aac574b73..015464670 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -1,5 +1,6 @@ # Options for testing different simulator types. avx_copts = ['-mavx2', '-mfma'] +avx512_copts = ['-mavx512f'] sse_copts = ['-msse4'] windows_copts = [ @@ -7,6 +8,11 @@ windows_copts = [ "/std:c++14", ] +windows_avx512_copts = [ + "/arch:AVX512", + "/std:c++14", +] + config_setting( name = "windows", constraint_values = ["@bazel_tools//platforms:windows"], @@ -250,6 +256,22 @@ cc_test( }), ) +cc_test( + name = "simulator_avx512_test", + srcs = ["simulator_avx512_test.cc"], + deps = [ + ":simulator_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:parfor", + "//lib:seqfor", + "//lib:simulator_avx512", + ], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), +) + cc_test( name = "simulator_basic_test", srcs = ["simulator_basic_test.cc"], @@ -317,6 +339,23 @@ cc_test( }), ) +cc_test( + name = "statespace_avx512_test", + srcs = ["statespace_avx512_test.cc"], + deps = [ + ":statespace_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:parfor", + "//lib:seqfor", + "//lib:simulator_avx512", + "//lib:statespace_avx512", + ], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), +) + cc_test( name = "statespace_basic_test", srcs = ["statespace_basic_test.cc"], @@ -379,6 +418,21 @@ cc_test( ], ) +cc_test( + name = "unitaryspace_avx512_test", + srcs = ["unitaryspace_avx512_test.cc"], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), + deps = [ + ":unitaryspace_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:formux", + "//lib:unitaryspace_avx512" + ], +) + cc_test( name = 
"unitaryspace_basic_test", srcs = ["unitaryspace_basic_test.cc"], @@ -418,7 +472,6 @@ cc_library( }), deps = [ "@com_google_googletest//:gtest_main", - "//lib:bits", "//lib:fuser", "//lib:gate_appl", "//lib:gates_cirq", @@ -442,6 +495,22 @@ cc_test( ], ) +cc_test( + name = "unitary_calculator_avx512_test", + srcs = ["unitary_calculator_avx512_test.cc"], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), + deps = [ + ":unitary_calculator_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:formux", + "//lib:unitaryspace_avx512", + "//lib:unitary_calculator_avx512", + ], +) + cc_test( name = "unitary_calculator_basic_test", srcs = ["unitary_calculator_basic_test.cc"], diff --git a/tests/make.sh b/tests/make.sh index 63d4fcfde..6abf59860 100755 --- a/tests/make.sh +++ b/tests/make.sh @@ -33,15 +33,19 @@ g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o qtrajectory_t g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o run_qsim_test.x run_qsim_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o run_qsimh_test.x run_qsimh_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o simulator_avx_test.x simulator_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -fopenmp -o simulator_avx512_test.x simulator_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o simulator_basic_test.x simulator_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o simulator_sse_test.x simulator_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o statespace_avx_test.x statespace_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -fopenmp -o statespace_avx512_test.x statespace_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib 
-fopenmp -o statespace_basic_test.x statespace_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o statespace_sse_test.x statespace_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitary_calculator_avx_test.x unitary_calculator_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mfma -fopenmp -o unitary_calculator_avx512_test.x unitary_calculator_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitary_calculator_basic_test.x unitary_calculator_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitary_calculator_sse_test.x unitary_calculator_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -mavx2 -mfma -fopenmp -o unitaryspace_avx_test.x unitaryspace_avx_test.cc -lgtest -lpthread +g++ -O3 -I$path_to_include -L$path_to_lib -mavx512f -mfma -fopenmp -o unitaryspace_avx512_test.x unitaryspace_avx512_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitaryspace_basic_test.x unitaryspace_basic_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitaryspace_sse_test.x unitaryspace_sse_test.cc -lgtest -lpthread g++ -O3 -I$path_to_include -L$path_to_lib -o vectorspace_test.x vectorspace_test.cc -lgtest -lpthread diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc new file mode 100644 index 000000000..ec3b7b2ab --- /dev/null +++ b/tests/simulator_avx512_test.cc @@ -0,0 +1,84 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "simulator_testfixture.h" + +#include "gtest/gtest.h" + +#ifdef _OPENMP +#include "../lib/parfor.h" +#endif +#include "../lib/seqfor.h" +#include "../lib/simulator_avx512.h" + +namespace qsim { + +template +class SimulatorAVX512Test : public testing::Test {}; + +using ::testing::Types; +#ifdef _OPENMP +typedef Types for_impl; +#else +typedef Types for_impl; +#endif + +TYPED_TEST_SUITE(SimulatorAVX512Test, for_impl); + +TYPED_TEST(SimulatorAVX512Test, ApplyGate1) { + TestApplyGate1>(); +} + +TYPED_TEST(SimulatorAVX512Test, ApplyGate2) { + TestApplyGate2>(); +} + +TYPED_TEST(SimulatorAVX512Test, ApplyGate3) { + TestApplyGate3>(); +} + +TYPED_TEST(SimulatorAVX512Test, ApplyGate5) { + TestApplyGate5>(); +} + +TYPED_TEST(SimulatorAVX512Test, CircuitWithControlledGates) { + TestCircuitWithControlledGates>(); +} + +TYPED_TEST(SimulatorAVX512Test, CircuitWithControlledGatesDagger) { + TestCircuitWithControlledGatesDagger>(); +} + +TYPED_TEST(SimulatorAVX512Test, MultiQubitGates) { + TestMultiQubitGates>(); +} + +TYPED_TEST(SimulatorAVX512Test, ControlledGates) { + TestControlledGates>(false); +} + +TYPED_TEST(SimulatorAVX512Test, ExpectationValue1) { + TestExpectationValue1>(); +} + +TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) { + TestExpectationValue2>(); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc new file mode 100644 index 000000000..a7333f9e3 --- /dev/null +++ 
b/tests/statespace_avx512_test.cc @@ -0,0 +1,105 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "statespace_testfixture.h" + +#include "gtest/gtest.h" + +#ifdef _OPENMP +#include "../lib/parfor.h" +#endif +#include "../lib/seqfor.h" +#include "../lib/simulator_avx512.h" +#include "../lib/statespace_avx512.h" + +namespace qsim { + +template +class StateSpaceAVX512Test : public testing::Test {}; + +using ::testing::Types; +#ifdef _OPENMP +typedef Types for_impl; +#else +typedef Types for_impl; +#endif + +TYPED_TEST_SUITE(StateSpaceAVX512Test, for_impl); + +TYPED_TEST(StateSpaceAVX512Test, Add) { + TestAdd>(); +} + +TYPED_TEST(StateSpaceAVX512Test, NormSmall) { + TestNormSmall>(); +} + +TYPED_TEST(StateSpaceAVX512Test, NormAndInnerProductSmall) { + TestNormAndInnerProductSmall>(); +} + +TYPED_TEST(StateSpaceAVX512Test, NormAndInnerProduct) { + TestNormAndInnerProduct>(); +} + +TYPED_TEST(StateSpaceAVX512Test, SamplingSmall) { + TestSamplingSmall>(); +} + +TYPED_TEST(StateSpaceAVX512Test, SamplingCrossEntropyDifference) { + TestSamplingCrossEntropyDifference>(); +} + +TYPED_TEST(StateSpaceAVX512Test, Ordering) { + TestOrdering>(); +} + +TYPED_TEST(StateSpaceAVX512Test, MeasurementSmall) { + TestMeasurementSmall, TypeParam>(); +} + +TYPED_TEST(StateSpaceAVX512Test, MeasurementLarge) { + TestMeasurementLarge>(); +} + +TYPED_TEST(StateSpaceAVX512Test, Collapse) { + TestCollapse>(); +} + 
+TYPED_TEST(StateSpaceAVX512Test, InvalidStateSize) { + TestInvalidStateSize>(); +} + +TYPED_TEST(StateSpaceAVX512Test, BulkSetAmpl) { + TestBulkSetAmplitude>(); +} + +TYPED_TEST(StateSpaceAVX512Test, BulkSetAmplExclude) { + TestBulkSetAmplitudeExclusion>(); +} + +TYPED_TEST(StateSpaceAVX512Test, BulkSetAmplDefault) { + TestBulkSetAmplitudeDefault>(); +} + +TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) { + TestThreadThrashing>(); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/statespace_avx_test.cc b/tests/statespace_avx_test.cc index f2ae4e210..5bc1128ea 100644 --- a/tests/statespace_avx_test.cc +++ b/tests/statespace_avx_test.cc @@ -73,6 +73,10 @@ TYPED_TEST(StateSpaceAVXTest, MeasurementLarge) { TestMeasurementLarge>(); } +TYPED_TEST(StateSpaceAVXTest, Collapse) { + TestCollapse>(); +} + TYPED_TEST(StateSpaceAVXTest, InvalidStateSize) { TestInvalidStateSize>(); } diff --git a/tests/statespace_basic_test.cc b/tests/statespace_basic_test.cc index eb5c53069..6eaf6d38e 100644 --- a/tests/statespace_basic_test.cc +++ b/tests/statespace_basic_test.cc @@ -73,6 +73,10 @@ TYPED_TEST(StateSpaceBasicTest, MeasurementLarge) { TestMeasurementLarge>(); } +TYPED_TEST(StateSpaceBasicTest, Collapse) { + TestCollapse>(); +} + TYPED_TEST(StateSpaceBasicTest, InvalidStateSize) { TestInvalidStateSize>(); } diff --git a/tests/statespace_sse_test.cc b/tests/statespace_sse_test.cc index b1e0810bd..419976d0c 100644 --- a/tests/statespace_sse_test.cc +++ b/tests/statespace_sse_test.cc @@ -73,6 +73,10 @@ TYPED_TEST(StateSpaceSSETest, MeasurementLarge) { TestMeasurementLarge>(); } +TYPED_TEST(StateSpaceSSETest, Collapse) { + TestCollapse>(); +} + TYPED_TEST(StateSpaceSSETest, InvalidStateSize) { TestInvalidStateSize>(); } diff --git a/tests/statespace_testfixture.h b/tests/statespace_testfixture.h index 6dff358cd..265afe20c 100644 --- a/tests/statespace_testfixture.h +++ 
b/tests/statespace_testfixture.h @@ -359,13 +359,16 @@ void TestAdd() { constexpr unsigned num_qubits = 2; StateSpace state_space(1); + State state1 = state_space.Create(num_qubits); + state_space.SetAllZeros(state1); state_space.SetAmpl(state1, 0, 1, 2); state_space.SetAmpl(state1, 1, 3, 4); state_space.SetAmpl(state1, 2, 5, 6); state_space.SetAmpl(state1, 3, 7, 8); State state2 = state_space.Create(num_qubits); + state_space.SetAllZeros(state2); state_space.SetAmpl(state2, 0, 1, 2); state_space.SetAmpl(state2, 1, 3, 4); state_space.SetAmpl(state2, 2, 5, 6); @@ -528,6 +531,8 @@ void TestSamplingSmall() { EXPECT_FALSE(state_space.IsNull(state)); + state_space.SetAllZeros(state); + std::array ps = {0.1, 0.2, 0.13, 0.12, 0.18, 0.15, 0.07, 0.05}; for (uint64_t i = 0; i < size; ++i) { @@ -597,31 +602,33 @@ void TestOrdering() { using fp_type = typename StateSpace::fp_type; using State = typename StateSpace::State; - for (unsigned num_qubits : {1, 2, 5}) { + for (unsigned num_qubits : {1, 2, 3, 4, 5, 6, 7, 8, 9}) { uint64_t size = uint64_t{1} << num_qubits; StateSpace state_space(1); - uint64_t raw_size = state_space.MinSize(num_qubits); - std::vector vec(raw_size, 0); - State state = state_space.Create(vec.data(), num_qubits); + State state = state_space.Create(num_qubits); + + state_space.SetAllZeros(state); for (uint64_t i = 0; i < size; ++i) { - state_space.SetAmpl(state, i, std::complex(i, size + i)); + state_space.SetAmpl(state, i, std::complex(i + 1, size + i + 1)); } state_space.InternalToNormalOrder(state); + const fp_type* vec = state.get(); + for (uint64_t i = 0; i < size; ++i) { - EXPECT_NEAR(vec[2 * i], fp_type(i), 1e-8); - EXPECT_NEAR(vec[2 * i + 1], fp_type(size + i), 1e-8); + EXPECT_NEAR(vec[2 * i], fp_type(i + 1), 1e-8); + EXPECT_NEAR(vec[2 * i + 1], fp_type(size + i + 1), 1e-8); } state_space.NormalToInternalOrder(state); for (uint64_t i = 0; i < size; ++i) { auto a = state_space.GetAmpl(state, i); - EXPECT_NEAR(std::real(a), fp_type(i), 1e-8); - 
EXPECT_NEAR(std::imag(a), fp_type(size + i), 1e-8); + EXPECT_NEAR(std::real(a), fp_type(i + 1), 1e-8); + EXPECT_NEAR(std::imag(a), fp_type(size + i + 1), 1e-8); } } } @@ -716,6 +723,12 @@ void TestMeasurementSmall() { MeasureSmall(num_measurements, 1, 3, {2}, ps3, rgen); MeasureSmall(num_measurements, 1, 3, {0, 1, 2}, ps3, rgen); + std::vector ps4 = { + 0.06, 0.10, 0.07, 0.06, 0.09, 0.08, 0.03, 0.03, + 0.03, 0.07, 0.11, 0.04, 0.05, 0.06, 0.07, 0.05 + }; + MeasureSmall(num_measurements, 1, 4, {0, 3}, ps4, rgen); + std::vector ps5 = { 0.041, 0.043, 0.028, 0.042, 0.002, 0.008, 0.039, 0.020, 0.017, 0.030, 0.020, 0.048, 0.020, 0.044, 0.032, 0.048, @@ -788,6 +801,53 @@ void TestMeasurementLarge() { } } +template +void TestCollapse() { + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + using MeasurementResult = typename StateSpace::MeasurementResult; + + MeasurementResult mr; + StateSpace state_space(1); + + for (unsigned num_qubits = 2; num_qubits <= 6; ++num_qubits) { + State state = state_space.Create(num_qubits); + + unsigned size = 1 << num_qubits; + + for (unsigned mask = 1; mask < size; ++mask) { + for (unsigned bits = 0; bits < size; ++bits) { + if ((mask | bits) != mask) continue; + + mr.mask = mask; + mr.bits = bits; + + unsigned num_measured_qubits = 0; + for (unsigned i = 0; i < num_qubits; ++i) { + if (((mask >> i) & 1) == 1) { + ++num_measured_qubits; + } + } + + fp_type r = 1.0 / std::sqrt(1 << (num_qubits - num_measured_qubits)); + + state_space.SetStateUniform(state); + state_space.Collapse(mr, state); + + EXPECT_NEAR(state_space.Norm(state), 1.0, 1e-6); + + for (unsigned i = 0; i < size; ++i) { + if ((i & mask) != bits) { + EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(0, 0)); + } else { + EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(r, 0)); + } + } + } + } + } +} + template void TestInvalidStateSize() { using State = typename StateSpace::State; @@ -812,123 +872,69 @@ void TestInvalidStateSize() 
{ template void TestBulkSetAmplitude() { using State = typename StateSpace::State; - unsigned num_qubits = 3; StateSpace state_space(1); - State state = state_space.Create(num_qubits); - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); - } - state_space.BulkSetAmpl(state, 1, 0, 0, 0, false); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(1, 1)); + for (unsigned num_qubits = 2; num_qubits <= 6; ++num_qubits) { + State state = state_space.Create(num_qubits); + state_space.SetAllZeros(state); - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); - } - state_space.BulkSetAmpl(state, 2, 0, 0, 0, false); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(1, 1)); + unsigned size = 1 << num_qubits; - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); - } - state_space.BulkSetAmpl(state, 4, 0, 0, 0, false); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(0, 0)); - 
EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(1, 1)); + for (unsigned mask = 1; mask < size; ++mask) { + for (unsigned bits = 0; bits < size; ++bits) { + if ((mask | bits) != mask) continue; - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); + for (unsigned i = 0; i < size; ++i) { + state_space.SetAmpl(state, i, 1, 1); + } + + state_space.BulkSetAmpl(state, mask, bits, 0, 0, false); + + for (unsigned i = 0; i < size; ++i) { + if ((i & mask) == bits) { + EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(0, 0)); + } else { + EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(1, 1)); + } + } + } + } } - state_space.BulkSetAmpl(state, 4 | 1, 4, 0, 0, false); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(1, 1)); } template void TestBulkSetAmplitudeExclusion() { using State = typename StateSpace::State; - unsigned num_qubits = 3; StateSpace state_space(1); - State state = state_space.Create(num_qubits); - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); - } - state_space.BulkSetAmpl(state, 1, 0, 0, 0, true); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(1, 1)); - 
EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(0, 0)); + for (unsigned num_qubits = 2; num_qubits <= 6; ++num_qubits) { + State state = state_space.Create(num_qubits); + state_space.SetAllZeros(state); - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); - } - state_space.BulkSetAmpl(state, 2, 0, 0, 0, true); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(0, 0)); + unsigned size = 1 << num_qubits; - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); - } - state_space.BulkSetAmpl(state, 4, 0, 0, 0, true); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(0, 0)); + for (unsigned mask = 1; mask < size; ++mask) { + for (unsigned bits = 0; bits < size; ++bits) { + if ((mask | bits) != mask) continue; - for(int i = 0; i < 8; i++) { - state_space.SetAmpl(state, i, 1, 1); + for (unsigned 
i = 0; i < size; ++i) { + state_space.SetAmpl(state, i, 1, 1); + } + + state_space.BulkSetAmpl(state, mask, bits, 0, 0, true); + + for (unsigned i = 0; i < size; ++i) { + if ((i & mask) != bits) { + EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(0, 0)); + } else { + EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(1, 1)); + } + } + } + } } - state_space.BulkSetAmpl(state, 4 | 1, 4, 0, 0, true); - EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 1), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 2), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 3), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 4), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 5), std::complex(0, 0)); - EXPECT_EQ(state_space.GetAmpl(state, 6), std::complex(1, 1)); - EXPECT_EQ(state_space.GetAmpl(state, 7), std::complex(0, 0)); } template @@ -939,7 +945,9 @@ void TestBulkSetAmplitudeDefault() { StateSpace state_space(1); State state = state_space.Create(num_qubits); - for(int i = 0; i < 8; i++) { + state_space.SetAllZeros(state); + + for (unsigned i = 0; i < 8; ++i) { state_space.SetAmpl(state, i, 1, 1); } state_space.BulkSetAmpl(state, 4 | 1, 4, 0, 0); @@ -957,13 +965,17 @@ template void TestThreadThrashing() { using State = typename StateSpace::State; StateSpace state_space(1024); - unsigned n_qubits = 13; - State state = state_space.Create(n_qubits); // must be larger than MIN_SIZE. - for (int i = 0; i < 100; i++) { + unsigned num_qubits = 13; + + State state = state_space.Create(num_qubits); // must be larger than MIN_SIZE. 
+ for (unsigned i = 0; i < 100; ++i) { state_space.SetStateZero(state); } + EXPECT_EQ(state_space.GetAmpl(state, 0), std::complex(1, 0)); - for (int i = 1; i < (1 << n_qubits); i++) { + + unsigned size = 1 << num_qubits; + for (unsigned i = 1; i < size; ++i) { EXPECT_EQ(state_space.GetAmpl(state, i), std::complex(0, 0)); } } diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc new file mode 100644 index 000000000..24381b69a --- /dev/null +++ b/tests/unitary_calculator_avx512_test.cc @@ -0,0 +1,65 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "unitary_calculator_testfixture.h" + +#include "gtest/gtest.h" + +#include "../lib/formux.h" +#include "../lib/unitary_calculator_avx512.h" + +namespace qsim { +namespace unitary { +namespace { + +TEST(UnitaryCalculatorAVX512Test, ApplyGate1) { + TestApplyGate1>(); +} + +TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate1) { + TestApplyControlledGate1>(); +} + +TEST(UnitaryCalculatorAVX512Test, ApplyGate2) { + TestApplyGate2>(); +} + +TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate2) { + TestApplyControlledGate2>(); +} + +TEST(UnitaryCalculatorAVX512Test, ApplyFusedGate) { + TestApplyFusedGate>(); +} + +TEST(UnitaryCalculatorAVX512Test, ApplyGates) { + TestApplyGates>(false); +} + +TEST(UnitaryCalculatorAVX512Test, ApplyControlledGates) { + TestApplyControlledGates>(false); +} + +TEST(UnitaryCalculatorAVX512Test, SmallCircuits) { + TestSmallCircuits>(); +} + +} // namespace +} // namespace unitary +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/unitary_calculator_testfixture.h b/tests/unitary_calculator_testfixture.h index b6a5880e3..3b7707c38 100644 --- a/tests/unitary_calculator_testfixture.h +++ b/tests/unitary_calculator_testfixture.h @@ -20,7 +20,6 @@ #include #include -#include "../lib/bits.h" #include "../lib/fuser.h" #include "../lib/gate_appl.h" #include "../lib/gates_cirq.h" @@ -530,7 +529,7 @@ void TestApplyControlledGates(bool test_double) { using UnitarySpace = typename UnitaryCalculator::UnitarySpace; using fp_type = typename UnitaryCalculator::fp_type; - unsigned max_qubits = 4 + std::log2(UnitaryCalculator::SIMDRegisterSize()); + unsigned max_qubits = 3 + std::log2(UnitaryCalculator::SIMDRegisterSize()); unsigned max_target_qubits = 3; unsigned max_control_qubits = 2; diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc new file mode 100644 index 000000000..61260d678 --- /dev/null +++ 
b/tests/unitaryspace_avx512_test.cc @@ -0,0 +1,46 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "unitaryspace_testfixture.h" + +#include "gtest/gtest.h" + +#include "../lib/formux.h" +#include "../lib/unitaryspace_avx512.h" + +namespace qsim { + +namespace unitary { +namespace { + +TEST(UnitarySpaceAVX512Test, SetZero) { + TestSetZeros>(); +} + +TEST(UnitarySpaceAVX512Test, SetIdentity) { + TestSetIdentity>(); +} + +TEST(UnitarySpaceAVX512Test, GetEntry) { + TestSetEntry>(); +} + +} // namspace +} // namespace unitary +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 442b6ba323e701b5387cacade24abfca9a80b363 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Tue, 20 Apr 2021 18:35:33 +0200 Subject: [PATCH 02/10] Add support for avx512. 
--- .bazelrc | 4 ++++ .github/workflows/bazeltest.yml | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.bazelrc b/.bazelrc index 79fe3007b..a2895b8bc 100644 --- a/.bazelrc +++ b/.bazelrc @@ -29,6 +29,10 @@ build:avx --copt -O3 build:avx --copt -mavx2 build:avx --copt -mfma +# Build with AVX512 +build:avx512 --copt -O3 +build:avx512 --copt -mavx512f + # Build with SSE build:sse --copt -O3 build:sse --copt -msse4 diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml index 1bbfa1592..018da315b 100644 --- a/.github/workflows/bazeltest.yml +++ b/.github/workflows/bazeltest.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: # Hardware optimizers. - hardware_opt: [avx,sse,basic] + hardware_opt: [avx,avx512,sse,basic] # Optimizers for parallelism. parallel_opt: [openmp,nopenmp] @@ -58,7 +58,7 @@ jobs: run: | bazel test --config=avx --config=openmp \ --config=${{ matrix.sanitizer_opt }} tests:all - + test-mem: name: Test with tcmalloc runs-on: ubuntu-16.04 From f87657d421afe0d37c5778194eae739d591524f5 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Tue, 20 Apr 2021 18:42:10 +0200 Subject: [PATCH 03/10] Add support for avx512. 
--- lib/statespace_avx512.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/lib/statespace_avx512.h b/lib/statespace_avx512.h index 3b98250be..07b7485fc 100644 --- a/lib/statespace_avx512.h +++ b/lib/statespace_avx512.h @@ -47,8 +47,24 @@ inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) { return (m2 << 8) | m1; } +inline double HorizontalSumAVX(__m256 s) { + __m128 l = _mm256_castps256_ps128(s); + __m128 h = _mm256_extractf128_ps(s, 1); + __m128 s1 = _mm_add_ps(h, l); + __m128 s1s = _mm_movehdup_ps(s1); + __m128 s2 = _mm_add_ps(s1, s1s); + + return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); +} + inline double HorizontalSumAVX512(__m512 s) { - return _mm512_reduce_add_ps(s); + __m256 l = _mm512_castps512_ps256(s); + __m512d sd = _mm512_castps_pd(s); + __m256d hd = _mm512_extractf64x4_pd(sd, 1); + __m256 h = _mm256_castpd_ps(hd); + __m256 p = _mm256_add_ps(h, l); + + return HorizontalSumAVX(p); } } // namespace detail From 34bbd5666b579a68eba1c12d27df406586eae836 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Tue, 20 Apr 2021 19:17:41 +0200 Subject: [PATCH 04/10] Add support for avx512. 
--- .github/workflows/bazeltest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml index 018da315b..6064db68a 100644 --- a/.github/workflows/bazeltest.yml +++ b/.github/workflows/bazeltest.yml @@ -56,7 +56,7 @@ jobs: sudo dpkg -i bazel_0.26.0-linux-x86_64.deb - name: Run C++ tests run: | - bazel test --config=avx --config=openmp \ + bazel test --config=avx512 --config=openmp \ --config=${{ matrix.sanitizer_opt }} tests:all test-mem: @@ -75,5 +75,5 @@ jobs: run: sudo apt-get install libgoogle-perftools-dev - name: Run C++ tests run: | - bazel test --config=avx --config=openmp \ + bazel test --config=avx512 --config=openmp \ --config=tcmalloc tests:all From f4bda1a80e2e4cfcb66ca3b19e2b5d94fe8d1171 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Wed, 21 Apr 2021 13:01:47 +0200 Subject: [PATCH 05/10] Remove avx512 tests from BUILD and config files. --- .bazelrc | 4 -- .github/workflows/bazeltest.yml | 8 ++-- tests/BUILD | 70 --------------------------------- 3 files changed, 4 insertions(+), 78 deletions(-) diff --git a/.bazelrc b/.bazelrc index a2895b8bc..79fe3007b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -29,10 +29,6 @@ build:avx --copt -O3 build:avx --copt -mavx2 build:avx --copt -mfma -# Build with AVX512 -build:avx512 --copt -O3 -build:avx512 --copt -mavx512f - # Build with SSE build:sse --copt -O3 build:sse --copt -msse4 diff --git a/.github/workflows/bazeltest.yml b/.github/workflows/bazeltest.yml index 6064db68a..1bbfa1592 100644 --- a/.github/workflows/bazeltest.yml +++ b/.github/workflows/bazeltest.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: # Hardware optimizers. - hardware_opt: [avx,avx512,sse,basic] + hardware_opt: [avx,sse,basic] # Optimizers for parallelism. 
parallel_opt: [openmp,nopenmp] @@ -56,9 +56,9 @@ jobs: sudo dpkg -i bazel_0.26.0-linux-x86_64.deb - name: Run C++ tests run: | - bazel test --config=avx512 --config=openmp \ + bazel test --config=avx --config=openmp \ --config=${{ matrix.sanitizer_opt }} tests:all - + test-mem: name: Test with tcmalloc runs-on: ubuntu-16.04 @@ -75,5 +75,5 @@ jobs: run: sudo apt-get install libgoogle-perftools-dev - name: Run C++ tests run: | - bazel test --config=avx512 --config=openmp \ + bazel test --config=avx --config=openmp \ --config=tcmalloc tests:all diff --git a/tests/BUILD b/tests/BUILD index 015464670..d15286f96 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -1,6 +1,5 @@ # Options for testing different simulator types. avx_copts = ['-mavx2', '-mfma'] -avx512_copts = ['-mavx512f'] sse_copts = ['-msse4'] windows_copts = [ @@ -8,11 +7,6 @@ windows_copts = [ "/std:c++14", ] -windows_avx512_copts = [ - "/arch:AVX512", - "/std:c++14", -] - config_setting( name = "windows", constraint_values = ["@bazel_tools//platforms:windows"], @@ -256,22 +250,6 @@ cc_test( }), ) -cc_test( - name = "simulator_avx512_test", - srcs = ["simulator_avx512_test.cc"], - deps = [ - ":simulator_testfixture", - "@com_google_googletest//:gtest_main", - "//lib:parfor", - "//lib:seqfor", - "//lib:simulator_avx512", - ], - copts = select({ - ":windows": windows_avx512_copts, - "//conditions:default": avx512_copts, - }), -) - cc_test( name = "simulator_basic_test", srcs = ["simulator_basic_test.cc"], @@ -339,23 +317,6 @@ cc_test( }), ) -cc_test( - name = "statespace_avx512_test", - srcs = ["statespace_avx512_test.cc"], - deps = [ - ":statespace_testfixture", - "@com_google_googletest//:gtest_main", - "//lib:parfor", - "//lib:seqfor", - "//lib:simulator_avx512", - "//lib:statespace_avx512", - ], - copts = select({ - ":windows": windows_avx512_copts, - "//conditions:default": avx512_copts, - }), -) - cc_test( name = "statespace_basic_test", srcs = ["statespace_basic_test.cc"], @@ -418,21 +379,6 @@ cc_test( ], 
) -cc_test( - name = "unitaryspace_avx512_test", - srcs = ["unitaryspace_avx512_test.cc"], - copts = select({ - ":windows": windows_avx512_copts, - "//conditions:default": avx512_copts, - }), - deps = [ - ":unitaryspace_testfixture", - "@com_google_googletest//:gtest_main", - "//lib:formux", - "//lib:unitaryspace_avx512" - ], -) - cc_test( name = "unitaryspace_basic_test", srcs = ["unitaryspace_basic_test.cc"], @@ -495,22 +441,6 @@ cc_test( ], ) -cc_test( - name = "unitary_calculator_avx512_test", - srcs = ["unitary_calculator_avx512_test.cc"], - copts = select({ - ":windows": windows_avx512_copts, - "//conditions:default": avx512_copts, - }), - deps = [ - ":unitary_calculator_testfixture", - "@com_google_googletest//:gtest_main", - "//lib:formux", - "//lib:unitaryspace_avx512", - "//lib:unitary_calculator_avx512", - ], -) - cc_test( name = "unitary_calculator_basic_test", srcs = ["unitary_calculator_basic_test.cc"], From 3c9e463dc9508ef6a6f9ff31b376550f07bfb5e2 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Wed, 21 Apr 2021 13:20:47 +0200 Subject: [PATCH 06/10] Remove avx512 tests from tests/Makefile. --- tests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Makefile b/tests/Makefile index cc2c8299b..087af58be 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,4 +1,4 @@ -TARGETS = $(shell find . -maxdepth 1 -name '*_test.cc') +TARGETS = $(shell find . -maxdepth 1 -name '*_test.cc' ! -name '*512*') TARGETS := $(TARGETS:%.cc=%.x) GTEST_DIR = $(CURDIR)/googletest/googletest From d3273ba97003b85463ae606f02ccd0e130799c01 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Thu, 22 Apr 2021 15:13:05 +0200 Subject: [PATCH 07/10] Fix excessive memory usage in unitary_calculator_*.h. 
--- lib/unitary_calculator_avx.h | 74 ++++++++++++++--------------- lib/unitary_calculator_avx512.h | 84 ++++++++++++++++----------------- lib/unitary_calculator_sse.h | 58 +++++++++++------------ 3 files changed, 108 insertions(+), 108 deletions(-) diff --git a/lib/unitary_calculator_avx.h b/lib/unitary_calculator_avx.h index a01612d83..519ff26c1 100644 --- a/lib/unitary_calculator_avx.h +++ b/lib/unitary_calculator_avx.h @@ -322,7 +322,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -513,7 +513,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -609,7 +609,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -805,7 +805,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -919,7 +919,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1015,7 +1015,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1211,7 +1211,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1330,7 +1330,7 @@ class UnitaryCalculatorAVX 
final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1444,7 +1444,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1643,7 +1643,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(12); + auto s = UnitarySpace::Create(6); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1762,7 +1762,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -1881,7 +1881,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -2081,7 +2081,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(14); + auto s = UnitarySpace::Create(7); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -2200,7 +2200,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(13); + auto s = UnitarySpace::Create(7); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -2319,7 +2319,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(12); + auto s = UnitarySpace::Create(6); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -2551,7 +2551,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -2654,7 +2654,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; 
- auto s = UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -2777,7 +2777,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3018,7 +3018,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3139,7 +3139,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3281,7 +3281,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3398,7 +3398,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3521,7 +3521,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3762,7 +3762,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -3887,7 +3887,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4033,7 +4033,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(8); + auto s = 
UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4168,7 +4168,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4310,7 +4310,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4427,7 +4427,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4550,7 +4550,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4791,7 +4791,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -4916,7 +4916,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -5062,7 +5062,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[1]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -5201,7 +5201,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -5347,7 +5347,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[3]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m256* w = 
(__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -5482,7 +5482,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; @@ -5624,7 +5624,7 @@ class UnitaryCalculatorAVX final { unsigned p[8]; __m256i idx[7]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m256* w = (__m256*) s.get(); fp_type* wf = (fp_type*) w; diff --git a/lib/unitary_calculator_avx512.h b/lib/unitary_calculator_avx512.h index 174f44af1..57ba64848 100644 --- a/lib/unitary_calculator_avx512.h +++ b/lib/unitary_calculator_avx512.h @@ -334,7 +334,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -527,7 +527,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -625,7 +625,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -823,7 +823,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -939,7 +939,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -1037,7 +1037,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) 
w; @@ -1235,7 +1235,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -1356,7 +1356,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -1472,7 +1472,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -1570,7 +1570,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[15]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -1768,7 +1768,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(13); + auto s = UnitarySpace::Create(7); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -1889,7 +1889,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(12); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -2010,7 +2010,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -2126,7 +2126,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[15]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -2328,7 +2328,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(15); + auto s = UnitarySpace::Create(8); __m512* w = (__m512*) s.get(); fp_type* 
wf = (fp_type*) w; @@ -2449,7 +2449,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(14); + auto s = UnitarySpace::Create(7); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -2570,7 +2570,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(13); + auto s = UnitarySpace::Create(7); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -2691,7 +2691,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[15]; - auto s = UnitarySpace::Create(12); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -2924,7 +2924,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -3027,7 +3027,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -3152,7 +3152,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -3395,7 +3395,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -3516,7 +3516,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -3660,7 +3660,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; 
@@ -3779,7 +3779,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -3904,7 +3904,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4147,7 +4147,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4272,7 +4272,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4420,7 +4420,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4557,7 +4557,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4701,7 +4701,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4820,7 +4820,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -4945,7 +4945,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ 
-5188,7 +5188,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; - auto s = UnitarySpace::Create(12); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -5313,7 +5313,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -5461,7 +5461,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[1]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -5602,7 +5602,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -5750,7 +5750,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[3]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -5887,7 +5887,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -6031,7 +6031,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[7]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -6150,7 +6150,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[15]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; @@ -6275,7 +6275,7 @@ class UnitaryCalculatorAVX512 final { unsigned p[16]; __m512i idx[15]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m512* w = (__m512*) s.get(); fp_type* wf = (fp_type*) w; diff 
--git a/lib/unitary_calculator_sse.h b/lib/unitary_calculator_sse.h index 89bb21d20..2e458526e 100644 --- a/lib/unitary_calculator_sse.h +++ b/lib/unitary_calculator_sse.h @@ -301,7 +301,7 @@ class UnitaryCalculatorSSE final { const fp_type* matrix, Unitary& state) const { unsigned p[4]; - auto s = UnitarySpace::Create(3); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -483,7 +483,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -570,7 +570,7 @@ class UnitaryCalculatorSSE final { const fp_type* matrix, Unitary& state) const { unsigned p[4]; - auto s = UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -758,7 +758,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -863,7 +863,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1054,7 +1054,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1164,7 +1164,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1355,7 +1355,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(11); + auto s = UnitarySpace::Create(6); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1465,7 +1465,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(10); + auto s 
= UnitarySpace::Create(5); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1658,7 +1658,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(13); + auto s = UnitarySpace::Create(7); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1768,7 +1768,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(12); + auto s = UnitarySpace::Create(6); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -1993,7 +1993,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2095,7 +2095,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(3); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2209,7 +2209,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(3); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2442,7 +2442,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2562,7 +2562,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2695,7 +2695,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(5); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2803,7 +2803,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -2918,7 +2918,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = 
UnitarySpace::Create(4); + auto s = UnitarySpace::Create(2); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -3152,7 +3152,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -3276,7 +3276,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -3413,7 +3413,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(7); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -3539,7 +3539,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -3673,7 +3673,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(6); + auto s = UnitarySpace::Create(3); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -3908,7 +3908,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(10); + auto s = UnitarySpace::Create(5); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -4032,7 +4032,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -4169,7 +4169,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(9); + auto s = UnitarySpace::Create(5); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -4299,7 +4299,7 @@ class UnitaryCalculatorSSE final { unsigned p[4]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; @@ -4437,7 +4437,7 @@ class UnitaryCalculatorSSE 
final { unsigned p[4]; - auto s = UnitarySpace::Create(8); + auto s = UnitarySpace::Create(4); __m128* w = (__m128*) s.get(); fp_type* wf = (fp_type*) w; From 09e9607cc428f4df13cd6a3ee9846948cc729fd4 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Fri, 23 Apr 2021 19:24:12 +0200 Subject: [PATCH 08/10] Add runtime detection of AVX512 in tests. --- tests/BUILD | 79 +++++++++++++++++++++++++ tests/Makefile | 9 ++- tests/cpuinfo.h | 42 +++++++++++++ tests/simulator_avx512_test.cc | 10 +++- tests/statespace_avx512_test.cc | 10 +++- tests/unitary_calculator_avx512_test.cc | 26 +++++--- tests/unitaryspace_avx512_test.cc | 17 ++++-- 7 files changed, 178 insertions(+), 15 deletions(-) create mode 100644 tests/cpuinfo.h diff --git a/tests/BUILD b/tests/BUILD index d15286f96..531a4f255 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -1,5 +1,6 @@ # Options for testing different simulator types. avx_copts = ['-mavx2', '-mfma'] +avx512_copts = ['-mavx512f'] sse_copts = ['-msse4'] windows_copts = [ @@ -7,6 +8,11 @@ windows_copts = [ "/std:c++14", ] +windows_avx512_copts = [ + "/arch:AVX512", + "/std:c++14", +] + config_setting( name = "windows", constraint_values = ["@bazel_tools//platforms:windows"], @@ -250,6 +256,23 @@ cc_test( }), ) +cc_test( + name = "simulator_avx512_test", + srcs = ["simulator_avx512_test.cc"], + deps = [ + ":cpuinfo", + ":simulator_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:parfor", + "//lib:seqfor", + "//lib:simulator_avx512", + ], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), +) + cc_test( name = "simulator_basic_test", srcs = ["simulator_basic_test.cc"], @@ -317,6 +340,24 @@ cc_test( }), ) +cc_test( + name = "statespace_avx512_test", + srcs = ["statespace_avx512_test.cc"], + deps = [ + ":cpuinfo", + ":statespace_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:parfor", + "//lib:seqfor", + "//lib:simulator_avx512", + "//lib:statespace_avx512", + ], + copts = 
select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), +) + cc_test( name = "statespace_basic_test", srcs = ["statespace_basic_test.cc"], @@ -379,6 +420,22 @@ cc_test( ], ) +cc_test( + name = "unitaryspace_avx512_test", + srcs = ["unitaryspace_avx512_test.cc"], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), + deps = [ + ":cpuinfo", + ":unitaryspace_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:formux", + "//lib:unitaryspace_avx512" + ], +) + cc_test( name = "unitaryspace_basic_test", srcs = ["unitaryspace_basic_test.cc"], @@ -441,6 +498,23 @@ cc_test( ], ) +cc_test( + name = "unitary_calculator_avx512_test", + srcs = ["unitary_calculator_avx512_test.cc"], + copts = select({ + ":windows": windows_avx512_copts, + "//conditions:default": avx512_copts, + }), + deps = [ + ":cpuinfo", + ":unitary_calculator_testfixture", + "@com_google_googletest//:gtest_main", + "//lib:formux", + "//lib:unitaryspace_avx512", + "//lib:unitary_calculator_avx512", + ], +) + cc_test( name = "unitary_calculator_basic_test", srcs = ["unitary_calculator_basic_test.cc"], @@ -486,3 +560,8 @@ cc_test( "//lib:vectorspace", ], ) + +cc_library( + name = "cpuinfo", + hdrs = ["cpuinfo.h"], +) diff --git a/tests/Makefile b/tests/Makefile index 087af58be..016771e48 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,4 +1,11 @@ -TARGETS = $(shell find . -maxdepth 1 -name '*_test.cc' ! -name '*512*') +HAVE_AVX512 = $(shell grep avx512f /proc/cpuinfo) +TARGETS = $(shell\ + if [ -z "$(HAVE_AVX512)" ] ; then\ + find . -maxdepth 1 -name "*_test.cc" ! -name "*512*";\ + else\ + find . -maxdepth 1 -name "*_test.cc";\ + fi\ +) TARGETS := $(TARGETS:%.cc=%.x) GTEST_DIR = $(CURDIR)/googletest/googletest diff --git a/tests/cpuinfo.h b/tests/cpuinfo.h new file mode 100644 index 000000000..3f1f68689 --- /dev/null +++ b/tests/cpuinfo.h @@ -0,0 +1,42 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPUINFO_H_ +#define CPUINFO_H_ + +#ifdef _WIN32 +#include +#else +#include +#endif + +namespace qsim { + +inline bool HaveAVX512() { + unsigned info[4]; + +#ifdef _WIN32 + __cpuidex(info, 7, 0); +#else + if (__get_cpuid_count(7, 0, info, info + 1, info + 2, info + 3) == 0) { + info[1] = 0; + } +#endif + + return (info[1] & (unsigned{1} << 16)) != 0; +} + +} // namespace qsim + +#endif // CPUINFO_H_ diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc index ec3b7b2ab..e5c57f854 100644 --- a/tests/simulator_avx512_test.cc +++ b/tests/simulator_avx512_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "simulator_testfixture.h" +#include "cpuinfo.h" #include "gtest/gtest.h" @@ -25,7 +26,14 @@ namespace qsim { template -class SimulatorAVX512Test : public testing::Test {}; +class SimulatorAVX512Test : public testing::Test { + protected: + void SetUp() override { + if (!HaveAVX512()) { + GTEST_SKIP() << "Skipping all AVX512 tests."; + } + } +}; using ::testing::Types; #ifdef _OPENMP diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc index a7333f9e3..3b8cd34ed 100644 --- a/tests/statespace_avx512_test.cc +++ b/tests/statespace_avx512_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "statespace_testfixture.h" +#include "cpuinfo.h" #include "gtest/gtest.h" @@ -26,7 +27,14 @@ namespace qsim { template -class StateSpaceAVX512Test : public testing::Test {}; +class StateSpaceAVX512Test : public testing::Test { + protected: + void SetUp() override { + if (!HaveAVX512()) { + GTEST_SKIP() << "Skipping all AVX512 tests."; + } + } +}; using ::testing::Types; #ifdef _OPENMP diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc index 24381b69a..4c9314171 100644 --- a/tests/unitary_calculator_avx512_test.cc +++ b/tests/unitary_calculator_avx512_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "unitary_calculator_testfixture.h" +#include "cpuinfo.h" #include "gtest/gtest.h" @@ -23,35 +24,44 @@ namespace qsim { namespace unitary { namespace { -TEST(UnitaryCalculatorAVX512Test, ApplyGate1) { +class UnitaryCalculatorAVX512Test : public testing::Test { + protected: + void SetUp() override { + if (!HaveAVX512()) { + GTEST_SKIP() << "Skipping all AVX512 tests."; + } + } +}; + +TEST_F(UnitaryCalculatorAVX512Test, ApplyGate1) { TestApplyGate1>(); } -TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate1) { +TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate1) { TestApplyControlledGate1>(); } -TEST(UnitaryCalculatorAVX512Test, ApplyGate2) { +TEST_F(UnitaryCalculatorAVX512Test, ApplyGate2) { TestApplyGate2>(); } -TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate2) { +TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate2) { TestApplyControlledGate2>(); } -TEST(UnitaryCalculatorAVX512Test, ApplyFusedGate) { +TEST_F(UnitaryCalculatorAVX512Test, ApplyFusedGate) { TestApplyFusedGate>(); } -TEST(UnitaryCalculatorAVX512Test, ApplyGates) { +TEST_F(UnitaryCalculatorAVX512Test, ApplyGates) { TestApplyGates>(false); } -TEST(UnitaryCalculatorAVX512Test, ApplyControlledGates) { +TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGates) { TestApplyControlledGates>(false); } 
-TEST(UnitaryCalculatorAVX512Test, SmallCircuits) { +TEST_F(UnitaryCalculatorAVX512Test, SmallCircuits) { TestSmallCircuits>(); } diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc index 61260d678..460c8f1ed 100644 --- a/tests/unitaryspace_avx512_test.cc +++ b/tests/unitaryspace_avx512_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "unitaryspace_testfixture.h" +#include "cpuinfo.h" #include "gtest/gtest.h" @@ -20,19 +21,27 @@ #include "../lib/unitaryspace_avx512.h" namespace qsim { - namespace unitary { namespace { -TEST(UnitarySpaceAVX512Test, SetZero) { +class UnitarySpaceAVX512Test : public testing::Test { + protected: + void SetUp() override { + if (!HaveAVX512()) { + GTEST_SKIP() << "Skipping all AVX512 tests."; + } + } +}; + +TEST_F(UnitarySpaceAVX512Test, SetZero) { TestSetZeros>(); } -TEST(UnitarySpaceAVX512Test, SetIdentity) { +TEST_F(UnitarySpaceAVX512Test, SetIdentity) { TestSetIdentity>(); } -TEST(UnitarySpaceAVX512Test, GetEntry) { +TEST_F(UnitarySpaceAVX512Test, GetEntry) { TestSetEntry>(); } From b11741500dd5c1ca768c1c10e1d1562f75edd5b1 Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Mon, 26 Apr 2021 16:26:29 +0200 Subject: [PATCH 09/10] Remove runtime detection of AVX512 in tests. --- tests/BUILD | 11 +------ tests/cpuinfo.h | 42 ------------------------- tests/simulator_avx512_test.cc | 14 +++------ tests/statespace_avx512_test.cc | 14 +++------ tests/unitary_calculator_avx512_test.cc | 30 +++++++----------- tests/unitaryspace_avx512_test.cc | 20 +++++------- 6 files changed, 30 insertions(+), 101 deletions(-) delete mode 100644 tests/cpuinfo.h diff --git a/tests/BUILD b/tests/BUILD index 531a4f255..f60b952e2 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -1,6 +1,6 @@ # Options for testing different simulator types. 
avx_copts = ['-mavx2', '-mfma'] -avx512_copts = ['-mavx512f'] +avx512_copts = ['-march=native'] sse_copts = ['-msse4'] windows_copts = [ @@ -260,7 +260,6 @@ cc_test( name = "simulator_avx512_test", srcs = ["simulator_avx512_test.cc"], deps = [ - ":cpuinfo", ":simulator_testfixture", "@com_google_googletest//:gtest_main", "//lib:parfor", @@ -344,7 +343,6 @@ cc_test( name = "statespace_avx512_test", srcs = ["statespace_avx512_test.cc"], deps = [ - ":cpuinfo", ":statespace_testfixture", "@com_google_googletest//:gtest_main", "//lib:parfor", @@ -428,7 +426,6 @@ cc_test( "//conditions:default": avx512_copts, }), deps = [ - ":cpuinfo", ":unitaryspace_testfixture", "@com_google_googletest//:gtest_main", "//lib:formux", @@ -506,7 +503,6 @@ cc_test( "//conditions:default": avx512_copts, }), deps = [ - ":cpuinfo", ":unitary_calculator_testfixture", "@com_google_googletest//:gtest_main", "//lib:formux", @@ -560,8 +556,3 @@ cc_test( "//lib:vectorspace", ], ) - -cc_library( - name = "cpuinfo", - hdrs = ["cpuinfo.h"], -) diff --git a/tests/cpuinfo.h b/tests/cpuinfo.h deleted file mode 100644 index 3f1f68689..000000000 --- a/tests/cpuinfo.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef CPUINFO_H_ -#define CPUINFO_H_ - -#ifdef _WIN32 -#include -#else -#include -#endif - -namespace qsim { - -inline bool HaveAVX512() { - unsigned info[4]; - -#ifdef _WIN32 - __cpuidex(info, 7, 0); -#else - if (__get_cpuid_count(7, 0, info, info + 1, info + 2, info + 3) == 0) { - info[1] = 0; - } -#endif - - return (info[1] & (unsigned{1} << 16)) != 0; -} - -} // namespace qsim - -#endif // CPUINFO_H_ diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc index e5c57f854..448666447 100644 --- a/tests/simulator_avx512_test.cc +++ b/tests/simulator_avx512_test.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "simulator_testfixture.h" -#include "cpuinfo.h" #include "gtest/gtest.h" +#ifdef __AVX512F__ + #ifdef _OPENMP #include "../lib/parfor.h" #endif @@ -26,14 +27,7 @@ namespace qsim { template -class SimulatorAVX512Test : public testing::Test { - protected: - void SetUp() override { - if (!HaveAVX512()) { - GTEST_SKIP() << "Skipping all AVX512 tests."; - } - } -}; +class SimulatorAVX512Test : public testing::Test {}; using ::testing::Types; #ifdef _OPENMP @@ -86,6 +80,8 @@ TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) { } // namespace qsim +#endif // __AVX512F__ + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc index 3b8cd34ed..53c751168 100644 --- a/tests/statespace_avx512_test.cc +++ b/tests/statespace_avx512_test.cc @@ -13,10 +13,11 @@ // limitations under the License. 
#include "statespace_testfixture.h" -#include "cpuinfo.h" #include "gtest/gtest.h" +#ifdef __AVX512F__ + #ifdef _OPENMP #include "../lib/parfor.h" #endif @@ -27,14 +28,7 @@ namespace qsim { template -class StateSpaceAVX512Test : public testing::Test { - protected: - void SetUp() override { - if (!HaveAVX512()) { - GTEST_SKIP() << "Skipping all AVX512 tests."; - } - } -}; +class StateSpaceAVX512Test : public testing::Test {}; using ::testing::Types; #ifdef _OPENMP @@ -107,6 +101,8 @@ TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) { } // namespace qsim +#endif // __AVX512F__ + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc index 4c9314171..10de2db07 100644 --- a/tests/unitary_calculator_avx512_test.cc +++ b/tests/unitary_calculator_avx512_test.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "unitary_calculator_testfixture.h" -#include "cpuinfo.h" #include "gtest/gtest.h" +#ifdef __AVX512F__ + #include "../lib/formux.h" #include "../lib/unitary_calculator_avx512.h" @@ -24,44 +25,35 @@ namespace qsim { namespace unitary { namespace { -class UnitaryCalculatorAVX512Test : public testing::Test { - protected: - void SetUp() override { - if (!HaveAVX512()) { - GTEST_SKIP() << "Skipping all AVX512 tests."; - } - } -}; - -TEST_F(UnitaryCalculatorAVX512Test, ApplyGate1) { +TEST(UnitaryCalculatorAVX512Test, ApplyGate1) { TestApplyGate1>(); } -TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate1) { +TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate1) { TestApplyControlledGate1>(); } -TEST_F(UnitaryCalculatorAVX512Test, ApplyGate2) { +TEST(UnitaryCalculatorAVX512Test, ApplyGate2) { TestApplyGate2>(); } -TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGate2) { +TEST(UnitaryCalculatorAVX512Test, ApplyControlledGate2) { TestApplyControlledGate2>(); } -TEST_F(UnitaryCalculatorAVX512Test, ApplyFusedGate) { 
+TEST(UnitaryCalculatorAVX512Test, ApplyFusedGate) { TestApplyFusedGate>(); } -TEST_F(UnitaryCalculatorAVX512Test, ApplyGates) { +TEST(UnitaryCalculatorAVX512Test, ApplyGates) { TestApplyGates>(false); } -TEST_F(UnitaryCalculatorAVX512Test, ApplyControlledGates) { +TEST(UnitaryCalculatorAVX512Test, ApplyControlledGates) { TestApplyControlledGates>(false); } -TEST_F(UnitaryCalculatorAVX512Test, SmallCircuits) { +TEST(UnitaryCalculatorAVX512Test, SmallCircuits) { TestSmallCircuits>(); } @@ -69,6 +61,8 @@ TEST_F(UnitaryCalculatorAVX512Test, SmallCircuits) { } // namespace unitary } // namespace qsim +#endif // __AVX512F__ + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc index 460c8f1ed..5f1001236 100644 --- a/tests/unitaryspace_avx512_test.cc +++ b/tests/unitaryspace_avx512_test.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "unitaryspace_testfixture.h" -#include "cpuinfo.h" #include "gtest/gtest.h" +#ifdef __AVX512F__ + #include "../lib/formux.h" #include "../lib/unitaryspace_avx512.h" @@ -24,24 +25,15 @@ namespace qsim { namespace unitary { namespace { -class UnitarySpaceAVX512Test : public testing::Test { - protected: - void SetUp() override { - if (!HaveAVX512()) { - GTEST_SKIP() << "Skipping all AVX512 tests."; - } - } -}; - -TEST_F(UnitarySpaceAVX512Test, SetZero) { +TEST(UnitarySpaceAVX512Test, SetZero) { TestSetZeros>(); } -TEST_F(UnitarySpaceAVX512Test, SetIdentity) { +TEST(UnitarySpaceAVX512Test, SetIdentity) { TestSetIdentity>(); } -TEST_F(UnitarySpaceAVX512Test, GetEntry) { +TEST(UnitarySpaceAVX512Test, GetEntry) { TestSetEntry>(); } @@ -49,6 +41,8 @@ TEST_F(UnitarySpaceAVX512Test, GetEntry) { } // namespace unitary } // namespace qsim +#endif // __AVX512F__ + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From 9ef5fb12fd4596967014d6a7897c9c7f5791ac6c Mon 
Sep 17 00:00:00 2001 From: Sergei Isakov Date: Mon, 26 Apr 2021 20:51:23 +0200 Subject: [PATCH 10/10] Don't run AVX512 tests on Windows. --- tests/simulator_avx512_test.cc | 4 ++-- tests/statespace_avx512_test.cc | 4 ++-- tests/unitary_calculator_avx512_test.cc | 4 ++-- tests/unitaryspace_avx512_test.cc | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/simulator_avx512_test.cc b/tests/simulator_avx512_test.cc index 448666447..287a61524 100644 --- a/tests/simulator_avx512_test.cc +++ b/tests/simulator_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#ifdef __AVX512F__ +#if defined(__AVX512F__) && !defined(_WIN32) #ifdef _OPENMP #include "../lib/parfor.h" @@ -80,7 +80,7 @@ TYPED_TEST(SimulatorAVX512Test, ExpectationValue2) { } // namespace qsim -#endif // __AVX512F__ +#endif // __AVX512F__ && !_WIN32 int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tests/statespace_avx512_test.cc b/tests/statespace_avx512_test.cc index 53c751168..d9913aafb 100644 --- a/tests/statespace_avx512_test.cc +++ b/tests/statespace_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#ifdef __AVX512F__ +#if defined(__AVX512F__) && !defined(_WIN32) #ifdef _OPENMP #include "../lib/parfor.h" @@ -101,7 +101,7 @@ TYPED_TEST(StateSpaceAVX512Test, ThreadThrashing) { } // namespace qsim -#endif // __AVX512F__ +#endif // __AVX512F__ && !_WIN32 int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tests/unitary_calculator_avx512_test.cc b/tests/unitary_calculator_avx512_test.cc index 10de2db07..569826880 100644 --- a/tests/unitary_calculator_avx512_test.cc +++ b/tests/unitary_calculator_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#ifdef __AVX512F__ +#if defined(__AVX512F__) && !defined(_WIN32) #include "../lib/formux.h" #include "../lib/unitary_calculator_avx512.h" @@ -61,7 +61,7 @@ TEST(UnitaryCalculatorAVX512Test, SmallCircuits) { } // namespace unitary } // namespace qsim 
-#endif // __AVX512F__ +#endif // __AVX512F__ && !_WIN32 int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tests/unitaryspace_avx512_test.cc b/tests/unitaryspace_avx512_test.cc index 5f1001236..6301ec4db 100644 --- a/tests/unitaryspace_avx512_test.cc +++ b/tests/unitaryspace_avx512_test.cc @@ -16,7 +16,7 @@ #include "gtest/gtest.h" -#ifdef __AVX512F__ +#if defined(__AVX512F__) && !defined(_WIN32) #include "../lib/formux.h" #include "../lib/unitaryspace_avx512.h" @@ -41,7 +41,7 @@ TEST(UnitarySpaceAVX512Test, GetEntry) { } // namespace unitary } // namespace qsim -#endif // __AVX512F__ +#endif // __AVX512F__ && !_WIN32 int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv);