From 65ab60b18597321001bbefef66e04a4fbb4a44b2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 13 May 2026 18:06:15 +0200 Subject: [PATCH 01/12] Add CMakePresets.json --- .github/workflows/linux.yml | 162 ++++++++------------ CMakePresets.json | 214 +++++++++++++++++++++++++++ include/xsimd/arch/xsimd_avx512f.hpp | 123 +++++++++++++-- 3 files changed, 391 insertions(+), 108 deletions(-) create mode 100644 CMakePresets.json diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 03a914bda..0bbcccd82 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -10,46 +10,50 @@ defaults: jobs: build: runs-on: ubuntu-latest - name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.flags }}' + name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.preset }} - ${{ matrix.sys.flags }}' strategy: matrix: sys: - - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' } - - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } - - { compiler: 'gcc', version: '14', flags: 'avx' } - - { compiler: 'gcc', version: '14', flags: 'avx2' } - - { compiler: 'gcc', version: '13', flags: 'avx512' } - - { compiler: 'gcc', version: '10', flags: 'avx512' } - - { compiler: 'gcc', version: '12', flags: 'i386' } - - { compiler: 'gcc', version: '13', flags: 'avx512pf' } - - { compiler: 'gcc', version: '13', flags: 'avx512vbmi' } - - { compiler: 'gcc', version: '14', flags: 'avx512vbmi2' } - - { compiler: 'gcc', version: '13', flags: 'avx512vnni' } - - { compiler: 'clang', version: '16', flags: 'force_no_instr_set' } - - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex' } - - { compiler: 'clang', version: '17', flags: 'avx' } - - { compiler: 'clang', version: '17', flags: 'sse3' } - - { compiler: 'clang', version: '18', flags: 'avx512' } - - { compiler: 'clang', version: '18', flags: 'avx_128' } - - { compiler: 'clang', version: '18', flags: 'avx2_128' } - - { compiler: 'clang', 
version: '18', flags: 'avx512vl_128' } - - { compiler: 'clang', version: '18', flags: 'avx512vl_256' } + - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set', preset: 'native' } + - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex', preset: 'native' } + - { compiler: 'gcc', version: '14', flags: '', preset: 'avx' } + - { compiler: 'gcc', version: '14', flags: '', preset: 'avx2' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512f' } + - { compiler: 'gcc', version: '10', flags: '', preset: 'avx512f' } + - { compiler: 'gcc', version: '12', flags: 'i386', preset: 'native' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512pf' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512vbmi' } + - { compiler: 'gcc', version: '14', flags: '', preset: 'avx512vbmi2' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512vnni_avx512bw' } + - { compiler: 'clang', version: '16', flags: 'force_no_instr_set', preset: 'native' } + - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex', preset: 'native' } + - { compiler: 'clang', version: '17', flags: '', preset: 'avx' } + - { compiler: 'clang', version: '17', flags: '', preset: 'sse3' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx512f' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx_128' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx2_128' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx512vl_128' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx512vl_256' } steps: - - name: Setup compiler + - name: Setup GCC compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | GCC_VERSION=${{ matrix.sys.version }} sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION - sudo dpkg --add-architecture i386 - sudo add-apt-repository ppa:ubuntu-toolchain-r/test - sudo apt-get update - sudo apt-get --no-install-suggests 
--no-install-recommends install gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386 - CC=gcc-$GCC_VERSION - echo "CC=$CC" >> $GITHUB_ENV - CXX=g++-$GCC_VERSION - echo "CXX=$CXX" >> $GITHUB_ENV - - name: Setup compiler + # Setup i386 as needed + if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then + sudo dpkg --add-architecture i386 + sudo add-apt-repository ppa:ubuntu-toolchain-r/test + sudo apt-get update + sudo apt-get --no-install-suggests --no-install-recommends install \ + gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386 + fi + # Export compiler as environment var + echo "CC=gcc-$GCC_VERSION" >> $GITHUB_ENV + echo "CXX=g++-$GCC_VERSION" >> $GITHUB_ENV + + - name: Setup Clang compiler if: ${{ matrix.sys.compiler == 'clang' }} run: | LLVM_VERSION=${{ matrix.sys.version }} @@ -57,91 +61,51 @@ jobs: sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1 sudo ln -s /usr/include/asm-generic /usr/include/asm - CC=clang-$LLVM_VERSION - echo "CC=$CC" >> $GITHUB_ENV - CXX=clang++-$LLVM_VERSION - echo "CXX=$CXX" >> $GITHUB_ENV + # Export compiler as environment var + echo "CC=clang-$LLVM_VERSION" >> $GITHUB_ENV + echo "CXX=clang++-$LLVM_VERSION" >> $GITHUB_ENV + - name: Checkout xsimd uses: actions/checkout@v6 + - name: Install mamba - uses: mamba-org/setup-micromamba@v2 + uses: mamba-org/setup-micromamba@v3 with: environment-file: environment.yml + - name: Setup SDE - if: startswith(matrix.sys.flags, 'avx512') + if: startswith(matrix.sys.preset, 'avx512') run: sh install_sde.sh + - name: Configure build - env: - CC: ${{ env.CC }} - CXX: ${{ env.CXX }} run: | if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON" fi - if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS 
-DTARGET_ARCH=sandybridge" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx_128" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx2' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx2_128' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx2_128" - fi - if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_256" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=cannonlake" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi2' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=icelake-server" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vnni' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm" - fi if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then - CXX_FLAGS="$CXX_FLAGS -m32" + export CXXFLAGS="$CXXFLAGS -m32" fi - if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then - : - else + if [[ '${{ matrix.sys.flags }}' != 'force_no_instr_set' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON" fi - # Cheap way of spotting 
uninitialized read - CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern" + # Cheap way of spotting uninitialized read; presets pick this up via $env{CXXFLAGS}. + export CXXFLAGS="$CXXFLAGS -ftrivial-auto-var-init=pattern" cmake -B _build \ - -DBUILD_TESTS=ON \ - -DBUILD_BENCHMARK=ON \ - -DBUILD_EXAMPLES=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_COMPILER=$CXX \ - $CMAKE_EXTRA_ARGS \ - -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \ + --preset ${{ matrix.sys.preset }} \ + -D BUILD_TESTS=ON \ + -D BUILD_BENCHMARK=ON \ + -D BUILD_EXAMPLES=ON \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_C_COMPILER="${CC}" \ + -D CMAKE_CXX_COMPILER="${CXX}" \ + -D TARGET_ARCH="x86-64" \ + "${CMAKE_EXTRA_ARGS}" \ -G Ninja + - name: Build - run: cmake --build _build + run: cmake --build _build --parallel - name: Test run: | # Set CPU feature test expectations, 0 is explicit absence of the feature @@ -149,15 +113,15 @@ jobs: export XSIMD_TEST_CPU_ASSUME_RVV="0" export XSIMD_TEST_CPU_ASSUME_VSX="0" export XSIMD_TEST_CPU_ASSUME_VXE="0" - cd _build/test - if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then + + if echo '${{ matrix.sys.preset }}' | grep -q 'avx512' ; then # Running with emulation, must have AVX512, lower tier are checked by implications in tests export XSIMD_TEST_CPU_ASSUME_AVX512F="1" - ../../sde-external-9.48.0-2024-11-25-lin/sde64 -tgl -- ./test_xsimd + ./sde-external-9.48.0-2024-11-25-lin/sde64 -skx -- ./_build/test/test_xsimd else export XSIMD_TEST_CPU_ASSUME_SSE4_2=$(grep -q 'sse4_2' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX=$(grep -q 'avx' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX512F=$(grep -q 'avx512f' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_MANUFACTURER="intel,amd" - ./test_xsimd + ./_build/test/test_xsimd fi diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 000000000..ed5314dda --- /dev/null +++ b/CMakePresets.json @@ -0,0 
+1,214 @@ +{ + "version": 5, + "cmakeMinimumRequired": { + "major": 3, + "minor": 23, + "patch": 0 + }, + "configurePresets": [ + { + "name": "native", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=native" + } + }, + { + "name": "sse2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse2 -mno-sse3" + } + }, + { + "name": "sse3", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse3 -mno-ssse3" + } + }, + { + "name": "ssse3", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -mssse3 -mno-sse4.1" + } + }, + { + "name": "sse4.1", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse4.1 -mno-sse4.2" + } + }, + { + "name": "sse4.2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse4.2 -mno-avx" + } + }, + { + "name": "avx", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mno-avx2" + } + }, + { + "name": "avx_128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mno-avx2 -DXSIMD_DEFAULT_ARCH=avx_128" + } + }, + { + "name": "avx2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx2 -mno-avx512f" + } + }, + { + "name": "avx2_128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx2 -mno-avx512f -DXSIMD_DEFAULT_ARCH=avx2_128" + } + }, + { + "name": "avx512f", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mno-avx512cd -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512cd", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mno-avx512dq -mno-avx512bw -mno-avx512er 
-mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512dq", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512bw", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512er", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mno-avx512bw -mavx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512pf", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mno-avx512bw -mavx512er -mavx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512ifma", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512vbmi", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mavx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512vbmi2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": 
"avx512vnni_avx512bw", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mavx512vnni" + } + }, + { + "name": "avx512vnni_avx512vbmi2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx512vnni" + } + }, + { + "name": "avx512vl_128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512vl -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni -DXSIMD_DEFAULT_ARCH=avx512vl_128" + } + }, + { + "name": "avx512vl_256", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512vl -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni -DXSIMD_DEFAULT_ARCH=avx512vl_256" + } + }, + { + "name": "neon", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv7-a -mfpu=neon -mfloat-abi=softfp" + } + }, + { + "name": "neon64", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8-a" + } + }, + { + "name": "sve128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8.2-a+sve -msve-vector-bits=128" + } + }, + { + "name": "sve256", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8.2-a+sve -msve-vector-bits=256" + } + }, + { + "name": "sve512", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8.2-a+sve -msve-vector-bits=512" + } + }, + { + "name": "rvv128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=rv64gcv_zvl128b_zba_zbb_zbs -mrvv-vector-bits=zvl" + } + }, + { 
+ "name": "rvv256", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=rv64gcv_zvl256b_zba_zbb_zbs -mrvv-vector-bits=zvl" + } + }, + { + "name": "rvv512", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=rv64gcv_zvl512b_zba_zbb_zbs -mrvv-vector-bits=zvl" + } + }, + { + "name": "vsx2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -mcpu=power8 -maltivec -mvsx" + } + }, + { + "name": "vsx3", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -mcpu=power9 -maltivec -mvsx" + } + }, + { + "name": "vsx4", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -mcpu=power10 -maltivec -mvsx" + } + } + ] +} diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a7316722..63e023ed5 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -298,11 +298,25 @@ namespace xsimd } // namespace detail - template = 4)>> - XSIMD_INLINE batch load_masked(T const* mem, - batch_bool_constant mask, - convert, Mode, requires_arch) noexcept + // The AVX512F masked-load logic lives in this plain `*_avx512f` helper + // (no `requires_arch` tag) and is exposed through the concrete + // element-type overloads below. + // + // Why not a single generic `load_masked(T const*, ..., requires_arch)`? + // It is ambiguous against the concrete-type / generic-arch overloads in + // xsimd_common_memory.hpp (e.g. `load_masked(int32_t const*, ..., + // requires_arch)`): the avx512f overload is more specialized on the + // architecture while the common one is more specialized on the pointer + // type, so partial ordering cannot pick a winner. When AVX512DQ/BW is + // available a fully concrete `requires_arch` overload is the + // unique best match and hides this, but a pure-AVX512F target (the + // `avx512f` preset) has no such tie-breaker and the call fails to + // compile. 
Concrete element-type `requires_arch` overloads make + // the avx512f candidate the unique best match for every integer type. + template + XSIMD_INLINE batch load_masked_avx512f(T const* mem, + batch_bool_constant mask, + Mode) noexcept { constexpr auto half = batch::size / 2; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding @@ -324,12 +338,59 @@ namespace xsimd } } + template + XSIMD_INLINE batch load_masked(int32_t const* mem, + batch_bool_constant mask, + convert, Mode, requires_arch) noexcept + { + return load_masked_avx512f(mem, mask, Mode {}); + } + + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, + batch_bool_constant mask, + convert, Mode, requires_arch) noexcept + { + return load_masked_avx512f(mem, mask, Mode {}); + } + + template + XSIMD_INLINE batch load_masked(int64_t const* mem, + batch_bool_constant mask, + convert, Mode, requires_arch) noexcept + { + return load_masked_avx512f(mem, mask, Mode {}); + } + + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, + batch_bool_constant mask, + convert, Mode, requires_arch) noexcept + { + return load_masked_avx512f(mem, mask, Mode {}); + } + + // float/double (and any other >=4-byte type) have no concrete-type + // generic-arch competitor in xsimd_common_memory.hpp, so a single + // generic avx512f overload stays unambiguous for them. template = 4)>> - XSIMD_INLINE void store_masked(T* mem, - batch const& src, - batch_bool_constant mask, - Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(T const* mem, + batch_bool_constant mask, + convert, Mode, requires_arch) noexcept + { + return load_masked_avx512f(mem, mask, Mode {}); + } + + // Same ambiguity as load_masked above (see comment there): factor the + // AVX512F store logic into a plain helper and expose it via concrete + // element-type `requires_arch` overloads so a pure-AVX512F + // target has a unique best match. 
+ template + XSIMD_INLINE void store_masked_avx512f(T* mem, + batch const& src, + batch_bool_constant mask, + Mode) noexcept { constexpr auto half = batch::size / 2; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding @@ -351,6 +412,50 @@ namespace xsimd } } + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, + batch_bool_constant mask, + Mode, requires_arch) noexcept + { + store_masked_avx512f(mem, src, mask, Mode {}); + } + + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, + batch_bool_constant mask, + Mode, requires_arch) noexcept + { + store_masked_avx512f(mem, src, mask, Mode {}); + } + + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, + batch_bool_constant mask, + Mode, requires_arch) noexcept + { + store_masked_avx512f(mem, src, mask, Mode {}); + } + + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, + batch_bool_constant mask, + Mode, requires_arch) noexcept + { + store_masked_avx512f(mem, src, mask, Mode {}); + } + + // float/double (and any other >=4-byte type) have no concrete-type + // generic-arch competitor, so a single generic overload is unambiguous. 
+ template = 4)>> + XSIMD_INLINE void store_masked(T* mem, + batch const& src, + batch_bool_constant mask, + Mode, requires_arch) noexcept + { + store_masked_avx512f(mem, src, mask, Mode {}); + } + // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept From ef1f17846ca659f5d98beb6c87d1a61f040034e5 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 10:53:49 +0200 Subject: [PATCH 02/12] Safer trivial-auto-var-init flag --- .github/workflows/linux.yml | 4 +--- CMakeLists.txt | 15 +++++++++++++++ cmake/Hardening.cmake | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 cmake/Hardening.cmake diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0bbcccd82..e1481b105 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -89,9 +89,6 @@ jobs: CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON" fi - # Cheap way of spotting uninitialized read; presets pick this up via $env{CXXFLAGS}. 
- export CXXFLAGS="$CXXFLAGS -ftrivial-auto-var-init=pattern" - cmake -B _build \ --preset ${{ matrix.sys.preset }} \ -D BUILD_TESTS=ON \ @@ -101,6 +98,7 @@ jobs: -D CMAKE_C_COMPILER="${CC}" \ -D CMAKE_CXX_COMPILER="${CXX}" \ -D TARGET_ARCH="x86-64" \ + -D XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT=ON \ "${CMAKE_EXTRA_ARGS}" \ -G Ninja diff --git a/CMakeLists.txt b/CMakeLists.txt index 66c01f281..92f722aa8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,21 @@ if(ENABLE_XTL_COMPLEX) ) endif() +# Dev options +# =========== + +include (cmake/Hardening.cmake) + +option( + XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT + "Enable -ftrivial-auto-var-init hardening flag if supported" + OFF +) + +if(XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT) + xsimd_harden_trivial_auto_var_init(xsimd INTERFACE) +endif() + if(BUILD_TESTS) enable_testing() add_subdirectory(test) diff --git a/cmake/Hardening.cmake b/cmake/Hardening.cmake new file mode 100644 index 000000000..cc49bb378 --- /dev/null +++ b/cmake/Hardening.cmake @@ -0,0 +1,37 @@ +include(CheckCXXCompilerFlag) + + +function(xsimd_harden_trivial_auto_var_init target scope) + # Names of option parameters (without arguments) + set(options) + # Names of named parameters with a single argument + set(one_value_args PATTERN) + # Names of named parameters with a multiple arguments + set(multi_values_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_values_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message( + AUTHOR_WARNING + "Unrecoginzed options passed to ${CMAKE_CURRENT_FUNCTION}: " + "${ARG_UNPARSED_ARGUMENTS}" + ) + endif() + + if(NOT scope STREQUAL "PUBLIC" AND NOT scope STREQUAL "PRIVATE" AND NOT scope STREQUAL "INTERFACE") + message(FATAL_ERROR "scope must be PUBLIC, PRIVATE, or INTERFACE, got: ${scope}") + endif() + + if(NOT XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT) + return() + endif() + + if(NOT ARG_PATTERN) + set(ARG_PATTERN "pattern") + endif() + + set(flag "-ftrivial-auto-var-init=${ARG_PATTERN}") + 
check_cxx_compiler_flag("${flag}" XSIMD_HAS_FTRIVIAL_AUTO_VAR_INIT_${ARG_PATTERN}) + if(XSIMD_HAS_FTRIVIAL_AUTO_VAR_INIT_${ARG_PATTERN}) + target_compile_options(${target} ${scope} "${flag}") + endif() +endfunction() From 4fcb0fd8d33cfd9ad98d2b8f490ff32d1a2e41a2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 11:11:20 +0200 Subject: [PATCH 03/12] Fix ambiguity --- include/xsimd/arch/xsimd_avx512f.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 63e023ed5..6a3c32341 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -370,11 +370,13 @@ namespace xsimd return load_masked_avx512f(mem, mask, Mode {}); } - // float/double (and any other >=4-byte type) have no concrete-type - // generic-arch competitor in xsimd_common_memory.hpp, so a single - // generic avx512f overload stays unambiguous for them. + // Non-integer element types only (float, double, ...): integer types + // are handled by the concrete overloads above. gcc 10's partial + // ordering cannot break the tie between a concrete-type avx512f + // overload and a generic-T avx512f overload, so this catch-all must + // exclude the integer types we already specialized. template = 4)>> + typename = std::enable_if_t<(sizeof(T) >= 4) && !std::is_integral::value>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept @@ -444,10 +446,10 @@ namespace xsimd store_masked_avx512f(mem, src, mask, Mode {}); } - // float/double (and any other >=4-byte type) have no concrete-type - // generic-arch competitor, so a single generic overload is unambiguous. + // Non-integer element types only: see load_masked above for the gcc 10 + // partial ordering rationale. 
template = 4)>> + typename = std::enable_if_t<(sizeof(T) >= 4) && !std::is_integral::value>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, From c4c12c5c0fa86a465f42f65c4957937dffe74669 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 11:37:10 +0200 Subject: [PATCH 04/12] Bump SDE target --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index e1481b105..afae73268 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -115,7 +115,7 @@ jobs: if echo '${{ matrix.sys.preset }}' | grep -q 'avx512' ; then # Running with emulation, must have AVX512, lower tier are checked by implications in tests export XSIMD_TEST_CPU_ASSUME_AVX512F="1" - ./sde-external-9.48.0-2024-11-25-lin/sde64 -skx -- ./_build/test/test_xsimd + ./sde-external-9.48.0-2024-11-25-lin/sde64 -spr -- ./_build/test/test_xsimd else export XSIMD_TEST_CPU_ASSUME_SSE4_2=$(grep -q 'sse4_2' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX=$(grep -q 'avx' /proc/cpuinfo && echo "1" || echo "0") From 7a256e5c15b61e9b22f0c69e87af00688bf1cef8 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 12:49:48 +0200 Subject: [PATCH 05/12] Simplify common_memory load_masked --- .../xsimd/arch/common/xsimd_common_memory.hpp | 132 +++++++++--------- include/xsimd/arch/xsimd_avx512f.hpp | 127 ++--------------- include/xsimd/arch/xsimd_common_fwd.hpp | 16 --- include/xsimd/arch/xsimd_sse2.hpp | 2 +- 4 files changed, 74 insertions(+), 203 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 7a1ed73a3..23b0606ac 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -13,6 +13,7 @@ #define XSIMD_COMMON_MEMORY_HPP #include "../../types/xsimd_batch_constant.hpp" +#include 
"../../utils/xsimd_type_traits.hpp" #include "./xsimd_common_details.hpp" #include @@ -360,88 +361,81 @@ namespace xsimd return load_unaligned(mem, convert {}, A {}); } - template - XSIMD_INLINE batch - load_masked(T_in const* mem, batch_bool_constant, convert, alignment, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - alignas(A::alignment()) std::array buffer {}; - constexpr bool mask[size] = { Values... }; - - for (std::size_t i = 0; i < size; ++i) - buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); - - return batch::load(buffer.data(), aligned_mode {}); - } - - template - XSIMD_INLINE void - store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept + namespace detail { - constexpr std::size_t size = batch::size; - constexpr bool mask[size] = { Values... }; + // Compile-time dispatch tag for the common `load_masked`/ `store_masked` + // implementations: true iff we can use the int->float bitcast path (matching-size + // integer T_in/T_out with a SIMD register available for the matching + // floating-point type), false otherwise (use the scalar buffer fallback). + template + using common_masked_via_fp = std::integral_constant::value + && std::is_integral::value + && !std::is_void>::value + && types::has_simd_register, A>::value>; - for (std::size_t i = 0; i < size; ++i) - if (mask[i]) - { - mem[i] = static_cast(src.get(i)); - } - } + // Scalar-buffer fallback: works for any T_in/T_out. + template + XSIMD_INLINE batch + load_masked_common(T_in const* mem, batch_bool_constant, convert, alignment, std::false_type /* via_fp */) noexcept + { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer {}; + constexpr bool mask[size] = { Values... 
}; - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(f); - } + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(f); - } + return batch::load(buffer.data(), aligned_mode {}); + } - template - XSIMD_INLINE std::enable_if_t::value, batch> - load_masked(int64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(d); - } + // Integer-via-float bitcast: T_in == T_out == integral T with a matching + // `sized_fp_t` for which the arch has a SIMD register. + // Dispatches to the floating `load_masked` (which is arch-specialized) and bitcasts back. 
+ template + XSIMD_INLINE batch + load_masked_common(T const* mem, batch_bool_constant, convert, Mode, std::true_type /* via_fp */) noexcept + { + using fp_t = sized_fp_t; + const auto f = ::xsimd::kernel::load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); + return bitwise_cast(f); + } - template - XSIMD_INLINE std::enable_if_t::value, batch> - load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(d); - } + template + XSIMD_INLINE void + store_masked_common(T_out* mem, batch const& src, batch_bool_constant, alignment, std::false_type /* via_fp */) noexcept + { + constexpr std::size_t size = batch::size; + constexpr bool mask[size] = { Values... }; - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); - } + for (std::size_t i = 0; i < size; ++i) + if (mask[i]) + { + mem[i] = static_cast(src.get(i)); + } + } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); - } + template + XSIMD_INLINE void + store_masked_common(T* mem, batch const& src, batch_bool_constant, Mode, std::true_type /* via_fp */) noexcept + { + using fp_t = sized_fp_t; + ::xsimd::kernel::store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + } + } // namespace detail - template - XSIMD_INLINE std::enable_if_t::value> - store_masked(int64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch + load_masked(T_in const* mem, batch_bool_constant mask, convert cvt, 
alignment mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + return detail::load_masked_common(mem, mask, cvt, mode, detail::common_masked_via_fp {}); } - template - XSIMD_INLINE std::enable_if_t::value> - store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE void + store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + detail::store_masked_common(mem, src, mask, mode, detail::common_masked_via_fp {}); } template diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a3c32341..6a7316722 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -298,25 +298,11 @@ namespace xsimd } // namespace detail - // The AVX512F masked-load logic lives in this plain `*_avx512f` helper - // (no `requires_arch` tag) and is exposed through the concrete - // element-type overloads below. - // - // Why not a single generic `load_masked(T const*, ..., requires_arch)`? - // It is ambiguous against the concrete-type / generic-arch overloads in - // xsimd_common_memory.hpp (e.g. `load_masked(int32_t const*, ..., - // requires_arch)`): the avx512f overload is more specialized on the - // architecture while the common one is more specialized on the pointer - // type, so partial ordering cannot pick a winner. When AVX512DQ/BW is - // available a fully concrete `requires_arch` overload is the - // unique best match and hides this, but a pure-AVX512F target (the - // `avx512f` preset) has no such tie-breaker and the call fails to - // compile. Concrete element-type `requires_arch` overloads make - // the avx512f candidate the unique best match for every integer type. 
- template - XSIMD_INLINE batch load_masked_avx512f(T const* mem, - batch_bool_constant mask, - Mode) noexcept + template = 4)>> + XSIMD_INLINE batch load_masked(T const* mem, + batch_bool_constant mask, + convert, Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding @@ -338,61 +324,12 @@ namespace xsimd } } - template - XSIMD_INLINE batch load_masked(int32_t const* mem, - batch_bool_constant mask, - convert, Mode, requires_arch) noexcept - { - return load_masked_avx512f(mem, mask, Mode {}); - } - - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, - batch_bool_constant mask, - convert, Mode, requires_arch) noexcept - { - return load_masked_avx512f(mem, mask, Mode {}); - } - - template - XSIMD_INLINE batch load_masked(int64_t const* mem, - batch_bool_constant mask, - convert, Mode, requires_arch) noexcept - { - return load_masked_avx512f(mem, mask, Mode {}); - } - - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, - batch_bool_constant mask, - convert, Mode, requires_arch) noexcept - { - return load_masked_avx512f(mem, mask, Mode {}); - } - - // Non-integer element types only (float, double, ...): integer types - // are handled by the concrete overloads above. gcc 10's partial - // ordering cannot break the tie between a concrete-type avx512f - // overload and a generic-T avx512f overload, so this catch-all must - // exclude the integer types we already specialized. template = 4) && !std::is_integral::value>> - XSIMD_INLINE batch load_masked(T const* mem, - batch_bool_constant mask, - convert, Mode, requires_arch) noexcept - { - return load_masked_avx512f(mem, mask, Mode {}); - } - - // Same ambiguity as load_masked above (see comment there): factor the - // AVX512F store logic into a plain helper and expose it via concrete - // element-type `requires_arch` overloads so a pure-AVX512F - // target has a unique best match. 
- template - XSIMD_INLINE void store_masked_avx512f(T* mem, - batch const& src, - batch_bool_constant mask, - Mode) noexcept + typename = std::enable_if_t<(sizeof(T) >= 4)>> + XSIMD_INLINE void store_masked(T* mem, + batch const& src, + batch_bool_constant mask, + Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding @@ -414,50 +351,6 @@ namespace xsimd } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, - batch_bool_constant mask, - Mode, requires_arch) noexcept - { - store_masked_avx512f(mem, src, mask, Mode {}); - } - - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, - batch_bool_constant mask, - Mode, requires_arch) noexcept - { - store_masked_avx512f(mem, src, mask, Mode {}); - } - - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, - batch_bool_constant mask, - Mode, requires_arch) noexcept - { - store_masked_avx512f(mem, src, mask, Mode {}); - } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, - batch_bool_constant mask, - Mode, requires_arch) noexcept - { - store_masked_avx512f(mem, src, mask, Mode {}); - } - - // Non-integer element types only: see load_masked above for the gcc 10 - // partial ordering rationale. 
- template = 4) && !std::is_integral::value>> - XSIMD_INLINE void store_masked(T* mem, - batch const& src, - batch_bool_constant mask, - Mode, requires_arch) noexcept - { - store_masked_avx512f(mem, src, mask, Mode {}); - } - // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index f5a7f4ffe..e9ecf79ad 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -87,22 +87,6 @@ namespace xsimd XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git 
a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c6cfb5f07..0a95aae8b 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -2331,7 +2331,7 @@ namespace xsimd } else { - store_masked(mem, src, mask, requires_arch {}); + store_masked(mem, src, mask, aligned_mode {}, common {}); } } From ca1f5295d03bd9a18028a09b2bf0b1971638ac27 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 13:24:24 +0200 Subject: [PATCH 06/12] Fix avx2 --- include/xsimd/arch/xsimd_avx2.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index e2c223cc7..97bc0c3ae 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -14,6 +14,7 @@ #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" +#include "../utils/xsimd_type_traits.hpp" #include "./utils/shifts.hpp" #include @@ -175,7 +176,7 @@ namespace xsimd } // store_masked - namespace detail + namespace detail_avx2 { template XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept @@ -211,7 +212,9 @@ namespace xsimd } else { - detail::maskstore(mem, mask.as_batch(), src); + detail_avx2::maskstore( + reinterpret_cast*>(mem), + mask.as_batch(), src); } } From 1605e310717e3444e76293856bf808ef843c80be Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 13:35:50 +0200 Subject: [PATCH 07/12] Fix avx2 --- include/xsimd/arch/xsimd_avx512f.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a7316722..879c56a07 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -335,13 +335,13 @@ namespace xsimd XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding { constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const 
auto lo = detail::lower_half(src); + const auto lo = xsimd::batch(detail::lower_half(src)); store_masked(mem, lo, mlo, Mode {}, avx2 {}); } else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding { constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); + const auto hi = xsimd::batch(detail::upper_half(src)); store_masked(mem + half, hi, mhi, Mode {}, avx2 {}); } else From f88e98a217c17630d64ba45849563e3977fdc836 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 13:44:19 +0200 Subject: [PATCH 08/12] fix sse call --- include/xsimd/arch/xsimd_avx.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 1ee0c5b89..93d008fc0 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1040,14 +1040,14 @@ namespace xsimd XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); + const auto lo = xsimd::batch(detail::lower_half(src)); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } // confined to upper 128-bit half → forward to 128 bit else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); + const auto hi = xsimd::batch(detail::upper_half(src)); store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); } else From ec071a8cefb6eba77766c178b8cf5403da7605df Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 13:51:23 +0200 Subject: [PATCH 09/12] Fix reinterpre cast --- include/xsimd/arch/xsimd_avx2.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 97bc0c3ae..8ef4c85f0 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -212,9 
+212,11 @@ namespace xsimd } else { + using int_t = sized_int_t; detail_avx2::maskstore( - reinterpret_cast*>(mem), - mask.as_batch(), src); + reinterpret_cast(mem), + mask.as_batch(), + bitwise_cast(src)); } } From 81879b7f7196661d110fc632164eac6142c34e6e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 14:40:40 +0200 Subject: [PATCH 10/12] Fix more --- include/xsimd/arch/xsimd_avx.hpp | 13 +++++++------ include/xsimd/arch/xsimd_avx2.hpp | 13 +++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 93d008fc0..3cbcdba61 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1035,20 +1035,21 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t half_size = batch::size / 2; + using half_arch = avx_128; // confined to lower 128-bit half → forward to 128 bit XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = xsimd::batch(detail::lower_half(src)); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = xsimd::batch(detail::lower_half(src)); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } // confined to upper 128-bit half → forward to 128 bit else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = xsimd::batch(detail::upper_half(src)); - store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = xsimd::batch(detail::upper_half(src)); + store_masked(mem + half_size, hi, mhi, Mode {}, half_arch {}); } else { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 8ef4c85f0..9b0d05a15 100644 --- 
a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -195,20 +195,21 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using half_arch = avx2_128; // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = xsimd::batch(detail::lower_half(src)); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = xsimd::batch(detail::upper_half(src)); + store_masked(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {}); } else { From e91290dd84fd4f28a6295942920148599c31b8d9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 15:13:06 +0200 Subject: [PATCH 11/12] Fix batch_bool --- include/xsimd/arch/xsimd_avx.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 3cbcdba61..79719dc65 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1016,7 +1016,7 @@ namespace xsimd } // store_masked - namespace detail + namespace detail_avx { template XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept @@ -1053,7 +1053,11 @@ namespace xsimd } else { - detail::maskstore(mem, mask.as_batch(), src); + using fp_t = sized_fp_t; + 
detail_avx::maskstore( + reinterpret_cast(mem), + mask.as_batch_bool(), + bitwise_cast(src)); } } From e368737198dfd793fb13c03e8f46afc4593ccc60 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 25 May 2026 16:57:22 +0200 Subject: [PATCH 12/12] Fix batch_bool --- include/xsimd/arch/xsimd_avx.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 79719dc65..24736821a 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1019,13 +1019,13 @@ namespace xsimd namespace detail_avx { template - XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(float* mem, batch const& mask, batch const& src) noexcept { _mm256_maskstore_ps(mem, mask, src); } template - XSIMD_INLINE void maskstore(double* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(double* mem, batch const& mask, batch const& src) noexcept { _mm256_maskstore_pd(mem, mask, src); } @@ -1054,9 +1054,10 @@ namespace xsimd else { using fp_t = sized_fp_t; + using int_t = sized_int_t; detail_avx::maskstore( reinterpret_cast(mem), - mask.as_batch_bool(), + bitwise_cast(mask.as_batch()), bitwise_cast(src)); } }