diff --git a/.gitignore b/.gitignore
index 103505f..9115599 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,7 +42,7 @@
 # Build directories
 build/
 Build/
-build-*/
+*build*/
 /cmake-build-*/
 
 # CMake generated files
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9739d57..23b1bc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 
 option(ARNOLDI_BUILD_EXAMPLES "Build example programs" OFF)
 option(ARNOLDI_REGRESSION "Build regression benchmark against original ARPACK (requires arpackng)" OFF)
+option(ARNOLDI_USE_CUDA "Build with CUDA + cuBLAS backend (Sym / Herm only)" OFF)
 
 find_package(BLAS REQUIRED)
 find_package(LAPACK REQUIRED)
@@ -25,6 +26,14 @@ else()
     target_link_libraries(arnoldi INTERFACE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()
 
+if(ARNOLDI_USE_CUDA)
+    # cuBLAS / cudart are C-API libraries; no nvcc required for the wrappers.
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(arnoldi INTERFACE CUDA::cudart CUDA::cublas)
+    target_compile_definitions(arnoldi INTERFACE ARNOLDI_USE_CUDA)
+    message(STATUS "Arnoldi: CUDA backend enabled (cuBLAS=${CUDAToolkit_VERSION})")
+endif()
+
 include(GNUInstallDirs)
 install(TARGETS arnoldi EXPORT arnoldiTargets)
 install(DIRECTORY include/arnoldi DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/README.md b/README.md
index 6bf10c1..ab088ad 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@ Header-only API for a C++ port of ARPACK’s symmetric and nonsymmetric eigenval
 
 - C++17 compiler
 - BLAS and LAPACK (Fortran calling convention, `_` suffix)
+- *(optional, GPU backend)* CUDA Toolkit with cuBLAS — see [GPU (CUDA) backend](#gpu-cuda-backend)
+- *(optional, tests)* none to install: Catch2 v3 is fetched automatically via CMake `FetchContent` (needs network on first configure)
 
 ## Build (library interface + examples)
 
@@ -19,6 +21,20 @@ cmake --build build
 ./build/examples/arpack_cpp_examples dnsimp   # Fortran EXAMPLES catalog (see examples/README.md)
 ```
 
+Options:
+
+| Option | Default | Effect |
+| --- | --- | --- |
+| `ARNOLDI_BUILD_EXAMPLES` | OFF | build the `examples/` programs |
+| `ARNOLDI_BUILD_TESTS` | OFF | build the Catch2 test suite (`ctest`) |
+| `ARNOLDI_USE_CUDA` | OFF | enable the CUDA + cuBLAS backend (Sym / Herm) |
+
+```bash
+# tests
+cmake -S . -B build -DARNOLDI_BUILD_TESTS=ON
+cmake --build build && ctest --test-dir build
+```
+
 ## Use in your project
 
 ```cmake
@@ -31,11 +47,69 @@ target_link_libraries(your_target PRIVATE arpack::callback)
 // arnoldi::detail::naupd, neupd, saupd, seupd, ...
 ```
 
+## GPU (CUDA) backend
+
+`Arnoldi<K, Scalar, Comm, Backend>` is templated on a backend. The default
+`CpuBackend` runs every length-`n` BLAS on the host (unchanged behaviour).
+`CudaBackend` keeps the Lanczos workspace (`resid`, `v`, `workd`) in device
+memory and routes those operations through cuBLAS, so the **matvec runs
+entirely on the GPU**. Supported for `Kind::Sym` and `Kind::Herm` (Nonsym on
+device is rejected at compile time — no cuSOLVER analogue for its real-Schur
+path). The small implicitly-restarted shift machinery (`ncv²`-sized `h`,
+`q`, `workl`) stays host-resident by design.
+
+Enable it:
+
+```bash
+cmake -S . -B build -DARNOLDI_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=<your_sm>          # e.g. 61 for Quadro P1000
+cmake --build build
+```
+
+`cuda.hpp` is plain C++ calling the cuBLAS C API — only the `.cu`
+tests/examples need `nvcc`. If your system compiler is newer than what your
+`nvcc` supports (e.g. CUDA 12.1 + GCC ≥ 13), point `nvcc` at an older host
+compiler:
+
+```bash
+cmake -S . -B build -DARNOLDI_USE_CUDA=ON \
+      -DCMAKE_CUDA_HOST_COMPILER=/path/to/g++-12 \
+      -DCMAKE_CUDA_ARCHITECTURES=61
+```
+
+Usage — the matvec callback receives **device** pointers and should launch on
+the backend's stream:
+
+```cpp
+#include <arnoldi/arnoldi.hpp>
+#include <arnoldi/cuda.hpp>
+
+arnoldi::Arnoldi<arnoldi::Kind::Sym, double,
+                 arnoldi::SerialComm, arnoldi::CudaBackend>
+    s("I", n, "LM", nev, ncv);
+s.tol(1e-10).maxiter(1000);
+
+cudaStream_t stream = s.backend().stream();
+s.solve([&](const double* d_x, double* d_y) {       // device pointers
+    my_matvec_kernel<<<blocks, threads, 0, stream>>>(n, d_x, d_y);
+});
+
+auto r  = s.eigenpairs();          // values + vectors copied to host
+auto dr = s.eigenpairs_device();   // eigenvectors left on the GPU
+```
+
+Also available under a device backend: `s.initial_resid_device(devptr)` to
+seed from a device-resident vector, and `s.backend().handle()` for the cuBLAS
+handle. See `examples/arnoldi_sym_cuda.cu` (and `arnoldi_sym_cpu.cpp` for the
+matching CPU baseline / benchmark).
+
 ## Layout
 
 - `include/arnoldi/arnoldi.hpp` — include this
+- `include/arnoldi/cuda.hpp` — opt-in CUDA backend (`-DARNOLDI_USE_CUDA=ON`)
 - `include/arnoldi/detail/` — implementation headers
-- `examples/` — `example_symmetric_laplacian`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog)
+- `examples/` — `example_symmetric_laplacian`, `arnoldi_sym_cuda` / `arnoldi_sym_cpu`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog)
+- `tests/` — Catch2 suite (`ctest`), incl. `test_solver_cuda` when CUDA is on
 
 ## Origin
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d987cf5..dc9c9cd 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,9 +1,23 @@
 # Included only when ARNOLDI_BUILD_EXAMPLES=ON (see parent CMakeLists.txt).
 
+if(ARNOLDI_USE_CUDA AND NOT CMAKE_CUDA_ARCHITECTURES)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+        set(CMAKE_CUDA_ARCHITECTURES native)
+    else()
+        set(CMAKE_CUDA_ARCHITECTURES 61)  # Pascal — safe minimum baseline
+    endif()
+    message(STATUS "Arnoldi examples: CMAKE_CUDA_ARCHITECTURES defaulted to "
+        "${CMAKE_CUDA_ARCHITECTURES} (override with -DCMAKE_CUDA_ARCHITECTURES=<cc>)")
+endif()
+
 if(ARNOLDI_BUILD_EXAMPLES)
     add_executable(example_symmetric_laplacian symmetric_laplacian.cpp)
     target_link_libraries(example_symmetric_laplacian PRIVATE arnoldi)
 
+    # CPU twin of arnoldi_sym_cuda — same problem, for backend benchmarking.
+    add_executable(example_arnoldi_sym_cpu arnoldi_sym_cpu.cpp)
+    target_link_libraries(example_arnoldi_sym_cpu PRIVATE arnoldi)
+
     # Class-based Arnoldi<Kind, Scalar> demos (one per kind)
     foreach(_demo IN ITEMS arnoldi_sym arnoldi_nonsym arnoldi_herm)
         add_executable(example_${_demo} ${_demo}.cpp)
@@ -18,6 +32,16 @@ if(ARNOLDI_BUILD_EXAMPLES)
         target_link_libraries(example_${_demo} PRIVATE arnoldi)
     endforeach()
 
+    # CUDA device-matvec example (only when the CUDA backend is enabled)
+    if(ARNOLDI_USE_CUDA)
+        enable_language(CUDA)
+        add_executable(example_arnoldi_sym_cuda arnoldi_sym_cuda.cu)
+        set_target_properties(example_arnoldi_sym_cuda PROPERTIES
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON)
+        target_link_libraries(example_arnoldi_sym_cuda PRIVATE arnoldi)
+    endif()
+
     # MPI parallel example (optional — only if MPI found)
     find_package(MPI COMPONENTS CXX QUIET)
     if(MPI_CXX_FOUND)
@@ -30,8 +54,22 @@ endif()
 # Performance + correctness benchmark against original ARPACK
 if(ARNOLDI_BUILD_EXAMPLES AND ORIGINAL_ARPACK_LIB)
     enable_language(Fortran)
+
+    # With the CUDA backend the benchmark adds a GPU-vs-Fortran tier with
+    # device matvec kernels (__global__ / <<<>>>), so the .cpp must be
+    # compiled by nvcc.
+    if(ARNOLDI_USE_CUDA)
+        enable_language(CUDA)
+        set_source_files_properties(benchmark.cpp PROPERTIES LANGUAGE CUDA)
+    endif()
+
     add_executable(bench_arpack benchmark.cpp)
     target_include_directories(bench_arpack PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+    if(ARNOLDI_USE_CUDA)
+        set_target_properties(bench_arpack PROPERTIES
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON)
+    endif()
     if(TARGET BLAS::BLAS AND TARGET LAPACK::LAPACK)
         target_link_libraries(bench_arpack PRIVATE
             arnoldi ${ORIGINAL_ARPACK_LIB} LAPACK::LAPACK BLAS::BLAS)
diff --git a/examples/arnoldi_sym_cpu.cpp b/examples/arnoldi_sym_cpu.cpp
new file mode 100644
index 0000000..88fb97e
--- /dev/null
+++ b/examples/arnoldi_sym_cpu.cpp
@@ -0,0 +1,52 @@
+// arnoldi_sym_cpu — CPU twin of examples/arnoldi_sym_cuda.cu.
+//
+// Same problem (diagonal operator A(i,i) = i+1, n = 8192, nev = 5,
+// ncv = 8*nev, "LM", tol 1e-10) solved with the default CpuBackend and a
+// host matvec. Times the solve() call so it can be compared directly
+// against the CUDA example as a backend performance benchmark.
+//
+// Build: configure with -DARNOLDI_BUILD_EXAMPLES=ON (no CUDA needed).
+
+#include <arnoldi/arnoldi.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <functional>
+#include <vector>
+
+int main() {
+    const int n   = 1 << 18;  // 8192 — same size as the CUDA example
+    const int nev = 5;
+    const int ncv = 8 * nev;  // matches arnoldi_sym_cuda.cu
+
+    // "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1.
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double> solver("I", n, "LM", nev, ncv);
+    solver.tol(1e-10).maxiter(1000);
+
+    auto t0 = std::chrono::steady_clock::now();
+    solver.solve([n](const double* x, double* y) {
+        for (int i = 0; i < n; ++i) y[i] = static_cast<double>(i + 1) * x[i];
+    });
+    auto t1     = std::chrono::steady_clock::now();
+    double secs = std::chrono::duration<double>(t1 - t0).count();
+
+    if (!solver.converged()) {
+        std::printf("CPU saupd failed, info = %d\n", solver.info());
+        return 1;
+    }
+
+    auto r = solver.eigenpairs(false);  // values only
+    std::sort(r.values.begin(), r.values.end(), std::greater<double>());
+    std::printf("Largest %d eigenvalues of diag(1..%d), CPU backend:\n", nev, n);
+    for (int i = 0; i < nev; ++i) {
+        double exact = static_cast<double>(n - i);
+        std::printf("  lambda[%d] = %.6f   (exact %.0f, |err| = %.2e)\n",
+                    i, r.values[i], exact, std::fabs(r.values[i] - exact));
+    }
+    std::printf("iterations=%d, OP applies=%d\n",
+                solver.num_iterations(), solver.num_op_applies());
+    std::printf("solve() wall time: %.4f s\n", secs);
+    return 0;
+}
diff --git a/examples/arnoldi_sym_cuda.cu b/examples/arnoldi_sym_cuda.cu
new file mode 100644
index 0000000..bf5b1ec
--- /dev/null
+++ b/examples/arnoldi_sym_cuda.cu
@@ -0,0 +1,94 @@
+// arnoldi_sym_cuda — smallest eigenvalues of the 1D Dirichlet Laplacian
+// with the matvec running entirely on the GPU.
+//
+// Demonstrates the CudaBackend device-matvec pattern:
+//   - Arnoldi<Kind::Sym, double, SerialComm, CudaBackend> keeps the Lanczos
+//     workspace (resid/v/workd) in device memory.
+//   - The user matvec callback receives DEVICE pointers and launches a CUDA
+//     kernel on the backend's stream.
+//   - eigenpairs() copies results to the host; eigenpairs_device() leaves
+//     the eigenvectors on the GPU for downstream device work.
+//
+// Build: configure with -DARNOLDI_USE_CUDA=ON -DARNOLDI_BUILD_EXAMPLES=ON.
+
+#include <arnoldi/arnoldi.hpp>
+#include <arnoldi/cuda.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <functional>
+#include <vector>
+
+// y = A x for the diagonal operator A(i,i) = i+1. The spectrum {1,2,...,n}
+// is perfectly separated, so IRA converges fast at any n — a clean showcase
+// of a large GPU matvec with exactly-known eigenvalues. (The tridiagonal
+// Laplacian is clustered at both spectral ends for large n and would need
+// shift-invert to target either end quickly — out of scope for a demo.)
+__global__ void diag_matvec(int n, const double* x, double* y) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    y[i] = static_cast<double>(i + 1) * x[i];
+}
+
+int main() {
+    int dev_count = 0;
+    cudaGetDeviceCount(&dev_count);
+    if (dev_count == 0) {
+        std::printf("no CUDA device available\n");
+        return 0;
+    }
+
+    const int n   = 1 << 18;  // 8192 — a non-trivial GPU matvec
+    const int nev = 5;
+    const int ncv = 8 * nev;  // larger Krylov subspace: the relative gap of
+                              // the top eigenvalues is ~1/n, so a bigger ncv
+                              // keeps IRA convergence fast at large n
+
+    // "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1.
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> solver("I", n, "LM", nev, ncv);
+    solver.tol(1e-10).maxiter(1000);
+
+    cudaStream_t stream  = solver.backend().stream();
+    const int    threads = 256;
+    const int    blocks  = (n + threads - 1) / threads;
+
+    // x, y are device pointers (workspace lives on the GPU). Launch on the
+    // backend's stream so the surrounding cuBLAS calls order correctly.
+    auto t0 = std::chrono::steady_clock::now();
+    solver.solve([&](const double* x, double* y) {
+        diag_matvec<<<blocks, threads, 0, stream>>>(n, x, y);
+    });
+    cudaStreamSynchronize(stream);  // ensure all device work is finished
+    auto t1     = std::chrono::steady_clock::now();
+    double secs = std::chrono::duration<double>(t1 - t0).count();
+
+    if (!solver.converged()) {
+        std::printf("CUDA saupd failed, info = %d\n", solver.info());
+        return 1;
+    }
+
+    auto r = solver.eigenpairs(false);  // values only (host)
+    // eigenpairs(false) returns the wanted set in which-selection order, not
+    // sorted by value — sort descending for a clean presentation.
+    std::sort(r.values.begin(), r.values.end(), std::greater<double>());
+    std::printf("Largest %d eigenvalues of diag(1..%d), CUDA backend:\n", nev, n);
+    for (int i = 0; i < nev; ++i) {
+        double exact = static_cast<double>(n - i);  // n, n-1, ..., n-nev+1
+        std::printf("  lambda[%d] = %.6f   (exact %.0f, |err| = %.2e)\n",
+                    i, r.values[i], exact, std::fabs(r.values[i] - exact));
+    }
+    std::printf("iterations=%d, OP applies=%d\n",
+                solver.num_iterations(), solver.num_op_applies());
+    std::printf("solve() wall time: %.4f s\n", secs);
+
+    // Eigenvectors can also be kept on the device for further GPU work.
+    auto dr = solver.eigenpairs_device(true);
+    std::printf("eigenpairs_device: %d eigenvalues on host, "
+                "%zu-element eigenvector matrix left on device\n",
+                (int)dr.values.size(), (size_t)(n) * nev);
+    return 0;
+}
diff --git a/examples/benchmark.cpp b/examples/benchmark.cpp
index 84dece7..6a6d335 100644
--- a/examples/benchmark.cpp
+++ b/examples/benchmark.cpp
@@ -15,6 +15,11 @@
 #include "kernels.hpp"
 #include "lapack_extra.hpp"
 
+#ifdef ARNOLDI_USE_CUDA
+#include <arnoldi/cuda.hpp>
+#include <cuda_runtime.h>
+#endif
+
 #include <algorithm>
 #include <chrono>
 #include <cmath>
@@ -98,9 +103,12 @@ using arnoldi_examples::av_laplacian_1d;
 // C++ solvers (return converged eigenvalues).
 
 static std::vector<Eig> cpp_dsdrv1(int nx) {
-    int n = nx * nx, nev = 4, ncv = std::min(20, n - 1);
+    // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered,
+    // so a larger Krylov subspace is required to converge at large nx
+    // (ncv=20 stalls past n~10000 in regular mode-1).
+    int n = nx * nx, nev = 4, ncv = std::min(50, n - 1);
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "SM", nev, ncv);
-    s.tol(0.0).maxiter(300).mode(1).ishift(1);
+    s.tol(0.0).maxiter(500).mode(1).ishift(1);
     s.solve([&](const double* x, double* y) { av_sym_laplacian_2d(nx, n, x, y); });
     auto r = s.eigenpairs(true, 0.0);
     std::vector<Eig> out;
@@ -110,9 +118,12 @@ static std::vector<Eig> cpp_dsdrv1(int nx) {
 }
 
 static std::vector<Eig> cpp_dndrv1(int nx) {
-    int n = nx * nx, nev = 4, ncv = std::min(20, n - 1);
+    // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered,
+    // so a larger Krylov subspace is required to converge at large nx
+    // (ncv=20 stalls past n~10000 in regular mode-1).
+    int n = nx * nx, nev = 4, ncv = std::min(50, n - 1);
     arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double> s("I", n, "SM", nev, ncv);
-    s.tol(0.0).maxiter(300).mode(1).ishift(1);
+    s.tol(0.0).maxiter(500).mode(1).ishift(1);
     s.solve([&](const double* x, double* y) { av_conv_diff_2d(nx, n, 0.0, x, y); });
     auto r = s.eigenpairs(true, 0.0, 0.0);
     std::vector<Eig> out;
@@ -170,13 +181,16 @@ static std::vector<Eig> cpp_dndrv2(int n) {
 // Fortran solvers (return converged eigenvalues).
 
 static std::vector<Eig> f90_dsdrv1(int nx) {
-    int n = nx * nx, nev = 4, ncv = std::min(20, n - 1);
+    // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered,
+    // so a larger Krylov subspace is required to converge at large nx
+    // (ncv=20 stalls past n~10000 in regular mode-1).
+    int n = nx * nx, nev = 4, ncv = std::min(50, n - 1);
     int ldv = n, lworkl = ncv * (ncv + 8);
     double tol = 0.0, sigma = 0.0;
     std::vector<double> v(ldv * ncv), workd(3 * n), workl(lworkl), resid(n), d(ncv * 2);
     std::vector<int> sel(ncv);
     int iparam[11] = {}, ipntr[11] = {};
-    iparam[0] = 1; iparam[2] = 300; iparam[6] = 1;
+    iparam[0] = 1; iparam[2] = 500; iparam[6] = 1;
     int ido = 0, info = 0;
     while (true) {
         dsaupd_(&ido, "I", &n, "SM", &nev, &tol, resid.data(), &ncv,
@@ -197,14 +211,17 @@ static std::vector<Eig> f90_dsdrv1(int nx) {
 }
 
 static std::vector<Eig> f90_dndrv1(int nx) {
-    int n = nx * nx, nev = 4, ncv = std::min(20, n - 1);
+    // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered,
+    // so a larger Krylov subspace is required to converge at large nx
+    // (ncv=20 stalls past n~10000 in regular mode-1).
+    int n = nx * nx, nev = 4, ncv = std::min(50, n - 1);
     int ldv = n, lworkl = 3 * ncv * ncv + 6 * ncv;
     double tol = 0.0, sigmar = 0.0, sigmai = 0.0;
     std::vector<double> v(ldv * ncv), workd(3 * n), workl(lworkl), resid(n);
     std::vector<double> dr(ncv + 1), di(ncv + 1), workev(3 * ncv);
     std::vector<int> sel(ncv);
     int iparam[11] = {}, ipntr[14] = {};
-    iparam[0] = 1; iparam[2] = 300; iparam[6] = 1;
+    iparam[0] = 1; iparam[2] = 500; iparam[6] = 1;
     int ido = 0, info = 0;
     while (true) {
         dnaupd_(&ido, "I", &n, "SM", &nev, &tol, resid.data(), &ncv,
@@ -305,6 +322,46 @@ static std::vector<Eig> f90_dndrv2(int n) {
     return out;
 }
 
+#ifdef ARNOLDI_USE_CUDA
+__global__ void bench_laplacian_2d(int nx, int n, double inv_h2,
+                                    const double* v, double* w) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+    int i = idx % nx;          // column within the grid row
+    int j = idx / nx;          // grid row
+    double s = 4.0 * v[idx];
+    if (i > 0)      s -= v[idx - 1];
+    if (i < nx - 1) s -= v[idx + 1];
+    if (j > 0)      s -= v[idx - nx];
+    if (j < nx - 1) s -= v[idx + nx];
+    w[idx] = inv_h2 * s;
+}
+
+static std::vector<Eig> gpu_dsdrv1(int nx) {
+    // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered,
+    // so a larger Krylov subspace is required to converge at large nx
+    // (ncv=20 stalls past n~10000 in regular mode-1).
+    int n = nx * nx, nev = 4, ncv = std::min(50, n - 1);
+    double inv_h2 = static_cast<double>(nx + 1) * (nx + 1);  // 1/h2
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "SM", nev, ncv);
+    s.tol(0.0).maxiter(500).mode(1).ishift(1);  // matches cpp_dsdrv1
+    cudaStream_t stream = s.backend().stream();
+    const int threads = 256, blocks = (n + threads - 1) / threads;
+    // x, y are device pointers (workspace lives on the GPU). Launch on the
+    // backend's stream so the surrounding cuBLAS calls order correctly.
+    s.solve([&](const double* x, double* y) {
+        bench_laplacian_2d<<<blocks, threads, 0, stream>>>(nx, n, inv_h2, x, y);
+    });
+    cudaStreamSynchronize(stream);
+    auto r = s.eigenpairs(true, 0.0);
+    std::vector<Eig> out;
+    for (int i = 0; i < s.num_converged(); ++i)
+        out.push_back({r.values[i], 0.0});
+    return out;
+}
+#endif  // ARNOLDI_USE_CUDA
+
 struct ProblemResult {
     const char* name;
     int n, nev;
@@ -367,11 +424,19 @@ int main(int argc, char* argv[]) {
     }
 
     struct Size { const char* label; int n; };
-    Size sizes[] = {{"SMALL (n=100)", 100}, {"MEDIUM (n=2500)", 2500}, {"LARGE (n=10000)", 10000}};
+    Size sizes[] = {{"SMALL (n=100)", 100}, {"MEDIUM (n=5000)", 5000}, {"LARGE (n=40000)", 40000}};
 
     std::vector<std::pair<const char*, std::vector<ProblemResult>>> all_tiers;
     bool all_ok = true;
 
+#ifdef ARNOLDI_USE_CUDA
+    int cuda_dev_count = 0;
+    cudaGetDeviceCount(&cuda_dev_count);
+    if (cuda_dev_count == 0)
+        std::printf("[CUDA] backend built but no device available — "
+                    "skipping GPU rows\n");
+#endif
+
     for (auto& sz : sizes) {
         int nx = int(std::sqrt(double(sz.n)));
         int n2d = nx * nx;
@@ -407,6 +472,17 @@ int main(int argc, char* argv[]) {
             rows.push_back({"dndrv2", sz.n, 4, ct, ft, err, (int)ce.size(), (int)fe.size()});
         }
 
+#ifdef ARNOLDI_USE_CUDA
+        if (cuda_dev_count > 0) {
+            std::vector<Eig> ge, fe;
+            auto gt = time_fn([&]{ ge = gpu_dsdrv1(nx); }, warmup, reps);
+            auto ft = time_fn([&]{ fe = f90_dsdrv1(nx); }, warmup, reps);
+            double err = eig_max_error(ge, fe);
+            rows.push_back({"dsdrv1-gpu", n2d, 4, gt, ft, err,
+                            (int)ge.size(), (int)fe.size()});
+        }
+#endif
+
         for (auto& r : rows) {
             bool val_ok = r.max_rel_err < 1e-10
                           && r.nconv_cpp >= r.nev && r.nconv_f90 >= r.nev;
diff --git a/examples/kernels.hpp b/examples/kernels.hpp
index d588bc5..641b1aa 100644
--- a/examples/kernels.hpp
+++ b/examples/kernels.hpp
@@ -21,7 +21,8 @@ namespace arnoldi_examples {
     Real one = 1;
     tv_sym_row(nx, v, w);
     RO::axpy(nx, -one, v + nx, 1, w, 1);
-    for (int j = 2; j < nx - 1; j++) {
+    // Fortran dsdrv1.f: do 30 j = 2, nx-1  (inclusive) -> blocks 1..nx-2.
+    for (int j = 2; j < nx; j++) {
       int lo = (j - 1) * nx;
       tv_sym_row(nx, v + lo, w + lo);
       RO::axpy(nx, -one, v + lo - nx, 1, w + lo, 1);
@@ -55,7 +56,8 @@ namespace arnoldi_examples {
     Real h2  = one / static_cast<Real>((nx + 1) * (nx + 1));
     tv_cd_row(nx, rho, v, w);
     RO::axpy(nx, -one / h2, v + nx, 1, w, 1);
-    for (int j = 2; j < nx - 1; j++) {
+    // Fortran dndrv1.f: do 30 j = 2, nx-1  (inclusive) -> blocks 1..nx-2.
+    for (int j = 2; j < nx; j++) {
       int lo = (j - 1) * nx;
       tv_cd_row(nx, rho, v + lo, w + lo);
       RO::axpy(nx, -one / h2, v + lo - nx, 1, w + lo, 1);
diff --git a/include/arnoldi/arnoldi.hpp b/include/arnoldi/arnoldi.hpp
index 0f1d477..725cdbf 100644
--- a/include/arnoldi/arnoldi.hpp
+++ b/include/arnoldi/arnoldi.hpp
@@ -52,11 +52,12 @@ namespace arnoldi {
 
   }  // namespace detail
 
-  template <Kind K, typename Scalar, typename Comm = SerialComm>
+  template <Kind K, typename Scalar, typename Comm = SerialComm, typename Backend = arnoldi::detail::CpuBackend>
   class Arnoldi {
   public:
     using real_type   = typename detail::real_of<Scalar>::type;
     using scalar_type = Scalar;
+    using backend_type = Backend;
 
   private:
     static_assert(std::is_floating_point_v<real_type>,
@@ -72,9 +73,17 @@ namespace arnoldi {
     static_assert(K != Kind::Herm || detail::is_complex_v<Scalar>,
                   "Kind::Herm requires a complex Scalar (std::complex<float/double>)");
 
+    // The nonsymmetric path goes through real-Schur LAPACK auxiliaries
+    // (dlahqr_/dtrsen_/dtrevc_) that have no cuSOLVER analogue for
+    // Hessenberg matrices. Only Sym / Herm are supported under device
+    // backends; reject Nonsym + non-CpuBackend at compile time.
+    static_assert(K != Kind::Nonsym || std::is_same_v<Backend, arnoldi::detail::CpuBackend>,
+                  "Kind::Nonsym requires CpuBackend (no device specialization for the real-Schur LAPACK aux)");
+
   public:
-    Arnoldi(std::string bmat, int n, std::string which, int nev, int ncv, Comm comm = Comm{}) :
-        bmat_(std::move(bmat)), which_(std::move(which)), n_(n), nev_(nev), ncv_(ncv), ldv_(n), comm_(std::move(comm)) {
+    Arnoldi(std::string bmat, int n, std::string which, int nev, int ncv, Comm comm = Comm{}, Backend backend = Backend{}) :
+        bmat_(std::move(bmat)), which_(std::move(which)), n_(n), nev_(nev), ncv_(ncv), ldv_(n), comm_(std::move(comm)),
+        backend_(std::move(backend)) {
       if (n_ <= 0) throw std::invalid_argument("Arnoldi: n must be > 0");
       if (nev_ <= 0) throw std::invalid_argument("Arnoldi: nev must be > 0");
       int n_global = comm_.allreduce_sum(n_);
@@ -128,12 +137,32 @@ namespace arnoldi {
       return *this;
     }
 
+    // Seed the iteration with a user-provided initial residual vector. `r`
+    // is a HOST pointer; under CudaBackend the data is staged H->D once,
+    // here, into the device-resident resid_ buffer.
     Arnoldi& initial_resid(const Scalar* r) {
-      std::copy(r, r + n_, resid_.begin());
+      arnoldi::detail::buffer_traits<Backend, scalar_type>::copy_from_host(resid_, r, static_cast<std::size_t>(n_));
+      info_in_ = 1;
+      return *this;
+    }
+
+    // Device-pointer overload: only available under a non-CpuBackend (e.g.
+    // CudaBackend). Copies device-to-device into resid_. The device-specific
+    // buffer_traits specialization supplies copy_from_device; this member
+    // is SFINAE'd out under CpuBackend.
+    template <typename B = Backend, std::enable_if_t<!std::is_same_v<B, arnoldi::detail::CpuBackend>, int> = 0>
+    Arnoldi& initial_resid_device(const Scalar* r) {
+      arnoldi::detail::buffer_traits<B, scalar_type>::copy_from_device(resid_, r, static_cast<std::size_t>(n_));
       info_in_ = 1;
       return *this;
     }
 
+    // Accessor returning a reference to the owned backend. Users invoking
+    // a GPU matvec from inside their callback can grab the cuBLAS handle
+    // and stream via `solver.backend().handle()` / `solver.backend().stream()`.
+    Backend&       backend() noexcept { return backend_; }
+    const Backend& backend() const noexcept { return backend_; }
+
     template <class Op>
     void solve(Op&& op) {
       info_ = info_in_;
@@ -165,6 +194,16 @@ namespace arnoldi {
 
     using Result = std::conditional_t<K == Kind::Sym, SymResult, std::conditional_t<K == Kind::Nonsym, NonsymResult, HermResult>>;
 
+    // Sym / Herm result with eigenvectors left on the device. Eigenvalues are
+    // always host-resident (they come out of the ncv-sized Hessenberg
+    // subproblem, computed on host). The eigenvector matrix is the device
+    // buffer seupd's final GEMM wrote into — handed back without the n×nev
+    // device→host copy that eigenpairs() performs.
+    struct DeviceResult {
+      std::vector<real_type>                                                    values;   // size nev (host)
+      typename arnoldi::detail::buffer_traits<Backend, scalar_type>::vector_type vectors;  // n × nev (device)
+    };
+
     // Real shift (Sym / Herm only).
     template <Kind KK = K, std::enable_if_t<KK != Kind::Nonsym, int> = 0>
     Result eigenpairs(bool compute_vectors = true, real_type sigma = real_type{}) {
@@ -177,6 +216,25 @@ namespace arnoldi {
       return extract_(compute_vectors, sigmar, sigmai);
     }
 
+    // Sym / Herm only, device backends only: extract eigenpairs leaving the
+    // eigenvector matrix on the device (column-major, n × nev). Use this when
+    // downstream work consumes the eigenvectors on the GPU and the host copy
+    // that eigenpairs() does would be wasted.
+    template <Kind KK = K, typename B = Backend,
+              std::enable_if_t<KK != Kind::Nonsym && !std::is_same_v<B, arnoldi::detail::CpuBackend>, int> = 0>
+    DeviceResult eigenpairs_device(bool compute_vectors = true, real_type sigma = real_type{}) {
+      DeviceResult out{};
+      out.values.assign(static_cast<std::size_t>(nev_), real_type{});
+      if (compute_vectors) out.vectors.assign(static_cast<std::size_t>(n_) * nev_, Scalar{});
+
+      auto bref = backend_.ref();
+      arnoldi::detail::seupd<Scalar, Backend>(bref, compute_vectors, "A", out.values.data(),
+                                              compute_vectors ? out.vectors.data() : nullptr, n_, sigma, bmat_.c_str(), n_,
+                                              which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(),
+                                              ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, comm_);
+      return out;
+    }
+
     bool             converged() const noexcept { return info_ == 0 && iparam_[4] >= nev_; }
     int              num_converged() const noexcept { return iparam_[4]; }
     int              num_iterations() const noexcept { return iparam_[2]; }
@@ -192,27 +250,31 @@ namespace arnoldi {
   private:
     template <class Op, class Bop>
     void dispatch_aupd_(Op&& op, Bop&& bop) {
+      auto bref = backend_.ref();
       if constexpr (K == Kind::Nonsym) {
-        arnoldi::detail::naupd<real_type>(bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_,
-                                      iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_,
-                                      std::forward<Op>(op), std::forward<Bop>(bop), comm_);
+        arnoldi::detail::naupd<real_type, Backend>(bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_,
+                                                   v_.data(), ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(),
+                                                   lworkl_, info_, std::forward<Op>(op), std::forward<Bop>(bop), comm_);
       } else {  // Sym or Herm
-        arnoldi::detail::saupd<Scalar>(bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_,
-                                   iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_,
-                                   std::forward<Op>(op), std::forward<Bop>(bop), comm_);
+        arnoldi::detail::saupd<Scalar, Backend>(bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_,
+                                                v_.data(), ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(),
+                                                lworkl_, info_, std::forward<Op>(op), std::forward<Bop>(bop), comm_);
       }
     }
 
     template <class Op>
     void dispatch_aupd_(Op&& op) {
+      auto bref = backend_.ref();
       if constexpr (K == Kind::Nonsym) {
-        arnoldi::detail::naupd<real_type>(
-            bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(),
-            workd_.data(), workl_.data(), lworkl_, info_, std::forward<Op>(op), [](const real_type*, real_type*) {}, comm_);
+        arnoldi::detail::naupd<real_type, Backend>(
+            bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(),
+            ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, std::forward<Op>(op),
+            [](const real_type*, real_type*) {}, comm_);
       } else {
-        arnoldi::detail::saupd<Scalar>(
-            bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(),
-            workd_.data(), workl_.data(), lworkl_, info_, std::forward<Op>(op), [](const Scalar*, Scalar*) {}, comm_);
+        arnoldi::detail::saupd<Scalar, Backend>(
+            bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(),
+            ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, std::forward<Op>(op), [](const Scalar*, Scalar*) {},
+            comm_);
       }
     }
 
@@ -221,9 +283,28 @@ namespace arnoldi {
       out.values.assign(static_cast<std::size_t>(nev_), real_type{});
       if (rvec) out.vectors.assign(static_cast<std::size_t>(n_) * nev_, Scalar{});
 
-      arnoldi::detail::seupd<Scalar>(rvec, "A", out.values.data(), rvec ? out.vectors.data() : nullptr, n_, sigma, bmat_.c_str(), n_,
-                                 which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(),
-                                 workd_.data(), workl_.data(), lworkl_, info_, comm_);
+      auto bref = backend_.ref();
+      if constexpr (std::is_same_v<Backend, arnoldi::detail::CpuBackend>) {
+        // CPU: seupd writes the final V*S directly into host out.vectors.
+        arnoldi::detail::seupd<Scalar, Backend>(bref, rvec, "A", out.values.data(), rvec ? out.vectors.data() : nullptr, n_, sigma,
+                                                bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(),
+                                                ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_,
+                                                comm_);
+      } else {
+        // Device: allocate a device buffer for z, run seupd into it (final
+        // GEMM lands on device), then copy down to host out.vectors. The
+        // matching `buffer_copy_to_host` and `buffer_traits` specialisations
+        // come from the device-specific backend header (e.g. cuda.hpp).
+        typename arnoldi::detail::buffer_traits<Backend, scalar_type>::vector_type z_dev;
+        if (rvec) z_dev.assign(static_cast<std::size_t>(n_) * nev_, Scalar{});
+        arnoldi::detail::seupd<Scalar, Backend>(bref, rvec, "A", out.values.data(), rvec ? z_dev.data() : nullptr, n_, sigma,
+                                                bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(),
+                                                ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_,
+                                                comm_);
+        if (rvec)
+          arnoldi::detail::buffer_traits<Backend, scalar_type>::copy_to_host(out.vectors.data(), z_dev,
+                                                                              static_cast<std::size_t>(n_) * nev_);
+      }
       return out;
     }
 
@@ -235,10 +316,11 @@ namespace arnoldi {
       out.values_im.assign(static_cast<std::size_t>(nev_ + 1), real_type{});
       if (rvec) out.vectors.assign(static_cast<std::size_t>(n_) * (nev_ + 1), Scalar{});
 
-      arnoldi::detail::neupd<real_type>(rvec, "A", out.values_re.data(), out.values_im.data(), rvec ? out.vectors.data() : nullptr,
-                                    n_, sigmar, sigmai, workev_.data(), bmat_.c_str(), n_, which_.c_str(), nev_, tol_,
-                                    resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(), workd_.data(),
-                                    workl_.data(), lworkl_, info_);
+      auto bref = backend_.ref();
+      arnoldi::detail::neupd<real_type, Backend>(bref, rvec, "A", out.values_re.data(), out.values_im.data(),
+                                                 rvec ? out.vectors.data() : nullptr, n_, sigmar, sigmai, workev_.data(),
+                                                 bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(),
+                                                 ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_);
       return out;
     }
 
@@ -250,10 +332,15 @@ namespace arnoldi {
     int                      info_in_ = 0;  // initial value fed into *aupd (1 if user gave resid)
     real_type                tol_     = real_type{};
     Comm                     comm_;
-
-    std::vector<scalar_type> resid_;
-    std::vector<scalar_type> v_;
-    std::vector<scalar_type> workd_;
+    Backend                  backend_;
+
+    // Length-n workspace lives on the backend (host std::vector under
+    // CpuBackend, device_vector under CudaBackend).
+    typename arnoldi::detail::buffer_traits<Backend, scalar_type>::vector_type resid_;
+    typename arnoldi::detail::buffer_traits<Backend, scalar_type>::vector_type v_;
+    typename arnoldi::detail::buffer_traits<Backend, scalar_type>::vector_type workd_;
+    // The IRA bookkeeping (Hessenberg, Ritz values/bounds, Givens scratch)
+    // is small (ncv²-sized) and stays host-resident under every backend.
     std::vector<real_type>   workl_;
     std::vector<real_type>   workev_;  // Nonsym only
 
@@ -261,12 +348,12 @@ namespace arnoldi {
     std::array<int, 14>      ipntr_{};
   };
 
-  template <class R, class C = SerialComm>
-  using SymArnoldi = Arnoldi<Kind::Sym, R, C>;
-  template <class R, class C = SerialComm>
-  using NonsymArnoldi = Arnoldi<Kind::Nonsym, R, C>;
-  template <class R, class C = SerialComm>
-  using HermArnoldi = Arnoldi<Kind::Herm, std::complex<R>, C>;
+  template <class R, class C = SerialComm, class B = arnoldi::detail::CpuBackend>
+  using SymArnoldi = Arnoldi<Kind::Sym, R, C, B>;
+  template <class R, class C = SerialComm, class B = arnoldi::detail::CpuBackend>
+  using NonsymArnoldi = Arnoldi<Kind::Nonsym, R, C, B>;
+  template <class R, class C = SerialComm, class B = arnoldi::detail::CpuBackend>
+  using HermArnoldi = Arnoldi<Kind::Herm, std::complex<R>, C, B>;
 
 }  // namespace arnoldi
 
diff --git a/include/arnoldi/cuda.hpp b/include/arnoldi/cuda.hpp
new file mode 100644
index 0000000..55d3c15
--- /dev/null
+++ b/include/arnoldi/cuda.hpp
@@ -0,0 +1,607 @@
+#ifndef ARNOLDI_CUDA_HPP
+#define ARNOLDI_CUDA_HPP
+
+// CUDA + cuBLAS backend for the symmetric / Hermitian Arnoldi path.
+//
+// Gated behind ARNOLDI_USE_CUDA. When the macro is defined, this header
+// provides:
+//   - arnoldi::detail::CudaBackend         (RAII over cuBLAS handle + stream)
+//   - arnoldi::detail::BackendRef<CudaBackend> partial spec (handle + stream)
+//   - arnoldi::detail::Ops<Scalar, CudaBackend> partial spec (cuBLAS dispatch)
+//
+// Callback contract under CudaBackend:
+//
+//   The user matvec callback `op(const Scalar* x, Scalar* y)` receives
+//   DEVICE pointers. The matvec runs on the stream returned by
+//   backend.stream(); if the user launches on a different stream they
+//   must synchronise it before returning, otherwise the subsequent
+//   cuBLAS calls will race against the user kernel.
+//
+// Eigenvalues: extracted by seupd into host arrays (Real eigenvalues, plus
+// optionally Scalar eigenvectors after a final cublasGEMM). The Arnoldi
+// class transfers the result to host std::vector on eigenpairs() return.
+//
+// Pointer-mode policy: CUBLAS_POINTER_MODE_HOST. dotc/nrm2 return a host
+// scalar synchronously, matching CPU semantics exactly.
+//
+// Supported scalar types: double, float, std::complex<double>,
+// std::complex<float>. Sym + real, Herm + complex.
+
+#ifdef ARNOLDI_USE_CUDA
+
+#include <arnoldi/detail/ops.hpp>
+#include <complex>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace arnoldi {
+
+  namespace detail {
+
+    // Internal error-check helpers. Keep them small and noisy; on failure
+    // throw so the user-facing exception carries a meaningful message.
+    inline void cuda_check(cudaError_t e, const char* what) {
+      if (e != cudaSuccess) {
+        throw std::runtime_error(std::string("arnoldi/cuda: ") + what + ": " + cudaGetErrorString(e));
+      }
+    }
+    inline void cublas_check(cublasStatus_t s, const char* what) {
+      if (s != CUBLAS_STATUS_SUCCESS) {
+        throw std::runtime_error(std::string("arnoldi/cuda: ") + what + ": cublas error " + std::to_string(static_cast<int>(s)));
+      }
+    }
+
+    inline cublasOperation_t cublas_op(const char* s) {
+      switch (s[0]) {
+        case 'N': case 'n': return CUBLAS_OP_N;
+        case 'T': case 't': return CUBLAS_OP_T;
+        case 'C': case 'c': return CUBLAS_OP_C;
+      }
+      return CUBLAS_OP_N;
+    }
+
+  }  // namespace detail
+
+  // RAII over a cuBLAS handle bound to a stream. The Arnoldi class stores
+  // one of these by value; ref() returns a trivially-copyable view that
+  // gets threaded through the algorithmic templates.
+  //
+  // Construction modes:
+  //   CudaBackend()                       — creates own handle + stream.
+  //   CudaBackend(cudaStream_t s)         — uses caller's stream; creates own
+  //                                          handle bound to that stream.
+  //
+  // Non-copyable, movable. Default ctor owns its stream and destroys it.
+  class CudaBackend {
+  public:
+    CudaBackend() : owns_stream_(true) {
+      detail::cuda_check(cudaStreamCreate(&stream_), "cudaStreamCreate");
+      detail::cublas_check(cublasCreate(&handle_), "cublasCreate");
+      detail::cublas_check(cublasSetStream(handle_, stream_), "cublasSetStream");
+      detail::cublas_check(cublasSetPointerMode(handle_, CUBLAS_POINTER_MODE_HOST), "cublasSetPointerMode(HOST)");
+    }
+    explicit CudaBackend(cudaStream_t s) : stream_(s), owns_stream_(false) {
+      detail::cublas_check(cublasCreate(&handle_), "cublasCreate");
+      detail::cublas_check(cublasSetStream(handle_, stream_), "cublasSetStream");
+      detail::cublas_check(cublasSetPointerMode(handle_, CUBLAS_POINTER_MODE_HOST), "cublasSetPointerMode(HOST)");
+    }
+
+    CudaBackend(const CudaBackend&)            = delete;
+    CudaBackend& operator=(const CudaBackend&) = delete;
+
+    CudaBackend(CudaBackend&& other) noexcept :
+        handle_(other.handle_), stream_(other.stream_), owns_stream_(other.owns_stream_) {
+      other.handle_       = nullptr;
+      other.stream_       = nullptr;
+      other.owns_stream_  = false;
+    }
+    CudaBackend& operator=(CudaBackend&& other) noexcept {
+      if (this != &other) {
+        release_();
+        handle_            = other.handle_;
+        stream_            = other.stream_;
+        owns_stream_       = other.owns_stream_;
+        other.handle_      = nullptr;
+        other.stream_      = nullptr;
+        other.owns_stream_ = false;
+      }
+      return *this;
+    }
+
+    ~CudaBackend() { release_(); }
+
+    cublasHandle_t handle() const noexcept { return handle_; }
+    cudaStream_t   stream() const noexcept { return stream_; }
+
+    detail::BackendRef<CudaBackend> ref() const noexcept;
+
+  private:
+    void release_() noexcept {
+      if (handle_) cublasDestroy(handle_);
+      if (owns_stream_ && stream_) cudaStreamDestroy(stream_);
+      handle_      = nullptr;
+      stream_      = nullptr;
+      owns_stream_ = false;
+    }
+
+    cublasHandle_t handle_      = nullptr;
+    cudaStream_t   stream_      = nullptr;
+    bool           owns_stream_ = false;
+  };
+
+  namespace detail {
+
+    // Non-owning view threaded through the algorithmic templates.
+    template <>
+    struct BackendRef<CudaBackend> {
+      cublasHandle_t handle = nullptr;
+      cudaStream_t   stream = nullptr;
+    };
+
+    // Minimal RAII vector over device memory. Mirrors the small slice of
+    // std::vector<T> that the Arnoldi class actually uses: default ctor,
+    // .assign(n, T{}) (zero-fill via cudaMemset), .data(), .size(), and a
+    // destructor that frees. Non-copyable, movable.
+    //
+    // IEEE-754 zero is all-bits-zero for both real and complex (each
+    // component zero), so a single cudaMemset suffices for the zero-fill
+    // initialization that Arnoldi's ctor performs.
+    template <typename T>
+    class device_vector {
+    public:
+      device_vector() noexcept = default;
+
+      device_vector(const device_vector&)            = delete;
+      device_vector& operator=(const device_vector&) = delete;
+
+      device_vector(device_vector&& other) noexcept : data_(other.data_), size_(other.size_) {
+        other.data_ = nullptr;
+        other.size_ = 0;
+      }
+      device_vector& operator=(device_vector&& other) noexcept {
+        if (this != &other) {
+          release_();
+          data_       = other.data_;
+          size_       = other.size_;
+          other.data_ = nullptr;
+          other.size_ = 0;
+        }
+        return *this;
+      }
+
+      ~device_vector() { release_(); }
+
+      // Allocate n elements and fill with `val`. Currently only val == T{}
+      // (zero) is needed by Arnoldi; we'd add a host-staged fill if we ever
+      // needed a non-zero initial value.
+      void assign(std::size_t n, const T& val) {
+        resize_(n);
+        if (n == 0) return;
+        if (val == T{}) {
+          cuda_check(cudaMemset(data_, 0, n * sizeof(T)), "device_vector::assign(zero) cudaMemset");
+        } else {
+          std::vector<T> host(n, val);
+          cuda_check(cudaMemcpy(data_, host.data(), n * sizeof(T), cudaMemcpyHostToDevice),
+                     "device_vector::assign(fill) cudaMemcpy");
+        }
+      }
+
+      T*          data() noexcept { return data_; }
+      const T*    data() const noexcept { return data_; }
+      std::size_t size() const noexcept { return size_; }
+
+    private:
+      void resize_(std::size_t n) {
+        if (size_ == n) return;
+        release_();
+        if (n > 0) {
+          void* p = nullptr;
+          cuda_check(cudaMalloc(&p, n * sizeof(T)), "device_vector::resize_ cudaMalloc");
+          data_ = static_cast<T*>(p);
+          size_ = n;
+        }
+      }
+      void release_() noexcept {
+        if (data_) cudaFree(data_);
+        data_ = nullptr;
+        size_ = 0;
+      }
+
+      T*          data_ = nullptr;
+      std::size_t size_ = 0;
+    };
+
+    // Buffer-type trait specialization: under CudaBackend the Arnoldi
+    // workspace lives in device_vector. Static helpers shuttle data
+    // between host pointers and the device buffer for initial_resid,
+    // initial_resid_device, and extract_'s eigenvector copy-down.
+    template <typename T>
+    struct buffer_traits<CudaBackend, T> {
+      using vector_type = device_vector<T>;
+
+      // Host pointer -> device buffer (initial_resid upload).
+      static void copy_from_host(vector_type& dst, const T* src, std::size_t n) {
+        cuda_check(cudaMemcpy(dst.data(), src, n * sizeof(T), cudaMemcpyHostToDevice), "buffer_traits::copy_from_host");
+      }
+
+      // Device pointer -> device buffer (initial_resid_device pass-through).
+      static void copy_from_device(vector_type& dst, const T* src, std::size_t n) {
+        cuda_check(cudaMemcpy(dst.data(), src, n * sizeof(T), cudaMemcpyDeviceToDevice), "buffer_traits::copy_from_device");
+      }
+
+      // Device buffer -> host pointer (extract_ eigenvector copy-down).
+      static void copy_to_host(T* host_dst, const vector_type& src, std::size_t n) {
+        cuda_check(cudaMemcpy(host_dst, src.data(), n * sizeof(T), cudaMemcpyDeviceToHost), "buffer_traits::copy_to_host");
+      }
+    };
+
+    // Overloads for the backend-aware extension points (declared in ops.hpp).
+
+    // Length-`len` allreduce on a device buffer: stage D->H, allreduce on a
+    // host scratch buffer, copy H->D, sync. Always small (<= ncv).
+    template <typename Scalar, typename Comm>
+    void backend_allreduce(BackendRef<CudaBackend> bref, const Comm& comm, Scalar* buf, int len) {
+      // SerialComm::allreduce_sum is an identity no-op. Staging the buffer
+      // D->H->D and synchronizing twice purely to feed it is wasted work on
+      // the Lanczos hot path — skip the round-trip entirely. The buffer is
+      // already correct in device memory.
+      if constexpr (std::is_same_v<Comm, ::arnoldi::SerialComm>) {
+        (void)bref;
+        (void)comm;
+        (void)buf;
+        (void)len;
+        return;
+      } else {
+        std::vector<Scalar> host_buf(static_cast<std::size_t>(len));
+      cuda_check(cudaMemcpyAsync(host_buf.data(), buf, static_cast<std::size_t>(len) * sizeof(Scalar), cudaMemcpyDeviceToHost,
+                                 bref.stream),
+                 "backend_allreduce D2H");
+      cuda_check(cudaStreamSynchronize(bref.stream), "backend_allreduce sync D2H");
+      comm.allreduce_sum(host_buf.data(), len);
+      cuda_check(cudaMemcpyAsync(buf, host_buf.data(), static_cast<std::size_t>(len) * sizeof(Scalar), cudaMemcpyHostToDevice,
+                                 bref.stream),
+                 "backend_allreduce H2D");
+      cuda_check(cudaStreamSynchronize(bref.stream), "backend_allreduce sync H2D");
+      }
+    }
+
+  }  // namespace detail
+
+  inline detail::BackendRef<CudaBackend> CudaBackend::ref() const noexcept {
+    return detail::BackendRef<CudaBackend>{handle_, stream_};
+  }
+
+  namespace detail {
+
+    // Ops<Scalar, CudaBackend> partial specialisation.
+    //
+    // Provides the length-n dispatch surface for the Sym / Herm path:
+    //   copy/scal/rscal/axpy/raxpy/nrm2/dotc/rdotc/gemv/gemm/gemv_rv/larnv/
+    //   lascl_/zero/read_scalar
+    //
+    // Does NOT provide the host-only LAPACK aux (lamch, lapy2, steqr,
+    // laset_, larfg_, lartg_, lahqr_, trevc_, geqr2_, trsen_, trmm_, ger,
+    // lacpy, lanhs, lae2_, laev2_, lasr_, lasrt_, lanst_, swap). Those run
+    // on host-resident workl/h/q via the primary CPU Ops template, which
+    // is reached by Ops<Real>::method(...) call sites (default CpuBackend).
+    template <typename Scalar>
+    struct Ops<Scalar, CudaBackend> {
+      using Real                       = real_t<Scalar>;
+      static constexpr bool is_complex = !std::is_same_v<Scalar, Real>;
+      using BRef                       = BackendRef<CudaBackend>;
+
+      // ---- fills / scalar reads ---------------------------------------------
+      static void zero(BRef bref, int n, Scalar* x) {
+        // IEEE-754 zero is all-bits-zero for both real and complex (each
+        // component zero). cudaMemsetAsync is async on the stream.
+        cuda_check(cudaMemsetAsync(x, 0, static_cast<std::size_t>(n) * sizeof(Scalar), bref.stream), "cudaMemsetAsync(zero)");
+      }
+
+      static Scalar read_scalar(BRef bref, const Scalar* p) {
+        Scalar host_val{};
+        cuda_check(cudaMemcpyAsync(&host_val, p, sizeof(Scalar), cudaMemcpyDeviceToHost, bref.stream),
+                   "cudaMemcpyAsync(read_scalar)");
+        cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(read_scalar)");
+        return host_val;
+      }
+
+      // Async single-element D->H stage (no per-call sync). The caller batches
+      // many of these and issues one sync() before reading host memory,
+      // collapsing what would be one stream stall per Lanczos step into one
+      // stall per saitr call.
+      static void read_scalar_async(BRef bref, Scalar* host_dst, const Scalar* src) {
+        cuda_check(cudaMemcpyAsync(host_dst, src, sizeof(Scalar), cudaMemcpyDeviceToHost, bref.stream),
+                   "cudaMemcpyAsync(read_scalar_async)");
+      }
+
+      static void sync(BRef bref) { cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(sync)"); }
+
+      // ---- BLAS-1 -----------------------------------------------------------
+      static void copy(BRef bref, int n, const Scalar* x, int incx, Scalar* y, int incy) {
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDcopy(bref.handle, n, x, incx, y, incy), "cublasDcopy");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasScopy(bref.handle, n, x, incx, y, incy), "cublasScopy");
+        else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+          cublas_check(cublasZcopy(bref.handle, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
+                                   reinterpret_cast<cuDoubleComplex*>(y), incy),
+                       "cublasZcopy");
+        else
+          cublas_check(cublasCcopy(bref.handle, n, reinterpret_cast<const cuComplex*>(x), incx,
+                                   reinterpret_cast<cuComplex*>(y), incy),
+                       "cublasCcopy");
+      }
+
+      static void scal(BRef bref, int n, Scalar a, Scalar* x, int incx) {
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDscal(bref.handle, n, &a, x, incx), "cublasDscal");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasSscal(bref.handle, n, &a, x, incx), "cublasSscal");
+        else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+          cublas_check(cublasZscal(bref.handle, n, reinterpret_cast<const cuDoubleComplex*>(&a),
+                                   reinterpret_cast<cuDoubleComplex*>(x), incx),
+                       "cublasZscal");
+        else
+          cublas_check(cublasCscal(bref.handle, n, reinterpret_cast<const cuComplex*>(&a),
+                                   reinterpret_cast<cuComplex*>(x), incx),
+                       "cublasCscal");
+      }
+
+      static void rscal(BRef bref, int n, Real a, Scalar* x, int incx) {
+        if constexpr (!is_complex) {
+          scal(bref, n, a, x, incx);
+        } else if constexpr (std::is_same_v<Real, double>) {
+          cublas_check(cublasZdscal(bref.handle, n, &a, reinterpret_cast<cuDoubleComplex*>(x), incx), "cublasZdscal");
+        } else {
+          cublas_check(cublasCsscal(bref.handle, n, &a, reinterpret_cast<cuComplex*>(x), incx), "cublasCsscal");
+        }
+      }
+
+      static void axpy(BRef bref, int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) {
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDaxpy(bref.handle, n, &a, x, incx, y, incy), "cublasDaxpy");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasSaxpy(bref.handle, n, &a, x, incx, y, incy), "cublasSaxpy");
+        else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+          cublas_check(cublasZaxpy(bref.handle, n, reinterpret_cast<const cuDoubleComplex*>(&a),
+                                   reinterpret_cast<const cuDoubleComplex*>(x), incx,
+                                   reinterpret_cast<cuDoubleComplex*>(y), incy),
+                       "cublasZaxpy");
+        else
+          cublas_check(cublasCaxpy(bref.handle, n, reinterpret_cast<const cuComplex*>(&a),
+                                   reinterpret_cast<const cuComplex*>(x), incx,
+                                   reinterpret_cast<cuComplex*>(y), incy),
+                       "cublasCaxpy");
+      }
+
+      static void raxpy(BRef bref, int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) {
+        if constexpr (!is_complex) {
+          axpy(bref, n, a, x, incx, y, incy);
+        } else {
+          axpy(bref, n, Scalar(a, 0), x, incx, y, incy);
+        }
+      }
+
+      static Real nrm2(BRef bref, int n, const Scalar* x, int incx) {
+        Real result{};
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDnrm2(bref.handle, n, x, incx, &result), "cublasDnrm2");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasSnrm2(bref.handle, n, x, incx, &result), "cublasSnrm2");
+        else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+          cublas_check(cublasDznrm2(bref.handle, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, &result), "cublasDznrm2");
+        else
+          cublas_check(cublasScnrm2(bref.handle, n, reinterpret_cast<const cuComplex*>(x), incx, &result), "cublasScnrm2");
+        return result;
+      }
+
+      static Scalar dot(BRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+        // Only meaningful for real Scalar; the sym/herm path uses dotc/rdotc.
+        Scalar result{};
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDdot(bref.handle, n, x, incx, y, incy, &result), "cublasDdot");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasSdot(bref.handle, n, x, incx, y, incy, &result), "cublasSdot");
+        return result;
+      }
+
+      static Scalar dotc(BRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+        Scalar result{};
+        if constexpr (std::is_same_v<Scalar, double>) {
+          cublas_check(cublasDdot(bref.handle, n, x, incx, y, incy, &result), "cublasDdot");
+        } else if constexpr (std::is_same_v<Scalar, float>) {
+          cublas_check(cublasSdot(bref.handle, n, x, incx, y, incy, &result), "cublasSdot");
+        } else if constexpr (std::is_same_v<Scalar, std::complex<double>>) {
+          cublas_check(cublasZdotc(bref.handle, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
+                                   reinterpret_cast<const cuDoubleComplex*>(y), incy, reinterpret_cast<cuDoubleComplex*>(&result)),
+                       "cublasZdotc");
+        } else {
+          cublas_check(cublasCdotc(bref.handle, n, reinterpret_cast<const cuComplex*>(x), incx,
+                                   reinterpret_cast<const cuComplex*>(y), incy, reinterpret_cast<cuComplex*>(&result)),
+                       "cublasCdotc");
+        }
+        return result;
+      }
+
+      static Real rdotc(BRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+        return std::real(dotc(bref, n, x, incx, y, incy));
+      }
+
+      static const char* herm_trans() {
+        if constexpr (is_complex) return "C";
+        else                      return "T";
+      }
+
+      // ---- BLAS-2 / BLAS-3 --------------------------------------------------
+      static void gemv(BRef bref, const char* trans, int m, int n, Scalar alpha, const Scalar* a, int lda, const Scalar* x,
+                       int incx, Scalar beta, Scalar* y, int incy) {
+        auto op = cublas_op(trans);
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDgemv(bref.handle, op, m, n, &alpha, a, lda, x, incx, &beta, y, incy), "cublasDgemv");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasSgemv(bref.handle, op, m, n, &alpha, a, lda, x, incx, &beta, y, incy), "cublasSgemv");
+        else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+          cublas_check(cublasZgemv(bref.handle, op, m, n, reinterpret_cast<const cuDoubleComplex*>(&alpha),
+                                   reinterpret_cast<const cuDoubleComplex*>(a), lda,
+                                   reinterpret_cast<const cuDoubleComplex*>(x), incx,
+                                   reinterpret_cast<const cuDoubleComplex*>(&beta),
+                                   reinterpret_cast<cuDoubleComplex*>(y), incy),
+                       "cublasZgemv");
+        else
+          cublas_check(cublasCgemv(bref.handle, op, m, n, reinterpret_cast<const cuComplex*>(&alpha),
+                                   reinterpret_cast<const cuComplex*>(a), lda, reinterpret_cast<const cuComplex*>(x), incx,
+                                   reinterpret_cast<const cuComplex*>(&beta), reinterpret_cast<cuComplex*>(y), incy),
+                       "cublasCgemv");
+      }
+
+      static void gemm(BRef bref, const char* transa, const char* transb, int m, int n, int k, Scalar alpha, const Scalar* a,
+                       int lda, const Scalar* b, int ldb, Scalar beta, Scalar* c, int ldc) {
+        auto opa = cublas_op(transa);
+        auto opb = cublas_op(transb);
+        if constexpr (std::is_same_v<Scalar, double>)
+          cublas_check(cublasDgemm(bref.handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc), "cublasDgemm");
+        else if constexpr (std::is_same_v<Scalar, float>)
+          cublas_check(cublasSgemm(bref.handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc), "cublasSgemm");
+        else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+          cublas_check(cublasZgemm(bref.handle, opa, opb, m, n, k, reinterpret_cast<const cuDoubleComplex*>(&alpha),
+                                   reinterpret_cast<const cuDoubleComplex*>(a), lda,
+                                   reinterpret_cast<const cuDoubleComplex*>(b), ldb,
+                                   reinterpret_cast<const cuDoubleComplex*>(&beta),
+                                   reinterpret_cast<cuDoubleComplex*>(c), ldc),
+                       "cublasZgemm");
+        else
+          cublas_check(cublasCgemm(bref.handle, opa, opb, m, n, k, reinterpret_cast<const cuComplex*>(&alpha),
+                                   reinterpret_cast<const cuComplex*>(a), lda, reinterpret_cast<const cuComplex*>(b), ldb,
+                                   reinterpret_cast<const cuComplex*>(&beta), reinterpret_cast<cuComplex*>(c), ldc),
+                       "cublasCgemm");
+      }
+
+      // gemv with a real x-vector applied to a possibly-complex matrix.
+      // The Real x in callers (sapps, seupd) always lives in host workl/h/q.
+      // Under CudaBackend we gather x into a contiguous Scalar host buffer
+      // (widening on the complex path), upload to a device temp, then call
+      // the regular gemv. y is device-resident (workd / V column).
+      static void gemv_rv(BRef bref, const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, const Real* x,
+                          int incx, Real beta, Scalar* y, int incy) {
+        int                 xlen = (*trans == 'N' || *trans == 'n') ? ncols : m;
+        std::vector<Scalar> host_x(xlen);
+        if constexpr (!is_complex) {
+          for (int i = 0; i < xlen; ++i) host_x[i] = x[i * incx];
+        } else {
+          for (int i = 0; i < xlen; ++i) host_x[i] = Scalar(x[i * incx], 0);
+        }
+        Scalar* dev_x = nullptr;
+        cuda_check(cudaMallocAsync(reinterpret_cast<void**>(&dev_x), static_cast<std::size_t>(xlen) * sizeof(Scalar), bref.stream),
+                   "cudaMallocAsync(gemv_rv temp)");
+        cuda_check(cudaMemcpyAsync(dev_x, host_x.data(), static_cast<std::size_t>(xlen) * sizeof(Scalar), cudaMemcpyHostToDevice,
+                                   bref.stream),
+                   "cudaMemcpyAsync(gemv_rv H2D)");
+        // Sync so host_x outlives the upload (host_x is local).
+        cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(gemv_rv upload)");
+        if constexpr (!is_complex) {
+          gemv(bref, trans, m, ncols, alpha, a, lda, dev_x, 1, beta, y, incy);
+        } else {
+          gemv(bref, trans, m, ncols, Scalar(alpha, 0), a, lda, dev_x, 1, Scalar(beta, 0), y, incy);
+        }
+        cuda_check(cudaFreeAsync(dev_x, bref.stream), "cudaFreeAsync(gemv_rv temp)");
+      }
+
+      // ---- LAPACK aux that operate on length-n device data ------------------
+
+      // larnv: generate random vector on host via the CPU Ops, then upload.
+      // Avoids the bookkeeping of cuRAND for a once-per-restart-attempt call.
+      static void larnv(BRef bref, int idist, int* iseed, int n, Scalar* x) {
+        std::vector<Scalar> host_x(n);
+        Ops<Scalar, CpuBackend>::larnv(idist, iseed, n, host_x.data());
+        cuda_check(cudaMemcpyAsync(x, host_x.data(), static_cast<std::size_t>(n) * sizeof(Scalar), cudaMemcpyHostToDevice,
+                                   bref.stream),
+                   "cudaMemcpyAsync(larnv H2D)");
+        cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(larnv)");
+      }
+
+      // lascl_: scale a device-resident block by cto/cfrom, applying the
+      // overflow-safe multi-step ratio that LAPACK dlascl uses (this matters
+      // because the only caller triggers it when cfrom = rnorm < safmin, so
+      // 1/cfrom would overflow). The mul sequence is pure scalar arithmetic
+      // computed on the host; each step is applied on-device with rscal
+      // (a real scaling — cublasXscal / Zdscal / Csscal), keeping the path
+      // nvcc-free. Only type 'G' (general, full m x n block) is supported,
+      // which is the only type any caller reaching this backend uses; other
+      // types fall back to the host path.
+      static void lascl_(BRef bref, const char* type, const int* kl, const int* ku, const Real* cfrom, const Real* cto,
+                         const int* m, const int* n, Scalar* a, const int* lda, int* info) {
+        *info = 0;
+        if (!(type[0] == 'G' || type[0] == 'g')) {
+          // Unexpected type (none of the algorithmic call sites use these
+          // under a device backend) — preserve correctness via host fallback.
+          const int           nelem = (*lda) * (*n);
+          std::vector<Scalar> host_a(nelem);
+          cuda_check(cudaMemcpyAsync(host_a.data(), a, static_cast<std::size_t>(nelem) * sizeof(Scalar), cudaMemcpyDeviceToHost,
+                                     bref.stream),
+                     "cudaMemcpyAsync(lascl D2H)");
+          cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(lascl D2H)");
+          Ops<Scalar, CpuBackend>::lascl_(type, kl, ku, cfrom, cto, m, n, host_a.data(), lda, info);
+          cuda_check(cudaMemcpyAsync(a, host_a.data(), static_cast<std::size_t>(nelem) * sizeof(Scalar), cudaMemcpyHostToDevice,
+                                     bref.stream),
+                     "cudaMemcpyAsync(lascl H2D)");
+          cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(lascl H2D)");
+          return;
+        }
+
+        const Real smlnum = Ops<Real, CpuBackend>::lamch("safe minimum");
+        const Real bignum = Real(1) / smlnum;
+        Real       cfromc = *cfrom;
+        Real       ctoc   = *cto;
+        const int  rows   = *m;
+        const int  cols   = *n;
+        const int  ld     = *lda;
+
+        bool done = false;
+        while (!done) {
+          Real cfrom1 = cfromc * smlnum;
+          Real mul;
+          if (cfrom1 == cfromc) {
+            // cfromc is an infinity; cto/cfrom is the well-defined ratio.
+            mul  = ctoc / cfromc;
+            done = true;
+          } else {
+            Real cto1 = ctoc / bignum;
+            if (cto1 == ctoc) {
+              // ctoc is zero.
+              mul    = ctoc;
+              done   = true;
+              cfromc = Real(1);
+            } else if (std::abs(cfrom1) > std::abs(ctoc) && ctoc != Real(0)) {
+              mul    = smlnum;
+              done   = false;
+              cfromc = cfrom1;
+            } else if (std::abs(cto1) > std::abs(cfromc)) {
+              mul    = bignum;
+              done   = false;
+              ctoc   = cto1;
+            } else {
+              mul  = ctoc / cfromc;
+              done = true;
+            }
+          }
+          // Apply A *= mul over the general m x n block (column-major, ld).
+          for (int j = 0; j < cols; ++j)
+            rscal(bref, rows, mul, a + static_cast<std::size_t>(j) * ld, 1);
+        }
+      }
+    };
+
+  }  // namespace detail
+
+}  // namespace arnoldi
+
+#endif  // ARNOLDI_USE_CUDA
+
+#endif  // ARNOLDI_CUDA_HPP
diff --git a/include/arnoldi/detail/blas_bindings.hpp b/include/arnoldi/detail/blas_bindings.hpp
index 8abff30..b30866e 100644
--- a/include/arnoldi/detail/blas_bindings.hpp
+++ b/include/arnoldi/detail/blas_bindings.hpp
@@ -66,6 +66,18 @@ void cgemv_(const char* trans, const int* m, const int* n, const std::complex<fl
 void dgbmv_(const char* trans, const int* m, const int* n, const int* kl, const int* ku, const double* alpha, const double* a,
             const int* lda, const double* x, const int* incx, const double* beta, double* y, const int* incy);
 
+// ---- BLAS-3 (gemm, all types) ---------------------------------------------
+void dgemm_(const char* transa, const char* transb, const int* m, const int* n, const int* k, const double* alpha, const double* a,
+            const int* lda, const double* b, const int* ldb, const double* beta, double* c, const int* ldc);
+void sgemm_(const char* transa, const char* transb, const int* m, const int* n, const int* k, const float* alpha, const float* a,
+            const int* lda, const float* b, const int* ldb, const float* beta, float* c, const int* ldc);
+void zgemm_(const char* transa, const char* transb, const int* m, const int* n, const int* k, const std::complex<double>* alpha,
+            const std::complex<double>* a, const int* lda, const std::complex<double>* b, const int* ldb,
+            const std::complex<double>* beta, std::complex<double>* c, const int* ldc);
+void cgemm_(const char* transa, const char* transb, const int* m, const int* n, const int* k, const std::complex<float>* alpha,
+            const std::complex<float>* a, const int* lda, const std::complex<float>* b, const int* ldb,
+            const std::complex<float>* beta, std::complex<float>* c, const int* ldc);
+
 // ---- BLAS-2 (ger / geru) --------------------------------------------------
 void dger_(const int* m, const int* n, const double* alpha, const double* x, const int* incx, const double* y, const int* incy,
            double* a, const int* lda);
@@ -238,6 +250,10 @@ inline void dgemv(const char* trans, int m, int n, double alpha, const double* a
                   double* y, int incy) {
   dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
 }
+inline void dgemm(const char* transa, const char* transb, int m, int n, int k, double alpha, const double* a, int lda,
+                  const double* b, int ldb, double beta, double* c, int ldc) {
+  dgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
 inline void   dlarnv(int idist, int* iseed, int n, double* x) { dlarnv_(&idist, iseed, &n, x); }
 inline double dlamch(const char* cmach) { return dlamch_(cmach); }
 inline double dlapy2(double x, double y) { return dlapy2_(&x, &y); }
@@ -264,6 +280,10 @@ inline void   sgemv(const char* trans, int m, int n, float alpha, const float* a
                     float* y, int incy) {
     sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
 }
+inline void   sgemm(const char* transa, const char* transb, int m, int n, int k, float alpha, const float* a, int lda,
+                    const float* b, int ldb, float beta, float* c, int ldc) {
+    sgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
 inline void  slarnv(int idist, int* iseed, int n, float* x) { slarnv_(&idist, iseed, &n, x); }
 inline float slamch(const char* cmach) { return slamch_(cmach); }
 inline float slapy2(float x, float y) { return slapy2_(&x, &y); }
@@ -289,6 +309,11 @@ inline void   zgemv(const char* trans, int m, int n, std::complex<double> alpha,
                     const std::complex<double>* x, int incx, std::complex<double> beta, std::complex<double>* y, int incy) {
     zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
 }
+inline void   zgemm(const char* transa, const char* transb, int m, int n, int k, std::complex<double> alpha,
+                    const std::complex<double>* a, int lda, const std::complex<double>* b, int ldb, std::complex<double> beta,
+                    std::complex<double>* c, int ldc) {
+    zgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
 inline void zlarnv(int idist, int* iseed, int n, std::complex<double>* x) { zlarnv_(&idist, iseed, &n, x); }
 inline void zlacpy(const char* uplo, int m, int n, const std::complex<double>* a, int lda, std::complex<double>* b, int ldb) {
   zlacpy_(uplo, &m, &n, a, &lda, b, &ldb);
@@ -340,6 +365,11 @@ inline void  cgemv(const char* trans, int m, int n, std::complex<float> alpha, c
                    const std::complex<float>* x, int incx, std::complex<float> beta, std::complex<float>* y, int incy) {
    cgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
 }
+inline void  cgemm(const char* transa, const char* transb, int m, int n, int k, std::complex<float> alpha,
+                   const std::complex<float>* a, int lda, const std::complex<float>* b, int ldb, std::complex<float> beta,
+                   std::complex<float>* c, int ldc) {
+   cgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
 inline void clarnv(int idist, int* iseed, int n, std::complex<float>* x) { clarnv_(&idist, iseed, &n, x); }
 inline void clacpy(const char* uplo, int m, int n, const std::complex<float>* a, int lda, std::complex<float>* b, int ldb) {
   clacpy_(uplo, &m, &n, a, &lda, b, &ldb);
diff --git a/include/arnoldi/detail/getv0.hpp b/include/arnoldi/detail/getv0.hpp
index aaa712d..d2405a2 100644
--- a/include/arnoldi/detail/getv0.hpp
+++ b/include/arnoldi/detail/getv0.hpp
@@ -10,9 +10,9 @@ namespace arnoldi::detail {
   // getv0 — generate a starting vector (shared by sym / nonsym).
   // Replaces the reverse-communication version: callbacks op/bop are
   // invoked directly instead of returning ido.
-  template <typename Scalar, typename OP, typename BOP, typename Comm>
-  void getv0(const char* bmat, int itry, bool initv, int n, int j, Scalar* v, int ldv, Scalar* resid,
-             detail::real_t<Scalar>& rnorm, Scalar* workd, int& ierr, OP&& op, BOP&& bop, const Comm& comm) {
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void getv0(detail::BackendRef<Backend> bref, const char* bmat, int itry, bool initv, int n, int j, Scalar* v, int ldv,
+             Scalar* resid, detail::real_t<Scalar>& rnorm, Scalar* workd, int& ierr, OP&& op, BOP&& bop, const Comm& comm) {
     using Real          = detail::real_t<Scalar>;
     static int iseed[4] = {1, 3, 5, 7};
     int        msglvl   = detail::debug.getv0;
@@ -22,25 +22,25 @@ namespace arnoldi::detail {
     ierr = 0;
 
     if (!initv) {
-      detail::Ops<Scalar>::larnv(2, iseed, n, resid);
+      detail::Ops<Scalar, Backend>::larnv(bref, 2, iseed, n, resid);
     }
 
     if (itry == 1) {
       detail::stats.nopx++;
       detail::arscnd(t2);
-      detail::Ops<Scalar>::copy(n, resid, 1, workd, 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, workd, 1);
       op(workd, &workd[n]);
       if (*bmat == 'G') {
         detail::arscnd(t3);
         detail::stats.mvopx += (t3 - t2);
       }
-      detail::Ops<Scalar>::copy(n, &workd[n], 1, resid, 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, &workd[n], 1, resid, 1);
     } else if (itry > 1 && *bmat == 'G') {
-      detail::Ops<Scalar>::copy(n, resid, 1, &workd[n], 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[n], 1);
     }
 
     detail::arscnd(t2);
-    Real rnorm0 = detail::bnorm<Scalar>(*bmat, n, resid, workd, &workd[n], bop, comm);
+    Real rnorm0 = detail::bnorm<Scalar, Backend>(bref, *bmat, n, resid, workd, &workd[n], bop, comm);
     if (*bmat == 'G') {
       detail::arscnd(t3);
       detail::stats.mvbx += (t3 - t2);
@@ -49,13 +49,13 @@ namespace arnoldi::detail {
 
     if (j != 1) {
       for (int iter = 0;; ++iter) {
-        detail::Ops<Scalar>::gemv(detail::Ops<Scalar>::herm_trans(), n, j - 1, Scalar(1), v, ldv, workd, 1, Scalar(0), &workd[n],
-                                  1);
-        comm.allreduce_sum(&workd[n], j - 1);
-        detail::Ops<Scalar>::gemv("N", n, j - 1, Scalar(-1), v, ldv, &workd[n], 1, Scalar(1), resid, 1);
+        detail::Ops<Scalar, Backend>::gemv(bref, detail::Ops<Scalar, Backend>::herm_trans(), n, j - 1, Scalar(1), v, ldv, workd, 1,
+                                           Scalar(0), &workd[n], 1);
+        detail::backend_allreduce(bref, comm, &workd[n], j - 1);
+        detail::Ops<Scalar, Backend>::gemv(bref, "N", n, j - 1, Scalar(-1), v, ldv, &workd[n], 1, Scalar(1), resid, 1);
 
         detail::arscnd(t2);
-        rnorm = detail::bnorm<Scalar>(*bmat, n, resid, workd, &workd[n], bop, comm);
+        rnorm = detail::bnorm<Scalar, Backend>(bref, *bmat, n, resid, workd, &workd[n], bop, comm);
         if (*bmat == 'G') {
           detail::arscnd(t3);
           detail::stats.mvbx += (t3 - t2);
@@ -65,7 +65,7 @@ namespace arnoldi::detail {
 
         rnorm0 = rnorm;
         if (iter >= 5) {
-          for (int jj = 0; jj < n; jj++) resid[jj] = Scalar(0);
+          detail::Ops<Scalar, Backend>::zero(bref, n, resid);
           rnorm = Real(0);
           ierr  = -1;
           break;
@@ -84,6 +84,15 @@ namespace arnoldi::detail {
     detail::stats.getv0 += (t1 - t0);
   }
 
+  // getv0 without explicit BackendRef — defaults to CpuBackend, for the
+  // historical public callback API (tests, examples).
+  template <typename Scalar, typename OP, typename BOP, typename Comm>
+  void getv0(const char* bmat, int itry, bool initv, int n, int j, Scalar* v, int ldv, Scalar* resid,
+             detail::real_t<Scalar>& rnorm, Scalar* workd, int& ierr, OP&& op, BOP&& bop, const Comm& comm) {
+    getv0<Scalar, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, itry, initv, n, j, v, ldv, resid, rnorm,
+                                      workd, ierr, std::forward<OP>(op), std::forward<BOP>(bop), comm);
+  }
+
 }  // namespace arnoldi::detail
 
 #endif  // ARNOLDI_DETAIL_GETV0_HPP
diff --git a/include/arnoldi/detail/nonsym.hpp b/include/arnoldi/detail/nonsym.hpp
index a47587b..13f9015 100644
--- a/include/arnoldi/detail/nonsym.hpp
+++ b/include/arnoldi/detail/nonsym.hpp
@@ -19,9 +19,9 @@
 namespace arnoldi::detail {
 
   // naitr — Arnoldi iteration (nonsymmetric).
-  template <typename Real, typename OP, typename BOP, typename Comm>
-  void naitr(const char* bmat, int n, int k, int np, int nb, Real* resid, Real& rnorm, Real* v, int ldv, Real* h, int ldh,
-             Real* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) {
+  template <typename Real, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void naitr(detail::BackendRef<Backend> bref, const char* bmat, int n, int k, int np, int nb, Real* resid, Real& rnorm, Real* v,
+             int ldv, Real* h, int ldh, Real* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) {
     const int ipj = 0, irj = n, ivj = 2 * n;
 
     Real      unfl = detail::Ops<Real>::lamch("safe minimum");
@@ -56,7 +56,7 @@ namespace arnoldi::detail {
 
         int ierr = -1;
         for (int itry = 1; itry <= 3; ++itry) {
-          getv0<Real>(bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm);
+          getv0<Real, Backend>(bref, bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm);
           if (ierr >= 0) break;
         }
         if (ierr < 0) {
@@ -67,27 +67,27 @@ namespace arnoldi::detail {
         }
       }
 
-      detail::Ops<Real>::copy(n, resid, 1, &v[(j - 1) * ldv], 1);
+      detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &v[(j - 1) * ldv], 1);
       if (rnorm >= unfl) {
         Real temp1 = (Real)1 / rnorm;
-        detail::Ops<Real>::scal(n, temp1, &v[(j - 1) * ldv], 1);
-        detail::Ops<Real>::scal(n, temp1, &workd[ipj], 1);
+        detail::Ops<Real, Backend>::scal(bref, n, temp1, &v[(j - 1) * ldv], 1);
+        detail::Ops<Real, Backend>::scal(bref, n, temp1, &workd[ipj], 1);
       } else {
         int  ione = 1;
         int  infol;
         Real done = (Real)1;
-        detail::Ops<Real>::lascl_("General", &ione, &ione, &rnorm, &done, &n, &ione, &v[(j - 1) * ldv], &n, &infol);
-        detail::Ops<Real>::lascl_("General", &ione, &ione, &rnorm, &done, &n, &ione, &workd[ipj], &n, &infol);
+        detail::Ops<Real, Backend>::lascl_(bref, "General", &ione, &ione, &rnorm, &done, &n, &ione, &v[(j - 1) * ldv], &n, &infol);
+        detail::Ops<Real, Backend>::lascl_(bref, "General", &ione, &ione, &rnorm, &done, &n, &ione, &workd[ipj], &n, &infol);
       }
 
       detail::stats.nopx++;
       detail::arscnd(t2);
-      detail::Ops<Real>::copy(n, &v[(j - 1) * ldv], 1, &workd[ivj], 1);
+      detail::Ops<Real, Backend>::copy(bref, n, &v[(j - 1) * ldv], 1, &workd[ivj], 1);
       op(&workd[ivj], &workd[irj]);
       detail::arscnd(t3);
       detail::stats.mvopx += (t3 - t2);
 
-      detail::Ops<Real>::copy(n, &workd[irj], 1, resid, 1);
+      detail::Ops<Real, Backend>::copy(bref, n, &workd[irj], 1, resid, 1);
 
       // B-multiply w = OP*v_j (or copy through if bmat='I').
       detail::arscnd(t2);
@@ -95,7 +95,7 @@ namespace arnoldi::detail {
         detail::stats.nbx++;
         bop(&workd[irj], &workd[ipj]);
       } else {
-        detail::Ops<Real>::copy(n, resid, 1, &workd[ipj], 1);
+        detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &workd[ipj], 1);
       }
       if (bmat[0] == 'G') {
         detail::arscnd(t3);
@@ -104,16 +104,16 @@ namespace arnoldi::detail {
 
       Real wnorm;
       if (bmat[0] == 'G') {
-        wnorm = detail::pdot<Real>(comm, n, resid, 1, &workd[ipj], 1);
+        wnorm = detail::pdot<Real, Backend>(bref, comm, n, resid, 1, &workd[ipj], 1);
         wnorm = std::sqrt(std::abs(wnorm));
       } else {
-        wnorm = detail::pnrm2_real<Real>(comm, n, resid, 1);
+        wnorm = detail::pnrm2_real<Real, Backend>(bref, comm, n, resid, 1);
       }
 
       // h(:,j) = V' * B*w ;  resid -= V * h(:,j)
-      detail::Ops<Real>::gemv("T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &h[(j - 1) * ldh], 1);
-      comm.allreduce_sum(&h[(j - 1) * ldh], j);
-      detail::Ops<Real>::gemv("N", n, j, (Real)-1, v, ldv, &h[(j - 1) * ldh], 1, (Real)1, resid, 1);
+      detail::Ops<Real, Backend>::gemv(bref, "T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &h[(j - 1) * ldh], 1);
+      detail::backend_allreduce(bref, comm, &h[(j - 1) * ldh], j);
+      detail::Ops<Real, Backend>::gemv(bref, "N", n, j, (Real)-1, v, ldv, &h[(j - 1) * ldh], 1, (Real)1, resid, 1);
 
       if (j > 1) h[(j - 2) * ldh + (j - 1)] = betaj;
 
@@ -122,10 +122,10 @@ namespace arnoldi::detail {
       detail::arscnd(t2);
       if (bmat[0] == 'G') {
         detail::stats.nbx++;
-        detail::Ops<Real>::copy(n, resid, 1, &workd[irj], 1);
+        detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &workd[irj], 1);
         bop(&workd[irj], &workd[ipj]);
       } else {
-        detail::Ops<Real>::copy(n, resid, 1, &workd[ipj], 1);
+        detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &workd[ipj], 1);
       }
       if (bmat[0] == 'G') {
         detail::arscnd(t3);
@@ -133,10 +133,10 @@ namespace arnoldi::detail {
       }
 
       if (bmat[0] == 'G') {
-        rnorm = detail::pdot<Real>(comm, n, resid, 1, &workd[ipj], 1);
+        rnorm = detail::pdot<Real, Backend>(bref, comm, n, resid, 1, &workd[ipj], 1);
         rnorm = std::sqrt(std::abs(rnorm));
       } else {
-        rnorm = detail::pnrm2_real<Real>(comm, n, resid, 1);
+        rnorm = detail::pnrm2_real<Real, Backend>(bref, comm, n, resid, 1);
       }
 
       if (rnorm <= 0.717 * wnorm) {
@@ -148,19 +148,19 @@ namespace arnoldi::detail {
             detail::debug.vout(j, &h[(j - 1) * ldh], "_naitr: j-th column of H");
           }
 
-          detail::Ops<Real>::gemv("T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &workd[irj], 1);
-          comm.allreduce_sum(&workd[irj], j);
-          detail::Ops<Real>::gemv("N", n, j, (Real)-1, v, ldv, &workd[irj], 1, (Real)1, resid, 1);
-          detail::Ops<Real>::axpy(j, (Real)1, &workd[irj], 1, &h[(j - 1) * ldh], 1);
+          detail::Ops<Real, Backend>::gemv(bref, "T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &workd[irj], 1);
+          detail::backend_allreduce(bref, comm, &workd[irj], j);
+          detail::Ops<Real, Backend>::gemv(bref, "N", n, j, (Real)-1, v, ldv, &workd[irj], 1, (Real)1, resid, 1);
+          detail::Ops<Real, Backend>::axpy(bref, j, (Real)1, &workd[irj], 1, &h[(j - 1) * ldh], 1);
 
           detail::arscnd(t2);
           Real rnorm1;
           if (bmat[0] == 'G') {
             detail::stats.nbx++;
-            detail::Ops<Real>::copy(n, resid, 1, &workd[irj], 1);
+            detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &workd[irj], 1);
             bop(&workd[irj], &workd[ipj]);
           } else {
-            detail::Ops<Real>::copy(n, resid, 1, &workd[ipj], 1);
+            detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &workd[ipj], 1);
           }
           if (bmat[0] == 'G') {
             detail::arscnd(t3);
@@ -168,10 +168,10 @@ namespace arnoldi::detail {
           }
 
           if (bmat[0] == 'G') {
-            rnorm1 = detail::pdot<Real>(comm, n, resid, 1, &workd[ipj], 1);
+            rnorm1 = detail::pdot<Real, Backend>(bref, comm, n, resid, 1, &workd[ipj], 1);
             rnorm1 = std::sqrt(std::abs(rnorm1));
           } else {
-            rnorm1 = detail::pnrm2_real<Real>(comm, n, resid, 1);
+            rnorm1 = detail::pnrm2_real<Real, Backend>(bref, comm, n, resid, 1);
           }
 
           if (msglvl > 0 && iter > 0) {
@@ -190,7 +190,7 @@ namespace arnoldi::detail {
           detail::stats.nitref++;
           rnorm = rnorm1;
           if (iter == 1) {
-            for (int jj = 0; jj < n; jj++) resid[jj] = (Real)0;
+            detail::Ops<Real, Backend>::zero(bref, n, resid);
             rnorm = (Real)0;
           }
         }
@@ -221,9 +221,9 @@ namespace arnoldi::detail {
 
   // napps — apply implicit shifts to nonsymmetric Arnoldi factorisation.
   // Ported from ARPACK dnapps.f (goto-free version).
-  template <typename Real>
-  void napps(int n, int& kev, int np, Real* shiftr, Real* shifti, Real* v, int ldv, Real* h, int ldh, Real* resid, Real* q,
-             int ldq, Real* workl, Real* workd) {
+  template <typename Real, typename Backend = detail::CpuBackend>
+  void napps(detail::BackendRef<Backend> bref, int n, int& kev, int np, Real* shiftr, Real* shifti, Real* v, int ldv, Real* h,
+             int ldh, Real* resid, Real* q, int ldq, Real* workl, Real* workd) {
     Real unfl = detail::Ops<Real>::lamch("safe minimum");
     Real ovfl = Real(1) / unfl;
     detail::Ops<Real>::labad(unfl, ovfl);
@@ -394,19 +394,20 @@ namespace arnoldi::detail {
     }
 
     if (h[(kev - 1) * ldh + kev] > Real(0))
-      detail::Ops<Real>::gemv("N", n, kplusp, Real(1), v, ldv, &q[kev * ldq], 1, Real(0), &workd[n], 1);
+      detail::Ops<Real, Backend>::gemv(bref, "N", n, kplusp, Real(1), v, ldv, &q[kev * ldq], 1, Real(0), &workd[n], 1);
 
     for (int i = 1; i <= kev; i++) {
-      detail::Ops<Real>::gemv("N", n, kplusp - i + 1, Real(1), v, ldv, &q[(kev - i) * ldq], 1, Real(0), workd, 1);
-      detail::Ops<Real>::copy(n, workd, 1, &v[(kplusp - i) * ldv], 1);
+      detail::Ops<Real, Backend>::gemv(bref, "N", n, kplusp - i + 1, Real(1), v, ldv, &q[(kev - i) * ldq], 1, Real(0), workd, 1);
+      detail::Ops<Real, Backend>::copy(bref, n, workd, 1, &v[(kplusp - i) * ldv], 1);
     }
 
-    detail::Ops<Real>::lacpy("A", n, kev, &v[(kplusp - kev) * ldv], ldv, v, ldv);
+    detail::Ops<Real, Backend>::lacpy("A", n, kev, &v[(kplusp - kev) * ldv], ldv, v, ldv);
 
-    if (h[(kev - 1) * ldh + kev] > Real(0)) detail::Ops<Real>::copy(n, &workd[n], 1, &v[kev * ldv], 1);
+    if (h[(kev - 1) * ldh + kev] > Real(0)) detail::Ops<Real, Backend>::copy(bref, n, &workd[n], 1, &v[kev * ldv], 1);
 
-    detail::Ops<Real>::scal(n, q[(kev - 1) * ldq + (kplusp - 1)], resid, 1);
-    if (h[(kev - 1) * ldh + kev] > Real(0)) detail::Ops<Real>::axpy(n, h[(kev - 1) * ldh + kev], &v[kev * ldv], 1, resid, 1);
+    detail::Ops<Real, Backend>::scal(bref, n, q[(kev - 1) * ldq + (kplusp - 1)], resid, 1);
+    if (h[(kev - 1) * ldh + kev] > Real(0))
+      detail::Ops<Real, Backend>::axpy(bref, n, h[(kev - 1) * ldh + kev], &v[kev * ldv], 1, resid, 1);
 
     if (msglvl > 1) {
       detail::debug.vout(1, &q[(kev - 1) * ldq + (kplusp - 1)], "_napps: sigmak = (e_{kev+p}^T*Q)*e_{kev}");
@@ -420,10 +421,11 @@ namespace arnoldi::detail {
   }
 
   // naup2 — main Arnoldi iteration driver (nonsymmetric).
-  template <typename Real, typename OP, typename BOP, typename Comm>
-  void naup2(const char* bmat, int n, const char* which, int& nev, int& np, Real tol, Real* resid, int mode, int iupd, int ishift,
-             int& mxiter, Real* v, int ldv, Real* h, int ldh, Real* ritzr, Real* ritzi, Real* bounds, Real* q, int ldq,
-             Real* workl, Real* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) {
+  template <typename Real, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void naup2(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int& nev, int& np, Real tol,
+             Real* resid, int mode, int iupd, int ishift, int& mxiter, Real* v, int ldv, Real* h, int ldh, Real* ritzr,
+             Real* ritzi, Real* bounds, Real* q, int ldq, Real* workl, Real* workd, int& info, OP&& op, BOP&& bop,
+             const Comm& comm) {
     double t0, t1, t2, t3;
     detail::arscnd(t0);
     int  msglvl = detail::debug.aup2;
@@ -448,14 +450,14 @@ namespace arnoldi::detail {
       detail::stats.aup2 = t1 - t0;
     };
 
-    getv0<Real>(bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm);
+    getv0<Real, Backend>(bref, bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm);
     if (rnorm == (Real)0) {
       info = -9;
       end_naup2();
       return;
     }
 
-    naitr<Real>(bmat, n, 0, nev, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
+    naitr<Real, Backend>(bref, bmat, n, 0, nev, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
     if (info > 0) {
       np     = info;
       mxiter = iter;
@@ -477,7 +479,7 @@ namespace arnoldi::detail {
         detail::debug.ivout(1, &np, "_naup2: Extend the Arnoldi factorization by");
       }
 
-      naitr<Real>(bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
+      naitr<Real, Backend>(bref, bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
       if (info > 0) {
         np     = info;
         mxiter = iter;
@@ -604,19 +606,19 @@ namespace arnoldi::detail {
         }
       }
 
-      napps<Real>(n, nev, np, ritzr, ritzi, v, ldv, h, ldh, resid, q, ldq, workl, workd);
+      napps<Real, Backend>(bref, n, nev, np, ritzr, ritzi, v, ldv, h, ldh, resid, q, ldq, workl, workd);
 
       // B-norm of residual after compression.
       detail::arscnd(t2);
       if (bmat[0] == 'G') {
         detail::stats.nbx++;
-        detail::Ops<Real>::copy(n, resid, 1, &workd[n], 1);
+        detail::Ops<Real, Backend>::copy(bref, n, resid, 1, &workd[n], 1);
         bop(&workd[n], workd);
-        rnorm = detail::pdot<Real>(comm, n, resid, 1, workd, 1);
+        rnorm = detail::pdot<Real, Backend>(bref, comm, n, resid, 1, workd, 1);
         rnorm = std::sqrt(std::abs(rnorm));
       } else {
-        detail::Ops<Real>::copy(n, resid, 1, workd, 1);
-        rnorm = detail::pnrm2_real<Real>(comm, n, resid, 1);
+        detail::Ops<Real, Backend>::copy(bref, n, resid, 1, workd, 1);
+        rnorm = detail::pnrm2_real<Real, Backend>(bref, comm, n, resid, 1);
       }
       if (bmat[0] == 'G') {
         detail::arscnd(t3);
@@ -635,9 +637,10 @@ namespace arnoldi::detail {
   }
 
   // naupd — nonsymmetric eigensolver entry point (callback version).
-  template <typename Real, typename OP, typename BOP, typename Comm>
-  void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam,
-             int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, BOP&& bop, const Comm& comm) {
+  template <typename Real, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void naupd(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid,
+             int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op,
+             BOP&& bop, const Comm& comm) {
     detail::stats.reset();
     double t0, t1;
     detail::arscnd(t0);
@@ -702,8 +705,8 @@ namespace arnoldi::detail {
     ipntr[7]   = bounds;
     ipntr[13]  = iw;
 
-    naup2<Real>(bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh, &workl[ritzr],
-                &workl[ritzi], &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm);
+    naup2<Real, Backend>(bref, bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh,
+                         &workl[ritzr], &workl[ritzi], &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm);
 
     iparam[2]  = mxiter;
     iparam[4]  = np;
@@ -751,20 +754,43 @@ namespace arnoldi::detail {
   }
 
   // op + bop, no comm (defaults to SerialComm).
+  template <typename Real, typename Backend = detail::CpuBackend, typename OP, typename BOP>
+  void naupd(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid,
+             int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op,
+             BOP&& bop) {
+    naupd<Real, Backend>(bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info,
+                         std::forward<OP>(op), std::forward<BOP>(bop), SerialComm{});
+  }
+
+  // Standard problem (bmat='I'), no bop, no comm.
+  template <typename Real, typename Backend = detail::CpuBackend, typename OP>
+  void naupd(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid,
+             int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op) {
+    naupd<Real, Backend>(
+        bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op),
+        [](const Real*, Real*) {}, SerialComm{});
+  }
+
+  // ---------------------------------------------------------------------------
+  // naupd no-BackendRef overloads — defaults to CpuBackend for the historical
+  // public callback API (tests, examples).
+  template <typename Real, typename OP, typename BOP, typename Comm>
+  void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam,
+             int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, BOP&& bop, const Comm& comm) {
+    naupd<Real, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam,
+                                    ipntr, workd, workl, lworkl, info, std::forward<OP>(op), std::forward<BOP>(bop), comm);
+  }
   template <typename Real, typename OP, typename BOP>
   void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam,
              int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, BOP&& bop) {
-    naupd<Real>(bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op),
-                std::forward<BOP>(bop), SerialComm{});
+    naupd<Real, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam,
+                                    ipntr, workd, workl, lworkl, info, std::forward<OP>(op), std::forward<BOP>(bop));
   }
-
-  // Standard problem (bmat='I'), no bop, no comm.
   template <typename Real, typename OP>
   void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam,
              int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op) {
-    naupd<Real>(
-        bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op),
-        [](const Real*, Real*) {}, SerialComm{});
+    naupd<Real, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam,
+                                    ipntr, workd, workl, lworkl, info, std::forward<OP>(op));
   }
 
   // neupd — nonsymmetric eigenvector extraction (callback version).
@@ -772,10 +798,10 @@ namespace arnoldi::detail {
   // Reads the Hessenberg factorization and Ritz data left by cb::naupd in
   // workl/ipntr/iparam (cb::naupd stores 0-based offsets in ipntr[4..7,13])
   // and computes eigenvalues (dr,di) and optionally eigenvectors z.
-  template <typename Real>
-  void neupd(bool rvec, const char* howmny, Real* dr, Real* di, Real* z, int ldz, Real sigmar, Real sigmai, Real* workev,
-             const char* bmat, int n, const char* which, int nev, Real tol, Real* resid, int ncv, Real* v, int ldv, int* iparam,
-             int* ipntr, Real* workd, Real* workl, int lworkl, int& info) {
+  template <typename Real, typename Backend = detail::CpuBackend>
+  void neupd(detail::BackendRef<Backend> bref, bool rvec, const char* howmny, Real* dr, Real* di, Real* z, int ldz, Real sigmar,
+             Real sigmai, Real* workev, const char* bmat, int n, const char* which, int nev, Real tol, Real* resid, int ncv,
+             Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info) {
     int msglvl = detail::debug.eupd;
     int mode   = iparam[6];
     int nconv  = iparam[4];
@@ -957,9 +983,9 @@ namespace arnoldi::detail {
 
       detail::Ops<Real>::geqr2_(&ncv, &nconv, &workl[invsub], &ldq, workev, &workev[ncv], &ierr);
 
-      detail::Ops<Real>::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, v, &ldv, &workd[n],
-                                &ierr);
-      detail::Ops<Real>::lacpy("All", n, nconv, v, ldv, z, ldz);
+      detail::Ops<Real, Backend>::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, v, &ldv,
+                                         &workd[n], &ierr);
+      detail::Ops<Real, Backend>::lacpy("All", n, nconv, v, ldv, z, ldz);
 
       for (int j = 0; j < nconv; j++) {
         if (workl[invsub + j * ldq + j] < Real(0)) {
@@ -1026,13 +1052,13 @@ namespace arnoldi::detail {
 
         detail::Ops<Real>::geqr2_(&ncv, &nconv, &workl[invsub], &ldq, workev, &workev[ncv], &ierr);
 
-        detail::Ops<Real>::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, z, &ldz, &workd[n],
-                                  &ierr);
+        detail::Ops<Real, Backend>::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, z, &ldz,
+                                           &workd[n], &ierr);
 
         {
           Real done = Real(1);
-          detail::Ops<Real>::trmm_("Right", "Upper", "No transpose", "Non-unit", &n, &nconv, &done, &workl[invsub], &ldq, z,
-                                   &ldz);
+          detail::Ops<Real, Backend>::trmm_("Right", "Upper", "No transpose", "Non-unit", &n, &nconv, &done, &workl[invsub], &ldq,
+                                            z, &ldz);
         }
       }
 
@@ -1100,10 +1126,20 @@ namespace arnoldi::detail {
           iconj = 0;
         }
       }
-      detail::Ops<Real>::ger(n, nconv, Real(1), resid, 1, workev, 1, z, ldz);
+      detail::Ops<Real, Backend>::ger(n, nconv, Real(1), resid, 1, workev, 1, z, ldz);
     }
   }
 
+  // neupd no-BackendRef overload — defaults to CpuBackend for the historical
+  // public callback API (tests, examples).
+  template <typename Real>
+  void neupd(bool rvec, const char* howmny, Real* dr, Real* di, Real* z, int ldz, Real sigmar, Real sigmai, Real* workev,
+             const char* bmat, int n, const char* which, int nev, Real tol, Real* resid, int ncv, Real* v, int ldv, int* iparam,
+             int* ipntr, Real* workd, Real* workl, int lworkl, int& info) {
+    neupd<Real, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, rvec, howmny, dr, di, z, ldz, sigmar, sigmai, workev,
+                                    bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
+  }
+
 }  // namespace arnoldi::detail
 
 #endif  // ARNOLDI_DETAIL_NONSYM_HPP
diff --git a/include/arnoldi/detail/ops.hpp b/include/arnoldi/detail/ops.hpp
index af3fbd1..957d9f1 100644
--- a/include/arnoldi/detail/ops.hpp
+++ b/include/arnoldi/detail/ops.hpp
@@ -9,11 +9,13 @@
 // compile if instantiated for complex — which is correct since the Arnoldi
 // algorithms only call them on real scalars.
 
+#include <algorithm>
 #include <arnoldi/comm.hpp>
 #include <arnoldi/detail/blas_bindings.hpp>
 #include <arnoldi/detail/stats.hpp>
 #include <cmath>
 #include <complex>
+#include <cstddef>
 #include <cstdio>
 #include <type_traits>
 #include <vector>
@@ -31,7 +33,40 @@ namespace arnoldi::detail {
   template <typename T>
   using real_t = typename real_type<T>::type;
 
-  template <typename Scalar>
+  // Non-owning view threaded through the deep algorithmic templates, mirroring
+  // the way `const Comm&` is passed today. For CpuBackend it carries no state;
+  // device backends specialize this to hold the cuBLAS handle, stream, etc.
+  template <typename Backend>
+  struct BackendRef {};
+
+  // Default backend: all length-n BLAS calls hit host-side BLAS via blas_bindings.
+  // GPU backends (e.g. CudaBackend in arnoldi/cuda.hpp) partially specialize
+  // Ops<Scalar, Backend> to route those calls to device libraries instead.
+  struct CpuBackend {
+    BackendRef<CpuBackend> ref() const noexcept { return {}; }
+  };
+
+  // Selects the owning buffer type for Arnoldi's length-n workspace
+  // (resid_, v_, workd_) under each Backend, and provides static helpers
+  // for transferring data between host pointers and the buffer. Primary
+  // template is the CPU path; device backends specialize this (see
+  // cuda.hpp). Member access is dependent on Backend, so the right
+  // specialization is selected at instantiation.
+  template <typename Backend, typename T>
+  struct buffer_traits {
+    using vector_type = std::vector<T>;
+
+    // Host pointer -> host vector. Trivial std::copy.
+    static void copy_from_host(vector_type& dst, const T* src, std::size_t n) {
+      std::copy(src, src + n, dst.data());
+    }
+
+    // copy_from_device / copy_to_host are not provided for CpuBackend:
+    // arnoldi::Arnoldi only invokes them on non-CpuBackend (initial_resid_device
+    // is SFINAE'd out, extract_'s else branch is `if constexpr`-discarded).
+  };
+
+  template <typename Scalar, typename Backend = CpuBackend>
   struct Ops {
     using Real                       = real_t<Scalar>;
     static constexpr bool is_complex = !std::is_same_v<Scalar, Real>;
@@ -55,7 +90,31 @@ namespace arnoldi::detail {
         return slapy2(x, y);
     }
 
-    static void copy(int n, const Scalar* x, int incx, Scalar* y, int incy) {
+    // Fill a length-n buffer with Scalar(0). CPU: trivial loop. Device backends
+    // specialize this to a single asynchronous memset.
+    static void zero([[maybe_unused]] BackendRef<Backend> bref, int n, Scalar* x) {
+      for (int i = 0; i < n; ++i) x[i] = Scalar(0);
+    }
+
+    // Read a single Scalar from a buffer that may be device-resident.
+    // CPU: deref. Device backends specialize to cudaMemcpy + sync.
+    static Scalar read_scalar([[maybe_unused]] BackendRef<Backend> bref, const Scalar* p) { return *p; }
+
+    // Stage a single Scalar from a (possibly device-resident) buffer into a
+    // host slot WITHOUT synchronizing. Callers must Ops::sync() before
+    // reading host_dst. CPU: an immediate copy (sync() is then a no-op), so
+    // numerics and ordering are bit-identical to a direct read_scalar.
+    // Device backends specialize this to an async cudaMemcpy so per-step
+    // device->host stalls collapse into one sync per call site batch.
+    static void read_scalar_async([[maybe_unused]] BackendRef<Backend> bref, Scalar* host_dst, const Scalar* src) {
+      *host_dst = *src;
+    }
+
+    // Drain the backend stream so all prior read_scalar_async stages have
+    // landed in host memory. CPU: no-op.
+    static void sync([[maybe_unused]] BackendRef<Backend> bref) {}
+
+    static void copy([[maybe_unused]] BackendRef<Backend> bref, int n, const Scalar* x, int incx, Scalar* y, int incy) {
       if constexpr (std::is_same_v<Scalar, double>)
         dcopy(n, x, incx, y, incy);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -65,7 +124,7 @@ namespace arnoldi::detail {
       else
         ccopy(n, x, incx, y, incy);
     }
-    static void scal(int n, Scalar a, Scalar* x, int incx) {
+    static void scal([[maybe_unused]] BackendRef<Backend> bref, int n, Scalar a, Scalar* x, int incx) {
       if constexpr (std::is_same_v<Scalar, double>)
         dscal(n, a, x, incx);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -75,7 +134,7 @@ namespace arnoldi::detail {
       else
         cscal(n, a, x, incx);
     }
-    static void axpy(int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) {
+    static void axpy([[maybe_unused]] BackendRef<Backend> bref, int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) {
       if constexpr (std::is_same_v<Scalar, double>)
         daxpy(n, a, x, incx, y, incy);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -85,7 +144,7 @@ namespace arnoldi::detail {
       else
         caxpy(n, a, x, incx, y, incy);
     }
-    static Real nrm2(int n, const Scalar* x, int incx) {
+    static Real nrm2([[maybe_unused]] BackendRef<Backend> bref, int n, const Scalar* x, int incx) {
       if constexpr (std::is_same_v<Scalar, double>)
         return dnrm2(n, x, incx);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -95,7 +154,7 @@ namespace arnoldi::detail {
       else
         return scnrm2(n, x, incx);
     }
-    static void larnv(int idist, int* iseed, int n, Scalar* x) {
+    static void larnv([[maybe_unused]] BackendRef<Backend> bref, int idist, int* iseed, int n, Scalar* x) {
       if constexpr (std::is_same_v<Scalar, double>)
         dlarnv(idist, iseed, n, x);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -107,22 +166,22 @@ namespace arnoldi::detail {
     }
 
     // Scale complex vector by real factor; for real types collapses to scal().
-    static void rscal(int n, Real a, Scalar* x, int incx) {
+    static void rscal(BackendRef<Backend> bref, int n, Real a, Scalar* x, int incx) {
       if constexpr (!is_complex)
-        scal(n, a, x, incx);
+        scal(bref, n, a, x, incx);
       else if constexpr (std::is_same_v<Real, double>)
         zdscal(n, a, x, incx);
       else
         csscal(n, a, x, incx);
     }
-    static void raxpy(int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) {
+    static void raxpy(BackendRef<Backend> bref, int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) {
       if constexpr (!is_complex)
-        axpy(n, a, x, incx, y, incy);
+        axpy(bref, n, a, x, incx, y, incy);
       else
-        axpy(n, Scalar(a, 0), x, incx, y, incy);
+        axpy(bref, n, Scalar(a, 0), x, incx, y, incy);
     }
 
-    static Scalar dot(int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+    static Scalar dot([[maybe_unused]] BackendRef<Backend> bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
       if constexpr (std::is_same_v<Scalar, double>)
         return ddot(n, x, incx, y, incy);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -135,7 +194,7 @@ namespace arnoldi::detail {
         sswap(n, x, incx, y, incy);
     }
 
-    static Scalar dotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+    static Scalar dotc([[maybe_unused]] BackendRef<Backend> bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
       if constexpr (std::is_same_v<Scalar, double>)
         return ddot(n, x, incx, y, incy);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -150,8 +209,8 @@ namespace arnoldi::detail {
         return r;
       }
     }
-    static Real rdotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) {
-      return std::real(dotc(n, x, incx, y, incy));
+    static Real rdotc(BackendRef<Backend> bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+      return std::real(dotc(bref, n, x, incx, y, incy));
     }
 
     static const char* herm_trans() {
@@ -161,8 +220,8 @@ namespace arnoldi::detail {
         return "T";
     }
 
-    static void gemv(const char* trans, int m, int n, Scalar alpha, const Scalar* a, int lda, const Scalar* x, int incx,
-                     Scalar beta, Scalar* y, int incy) {
+    static void gemv([[maybe_unused]] BackendRef<Backend> bref, const char* trans, int m, int n, Scalar alpha, const Scalar* a,
+                     int lda, const Scalar* x, int incx, Scalar beta, Scalar* y, int incy) {
       if constexpr (std::is_same_v<Scalar, double>)
         dgemv(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -173,19 +232,77 @@ namespace arnoldi::detail {
         cgemv(trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
     }
 
+    static void gemm([[maybe_unused]] BackendRef<Backend> bref, const char* transa, const char* transb, int m, int n, int k,
+                     Scalar alpha, const Scalar* a, int lda, const Scalar* b, int ldb, Scalar beta, Scalar* c, int ldc) {
+      if constexpr (std::is_same_v<Scalar, double>)
+        dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      else if constexpr (std::is_same_v<Scalar, float>)
+        sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      else if constexpr (std::is_same_v<Scalar, std::complex<double>>)
+        zgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      else
+        cgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    }
+
     // gemv with a real x-vector applied to a possibly-complex matrix.
-    static void gemv_rv(const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, const Real* x, int incx,
-                        Real beta, Scalar* y, int incy) {
+    static void gemv_rv(BackendRef<Backend> bref, const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda,
+                        const Real* x, int incx, Real beta, Scalar* y, int incy) {
       if constexpr (!is_complex) {
-        gemv(trans, m, ncols, alpha, a, lda, x, incx, beta, y, incy);
+        gemv(bref, trans, m, ncols, alpha, a, lda, x, incx, beta, y, incy);
       } else {
         int                 xlen = (*trans == 'N' || *trans == 'n') ? ncols : m;
         std::vector<Scalar> cx(xlen);
         for (int i = 0; i < xlen; i++) cx[i] = Scalar(x[i * incx], 0);
-        gemv(trans, m, ncols, Scalar(alpha, 0), a, lda, cx.data(), 1, Scalar(beta, 0), y, incy);
+        gemv(bref, trans, m, ncols, Scalar(alpha, 0), a, lda, cx.data(), 1, Scalar(beta, 0), y, incy);
       }
     }
 
+    // -----------------------------------------------------------------------
+    // No-BackendRef overloads — for host-only call sites (e.g. eig.hpp's
+    // seigt / neigh on workl/h/q, and other small host-side BLAS-1/2 calls).
+    // These forward to the BackendRef-taking version with an empty
+    // BackendRef<Backend>{}; under CudaBackend the partial specialization
+    // does NOT provide these, so an accidental device-side host-style call
+    // becomes a compile error.
+    static void   zero(int n, Scalar* x) { zero(BackendRef<Backend>{}, n, x); }
+    static Scalar read_scalar(const Scalar* p) { return read_scalar(BackendRef<Backend>{}, p); }
+    static void copy(int n, const Scalar* x, int incx, Scalar* y, int incy) { copy(BackendRef<Backend>{}, n, x, incx, y, incy); }
+    static void scal(int n, Scalar a, Scalar* x, int incx) { scal(BackendRef<Backend>{}, n, a, x, incx); }
+    static void axpy(int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) {
+      axpy(BackendRef<Backend>{}, n, a, x, incx, y, incy);
+    }
+    static Real nrm2(int n, const Scalar* x, int incx) { return nrm2(BackendRef<Backend>{}, n, x, incx); }
+    static void larnv(int idist, int* iseed, int n, Scalar* x) { larnv(BackendRef<Backend>{}, idist, iseed, n, x); }
+    static void rscal(int n, Real a, Scalar* x, int incx) { rscal(BackendRef<Backend>{}, n, a, x, incx); }
+    static void raxpy(int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) {
+      raxpy(BackendRef<Backend>{}, n, a, x, incx, y, incy);
+    }
+    static Scalar dot(int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+      return dot(BackendRef<Backend>{}, n, x, incx, y, incy);
+    }
+    static Scalar dotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+      return dotc(BackendRef<Backend>{}, n, x, incx, y, incy);
+    }
+    static Real rdotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+      return rdotc(BackendRef<Backend>{}, n, x, incx, y, incy);
+    }
+    static void gemv(const char* trans, int m, int n, Scalar alpha, const Scalar* a, int lda, const Scalar* x, int incx,
+                     Scalar beta, Scalar* y, int incy) {
+      gemv(BackendRef<Backend>{}, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    }
+    static void gemm(const char* transa, const char* transb, int m, int n, int k, Scalar alpha, const Scalar* a, int lda,
+                     const Scalar* b, int ldb, Scalar beta, Scalar* c, int ldc) {
+      gemm(BackendRef<Backend>{}, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    }
+    static void gemv_rv(const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, const Real* x, int incx,
+                        Real beta, Scalar* y, int incy) {
+      gemv_rv(BackendRef<Backend>{}, trans, m, ncols, alpha, a, lda, x, incx, beta, y, incy);
+    }
+    static void lascl_(const char* type, const int* kl, const int* ku, const Real* cfrom, const Real* cto, const int* m,
+                       const int* n, Scalar* a, const int* lda, int* info) {
+      lascl_(BackendRef<Backend>{}, type, kl, ku, cfrom, cto, m, n, a, lda, info);
+    }
+
     static void ger(int m, int n, Scalar alpha, const Scalar* x, int incx, const Scalar* y, int incy, Scalar* a, int lda) {
       if constexpr (std::is_same_v<Scalar, double>)
         dger(m, n, alpha, x, incx, y, incy, a, lda);
@@ -209,8 +326,8 @@ namespace arnoldi::detail {
       else
         clacpy(uplo, m, n, a, lda, b, ldb);
     }
-    static void lascl_(const char* type, const int* kl, const int* ku, const Real* cfrom, const Real* cto, const int* m,
-                       const int* n, Scalar* a, const int* lda, int* info) {
+    static void lascl_([[maybe_unused]] BackendRef<Backend> bref, const char* type, const int* kl, const int* ku, const Real* cfrom,
+                       const Real* cto, const int* m, const int* n, Scalar* a, const int* lda, int* info) {
       if constexpr (std::is_same_v<Scalar, double>)
         ::dlascl_(type, kl, ku, cfrom, cto, m, n, a, lda, info);
       else if constexpr (std::is_same_v<Scalar, float>)
@@ -332,45 +449,91 @@ namespace arnoldi::detail {
 
   // Distributed reduction helpers (PARPACK-style).
   // For SerialComm these compile to bare BLAS calls with zero overhead.
-  template <typename Real, typename Comm>
-  Real pdot(const Comm& c, int n, const Real* x, int incx, const Real* y, int incy) {
-    return c.allreduce_sum(Ops<Real>::dot(n, x, incx, y, incy));
+  // Backend defaults to CpuBackend so existing pdot<Real>(...) call sites keep
+  // compiling; algorithmic code that lives under a Backend template forwards
+  // it explicitly: pdot<Real, Backend>(bref, ...).
+  template <typename Real, typename Backend = CpuBackend, typename Comm>
+  Real pdot(BackendRef<Backend> bref, const Comm& c, int n, const Real* x, int incx, const Real* y, int incy) {
+    return c.allreduce_sum(Ops<Real, Backend>::dot(bref, n, x, incx, y, incy));
   }
-  template <typename Scalar, typename Comm>
-  real_t<Scalar> prdotc(const Comm& c, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
-    return c.allreduce_sum(Ops<Scalar>::rdotc(n, x, incx, y, incy));
+  template <typename Scalar, typename Backend = CpuBackend, typename Comm>
+  real_t<Scalar> prdotc(BackendRef<Backend> bref, const Comm& c, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+    return c.allreduce_sum(Ops<Scalar, Backend>::rdotc(bref, n, x, incx, y, incy));
   }
-  template <typename Real, typename Comm>
-  Real pnrm2_real(const Comm& c, int n, const Real* x, int incx) {
-    Real s = c.allreduce_sum(Ops<Real>::dot(n, x, incx, x, incx));
+  template <typename Real, typename Backend = CpuBackend, typename Comm>
+  Real pnrm2_real(BackendRef<Backend> bref, const Comm& c, int n, const Real* x, int incx) {
+    Real s = c.allreduce_sum(Ops<Real, Backend>::dot(bref, n, x, incx, x, incx));
     return std::sqrt(std::abs(s));
   }
-  template <typename Scalar, typename Comm>
-  real_t<Scalar> pnrm2(const Comm& c, int n, const Scalar* x, int incx) {
-    auto s = c.allreduce_sum(Ops<Scalar>::rdotc(n, x, incx, x, incx));
+  template <typename Scalar, typename Backend = CpuBackend, typename Comm>
+  real_t<Scalar> pnrm2(BackendRef<Backend> bref, const Comm& c, int n, const Scalar* x, int incx) {
+    auto s = c.allreduce_sum(Ops<Scalar, Backend>::rdotc(bref, n, x, incx, x, incx));
     return std::sqrt(std::abs(s));
   }
 
   // B-norm helper: compute rnorm = B-norm(resid) using workd as scratch.
   // For bmat='G': bop(resid, workd[0..]), rnorm = sqrt(|resid . workd|).
   // For bmat='I': rnorm = nrm2(resid).
-  template <typename Scalar, typename BOP, typename Comm>
-  real_t<Scalar> bnorm(const char bmat, int n, Scalar* resid, Scalar* workd_lo, Scalar* workd_hi, BOP&& bop, const Comm& comm) {
+  template <typename Scalar, typename Backend = CpuBackend, typename BOP, typename Comm>
+  real_t<Scalar> bnorm(BackendRef<Backend> bref, const char bmat, int n, Scalar* resid, Scalar* workd_lo, Scalar* workd_hi,
+                       BOP&& bop, const Comm& comm) {
     using Real = real_t<Scalar>;
     Real val;
     if (bmat == 'G') {
       stats.nbx++;
-      Ops<Scalar>::copy(n, resid, 1, workd_hi, 1);
+      Ops<Scalar, Backend>::copy(bref, n, resid, 1, workd_hi, 1);
       bop(workd_hi, workd_lo);
-      val = prdotc<Scalar>(comm, n, resid, 1, workd_lo, 1);
+      val = prdotc<Scalar, Backend>(bref, comm, n, resid, 1, workd_lo, 1);
       val = std::sqrt(std::abs(val));
     } else {
-      Ops<Scalar>::copy(n, resid, 1, workd_lo, 1);
-      val = pnrm2<Scalar>(comm, n, resid, 1);
+      Ops<Scalar, Backend>::copy(bref, n, resid, 1, workd_lo, 1);
+      val = pnrm2<Scalar, Backend>(bref, comm, n, resid, 1);
     }
     return val;
   }
 
+  // ---------------------------------------------------------------------------
+  // No-BackendRef overloads for the reduction helpers — defaults to CpuBackend
+  // for the historical public API (tests, examples).
+  template <typename Real, typename Comm>
+  Real pdot(const Comm& c, int n, const Real* x, int incx, const Real* y, int incy) {
+    return pdot<Real, CpuBackend>(BackendRef<CpuBackend>{}, c, n, x, incx, y, incy);
+  }
+  template <typename Scalar, typename Comm>
+  real_t<Scalar> prdotc(const Comm& c, int n, const Scalar* x, int incx, const Scalar* y, int incy) {
+    return prdotc<Scalar, CpuBackend>(BackendRef<CpuBackend>{}, c, n, x, incx, y, incy);
+  }
+  template <typename Real, typename Comm>
+  Real pnrm2_real(const Comm& c, int n, const Real* x, int incx) {
+    return pnrm2_real<Real, CpuBackend>(BackendRef<CpuBackend>{}, c, n, x, incx);
+  }
+  template <typename Scalar, typename Comm>
+  real_t<Scalar> pnrm2(const Comm& c, int n, const Scalar* x, int incx) {
+    return pnrm2<Scalar, CpuBackend>(BackendRef<CpuBackend>{}, c, n, x, incx);
+  }
+  template <typename Scalar, typename BOP, typename Comm>
+  real_t<Scalar> bnorm(const char bmat, int n, Scalar* resid, Scalar* workd_lo, Scalar* workd_hi, BOP&& bop, const Comm& comm) {
+    return bnorm<Scalar, CpuBackend>(BackendRef<CpuBackend>{}, bmat, n, resid, workd_lo, workd_hi, std::forward<BOP>(bop), comm);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Backend-aware extension points. Primary versions are CPU; the CUDA
+  // backend overrides them via additional overloads declared in cuda.hpp.
+  // Algorithmic templates call these by name; ADL + overload resolution
+  // picks the right one based on the BackendRef argument.
+  // ---------------------------------------------------------------------------
+
+  // In-place allreduce on a length-`len` buffer. Under CpuBackend the buffer
+  // is host-resident, so we forward to comm.allreduce_sum directly. The CUDA
+  // backend provides an overload that stages D->H, allreduces on a host
+  // scratch buffer, then copies H->D. Length is always small (<= ncv) — the
+  // staging cost is irrelevant compared to the matvec.
+  template <typename Backend, typename Scalar, typename Comm>
+  void backend_allreduce(BackendRef<Backend>, const Comm& comm, Scalar* buf, int len) {
+    comm.allreduce_sum(buf, len);
+  }
+
+
 }  // namespace arnoldi::detail
 
 #endif  // ARNOLDI_DETAIL_OPS_HPP
diff --git a/include/arnoldi/detail/sym.hpp b/include/arnoldi/detail/sym.hpp
index 8b88b6b..e3ae2f3 100644
--- a/include/arnoldi/detail/sym.hpp
+++ b/include/arnoldi/detail/sym.hpp
@@ -21,9 +21,10 @@ namespace arnoldi::detail {
   // saitr — Lanczos iteration (symmetric / Hermitian).
   // Scalar = double|float for real symmetric, complex<T> for Hermitian.
   // Vectors (v, resid, workd) are Scalar; tridiagonal h is detail::real_t<Scalar>.
-  template <typename Scalar, typename OP, typename BOP, typename Comm>
-  void saitr(const char* bmat, int n, int k, int np, int mode, Scalar* resid, detail::real_t<Scalar>& rnorm, Scalar* v, int ldv,
-             detail::real_t<Scalar>* h, int ldh, Scalar* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) {
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void saitr(detail::BackendRef<Backend> bref, const char* bmat, int n, int k, int np, int mode, Scalar* resid,
+             detail::real_t<Scalar>& rnorm, Scalar* v, int ldv, detail::real_t<Scalar>* h, int ldh, Scalar* workd, int& info,
+             OP&& op, BOP&& bop, const Comm& comm) {
     using Real    = detail::real_t<Scalar>;
     const int ipj = 0, irj = n, ivj = 2 * n;
 
@@ -33,6 +34,26 @@ namespace arnoldi::detail {
     detail::arscnd(t0);
     info = 0;
 
+    // The diagonal of H (h[ldh + j - 1]) is write-only inside this loop and
+    // is consumed only later by seigt. Stage each per-step device read
+    // asynchronously and reconstruct the host diagonal with a single stream
+    // sync per saitr call, instead of one device->host stall per Lanczos
+    // step. Under CpuBackend read_scalar_async is an immediate copy and
+    // sync() a no-op, so the staged values and their summation order are
+    // identical to a direct read — numerically bit-for-bit unchanged.
+    std::vector<Scalar> hdiag_stage(static_cast<std::size_t>(k + np));
+    std::vector<Scalar> hreorth_stage(static_cast<std::size_t>(k + np) * 2);
+    std::vector<int>    hreorth_cnt(static_cast<std::size_t>(k + np), 0);
+    auto                flush_hdiag = [&](int jlast) {
+      detail::Ops<Scalar, Backend>::sync(bref);
+      for (int jj = k + 1; jj <= jlast; ++jj) {
+        Real d   = std::real(hdiag_stage[jj - 1]);
+        int  cnt = hreorth_cnt[jj - 1];
+        for (int r = 0; r < cnt; ++r) d += std::real(hreorth_stage[static_cast<std::size_t>(jj - 1) * 2 + r]);
+        h[ldh + jj - 1] = d;
+      }
+    };
+
     for (int j = k + 1; j <= k + np; ++j) {
       if (msglvl > 2) {
         int jj = j;
@@ -56,38 +77,40 @@ namespace arnoldi::detail {
 
         int ierr = -1;
         for (int itry = 1; itry <= 3; ++itry) {
-          getv0<Scalar>(bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm);
+          getv0<Scalar, Backend>(bref, bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm);
           if (ierr >= 0) break;
         }
         if (ierr < 0) {
           info = j - 1;
+          flush_hdiag(j - 1);
           detail::arscnd(t1);
           detail::stats.aitr += (t1 - t0);
           return;
         }
       }
 
-      detail::Ops<Scalar>::copy(n, resid, 1, &v[(j - 1) * ldv], 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &v[(j - 1) * ldv], 1);
       if (rnorm >= safmin) {
         Real temp1 = Real(1) / rnorm;
-        detail::Ops<Scalar>::rscal(n, temp1, &v[(j - 1) * ldv], 1);
-        detail::Ops<Scalar>::rscal(n, temp1, &workd[ipj], 1);
+        detail::Ops<Scalar, Backend>::rscal(bref, n, temp1, &v[(j - 1) * ldv], 1);
+        detail::Ops<Scalar, Backend>::rscal(bref, n, temp1, &workd[ipj], 1);
       } else {
         int  i_zero = 0, i_one = 1;
         int  infol;
         Real r_one = Real(1);
-        detail::Ops<Scalar>::lascl_("G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &v[(j - 1) * ldv], &n, &infol);
-        detail::Ops<Scalar>::lascl_("G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &workd[ipj], &n, &infol);
+        detail::Ops<Scalar, Backend>::lascl_(bref, "G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &v[(j - 1) * ldv], &n,
+                                             &infol);
+        detail::Ops<Scalar, Backend>::lascl_(bref, "G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &workd[ipj], &n, &infol);
       }
 
       detail::stats.nopx++;
       detail::arscnd(t2);
-      detail::Ops<Scalar>::copy(n, &v[(j - 1) * ldv], 1, &workd[ivj], 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, &v[(j - 1) * ldv], 1, &workd[ivj], 1);
       op(&workd[ivj], &workd[irj]);
       detail::arscnd(t3);
       detail::stats.mvopx += (t3 - t2);
 
-      detail::Ops<Scalar>::copy(n, &workd[irj], 1, resid, 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, &workd[irj], 1, resid, 1);
 
       if (mode != 2) {
         detail::arscnd(t2);
@@ -95,7 +118,7 @@ namespace arnoldi::detail {
           detail::stats.nbx++;
           bop(&workd[irj], &workd[ipj]);
         } else {
-          detail::Ops<Scalar>::copy(n, resid, 1, &workd[ipj], 1);
+          detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[ipj], 1);
         }
         if (*bmat == 'G') {
           detail::arscnd(t3);
@@ -105,26 +128,26 @@ namespace arnoldi::detail {
 
       Real wnorm;
       if (mode == 2) {
-        wnorm = detail::prdotc<Scalar>(comm, n, resid, 1, &workd[ivj], 1);
+        wnorm = detail::prdotc<Scalar, Backend>(bref, comm, n, resid, 1, &workd[ivj], 1);
         wnorm = std::sqrt(std::abs(wnorm));
       } else if (*bmat == 'G') {
-        wnorm = detail::prdotc<Scalar>(comm, n, resid, 1, &workd[ipj], 1);
+        wnorm = detail::prdotc<Scalar, Backend>(bref, comm, n, resid, 1, &workd[ipj], 1);
         wnorm = std::sqrt(std::abs(wnorm));
       } else {
-        wnorm = detail::pnrm2<Scalar>(comm, n, resid, 1);
+        wnorm = detail::pnrm2<Scalar, Backend>(bref, comm, n, resid, 1);
       }
 
       if (mode != 2) {
-        detail::Ops<Scalar>::gemv(detail::Ops<Scalar>::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ipj], 1, Scalar(0),
-                                  &workd[irj], 1);
+        detail::Ops<Scalar, Backend>::gemv(bref, detail::Ops<Scalar, Backend>::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ipj],
+                                           1, Scalar(0), &workd[irj], 1);
       } else {
-        detail::Ops<Scalar>::gemv(detail::Ops<Scalar>::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ivj], 1, Scalar(0),
-                                  &workd[irj], 1);
+        detail::Ops<Scalar, Backend>::gemv(bref, detail::Ops<Scalar, Backend>::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ivj],
+                                           1, Scalar(0), &workd[irj], 1);
       }
-      comm.allreduce_sum(&workd[irj], j);
-      detail::Ops<Scalar>::gemv("N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1);
+      detail::backend_allreduce(bref, comm, &workd[irj], j);
+      detail::Ops<Scalar, Backend>::gemv(bref, "N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1);
 
-      h[ldh + j - 1] = std::real(workd[irj + j - 1]);
+      detail::Ops<Scalar, Backend>::read_scalar_async(bref, &hdiag_stage[j - 1], &workd[irj + j - 1]);
       if (j == 1 || rstart)
         h[j - 1] = Real(0);
       else
@@ -135,10 +158,10 @@ namespace arnoldi::detail {
       detail::arscnd(t2);
       if (*bmat == 'G') {
         detail::stats.nbx++;
-        detail::Ops<Scalar>::copy(n, resid, 1, &workd[irj], 1);
+        detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[irj], 1);
         bop(&workd[irj], &workd[ipj]);
       } else {
-        detail::Ops<Scalar>::copy(n, resid, 1, &workd[ipj], 1);
+        detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[ipj], 1);
       }
       if (*bmat == 'G') {
         detail::arscnd(t3);
@@ -146,10 +169,10 @@ namespace arnoldi::detail {
       }
 
       if (*bmat == 'G') {
-        rnorm = detail::prdotc<Scalar>(comm, n, resid, 1, &workd[ipj], 1);
+        rnorm = detail::prdotc<Scalar, Backend>(bref, comm, n, resid, 1, &workd[ipj], 1);
         rnorm = std::sqrt(std::abs(rnorm));
       } else {
-        rnorm = detail::pnrm2<Scalar>(comm, n, resid, 1);
+        rnorm = detail::pnrm2<Scalar, Backend>(bref, comm, n, resid, 1);
       }
 
       if (rnorm <= Real(0.717) * wnorm) {
@@ -160,22 +183,27 @@ namespace arnoldi::detail {
             detail::debug.vout(2, xtemp, "_saitr: re-orthonalization ; wnorm and rnorm are");
           }
 
-          detail::Ops<Scalar>::gemv(detail::Ops<Scalar>::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ipj], 1, Scalar(0),
-                                    &workd[irj], 1);
-          comm.allreduce_sum(&workd[irj], j);
-          detail::Ops<Scalar>::gemv("N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1);
+          detail::Ops<Scalar, Backend>::gemv(bref, detail::Ops<Scalar, Backend>::herm_trans(), n, j, Scalar(1), v, ldv,
+                                             &workd[ipj], 1, Scalar(0), &workd[irj], 1);
+          detail::backend_allreduce(bref, comm, &workd[irj], j);
+          detail::Ops<Scalar, Backend>::gemv(bref, "N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1);
 
           if (j == 1 || rstart) h[j - 1] = Real(0);
-          h[ldh + j - 1] = h[ldh + j - 1] + std::real(workd[irj + j - 1]);
+          {
+            int& rc = hreorth_cnt[j - 1];
+            detail::Ops<Scalar, Backend>::read_scalar_async(bref, &hreorth_stage[static_cast<std::size_t>(j - 1) * 2 + rc],
+                                                            &workd[irj + j - 1]);
+            ++rc;
+          }
 
           detail::arscnd(t2);
           Real rnorm1;
           if (*bmat == 'G') {
             detail::stats.nbx++;
-            detail::Ops<Scalar>::copy(n, resid, 1, &workd[irj], 1);
+            detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[irj], 1);
             bop(&workd[irj], &workd[ipj]);
           } else {
-            detail::Ops<Scalar>::copy(n, resid, 1, &workd[ipj], 1);
+            detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[ipj], 1);
           }
           if (*bmat == 'G') {
             detail::arscnd(t3);
@@ -183,10 +211,10 @@ namespace arnoldi::detail {
           }
 
           if (*bmat == 'G') {
-            rnorm1 = detail::prdotc<Scalar>(comm, n, resid, 1, &workd[ipj], 1);
+            rnorm1 = detail::prdotc<Scalar, Backend>(bref, comm, n, resid, 1, &workd[ipj], 1);
             rnorm1 = std::sqrt(std::abs(rnorm1));
           } else {
-            rnorm1 = detail::pnrm2<Scalar>(comm, n, resid, 1);
+            rnorm1 = detail::pnrm2<Scalar, Backend>(bref, comm, n, resid, 1);
           }
 
           if (msglvl > 0 && riter > 0) {
@@ -205,7 +233,7 @@ namespace arnoldi::detail {
           detail::stats.nitref++;
           rnorm = rnorm1;
           if (riter == 1) {
-            for (int jj = 0; jj < n; jj++) resid[jj] = Scalar(0);
+            detail::Ops<Scalar, Backend>::zero(bref, n, resid);
             rnorm = Real(0);
           }
         }
@@ -218,12 +246,14 @@ namespace arnoldi::detail {
       if (h[j - 1] < Real(0)) {
         h[j - 1] = -h[j - 1];
         if (j < k + np)
-          detail::Ops<Scalar>::rscal(n, Real(-1), &v[j * ldv], 1);
+          detail::Ops<Scalar, Backend>::rscal(bref, n, Real(-1), &v[j * ldv], 1);
         else
-          detail::Ops<Scalar>::rscal(n, Real(-1), resid, 1);
+          detail::Ops<Scalar, Backend>::rscal(bref, n, Real(-1), resid, 1);
       }
     }
 
+    flush_hdiag(k + np);
+
     detail::arscnd(t1);
     detail::stats.aitr += (t1 - t0);
 
@@ -238,9 +268,9 @@ namespace arnoldi::detail {
 
   // sapps — implicit QR shifts for symmetric/Hermitian Lanczos.
   // Scalar-aware: Givens rotations on Real h/q, final V update on Scalar.
-  template <typename Scalar>
-  void sapps(int n, int kev, int np, detail::real_t<Scalar>* shift, Scalar* v, int ldv, detail::real_t<Scalar>* h, int ldh,
-             Scalar* resid, detail::real_t<Scalar>* q, int ldq, Scalar* workd) {
+  template <typename Scalar, typename Backend = detail::CpuBackend>
+  void sapps(detail::BackendRef<Backend> bref, int n, int kev, int np, detail::real_t<Scalar>* shift, Scalar* v, int ldv,
+             detail::real_t<Scalar>* h, int ldh, Scalar* resid, detail::real_t<Scalar>* q, int ldq, Scalar* workd) {
     using Real     = detail::real_t<Scalar>;
     const Real one = Real(1), zero = Real(0);
 
@@ -355,21 +385,22 @@ namespace arnoldi::detail {
     }
 
     // V_new = V * Q and resid update.
-    if (h[kev] > zero) detail::Ops<Scalar>::gemv_rv("N", n, kplusp, one, v, ldv, &q[kev * ldq], 1, zero, &workd[n], 1);
+    if (h[kev] > zero)
+      detail::Ops<Scalar, Backend>::gemv_rv(bref, "N", n, kplusp, one, v, ldv, &q[kev * ldq], 1, zero, &workd[n], 1);
 
     for (i = 1; i <= kev; i++) {
-      detail::Ops<Scalar>::gemv_rv("N", n, kplusp - i + 1, one, v, ldv, &q[(kev - i) * ldq], 1, zero, workd, 1);
-      detail::Ops<Scalar>::copy(n, workd, 1, &v[(kplusp - i) * ldv], 1);
+      detail::Ops<Scalar, Backend>::gemv_rv(bref, "N", n, kplusp - i + 1, one, v, ldv, &q[(kev - i) * ldq], 1, zero, workd, 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, workd, 1, &v[(kplusp - i) * ldv], 1);
     }
 
     for (i = 1; i <= kev; i++) {
-      detail::Ops<Scalar>::copy(n, &v[(np + i - 1) * ldv], 1, &v[(i - 1) * ldv], 1);
+      detail::Ops<Scalar, Backend>::copy(bref, n, &v[(np + i - 1) * ldv], 1, &v[(i - 1) * ldv], 1);
     }
 
-    if (h[kev] > zero) detail::Ops<Scalar>::copy(n, &workd[n], 1, &v[kev * ldv], 1);
+    if (h[kev] > zero) detail::Ops<Scalar, Backend>::copy(bref, n, &workd[n], 1, &v[kev * ldv], 1);
 
-    detail::Ops<Scalar>::rscal(n, q[(kev - 1) * ldq + kplusp - 1], resid, 1);
-    if (h[kev] > zero) detail::Ops<Scalar>::raxpy(n, h[kev], &v[kev * ldv], 1, resid, 1);
+    detail::Ops<Scalar, Backend>::rscal(bref, n, q[(kev - 1) * ldq + kplusp - 1], resid, 1);
+    if (h[kev] > zero) detail::Ops<Scalar, Backend>::raxpy(bref, n, h[kev], &v[kev * ldv], 1, resid, 1);
 
     if (msglvl > 1) {
       detail::debug.vout(1, &q[(kev - 1) * ldq + kplusp - 1], "_sapps: sigmak of the updated residual vector");
@@ -385,11 +416,12 @@ namespace arnoldi::detail {
   }
 
   // saup2 — main Lanczos iteration driver (symmetric / Hermitian).
-  template <typename Scalar, typename OP, typename BOP, typename Comm>
-  void saup2(const char* bmat, int n, const char* which, int& nev, int& np, detail::real_t<Scalar> tol, Scalar* resid, int mode,
-             int iupd, int ishift, int& mxiter, Scalar* v, int ldv, detail::real_t<Scalar>* h, int ldh,
-             detail::real_t<Scalar>* ritz, detail::real_t<Scalar>* bounds, detail::real_t<Scalar>* q, int ldq,
-             detail::real_t<Scalar>* workl, Scalar* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) {
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void saup2(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int& nev, int& np,
+             detail::real_t<Scalar> tol, Scalar* resid, int mode, int iupd, int ishift, int& mxiter, Scalar* v, int ldv,
+             detail::real_t<Scalar>* h, int ldh, detail::real_t<Scalar>* ritz, detail::real_t<Scalar>* bounds,
+             detail::real_t<Scalar>* q, int ldq, detail::real_t<Scalar>* workl, Scalar* workd, int& info, OP&& op, BOP&& bop,
+             const Comm& comm) {
     using Real = detail::real_t<Scalar>;
     double t0, t1, t2, t3;
     detail::arscnd(t0);
@@ -415,14 +447,14 @@ namespace arnoldi::detail {
       detail::stats.aup2 = t1 - t0;
     };
 
-    getv0<Scalar>(bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm);
+    getv0<Scalar, Backend>(bref, bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm);
     if (rnorm == Real(0)) {
       info = -9;
       end_saup2();
       return;
     }
 
-    saitr<Scalar>(bmat, n, 0, nev0, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
+    saitr<Scalar, Backend>(bref, bmat, n, 0, nev0, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
     if (info > 0) {
       np     = info;
       mxiter = iter;
@@ -441,7 +473,7 @@ namespace arnoldi::detail {
         detail::debug.ivout(1, &np, "_saup2: Extend the Lanczos factorization by");
       }
 
-      saitr<Scalar>(bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
+      saitr<Scalar, Backend>(bref, bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm);
       if (info > 0) {
         np     = info;
         mxiter = iter;
@@ -563,18 +595,18 @@ namespace arnoldi::detail {
         }
       }
 
-      sapps<Scalar>(n, nev, np, ritz, v, ldv, h, ldh, resid, q, ldq, workd);
+      sapps<Scalar, Backend>(bref, n, nev, np, ritz, v, ldv, h, ldh, resid, q, ldq, workd);
 
       detail::arscnd(t2);
       if (*bmat == 'G') {
         detail::stats.nbx++;
-        detail::Ops<Scalar>::copy(n, resid, 1, &workd[n], 1);
+        detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, &workd[n], 1);
         bop(&workd[n], workd);
-        rnorm = detail::prdotc<Scalar>(comm, n, resid, 1, workd, 1);
+        rnorm = detail::prdotc<Scalar, Backend>(bref, comm, n, resid, 1, workd, 1);
         rnorm = std::sqrt(std::abs(rnorm));
       } else {
-        detail::Ops<Scalar>::copy(n, resid, 1, workd, 1);
-        rnorm = detail::pnrm2<Scalar>(comm, n, resid, 1);
+        detail::Ops<Scalar, Backend>::copy(bref, n, resid, 1, workd, 1);
+        rnorm = detail::pnrm2<Scalar, Backend>(bref, comm, n, resid, 1);
       }
       if (*bmat == 'G') {
         detail::arscnd(t3);
@@ -598,10 +630,10 @@ namespace arnoldi::detail {
   // saupd — symmetric/Hermitian eigensolver entry point (callback).
   // Scalar = double|float for real, complex<T> for Hermitian.
   // workd is Scalar* (3n), workl is detail::real_t<Scalar>* (ncv^2 + 8*ncv).
-  template <typename Scalar, typename OP, typename BOP, typename Comm>
-  void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol, Scalar* resid, int ncv, Scalar* v,
-             int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info, OP&& op,
-             BOP&& bop, const Comm& comm) {
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename OP, typename BOP, typename Comm>
+  void saupd(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol,
+             Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl,
+             int lworkl, int& info, OP&& op, BOP&& bop, const Comm& comm) {
     using Real = detail::real_t<Scalar>;
     detail::stats.reset();
     double t0, t1;
@@ -665,8 +697,8 @@ namespace arnoldi::detail {
     ipntr[6]   = bounds + 1;
     ipntr[10]  = iw + 1;
 
-    saup2<Scalar>(bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh, &workl[ritz],
-                  &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm);
+    saup2<Scalar, Backend>(bref, bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh,
+                           &workl[ritz], &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm);
 
     iparam[2]  = mxiter;
     iparam[4]  = np;
@@ -699,30 +731,56 @@ namespace arnoldi::detail {
   }
 
   // op + bop, no comm (defaults to SerialComm).
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename OP, typename BOP>
+  void saupd(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol,
+             Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl,
+             int lworkl, int& info, OP&& op, BOP&& bop) {
+    saupd<Scalar, Backend>(bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info,
+                           std::forward<OP>(op), std::forward<BOP>(bop), SerialComm{});
+  }
+
+  // Standard problem (bmat='I'), no bop, no comm.
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename OP>
+  void saupd(detail::BackendRef<Backend> bref, const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol,
+             Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl,
+             int lworkl, int& info, OP&& op) {
+    saupd<Scalar, Backend>(
+        bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op),
+        [](const Scalar*, Scalar*) {}, SerialComm{});
+  }
+
+  // ---------------------------------------------------------------------------
+  // saupd no-BackendRef overloads — defaults to CpuBackend for the historical
+  // public callback API (tests, examples).
+  template <typename Scalar, typename OP, typename BOP, typename Comm>
+  void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol, Scalar* resid, int ncv, Scalar* v,
+             int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info, OP&& op,
+             BOP&& bop, const Comm& comm) {
+    saupd<Scalar, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, n, which, nev, tol, resid, ncv, v, ldv,
+                                      iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op), std::forward<BOP>(bop),
+                                      comm);
+  }
   template <typename Scalar, typename OP, typename BOP>
   void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol, Scalar* resid, int ncv, Scalar* v,
              int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info, OP&& op,
              BOP&& bop) {
-    saupd<Scalar>(bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op),
-                  std::forward<BOP>(bop), SerialComm{});
+    saupd<Scalar, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, n, which, nev, tol, resid, ncv, v, ldv,
+                                      iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op), std::forward<BOP>(bop));
   }
-
-  // Standard problem (bmat='I'), no bop, no comm.
   template <typename Scalar, typename OP>
   void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar>& tol, Scalar* resid, int ncv, Scalar* v,
              int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info, OP&& op) {
-    saupd<Scalar>(
-        bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op),
-        [](const Scalar*, Scalar*) {}, SerialComm{});
+    saupd<Scalar, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, bmat, n, which, nev, tol, resid, ncv, v, ldv,
+                                      iparam, ipntr, workd, workl, lworkl, info, std::forward<OP>(op));
   }
 
   // seupd — symmetric/Hermitian eigenvector extraction (callback).
   // d is Real* (eigenvalues), z is Scalar* (eigenvectors).
-  template <typename Scalar, typename Comm>
-  void seupd(bool rvec, const char* howmny, detail::real_t<Scalar>* d, Scalar* z, int ldz, detail::real_t<Scalar> sigma,
-             const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar> tol, Scalar* resid, int ncv, Scalar* v,
-             int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info,
-             const Comm& comm) {
+  template <typename Scalar, typename Backend = detail::CpuBackend, typename Comm>
+  void seupd(detail::BackendRef<Backend> bref, bool rvec, const char* howmny, detail::real_t<Scalar>* d, Scalar* z, int ldz,
+             detail::real_t<Scalar> sigma, const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar> tol,
+             Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl,
+             int lworkl, int& info, const Comm& comm) {
     using Real = detail::real_t<Scalar>;
 
     char type[7];
@@ -793,7 +851,7 @@ namespace arnoldi::detail {
     if (*bmat == 'I')
       bnorm2 = rnorm;
     else
-      bnorm2 = detail::pnrm2<Scalar>(comm, n, workd, 1);
+      bnorm2 = detail::pnrm2<Scalar, Backend>(bref, comm, n, workd, 1);
 
     std::vector<int> select(ncv, 0);
 
@@ -890,23 +948,45 @@ namespace arnoldi::detail {
     }
 
     if (rvec && *howmny == 'A') {
-      // z = V * S (Scalar V times Real eigenvector matrix S). When z
-      // aliases v, a column-by-column multiply would destroy V columns
-      // before they are fully consumed; use a temporary in that case.
+      // z = V * S (Scalar V times Real eigenvector matrix S = workl[iq..]).
+      // When z aliases v, the GEMM source/destination would overlap; copy V
+      // into a tmpbuf in that case so the GEMM has a clean source.
       Scalar*             vbuf = v;
       std::vector<Scalar> tmpbuf;
       if (z == v) {
         tmpbuf.resize(static_cast<std::size_t>(ldv) * ncv);
-        detail::Ops<Scalar>::copy(ldv * ncv, v, 1, tmpbuf.data(), 1);
+        detail::Ops<Scalar, Backend>::copy(bref, ldv * ncv, v, 1, tmpbuf.data(), 1);
         vbuf = tmpbuf.data();
       }
-      for (j = 0; j < nconv; j++) {
-        for (int ii = 0; ii < n; ii++) z[j * ldz + ii] = Scalar(0);
-        for (k = 0; k < ncv; k++) {
-          detail::Ops<Scalar>::raxpy(n, workl[iq + j * ldq + k], &vbuf[k * ldv], 1, &z[j * ldz], 1);
-        }
+
+      // S is Real, ncv-rows × nconv-cols, column-major, leading dim ldq.
+      // For real Scalar, S has the right storage type already; for complex
+      // Scalar, widen S to Scalar (im = 0) on the host first.
+      std::vector<Scalar> S_host(static_cast<std::size_t>(ldq) * nconv);
+      if constexpr (std::is_same_v<Scalar, Real>) {
+        for (j = 0; j < ldq * nconv; j++) S_host[j] = workl[iq + j];
+      } else {
+        for (j = 0; j < nconv; j++)
+          for (k = 0; k < ncv; k++) S_host[j * ldq + k] = Scalar(workl[iq + j * ldq + k], 0);
       }
 
+      // Under CpuBackend the gemm consumes S directly from the host buffer.
+      // Under a device backend we upload S into a backend-typed buffer first;
+      // the cuBLAS gemm requires all three operands to be device pointers.
+      // S is small (ldq*nconv <= ncv²), so the upload cost is negligible.
+      typename detail::buffer_traits<Backend, Scalar>::vector_type S_buf;
+      const Scalar*                                                S_ptr;
+      if constexpr (std::is_same_v<Backend, detail::CpuBackend>) {
+        S_ptr = S_host.data();
+      } else {
+        S_buf.assign(static_cast<std::size_t>(ldq) * nconv, Scalar{});
+        detail::buffer_traits<Backend, Scalar>::copy_from_host(S_buf, S_host.data(),
+                                                               static_cast<std::size_t>(ldq) * nconv);
+        S_ptr = S_buf.data();
+      }
+
+      detail::Ops<Scalar, Backend>::gemm(bref, "N", "N", n, nconv, ncv, Scalar(1), vbuf, ldv, S_ptr, ldq, Scalar(0), z, ldz);
+
       // Last-row weights for error bounds.
       for (j = 1; j <= ncv - 1; j++) workl[ihb + j - 1] = Real(0);
       workl[ihb + ncv - 1] = Real(1);
@@ -941,18 +1021,37 @@ namespace arnoldi::detail {
 
     if (rvec && std::strcmp(type, "REGULR") != 0) {
       for (k = 0; k < nconv; k++) {
-        detail::Ops<Scalar>::raxpy(n, workl[iw + k], resid, 1, &z[k * ldz], 1);
+        detail::Ops<Scalar, Backend>::raxpy(bref, n, workl[iw + k], resid, 1, &z[k * ldz], 1);
       }
     }
   }
 
   // seupd without Comm (defaults to SerialComm).
+  template <typename Scalar, typename Backend = detail::CpuBackend>
+  void seupd(detail::BackendRef<Backend> bref, bool rvec, const char* howmny, detail::real_t<Scalar>* d, Scalar* z, int ldz,
+             detail::real_t<Scalar> sigma, const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar> tol,
+             Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl,
+             int lworkl, int& info) {
+    seupd<Scalar, Backend>(bref, rvec, howmny, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
+                           workd, workl, lworkl, info, SerialComm{});
+  }
+
+  // seupd no-BackendRef overloads — defaults to CpuBackend for the historical
+  // public callback API (tests, examples).
+  template <typename Scalar, typename Comm>
+  void seupd(bool rvec, const char* howmny, detail::real_t<Scalar>* d, Scalar* z, int ldz, detail::real_t<Scalar> sigma,
+             const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar> tol, Scalar* resid, int ncv, Scalar* v,
+             int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info,
+             const Comm& comm) {
+    seupd<Scalar, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, rvec, howmny, d, z, ldz, sigma, bmat, n, which,
+                                      nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, comm);
+  }
   template <typename Scalar>
   void seupd(bool rvec, const char* howmny, detail::real_t<Scalar>* d, Scalar* z, int ldz, detail::real_t<Scalar> sigma,
              const char* bmat, int n, const char* which, int nev, detail::real_t<Scalar> tol, Scalar* resid, int ncv, Scalar* v,
              int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t<Scalar>* workl, int lworkl, int& info) {
-    seupd<Scalar>(rvec, howmny, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl,
-                  lworkl, info, SerialComm{});
+    seupd<Scalar, detail::CpuBackend>(detail::BackendRef<detail::CpuBackend>{}, rvec, howmny, d, z, ldz, sigma, bmat, n, which,
+                                      nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
   }
 
 }  // namespace arnoldi::detail
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 116f301..e9aee6f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,33 @@
+# Test suite — built on Catch2 v3 (fetched at configure time).
+
+include(FetchContent)
+FetchContent_Declare(
+    Catch2
+    GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+    GIT_TAG        v3.7.1
+)
+FetchContent_MakeAvailable(Catch2)
+
+# Catch2 ships CMake helpers (catch_discover_tests) under extras/.
+list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
+include(Catch)
+
 foreach(_test IN ITEMS test_validation test_getv0 test_solver test_helpers test_modes test_low_level_validation)
     add_executable(${_test} ${_test}.cpp)
-    target_link_libraries(${_test} PRIVATE arnoldi)
-    add_test(NAME ${_test} COMMAND ${_test})
+    target_link_libraries(${_test} PRIVATE arnoldi Catch2::Catch2WithMain)
+    catch_discover_tests(${_test})
 endforeach()
+
+# CUDA-backend test. Mirrors test_solver but launches the matvec as a CUDA
+# kernel and uses Arnoldi<..., CudaBackend>. Built only when ARNOLDI_USE_CUDA
+# is ON; that option already enables CUDA::cudart and CUDA::cublas on the
+# arnoldi interface target.
+if(ARNOLDI_USE_CUDA)
+    enable_language(CUDA)
+    add_executable(test_solver_cuda test_solver_cuda.cu)
+    set_target_properties(test_solver_cuda PROPERTIES
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON)
+    target_link_libraries(test_solver_cuda PRIVATE arnoldi Catch2::Catch2WithMain)
+    catch_discover_tests(test_solver_cuda)
+endif()
diff --git a/tests/test_getv0.cpp b/tests/test_getv0.cpp
index 775e2e9..8186475 100644
--- a/tests/test_getv0.cpp
+++ b/tests/test_getv0.cpp
@@ -131,14 +131,12 @@ static bool test_debug_output() {
     return true;
 }
 
-int main() {
-    std::printf("test_getv0:\n");
-    bool ok = true;
-    ok &= test_basic();
-    ok &= test_generalized_retry();
-    ok &= test_orthogonalization();
-    ok &= test_orthogonalization_generalized();
-    ok &= test_debug_output();
-    std::printf("%s\n", ok ? "PASS" : "FAIL");
-    return ok ? 0 : 1;
-}
+#include <catch2/catch_test_macros.hpp>
+
+// The helpers above return bool and print their own progress; wrap each as
+// a Catch2 test case so they're individually discovered and reported.
+TEST_CASE("getv0 basic", "[getv0]") { CHECK(test_basic()); }
+TEST_CASE("getv0 generalized retry", "[getv0]") { CHECK(test_generalized_retry()); }
+TEST_CASE("getv0 orthogonalization", "[getv0]") { CHECK(test_orthogonalization()); }
+TEST_CASE("getv0 orthogonalization generalized", "[getv0]") { CHECK(test_orthogonalization_generalized()); }
+TEST_CASE("getv0 debug output", "[getv0]") { CHECK(test_debug_output()); }
diff --git a/tests/test_helpers.cpp b/tests/test_helpers.cpp
index 37667f1..459349a 100644
--- a/tests/test_helpers.cpp
+++ b/tests/test_helpers.cpp
@@ -17,17 +17,14 @@
 #include <string>
 #include <vector>
 
-static int g_pass = 0, g_fail = 0;
-
-static void check(const char* name, bool cond) {
-    if (cond) {
-        std::printf("  OK    %s\n", name);
-        ++g_pass;
-    } else {
-        std::printf("  FAIL  %s\n", name);
-        ++g_fail;
-    }
-}
+#include <catch2/catch_test_macros.hpp>
+
+// Bridge legacy check("desc", cond) calls onto Catch2 assertions.
+#define check(msg, cond) \
+    do {                 \
+        INFO(msg);       \
+        CHECK((cond));   \
+    } while (0)
 
 template <typename Real>
 static bool nearly(Real a, Real b, Real tol = Real(1e-10)) { return std::abs(a - b) <= tol; }
@@ -35,7 +32,7 @@ static bool nearly(Real a, Real b, Real tol = Real(1e-10)) { return std::abs(a -
 // In ARPACK the "wanted" values end up at the end of the sorted array.  So
 // sortc/sortr with "LM" arranges by ascending magnitude; "SM" by descending
 // magnitude, etc.
-static void test_sortc_all() {
+TEST_CASE("test_sortc_all", "[helpers]") {
     using arnoldi::detail::sortc;
 
     auto run = [&](const char* which, std::vector<double> xr, std::vector<double> xi,
@@ -53,7 +50,7 @@ static void test_sortc_all() {
     run("SI", {0, 0, 0}, {1, 3, 2}, {1, 2, 3}, {0, 0, 0}, {3, 2, 1});
 }
 
-static void test_sortr_all() {
+TEST_CASE("test_sortr_all", "[helpers]") {
     using arnoldi::detail::sortr;
     auto run = [&](const char* which, std::vector<double> a, std::vector<double> b, std::vector<double> exp_a) {
         sortr<double>(which, true, (int)a.size(), a.data(), b.data());
@@ -67,7 +64,7 @@ static void test_sortr_all() {
 
 // Sort a 3x3 matrix by columns based on a key vector x; verify both the key
 // reordering and that columns of `a` followed (apply=true).
-static void test_sesrt_all() {
+TEST_CASE("test_sesrt_all", "[helpers]") {
     using arnoldi::detail::sesrt;
     const int n = 3, lda = 3;
     auto run = [&](const char* which, std::vector<double> x, std::vector<double> exp_x) {
@@ -85,7 +82,7 @@ static void test_sesrt_all() {
 
 // Verify ngets walks each branch of the which-translation switch and the
 // "split conjugate pair" guard that pulls the boundary index back by one.
-static void test_ngets_branches() {
+TEST_CASE("test_ngets_branches", "[helpers]") {
     using arnoldi::detail::ngets;
     int kev = 2, np = 2;
     std::vector<double> rr = {1, 2, 3, 4}, ri = {0, 0, 0, 0}, bnd = {.1, .2, .3, .4};
@@ -108,7 +105,7 @@ static void test_ngets_branches() {
 
 // sgets has a BE special case that swaps two halves of the spectrum to keep
 // "both ends" balanced. Exercise both BE and a regular which.
-static void test_sgets_branches() {
+TEST_CASE("test_sgets_branches", "[helpers]") {
     using arnoldi::detail::sgets;
     std::vector<double> ritz = {1, 2, 3, 4, 5, 6}, bounds = {.1, .2, .3, .4, .5, .6};
     std::vector<double> shifts(3);
@@ -122,7 +119,7 @@ static void test_sgets_branches() {
 
 // Convergence check returns the number of Ritz values whose error bound is
 // below tol*max(eps23, |ritz|).
-static void test_conv_check() {
+TEST_CASE("test_conv_check", "[helpers]") {
     using arnoldi::detail::nconv;
     using arnoldi::detail::sconv;
     std::vector<double> ritzr = {1, 2, 3}, ritzi = {0, 0, 0}, bounds = {1e-15, 1e-15, 1e-1};
@@ -137,7 +134,7 @@ static void test_conv_check() {
 
 // stqrb is the tridiagonal QR used by seigt; cover n=0, n=1 and a small
 // generic case so the LAPACK glue and label-90 pivot loop both run.
-static void test_stqrb() {
+TEST_CASE("test_stqrb", "[helpers]") {
     using arnoldi::detail::stqrb;
     int info = 0;
     std::vector<double> z;
@@ -160,7 +157,7 @@ static void test_stqrb() {
 
 // Exercise the seigt entry point used by sym Lanczos extraction; also covers
 // the n>1 sub-diagonal debug print branch when msglvl>0.
-static void test_seigt_neigh() {
+TEST_CASE("test_seigt_neigh", "[helpers]") {
     using arnoldi::detail::neigh;
     using arnoldi::detail::seigt;
 
@@ -195,7 +192,7 @@ static void test_seigt_neigh() {
 // Hit each print routine in detail::debug for both real and complex element
 // types so the four print_elem overloads are all instantiated.  Output is
 // noisy but small; we just want the lines exercised for coverage.
-static void test_debug_prints() {
+TEST_CASE("test_debug_prints", "[helpers]") {
     using namespace arnoldi::detail;
 
     int    iv[3]   = {1, 2, 3};
@@ -217,7 +214,7 @@ static void test_debug_prints() {
 
 // Exercise the Real/Complex Ops dispatches that the standard double symmetric
 // solver path doesn't reach: rscal/raxpy on a complex vector, dotc, gemv_rv.
-static void test_ops_complex_dispatch() {
+TEST_CASE("test_ops_complex_dispatch", "[helpers]") {
     using cplx = std::complex<double>;
     using O    = arnoldi::detail::Ops<cplx>;
 
@@ -257,7 +254,7 @@ static void test_ops_complex_dispatch() {
 
 // pdot/prdotc/pnrm2_real/pnrm2 reduction wrappers: ensure they call through
 // SerialComm without modifying anything beyond the BLAS forwarding.
-static void test_parallel_reductions() {
+TEST_CASE("test_parallel_reductions", "[helpers]") {
     using arnoldi::detail::pdot;
     using arnoldi::detail::pnrm2;
     using arnoldi::detail::pnrm2_real;
@@ -273,21 +270,3 @@ static void test_parallel_reductions() {
     check("pnrm2 serial complex", nearly(pnrm2<cplx>(comm, 2, z.data(), 1), 5.0));
     check("prdotc serial complex", nearly(prdotc<cplx>(comm, 2, z.data(), 1, z.data(), 1), 25.0));
 }
-
-int main() {
-    std::printf("test_helpers:\n");
-    setvbuf(stdout, nullptr, _IOLBF, 0);
-    test_sortc_all();
-    test_sortr_all();
-    test_sesrt_all();
-    test_ngets_branches();
-    test_sgets_branches();
-    test_conv_check();
-    test_stqrb();
-    test_seigt_neigh();
-    test_debug_prints();
-    test_ops_complex_dispatch();
-    test_parallel_reductions();
-    std::printf("\n%d passed, %d failed\n", g_pass, g_fail);
-    return g_fail > 0 ? 1 : 0;
-}
diff --git a/tests/test_low_level_validation.cpp b/tests/test_low_level_validation.cpp
index ce9e1e6..c7ca4ce 100644
--- a/tests/test_low_level_validation.cpp
+++ b/tests/test_low_level_validation.cpp
@@ -13,16 +13,14 @@
 #include <cstdio>
 #include <vector>
 
-static int g_pass = 0, g_fail = 0;
+#include <catch2/catch_test_macros.hpp>
 
+// Catch2-backed check(): the optional got/want ints are surfaced as scoped
+// INFO context (only printed on failure). Valid inside helper functions
+// because Catch2 records the assertion against the running TEST_CASE.
 static void check(const char* name, bool cond, int got = 0, int want = 0) {
-    if (cond) {
-        std::printf("  OK    %s\n", name);
-        ++g_pass;
-    } else {
-        std::printf("  FAIL  %s (got info=%d, want=%d)\n", name, got, want);
-        ++g_fail;
-    }
+    INFO(name << " (got info=" << got << ", want=" << want << ")");
+    CHECK(cond);
 }
 
 // Convenience holder for a "valid" workspace; individual fields are then
@@ -89,7 +87,7 @@ struct NaupdHarness {
     }
 };
 
-static void test_saupd_errors() {
+TEST_CASE("test_saupd_errors", "[lowlevel]") {
     {
         SaupdHarness h; h.n = 0;
         check("saupd n<=0  -> -1", h.run() == -1, h.info, -1);
@@ -147,7 +145,7 @@ static void test_saupd_errors() {
     }
 }
 
-static void test_naupd_errors() {
+TEST_CASE("test_naupd_errors", "[lowlevel]") {
     {
         NaupdHarness h; h.n = 0;
         check("naupd n<=0  -> -1", h.run() == -1, h.info, -1);
@@ -206,7 +204,7 @@ static void test_naupd_errors() {
 
 // seupd validates its arguments separately; drive several of those branches.
 // seupd assumes nconv>0 (read from iparam[4]); pre-set it so other checks fire.
-static void test_seupd_errors() {
+TEST_CASE("test_seupd_errors", "[lowlevel]") {
     SaupdHarness base;
     base.iparam[4] = 1;  // nconv
 
@@ -294,7 +292,7 @@ static void test_seupd_errors() {
     }
 }
 
-static void test_neupd_errors() {
+TEST_CASE("test_neupd_errors", "[lowlevel]") {
     NaupdHarness base;
 
     auto run_neupd = [&](const char* bmat, const char* which, int n, int nev, int ncv,
@@ -388,7 +386,7 @@ static void test_neupd_errors() {
 // Drive a real solve all the way through saupd, then call seupd directly with
 // z aliasing v.  Forces the seupd `if (z == v) tmpbuf` allocation branch
 // which the high-level Arnoldi class never triggers (it owns separate buffers).
-static void test_seupd_aliased_z_v() {
+TEST_CASE("test_seupd_aliased_z_v", "[lowlevel]") {
     const int n = 16, nev = 3, ncv = 8;
     int   lworkl = ncv * (ncv + 8);
     double tol   = 0.0;
@@ -427,7 +425,7 @@ static void test_seupd_aliased_z_v() {
 // Call the no-bop, no-comm 3-arg saupd/naupd helper overloads directly.
 // The high-level Arnoldi class always passes a bop lambda so these helper
 // overloads are unreachable through the public API.
-static void test_no_bop_overloads() {
+TEST_CASE("test_no_bop_overloads", "[lowlevel]") {
     {
         const int n = 16, nev = 2, ncv = 8;
         int   lworkl = ncv * (ncv + 8);
@@ -471,7 +469,7 @@ static void test_no_bop_overloads() {
 }
 
 // User-supplied initial residual: covers info_in_=1 path through naupd.
-static void test_user_initial_residual_nonsym() {
+TEST_CASE("test_user_initial_residual_nonsym", "[lowlevel]") {
     const int n = 32, nev = 3, ncv = 12;
     std::vector<double> resid(n, 1.0);
 
@@ -484,16 +482,3 @@ static void test_user_initial_residual_nonsym() {
     });
     check("Nonsym user initial resid: solve ok", s.info() >= 0);
 }
-
-int main() {
-    std::printf("test_low_level_validation:\n");
-    test_saupd_errors();
-    test_naupd_errors();
-    test_seupd_errors();
-    test_neupd_errors();
-    test_user_initial_residual_nonsym();
-    test_seupd_aliased_z_v();
-    test_no_bop_overloads();
-    std::printf("\n%d passed, %d failed\n", g_pass, g_fail);
-    return g_fail > 0 ? 1 : 0;
-}
diff --git a/tests/test_modes.cpp b/tests/test_modes.cpp
index f51edae..426c972 100644
--- a/tests/test_modes.cpp
+++ b/tests/test_modes.cpp
@@ -21,17 +21,14 @@ void dgttrs_(const char* trans, const int* n, const int* nrhs, const double* dl,
              const double* du2, const int* ipiv, double* b, const int* ldb, int* info);
 }
 
-static int g_pass = 0, g_fail = 0;
-
-static void check(const char* name, bool cond) {
-    if (cond) {
-        std::printf("  OK    %s\n", name);
-        ++g_pass;
-    } else {
-        std::printf("  FAIL  %s\n", name);
-        ++g_fail;
-    }
-}
+#include <catch2/catch_test_macros.hpp>
+
+// Bridge legacy check("desc", cond) calls onto Catch2 assertions.
+#define check(msg, cond) \
+    do {                 \
+        INFO(msg);       \
+        CHECK((cond));   \
+    } while (0)
 
 // ---------------------------------------------------------------------------
 // Analytic spectra of the test problems.
@@ -177,7 +174,7 @@ static void mv_mass(int n, const double* v, double* w) {
 // Mode 3: A*x = lambda*M*x via OP = inv(A - sigma*M)*M, B = M.
 // Shift-invert with σ=0 converges to the eigenvalues of (A,M) closest to 0,
 // i.e. the smallest eigenvalues of the FE Laplacian.
-static void test_sym_mode3_shift_invert() {
+TEST_CASE("test_sym_mode3_shift_invert", "[modes]") {
     const int    n = 100, nev = 4, ncv = 10;
     const double sigma = 0.0;
     ShiftInvertFE shift(n, sigma);
@@ -210,7 +207,7 @@ static void test_sym_mode3_shift_invert() {
 // Mode 4 (buckling): K*x = lambda*KG*x via OP = inv(K - sigma*KG)*K, B = K.
 // In our setup K = FE stiffness, KG = FE mass, so the underlying spectrum is
 // still the FE Laplacian spectrum and the converged values are nearest σ.
-static void test_sym_mode4_buckling() {
+TEST_CASE("test_sym_mode4_buckling", "[modes]") {
     const int    n = 100, nev = 4, ncv = 10;
     const double sigma = 1.0;
     ShiftInvertFE shift(n, sigma);
@@ -240,7 +237,7 @@ static void test_sym_mode4_buckling() {
 
 // Mode 5 (Cayley): A*x = lambda*M*x via OP = inv(A - sigma*M)*(A + sigma*M).
 // μ = (λ+σ)/(λ−σ); LM Ritz values give λ closest to σ.
-static void test_sym_mode5_cayley() {
+TEST_CASE("test_sym_mode5_cayley", "[modes]") {
     const int    n = 100, nev = 4, ncv = 20;
     const double sigma = 150.0;
     ShiftInvertFE shift(n, sigma);
@@ -272,7 +269,7 @@ static void test_sym_mode5_cayley() {
 // with A*x because saitr later reads it back when computing the M-norm
 // (wnorm² = (M^-1 A x)·x where x is required to be A*x at that point).
 // A = standard tridiagonal Laplacian, M = 2*I, so eigenvalues are λ(A)/2.
-static void test_sym_mode2_generalized() {
+TEST_CASE("test_sym_mode2_generalized", "[modes]") {
     const int n = 64, nev = 3, ncv = 10;
 
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("G", n, "LM", nev, ncv);
@@ -304,7 +301,7 @@ static void test_sym_mode2_generalized() {
 // Nonsymmetric shift-invert (mode 3, real shift).  The underlying problem is
 // still symmetric (FE Laplacian generalized), so all eigenvalues are real and
 // nev nearest σ should appear in r.values_re with values_im ≈ 0.
-static void test_nonsym_mode3_real_shift() {
+TEST_CASE("test_nonsym_mode3_real_shift", "[modes]") {
     const int    n = 100, nev = 4, ncv = 20;
     const double sigma = 1.0;
     ShiftInvertFE shift(n, sigma);
@@ -341,7 +338,7 @@ static void test_nonsym_mode3_real_shift() {
 
 // Sym which="LM" with rvec=false hits the REGULR branch of seupd that copies
 // ritz into d, distinct from the rvec=true sesrt path.
-static void test_sym_regular_rvec_false_LM() {
+TEST_CASE("test_sym_regular_rvec_false_LM", "[modes]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "LM", nev, ncv);
     s.tol(1e-10).maxiter(500);
@@ -361,7 +358,7 @@ static void test_sym_regular_rvec_false_LM() {
 
 // Nonsym with which="SM" exercises the SM mapping in ngets and the regular
 // extraction path.  Underlying matrix is symmetric so eigenvalues are real.
-static void test_nonsym_which_SM() {
+TEST_CASE("test_nonsym_which_SM", "[modes]") {
     const int n = 32, nev = 3, ncv = 12;
     arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double> s("I", n, "SM", nev, ncv);
     s.tol(1e-10).maxiter(500);
@@ -382,7 +379,7 @@ static void test_nonsym_which_SM() {
 
 // Herm with which="LM" + rvec=false; covers the saupd<complex> regular path
 // and the rvec=false copy-Ritz path in seupd<complex>.
-static void test_herm_LM_LA() {
+TEST_CASE("test_herm_LM_LA", "[modes]") {
     using cplx = std::complex<double>;
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Herm, cplx> s("I", n, "LM", nev, ncv);
@@ -428,7 +425,7 @@ struct DebugAllOn {
 // *aupd, sapps/napps, sgets/ngets, *eupd, and getv0 by running once with
 // every debug knob turned all the way up.  Output is voluminous but only
 // printed during this single test.
-static void test_debug_branches() {
+TEST_CASE("test_debug_branches", "[modes]") {
     DebugAllOn guard;
     {
         const int n = 10, nev = 2, ncv = 6;
@@ -482,7 +479,7 @@ static void test_debug_branches() {
 // running both Sym<float> + which="BE" (BE branch swaps with sswap_) and
 // Nonsym<float>.  Standard tridiagonal Laplacian, small problem size.
 // Tolerances are looser than the double tests because of single precision.
-static void test_float_overloads() {
+TEST_CASE("test_float_overloads", "[modes]") {
     {
         const int n = 16, nev = 4, ncv = 12;
         arnoldi::Arnoldi<arnoldi::Kind::Sym, float> s("I", n, "BE", nev, ncv);
@@ -521,7 +518,7 @@ static void test_float_overloads() {
 // naup2 runs many times.  Each pass through the loop hits the bmat=='G'
 // post-napps recompute that the more relaxed shift-invert tests skip
 // because they converge in a single iter.
-static void test_nonsym_generalized_many_iters() {
+TEST_CASE("test_nonsym_generalized_many_iters", "[modes]") {
     const int n = 60, nev = 6, ncv = 10;
     const double sigma = 0.5;
     ShiftInvertFE shift(n, sigma);
@@ -546,7 +543,7 @@ static void test_nonsym_generalized_many_iters() {
 // dimensionless tridiagonal A = [-1, 2, -1] and mass M = [1, 4, 1] / 6;
 // the analytic spectrum is the FE Laplacian's with h ≡ 1 (no scaling):
 //   λ_k = 6 (1 - cos(kπ/(n+1))) / (2 + cos(kπ/(n+1))).
-static void test_sym_generalized_many_iters() {
+TEST_CASE("test_sym_generalized_many_iters", "[modes]") {
     const int n = 60, nev = 4, ncv = 8;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("G", n, "LM", nev, ncv);
     s.tol(1e-12).maxiter(1000).mode(2);
@@ -598,7 +595,7 @@ static void test_sym_generalized_many_iters() {
 
 // Force the REALPT (mode 3 with sigmai != 0) and IMAGPT (mode 4) branches in
 // neupd's type-selection logic.
-static void test_nonsym_neupd_realpt_imagpt() {
+TEST_CASE("test_nonsym_neupd_realpt_imagpt", "[modes]") {
     const int n = 16, nev = 2, ncv = 8;
     auto av = [n](const double* x, double* y) {
         y[0] = x[1];
@@ -629,7 +626,7 @@ static void test_nonsym_neupd_realpt_imagpt() {
 // Spectrum is ±2i cos(kπ/(n+1)) so neupd's conjugate-pair post-processing
 // branches (REGULR & SHIFTI lapy2/scal blocks) get exercised. Real parts must
 // be ≈ 0 and imaginary magnitudes must match the exact closed form.
-static void test_nonsym_complex_eigenvalues() {
+TEST_CASE("test_nonsym_complex_eigenvalues", "[modes]") {
     const int n = 16, nev = 4, ncv = 12;
     auto av = [n](const double* x, double* y) {
         y[0] = x[1];
@@ -668,23 +665,3 @@ static void test_nonsym_complex_eigenvalues() {
         check("Nonsym complex LI rvec=false: matches exact imag pairs", err < 1e-9);
     }
 }
-
-int main() {
-    std::printf("test_modes:\n");
-    test_sym_mode2_generalized();
-    test_sym_mode3_shift_invert();
-    test_sym_mode4_buckling();
-    test_sym_mode5_cayley();
-    test_nonsym_mode3_real_shift();
-    test_sym_regular_rvec_false_LM();
-    test_nonsym_which_SM();
-    test_herm_LM_LA();
-    test_debug_branches();
-    test_float_overloads();
-    test_nonsym_complex_eigenvalues();
-    test_nonsym_neupd_realpt_imagpt();
-    test_nonsym_generalized_many_iters();
-    test_sym_generalized_many_iters();
-    std::printf("\n%d passed, %d failed\n", g_pass, g_fail);
-    return g_fail > 0 ? 1 : 0;
-}
diff --git a/tests/test_solver.cpp b/tests/test_solver.cpp
index 70c8e55..ed8e63b 100644
--- a/tests/test_solver.cpp
+++ b/tests/test_solver.cpp
@@ -5,24 +5,19 @@
 #include <arnoldi/arnoldi.hpp>
 
 #include <algorithm>
+#include <catch2/catch_test_macros.hpp>
 #include <cmath>
 #include <complex>
-#include <cstdio>
 #include <numeric>
 #include <vector>
 
-
-static int g_pass = 0, g_fail = 0;
-
-static void check(const char* name, bool cond) {
-    if (cond) {
-        std::printf("  OK    %s\n", name);
-        ++g_pass;
-    } else {
-        std::printf("  FAIL  %s\n", name);
-        ++g_fail;
-    }
-}
+// Bridge the legacy check("desc", cond) calls onto Catch2: the description
+// becomes scoped INFO context (shown on failure) and the condition a CHECK.
+#define check(msg, cond) \
+    do {                 \
+        INFO(msg);       \
+        CHECK((cond));   \
+    } while (0)
 
 // 1D Laplacian: A(i,i)=2, A(i,i±1)=-1, eigenvalues = 2-2*cos(k*pi/(n+1)).
 static void av_laplacian(int n, const double* x, double* y) {
@@ -36,7 +31,7 @@ static double exact_eig_laplacian(int n, int k) {
     return 2.0 - 2.0 * std::cos(k * M_PI / (n + 1));
 }
 
-void test_sym_double() {
+TEST_CASE("test_sym_double", "[solver]") {
     const int n = 64, nev = 4, ncv = 12;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "SM", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -60,7 +55,7 @@ void test_sym_double() {
     check("Sym<double>: eigenvalue accuracy < 1e-10", maxerr < 1e-10);
 }
 
-void test_sym_float() {
+TEST_CASE("test_sym_float", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, float> s("I", n, "SM", nev, ncv);
     s.tol(1e-5f).maxiter(500);
@@ -81,7 +76,7 @@ void test_sym_float() {
     check("Sym<float>: eigenvalue accuracy < 1e-4", maxerr < 1e-4f);
 }
 
-void test_nonsym_double() {
+TEST_CASE("test_nonsym_double", "[solver]") {
     const int n = 64, nev = 4, ncv = 14;
     const double rho = 10.0;
     const double h = 1.0 / (n + 1);
@@ -115,7 +110,7 @@ void test_nonsym_double() {
     check("Nonsym<double>: LM eigenvalues have large magnitude", min_mag > 100.0);
 }
 
-void test_herm_complex_double() {
+TEST_CASE("test_herm_complex_double", "[solver]") {
     using cplx = std::complex<double>;
     const int n = 64, nev = 4, ncv = 12;
     const cplx off(-1.0, 0.25);
@@ -138,7 +133,7 @@ void test_herm_complex_double() {
         check("Herm<cdouble>: eigenvalues are real-valued (small)", std::isfinite(r.values[i]));
 }
 
-void test_user_initial_resid() {
+TEST_CASE("test_user_initial_resid", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     std::vector<double> resid(n, 1.0);
 
@@ -153,7 +148,7 @@ void test_user_initial_resid() {
     check("user_resid: eigenvalue accurate", err < 1e-10);
 }
 
-void test_values_only() {
+TEST_CASE("test_values_only", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "SM", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -167,7 +162,7 @@ void test_values_only() {
     check("values_only: eigenvalue is small", r.values[0] < 0.1);
 }
 
-void test_nonsym_values_only() {
+TEST_CASE("test_nonsym_values_only", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double> s("I", n, "LM", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -179,7 +174,7 @@ void test_nonsym_values_only() {
     check("nonsym_values_only: values_re non-zero", std::abs(r.values_re[0]) > 0.01);
 }
 
-void test_sym_which_LM() {
+TEST_CASE("test_sym_which_LM", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "LM", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -190,7 +185,7 @@ void test_sym_which_LM() {
     check("Sym which=LM: largest eig > 3.5", r.values[0] > 3.5);
 }
 
-void test_sym_which_LA() {
+TEST_CASE("test_sym_which_LA", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "LA", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -201,7 +196,7 @@ void test_sym_which_LA() {
     check("Sym which=LA: largest algebraic > 3.5", r.values[nev - 1] > 3.5);
 }
 
-void test_sym_which_SA() {
+TEST_CASE("test_sym_which_SA", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "SA", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -213,7 +208,7 @@ void test_sym_which_SA() {
     check("Sym which=SA: all eigenvalues small", maxv < 0.15);
 }
 
-void test_sym_which_BE() {
+TEST_CASE("test_sym_which_BE", "[solver]") {
     const int n = 32, nev = 4, ncv = 12;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "BE", nev, ncv);
     s.tol(1e-12).maxiter(500);
@@ -228,7 +223,7 @@ void test_sym_which_BE() {
 
 // A = Laplacian, B = 2*I.  Eigenvalues of A*x = lambda*B*x are half the
 // standard eigenvalues, so OP = inv(B)*A = A/2 in mode 2.
-void test_sym_generalized() {
+TEST_CASE("test_sym_generalized", "[solver]") {
     const int n = 32, nev = 3, ncv = 10;
 
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("G", n, "LM", nev, ncv);
@@ -250,7 +245,7 @@ void test_sym_generalized() {
     check("Sym generalized: largest eig > 1.5", r.values[0] > 1.5);
 }
 
-void test_insufficient_maxiter() {
+TEST_CASE("test_insufficient_maxiter", "[solver]") {
     const int n = 64, nev = 4, ncv = 12;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "SM", nev, ncv);
     s.tol(1e-14).maxiter(1);
@@ -261,7 +256,7 @@ void test_insufficient_maxiter() {
           s.info() != 0 || s.num_converged() < nev);
 }
 
-void test_accessors() {
+TEST_CASE("test_accessors", "[solver]") {
     const int n = 16, nev = 2, ncv = 6;
     arnoldi::Arnoldi<arnoldi::Kind::Sym, double> s("I", n, "SM", nev, ncv);
     s.tol(1e-10).maxiter(300);
@@ -272,25 +267,3 @@ void test_accessors() {
     check("accessors: iparam() != nullptr", s.iparam() != nullptr);
     check("accessors: ipntr() != nullptr", s.ipntr() != nullptr);
 }
-
-int main() {
-    std::printf("test_solver:\n");
-
-    test_sym_double();
-    test_sym_float();
-    test_nonsym_double();
-    test_herm_complex_double();
-    test_user_initial_resid();
-    test_values_only();
-    test_nonsym_values_only();
-    test_sym_which_LM();
-    test_sym_which_LA();
-    test_sym_which_SA();
-    test_sym_which_BE();
-    test_sym_generalized();
-    test_insufficient_maxiter();
-    test_accessors();
-
-    std::printf("\n%d passed, %d failed\n", g_pass, g_fail);
-    return g_fail > 0 ? 1 : 0;
-}
diff --git a/tests/test_solver_cuda.cu b/tests/test_solver_cuda.cu
new file mode 100644
index 0000000..b7d4c2d
--- /dev/null
+++ b/tests/test_solver_cuda.cu
@@ -0,0 +1,282 @@
+// CUDA-backend tests for arnoldi::Arnoldi.
+//
+// Mirror of tests/test_solver.cpp for the Sym / Herm path, but with the
+// matvec implemented as a CUDA kernel receiving device pointers. The user
+// matvec callback receives DEVICE pointers (per the cuda.hpp contract);
+// solver workspace lives in device_vector under CudaBackend.
+//
+// Cases:
+//   - Sym Laplacian (n=128): eigenvalues match the exact analytical formula
+//     2 - 2 cos(k pi / (n+1)).
+//   - Herm complex tridiagonal (n=128): eigenvalues real, residual
+//     ||A v - lambda v|| / ||A|| below tolerance.
+//   - which="LM": largest-magnitude eigenvalue is large.
+//   - Eigenvector residual: ||A v - lambda v|| is small.
+//   - initial_resid_device: solve seeded from a device-resident vector.
+
+#include <arnoldi/arnoldi.hpp>
+#include <arnoldi/cuda.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdio>
+#include <cuComplex.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+
+// Bridge legacy check("desc", cond) calls onto Catch2 assertions.
+#define check(msg, cond) \
+    do {                 \
+        INFO(msg);       \
+        CHECK((cond));   \
+    } while (0)
+
+// Skip a CUDA test case gracefully when no device is present.
+static bool cuda_device_present() {
+    int c = 0;
+    return cudaGetDeviceCount(&c) == cudaSuccess && c > 0;
+}
+#define REQUIRE_CUDA()                                  \
+    do {                                                \
+        if (!cuda_device_present())                     \
+            SKIP("no CUDA device available");           \
+    } while (0)
+
+static double exact_eig_laplacian(int n, int k) {
+    return 2.0 - 2.0 * std::cos(k * M_PI / (n + 1));
+}
+
+// -------- Kernels ------------------------------------------------------------
+// 1D Dirichlet Laplacian:  y = A x  with A(i,i)=2, A(i,i±1)=-1.
+__global__ void d_av_laplacian(int n, const double* x, double* y) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    double left  = (i == 0)     ? 0.0 : x[i - 1];
+    double right = (i == n - 1) ? 0.0 : x[i + 1];
+    y[i] = 2.0 * x[i] - left - right;
+}
+
+// Hermitian complex tridiagonal:
+//   H(i,i)   = 2
+//   H(i,i+1) = off          (and H(i+1,i) = conj(off))
+// with off = -1 + 0.25 i. The eigenvalues are real because H is Hermitian.
+__global__ void d_av_hermitian(int n, const cuDoubleComplex* x, cuDoubleComplex* y) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    const cuDoubleComplex off      = make_cuDoubleComplex(-1.0, 0.25);
+    const cuDoubleComplex off_conj = make_cuDoubleComplex(-1.0, -0.25);
+    cuDoubleComplex left  = (i == 0)     ? make_cuDoubleComplex(0.0, 0.0) : cuCmul(off_conj, x[i - 1]);
+    cuDoubleComplex right = (i == n - 1) ? make_cuDoubleComplex(0.0, 0.0) : cuCmul(off,      x[i + 1]);
+    cuDoubleComplex diag  = make_cuDoubleComplex(2.0 * cuCreal(x[i]), 2.0 * cuCimag(x[i]));
+    y[i] = cuCadd(cuCadd(left, diag), right);
+}
+
+static int blocks_for(int n, int threads = 256) { return (n + threads - 1) / threads; }
+
+// -------- Tests --------------------------------------------------------------
+
+TEST_CASE("test_sym_cuda_laplacian", "[cuda]") {
+    REQUIRE_CUDA();
+    const int n = 128, nev = 4, ncv = 12;
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "SM", nev, ncv);
+    s.tol(1e-12).maxiter(500);
+    cudaStream_t stream = s.backend().stream();
+
+    s.solve([&](const double* x, double* y) {
+        d_av_laplacian<<<blocks_for(n), 256, 0, stream>>>(n, x, y);
+    });
+
+    check("Sym<CUDA>: converged", s.converged());
+    check("Sym<CUDA>: nconv >= nev", s.num_converged() >= nev);
+    check("Sym<CUDA>: info == 0", s.info() == 0);
+
+    auto r = s.eigenpairs(false);
+    // seupd does not guarantee a particular order within the converged set;
+    // compare as sorted sets. The nev smallest exact Laplacian eigenvalues
+    // are exact_eig_laplacian(n, 1..nev).
+    std::vector<double> got(r.values.begin(), r.values.end());
+    std::sort(got.begin(), got.end());
+    std::vector<double> want(nev);
+    for (int k = 0; k < nev; ++k) want[k] = exact_eig_laplacian(n, k + 1);
+    std::sort(want.begin(), want.end());
+    double max_err = 0.0;
+    for (int k = 0; k < nev; ++k) max_err = std::max(max_err, std::abs(got[k] - want[k]));
+    check("Sym<CUDA>: eigenvalues match exact to 1e-10", max_err < 1e-10);
+}
+
+TEST_CASE("test_sym_cuda_which_LM", "[cuda]") {
+    REQUIRE_CUDA();
+    const int n = 64, nev = 3, ncv = 10;
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "LM", nev, ncv);
+    s.tol(1e-12).maxiter(500);
+    cudaStream_t stream = s.backend().stream();
+
+    s.solve([&](const double* x, double* y) {
+        d_av_laplacian<<<blocks_for(n), 256, 0, stream>>>(n, x, y);
+    });
+
+    check("Sym<CUDA> which=LM: converged", s.converged());
+    auto r = s.eigenpairs(false);
+    check("Sym<CUDA> which=LM: largest eig > 3.5", r.values[0] > 3.5);
+}
+
+TEST_CASE("test_sym_cuda_eigenvectors", "[cuda]") {
+    REQUIRE_CUDA();
+    const int n = 64, nev = 3, ncv = 10;
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "SM", nev, ncv);
+    s.tol(1e-12).maxiter(500);
+    cudaStream_t stream = s.backend().stream();
+
+    s.solve([&](const double* x, double* y) {
+        d_av_laplacian<<<blocks_for(n), 256, 0, stream>>>(n, x, y);
+    });
+
+    auto r = s.eigenpairs(true);
+    check("Sym<CUDA> evecs: vectors.size() == n*nev", (int)r.vectors.size() == n * nev);
+
+    // Residual check: for each (lambda, v), compute ||A v - lambda v|| / ||v||
+    // on the host (host-side A applied to the host-resident eigenvectors).
+    double max_res = 0.0;
+    std::vector<double> Av(n);
+    for (int k = 0; k < nev; ++k) {
+        const double* vk = &r.vectors[k * n];
+        Av[0] = 2.0 * vk[0] - vk[1];
+        for (int i = 1; i < n - 1; ++i) Av[i] = -vk[i - 1] + 2.0 * vk[i] - vk[i + 1];
+        Av[n - 1] = -vk[n - 2] + 2.0 * vk[n - 1];
+
+        double rnorm = 0.0, vnorm = 0.0;
+        for (int i = 0; i < n; ++i) {
+            double d = Av[i] - r.values[k] * vk[i];
+            rnorm += d * d;
+            vnorm += vk[i] * vk[i];
+        }
+        rnorm = std::sqrt(rnorm / vnorm);
+        if (rnorm > max_res) max_res = rnorm;
+    }
+    check("Sym<CUDA> evecs: max residual < 1e-10", max_res < 1e-10);
+}
+
+TEST_CASE("test_herm_cuda", "[cuda]") {
+    REQUIRE_CUDA();
+    using cplx       = std::complex<double>;
+    const int n      = 128, nev = 4, ncv = 12;
+    arnoldi::Arnoldi<arnoldi::Kind::Herm, cplx, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "SM", nev, ncv);
+    s.tol(1e-12).maxiter(500);
+    cudaStream_t stream = s.backend().stream();
+
+    s.solve([&](const cplx* x, cplx* y) {
+        d_av_hermitian<<<blocks_for(n), 256, 0, stream>>>(
+            n,
+            reinterpret_cast<const cuDoubleComplex*>(x),
+            reinterpret_cast<cuDoubleComplex*>(y));
+    });
+
+    check("Herm<CUDA>: converged", s.converged());
+    check("Herm<CUDA>: nconv >= nev", s.num_converged() >= nev);
+
+    auto r = s.eigenpairs(true);
+    check("Herm<CUDA>: vectors.size() == n*nev", (int)r.vectors.size() == n * nev);
+
+    // Hermitian eigenvalues must be real-valued (the solver returns them
+    // as Real, so this is automatic; just verify finiteness).
+    for (int k = 0; k < nev; ++k)
+        check("Herm<CUDA>: eigenvalue finite", std::isfinite(r.values[k]));
+
+    // Residual: ||H v - lambda v|| / ||v|| on the host.
+    const cplx        off(-1.0, 0.25);
+    std::vector<cplx> Hv(n);
+    double            max_res = 0.0;
+    for (int k = 0; k < nev; ++k) {
+        const cplx* vk = &r.vectors[k * n];
+        Hv[0]          = 2.0 * vk[0] + off * vk[1];
+        for (int i = 1; i < n - 1; ++i)
+            Hv[i] = std::conj(off) * vk[i - 1] + 2.0 * vk[i] + off * vk[i + 1];
+        Hv[n - 1]   = std::conj(off) * vk[n - 2] + 2.0 * vk[n - 1];
+
+        double rnorm = 0.0, vnorm = 0.0;
+        for (int i = 0; i < n; ++i) {
+            cplx d = Hv[i] - r.values[k] * vk[i];
+            rnorm += std::norm(d);
+            vnorm += std::norm(vk[i]);
+        }
+        rnorm = std::sqrt(rnorm / vnorm);
+        if (rnorm > max_res) max_res = rnorm;
+    }
+    check("Herm<CUDA>: max residual < 1e-9", max_res < 1e-9);
+}
+
+TEST_CASE("test_initial_resid_device", "[cuda]") {
+    REQUIRE_CUDA();
+    const int n = 64, nev = 3, ncv = 10;
+
+    // Allocate a device buffer of size n, seed with 1.0 from the host.
+    double* d_resid = nullptr;
+    cudaMalloc(&d_resid, n * sizeof(double));
+    std::vector<double> h_resid(n, 1.0);
+    cudaMemcpy(d_resid, h_resid.data(), n * sizeof(double), cudaMemcpyHostToDevice);
+
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "SM", nev, ncv);
+    s.tol(1e-12).maxiter(500);
+    s.initial_resid_device(d_resid);  // device pointer overload
+    cudaStream_t stream = s.backend().stream();
+
+    s.solve([&](const double* x, double* y) {
+        d_av_laplacian<<<blocks_for(n), 256, 0, stream>>>(n, x, y);
+    });
+
+    check("initial_resid_device: converged", s.converged());
+    auto r = s.eigenpairs(false);
+    // Smallest converged eigenvalue (order within the set is unspecified).
+    double got_min = *std::min_element(r.values.begin(), r.values.end());
+    double err     = std::abs(got_min - exact_eig_laplacian(n, 1));
+    check("initial_resid_device: smallest eigenvalue accurate", err < 1e-10);
+
+    cudaFree(d_resid);
+}
+
+TEST_CASE("test_eigenpairs_device", "[cuda]") {
+    REQUIRE_CUDA();
+    const int n = 64, nev = 3, ncv = 10;
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> s("I", n, "SM", nev, ncv);
+    s.tol(1e-12).maxiter(500);
+    cudaStream_t stream = s.backend().stream();
+
+    s.solve([&](const double* x, double* y) {
+        d_av_laplacian<<<blocks_for(n), 256, 0, stream>>>(n, x, y);
+    });
+    check("eigenpairs_device: converged", s.converged());
+
+    // Eigenvectors stay on the device; eigenvalues come back on the host.
+    auto dr = s.eigenpairs_device(true);
+    check("eigenpairs_device: values.size() == nev", (int)dr.values.size() == nev);
+
+    // Pull the device eigenvectors down ourselves and residual-check.
+    std::vector<double> hv(n * nev);
+    cudaMemcpy(hv.data(), dr.vectors.data(), n * nev * sizeof(double), cudaMemcpyDeviceToHost);
+
+    double              max_res = 0.0;
+    std::vector<double> Av(n);
+    for (int k = 0; k < nev; ++k) {
+        const double* vk = &hv[k * n];
+        Av[0]            = 2.0 * vk[0] - vk[1];
+        for (int i = 1; i < n - 1; ++i) Av[i] = -vk[i - 1] + 2.0 * vk[i] - vk[i + 1];
+        Av[n - 1]   = -vk[n - 2] + 2.0 * vk[n - 1];
+        double rn = 0.0, vn = 0.0;
+        for (int i = 0; i < n; ++i) {
+            double d = Av[i] - dr.values[k] * vk[i];
+            rn += d * d;
+            vn += vk[i] * vk[i];
+        }
+        max_res = std::max(max_res, std::sqrt(rn / vn));
+    }
+    check("eigenpairs_device: device-evec residual < 1e-10", max_res < 1e-10);
+}
diff --git a/tests/test_validation.cpp b/tests/test_validation.cpp
index bea7541..6032c8b 100644
--- a/tests/test_validation.cpp
+++ b/tests/test_validation.cpp
@@ -7,139 +7,112 @@
 #include <stdexcept>
 #include <string>
 
-static int g_pass = 0, g_fail = 0;
+#include <catch2/catch_test_macros.hpp>
 
-template<typename F>
+// Catch2-backed expectation helpers. Signatures unchanged so the existing
+// call sites need no edits; assertions are recorded against the running
+// TEST_CASE. expect_throw requires a std::invalid_argument whose message
+// contains `substr` (when non-empty).
+template <typename F>
 void expect_throw(const char* name, F&& f, const std::string& substr = "") {
+    INFO(name);
     try {
         f();
-        std::printf("  FAIL  %s — no exception thrown\n", name);
-        ++g_fail;
+        FAIL("no exception thrown");
     } catch (const std::invalid_argument& e) {
-        if (!substr.empty() && std::string(e.what()).find(substr) == std::string::npos) {
-            std::printf("  FAIL  %s — wrong message: %s\n", name, e.what());
-            ++g_fail;
-        } else {
-            std::printf("  OK    %s\n", name);
-            ++g_pass;
-        }
+        INFO("message: " << e.what());
+        if (!substr.empty())
+            CHECK(std::string(e.what()).find(substr) != std::string::npos);
+        else
+            SUCCEED();
     } catch (const std::exception& e) {
-        std::printf("  FAIL  %s — unexpected exception: %s\n", name, e.what());
-        ++g_fail;
+        FAIL("unexpected exception: " << e.what());
     }
 }
 
-template<typename F>
+template <typename F>
 void expect_no_throw(const char* name, F&& f) {
+    INFO(name);
     try {
         f();
-        std::printf("  OK    %s\n", name);
-        ++g_pass;
+        SUCCEED();
     } catch (const std::exception& e) {
-        std::printf("  FAIL  %s — exception: %s\n", name, e.what());
-        ++g_fail;
+        FAIL("exception: " << e.what());
     }
 }
 
-void test_n_zero() {
+TEST_CASE("test_n_zero", "[validation]") {
     expect_throw("Sym: n=0", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 0, "LM", 1, 3); }, "n must be");
 }
-void test_n_negative() {
+TEST_CASE("test_n_negative", "[validation]") {
     expect_throw("Sym: n=-1", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", -1, "LM", 1, 3); }, "n must be");
 }
 
-void test_nev_zero() {
+TEST_CASE("test_nev_zero", "[validation]") {
     expect_throw("Sym: nev=0", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 10, "LM", 0, 5); }, "nev must be");
 }
-void test_nev_negative() {
+TEST_CASE("test_nev_negative", "[validation]") {
     expect_throw("Nonsym: nev=-1", []{ arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double>("I", 10, "LM", -1, 5); }, "nev must be");
 }
 
-void test_ncv_too_large() {
+TEST_CASE("test_ncv_too_large", "[validation]") {
     expect_throw("Sym: ncv > n", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 5, "LM", 2, 10); }, "ncv must be <= global");
 }
-void test_ncv_sym_too_small() {
+TEST_CASE("test_ncv_sym_too_small", "[validation]") {
     expect_throw("Sym: ncv == nev", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 20, "LM", 5, 5); }, "ncv must satisfy");
 }
-void test_ncv_nonsym_too_small() {
+TEST_CASE("test_ncv_nonsym_too_small", "[validation]") {
     expect_throw("Nonsym: ncv == nev+1", []{ arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double>("I", 20, "LM", 4, 5); }, "ncv must satisfy");
 }
-void test_ncv_nonsym_boundary() {
+TEST_CASE("test_ncv_nonsym_boundary", "[validation]") {
     expect_no_throw("Nonsym: ncv == nev+2 (valid)", []{ arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double>("I", 20, "LM", 4, 6); });
 }
-void test_ncv_sym_boundary() {
+TEST_CASE("test_ncv_sym_boundary", "[validation]") {
     expect_no_throw("Sym: ncv == nev+1 (valid)", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 20, "LM", 5, 6); });
 }
 
-void test_bmat_invalid() {
+TEST_CASE("test_bmat_invalid", "[validation]") {
     expect_throw("Sym: bmat='X'", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("X", 10, "LM", 2, 5); }, "bmat must be");
 }
-void test_bmat_empty() {
+TEST_CASE("test_bmat_empty", "[validation]") {
     expect_throw("Sym: bmat=''", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("", 10, "LM", 2, 5); }, "bmat must be");
 }
-void test_bmat_generalized() {
+TEST_CASE("test_bmat_generalized", "[validation]") {
     expect_no_throw("Sym: bmat='G' (valid)", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("G", 10, "LM", 2, 5); });
 }
 
-void test_which_sym_invalid() {
+TEST_CASE("test_which_sym_invalid", "[validation]") {
     expect_throw("Sym: which='XX'", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 10, "XX", 2, 5); }, "which must be");
 }
-void test_which_sym_LI() {
+TEST_CASE("test_which_sym_LI", "[validation]") {
     // LI is valid for Nonsym but not Sym.
     expect_throw("Sym: which='LI'", []{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 10, "LI", 2, 5); }, "which must be");
 }
-void test_which_sym_all_valid() {
+TEST_CASE("test_which_sym_all_valid", "[validation]") {
     for (auto w : {"LM", "SM", "LA", "SA", "BE"}) {
         std::string name = std::string("Sym: which='") + w + "'";
         expect_no_throw(name.c_str(), [w]{ arnoldi::Arnoldi<arnoldi::Kind::Sym, double>("I", 20, w, 2, 5); });
     }
 }
 
-void test_which_nonsym_invalid() {
+TEST_CASE("test_which_nonsym_invalid", "[validation]") {
     expect_throw("Nonsym: which='LA'", []{ arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double>("I", 20, "LA", 2, 5); }, "which must be");
 }
-void test_which_nonsym_all_valid() {
+TEST_CASE("test_which_nonsym_all_valid", "[validation]") {
     for (auto w : {"LM", "SM", "LR", "SR", "LI", "SI"}) {
         std::string name = std::string("Nonsym: which='") + w + "'";
         expect_no_throw(name.c_str(), [w]{ arnoldi::Arnoldi<arnoldi::Kind::Nonsym, double>("I", 20, w, 2, 5); });
     }
 }
 
-void test_which_herm_invalid() {
+TEST_CASE("test_which_herm_invalid", "[validation]") {
     using cplx = std::complex<double>;
     expect_throw("Herm: which='LI'", []{ arnoldi::Arnoldi<arnoldi::Kind::Herm, cplx>("I", 20, "LI", 2, 5); }, "which must be");
 }
-void test_which_herm_all_valid() {
+TEST_CASE("test_which_herm_all_valid", "[validation]") {
     using cplx = std::complex<double>;
     for (auto w : {"LM", "SM", "LA", "SA", "BE"}) {
         std::string name = std::string("Herm: which='") + w + "'";
         expect_no_throw(name.c_str(), [w]{ arnoldi::Arnoldi<arnoldi::Kind::Herm, cplx>("I", 20, w, 2, 5); });
     }
 }
-
-int main() {
-    std::printf("test_validation:\n");
-
-    test_n_zero();
-    test_n_negative();
-    test_nev_zero();
-    test_nev_negative();
-    test_ncv_too_large();
-    test_ncv_sym_too_small();
-    test_ncv_nonsym_too_small();
-    test_ncv_nonsym_boundary();
-    test_ncv_sym_boundary();
-    test_bmat_invalid();
-    test_bmat_empty();
-    test_bmat_generalized();
-    test_which_sym_invalid();
-    test_which_sym_LI();
-    test_which_sym_all_valid();
-    test_which_nonsym_invalid();
-    test_which_nonsym_all_valid();
-    test_which_herm_invalid();
-    test_which_herm_all_valid();
-
-    std::printf("\n%d passed, %d failed\n", g_pass, g_fail);
-    return g_fail > 0 ? 1 : 0;
-}