Q-solvers · iskakoff · May 19, 2026 · May 17, 2026 · May 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -42,7 +42,7 @@
 # Build directories
 build/
 Build/
-build-*/
+*build*/
 /cmake-build-*/
 
 # CMake generated files

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 
 option(ARNOLDI_BUILD_EXAMPLES "Build example programs" OFF)
 option(ARNOLDI_REGRESSION "Build regression benchmark against original ARPACK (requires arpackng)" OFF)
+option(ARNOLDI_USE_CUDA "Build with CUDA + cuBLAS backend (Sym / Herm only)" OFF)
 
 find_package(BLAS REQUIRED)
 find_package(LAPACK REQUIRED)
@@ -25,6 +26,14 @@ else()
     target_link_libraries(arnoldi INTERFACE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()
 
+if(ARNOLDI_USE_CUDA)
+    # cuBLAS / cudart are C-API libraries; no nvcc required for the wrappers.
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(arnoldi INTERFACE CUDA::cudart CUDA::cublas)
+    target_compile_definitions(arnoldi INTERFACE ARNOLDI_USE_CUDA)
+    message(STATUS "Arnoldi: CUDA backend enabled (cuBLAS=${CUDAToolkit_VERSION})")
+endif()
+
 include(GNUInstallDirs)
 install(TARGETS arnoldi EXPORT arnoldiTargets)
 install(DIRECTORY include/arnoldi DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

diff --git a/README.md b/README.md
@@ -9,6 +9,8 @@ Header-only API for a C++ port of ARPACK’s symmetric and nonsymmetric eigenval
 
 - C++17 compiler
 - BLAS and LAPACK (Fortran calling convention, `_` suffix)
+- *(optional, GPU backend)* CUDA Toolkit with cuBLAS — see [GPU (CUDA) backend](#gpu-cuda-backend)
+- *(optional, tests)* none to install: Catch2 v3 is fetched automatically via CMake `FetchContent` (needs network on first configure)
 
 ## Build (library interface + examples)
 
@@ -19,6 +21,20 @@ cmake --build build
 ./build/examples/arpack_cpp_examples dnsimp   # Fortran EXAMPLES catalog (see examples/README.md)
 ```
 
+Options:
+
+| Option | Default | Effect |
+| --- | --- | --- |
+| `ARNOLDI_BUILD_EXAMPLES` | OFF | build the `examples/` programs |
+| `ARNOLDI_BUILD_TESTS` | OFF | build the Catch2 test suite (`ctest`) |
+| `ARNOLDI_USE_CUDA` | OFF | enable the CUDA + cuBLAS backend (Sym / Herm) |
+
+```bash
+# tests
+cmake -S . -B build -DARNOLDI_BUILD_TESTS=ON
+cmake --build build && ctest --test-dir build
+```
+
 ## Use in your project
 
 ```cmake
@@ -31,11 +47,69 @@ target_link_libraries(your_target PRIVATE arpack::callback)
 // arnoldi::detail::naupd, neupd, saupd, seupd, ...
 ```
 
+## GPU (CUDA) backend
+
+`Arnoldi<K, Scalar, Comm, Backend>` is templated on a backend. The default
+`CpuBackend` runs every length-`n` BLAS on the host (unchanged behaviour).
+`CudaBackend` keeps the Lanczos workspace (`resid`, `v`, `workd`) in device
+memory and routes those operations through cuBLAS, so the **matvec runs
+entirely on the GPU**. Supported for `Kind::Sym` and `Kind::Herm` (Nonsym on
+device is rejected at compile time — no cuSOLVER analogue for its real-Schur
+path). The small implicitly-restarted shift machinery (`ncv²`-sized `h`,
+`q`, `workl`) stays host-resident by design.
+
+Enable it:
+
+```bash
+cmake -S . -B build -DARNOLDI_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=<your_sm>          # e.g. 61 for Quadro P1000
+cmake --build build
+```
+
+`cuda.hpp` is plain C++ calling the cuBLAS C API — only the `.cu`
+tests/examples need `nvcc`. If your system compiler is newer than what your
+`nvcc` supports (e.g. CUDA 12.1 + GCC ≥ 13), point `nvcc` at an older host
+compiler:
+
+```bash
+cmake -S . -B build -DARNOLDI_USE_CUDA=ON \
+      -DCMAKE_CUDA_HOST_COMPILER=/path/to/g++-12 \
+      -DCMAKE_CUDA_ARCHITECTURES=61
+```
+
+Usage — the matvec callback receives **device** pointers and should launch on
+the backend's stream:
+
+```cpp
+#include <arnoldi/arnoldi.hpp>
+#include <arnoldi/cuda.hpp>
+
+arnoldi::Arnoldi<arnoldi::Kind::Sym, double,
+                 arnoldi::SerialComm, arnoldi::CudaBackend>
+    s("I", n, "LM", nev, ncv);
+s.tol(1e-10).maxiter(1000);
+
+cudaStream_t stream = s.backend().stream();
+s.solve([&](const double* d_x, double* d_y) {       // device pointers
+    my_matvec_kernel<<<blocks, threads, 0, stream>>>(n, d_x, d_y);
+});
+
+auto r  = s.eigenpairs();          // values + vectors copied to host
+auto dr = s.eigenpairs_device();   // eigenvectors left on the GPU
+```
+
+Also available under a device backend: `s.initial_resid_device(devptr)` to
+seed from a device-resident vector, and `s.backend().handle()` for the cuBLAS
+handle. See `examples/arnoldi_sym_cuda.cu` (and `arnoldi_sym_cpu.cpp` for the
+matching CPU baseline / benchmark).
+
 ## Layout
 
 - `include/arnoldi/arnoldi.hpp` — include this
+- `include/arnoldi/cuda.hpp` — opt-in CUDA backend (`-DARNOLDI_USE_CUDA=ON`)
 - `include/arnoldi/detail/` — implementation headers
-- `examples/` — `example_symmetric_laplacian`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog)
+- `examples/` — `example_symmetric_laplacian`, `arnoldi_sym_cuda` / `arnoldi_sym_cpu`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog)
+- `tests/` — Catch2 suite (`ctest`), incl. `test_solver_cuda` when CUDA is on
 
 ## Origin
 

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -1,9 +1,23 @@
 # Included only when ARNOLDI_BUILD_EXAMPLES=ON (see parent CMakeLists.txt).
 
+if(ARNOLDI_USE_CUDA AND NOT CMAKE_CUDA_ARCHITECTURES)
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
+        set(CMAKE_CUDA_ARCHITECTURES native)
+    else()
+        set(CMAKE_CUDA_ARCHITECTURES 61)  # Pascal — safe minimum baseline
+    endif()
+    message(STATUS "Arnoldi examples: CMAKE_CUDA_ARCHITECTURES defaulted to "
+        "${CMAKE_CUDA_ARCHITECTURES} (override with -DCMAKE_CUDA_ARCHITECTURES=<cc>)")
+endif()
+
 if(ARNOLDI_BUILD_EXAMPLES)
     add_executable(example_symmetric_laplacian symmetric_laplacian.cpp)
     target_link_libraries(example_symmetric_laplacian PRIVATE arnoldi)
 
+    # CPU twin of arnoldi_sym_cuda — same problem, for backend benchmarking.
+    add_executable(example_arnoldi_sym_cpu arnoldi_sym_cpu.cpp)
+    target_link_libraries(example_arnoldi_sym_cpu PRIVATE arnoldi)
+
     # Class-based Arnoldi<Kind, Scalar> demos (one per kind)
     foreach(_demo IN ITEMS arnoldi_sym arnoldi_nonsym arnoldi_herm)
         add_executable(example_${_demo} ${_demo}.cpp)
@@ -18,6 +32,16 @@ if(ARNOLDI_BUILD_EXAMPLES)
         target_link_libraries(example_${_demo} PRIVATE arnoldi)
     endforeach()
 
+    # CUDA device-matvec example (only when the CUDA backend is enabled)
+    if(ARNOLDI_USE_CUDA)
+        enable_language(CUDA)
+        add_executable(example_arnoldi_sym_cuda arnoldi_sym_cuda.cu)
+        set_target_properties(example_arnoldi_sym_cuda PROPERTIES
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON)
+        target_link_libraries(example_arnoldi_sym_cuda PRIVATE arnoldi)
+    endif()
+
     # MPI parallel example (optional — only if MPI found)
     find_package(MPI COMPONENTS CXX QUIET)
     if(MPI_CXX_FOUND)
@@ -30,8 +54,22 @@ endif()
 # Performance + correctness benchmark against original ARPACK
 if(ARNOLDI_BUILD_EXAMPLES AND ORIGINAL_ARPACK_LIB)
     enable_language(Fortran)
+
+    # With the CUDA backend the benchmark adds a GPU-vs-Fortran tier with
+    # device matvec kernels (__global__ / <<<>>>), so the .cpp must be
+    # compiled by nvcc.
+    if(ARNOLDI_USE_CUDA)
+        enable_language(CUDA)
+        set_source_files_properties(benchmark.cpp PROPERTIES LANGUAGE CUDA)
+    endif()
+
     add_executable(bench_arpack benchmark.cpp)
     target_include_directories(bench_arpack PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+    if(ARNOLDI_USE_CUDA)
+        set_target_properties(bench_arpack PROPERTIES
+            CUDA_STANDARD 17
+            CUDA_STANDARD_REQUIRED ON)
+    endif()
     if(TARGET BLAS::BLAS AND TARGET LAPACK::LAPACK)
         target_link_libraries(bench_arpack PRIVATE
             arnoldi ${ORIGINAL_ARPACK_LIB} LAPACK::LAPACK BLAS::BLAS)

diff --git a/examples/arnoldi_sym_cpu.cpp b/examples/arnoldi_sym_cpu.cpp
@@ -0,0 +1,52 @@
+// arnoldi_sym_cpu — CPU twin of examples/arnoldi_sym_cuda.cu.
+//
+// Same problem (diagonal operator A(i,i) = i+1, n = 8192, nev = 5,
+// ncv = 8*nev, "LM", tol 1e-10) solved with the default CpuBackend and a
+// host matvec. Times the solve() call so it can be compared directly
+// against the CUDA example as a backend performance benchmark.
+//
+// Build: configure with -DARNOLDI_BUILD_EXAMPLES=ON (no CUDA needed).
+
+#include <arnoldi/arnoldi.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <functional>
+#include <vector>
+
+int main() {
+    const int n   = 1 << 18;  // 8192 — same size as the CUDA example
+    const int nev = 5;
+    const int ncv = 8 * nev;  // matches arnoldi_sym_cuda.cu
+
+    // "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1.
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double> solver("I", n, "LM", nev, ncv);
+    solver.tol(1e-10).maxiter(1000);
+
+    auto t0 = std::chrono::steady_clock::now();
+    solver.solve([n](const double* x, double* y) {
+        for (int i = 0; i < n; ++i) y[i] = static_cast<double>(i + 1) * x[i];
+    });
+    auto t1     = std::chrono::steady_clock::now();
+    double secs = std::chrono::duration<double>(t1 - t0).count();
+
+    if (!solver.converged()) {
+        std::printf("CPU saupd failed, info = %d\n", solver.info());
+        return 1;
+    }
+
+    auto r = solver.eigenpairs(false);  // values only
+    std::sort(r.values.begin(), r.values.end(), std::greater<double>());
+    std::printf("Largest %d eigenvalues of diag(1..%d), CPU backend:\n", nev, n);
+    for (int i = 0; i < nev; ++i) {
+        double exact = static_cast<double>(n - i);
+        std::printf("  lambda[%d] = %.6f   (exact %.0f, |err| = %.2e)\n",
+                    i, r.values[i], exact, std::fabs(r.values[i] - exact));
+    }
+    std::printf("iterations=%d, OP applies=%d\n",
+                solver.num_iterations(), solver.num_op_applies());
+    std::printf("solve() wall time: %.4f s\n", secs);
+    return 0;
+}
diff --git a/examples/arnoldi_sym_cuda.cu b/examples/arnoldi_sym_cuda.cu
@@ -0,0 +1,94 @@
+// arnoldi_sym_cuda — smallest eigenvalues of the 1D Dirichlet Laplacian
+// with the matvec running entirely on the GPU.
+//
+// Demonstrates the CudaBackend device-matvec pattern:
+//   - Arnoldi<Kind::Sym, double, SerialComm, CudaBackend> keeps the Lanczos
+//     workspace (resid/v/workd) in device memory.
+//   - The user matvec callback receives DEVICE pointers and launches a CUDA
+//     kernel on the backend's stream.
+//   - eigenpairs() copies results to the host; eigenpairs_device() leaves
+//     the eigenvectors on the GPU for downstream device work.
+//
+// Build: configure with -DARNOLDI_USE_CUDA=ON -DARNOLDI_BUILD_EXAMPLES=ON.
+
+#include <arnoldi/arnoldi.hpp>
+#include <arnoldi/cuda.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <functional>
+#include <vector>
+
+// y = A x for the diagonal operator A(i,i) = i+1. The spectrum {1,2,...,n}
+// is perfectly separated, so IRA converges fast at any n — a clean showcase
+// of a large GPU matvec with exactly-known eigenvalues. (The tridiagonal
+// Laplacian is clustered at both spectral ends for large n and would need
+// shift-invert to target either end quickly — out of scope for a demo.)
+__global__ void diag_matvec(int n, const double* x, double* y) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    y[i] = static_cast<double>(i + 1) * x[i];
+}
+
+int main() {
+    int dev_count = 0;
+    cudaGetDeviceCount(&dev_count);
+    if (dev_count == 0) {
+        std::printf("no CUDA device available\n");
+        return 0;
+    }
+
+    const int n   = 1 << 18;  // 8192 — a non-trivial GPU matvec
+    const int nev = 5;
+    const int ncv = 8 * nev;  // larger Krylov subspace: the relative gap of
+                              // the top eigenvalues is ~1/n, so a bigger ncv
+                              // keeps IRA convergence fast at large n
+
+    // "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1.
+    arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
+                     arnoldi::CudaBackend> solver("I", n, "LM", nev, ncv);
+    solver.tol(1e-10).maxiter(1000);
+
+    cudaStream_t stream  = solver.backend().stream();
+    const int    threads = 256;
+    const int    blocks  = (n + threads - 1) / threads;
+
+    // x, y are device pointers (workspace lives on the GPU). Launch on the
+    // backend's stream so the surrounding cuBLAS calls order correctly.
+    auto t0 = std::chrono::steady_clock::now();
+    solver.solve([&](const double* x, double* y) {
+        diag_matvec<<<blocks, threads, 0, stream>>>(n, x, y);
+    });
+    cudaStreamSynchronize(stream);  // ensure all device work is finished
+    auto t1     = std::chrono::steady_clock::now();
+    double secs = std::chrono::duration<double>(t1 - t0).count();
+
+    if (!solver.converged()) {
+        std::printf("CUDA saupd failed, info = %d\n", solver.info());
+        return 1;
+    }
+
+    auto r = solver.eigenpairs(false);  // values only (host)
+    // eigenpairs(false) returns the wanted set in which-selection order, not
+    // sorted by value — sort descending for a clean presentation.
+    std::sort(r.values.begin(), r.values.end(), std::greater<double>());
+    std::printf("Largest %d eigenvalues of diag(1..%d), CUDA backend:\n", nev, n);
+    for (int i = 0; i < nev; ++i) {
+        double exact = static_cast<double>(n - i);  // n, n-1, ..., n-nev+1
+        std::printf("  lambda[%d] = %.6f   (exact %.0f, |err| = %.2e)\n",
+                    i, r.values[i], exact, std::fabs(r.values[i] - exact));
+    }
+    std::printf("iterations=%d, OP applies=%d\n",
+                solver.num_iterations(), solver.num_op_applies());
+    std::printf("solve() wall time: %.4f s\n", secs);
+
+    // Eigenvectors can also be kept on the device for further GPU work.
+    auto dr = solver.eigenpairs_device(true);
+    std::printf("eigenpairs_device: %d eigenvalues on host, "
+                "%zu-element eigenvector matrix left on device\n",
+                (int)dr.values.size(), (size_t)(n) * nev);
+    return 0;
+}