diff --git a/.gitignore b/.gitignore index 103505f..9115599 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,7 @@ # Build directories build/ Build/ -build-*/ +*build*/ /cmake-build-*/ # CMake generated files diff --git a/CMakeLists.txt b/CMakeLists.txt index 9739d57..23b1bc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) option(ARNOLDI_BUILD_EXAMPLES "Build example programs" OFF) option(ARNOLDI_REGRESSION "Build regression benchmark against original ARPACK (requires arpackng)" OFF) +option(ARNOLDI_USE_CUDA "Build with CUDA + cuBLAS backend (Sym / Herm only)" OFF) find_package(BLAS REQUIRED) find_package(LAPACK REQUIRED) @@ -25,6 +26,14 @@ else() target_link_libraries(arnoldi INTERFACE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) endif() +if(ARNOLDI_USE_CUDA) + # cuBLAS / cudart are C-API libraries; no nvcc required for the wrappers. + find_package(CUDAToolkit REQUIRED) + target_link_libraries(arnoldi INTERFACE CUDA::cudart CUDA::cublas) + target_compile_definitions(arnoldi INTERFACE ARNOLDI_USE_CUDA) + message(STATUS "Arnoldi: CUDA backend enabled (cuBLAS=${CUDAToolkit_VERSION})") +endif() + include(GNUInstallDirs) install(TARGETS arnoldi EXPORT arnoldiTargets) install(DIRECTORY include/arnoldi DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/README.md b/README.md index 6bf10c1..ab088ad 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ Header-only API for a C++ port of ARPACK’s symmetric and nonsymmetric eigenval - C++17 compiler - BLAS and LAPACK (Fortran calling convention, `_` suffix) +- *(optional, GPU backend)* CUDA Toolkit with cuBLAS — see [GPU (CUDA) backend](#gpu-cuda-backend) +- *(optional, tests)* none to install: Catch2 v3 is fetched automatically via CMake `FetchContent` (needs network on first configure) ## Build (library interface + examples) @@ -19,6 +21,20 @@ cmake --build build ./build/examples/arpack_cpp_examples dnsimp # Fortran EXAMPLES catalog (see examples/README.md) ``` +Options: + +| Option | Default | Effect | +| --- | --- | --- | +| `ARNOLDI_BUILD_EXAMPLES` | OFF | build the `examples/` programs | +| `ARNOLDI_BUILD_TESTS` | OFF | build the Catch2 test suite (`ctest`) | +| `ARNOLDI_USE_CUDA` | OFF | enable the CUDA + cuBLAS backend (Sym / Herm) | + +```bash +# tests +cmake -S . -B build -DARNOLDI_BUILD_TESTS=ON +cmake --build build && ctest --test-dir build +``` + ## Use in your project ```cmake @@ -31,11 +47,69 @@ target_link_libraries(your_target PRIVATE arpack::callback) // arnoldi::detail::naupd, neupd, saupd, seupd, ... ``` +## GPU (CUDA) backend + +`Arnoldi` is templated on a backend. The default +`CpuBackend` runs every length-`n` BLAS on the host (unchanged behaviour). +`CudaBackend` keeps the Lanczos workspace (`resid`, `v`, `workd`) in device +memory and routes those operations through cuBLAS, so the **matvec runs +entirely on the GPU**. Supported for `Kind::Sym` and `Kind::Herm` (Nonsym on +device is rejected at compile time — no cuSOLVER analogue for its real-Schur +path). The small implicitly-restarted shift machinery (`ncv²`-sized `h`, +`q`, `workl`) stays host-resident by design. + +Enable it: + +```bash +cmake -S . -B build -DARNOLDI_USE_CUDA=ON \ + -DCMAKE_CUDA_ARCHITECTURES= # e.g. 61 for Quadro P1000 +cmake --build build +``` + +`cuda.hpp` is plain C++ calling the cuBLAS C API — only the `.cu` +tests/examples need `nvcc`. If your system compiler is newer than what your +`nvcc` supports (e.g. CUDA 12.1 + GCC ≥ 13), point `nvcc` at an older host +compiler: + +```bash +cmake -S . -B build -DARNOLDI_USE_CUDA=ON \ + -DCMAKE_CUDA_HOST_COMPILER=/path/to/g++-12 \ + -DCMAKE_CUDA_ARCHITECTURES=61 +``` + +Usage — the matvec callback receives **device** pointers and should launch on +the backend's stream: + +```cpp +#include +#include + +arnoldi::Arnoldi + s("I", n, "LM", nev, ncv); +s.tol(1e-10).maxiter(1000); + +cudaStream_t stream = s.backend().stream(); +s.solve([&](const double* d_x, double* d_y) { // device pointers + my_matvec_kernel<<>>(n, d_x, d_y); +}); + +auto r = s.eigenpairs(); // values + vectors copied to host +auto dr = s.eigenpairs_device(); // eigenvectors left on the GPU +``` + +Also available under a device backend: `s.initial_resid_device(devptr)` to +seed from a device-resident vector, and `s.backend().handle()` for the cuBLAS +handle. See `examples/arnoldi_sym_cuda.cu` (and `arnoldi_sym_cpu.cpp` for the +matching CPU baseline / benchmark). + ## Layout - `include/arnoldi/arnoldi.hpp` — include this +- `include/arnoldi/cuda.hpp` — opt-in CUDA backend (`-DARNOLDI_USE_CUDA=ON`) - `include/arnoldi/detail/` — implementation headers -- `examples/` — `example_symmetric_laplacian`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog) +- `examples/` — `example_symmetric_laplacian`, `arnoldi_sym_cuda` / `arnoldi_sym_cpu`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog) +- `tests/` — Catch2 suite (`ctest`), incl. `test_solver_cuda` when CUDA is on ## Origin diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d987cf5..dc9c9cd 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,9 +1,23 @@ # Included only when ARNOLDI_BUILD_EXAMPLES=ON (see parent CMakeLists.txt). +if(ARNOLDI_USE_CUDA AND NOT CMAKE_CUDA_ARCHITECTURES) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24) + set(CMAKE_CUDA_ARCHITECTURES native) + else() + set(CMAKE_CUDA_ARCHITECTURES 61) # Pascal — safe minimum baseline + endif() + message(STATUS "Arnoldi examples: CMAKE_CUDA_ARCHITECTURES defaulted to " + "${CMAKE_CUDA_ARCHITECTURES} (override with -DCMAKE_CUDA_ARCHITECTURES=)") +endif() + if(ARNOLDI_BUILD_EXAMPLES) add_executable(example_symmetric_laplacian symmetric_laplacian.cpp) target_link_libraries(example_symmetric_laplacian PRIVATE arnoldi) + # CPU twin of arnoldi_sym_cuda — same problem, for backend benchmarking. + add_executable(example_arnoldi_sym_cpu arnoldi_sym_cpu.cpp) + target_link_libraries(example_arnoldi_sym_cpu PRIVATE arnoldi) + # Class-based Arnoldi demos (one per kind) foreach(_demo IN ITEMS arnoldi_sym arnoldi_nonsym arnoldi_herm) add_executable(example_${_demo} ${_demo}.cpp) @@ -18,6 +32,16 @@ if(ARNOLDI_BUILD_EXAMPLES) target_link_libraries(example_${_demo} PRIVATE arnoldi) endforeach() + # CUDA device-matvec example (only when the CUDA backend is enabled) + if(ARNOLDI_USE_CUDA) + enable_language(CUDA) + add_executable(example_arnoldi_sym_cuda arnoldi_sym_cuda.cu) + set_target_properties(example_arnoldi_sym_cuda PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) + target_link_libraries(example_arnoldi_sym_cuda PRIVATE arnoldi) + endif() + # MPI parallel example (optional — only if MPI found) find_package(MPI COMPONENTS CXX QUIET) if(MPI_CXX_FOUND) @@ -30,8 +54,22 @@ endif() # Performance + correctness benchmark against original ARPACK if(ARNOLDI_BUILD_EXAMPLES AND ORIGINAL_ARPACK_LIB) enable_language(Fortran) + + # With the CUDA backend the benchmark adds a GPU-vs-Fortran tier with + # device matvec kernels (__global__ / <<<>>>), so the .cpp must be + # compiled by nvcc. + if(ARNOLDI_USE_CUDA) + enable_language(CUDA) + set_source_files_properties(benchmark.cpp PROPERTIES LANGUAGE CUDA) + endif() + add_executable(bench_arpack benchmark.cpp) target_include_directories(bench_arpack PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + if(ARNOLDI_USE_CUDA) + set_target_properties(bench_arpack PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) + endif() if(TARGET BLAS::BLAS AND TARGET LAPACK::LAPACK) target_link_libraries(bench_arpack PRIVATE arnoldi ${ORIGINAL_ARPACK_LIB} LAPACK::LAPACK BLAS::BLAS) diff --git a/examples/arnoldi_sym_cpu.cpp b/examples/arnoldi_sym_cpu.cpp new file mode 100644 index 0000000..88fb97e --- /dev/null +++ b/examples/arnoldi_sym_cpu.cpp @@ -0,0 +1,52 @@ +// arnoldi_sym_cpu — CPU twin of examples/arnoldi_sym_cuda.cu. +// +// Same problem (diagonal operator A(i,i) = i+1, n = 8192, nev = 5, +// ncv = 8*nev, "LM", tol 1e-10) solved with the default CpuBackend and a +// host matvec. Times the solve() call so it can be compared directly +// against the CUDA example as a backend performance benchmark. +// +// Build: configure with -DARNOLDI_BUILD_EXAMPLES=ON (no CUDA needed). + +#include + +#include +#include +#include +#include +#include +#include + +int main() { + const int n = 1 << 18; // 8192 — same size as the CUDA example + const int nev = 5; + const int ncv = 8 * nev; // matches arnoldi_sym_cuda.cu + + // "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1. + arnoldi::Arnoldi solver("I", n, "LM", nev, ncv); + solver.tol(1e-10).maxiter(1000); + + auto t0 = std::chrono::steady_clock::now(); + solver.solve([n](const double* x, double* y) { + for (int i = 0; i < n; ++i) y[i] = static_cast(i + 1) * x[i]; + }); + auto t1 = std::chrono::steady_clock::now(); + double secs = std::chrono::duration(t1 - t0).count(); + + if (!solver.converged()) { + std::printf("CPU saupd failed, info = %d\n", solver.info()); + return 1; + } + + auto r = solver.eigenpairs(false); // values only + std::sort(r.values.begin(), r.values.end(), std::greater()); + std::printf("Largest %d eigenvalues of diag(1..%d), CPU backend:\n", nev, n); + for (int i = 0; i < nev; ++i) { + double exact = static_cast(n - i); + std::printf(" lambda[%d] = %.6f (exact %.0f, |err| = %.2e)\n", + i, r.values[i], exact, std::fabs(r.values[i] - exact)); + } + std::printf("iterations=%d, OP applies=%d\n", + solver.num_iterations(), solver.num_op_applies()); + std::printf("solve() wall time: %.4f s\n", secs); + return 0; +} diff --git a/examples/arnoldi_sym_cuda.cu b/examples/arnoldi_sym_cuda.cu new file mode 100644 index 0000000..bf5b1ec --- /dev/null +++ b/examples/arnoldi_sym_cuda.cu @@ -0,0 +1,94 @@ +// arnoldi_sym_cuda — smallest eigenvalues of the 1D Dirichlet Laplacian +// with the matvec running entirely on the GPU. +// +// Demonstrates the CudaBackend device-matvec pattern: +// - Arnoldi keeps the Lanczos +// workspace (resid/v/workd) in device memory. +// - The user matvec callback receives DEVICE pointers and launches a CUDA +// kernel on the backend's stream. +// - eigenpairs() copies results to the host; eigenpairs_device() leaves +// the eigenvectors on the GPU for downstream device work. +// +// Build: configure with -DARNOLDI_USE_CUDA=ON -DARNOLDI_BUILD_EXAMPLES=ON. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// y = A x for the diagonal operator A(i,i) = i+1. The spectrum {1,2,...,n} +// is perfectly separated, so IRA converges fast at any n — a clean showcase +// of a large GPU matvec with exactly-known eigenvalues. (The tridiagonal +// Laplacian is clustered at both spectral ends for large n and would need +// shift-invert to target either end quickly — out of scope for a demo.) +__global__ void diag_matvec(int n, const double* x, double* y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + y[i] = static_cast(i + 1) * x[i]; +} + +int main() { + int dev_count = 0; + cudaGetDeviceCount(&dev_count); + if (dev_count == 0) { + std::printf("no CUDA device available\n"); + return 0; + } + + const int n = 1 << 18; // 8192 — a non-trivial GPU matvec + const int nev = 5; + const int ncv = 8 * nev; // larger Krylov subspace: the relative gap of + // the top eigenvalues is ~1/n, so a bigger ncv + // keeps IRA convergence fast at large n + + // "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1. + arnoldi::Arnoldi solver("I", n, "LM", nev, ncv); + solver.tol(1e-10).maxiter(1000); + + cudaStream_t stream = solver.backend().stream(); + const int threads = 256; + const int blocks = (n + threads - 1) / threads; + + // x, y are device pointers (workspace lives on the GPU). Launch on the + // backend's stream so the surrounding cuBLAS calls order correctly. + auto t0 = std::chrono::steady_clock::now(); + solver.solve([&](const double* x, double* y) { + diag_matvec<<>>(n, x, y); + }); + cudaStreamSynchronize(stream); // ensure all device work is finished + auto t1 = std::chrono::steady_clock::now(); + double secs = std::chrono::duration(t1 - t0).count(); + + if (!solver.converged()) { + std::printf("CUDA saupd failed, info = %d\n", solver.info()); + return 1; + } + + auto r = solver.eigenpairs(false); // values only (host) + // eigenpairs(false) returns the wanted set in which-selection order, not + // sorted by value — sort descending for a clean presentation. + std::sort(r.values.begin(), r.values.end(), std::greater()); + std::printf("Largest %d eigenvalues of diag(1..%d), CUDA backend:\n", nev, n); + for (int i = 0; i < nev; ++i) { + double exact = static_cast(n - i); // n, n-1, ..., n-nev+1 + std::printf(" lambda[%d] = %.6f (exact %.0f, |err| = %.2e)\n", + i, r.values[i], exact, std::fabs(r.values[i] - exact)); + } + std::printf("iterations=%d, OP applies=%d\n", + solver.num_iterations(), solver.num_op_applies()); + std::printf("solve() wall time: %.4f s\n", secs); + + // Eigenvectors can also be kept on the device for further GPU work. + auto dr = solver.eigenpairs_device(true); + std::printf("eigenpairs_device: %d eigenvalues on host, " + "%zu-element eigenvector matrix left on device\n", + (int)dr.values.size(), (size_t)(n) * nev); + return 0; +} diff --git a/examples/benchmark.cpp b/examples/benchmark.cpp index 84dece7..6a6d335 100644 --- a/examples/benchmark.cpp +++ b/examples/benchmark.cpp @@ -15,6 +15,11 @@ #include "kernels.hpp" #include "lapack_extra.hpp" +#ifdef ARNOLDI_USE_CUDA +#include +#include +#endif + #include #include #include @@ -98,9 +103,12 @@ using arnoldi_examples::av_laplacian_1d; // C++ solvers (return converged eigenvalues). static std::vector cpp_dsdrv1(int nx) { - int n = nx * nx, nev = 4, ncv = std::min(20, n - 1); + // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered, + // so a larger Krylov subspace is required to converge at large nx + // (ncv=20 stalls past n~10000 in regular mode-1). + int n = nx * nx, nev = 4, ncv = std::min(50, n - 1); arnoldi::Arnoldi s("I", n, "SM", nev, ncv); - s.tol(0.0).maxiter(300).mode(1).ishift(1); + s.tol(0.0).maxiter(500).mode(1).ishift(1); s.solve([&](const double* x, double* y) { av_sym_laplacian_2d(nx, n, x, y); }); auto r = s.eigenpairs(true, 0.0); std::vector out; @@ -110,9 +118,12 @@ static std::vector cpp_dsdrv1(int nx) { } static std::vector cpp_dndrv1(int nx) { - int n = nx * nx, nev = 4, ncv = std::min(20, n - 1); + // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered, + // so a larger Krylov subspace is required to converge at large nx + // (ncv=20 stalls past n~10000 in regular mode-1). + int n = nx * nx, nev = 4, ncv = std::min(50, n - 1); arnoldi::Arnoldi s("I", n, "SM", nev, ncv); - s.tol(0.0).maxiter(300).mode(1).ishift(1); + s.tol(0.0).maxiter(500).mode(1).ishift(1); s.solve([&](const double* x, double* y) { av_conv_diff_2d(nx, n, 0.0, x, y); }); auto r = s.eigenpairs(true, 0.0, 0.0); std::vector out; @@ -170,13 +181,16 @@ static std::vector cpp_dndrv2(int n) { // Fortran solvers (return converged eigenvalues). static std::vector f90_dsdrv1(int nx) { - int n = nx * nx, nev = 4, ncv = std::min(20, n - 1); + // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered, + // so a larger Krylov subspace is required to converge at large nx + // (ncv=20 stalls past n~10000 in regular mode-1). + int n = nx * nx, nev = 4, ncv = std::min(50, n - 1); int ldv = n, lworkl = ncv * (ncv + 8); double tol = 0.0, sigma = 0.0; std::vector v(ldv * ncv), workd(3 * n), workl(lworkl), resid(n), d(ncv * 2); std::vector sel(ncv); int iparam[11] = {}, ipntr[11] = {}; - iparam[0] = 1; iparam[2] = 300; iparam[6] = 1; + iparam[0] = 1; iparam[2] = 500; iparam[6] = 1; int ido = 0, info = 0; while (true) { dsaupd_(&ido, "I", &n, "SM", &nev, &tol, resid.data(), &ncv, @@ -197,14 +211,17 @@ static std::vector f90_dsdrv1(int nx) { } static std::vector f90_dndrv1(int nx) { - int n = nx * nx, nev = 4, ncv = std::min(20, n - 1); + // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered, + // so a larger Krylov subspace is required to converge at large nx + // (ncv=20 stalls past n~10000 in regular mode-1). + int n = nx * nx, nev = 4, ncv = std::min(50, n - 1); int ldv = n, lworkl = 3 * ncv * ncv + 6 * ncv; double tol = 0.0, sigmar = 0.0, sigmai = 0.0; std::vector v(ldv * ncv), workd(3 * n), workl(lworkl), resid(n); std::vector dr(ncv + 1), di(ncv + 1), workev(3 * ncv); std::vector sel(ncv); int iparam[11] = {}, ipntr[14] = {}; - iparam[0] = 1; iparam[2] = 300; iparam[6] = 1; + iparam[0] = 1; iparam[2] = 500; iparam[6] = 1; int ido = 0, info = 0; while (true) { dnaupd_(&ido, "I", &n, "SM", &nev, &tol, resid.data(), &ncv, @@ -305,6 +322,46 @@ static std::vector f90_dndrv2(int n) { return out; } +#ifdef ARNOLDI_USE_CUDA +__global__ void bench_laplacian_2d(int nx, int n, double inv_h2, + const double* v, double* w) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + int i = idx % nx; // column within the grid row + int j = idx / nx; // grid row + double s = 4.0 * v[idx]; + if (i > 0) s -= v[idx - 1]; + if (i < nx - 1) s -= v[idx + 1]; + if (j > 0) s -= v[idx - nx]; + if (j < nx - 1) s -= v[idx + nx]; + w[idx] = inv_h2 * s; +} + +static std::vector gpu_dsdrv1(int nx) { + // ncv=50 (>> 2*nev): the 2D-Laplacian low spectrum is tightly clustered, + // so a larger Krylov subspace is required to converge at large nx + // (ncv=20 stalls past n~10000 in regular mode-1). + int n = nx * nx, nev = 4, ncv = std::min(50, n - 1); + double inv_h2 = static_cast(nx + 1) * (nx + 1); // 1/h2 + arnoldi::Arnoldi s("I", n, "SM", nev, ncv); + s.tol(0.0).maxiter(500).mode(1).ishift(1); // matches cpp_dsdrv1 + cudaStream_t stream = s.backend().stream(); + const int threads = 256, blocks = (n + threads - 1) / threads; + // x, y are device pointers (workspace lives on the GPU). Launch on the + // backend's stream so the surrounding cuBLAS calls order correctly. + s.solve([&](const double* x, double* y) { + bench_laplacian_2d<<>>(nx, n, inv_h2, x, y); + }); + cudaStreamSynchronize(stream); + auto r = s.eigenpairs(true, 0.0); + std::vector out; + for (int i = 0; i < s.num_converged(); ++i) + out.push_back({r.values[i], 0.0}); + return out; +} +#endif // ARNOLDI_USE_CUDA + struct ProblemResult { const char* name; int n, nev; @@ -367,11 +424,19 @@ int main(int argc, char* argv[]) { } struct Size { const char* label; int n; }; - Size sizes[] = {{"SMALL (n=100)", 100}, {"MEDIUM (n=2500)", 2500}, {"LARGE (n=10000)", 10000}}; + Size sizes[] = {{"SMALL (n=100)", 100}, {"MEDIUM (n=5000)", 5000}, {"LARGE (n=40000)", 40000}}; std::vector>> all_tiers; bool all_ok = true; +#ifdef ARNOLDI_USE_CUDA + int cuda_dev_count = 0; + cudaGetDeviceCount(&cuda_dev_count); + if (cuda_dev_count == 0) + std::printf("[CUDA] backend built but no device available — " + "skipping GPU rows\n"); +#endif + for (auto& sz : sizes) { int nx = int(std::sqrt(double(sz.n))); int n2d = nx * nx; @@ -407,6 +472,17 @@ int main(int argc, char* argv[]) { rows.push_back({"dndrv2", sz.n, 4, ct, ft, err, (int)ce.size(), (int)fe.size()}); } +#ifdef ARNOLDI_USE_CUDA + if (cuda_dev_count > 0) { + std::vector ge, fe; + auto gt = time_fn([&]{ ge = gpu_dsdrv1(nx); }, warmup, reps); + auto ft = time_fn([&]{ fe = f90_dsdrv1(nx); }, warmup, reps); + double err = eig_max_error(ge, fe); + rows.push_back({"dsdrv1-gpu", n2d, 4, gt, ft, err, + (int)ge.size(), (int)fe.size()}); + } +#endif + for (auto& r : rows) { bool val_ok = r.max_rel_err < 1e-10 && r.nconv_cpp >= r.nev && r.nconv_f90 >= r.nev; diff --git a/examples/kernels.hpp b/examples/kernels.hpp index d588bc5..641b1aa 100644 --- a/examples/kernels.hpp +++ b/examples/kernels.hpp @@ -21,7 +21,8 @@ namespace arnoldi_examples { Real one = 1; tv_sym_row(nx, v, w); RO::axpy(nx, -one, v + nx, 1, w, 1); - for (int j = 2; j < nx - 1; j++) { + // Fortran dsdrv1.f: do 30 j = 2, nx-1 (inclusive) -> blocks 1..nx-2. + for (int j = 2; j < nx; j++) { int lo = (j - 1) * nx; tv_sym_row(nx, v + lo, w + lo); RO::axpy(nx, -one, v + lo - nx, 1, w + lo, 1); @@ -55,7 +56,8 @@ namespace arnoldi_examples { Real h2 = one / static_cast((nx + 1) * (nx + 1)); tv_cd_row(nx, rho, v, w); RO::axpy(nx, -one / h2, v + nx, 1, w, 1); - for (int j = 2; j < nx - 1; j++) { + // Fortran dndrv1.f: do 30 j = 2, nx-1 (inclusive) -> blocks 1..nx-2. + for (int j = 2; j < nx; j++) { int lo = (j - 1) * nx; tv_cd_row(nx, rho, v + lo, w + lo); RO::axpy(nx, -one / h2, v + lo - nx, 1, w + lo, 1); diff --git a/include/arnoldi/arnoldi.hpp b/include/arnoldi/arnoldi.hpp index 0f1d477..725cdbf 100644 --- a/include/arnoldi/arnoldi.hpp +++ b/include/arnoldi/arnoldi.hpp @@ -52,11 +52,12 @@ namespace arnoldi { } // namespace detail - template + template class Arnoldi { public: using real_type = typename detail::real_of::type; using scalar_type = Scalar; + using backend_type = Backend; private: static_assert(std::is_floating_point_v, @@ -72,9 +73,17 @@ namespace arnoldi { static_assert(K != Kind::Herm || detail::is_complex_v, "Kind::Herm requires a complex Scalar (std::complex)"); + // The nonsymmetric path goes through real-Schur LAPACK auxiliaries + // (dlahqr_/dtrsen_/dtrevc_) that have no cuSOLVER analogue for + // Hessenberg matrices. Only Sym / Herm are supported under device + // backends; reject Nonsym + non-CpuBackend at compile time. + static_assert(K != Kind::Nonsym || std::is_same_v, + "Kind::Nonsym requires CpuBackend (no device specialization for the real-Schur LAPACK aux)"); + public: - Arnoldi(std::string bmat, int n, std::string which, int nev, int ncv, Comm comm = Comm{}) : - bmat_(std::move(bmat)), which_(std::move(which)), n_(n), nev_(nev), ncv_(ncv), ldv_(n), comm_(std::move(comm)) { + Arnoldi(std::string bmat, int n, std::string which, int nev, int ncv, Comm comm = Comm{}, Backend backend = Backend{}) : + bmat_(std::move(bmat)), which_(std::move(which)), n_(n), nev_(nev), ncv_(ncv), ldv_(n), comm_(std::move(comm)), + backend_(std::move(backend)) { if (n_ <= 0) throw std::invalid_argument("Arnoldi: n must be > 0"); if (nev_ <= 0) throw std::invalid_argument("Arnoldi: nev must be > 0"); int n_global = comm_.allreduce_sum(n_); @@ -128,12 +137,32 @@ namespace arnoldi { return *this; } + // Seed the iteration with a user-provided initial residual vector. `r` + // is a HOST pointer; under CudaBackend the data is staged H->D once, + // here, into the device-resident resid_ buffer. Arnoldi& initial_resid(const Scalar* r) { - std::copy(r, r + n_, resid_.begin()); + arnoldi::detail::buffer_traits::copy_from_host(resid_, r, static_cast(n_)); + info_in_ = 1; + return *this; + } + + // Device-pointer overload: only available under a non-CpuBackend (e.g. + // CudaBackend). Copies device-to-device into resid_. The device-specific + // buffer_traits specialization supplies copy_from_device; this member + // is SFINAE'd out under CpuBackend. + template , int> = 0> + Arnoldi& initial_resid_device(const Scalar* r) { + arnoldi::detail::buffer_traits::copy_from_device(resid_, r, static_cast(n_)); info_in_ = 1; return *this; } + // Accessor returning a reference to the owned backend. Users invoking + // a GPU matvec from inside their callback can grab the cuBLAS handle + // and stream via `solver.backend().handle()` / `solver.backend().stream()`. + Backend& backend() noexcept { return backend_; } + const Backend& backend() const noexcept { return backend_; } + template void solve(Op&& op) { info_ = info_in_; @@ -165,6 +194,16 @@ namespace arnoldi { using Result = std::conditional_t>; + // Sym / Herm result with eigenvectors left on the device. Eigenvalues are + // always host-resident (they come out of the ncv-sized Hessenberg + // subproblem, computed on host). The eigenvector matrix is the device + // buffer seupd's final GEMM wrote into — handed back without the n×nev + // device→host copy that eigenpairs() performs. + struct DeviceResult { + std::vector values; // size nev (host) + typename arnoldi::detail::buffer_traits::vector_type vectors; // n × nev (device) + }; + // Real shift (Sym / Herm only). template = 0> Result eigenpairs(bool compute_vectors = true, real_type sigma = real_type{}) { @@ -177,6 +216,25 @@ namespace arnoldi { return extract_(compute_vectors, sigmar, sigmai); } + // Sym / Herm only, device backends only: extract eigenpairs leaving the + // eigenvector matrix on the device (column-major, n × nev). Use this when + // downstream work consumes the eigenvectors on the GPU and the host copy + // that eigenpairs() does would be wasted. + template , int> = 0> + DeviceResult eigenpairs_device(bool compute_vectors = true, real_type sigma = real_type{}) { + DeviceResult out{}; + out.values.assign(static_cast(nev_), real_type{}); + if (compute_vectors) out.vectors.assign(static_cast(n_) * nev_, Scalar{}); + + auto bref = backend_.ref(); + arnoldi::detail::seupd(bref, compute_vectors, "A", out.values.data(), + compute_vectors ? out.vectors.data() : nullptr, n_, sigma, bmat_.c_str(), n_, + which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), + ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, comm_); + return out; + } + bool converged() const noexcept { return info_ == 0 && iparam_[4] >= nev_; } int num_converged() const noexcept { return iparam_[4]; } int num_iterations() const noexcept { return iparam_[2]; } @@ -192,27 +250,31 @@ namespace arnoldi { private: template void dispatch_aupd_(Op&& op, Bop&& bop) { + auto bref = backend_.ref(); if constexpr (K == Kind::Nonsym) { - arnoldi::detail::naupd(bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, - iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, - std::forward(op), std::forward(bop), comm_); + arnoldi::detail::naupd(bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, + v_.data(), ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), + lworkl_, info_, std::forward(op), std::forward(bop), comm_); } else { // Sym or Herm - arnoldi::detail::saupd(bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, - iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, - std::forward(op), std::forward(bop), comm_); + arnoldi::detail::saupd(bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, + v_.data(), ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), + lworkl_, info_, std::forward(op), std::forward(bop), comm_); } } template void dispatch_aupd_(Op&& op) { + auto bref = backend_.ref(); if constexpr (K == Kind::Nonsym) { - arnoldi::detail::naupd( - bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(), - workd_.data(), workl_.data(), lworkl_, info_, std::forward(op), [](const real_type*, real_type*) {}, comm_); + arnoldi::detail::naupd( + bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), + ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, std::forward(op), + [](const real_type*, real_type*) {}, comm_); } else { - arnoldi::detail::saupd( - bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(), - workd_.data(), workl_.data(), lworkl_, info_, std::forward(op), [](const Scalar*, Scalar*) {}, comm_); + arnoldi::detail::saupd( + bref, bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), + ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, std::forward(op), [](const Scalar*, Scalar*) {}, + comm_); } } @@ -221,9 +283,28 @@ namespace arnoldi { out.values.assign(static_cast(nev_), real_type{}); if (rvec) out.vectors.assign(static_cast(n_) * nev_, Scalar{}); - arnoldi::detail::seupd(rvec, "A", out.values.data(), rvec ? out.vectors.data() : nullptr, n_, sigma, bmat_.c_str(), n_, - which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(), - workd_.data(), workl_.data(), lworkl_, info_, comm_); + auto bref = backend_.ref(); + if constexpr (std::is_same_v) { + // CPU: seupd writes the final V*S directly into host out.vectors. + arnoldi::detail::seupd(bref, rvec, "A", out.values.data(), rvec ? out.vectors.data() : nullptr, n_, sigma, + bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), + ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, + comm_); + } else { + // Device: allocate a device buffer for z, run seupd into it (final + // GEMM lands on device), then copy down to host out.vectors. The + // matching `buffer_copy_to_host` and `buffer_traits` specialisations + // come from the device-specific backend header (e.g. cuda.hpp). + typename arnoldi::detail::buffer_traits::vector_type z_dev; + if (rvec) z_dev.assign(static_cast(n_) * nev_, Scalar{}); + arnoldi::detail::seupd(bref, rvec, "A", out.values.data(), rvec ? z_dev.data() : nullptr, n_, sigma, + bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), + ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_, + comm_); + if (rvec) + arnoldi::detail::buffer_traits::copy_to_host(out.vectors.data(), z_dev, + static_cast(n_) * nev_); + } return out; } @@ -235,10 +316,11 @@ namespace arnoldi { out.values_im.assign(static_cast(nev_ + 1), real_type{}); if (rvec) out.vectors.assign(static_cast(n_) * (nev_ + 1), Scalar{}); - arnoldi::detail::neupd(rvec, "A", out.values_re.data(), out.values_im.data(), rvec ? out.vectors.data() : nullptr, - n_, sigmar, sigmai, workev_.data(), bmat_.c_str(), n_, which_.c_str(), nev_, tol_, - resid_.data(), ncv_, v_.data(), ldv_, iparam_.data(), ipntr_.data(), workd_.data(), - workl_.data(), lworkl_, info_); + auto bref = backend_.ref(); + arnoldi::detail::neupd(bref, rvec, "A", out.values_re.data(), out.values_im.data(), + rvec ? out.vectors.data() : nullptr, n_, sigmar, sigmai, workev_.data(), + bmat_.c_str(), n_, which_.c_str(), nev_, tol_, resid_.data(), ncv_, v_.data(), + ldv_, iparam_.data(), ipntr_.data(), workd_.data(), workl_.data(), lworkl_, info_); return out; } @@ -250,10 +332,15 @@ namespace arnoldi { int info_in_ = 0; // initial value fed into *aupd (1 if user gave resid) real_type tol_ = real_type{}; Comm comm_; - - std::vector resid_; - std::vector v_; - std::vector workd_; + Backend backend_; + + // Length-n workspace lives on the backend (host std::vector under + // CpuBackend, device_vector under CudaBackend). + typename arnoldi::detail::buffer_traits::vector_type resid_; + typename arnoldi::detail::buffer_traits::vector_type v_; + typename arnoldi::detail::buffer_traits::vector_type workd_; + // The IRA bookkeeping (Hessenberg, Ritz values/bounds, Givens scratch) + // is small (ncv²-sized) and stays host-resident under every backend. std::vector workl_; std::vector workev_; // Nonsym only @@ -261,12 +348,12 @@ namespace arnoldi { std::array ipntr_{}; }; - template - using SymArnoldi = Arnoldi; - template - using NonsymArnoldi = Arnoldi; - template - using HermArnoldi = Arnoldi, C>; + template + using SymArnoldi = Arnoldi; + template + using NonsymArnoldi = Arnoldi; + template + using HermArnoldi = Arnoldi, C, B>; } // namespace arnoldi diff --git a/include/arnoldi/cuda.hpp b/include/arnoldi/cuda.hpp new file mode 100644 index 0000000..55d3c15 --- /dev/null +++ b/include/arnoldi/cuda.hpp @@ -0,0 +1,607 @@ +#ifndef ARNOLDI_CUDA_HPP +#define ARNOLDI_CUDA_HPP + +// CUDA + cuBLAS backend for the symmetric / Hermitian Arnoldi path. +// +// Gated behind ARNOLDI_USE_CUDA. When the macro is defined, this header +// provides: +// - arnoldi::detail::CudaBackend (RAII over cuBLAS handle + stream) +// - arnoldi::detail::BackendRef partial spec (handle + stream) +// - arnoldi::detail::Ops partial spec (cuBLAS dispatch) +// +// Callback contract under CudaBackend: +// +// The user matvec callback `op(const Scalar* x, Scalar* y)` receives +// DEVICE pointers. The matvec runs on the stream returned by +// backend.stream(); if the user launches on a different stream they +// must synchronise it before returning, otherwise the subsequent +// cuBLAS calls will race against the user kernel. +// +// Eigenvalues: extracted by seupd into host arrays (Real eigenvalues, plus +// optionally Scalar eigenvectors after a final cublasGEMM). The Arnoldi +// class transfers the result to host std::vector on eigenpairs() return. +// +// Pointer-mode policy: CUBLAS_POINTER_MODE_HOST. dotc/nrm2 return a host +// scalar synchronously, matching CPU semantics exactly. +// +// Supported scalar types: double, float, std::complex, +// std::complex. Sym + real, Herm + complex. + +#ifdef ARNOLDI_USE_CUDA + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace arnoldi { + + namespace detail { + + // Internal error-check helpers. Keep them small and noisy; on failure + // throw so the user-facing exception carries a meaningful message. + inline void cuda_check(cudaError_t e, const char* what) { + if (e != cudaSuccess) { + throw std::runtime_error(std::string("arnoldi/cuda: ") + what + ": " + cudaGetErrorString(e)); + } + } + inline void cublas_check(cublasStatus_t s, const char* what) { + if (s != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error(std::string("arnoldi/cuda: ") + what + ": cublas error " + std::to_string(static_cast(s))); + } + } + + inline cublasOperation_t cublas_op(const char* s) { + switch (s[0]) { + case 'N': case 'n': return CUBLAS_OP_N; + case 'T': case 't': return CUBLAS_OP_T; + case 'C': case 'c': return CUBLAS_OP_C; + } + return CUBLAS_OP_N; + } + + } // namespace detail + + // RAII over a cuBLAS handle bound to a stream. The Arnoldi class stores + // one of these by value; ref() returns a trivially-copyable view that + // gets threaded through the algorithmic templates. + // + // Construction modes: + // CudaBackend() — creates own handle + stream. + // CudaBackend(cudaStream_t s) — uses caller's stream; creates own + // handle bound to that stream. + // + // Non-copyable, movable. Default ctor owns its stream and destroys it. + class CudaBackend { + public: + CudaBackend() : owns_stream_(true) { + detail::cuda_check(cudaStreamCreate(&stream_), "cudaStreamCreate"); + detail::cublas_check(cublasCreate(&handle_), "cublasCreate"); + detail::cublas_check(cublasSetStream(handle_, stream_), "cublasSetStream"); + detail::cublas_check(cublasSetPointerMode(handle_, CUBLAS_POINTER_MODE_HOST), "cublasSetPointerMode(HOST)"); + } + explicit CudaBackend(cudaStream_t s) : stream_(s), owns_stream_(false) { + detail::cublas_check(cublasCreate(&handle_), "cublasCreate"); + detail::cublas_check(cublasSetStream(handle_, stream_), "cublasSetStream"); + detail::cublas_check(cublasSetPointerMode(handle_, CUBLAS_POINTER_MODE_HOST), "cublasSetPointerMode(HOST)"); + } + + CudaBackend(const CudaBackend&) = delete; + CudaBackend& operator=(const CudaBackend&) = delete; + + CudaBackend(CudaBackend&& other) noexcept : + handle_(other.handle_), stream_(other.stream_), owns_stream_(other.owns_stream_) { + other.handle_ = nullptr; + other.stream_ = nullptr; + other.owns_stream_ = false; + } + CudaBackend& operator=(CudaBackend&& other) noexcept { + if (this != &other) { + release_(); + handle_ = other.handle_; + stream_ = other.stream_; + owns_stream_ = other.owns_stream_; + other.handle_ = nullptr; + other.stream_ = nullptr; + other.owns_stream_ = false; + } + return *this; + } + + ~CudaBackend() { release_(); } + + cublasHandle_t handle() const noexcept { return handle_; } + cudaStream_t stream() const noexcept { return stream_; } + + detail::BackendRef ref() const noexcept; + + private: + void release_() noexcept { + if (handle_) cublasDestroy(handle_); + if (owns_stream_ && stream_) cudaStreamDestroy(stream_); + handle_ = nullptr; + stream_ = nullptr; + owns_stream_ = false; + } + + cublasHandle_t handle_ = nullptr; + cudaStream_t stream_ = nullptr; + bool owns_stream_ = false; + }; + + namespace detail { + + // Non-owning view threaded through the algorithmic templates. + template <> + struct BackendRef { + cublasHandle_t handle = nullptr; + cudaStream_t stream = nullptr; + }; + + // Minimal RAII vector over device memory. Mirrors the small slice of + // std::vector that the Arnoldi class actually uses: default ctor, + // .assign(n, T{}) (zero-fill via cudaMemset), .data(), .size(), and a + // destructor that frees. Non-copyable, movable. + // + // IEEE-754 zero is all-bits-zero for both real and complex (each + // component zero), so a single cudaMemset suffices for the zero-fill + // initialization that Arnoldi's ctor performs. + template + class device_vector { + public: + device_vector() noexcept = default; + + device_vector(const device_vector&) = delete; + device_vector& operator=(const device_vector&) = delete; + + device_vector(device_vector&& other) noexcept : data_(other.data_), size_(other.size_) { + other.data_ = nullptr; + other.size_ = 0; + } + device_vector& operator=(device_vector&& other) noexcept { + if (this != &other) { + release_(); + data_ = other.data_; + size_ = other.size_; + other.data_ = nullptr; + other.size_ = 0; + } + return *this; + } + + ~device_vector() { release_(); } + + // Allocate n elements and fill with `val`. Currently only val == T{} + // (zero) is needed by Arnoldi; we'd add a host-staged fill if we ever + // needed a non-zero initial value. + void assign(std::size_t n, const T& val) { + resize_(n); + if (n == 0) return; + if (val == T{}) { + cuda_check(cudaMemset(data_, 0, n * sizeof(T)), "device_vector::assign(zero) cudaMemset"); + } else { + std::vector host(n, val); + cuda_check(cudaMemcpy(data_, host.data(), n * sizeof(T), cudaMemcpyHostToDevice), + "device_vector::assign(fill) cudaMemcpy"); + } + } + + T* data() noexcept { return data_; } + const T* data() const noexcept { return data_; } + std::size_t size() const noexcept { return size_; } + + private: + void resize_(std::size_t n) { + if (size_ == n) return; + release_(); + if (n > 0) { + void* p = nullptr; + cuda_check(cudaMalloc(&p, n * sizeof(T)), "device_vector::resize_ cudaMalloc"); + data_ = static_cast(p); + size_ = n; + } + } + void release_() noexcept { + if (data_) cudaFree(data_); + data_ = nullptr; + size_ = 0; + } + + T* data_ = nullptr; + std::size_t size_ = 0; + }; + + // Buffer-type trait specialization: under CudaBackend the Arnoldi + // workspace lives in device_vector. Static helpers shuttle data + // between host pointers and the device buffer for initial_resid, + // initial_resid_device, and extract_'s eigenvector copy-down. + template + struct buffer_traits { + using vector_type = device_vector; + + // Host pointer -> device buffer (initial_resid upload). + static void copy_from_host(vector_type& dst, const T* src, std::size_t n) { + cuda_check(cudaMemcpy(dst.data(), src, n * sizeof(T), cudaMemcpyHostToDevice), "buffer_traits::copy_from_host"); + } + + // Device pointer -> device buffer (initial_resid_device pass-through). + static void copy_from_device(vector_type& dst, const T* src, std::size_t n) { + cuda_check(cudaMemcpy(dst.data(), src, n * sizeof(T), cudaMemcpyDeviceToDevice), "buffer_traits::copy_from_device"); + } + + // Device buffer -> host pointer (extract_ eigenvector copy-down). + static void copy_to_host(T* host_dst, const vector_type& src, std::size_t n) { + cuda_check(cudaMemcpy(host_dst, src.data(), n * sizeof(T), cudaMemcpyDeviceToHost), "buffer_traits::copy_to_host"); + } + }; + + // Overloads for the backend-aware extension points (declared in ops.hpp). + + // Length-`len` allreduce on a device buffer: stage D->H, allreduce on a + // host scratch buffer, copy H->D, sync. Always small (<= ncv). + template + void backend_allreduce(BackendRef bref, const Comm& comm, Scalar* buf, int len) { + // SerialComm::allreduce_sum is an identity no-op. Staging the buffer + // D->H->D and synchronizing twice purely to feed it is wasted work on + // the Lanczos hot path — skip the round-trip entirely. The buffer is + // already correct in device memory. + if constexpr (std::is_same_v) { + (void)bref; + (void)comm; + (void)buf; + (void)len; + return; + } else { + std::vector host_buf(static_cast(len)); + cuda_check(cudaMemcpyAsync(host_buf.data(), buf, static_cast(len) * sizeof(Scalar), cudaMemcpyDeviceToHost, + bref.stream), + "backend_allreduce D2H"); + cuda_check(cudaStreamSynchronize(bref.stream), "backend_allreduce sync D2H"); + comm.allreduce_sum(host_buf.data(), len); + cuda_check(cudaMemcpyAsync(buf, host_buf.data(), static_cast(len) * sizeof(Scalar), cudaMemcpyHostToDevice, + bref.stream), + "backend_allreduce H2D"); + cuda_check(cudaStreamSynchronize(bref.stream), "backend_allreduce sync H2D"); + } + } + + } // namespace detail + + inline detail::BackendRef CudaBackend::ref() const noexcept { + return detail::BackendRef{handle_, stream_}; + } + + namespace detail { + + // Ops partial specialisation. + // + // Provides the length-n dispatch surface for the Sym / Herm path: + // copy/scal/rscal/axpy/raxpy/nrm2/dotc/rdotc/gemv/gemm/gemv_rv/larnv/ + // lascl_/zero/read_scalar + // + // Does NOT provide the host-only LAPACK aux (lamch, lapy2, steqr, + // laset_, larfg_, lartg_, lahqr_, trevc_, geqr2_, trsen_, trmm_, ger, + // lacpy, lanhs, lae2_, laev2_, lasr_, lasrt_, lanst_, swap). Those run + // on host-resident workl/h/q via the primary CPU Ops template, which + // is reached by Ops::method(...) call sites (default CpuBackend). + template + struct Ops { + using Real = real_t; + static constexpr bool is_complex = !std::is_same_v; + using BRef = BackendRef; + + // ---- fills / scalar reads --------------------------------------------- + static void zero(BRef bref, int n, Scalar* x) { + // IEEE-754 zero is all-bits-zero for both real and complex (each + // component zero). cudaMemsetAsync is async on the stream. + cuda_check(cudaMemsetAsync(x, 0, static_cast(n) * sizeof(Scalar), bref.stream), "cudaMemsetAsync(zero)"); + } + + static Scalar read_scalar(BRef bref, const Scalar* p) { + Scalar host_val{}; + cuda_check(cudaMemcpyAsync(&host_val, p, sizeof(Scalar), cudaMemcpyDeviceToHost, bref.stream), + "cudaMemcpyAsync(read_scalar)"); + cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(read_scalar)"); + return host_val; + } + + // Async single-element D->H stage (no per-call sync). The caller batches + // many of these and issues one sync() before reading host memory, + // collapsing what would be one stream stall per Lanczos step into one + // stall per saitr call. + static void read_scalar_async(BRef bref, Scalar* host_dst, const Scalar* src) { + cuda_check(cudaMemcpyAsync(host_dst, src, sizeof(Scalar), cudaMemcpyDeviceToHost, bref.stream), + "cudaMemcpyAsync(read_scalar_async)"); + } + + static void sync(BRef bref) { cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(sync)"); } + + // ---- BLAS-1 ----------------------------------------------------------- + static void copy(BRef bref, int n, const Scalar* x, int incx, Scalar* y, int incy) { + if constexpr (std::is_same_v) + cublas_check(cublasDcopy(bref.handle, n, x, incx, y, incy), "cublasDcopy"); + else if constexpr (std::is_same_v) + cublas_check(cublasScopy(bref.handle, n, x, incx, y, incy), "cublasScopy"); + else if constexpr (std::is_same_v>) + cublas_check(cublasZcopy(bref.handle, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy), + "cublasZcopy"); + else + cublas_check(cublasCcopy(bref.handle, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy), + "cublasCcopy"); + } + + static void scal(BRef bref, int n, Scalar a, Scalar* x, int incx) { + if constexpr (std::is_same_v) + cublas_check(cublasDscal(bref.handle, n, &a, x, incx), "cublasDscal"); + else if constexpr (std::is_same_v) + cublas_check(cublasSscal(bref.handle, n, &a, x, incx), "cublasSscal"); + else if constexpr (std::is_same_v>) + cublas_check(cublasZscal(bref.handle, n, reinterpret_cast(&a), + reinterpret_cast(x), incx), + "cublasZscal"); + else + cublas_check(cublasCscal(bref.handle, n, reinterpret_cast(&a), + reinterpret_cast(x), incx), + "cublasCscal"); + } + + static void rscal(BRef bref, int n, Real a, Scalar* x, int incx) { + if constexpr (!is_complex) { + scal(bref, n, a, x, incx); + } else if constexpr (std::is_same_v) { + cublas_check(cublasZdscal(bref.handle, n, &a, reinterpret_cast(x), incx), "cublasZdscal"); + } else { + cublas_check(cublasCsscal(bref.handle, n, &a, reinterpret_cast(x), incx), "cublasCsscal"); + } + } + + static void axpy(BRef bref, int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) { + if constexpr (std::is_same_v) + cublas_check(cublasDaxpy(bref.handle, n, &a, x, incx, y, incy), "cublasDaxpy"); + else if constexpr (std::is_same_v) + cublas_check(cublasSaxpy(bref.handle, n, &a, x, incx, y, incy), "cublasSaxpy"); + else if constexpr (std::is_same_v>) + cublas_check(cublasZaxpy(bref.handle, n, reinterpret_cast(&a), + reinterpret_cast(x), incx, + reinterpret_cast(y), incy), + "cublasZaxpy"); + else + cublas_check(cublasCaxpy(bref.handle, n, reinterpret_cast(&a), + reinterpret_cast(x), incx, + reinterpret_cast(y), incy), + "cublasCaxpy"); + } + + static void raxpy(BRef bref, int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) { + if constexpr (!is_complex) { + axpy(bref, n, a, x, incx, y, incy); + } else { + axpy(bref, n, Scalar(a, 0), x, incx, y, incy); + } + } + + static Real nrm2(BRef bref, int n, const Scalar* x, int incx) { + Real result{}; + if constexpr (std::is_same_v) + cublas_check(cublasDnrm2(bref.handle, n, x, incx, &result), "cublasDnrm2"); + else if constexpr (std::is_same_v) + cublas_check(cublasSnrm2(bref.handle, n, x, incx, &result), "cublasSnrm2"); + else if constexpr (std::is_same_v>) + cublas_check(cublasDznrm2(bref.handle, n, reinterpret_cast(x), incx, &result), "cublasDznrm2"); + else + cublas_check(cublasScnrm2(bref.handle, n, reinterpret_cast(x), incx, &result), "cublasScnrm2"); + return result; + } + + static Scalar dot(BRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) { + // Only meaningful for real Scalar; the sym/herm path uses dotc/rdotc. + Scalar result{}; + if constexpr (std::is_same_v) + cublas_check(cublasDdot(bref.handle, n, x, incx, y, incy, &result), "cublasDdot"); + else if constexpr (std::is_same_v) + cublas_check(cublasSdot(bref.handle, n, x, incx, y, incy, &result), "cublasSdot"); + return result; + } + + static Scalar dotc(BRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) { + Scalar result{}; + if constexpr (std::is_same_v) { + cublas_check(cublasDdot(bref.handle, n, x, incx, y, incy, &result), "cublasDdot"); + } else if constexpr (std::is_same_v) { + cublas_check(cublasSdot(bref.handle, n, x, incx, y, incy, &result), "cublasSdot"); + } else if constexpr (std::is_same_v>) { + cublas_check(cublasZdotc(bref.handle, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy, reinterpret_cast(&result)), + "cublasZdotc"); + } else { + cublas_check(cublasCdotc(bref.handle, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy, reinterpret_cast(&result)), + "cublasCdotc"); + } + return result; + } + + static Real rdotc(BRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return std::real(dotc(bref, n, x, incx, y, incy)); + } + + static const char* herm_trans() { + if constexpr (is_complex) return "C"; + else return "T"; + } + + // ---- BLAS-2 / BLAS-3 -------------------------------------------------- + static void gemv(BRef bref, const char* trans, int m, int n, Scalar alpha, const Scalar* a, int lda, const Scalar* x, + int incx, Scalar beta, Scalar* y, int incy) { + auto op = cublas_op(trans); + if constexpr (std::is_same_v) + cublas_check(cublasDgemv(bref.handle, op, m, n, &alpha, a, lda, x, incx, &beta, y, incy), "cublasDgemv"); + else if constexpr (std::is_same_v) + cublas_check(cublasSgemv(bref.handle, op, m, n, &alpha, a, lda, x, incx, &beta, y, incy), "cublasSgemv"); + else if constexpr (std::is_same_v>) + cublas_check(cublasZgemv(bref.handle, op, m, n, reinterpret_cast(&alpha), + reinterpret_cast(a), lda, + reinterpret_cast(x), incx, + reinterpret_cast(&beta), + reinterpret_cast(y), incy), + "cublasZgemv"); + else + cublas_check(cublasCgemv(bref.handle, op, m, n, reinterpret_cast(&alpha), + reinterpret_cast(a), lda, reinterpret_cast(x), incx, + reinterpret_cast(&beta), reinterpret_cast(y), incy), + "cublasCgemv"); + } + + static void gemm(BRef bref, const char* transa, const char* transb, int m, int n, int k, Scalar alpha, const Scalar* a, + int lda, const Scalar* b, int ldb, Scalar beta, Scalar* c, int ldc) { + auto opa = cublas_op(transa); + auto opb = cublas_op(transb); + if constexpr (std::is_same_v) + cublas_check(cublasDgemm(bref.handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc), "cublasDgemm"); + else if constexpr (std::is_same_v) + cublas_check(cublasSgemm(bref.handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc), "cublasSgemm"); + else if constexpr (std::is_same_v>) + cublas_check(cublasZgemm(bref.handle, opa, opb, m, n, k, reinterpret_cast(&alpha), + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb, + reinterpret_cast(&beta), + reinterpret_cast(c), ldc), + "cublasZgemm"); + else + cublas_check(cublasCgemm(bref.handle, opa, opb, m, n, k, reinterpret_cast(&alpha), + reinterpret_cast(a), lda, reinterpret_cast(b), ldb, + reinterpret_cast(&beta), reinterpret_cast(c), ldc), + "cublasCgemm"); + } + + // gemv with a real x-vector applied to a possibly-complex matrix. + // The Real x in callers (sapps, seupd) always lives in host workl/h/q. + // Under CudaBackend we gather x into a contiguous Scalar host buffer + // (widening on the complex path), upload to a device temp, then call + // the regular gemv. y is device-resident (workd / V column). + static void gemv_rv(BRef bref, const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, const Real* x, + int incx, Real beta, Scalar* y, int incy) { + int xlen = (*trans == 'N' || *trans == 'n') ? ncols : m; + std::vector host_x(xlen); + if constexpr (!is_complex) { + for (int i = 0; i < xlen; ++i) host_x[i] = x[i * incx]; + } else { + for (int i = 0; i < xlen; ++i) host_x[i] = Scalar(x[i * incx], 0); + } + Scalar* dev_x = nullptr; + cuda_check(cudaMallocAsync(reinterpret_cast(&dev_x), static_cast(xlen) * sizeof(Scalar), bref.stream), + "cudaMallocAsync(gemv_rv temp)"); + cuda_check(cudaMemcpyAsync(dev_x, host_x.data(), static_cast(xlen) * sizeof(Scalar), cudaMemcpyHostToDevice, + bref.stream), + "cudaMemcpyAsync(gemv_rv H2D)"); + // Sync so host_x outlives the upload (host_x is local). + cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(gemv_rv upload)"); + if constexpr (!is_complex) { + gemv(bref, trans, m, ncols, alpha, a, lda, dev_x, 1, beta, y, incy); + } else { + gemv(bref, trans, m, ncols, Scalar(alpha, 0), a, lda, dev_x, 1, Scalar(beta, 0), y, incy); + } + cuda_check(cudaFreeAsync(dev_x, bref.stream), "cudaFreeAsync(gemv_rv temp)"); + } + + // ---- LAPACK aux that operate on length-n device data ------------------ + + // larnv: generate random vector on host via the CPU Ops, then upload. + // Avoids the bookkeeping of cuRAND for a once-per-restart-attempt call. + static void larnv(BRef bref, int idist, int* iseed, int n, Scalar* x) { + std::vector host_x(n); + Ops::larnv(idist, iseed, n, host_x.data()); + cuda_check(cudaMemcpyAsync(x, host_x.data(), static_cast(n) * sizeof(Scalar), cudaMemcpyHostToDevice, + bref.stream), + "cudaMemcpyAsync(larnv H2D)"); + cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(larnv)"); + } + + // lascl_: scale a device-resident block by cto/cfrom, applying the + // overflow-safe multi-step ratio that LAPACK dlascl uses (this matters + // because the only caller triggers it when cfrom = rnorm < safmin, so + // 1/cfrom would overflow). The mul sequence is pure scalar arithmetic + // computed on the host; each step is applied on-device with rscal + // (a real scaling — cublasXscal / Zdscal / Csscal), keeping the path + // nvcc-free. Only type 'G' (general, full m x n block) is supported, + // which is the only type any caller reaching this backend uses; other + // types fall back to the host path. + static void lascl_(BRef bref, const char* type, const int* kl, const int* ku, const Real* cfrom, const Real* cto, + const int* m, const int* n, Scalar* a, const int* lda, int* info) { + *info = 0; + if (!(type[0] == 'G' || type[0] == 'g')) { + // Unexpected type (none of the algorithmic call sites use these + // under a device backend) — preserve correctness via host fallback. + const int nelem = (*lda) * (*n); + std::vector host_a(nelem); + cuda_check(cudaMemcpyAsync(host_a.data(), a, static_cast(nelem) * sizeof(Scalar), cudaMemcpyDeviceToHost, + bref.stream), + "cudaMemcpyAsync(lascl D2H)"); + cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(lascl D2H)"); + Ops::lascl_(type, kl, ku, cfrom, cto, m, n, host_a.data(), lda, info); + cuda_check(cudaMemcpyAsync(a, host_a.data(), static_cast(nelem) * sizeof(Scalar), cudaMemcpyHostToDevice, + bref.stream), + "cudaMemcpyAsync(lascl H2D)"); + cuda_check(cudaStreamSynchronize(bref.stream), "cudaStreamSynchronize(lascl H2D)"); + return; + } + + const Real smlnum = Ops::lamch("safe minimum"); + const Real bignum = Real(1) / smlnum; + Real cfromc = *cfrom; + Real ctoc = *cto; + const int rows = *m; + const int cols = *n; + const int ld = *lda; + + bool done = false; + while (!done) { + Real cfrom1 = cfromc * smlnum; + Real mul; + if (cfrom1 == cfromc) { + // cfromc is an infinity; cto/cfrom is the well-defined ratio. + mul = ctoc / cfromc; + done = true; + } else { + Real cto1 = ctoc / bignum; + if (cto1 == ctoc) { + // ctoc is zero. + mul = ctoc; + done = true; + cfromc = Real(1); + } else if (std::abs(cfrom1) > std::abs(ctoc) && ctoc != Real(0)) { + mul = smlnum; + done = false; + cfromc = cfrom1; + } else if (std::abs(cto1) > std::abs(cfromc)) { + mul = bignum; + done = false; + ctoc = cto1; + } else { + mul = ctoc / cfromc; + done = true; + } + } + // Apply A *= mul over the general m x n block (column-major, ld). + for (int j = 0; j < cols; ++j) + rscal(bref, rows, mul, a + static_cast(j) * ld, 1); + } + } + }; + + } // namespace detail + +} // namespace arnoldi + +#endif // ARNOLDI_USE_CUDA + +#endif // ARNOLDI_CUDA_HPP diff --git a/include/arnoldi/detail/blas_bindings.hpp b/include/arnoldi/detail/blas_bindings.hpp index 8abff30..b30866e 100644 --- a/include/arnoldi/detail/blas_bindings.hpp +++ b/include/arnoldi/detail/blas_bindings.hpp @@ -66,6 +66,18 @@ void cgemv_(const char* trans, const int* m, const int* n, const std::complex* alpha, + const std::complex* a, const int* lda, const std::complex* b, const int* ldb, + const std::complex* beta, std::complex* c, const int* ldc); +void cgemm_(const char* transa, const char* transb, const int* m, const int* n, const int* k, const std::complex* alpha, + const std::complex* a, const int* lda, const std::complex* b, const int* ldb, + const std::complex* beta, std::complex* c, const int* ldc); + // ---- BLAS-2 (ger / geru) -------------------------------------------------- void dger_(const int* m, const int* n, const double* alpha, const double* x, const int* incx, const double* y, const int* incy, double* a, const int* lda); @@ -238,6 +250,10 @@ inline void dgemv(const char* trans, int m, int n, double alpha, const double* a double* y, int incy) { dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } +inline void dgemm(const char* transa, const char* transb, int m, int n, int k, double alpha, const double* a, int lda, + const double* b, int ldb, double beta, double* c, int ldc) { + dgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); +} inline void dlarnv(int idist, int* iseed, int n, double* x) { dlarnv_(&idist, iseed, &n, x); } inline double dlamch(const char* cmach) { return dlamch_(cmach); } inline double dlapy2(double x, double y) { return dlapy2_(&x, &y); } @@ -264,6 +280,10 @@ inline void sgemv(const char* trans, int m, int n, float alpha, const float* a float* y, int incy) { sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } +inline void sgemm(const char* transa, const char* transb, int m, int n, int k, float alpha, const float* a, int lda, + const float* b, int ldb, float beta, float* c, int ldc) { + sgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); +} inline void slarnv(int idist, int* iseed, int n, float* x) { slarnv_(&idist, iseed, &n, x); } inline float slamch(const char* cmach) { return slamch_(cmach); } inline float slapy2(float x, float y) { return slapy2_(&x, &y); } @@ -289,6 +309,11 @@ inline void zgemv(const char* trans, int m, int n, std::complex alpha, const std::complex* x, int incx, std::complex beta, std::complex* y, int incy) { zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } +inline void zgemm(const char* transa, const char* transb, int m, int n, int k, std::complex alpha, + const std::complex* a, int lda, const std::complex* b, int ldb, std::complex beta, + std::complex* c, int ldc) { + zgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); +} inline void zlarnv(int idist, int* iseed, int n, std::complex* x) { zlarnv_(&idist, iseed, &n, x); } inline void zlacpy(const char* uplo, int m, int n, const std::complex* a, int lda, std::complex* b, int ldb) { zlacpy_(uplo, &m, &n, a, &lda, b, &ldb); @@ -340,6 +365,11 @@ inline void cgemv(const char* trans, int m, int n, std::complex alpha, c const std::complex* x, int incx, std::complex beta, std::complex* y, int incy) { cgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } +inline void cgemm(const char* transa, const char* transb, int m, int n, int k, std::complex alpha, + const std::complex* a, int lda, const std::complex* b, int ldb, std::complex beta, + std::complex* c, int ldc) { + cgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); +} inline void clarnv(int idist, int* iseed, int n, std::complex* x) { clarnv_(&idist, iseed, &n, x); } inline void clacpy(const char* uplo, int m, int n, const std::complex* a, int lda, std::complex* b, int ldb) { clacpy_(uplo, &m, &n, a, &lda, b, &ldb); diff --git a/include/arnoldi/detail/getv0.hpp b/include/arnoldi/detail/getv0.hpp index aaa712d..d2405a2 100644 --- a/include/arnoldi/detail/getv0.hpp +++ b/include/arnoldi/detail/getv0.hpp @@ -10,9 +10,9 @@ namespace arnoldi::detail { // getv0 — generate a starting vector (shared by sym / nonsym). // Replaces the reverse-communication version: callbacks op/bop are // invoked directly instead of returning ido. - template - void getv0(const char* bmat, int itry, bool initv, int n, int j, Scalar* v, int ldv, Scalar* resid, - detail::real_t& rnorm, Scalar* workd, int& ierr, OP&& op, BOP&& bop, const Comm& comm) { + template + void getv0(detail::BackendRef bref, const char* bmat, int itry, bool initv, int n, int j, Scalar* v, int ldv, + Scalar* resid, detail::real_t& rnorm, Scalar* workd, int& ierr, OP&& op, BOP&& bop, const Comm& comm) { using Real = detail::real_t; static int iseed[4] = {1, 3, 5, 7}; int msglvl = detail::debug.getv0; @@ -22,25 +22,25 @@ namespace arnoldi::detail { ierr = 0; if (!initv) { - detail::Ops::larnv(2, iseed, n, resid); + detail::Ops::larnv(bref, 2, iseed, n, resid); } if (itry == 1) { detail::stats.nopx++; detail::arscnd(t2); - detail::Ops::copy(n, resid, 1, workd, 1); + detail::Ops::copy(bref, n, resid, 1, workd, 1); op(workd, &workd[n]); if (*bmat == 'G') { detail::arscnd(t3); detail::stats.mvopx += (t3 - t2); } - detail::Ops::copy(n, &workd[n], 1, resid, 1); + detail::Ops::copy(bref, n, &workd[n], 1, resid, 1); } else if (itry > 1 && *bmat == 'G') { - detail::Ops::copy(n, resid, 1, &workd[n], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[n], 1); } detail::arscnd(t2); - Real rnorm0 = detail::bnorm(*bmat, n, resid, workd, &workd[n], bop, comm); + Real rnorm0 = detail::bnorm(bref, *bmat, n, resid, workd, &workd[n], bop, comm); if (*bmat == 'G') { detail::arscnd(t3); detail::stats.mvbx += (t3 - t2); @@ -49,13 +49,13 @@ namespace arnoldi::detail { if (j != 1) { for (int iter = 0;; ++iter) { - detail::Ops::gemv(detail::Ops::herm_trans(), n, j - 1, Scalar(1), v, ldv, workd, 1, Scalar(0), &workd[n], - 1); - comm.allreduce_sum(&workd[n], j - 1); - detail::Ops::gemv("N", n, j - 1, Scalar(-1), v, ldv, &workd[n], 1, Scalar(1), resid, 1); + detail::Ops::gemv(bref, detail::Ops::herm_trans(), n, j - 1, Scalar(1), v, ldv, workd, 1, + Scalar(0), &workd[n], 1); + detail::backend_allreduce(bref, comm, &workd[n], j - 1); + detail::Ops::gemv(bref, "N", n, j - 1, Scalar(-1), v, ldv, &workd[n], 1, Scalar(1), resid, 1); detail::arscnd(t2); - rnorm = detail::bnorm(*bmat, n, resid, workd, &workd[n], bop, comm); + rnorm = detail::bnorm(bref, *bmat, n, resid, workd, &workd[n], bop, comm); if (*bmat == 'G') { detail::arscnd(t3); detail::stats.mvbx += (t3 - t2); @@ -65,7 +65,7 @@ namespace arnoldi::detail { rnorm0 = rnorm; if (iter >= 5) { - for (int jj = 0; jj < n; jj++) resid[jj] = Scalar(0); + detail::Ops::zero(bref, n, resid); rnorm = Real(0); ierr = -1; break; @@ -84,6 +84,15 @@ namespace arnoldi::detail { detail::stats.getv0 += (t1 - t0); } + // getv0 without explicit BackendRef — defaults to CpuBackend, for the + // historical public callback API (tests, examples). + template + void getv0(const char* bmat, int itry, bool initv, int n, int j, Scalar* v, int ldv, Scalar* resid, + detail::real_t& rnorm, Scalar* workd, int& ierr, OP&& op, BOP&& bop, const Comm& comm) { + getv0(detail::BackendRef{}, bmat, itry, initv, n, j, v, ldv, resid, rnorm, + workd, ierr, std::forward(op), std::forward(bop), comm); + } + } // namespace arnoldi::detail #endif // ARNOLDI_DETAIL_GETV0_HPP diff --git a/include/arnoldi/detail/nonsym.hpp b/include/arnoldi/detail/nonsym.hpp index a47587b..13f9015 100644 --- a/include/arnoldi/detail/nonsym.hpp +++ b/include/arnoldi/detail/nonsym.hpp @@ -19,9 +19,9 @@ namespace arnoldi::detail { // naitr — Arnoldi iteration (nonsymmetric). - template - void naitr(const char* bmat, int n, int k, int np, int nb, Real* resid, Real& rnorm, Real* v, int ldv, Real* h, int ldh, - Real* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) { + template + void naitr(detail::BackendRef bref, const char* bmat, int n, int k, int np, int nb, Real* resid, Real& rnorm, Real* v, + int ldv, Real* h, int ldh, Real* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) { const int ipj = 0, irj = n, ivj = 2 * n; Real unfl = detail::Ops::lamch("safe minimum"); @@ -56,7 +56,7 @@ namespace arnoldi::detail { int ierr = -1; for (int itry = 1; itry <= 3; ++itry) { - getv0(bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm); + getv0(bref, bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm); if (ierr >= 0) break; } if (ierr < 0) { @@ -67,27 +67,27 @@ namespace arnoldi::detail { } } - detail::Ops::copy(n, resid, 1, &v[(j - 1) * ldv], 1); + detail::Ops::copy(bref, n, resid, 1, &v[(j - 1) * ldv], 1); if (rnorm >= unfl) { Real temp1 = (Real)1 / rnorm; - detail::Ops::scal(n, temp1, &v[(j - 1) * ldv], 1); - detail::Ops::scal(n, temp1, &workd[ipj], 1); + detail::Ops::scal(bref, n, temp1, &v[(j - 1) * ldv], 1); + detail::Ops::scal(bref, n, temp1, &workd[ipj], 1); } else { int ione = 1; int infol; Real done = (Real)1; - detail::Ops::lascl_("General", &ione, &ione, &rnorm, &done, &n, &ione, &v[(j - 1) * ldv], &n, &infol); - detail::Ops::lascl_("General", &ione, &ione, &rnorm, &done, &n, &ione, &workd[ipj], &n, &infol); + detail::Ops::lascl_(bref, "General", &ione, &ione, &rnorm, &done, &n, &ione, &v[(j - 1) * ldv], &n, &infol); + detail::Ops::lascl_(bref, "General", &ione, &ione, &rnorm, &done, &n, &ione, &workd[ipj], &n, &infol); } detail::stats.nopx++; detail::arscnd(t2); - detail::Ops::copy(n, &v[(j - 1) * ldv], 1, &workd[ivj], 1); + detail::Ops::copy(bref, n, &v[(j - 1) * ldv], 1, &workd[ivj], 1); op(&workd[ivj], &workd[irj]); detail::arscnd(t3); detail::stats.mvopx += (t3 - t2); - detail::Ops::copy(n, &workd[irj], 1, resid, 1); + detail::Ops::copy(bref, n, &workd[irj], 1, resid, 1); // B-multiply w = OP*v_j (or copy through if bmat='I'). detail::arscnd(t2); @@ -95,7 +95,7 @@ namespace arnoldi::detail { detail::stats.nbx++; bop(&workd[irj], &workd[ipj]); } else { - detail::Ops::copy(n, resid, 1, &workd[ipj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[ipj], 1); } if (bmat[0] == 'G') { detail::arscnd(t3); @@ -104,16 +104,16 @@ namespace arnoldi::detail { Real wnorm; if (bmat[0] == 'G') { - wnorm = detail::pdot(comm, n, resid, 1, &workd[ipj], 1); + wnorm = detail::pdot(bref, comm, n, resid, 1, &workd[ipj], 1); wnorm = std::sqrt(std::abs(wnorm)); } else { - wnorm = detail::pnrm2_real(comm, n, resid, 1); + wnorm = detail::pnrm2_real(bref, comm, n, resid, 1); } // h(:,j) = V' * B*w ; resid -= V * h(:,j) - detail::Ops::gemv("T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &h[(j - 1) * ldh], 1); - comm.allreduce_sum(&h[(j - 1) * ldh], j); - detail::Ops::gemv("N", n, j, (Real)-1, v, ldv, &h[(j - 1) * ldh], 1, (Real)1, resid, 1); + detail::Ops::gemv(bref, "T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &h[(j - 1) * ldh], 1); + detail::backend_allreduce(bref, comm, &h[(j - 1) * ldh], j); + detail::Ops::gemv(bref, "N", n, j, (Real)-1, v, ldv, &h[(j - 1) * ldh], 1, (Real)1, resid, 1); if (j > 1) h[(j - 2) * ldh + (j - 1)] = betaj; @@ -122,10 +122,10 @@ namespace arnoldi::detail { detail::arscnd(t2); if (bmat[0] == 'G') { detail::stats.nbx++; - detail::Ops::copy(n, resid, 1, &workd[irj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[irj], 1); bop(&workd[irj], &workd[ipj]); } else { - detail::Ops::copy(n, resid, 1, &workd[ipj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[ipj], 1); } if (bmat[0] == 'G') { detail::arscnd(t3); @@ -133,10 +133,10 @@ namespace arnoldi::detail { } if (bmat[0] == 'G') { - rnorm = detail::pdot(comm, n, resid, 1, &workd[ipj], 1); + rnorm = detail::pdot(bref, comm, n, resid, 1, &workd[ipj], 1); rnorm = std::sqrt(std::abs(rnorm)); } else { - rnorm = detail::pnrm2_real(comm, n, resid, 1); + rnorm = detail::pnrm2_real(bref, comm, n, resid, 1); } if (rnorm <= 0.717 * wnorm) { @@ -148,19 +148,19 @@ namespace arnoldi::detail { detail::debug.vout(j, &h[(j - 1) * ldh], "_naitr: j-th column of H"); } - detail::Ops::gemv("T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &workd[irj], 1); - comm.allreduce_sum(&workd[irj], j); - detail::Ops::gemv("N", n, j, (Real)-1, v, ldv, &workd[irj], 1, (Real)1, resid, 1); - detail::Ops::axpy(j, (Real)1, &workd[irj], 1, &h[(j - 1) * ldh], 1); + detail::Ops::gemv(bref, "T", n, j, (Real)1, v, ldv, &workd[ipj], 1, (Real)0, &workd[irj], 1); + detail::backend_allreduce(bref, comm, &workd[irj], j); + detail::Ops::gemv(bref, "N", n, j, (Real)-1, v, ldv, &workd[irj], 1, (Real)1, resid, 1); + detail::Ops::axpy(bref, j, (Real)1, &workd[irj], 1, &h[(j - 1) * ldh], 1); detail::arscnd(t2); Real rnorm1; if (bmat[0] == 'G') { detail::stats.nbx++; - detail::Ops::copy(n, resid, 1, &workd[irj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[irj], 1); bop(&workd[irj], &workd[ipj]); } else { - detail::Ops::copy(n, resid, 1, &workd[ipj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[ipj], 1); } if (bmat[0] == 'G') { detail::arscnd(t3); @@ -168,10 +168,10 @@ namespace arnoldi::detail { } if (bmat[0] == 'G') { - rnorm1 = detail::pdot(comm, n, resid, 1, &workd[ipj], 1); + rnorm1 = detail::pdot(bref, comm, n, resid, 1, &workd[ipj], 1); rnorm1 = std::sqrt(std::abs(rnorm1)); } else { - rnorm1 = detail::pnrm2_real(comm, n, resid, 1); + rnorm1 = detail::pnrm2_real(bref, comm, n, resid, 1); } if (msglvl > 0 && iter > 0) { @@ -190,7 +190,7 @@ namespace arnoldi::detail { detail::stats.nitref++; rnorm = rnorm1; if (iter == 1) { - for (int jj = 0; jj < n; jj++) resid[jj] = (Real)0; + detail::Ops::zero(bref, n, resid); rnorm = (Real)0; } } @@ -221,9 +221,9 @@ namespace arnoldi::detail { // napps — apply implicit shifts to nonsymmetric Arnoldi factorisation. // Ported from ARPACK dnapps.f (goto-free version). - template - void napps(int n, int& kev, int np, Real* shiftr, Real* shifti, Real* v, int ldv, Real* h, int ldh, Real* resid, Real* q, - int ldq, Real* workl, Real* workd) { + template + void napps(detail::BackendRef bref, int n, int& kev, int np, Real* shiftr, Real* shifti, Real* v, int ldv, Real* h, + int ldh, Real* resid, Real* q, int ldq, Real* workl, Real* workd) { Real unfl = detail::Ops::lamch("safe minimum"); Real ovfl = Real(1) / unfl; detail::Ops::labad(unfl, ovfl); @@ -394,19 +394,20 @@ namespace arnoldi::detail { } if (h[(kev - 1) * ldh + kev] > Real(0)) - detail::Ops::gemv("N", n, kplusp, Real(1), v, ldv, &q[kev * ldq], 1, Real(0), &workd[n], 1); + detail::Ops::gemv(bref, "N", n, kplusp, Real(1), v, ldv, &q[kev * ldq], 1, Real(0), &workd[n], 1); for (int i = 1; i <= kev; i++) { - detail::Ops::gemv("N", n, kplusp - i + 1, Real(1), v, ldv, &q[(kev - i) * ldq], 1, Real(0), workd, 1); - detail::Ops::copy(n, workd, 1, &v[(kplusp - i) * ldv], 1); + detail::Ops::gemv(bref, "N", n, kplusp - i + 1, Real(1), v, ldv, &q[(kev - i) * ldq], 1, Real(0), workd, 1); + detail::Ops::copy(bref, n, workd, 1, &v[(kplusp - i) * ldv], 1); } - detail::Ops::lacpy("A", n, kev, &v[(kplusp - kev) * ldv], ldv, v, ldv); + detail::Ops::lacpy("A", n, kev, &v[(kplusp - kev) * ldv], ldv, v, ldv); - if (h[(kev - 1) * ldh + kev] > Real(0)) detail::Ops::copy(n, &workd[n], 1, &v[kev * ldv], 1); + if (h[(kev - 1) * ldh + kev] > Real(0)) detail::Ops::copy(bref, n, &workd[n], 1, &v[kev * ldv], 1); - detail::Ops::scal(n, q[(kev - 1) * ldq + (kplusp - 1)], resid, 1); - if (h[(kev - 1) * ldh + kev] > Real(0)) detail::Ops::axpy(n, h[(kev - 1) * ldh + kev], &v[kev * ldv], 1, resid, 1); + detail::Ops::scal(bref, n, q[(kev - 1) * ldq + (kplusp - 1)], resid, 1); + if (h[(kev - 1) * ldh + kev] > Real(0)) + detail::Ops::axpy(bref, n, h[(kev - 1) * ldh + kev], &v[kev * ldv], 1, resid, 1); if (msglvl > 1) { detail::debug.vout(1, &q[(kev - 1) * ldq + (kplusp - 1)], "_napps: sigmak = (e_{kev+p}^T*Q)*e_{kev}"); @@ -420,10 +421,11 @@ namespace arnoldi::detail { } // naup2 — main Arnoldi iteration driver (nonsymmetric). - template - void naup2(const char* bmat, int n, const char* which, int& nev, int& np, Real tol, Real* resid, int mode, int iupd, int ishift, - int& mxiter, Real* v, int ldv, Real* h, int ldh, Real* ritzr, Real* ritzi, Real* bounds, Real* q, int ldq, - Real* workl, Real* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) { + template + void naup2(detail::BackendRef bref, const char* bmat, int n, const char* which, int& nev, int& np, Real tol, + Real* resid, int mode, int iupd, int ishift, int& mxiter, Real* v, int ldv, Real* h, int ldh, Real* ritzr, + Real* ritzi, Real* bounds, Real* q, int ldq, Real* workl, Real* workd, int& info, OP&& op, BOP&& bop, + const Comm& comm) { double t0, t1, t2, t3; detail::arscnd(t0); int msglvl = detail::debug.aup2; @@ -448,14 +450,14 @@ namespace arnoldi::detail { detail::stats.aup2 = t1 - t0; }; - getv0(bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm); + getv0(bref, bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm); if (rnorm == (Real)0) { info = -9; end_naup2(); return; } - naitr(bmat, n, 0, nev, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); + naitr(bref, bmat, n, 0, nev, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); if (info > 0) { np = info; mxiter = iter; @@ -477,7 +479,7 @@ namespace arnoldi::detail { detail::debug.ivout(1, &np, "_naup2: Extend the Arnoldi factorization by"); } - naitr(bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); + naitr(bref, bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); if (info > 0) { np = info; mxiter = iter; @@ -604,19 +606,19 @@ namespace arnoldi::detail { } } - napps(n, nev, np, ritzr, ritzi, v, ldv, h, ldh, resid, q, ldq, workl, workd); + napps(bref, n, nev, np, ritzr, ritzi, v, ldv, h, ldh, resid, q, ldq, workl, workd); // B-norm of residual after compression. detail::arscnd(t2); if (bmat[0] == 'G') { detail::stats.nbx++; - detail::Ops::copy(n, resid, 1, &workd[n], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[n], 1); bop(&workd[n], workd); - rnorm = detail::pdot(comm, n, resid, 1, workd, 1); + rnorm = detail::pdot(bref, comm, n, resid, 1, workd, 1); rnorm = std::sqrt(std::abs(rnorm)); } else { - detail::Ops::copy(n, resid, 1, workd, 1); - rnorm = detail::pnrm2_real(comm, n, resid, 1); + detail::Ops::copy(bref, n, resid, 1, workd, 1); + rnorm = detail::pnrm2_real(bref, comm, n, resid, 1); } if (bmat[0] == 'G') { detail::arscnd(t3); @@ -635,9 +637,10 @@ namespace arnoldi::detail { } // naupd — nonsymmetric eigensolver entry point (callback version). - template - void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam, - int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, BOP&& bop, const Comm& comm) { + template + void naupd(detail::BackendRef bref, const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, + int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, + BOP&& bop, const Comm& comm) { detail::stats.reset(); double t0, t1; detail::arscnd(t0); @@ -702,8 +705,8 @@ namespace arnoldi::detail { ipntr[7] = bounds; ipntr[13] = iw; - naup2(bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh, &workl[ritzr], - &workl[ritzi], &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm); + naup2(bref, bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh, + &workl[ritzr], &workl[ritzi], &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm); iparam[2] = mxiter; iparam[4] = np; @@ -751,20 +754,43 @@ namespace arnoldi::detail { } // op + bop, no comm (defaults to SerialComm). + template + void naupd(detail::BackendRef bref, const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, + int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, + BOP&& bop) { + naupd(bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, + std::forward(op), std::forward(bop), SerialComm{}); + } + + // Standard problem (bmat='I'), no bop, no comm. + template + void naupd(detail::BackendRef bref, const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, + int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op) { + naupd( + bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward(op), + [](const Real*, Real*) {}, SerialComm{}); + } + + // --------------------------------------------------------------------------- + // naupd no-BackendRef overloads — defaults to CpuBackend for the historical + // public callback API (tests, examples). + template + void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam, + int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, BOP&& bop, const Comm& comm) { + naupd(detail::BackendRef{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, + ipntr, workd, workl, lworkl, info, std::forward(op), std::forward(bop), comm); + } template void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op, BOP&& bop) { - naupd(bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward(op), - std::forward(bop), SerialComm{}); + naupd(detail::BackendRef{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, + ipntr, workd, workl, lworkl, info, std::forward(op), std::forward(bop)); } - - // Standard problem (bmat='I'), no bop, no comm. template void naupd(const char* bmat, int n, const char* which, int nev, Real& tol, Real* resid, int ncv, Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info, OP&& op) { - naupd( - bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward(op), - [](const Real*, Real*) {}, SerialComm{}); + naupd(detail::BackendRef{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, + ipntr, workd, workl, lworkl, info, std::forward(op)); } // neupd — nonsymmetric eigenvector extraction (callback version). @@ -772,10 +798,10 @@ namespace arnoldi::detail { // Reads the Hessenberg factorization and Ritz data left by cb::naupd in // workl/ipntr/iparam (cb::naupd stores 0-based offsets in ipntr[4..7,13]) // and computes eigenvalues (dr,di) and optionally eigenvectors z. - template - void neupd(bool rvec, const char* howmny, Real* dr, Real* di, Real* z, int ldz, Real sigmar, Real sigmai, Real* workev, - const char* bmat, int n, const char* which, int nev, Real tol, Real* resid, int ncv, Real* v, int ldv, int* iparam, - int* ipntr, Real* workd, Real* workl, int lworkl, int& info) { + template + void neupd(detail::BackendRef bref, bool rvec, const char* howmny, Real* dr, Real* di, Real* z, int ldz, Real sigmar, + Real sigmai, Real* workev, const char* bmat, int n, const char* which, int nev, Real tol, Real* resid, int ncv, + Real* v, int ldv, int* iparam, int* ipntr, Real* workd, Real* workl, int lworkl, int& info) { int msglvl = detail::debug.eupd; int mode = iparam[6]; int nconv = iparam[4]; @@ -957,9 +983,9 @@ namespace arnoldi::detail { detail::Ops::geqr2_(&ncv, &nconv, &workl[invsub], &ldq, workev, &workev[ncv], &ierr); - detail::Ops::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, v, &ldv, &workd[n], - &ierr); - detail::Ops::lacpy("All", n, nconv, v, ldv, z, ldz); + detail::Ops::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, v, &ldv, + &workd[n], &ierr); + detail::Ops::lacpy("All", n, nconv, v, ldv, z, ldz); for (int j = 0; j < nconv; j++) { if (workl[invsub + j * ldq + j] < Real(0)) { @@ -1026,13 +1052,13 @@ namespace arnoldi::detail { detail::Ops::geqr2_(&ncv, &nconv, &workl[invsub], &ldq, workev, &workev[ncv], &ierr); - detail::Ops::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, z, &ldz, &workd[n], - &ierr); + detail::Ops::orm2r_("Right", "Notranspose", &n, &ncv, &nconv, &workl[invsub], &ldq, workev, z, &ldz, + &workd[n], &ierr); { Real done = Real(1); - detail::Ops::trmm_("Right", "Upper", "No transpose", "Non-unit", &n, &nconv, &done, &workl[invsub], &ldq, z, - &ldz); + detail::Ops::trmm_("Right", "Upper", "No transpose", "Non-unit", &n, &nconv, &done, &workl[invsub], &ldq, + z, &ldz); } } @@ -1100,10 +1126,20 @@ namespace arnoldi::detail { iconj = 0; } } - detail::Ops::ger(n, nconv, Real(1), resid, 1, workev, 1, z, ldz); + detail::Ops::ger(n, nconv, Real(1), resid, 1, workev, 1, z, ldz); } } + // neupd no-BackendRef overload — defaults to CpuBackend for the historical + // public callback API (tests, examples). + template + void neupd(bool rvec, const char* howmny, Real* dr, Real* di, Real* z, int ldz, Real sigmar, Real sigmai, Real* workev, + const char* bmat, int n, const char* which, int nev, Real tol, Real* resid, int ncv, Real* v, int ldv, int* iparam, + int* ipntr, Real* workd, Real* workl, int lworkl, int& info) { + neupd(detail::BackendRef{}, rvec, howmny, dr, di, z, ldz, sigmar, sigmai, workev, + bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info); + } + } // namespace arnoldi::detail #endif // ARNOLDI_DETAIL_NONSYM_HPP diff --git a/include/arnoldi/detail/ops.hpp b/include/arnoldi/detail/ops.hpp index af3fbd1..957d9f1 100644 --- a/include/arnoldi/detail/ops.hpp +++ b/include/arnoldi/detail/ops.hpp @@ -9,11 +9,13 @@ // compile if instantiated for complex — which is correct since the Arnoldi // algorithms only call them on real scalars. +#include #include #include #include #include #include +#include #include #include #include @@ -31,7 +33,40 @@ namespace arnoldi::detail { template using real_t = typename real_type::type; - template + // Non-owning view threaded through the deep algorithmic templates, mirroring + // the way `const Comm&` is passed today. For CpuBackend it carries no state; + // device backends specialize this to hold the cuBLAS handle, stream, etc. + template + struct BackendRef {}; + + // Default backend: all length-n BLAS calls hit host-side BLAS via blas_bindings. + // GPU backends (e.g. CudaBackend in arnoldi/cuda.hpp) partially specialize + // Ops to route those calls to device libraries instead. + struct CpuBackend { + BackendRef ref() const noexcept { return {}; } + }; + + // Selects the owning buffer type for Arnoldi's length-n workspace + // (resid_, v_, workd_) under each Backend, and provides static helpers + // for transferring data between host pointers and the buffer. Primary + // template is the CPU path; device backends specialize this (see + // cuda.hpp). Member access is dependent on Backend, so the right + // specialization is selected at instantiation. + template + struct buffer_traits { + using vector_type = std::vector; + + // Host pointer -> host vector. Trivial std::copy. + static void copy_from_host(vector_type& dst, const T* src, std::size_t n) { + std::copy(src, src + n, dst.data()); + } + + // copy_from_device / copy_to_host are not provided for CpuBackend: + // arnoldi::Arnoldi only invokes them on non-CpuBackend (initial_resid_device + // is SFINAE'd out, extract_'s else branch is `if constexpr`-discarded). + }; + + template struct Ops { using Real = real_t; static constexpr bool is_complex = !std::is_same_v; @@ -55,7 +90,31 @@ namespace arnoldi::detail { return slapy2(x, y); } - static void copy(int n, const Scalar* x, int incx, Scalar* y, int incy) { + // Fill a length-n buffer with Scalar(0). CPU: trivial loop. Device backends + // specialize this to a single asynchronous memset. + static void zero([[maybe_unused]] BackendRef bref, int n, Scalar* x) { + for (int i = 0; i < n; ++i) x[i] = Scalar(0); + } + + // Read a single Scalar from a buffer that may be device-resident. + // CPU: deref. Device backends specialize to cudaMemcpy + sync. + static Scalar read_scalar([[maybe_unused]] BackendRef bref, const Scalar* p) { return *p; } + + // Stage a single Scalar from a (possibly device-resident) buffer into a + // host slot WITHOUT synchronizing. Callers must Ops::sync() before + // reading host_dst. CPU: an immediate copy (sync() is then a no-op), so + // numerics and ordering are bit-identical to a direct read_scalar. + // Device backends specialize this to an async cudaMemcpy so per-step + // device->host stalls collapse into one sync per call site batch. + static void read_scalar_async([[maybe_unused]] BackendRef bref, Scalar* host_dst, const Scalar* src) { + *host_dst = *src; + } + + // Drain the backend stream so all prior read_scalar_async stages have + // landed in host memory. CPU: no-op. + static void sync([[maybe_unused]] BackendRef bref) {} + + static void copy([[maybe_unused]] BackendRef bref, int n, const Scalar* x, int incx, Scalar* y, int incy) { if constexpr (std::is_same_v) dcopy(n, x, incx, y, incy); else if constexpr (std::is_same_v) @@ -65,7 +124,7 @@ namespace arnoldi::detail { else ccopy(n, x, incx, y, incy); } - static void scal(int n, Scalar a, Scalar* x, int incx) { + static void scal([[maybe_unused]] BackendRef bref, int n, Scalar a, Scalar* x, int incx) { if constexpr (std::is_same_v) dscal(n, a, x, incx); else if constexpr (std::is_same_v) @@ -75,7 +134,7 @@ namespace arnoldi::detail { else cscal(n, a, x, incx); } - static void axpy(int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) { + static void axpy([[maybe_unused]] BackendRef bref, int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) { if constexpr (std::is_same_v) daxpy(n, a, x, incx, y, incy); else if constexpr (std::is_same_v) @@ -85,7 +144,7 @@ namespace arnoldi::detail { else caxpy(n, a, x, incx, y, incy); } - static Real nrm2(int n, const Scalar* x, int incx) { + static Real nrm2([[maybe_unused]] BackendRef bref, int n, const Scalar* x, int incx) { if constexpr (std::is_same_v) return dnrm2(n, x, incx); else if constexpr (std::is_same_v) @@ -95,7 +154,7 @@ namespace arnoldi::detail { else return scnrm2(n, x, incx); } - static void larnv(int idist, int* iseed, int n, Scalar* x) { + static void larnv([[maybe_unused]] BackendRef bref, int idist, int* iseed, int n, Scalar* x) { if constexpr (std::is_same_v) dlarnv(idist, iseed, n, x); else if constexpr (std::is_same_v) @@ -107,22 +166,22 @@ namespace arnoldi::detail { } // Scale complex vector by real factor; for real types collapses to scal(). - static void rscal(int n, Real a, Scalar* x, int incx) { + static void rscal(BackendRef bref, int n, Real a, Scalar* x, int incx) { if constexpr (!is_complex) - scal(n, a, x, incx); + scal(bref, n, a, x, incx); else if constexpr (std::is_same_v) zdscal(n, a, x, incx); else csscal(n, a, x, incx); } - static void raxpy(int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) { + static void raxpy(BackendRef bref, int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) { if constexpr (!is_complex) - axpy(n, a, x, incx, y, incy); + axpy(bref, n, a, x, incx, y, incy); else - axpy(n, Scalar(a, 0), x, incx, y, incy); + axpy(bref, n, Scalar(a, 0), x, incx, y, incy); } - static Scalar dot(int n, const Scalar* x, int incx, const Scalar* y, int incy) { + static Scalar dot([[maybe_unused]] BackendRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) { if constexpr (std::is_same_v) return ddot(n, x, incx, y, incy); else if constexpr (std::is_same_v) @@ -135,7 +194,7 @@ namespace arnoldi::detail { sswap(n, x, incx, y, incy); } - static Scalar dotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) { + static Scalar dotc([[maybe_unused]] BackendRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) { if constexpr (std::is_same_v) return ddot(n, x, incx, y, incy); else if constexpr (std::is_same_v) @@ -150,8 +209,8 @@ namespace arnoldi::detail { return r; } } - static Real rdotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) { - return std::real(dotc(n, x, incx, y, incy)); + static Real rdotc(BackendRef bref, int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return std::real(dotc(bref, n, x, incx, y, incy)); } static const char* herm_trans() { @@ -161,8 +220,8 @@ namespace arnoldi::detail { return "T"; } - static void gemv(const char* trans, int m, int n, Scalar alpha, const Scalar* a, int lda, const Scalar* x, int incx, - Scalar beta, Scalar* y, int incy) { + static void gemv([[maybe_unused]] BackendRef bref, const char* trans, int m, int n, Scalar alpha, const Scalar* a, + int lda, const Scalar* x, int incx, Scalar beta, Scalar* y, int incy) { if constexpr (std::is_same_v) dgemv(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); else if constexpr (std::is_same_v) @@ -173,19 +232,77 @@ namespace arnoldi::detail { cgemv(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } + static void gemm([[maybe_unused]] BackendRef bref, const char* transa, const char* transb, int m, int n, int k, + Scalar alpha, const Scalar* a, int lda, const Scalar* b, int ldb, Scalar beta, Scalar* c, int ldc) { + if constexpr (std::is_same_v) + dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + else if constexpr (std::is_same_v) + sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + else if constexpr (std::is_same_v>) + zgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + else + cgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } + // gemv with a real x-vector applied to a possibly-complex matrix. - static void gemv_rv(const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, const Real* x, int incx, - Real beta, Scalar* y, int incy) { + static void gemv_rv(BackendRef bref, const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, + const Real* x, int incx, Real beta, Scalar* y, int incy) { if constexpr (!is_complex) { - gemv(trans, m, ncols, alpha, a, lda, x, incx, beta, y, incy); + gemv(bref, trans, m, ncols, alpha, a, lda, x, incx, beta, y, incy); } else { int xlen = (*trans == 'N' || *trans == 'n') ? ncols : m; std::vector cx(xlen); for (int i = 0; i < xlen; i++) cx[i] = Scalar(x[i * incx], 0); - gemv(trans, m, ncols, Scalar(alpha, 0), a, lda, cx.data(), 1, Scalar(beta, 0), y, incy); + gemv(bref, trans, m, ncols, Scalar(alpha, 0), a, lda, cx.data(), 1, Scalar(beta, 0), y, incy); } } + // ----------------------------------------------------------------------- + // No-BackendRef overloads — for host-only call sites (e.g. eig.hpp's + // seigt / neigh on workl/h/q, and other small host-side BLAS-1/2 calls). + // These forward to the BackendRef-taking version with an empty + // BackendRef{}; under CudaBackend the partial specialization + // does NOT provide these, so an accidental device-side host-style call + // becomes a compile error. + static void zero(int n, Scalar* x) { zero(BackendRef{}, n, x); } + static Scalar read_scalar(const Scalar* p) { return read_scalar(BackendRef{}, p); } + static void copy(int n, const Scalar* x, int incx, Scalar* y, int incy) { copy(BackendRef{}, n, x, incx, y, incy); } + static void scal(int n, Scalar a, Scalar* x, int incx) { scal(BackendRef{}, n, a, x, incx); } + static void axpy(int n, Scalar a, const Scalar* x, int incx, Scalar* y, int incy) { + axpy(BackendRef{}, n, a, x, incx, y, incy); + } + static Real nrm2(int n, const Scalar* x, int incx) { return nrm2(BackendRef{}, n, x, incx); } + static void larnv(int idist, int* iseed, int n, Scalar* x) { larnv(BackendRef{}, idist, iseed, n, x); } + static void rscal(int n, Real a, Scalar* x, int incx) { rscal(BackendRef{}, n, a, x, incx); } + static void raxpy(int n, Real a, const Scalar* x, int incx, Scalar* y, int incy) { + raxpy(BackendRef{}, n, a, x, incx, y, incy); + } + static Scalar dot(int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return dot(BackendRef{}, n, x, incx, y, incy); + } + static Scalar dotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return dotc(BackendRef{}, n, x, incx, y, incy); + } + static Real rdotc(int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return rdotc(BackendRef{}, n, x, incx, y, incy); + } + static void gemv(const char* trans, int m, int n, Scalar alpha, const Scalar* a, int lda, const Scalar* x, int incx, + Scalar beta, Scalar* y, int incy) { + gemv(BackendRef{}, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + static void gemm(const char* transa, const char* transb, int m, int n, int k, Scalar alpha, const Scalar* a, int lda, + const Scalar* b, int ldb, Scalar beta, Scalar* c, int ldc) { + gemm(BackendRef{}, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } + static void gemv_rv(const char* trans, int m, int ncols, Real alpha, const Scalar* a, int lda, const Real* x, int incx, + Real beta, Scalar* y, int incy) { + gemv_rv(BackendRef{}, trans, m, ncols, alpha, a, lda, x, incx, beta, y, incy); + } + static void lascl_(const char* type, const int* kl, const int* ku, const Real* cfrom, const Real* cto, const int* m, + const int* n, Scalar* a, const int* lda, int* info) { + lascl_(BackendRef{}, type, kl, ku, cfrom, cto, m, n, a, lda, info); + } + static void ger(int m, int n, Scalar alpha, const Scalar* x, int incx, const Scalar* y, int incy, Scalar* a, int lda) { if constexpr (std::is_same_v) dger(m, n, alpha, x, incx, y, incy, a, lda); @@ -209,8 +326,8 @@ namespace arnoldi::detail { else clacpy(uplo, m, n, a, lda, b, ldb); } - static void lascl_(const char* type, const int* kl, const int* ku, const Real* cfrom, const Real* cto, const int* m, - const int* n, Scalar* a, const int* lda, int* info) { + static void lascl_([[maybe_unused]] BackendRef bref, const char* type, const int* kl, const int* ku, const Real* cfrom, + const Real* cto, const int* m, const int* n, Scalar* a, const int* lda, int* info) { if constexpr (std::is_same_v) ::dlascl_(type, kl, ku, cfrom, cto, m, n, a, lda, info); else if constexpr (std::is_same_v) @@ -332,45 +449,91 @@ namespace arnoldi::detail { // Distributed reduction helpers (PARPACK-style). // For SerialComm these compile to bare BLAS calls with zero overhead. - template - Real pdot(const Comm& c, int n, const Real* x, int incx, const Real* y, int incy) { - return c.allreduce_sum(Ops::dot(n, x, incx, y, incy)); + // Backend defaults to CpuBackend so existing pdot(...) call sites keep + // compiling; algorithmic code that lives under a Backend template forwards + // it explicitly: pdot(bref, ...). + template + Real pdot(BackendRef bref, const Comm& c, int n, const Real* x, int incx, const Real* y, int incy) { + return c.allreduce_sum(Ops::dot(bref, n, x, incx, y, incy)); } - template - real_t prdotc(const Comm& c, int n, const Scalar* x, int incx, const Scalar* y, int incy) { - return c.allreduce_sum(Ops::rdotc(n, x, incx, y, incy)); + template + real_t prdotc(BackendRef bref, const Comm& c, int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return c.allreduce_sum(Ops::rdotc(bref, n, x, incx, y, incy)); } - template - Real pnrm2_real(const Comm& c, int n, const Real* x, int incx) { - Real s = c.allreduce_sum(Ops::dot(n, x, incx, x, incx)); + template + Real pnrm2_real(BackendRef bref, const Comm& c, int n, const Real* x, int incx) { + Real s = c.allreduce_sum(Ops::dot(bref, n, x, incx, x, incx)); return std::sqrt(std::abs(s)); } - template - real_t pnrm2(const Comm& c, int n, const Scalar* x, int incx) { - auto s = c.allreduce_sum(Ops::rdotc(n, x, incx, x, incx)); + template + real_t pnrm2(BackendRef bref, const Comm& c, int n, const Scalar* x, int incx) { + auto s = c.allreduce_sum(Ops::rdotc(bref, n, x, incx, x, incx)); return std::sqrt(std::abs(s)); } // B-norm helper: compute rnorm = B-norm(resid) using workd as scratch. // For bmat='G': bop(resid, workd[0..]), rnorm = sqrt(|resid . workd|). // For bmat='I': rnorm = nrm2(resid). - template - real_t bnorm(const char bmat, int n, Scalar* resid, Scalar* workd_lo, Scalar* workd_hi, BOP&& bop, const Comm& comm) { + template + real_t bnorm(BackendRef bref, const char bmat, int n, Scalar* resid, Scalar* workd_lo, Scalar* workd_hi, + BOP&& bop, const Comm& comm) { using Real = real_t; Real val; if (bmat == 'G') { stats.nbx++; - Ops::copy(n, resid, 1, workd_hi, 1); + Ops::copy(bref, n, resid, 1, workd_hi, 1); bop(workd_hi, workd_lo); - val = prdotc(comm, n, resid, 1, workd_lo, 1); + val = prdotc(bref, comm, n, resid, 1, workd_lo, 1); val = std::sqrt(std::abs(val)); } else { - Ops::copy(n, resid, 1, workd_lo, 1); - val = pnrm2(comm, n, resid, 1); + Ops::copy(bref, n, resid, 1, workd_lo, 1); + val = pnrm2(bref, comm, n, resid, 1); } return val; } + // --------------------------------------------------------------------------- + // No-BackendRef overloads for the reduction helpers — defaults to CpuBackend + // for the historical public API (tests, examples). + template + Real pdot(const Comm& c, int n, const Real* x, int incx, const Real* y, int incy) { + return pdot(BackendRef{}, c, n, x, incx, y, incy); + } + template + real_t prdotc(const Comm& c, int n, const Scalar* x, int incx, const Scalar* y, int incy) { + return prdotc(BackendRef{}, c, n, x, incx, y, incy); + } + template + Real pnrm2_real(const Comm& c, int n, const Real* x, int incx) { + return pnrm2_real(BackendRef{}, c, n, x, incx); + } + template + real_t pnrm2(const Comm& c, int n, const Scalar* x, int incx) { + return pnrm2(BackendRef{}, c, n, x, incx); + } + template + real_t bnorm(const char bmat, int n, Scalar* resid, Scalar* workd_lo, Scalar* workd_hi, BOP&& bop, const Comm& comm) { + return bnorm(BackendRef{}, bmat, n, resid, workd_lo, workd_hi, std::forward(bop), comm); + } + + // --------------------------------------------------------------------------- + // Backend-aware extension points. Primary versions are CPU; the CUDA + // backend overrides them via additional overloads declared in cuda.hpp. + // Algorithmic templates call these by name; ADL + overload resolution + // picks the right one based on the BackendRef argument. + // --------------------------------------------------------------------------- + + // In-place allreduce on a length-`len` buffer. Under CpuBackend the buffer + // is host-resident, so we forward to comm.allreduce_sum directly. The CUDA + // backend provides an overload that stages D->H, allreduces on a host + // scratch buffer, then copies H->D. Length is always small (<= ncv) — the + // staging cost is irrelevant compared to the matvec. + template + void backend_allreduce(BackendRef, const Comm& comm, Scalar* buf, int len) { + comm.allreduce_sum(buf, len); + } + + } // namespace arnoldi::detail #endif // ARNOLDI_DETAIL_OPS_HPP diff --git a/include/arnoldi/detail/sym.hpp b/include/arnoldi/detail/sym.hpp index 8b88b6b..e3ae2f3 100644 --- a/include/arnoldi/detail/sym.hpp +++ b/include/arnoldi/detail/sym.hpp @@ -21,9 +21,10 @@ namespace arnoldi::detail { // saitr — Lanczos iteration (symmetric / Hermitian). // Scalar = double|float for real symmetric, complex for Hermitian. // Vectors (v, resid, workd) are Scalar; tridiagonal h is detail::real_t. - template - void saitr(const char* bmat, int n, int k, int np, int mode, Scalar* resid, detail::real_t& rnorm, Scalar* v, int ldv, - detail::real_t* h, int ldh, Scalar* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) { + template + void saitr(detail::BackendRef bref, const char* bmat, int n, int k, int np, int mode, Scalar* resid, + detail::real_t& rnorm, Scalar* v, int ldv, detail::real_t* h, int ldh, Scalar* workd, int& info, + OP&& op, BOP&& bop, const Comm& comm) { using Real = detail::real_t; const int ipj = 0, irj = n, ivj = 2 * n; @@ -33,6 +34,26 @@ namespace arnoldi::detail { detail::arscnd(t0); info = 0; + // The diagonal of H (h[ldh + j - 1]) is write-only inside this loop and + // is consumed only later by seigt. Stage each per-step device read + // asynchronously and reconstruct the host diagonal with a single stream + // sync per saitr call, instead of one device->host stall per Lanczos + // step. Under CpuBackend read_scalar_async is an immediate copy and + // sync() a no-op, so the staged values and their summation order are + // identical to a direct read — numerically bit-for-bit unchanged. + std::vector hdiag_stage(static_cast(k + np)); + std::vector hreorth_stage(static_cast(k + np) * 2); + std::vector hreorth_cnt(static_cast(k + np), 0); + auto flush_hdiag = [&](int jlast) { + detail::Ops::sync(bref); + for (int jj = k + 1; jj <= jlast; ++jj) { + Real d = std::real(hdiag_stage[jj - 1]); + int cnt = hreorth_cnt[jj - 1]; + for (int r = 0; r < cnt; ++r) d += std::real(hreorth_stage[static_cast(jj - 1) * 2 + r]); + h[ldh + jj - 1] = d; + } + }; + for (int j = k + 1; j <= k + np; ++j) { if (msglvl > 2) { int jj = j; @@ -56,38 +77,40 @@ namespace arnoldi::detail { int ierr = -1; for (int itry = 1; itry <= 3; ++itry) { - getv0(bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm); + getv0(bref, bmat, itry, false, n, j, v, ldv, resid, rnorm, workd, ierr, op, bop, comm); if (ierr >= 0) break; } if (ierr < 0) { info = j - 1; + flush_hdiag(j - 1); detail::arscnd(t1); detail::stats.aitr += (t1 - t0); return; } } - detail::Ops::copy(n, resid, 1, &v[(j - 1) * ldv], 1); + detail::Ops::copy(bref, n, resid, 1, &v[(j - 1) * ldv], 1); if (rnorm >= safmin) { Real temp1 = Real(1) / rnorm; - detail::Ops::rscal(n, temp1, &v[(j - 1) * ldv], 1); - detail::Ops::rscal(n, temp1, &workd[ipj], 1); + detail::Ops::rscal(bref, n, temp1, &v[(j - 1) * ldv], 1); + detail::Ops::rscal(bref, n, temp1, &workd[ipj], 1); } else { int i_zero = 0, i_one = 1; int infol; Real r_one = Real(1); - detail::Ops::lascl_("G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &v[(j - 1) * ldv], &n, &infol); - detail::Ops::lascl_("G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &workd[ipj], &n, &infol); + detail::Ops::lascl_(bref, "G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &v[(j - 1) * ldv], &n, + &infol); + detail::Ops::lascl_(bref, "G", &i_zero, &i_zero, &rnorm, &r_one, &n, &i_one, &workd[ipj], &n, &infol); } detail::stats.nopx++; detail::arscnd(t2); - detail::Ops::copy(n, &v[(j - 1) * ldv], 1, &workd[ivj], 1); + detail::Ops::copy(bref, n, &v[(j - 1) * ldv], 1, &workd[ivj], 1); op(&workd[ivj], &workd[irj]); detail::arscnd(t3); detail::stats.mvopx += (t3 - t2); - detail::Ops::copy(n, &workd[irj], 1, resid, 1); + detail::Ops::copy(bref, n, &workd[irj], 1, resid, 1); if (mode != 2) { detail::arscnd(t2); @@ -95,7 +118,7 @@ namespace arnoldi::detail { detail::stats.nbx++; bop(&workd[irj], &workd[ipj]); } else { - detail::Ops::copy(n, resid, 1, &workd[ipj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[ipj], 1); } if (*bmat == 'G') { detail::arscnd(t3); @@ -105,26 +128,26 @@ namespace arnoldi::detail { Real wnorm; if (mode == 2) { - wnorm = detail::prdotc(comm, n, resid, 1, &workd[ivj], 1); + wnorm = detail::prdotc(bref, comm, n, resid, 1, &workd[ivj], 1); wnorm = std::sqrt(std::abs(wnorm)); } else if (*bmat == 'G') { - wnorm = detail::prdotc(comm, n, resid, 1, &workd[ipj], 1); + wnorm = detail::prdotc(bref, comm, n, resid, 1, &workd[ipj], 1); wnorm = std::sqrt(std::abs(wnorm)); } else { - wnorm = detail::pnrm2(comm, n, resid, 1); + wnorm = detail::pnrm2(bref, comm, n, resid, 1); } if (mode != 2) { - detail::Ops::gemv(detail::Ops::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ipj], 1, Scalar(0), - &workd[irj], 1); + detail::Ops::gemv(bref, detail::Ops::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ipj], + 1, Scalar(0), &workd[irj], 1); } else { - detail::Ops::gemv(detail::Ops::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ivj], 1, Scalar(0), - &workd[irj], 1); + detail::Ops::gemv(bref, detail::Ops::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ivj], + 1, Scalar(0), &workd[irj], 1); } - comm.allreduce_sum(&workd[irj], j); - detail::Ops::gemv("N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1); + detail::backend_allreduce(bref, comm, &workd[irj], j); + detail::Ops::gemv(bref, "N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1); - h[ldh + j - 1] = std::real(workd[irj + j - 1]); + detail::Ops::read_scalar_async(bref, &hdiag_stage[j - 1], &workd[irj + j - 1]); if (j == 1 || rstart) h[j - 1] = Real(0); else @@ -135,10 +158,10 @@ namespace arnoldi::detail { detail::arscnd(t2); if (*bmat == 'G') { detail::stats.nbx++; - detail::Ops::copy(n, resid, 1, &workd[irj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[irj], 1); bop(&workd[irj], &workd[ipj]); } else { - detail::Ops::copy(n, resid, 1, &workd[ipj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[ipj], 1); } if (*bmat == 'G') { detail::arscnd(t3); @@ -146,10 +169,10 @@ namespace arnoldi::detail { } if (*bmat == 'G') { - rnorm = detail::prdotc(comm, n, resid, 1, &workd[ipj], 1); + rnorm = detail::prdotc(bref, comm, n, resid, 1, &workd[ipj], 1); rnorm = std::sqrt(std::abs(rnorm)); } else { - rnorm = detail::pnrm2(comm, n, resid, 1); + rnorm = detail::pnrm2(bref, comm, n, resid, 1); } if (rnorm <= Real(0.717) * wnorm) { @@ -160,22 +183,27 @@ namespace arnoldi::detail { detail::debug.vout(2, xtemp, "_saitr: re-orthonalization ; wnorm and rnorm are"); } - detail::Ops::gemv(detail::Ops::herm_trans(), n, j, Scalar(1), v, ldv, &workd[ipj], 1, Scalar(0), - &workd[irj], 1); - comm.allreduce_sum(&workd[irj], j); - detail::Ops::gemv("N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1); + detail::Ops::gemv(bref, detail::Ops::herm_trans(), n, j, Scalar(1), v, ldv, + &workd[ipj], 1, Scalar(0), &workd[irj], 1); + detail::backend_allreduce(bref, comm, &workd[irj], j); + detail::Ops::gemv(bref, "N", n, j, Scalar(-1), v, ldv, &workd[irj], 1, Scalar(1), resid, 1); if (j == 1 || rstart) h[j - 1] = Real(0); - h[ldh + j - 1] = h[ldh + j - 1] + std::real(workd[irj + j - 1]); + { + int& rc = hreorth_cnt[j - 1]; + detail::Ops::read_scalar_async(bref, &hreorth_stage[static_cast(j - 1) * 2 + rc], + &workd[irj + j - 1]); + ++rc; + } detail::arscnd(t2); Real rnorm1; if (*bmat == 'G') { detail::stats.nbx++; - detail::Ops::copy(n, resid, 1, &workd[irj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[irj], 1); bop(&workd[irj], &workd[ipj]); } else { - detail::Ops::copy(n, resid, 1, &workd[ipj], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[ipj], 1); } if (*bmat == 'G') { detail::arscnd(t3); @@ -183,10 +211,10 @@ namespace arnoldi::detail { } if (*bmat == 'G') { - rnorm1 = detail::prdotc(comm, n, resid, 1, &workd[ipj], 1); + rnorm1 = detail::prdotc(bref, comm, n, resid, 1, &workd[ipj], 1); rnorm1 = std::sqrt(std::abs(rnorm1)); } else { - rnorm1 = detail::pnrm2(comm, n, resid, 1); + rnorm1 = detail::pnrm2(bref, comm, n, resid, 1); } if (msglvl > 0 && riter > 0) { @@ -205,7 +233,7 @@ namespace arnoldi::detail { detail::stats.nitref++; rnorm = rnorm1; if (riter == 1) { - for (int jj = 0; jj < n; jj++) resid[jj] = Scalar(0); + detail::Ops::zero(bref, n, resid); rnorm = Real(0); } } @@ -218,12 +246,14 @@ namespace arnoldi::detail { if (h[j - 1] < Real(0)) { h[j - 1] = -h[j - 1]; if (j < k + np) - detail::Ops::rscal(n, Real(-1), &v[j * ldv], 1); + detail::Ops::rscal(bref, n, Real(-1), &v[j * ldv], 1); else - detail::Ops::rscal(n, Real(-1), resid, 1); + detail::Ops::rscal(bref, n, Real(-1), resid, 1); } } + flush_hdiag(k + np); + detail::arscnd(t1); detail::stats.aitr += (t1 - t0); @@ -238,9 +268,9 @@ namespace arnoldi::detail { // sapps — implicit QR shifts for symmetric/Hermitian Lanczos. // Scalar-aware: Givens rotations on Real h/q, final V update on Scalar. - template - void sapps(int n, int kev, int np, detail::real_t* shift, Scalar* v, int ldv, detail::real_t* h, int ldh, - Scalar* resid, detail::real_t* q, int ldq, Scalar* workd) { + template + void sapps(detail::BackendRef bref, int n, int kev, int np, detail::real_t* shift, Scalar* v, int ldv, + detail::real_t* h, int ldh, Scalar* resid, detail::real_t* q, int ldq, Scalar* workd) { using Real = detail::real_t; const Real one = Real(1), zero = Real(0); @@ -355,21 +385,22 @@ namespace arnoldi::detail { } // V_new = V * Q and resid update. - if (h[kev] > zero) detail::Ops::gemv_rv("N", n, kplusp, one, v, ldv, &q[kev * ldq], 1, zero, &workd[n], 1); + if (h[kev] > zero) + detail::Ops::gemv_rv(bref, "N", n, kplusp, one, v, ldv, &q[kev * ldq], 1, zero, &workd[n], 1); for (i = 1; i <= kev; i++) { - detail::Ops::gemv_rv("N", n, kplusp - i + 1, one, v, ldv, &q[(kev - i) * ldq], 1, zero, workd, 1); - detail::Ops::copy(n, workd, 1, &v[(kplusp - i) * ldv], 1); + detail::Ops::gemv_rv(bref, "N", n, kplusp - i + 1, one, v, ldv, &q[(kev - i) * ldq], 1, zero, workd, 1); + detail::Ops::copy(bref, n, workd, 1, &v[(kplusp - i) * ldv], 1); } for (i = 1; i <= kev; i++) { - detail::Ops::copy(n, &v[(np + i - 1) * ldv], 1, &v[(i - 1) * ldv], 1); + detail::Ops::copy(bref, n, &v[(np + i - 1) * ldv], 1, &v[(i - 1) * ldv], 1); } - if (h[kev] > zero) detail::Ops::copy(n, &workd[n], 1, &v[kev * ldv], 1); + if (h[kev] > zero) detail::Ops::copy(bref, n, &workd[n], 1, &v[kev * ldv], 1); - detail::Ops::rscal(n, q[(kev - 1) * ldq + kplusp - 1], resid, 1); - if (h[kev] > zero) detail::Ops::raxpy(n, h[kev], &v[kev * ldv], 1, resid, 1); + detail::Ops::rscal(bref, n, q[(kev - 1) * ldq + kplusp - 1], resid, 1); + if (h[kev] > zero) detail::Ops::raxpy(bref, n, h[kev], &v[kev * ldv], 1, resid, 1); if (msglvl > 1) { detail::debug.vout(1, &q[(kev - 1) * ldq + kplusp - 1], "_sapps: sigmak of the updated residual vector"); @@ -385,11 +416,12 @@ namespace arnoldi::detail { } // saup2 — main Lanczos iteration driver (symmetric / Hermitian). - template - void saup2(const char* bmat, int n, const char* which, int& nev, int& np, detail::real_t tol, Scalar* resid, int mode, - int iupd, int ishift, int& mxiter, Scalar* v, int ldv, detail::real_t* h, int ldh, - detail::real_t* ritz, detail::real_t* bounds, detail::real_t* q, int ldq, - detail::real_t* workl, Scalar* workd, int& info, OP&& op, BOP&& bop, const Comm& comm) { + template + void saup2(detail::BackendRef bref, const char* bmat, int n, const char* which, int& nev, int& np, + detail::real_t tol, Scalar* resid, int mode, int iupd, int ishift, int& mxiter, Scalar* v, int ldv, + detail::real_t* h, int ldh, detail::real_t* ritz, detail::real_t* bounds, + detail::real_t* q, int ldq, detail::real_t* workl, Scalar* workd, int& info, OP&& op, BOP&& bop, + const Comm& comm) { using Real = detail::real_t; double t0, t1, t2, t3; detail::arscnd(t0); @@ -415,14 +447,14 @@ namespace arnoldi::detail { detail::stats.aup2 = t1 - t0; }; - getv0(bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm); + getv0(bref, bmat, 1, initv, n, 1, v, ldv, resid, rnorm, workd, info, op, bop, comm); if (rnorm == Real(0)) { info = -9; end_saup2(); return; } - saitr(bmat, n, 0, nev0, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); + saitr(bref, bmat, n, 0, nev0, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); if (info > 0) { np = info; mxiter = iter; @@ -441,7 +473,7 @@ namespace arnoldi::detail { detail::debug.ivout(1, &np, "_saup2: Extend the Lanczos factorization by"); } - saitr(bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); + saitr(bref, bmat, n, nev, np, mode, resid, rnorm, v, ldv, h, ldh, workd, info, op, bop, comm); if (info > 0) { np = info; mxiter = iter; @@ -563,18 +595,18 @@ namespace arnoldi::detail { } } - sapps(n, nev, np, ritz, v, ldv, h, ldh, resid, q, ldq, workd); + sapps(bref, n, nev, np, ritz, v, ldv, h, ldh, resid, q, ldq, workd); detail::arscnd(t2); if (*bmat == 'G') { detail::stats.nbx++; - detail::Ops::copy(n, resid, 1, &workd[n], 1); + detail::Ops::copy(bref, n, resid, 1, &workd[n], 1); bop(&workd[n], workd); - rnorm = detail::prdotc(comm, n, resid, 1, workd, 1); + rnorm = detail::prdotc(bref, comm, n, resid, 1, workd, 1); rnorm = std::sqrt(std::abs(rnorm)); } else { - detail::Ops::copy(n, resid, 1, workd, 1); - rnorm = detail::pnrm2(comm, n, resid, 1); + detail::Ops::copy(bref, n, resid, 1, workd, 1); + rnorm = detail::pnrm2(bref, comm, n, resid, 1); } if (*bmat == 'G') { detail::arscnd(t3); @@ -598,10 +630,10 @@ namespace arnoldi::detail { // saupd — symmetric/Hermitian eigensolver entry point (callback). // Scalar = double|float for real, complex for Hermitian. // workd is Scalar* (3n), workl is detail::real_t* (ncv^2 + 8*ncv). - template - void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t& tol, Scalar* resid, int ncv, Scalar* v, - int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info, OP&& op, - BOP&& bop, const Comm& comm) { + template + void saupd(detail::BackendRef bref, const char* bmat, int n, const char* which, int nev, detail::real_t& tol, + Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, + int lworkl, int& info, OP&& op, BOP&& bop, const Comm& comm) { using Real = detail::real_t; detail::stats.reset(); double t0, t1; @@ -665,8 +697,8 @@ namespace arnoldi::detail { ipntr[6] = bounds + 1; ipntr[10] = iw + 1; - saup2(bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh, &workl[ritz], - &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm); + saup2(bref, bmat, n, which, nev0, np, tol, resid, mode, iupd, ishift, mxiter, v, ldv, &workl[ih], ldh, + &workl[ritz], &workl[bounds], &workl[iq], ldq, &workl[iw], workd, info, op, bop, comm); iparam[2] = mxiter; iparam[4] = np; @@ -699,30 +731,56 @@ namespace arnoldi::detail { } // op + bop, no comm (defaults to SerialComm). + template + void saupd(detail::BackendRef bref, const char* bmat, int n, const char* which, int nev, detail::real_t& tol, + Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, + int lworkl, int& info, OP&& op, BOP&& bop) { + saupd(bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, + std::forward(op), std::forward(bop), SerialComm{}); + } + + // Standard problem (bmat='I'), no bop, no comm. + template + void saupd(detail::BackendRef bref, const char* bmat, int n, const char* which, int nev, detail::real_t& tol, + Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, + int lworkl, int& info, OP&& op) { + saupd( + bref, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward(op), + [](const Scalar*, Scalar*) {}, SerialComm{}); + } + + // --------------------------------------------------------------------------- + // saupd no-BackendRef overloads — defaults to CpuBackend for the historical + // public callback API (tests, examples). + template + void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t& tol, Scalar* resid, int ncv, Scalar* v, + int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info, OP&& op, + BOP&& bop, const Comm& comm) { + saupd(detail::BackendRef{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, + iparam, ipntr, workd, workl, lworkl, info, std::forward(op), std::forward(bop), + comm); + } template void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t& tol, Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info, OP&& op, BOP&& bop) { - saupd(bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward(op), - std::forward(bop), SerialComm{}); + saupd(detail::BackendRef{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, + iparam, ipntr, workd, workl, lworkl, info, std::forward(op), std::forward(bop)); } - - // Standard problem (bmat='I'), no bop, no comm. template void saupd(const char* bmat, int n, const char* which, int nev, detail::real_t& tol, Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info, OP&& op) { - saupd( - bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, std::forward(op), - [](const Scalar*, Scalar*) {}, SerialComm{}); + saupd(detail::BackendRef{}, bmat, n, which, nev, tol, resid, ncv, v, ldv, + iparam, ipntr, workd, workl, lworkl, info, std::forward(op)); } // seupd — symmetric/Hermitian eigenvector extraction (callback). // d is Real* (eigenvalues), z is Scalar* (eigenvectors). - template - void seupd(bool rvec, const char* howmny, detail::real_t* d, Scalar* z, int ldz, detail::real_t sigma, - const char* bmat, int n, const char* which, int nev, detail::real_t tol, Scalar* resid, int ncv, Scalar* v, - int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info, - const Comm& comm) { + template + void seupd(detail::BackendRef bref, bool rvec, const char* howmny, detail::real_t* d, Scalar* z, int ldz, + detail::real_t sigma, const char* bmat, int n, const char* which, int nev, detail::real_t tol, + Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, + int lworkl, int& info, const Comm& comm) { using Real = detail::real_t; char type[7]; @@ -793,7 +851,7 @@ namespace arnoldi::detail { if (*bmat == 'I') bnorm2 = rnorm; else - bnorm2 = detail::pnrm2(comm, n, workd, 1); + bnorm2 = detail::pnrm2(bref, comm, n, workd, 1); std::vector select(ncv, 0); @@ -890,23 +948,45 @@ namespace arnoldi::detail { } if (rvec && *howmny == 'A') { - // z = V * S (Scalar V times Real eigenvector matrix S). When z - // aliases v, a column-by-column multiply would destroy V columns - // before they are fully consumed; use a temporary in that case. + // z = V * S (Scalar V times Real eigenvector matrix S = workl[iq..]). + // When z aliases v, the GEMM source/destination would overlap; copy V + // into a tmpbuf in that case so the GEMM has a clean source. Scalar* vbuf = v; std::vector tmpbuf; if (z == v) { tmpbuf.resize(static_cast(ldv) * ncv); - detail::Ops::copy(ldv * ncv, v, 1, tmpbuf.data(), 1); + detail::Ops::copy(bref, ldv * ncv, v, 1, tmpbuf.data(), 1); vbuf = tmpbuf.data(); } - for (j = 0; j < nconv; j++) { - for (int ii = 0; ii < n; ii++) z[j * ldz + ii] = Scalar(0); - for (k = 0; k < ncv; k++) { - detail::Ops::raxpy(n, workl[iq + j * ldq + k], &vbuf[k * ldv], 1, &z[j * ldz], 1); - } + + // S is Real, ncv-rows × nconv-cols, column-major, leading dim ldq. + // For real Scalar, S has the right storage type already; for complex + // Scalar, widen S to Scalar (im = 0) on the host first. + std::vector S_host(static_cast(ldq) * nconv); + if constexpr (std::is_same_v) { + for (j = 0; j < ldq * nconv; j++) S_host[j] = workl[iq + j]; + } else { + for (j = 0; j < nconv; j++) + for (k = 0; k < ncv; k++) S_host[j * ldq + k] = Scalar(workl[iq + j * ldq + k], 0); } + // Under CpuBackend the gemm consumes S directly from the host buffer. + // Under a device backend we upload S into a backend-typed buffer first; + // the cuBLAS gemm requires all three operands to be device pointers. + // S is small (ldq*nconv <= ncv²), so the upload cost is negligible. + typename detail::buffer_traits::vector_type S_buf; + const Scalar* S_ptr; + if constexpr (std::is_same_v) { + S_ptr = S_host.data(); + } else { + S_buf.assign(static_cast(ldq) * nconv, Scalar{}); + detail::buffer_traits::copy_from_host(S_buf, S_host.data(), + static_cast(ldq) * nconv); + S_ptr = S_buf.data(); + } + + detail::Ops::gemm(bref, "N", "N", n, nconv, ncv, Scalar(1), vbuf, ldv, S_ptr, ldq, Scalar(0), z, ldz); + // Last-row weights for error bounds. for (j = 1; j <= ncv - 1; j++) workl[ihb + j - 1] = Real(0); workl[ihb + ncv - 1] = Real(1); @@ -941,18 +1021,37 @@ namespace arnoldi::detail { if (rvec && std::strcmp(type, "REGULR") != 0) { for (k = 0; k < nconv; k++) { - detail::Ops::raxpy(n, workl[iw + k], resid, 1, &z[k * ldz], 1); + detail::Ops::raxpy(bref, n, workl[iw + k], resid, 1, &z[k * ldz], 1); } } } // seupd without Comm (defaults to SerialComm). + template + void seupd(detail::BackendRef bref, bool rvec, const char* howmny, detail::real_t* d, Scalar* z, int ldz, + detail::real_t sigma, const char* bmat, int n, const char* which, int nev, detail::real_t tol, + Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, + int lworkl, int& info) { + seupd(bref, rvec, howmny, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, + workd, workl, lworkl, info, SerialComm{}); + } + + // seupd no-BackendRef overloads — defaults to CpuBackend for the historical + // public callback API (tests, examples). + template + void seupd(bool rvec, const char* howmny, detail::real_t* d, Scalar* z, int ldz, detail::real_t sigma, + const char* bmat, int n, const char* which, int nev, detail::real_t tol, Scalar* resid, int ncv, Scalar* v, + int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info, + const Comm& comm) { + seupd(detail::BackendRef{}, rvec, howmny, d, z, ldz, sigma, bmat, n, which, + nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info, comm); + } template void seupd(bool rvec, const char* howmny, detail::real_t* d, Scalar* z, int ldz, detail::real_t sigma, const char* bmat, int n, const char* which, int nev, detail::real_t tol, Scalar* resid, int ncv, Scalar* v, int ldv, int* iparam, int* ipntr, Scalar* workd, detail::real_t* workl, int lworkl, int& info) { - seupd(rvec, howmny, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, - lworkl, info, SerialComm{}); + seupd(detail::BackendRef{}, rvec, howmny, d, z, ldz, sigma, bmat, n, which, + nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info); } } // namespace arnoldi::detail diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 116f301..e9aee6f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,33 @@ +# Test suite — built on Catch2 v3 (fetched at configure time). + +include(FetchContent) +FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG v3.7.1 +) +FetchContent_MakeAvailable(Catch2) + +# Catch2 ships CMake helpers (catch_discover_tests) under extras/. +list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) +include(Catch) + foreach(_test IN ITEMS test_validation test_getv0 test_solver test_helpers test_modes test_low_level_validation) add_executable(${_test} ${_test}.cpp) - target_link_libraries(${_test} PRIVATE arnoldi) - add_test(NAME ${_test} COMMAND ${_test}) + target_link_libraries(${_test} PRIVATE arnoldi Catch2::Catch2WithMain) + catch_discover_tests(${_test}) endforeach() + +# CUDA-backend test. Mirrors test_solver but launches the matvec as a CUDA +# kernel and uses Arnoldi<..., CudaBackend>. Built only when ARNOLDI_USE_CUDA +# is ON; that option already enables CUDA::cudart and CUDA::cublas on the +# arnoldi interface target. +if(ARNOLDI_USE_CUDA) + enable_language(CUDA) + add_executable(test_solver_cuda test_solver_cuda.cu) + set_target_properties(test_solver_cuda PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) + target_link_libraries(test_solver_cuda PRIVATE arnoldi Catch2::Catch2WithMain) + catch_discover_tests(test_solver_cuda) +endif() diff --git a/tests/test_getv0.cpp b/tests/test_getv0.cpp index 775e2e9..8186475 100644 --- a/tests/test_getv0.cpp +++ b/tests/test_getv0.cpp @@ -131,14 +131,12 @@ static bool test_debug_output() { return true; } -int main() { - std::printf("test_getv0:\n"); - bool ok = true; - ok &= test_basic(); - ok &= test_generalized_retry(); - ok &= test_orthogonalization(); - ok &= test_orthogonalization_generalized(); - ok &= test_debug_output(); - std::printf("%s\n", ok ? "PASS" : "FAIL"); - return ok ? 0 : 1; -} +#include + +// The helpers above return bool and print their own progress; wrap each as +// a Catch2 test case so they're individually discovered and reported. +TEST_CASE("getv0 basic", "[getv0]") { CHECK(test_basic()); } +TEST_CASE("getv0 generalized retry", "[getv0]") { CHECK(test_generalized_retry()); } +TEST_CASE("getv0 orthogonalization", "[getv0]") { CHECK(test_orthogonalization()); } +TEST_CASE("getv0 orthogonalization generalized", "[getv0]") { CHECK(test_orthogonalization_generalized()); } +TEST_CASE("getv0 debug output", "[getv0]") { CHECK(test_debug_output()); } diff --git a/tests/test_helpers.cpp b/tests/test_helpers.cpp index 37667f1..459349a 100644 --- a/tests/test_helpers.cpp +++ b/tests/test_helpers.cpp @@ -17,17 +17,14 @@ #include #include -static int g_pass = 0, g_fail = 0; - -static void check(const char* name, bool cond) { - if (cond) { - std::printf(" OK %s\n", name); - ++g_pass; - } else { - std::printf(" FAIL %s\n", name); - ++g_fail; - } -} +#include + +// Bridge legacy check("desc", cond) calls onto Catch2 assertions. +#define check(msg, cond) \ + do { \ + INFO(msg); \ + CHECK((cond)); \ + } while (0) template static bool nearly(Real a, Real b, Real tol = Real(1e-10)) { return std::abs(a - b) <= tol; } @@ -35,7 +32,7 @@ static bool nearly(Real a, Real b, Real tol = Real(1e-10)) { return std::abs(a - // In ARPACK the "wanted" values end up at the end of the sorted array. So // sortc/sortr with "LM" arranges by ascending magnitude; "SM" by descending // magnitude, etc. -static void test_sortc_all() { +TEST_CASE("test_sortc_all", "[helpers]") { using arnoldi::detail::sortc; auto run = [&](const char* which, std::vector xr, std::vector xi, @@ -53,7 +50,7 @@ static void test_sortc_all() { run("SI", {0, 0, 0}, {1, 3, 2}, {1, 2, 3}, {0, 0, 0}, {3, 2, 1}); } -static void test_sortr_all() { +TEST_CASE("test_sortr_all", "[helpers]") { using arnoldi::detail::sortr; auto run = [&](const char* which, std::vector a, std::vector b, std::vector exp_a) { sortr(which, true, (int)a.size(), a.data(), b.data()); @@ -67,7 +64,7 @@ static void test_sortr_all() { // Sort a 3x3 matrix by columns based on a key vector x; verify both the key // reordering and that columns of `a` followed (apply=true). -static void test_sesrt_all() { +TEST_CASE("test_sesrt_all", "[helpers]") { using arnoldi::detail::sesrt; const int n = 3, lda = 3; auto run = [&](const char* which, std::vector x, std::vector exp_x) { @@ -85,7 +82,7 @@ static void test_sesrt_all() { // Verify ngets walks each branch of the which-translation switch and the // "split conjugate pair" guard that pulls the boundary index back by one. -static void test_ngets_branches() { +TEST_CASE("test_ngets_branches", "[helpers]") { using arnoldi::detail::ngets; int kev = 2, np = 2; std::vector rr = {1, 2, 3, 4}, ri = {0, 0, 0, 0}, bnd = {.1, .2, .3, .4}; @@ -108,7 +105,7 @@ static void test_ngets_branches() { // sgets has a BE special case that swaps two halves of the spectrum to keep // "both ends" balanced. Exercise both BE and a regular which. -static void test_sgets_branches() { +TEST_CASE("test_sgets_branches", "[helpers]") { using arnoldi::detail::sgets; std::vector ritz = {1, 2, 3, 4, 5, 6}, bounds = {.1, .2, .3, .4, .5, .6}; std::vector shifts(3); @@ -122,7 +119,7 @@ static void test_sgets_branches() { // Convergence check returns the number of Ritz values whose error bound is // below tol*max(eps23, |ritz|). -static void test_conv_check() { +TEST_CASE("test_conv_check", "[helpers]") { using arnoldi::detail::nconv; using arnoldi::detail::sconv; std::vector ritzr = {1, 2, 3}, ritzi = {0, 0, 0}, bounds = {1e-15, 1e-15, 1e-1}; @@ -137,7 +134,7 @@ static void test_conv_check() { // stqrb is the tridiagonal QR used by seigt; cover n=0, n=1 and a small // generic case so the LAPACK glue and label-90 pivot loop both run. -static void test_stqrb() { +TEST_CASE("test_stqrb", "[helpers]") { using arnoldi::detail::stqrb; int info = 0; std::vector z; @@ -160,7 +157,7 @@ static void test_stqrb() { // Exercise the seigt entry point used by sym Lanczos extraction; also covers // the n>1 sub-diagonal debug print branch when msglvl>0. -static void test_seigt_neigh() { +TEST_CASE("test_seigt_neigh", "[helpers]") { using arnoldi::detail::neigh; using arnoldi::detail::seigt; @@ -195,7 +192,7 @@ static void test_seigt_neigh() { // Hit each print routine in detail::debug for both real and complex element // types so the four print_elem overloads are all instantiated. Output is // noisy but small; we just want the lines exercised for coverage. -static void test_debug_prints() { +TEST_CASE("test_debug_prints", "[helpers]") { using namespace arnoldi::detail; int iv[3] = {1, 2, 3}; @@ -217,7 +214,7 @@ static void test_debug_prints() { // Exercise the Real/Complex Ops dispatches that the standard double symmetric // solver path doesn't reach: rscal/raxpy on a complex vector, dotc, gemv_rv. -static void test_ops_complex_dispatch() { +TEST_CASE("test_ops_complex_dispatch", "[helpers]") { using cplx = std::complex; using O = arnoldi::detail::Ops; @@ -257,7 +254,7 @@ static void test_ops_complex_dispatch() { // pdot/prdotc/pnrm2_real/pnrm2 reduction wrappers: ensure they call through // SerialComm without modifying anything beyond the BLAS forwarding. -static void test_parallel_reductions() { +TEST_CASE("test_parallel_reductions", "[helpers]") { using arnoldi::detail::pdot; using arnoldi::detail::pnrm2; using arnoldi::detail::pnrm2_real; @@ -273,21 +270,3 @@ static void test_parallel_reductions() { check("pnrm2 serial complex", nearly(pnrm2(comm, 2, z.data(), 1), 5.0)); check("prdotc serial complex", nearly(prdotc(comm, 2, z.data(), 1, z.data(), 1), 25.0)); } - -int main() { - std::printf("test_helpers:\n"); - setvbuf(stdout, nullptr, _IOLBF, 0); - test_sortc_all(); - test_sortr_all(); - test_sesrt_all(); - test_ngets_branches(); - test_sgets_branches(); - test_conv_check(); - test_stqrb(); - test_seigt_neigh(); - test_debug_prints(); - test_ops_complex_dispatch(); - test_parallel_reductions(); - std::printf("\n%d passed, %d failed\n", g_pass, g_fail); - return g_fail > 0 ? 1 : 0; -} diff --git a/tests/test_low_level_validation.cpp b/tests/test_low_level_validation.cpp index ce9e1e6..c7ca4ce 100644 --- a/tests/test_low_level_validation.cpp +++ b/tests/test_low_level_validation.cpp @@ -13,16 +13,14 @@ #include #include -static int g_pass = 0, g_fail = 0; +#include +// Catch2-backed check(): the optional got/want ints are surfaced as scoped +// INFO context (only printed on failure). Valid inside helper functions +// because Catch2 records the assertion against the running TEST_CASE. static void check(const char* name, bool cond, int got = 0, int want = 0) { - if (cond) { - std::printf(" OK %s\n", name); - ++g_pass; - } else { - std::printf(" FAIL %s (got info=%d, want=%d)\n", name, got, want); - ++g_fail; - } + INFO(name << " (got info=" << got << ", want=" << want << ")"); + CHECK(cond); } // Convenience holder for a "valid" workspace; individual fields are then @@ -89,7 +87,7 @@ struct NaupdHarness { } }; -static void test_saupd_errors() { +TEST_CASE("test_saupd_errors", "[lowlevel]") { { SaupdHarness h; h.n = 0; check("saupd n<=0 -> -1", h.run() == -1, h.info, -1); @@ -147,7 +145,7 @@ static void test_saupd_errors() { } } -static void test_naupd_errors() { +TEST_CASE("test_naupd_errors", "[lowlevel]") { { NaupdHarness h; h.n = 0; check("naupd n<=0 -> -1", h.run() == -1, h.info, -1); @@ -206,7 +204,7 @@ static void test_naupd_errors() { // seupd validates its arguments separately; drive several of those branches. // seupd assumes nconv>0 (read from iparam[4]); pre-set it so other checks fire. -static void test_seupd_errors() { +TEST_CASE("test_seupd_errors", "[lowlevel]") { SaupdHarness base; base.iparam[4] = 1; // nconv @@ -294,7 +292,7 @@ static void test_seupd_errors() { } } -static void test_neupd_errors() { +TEST_CASE("test_neupd_errors", "[lowlevel]") { NaupdHarness base; auto run_neupd = [&](const char* bmat, const char* which, int n, int nev, int ncv, @@ -388,7 +386,7 @@ static void test_neupd_errors() { // Drive a real solve all the way through saupd, then call seupd directly with // z aliasing v. Forces the seupd `if (z == v) tmpbuf` allocation branch // which the high-level Arnoldi class never triggers (it owns separate buffers). -static void test_seupd_aliased_z_v() { +TEST_CASE("test_seupd_aliased_z_v", "[lowlevel]") { const int n = 16, nev = 3, ncv = 8; int lworkl = ncv * (ncv + 8); double tol = 0.0; @@ -427,7 +425,7 @@ static void test_seupd_aliased_z_v() { // Call the no-bop, no-comm 3-arg saupd/naupd helper overloads directly. // The high-level Arnoldi class always passes a bop lambda so these helper // overloads are unreachable through the public API. -static void test_no_bop_overloads() { +TEST_CASE("test_no_bop_overloads", "[lowlevel]") { { const int n = 16, nev = 2, ncv = 8; int lworkl = ncv * (ncv + 8); @@ -471,7 +469,7 @@ static void test_no_bop_overloads() { } // User-supplied initial residual: covers info_in_=1 path through naupd. -static void test_user_initial_residual_nonsym() { +TEST_CASE("test_user_initial_residual_nonsym", "[lowlevel]") { const int n = 32, nev = 3, ncv = 12; std::vector resid(n, 1.0); @@ -484,16 +482,3 @@ static void test_user_initial_residual_nonsym() { }); check("Nonsym user initial resid: solve ok", s.info() >= 0); } - -int main() { - std::printf("test_low_level_validation:\n"); - test_saupd_errors(); - test_naupd_errors(); - test_seupd_errors(); - test_neupd_errors(); - test_user_initial_residual_nonsym(); - test_seupd_aliased_z_v(); - test_no_bop_overloads(); - std::printf("\n%d passed, %d failed\n", g_pass, g_fail); - return g_fail > 0 ? 1 : 0; -} diff --git a/tests/test_modes.cpp b/tests/test_modes.cpp index f51edae..426c972 100644 --- a/tests/test_modes.cpp +++ b/tests/test_modes.cpp @@ -21,17 +21,14 @@ void dgttrs_(const char* trans, const int* n, const int* nrhs, const double* dl, const double* du2, const int* ipiv, double* b, const int* ldb, int* info); } -static int g_pass = 0, g_fail = 0; - -static void check(const char* name, bool cond) { - if (cond) { - std::printf(" OK %s\n", name); - ++g_pass; - } else { - std::printf(" FAIL %s\n", name); - ++g_fail; - } -} +#include + +// Bridge legacy check("desc", cond) calls onto Catch2 assertions. +#define check(msg, cond) \ + do { \ + INFO(msg); \ + CHECK((cond)); \ + } while (0) // --------------------------------------------------------------------------- // Analytic spectra of the test problems. @@ -177,7 +174,7 @@ static void mv_mass(int n, const double* v, double* w) { // Mode 3: A*x = lambda*M*x via OP = inv(A - sigma*M)*M, B = M. // Shift-invert with σ=0 converges to the eigenvalues of (A,M) closest to 0, // i.e. the smallest eigenvalues of the FE Laplacian. -static void test_sym_mode3_shift_invert() { +TEST_CASE("test_sym_mode3_shift_invert", "[modes]") { const int n = 100, nev = 4, ncv = 10; const double sigma = 0.0; ShiftInvertFE shift(n, sigma); @@ -210,7 +207,7 @@ static void test_sym_mode3_shift_invert() { // Mode 4 (buckling): K*x = lambda*KG*x via OP = inv(K - sigma*KG)*K, B = K. // In our setup K = FE stiffness, KG = FE mass, so the underlying spectrum is // still the FE Laplacian spectrum and the converged values are nearest σ. -static void test_sym_mode4_buckling() { +TEST_CASE("test_sym_mode4_buckling", "[modes]") { const int n = 100, nev = 4, ncv = 10; const double sigma = 1.0; ShiftInvertFE shift(n, sigma); @@ -240,7 +237,7 @@ static void test_sym_mode4_buckling() { // Mode 5 (Cayley): A*x = lambda*M*x via OP = inv(A - sigma*M)*(A + sigma*M). // μ = (λ+σ)/(λ−σ); LM Ritz values give λ closest to σ. -static void test_sym_mode5_cayley() { +TEST_CASE("test_sym_mode5_cayley", "[modes]") { const int n = 100, nev = 4, ncv = 20; const double sigma = 150.0; ShiftInvertFE shift(n, sigma); @@ -272,7 +269,7 @@ static void test_sym_mode5_cayley() { // with A*x because saitr later reads it back when computing the M-norm // (wnorm² = (M^-1 A x)·x where x is required to be A*x at that point). // A = standard tridiagonal Laplacian, M = 2*I, so eigenvalues are λ(A)/2. -static void test_sym_mode2_generalized() { +TEST_CASE("test_sym_mode2_generalized", "[modes]") { const int n = 64, nev = 3, ncv = 10; arnoldi::Arnoldi s("G", n, "LM", nev, ncv); @@ -304,7 +301,7 @@ static void test_sym_mode2_generalized() { // Nonsymmetric shift-invert (mode 3, real shift). The underlying problem is // still symmetric (FE Laplacian generalized), so all eigenvalues are real and // nev nearest σ should appear in r.values_re with values_im ≈ 0. -static void test_nonsym_mode3_real_shift() { +TEST_CASE("test_nonsym_mode3_real_shift", "[modes]") { const int n = 100, nev = 4, ncv = 20; const double sigma = 1.0; ShiftInvertFE shift(n, sigma); @@ -341,7 +338,7 @@ static void test_nonsym_mode3_real_shift() { // Sym which="LM" with rvec=false hits the REGULR branch of seupd that copies // ritz into d, distinct from the rvec=true sesrt path. -static void test_sym_regular_rvec_false_LM() { +TEST_CASE("test_sym_regular_rvec_false_LM", "[modes]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "LM", nev, ncv); s.tol(1e-10).maxiter(500); @@ -361,7 +358,7 @@ static void test_sym_regular_rvec_false_LM() { // Nonsym with which="SM" exercises the SM mapping in ngets and the regular // extraction path. Underlying matrix is symmetric so eigenvalues are real. -static void test_nonsym_which_SM() { +TEST_CASE("test_nonsym_which_SM", "[modes]") { const int n = 32, nev = 3, ncv = 12; arnoldi::Arnoldi s("I", n, "SM", nev, ncv); s.tol(1e-10).maxiter(500); @@ -382,7 +379,7 @@ static void test_nonsym_which_SM() { // Herm with which="LM" + rvec=false; covers the saupd regular path // and the rvec=false copy-Ritz path in seupd. -static void test_herm_LM_LA() { +TEST_CASE("test_herm_LM_LA", "[modes]") { using cplx = std::complex; const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "LM", nev, ncv); @@ -428,7 +425,7 @@ struct DebugAllOn { // *aupd, sapps/napps, sgets/ngets, *eupd, and getv0 by running once with // every debug knob turned all the way up. Output is voluminous but only // printed during this single test. -static void test_debug_branches() { +TEST_CASE("test_debug_branches", "[modes]") { DebugAllOn guard; { const int n = 10, nev = 2, ncv = 6; @@ -482,7 +479,7 @@ static void test_debug_branches() { // running both Sym + which="BE" (BE branch swaps with sswap_) and // Nonsym. Standard tridiagonal Laplacian, small problem size. // Tolerances are looser than the double tests because of single precision. -static void test_float_overloads() { +TEST_CASE("test_float_overloads", "[modes]") { { const int n = 16, nev = 4, ncv = 12; arnoldi::Arnoldi s("I", n, "BE", nev, ncv); @@ -521,7 +518,7 @@ static void test_float_overloads() { // naup2 runs many times. Each pass through the loop hits the bmat=='G' // post-napps recompute that the more relaxed shift-invert tests skip // because they converge in a single iter. -static void test_nonsym_generalized_many_iters() { +TEST_CASE("test_nonsym_generalized_many_iters", "[modes]") { const int n = 60, nev = 6, ncv = 10; const double sigma = 0.5; ShiftInvertFE shift(n, sigma); @@ -546,7 +543,7 @@ static void test_nonsym_generalized_many_iters() { // dimensionless tridiagonal A = [-1, 2, -1] and mass M = [1, 4, 1] / 6; // the analytic spectrum is the FE Laplacian's with h ≡ 1 (no scaling): // λ_k = 6 (1 - cos(kπ/(n+1))) / (2 + cos(kπ/(n+1))). -static void test_sym_generalized_many_iters() { +TEST_CASE("test_sym_generalized_many_iters", "[modes]") { const int n = 60, nev = 4, ncv = 8; arnoldi::Arnoldi s("G", n, "LM", nev, ncv); s.tol(1e-12).maxiter(1000).mode(2); @@ -598,7 +595,7 @@ static void test_sym_generalized_many_iters() { // Force the REALPT (mode 3 with sigmai != 0) and IMAGPT (mode 4) branches in // neupd's type-selection logic. -static void test_nonsym_neupd_realpt_imagpt() { +TEST_CASE("test_nonsym_neupd_realpt_imagpt", "[modes]") { const int n = 16, nev = 2, ncv = 8; auto av = [n](const double* x, double* y) { y[0] = x[1]; @@ -629,7 +626,7 @@ static void test_nonsym_neupd_realpt_imagpt() { // Spectrum is ±2i cos(kπ/(n+1)) so neupd's conjugate-pair post-processing // branches (REGULR & SHIFTI lapy2/scal blocks) get exercised. Real parts must // be ≈ 0 and imaginary magnitudes must match the exact closed form. -static void test_nonsym_complex_eigenvalues() { +TEST_CASE("test_nonsym_complex_eigenvalues", "[modes]") { const int n = 16, nev = 4, ncv = 12; auto av = [n](const double* x, double* y) { y[0] = x[1]; @@ -668,23 +665,3 @@ static void test_nonsym_complex_eigenvalues() { check("Nonsym complex LI rvec=false: matches exact imag pairs", err < 1e-9); } } - -int main() { - std::printf("test_modes:\n"); - test_sym_mode2_generalized(); - test_sym_mode3_shift_invert(); - test_sym_mode4_buckling(); - test_sym_mode5_cayley(); - test_nonsym_mode3_real_shift(); - test_sym_regular_rvec_false_LM(); - test_nonsym_which_SM(); - test_herm_LM_LA(); - test_debug_branches(); - test_float_overloads(); - test_nonsym_complex_eigenvalues(); - test_nonsym_neupd_realpt_imagpt(); - test_nonsym_generalized_many_iters(); - test_sym_generalized_many_iters(); - std::printf("\n%d passed, %d failed\n", g_pass, g_fail); - return g_fail > 0 ? 1 : 0; -} diff --git a/tests/test_solver.cpp b/tests/test_solver.cpp index 70c8e55..ed8e63b 100644 --- a/tests/test_solver.cpp +++ b/tests/test_solver.cpp @@ -5,24 +5,19 @@ #include #include +#include #include #include -#include #include #include - -static int g_pass = 0, g_fail = 0; - -static void check(const char* name, bool cond) { - if (cond) { - std::printf(" OK %s\n", name); - ++g_pass; - } else { - std::printf(" FAIL %s\n", name); - ++g_fail; - } -} +// Bridge the legacy check("desc", cond) calls onto Catch2: the description +// becomes scoped INFO context (shown on failure) and the condition a CHECK. +#define check(msg, cond) \ + do { \ + INFO(msg); \ + CHECK((cond)); \ + } while (0) // 1D Laplacian: A(i,i)=2, A(i,i±1)=-1, eigenvalues = 2-2*cos(k*pi/(n+1)). static void av_laplacian(int n, const double* x, double* y) { @@ -36,7 +31,7 @@ static double exact_eig_laplacian(int n, int k) { return 2.0 - 2.0 * std::cos(k * M_PI / (n + 1)); } -void test_sym_double() { +TEST_CASE("test_sym_double", "[solver]") { const int n = 64, nev = 4, ncv = 12; arnoldi::Arnoldi s("I", n, "SM", nev, ncv); s.tol(1e-12).maxiter(500); @@ -60,7 +55,7 @@ void test_sym_double() { check("Sym: eigenvalue accuracy < 1e-10", maxerr < 1e-10); } -void test_sym_float() { +TEST_CASE("test_sym_float", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "SM", nev, ncv); s.tol(1e-5f).maxiter(500); @@ -81,7 +76,7 @@ void test_sym_float() { check("Sym: eigenvalue accuracy < 1e-4", maxerr < 1e-4f); } -void test_nonsym_double() { +TEST_CASE("test_nonsym_double", "[solver]") { const int n = 64, nev = 4, ncv = 14; const double rho = 10.0; const double h = 1.0 / (n + 1); @@ -115,7 +110,7 @@ void test_nonsym_double() { check("Nonsym: LM eigenvalues have large magnitude", min_mag > 100.0); } -void test_herm_complex_double() { +TEST_CASE("test_herm_complex_double", "[solver]") { using cplx = std::complex; const int n = 64, nev = 4, ncv = 12; const cplx off(-1.0, 0.25); @@ -138,7 +133,7 @@ void test_herm_complex_double() { check("Herm: eigenvalues are real-valued (small)", std::isfinite(r.values[i])); } -void test_user_initial_resid() { +TEST_CASE("test_user_initial_resid", "[solver]") { const int n = 32, nev = 3, ncv = 10; std::vector resid(n, 1.0); @@ -153,7 +148,7 @@ void test_user_initial_resid() { check("user_resid: eigenvalue accurate", err < 1e-10); } -void test_values_only() { +TEST_CASE("test_values_only", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "SM", nev, ncv); s.tol(1e-12).maxiter(500); @@ -167,7 +162,7 @@ void test_values_only() { check("values_only: eigenvalue is small", r.values[0] < 0.1); } -void test_nonsym_values_only() { +TEST_CASE("test_nonsym_values_only", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "LM", nev, ncv); s.tol(1e-12).maxiter(500); @@ -179,7 +174,7 @@ void test_nonsym_values_only() { check("nonsym_values_only: values_re non-zero", std::abs(r.values_re[0]) > 0.01); } -void test_sym_which_LM() { +TEST_CASE("test_sym_which_LM", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "LM", nev, ncv); s.tol(1e-12).maxiter(500); @@ -190,7 +185,7 @@ void test_sym_which_LM() { check("Sym which=LM: largest eig > 3.5", r.values[0] > 3.5); } -void test_sym_which_LA() { +TEST_CASE("test_sym_which_LA", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "LA", nev, ncv); s.tol(1e-12).maxiter(500); @@ -201,7 +196,7 @@ void test_sym_which_LA() { check("Sym which=LA: largest algebraic > 3.5", r.values[nev - 1] > 3.5); } -void test_sym_which_SA() { +TEST_CASE("test_sym_which_SA", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("I", n, "SA", nev, ncv); s.tol(1e-12).maxiter(500); @@ -213,7 +208,7 @@ void test_sym_which_SA() { check("Sym which=SA: all eigenvalues small", maxv < 0.15); } -void test_sym_which_BE() { +TEST_CASE("test_sym_which_BE", "[solver]") { const int n = 32, nev = 4, ncv = 12; arnoldi::Arnoldi s("I", n, "BE", nev, ncv); s.tol(1e-12).maxiter(500); @@ -228,7 +223,7 @@ void test_sym_which_BE() { // A = Laplacian, B = 2*I. Eigenvalues of A*x = lambda*B*x are half the // standard eigenvalues, so OP = inv(B)*A = A/2 in mode 2. -void test_sym_generalized() { +TEST_CASE("test_sym_generalized", "[solver]") { const int n = 32, nev = 3, ncv = 10; arnoldi::Arnoldi s("G", n, "LM", nev, ncv); @@ -250,7 +245,7 @@ void test_sym_generalized() { check("Sym generalized: largest eig > 1.5", r.values[0] > 1.5); } -void test_insufficient_maxiter() { +TEST_CASE("test_insufficient_maxiter", "[solver]") { const int n = 64, nev = 4, ncv = 12; arnoldi::Arnoldi s("I", n, "SM", nev, ncv); s.tol(1e-14).maxiter(1); @@ -261,7 +256,7 @@ void test_insufficient_maxiter() { s.info() != 0 || s.num_converged() < nev); } -void test_accessors() { +TEST_CASE("test_accessors", "[solver]") { const int n = 16, nev = 2, ncv = 6; arnoldi::Arnoldi s("I", n, "SM", nev, ncv); s.tol(1e-10).maxiter(300); @@ -272,25 +267,3 @@ void test_accessors() { check("accessors: iparam() != nullptr", s.iparam() != nullptr); check("accessors: ipntr() != nullptr", s.ipntr() != nullptr); } - -int main() { - std::printf("test_solver:\n"); - - test_sym_double(); - test_sym_float(); - test_nonsym_double(); - test_herm_complex_double(); - test_user_initial_resid(); - test_values_only(); - test_nonsym_values_only(); - test_sym_which_LM(); - test_sym_which_LA(); - test_sym_which_SA(); - test_sym_which_BE(); - test_sym_generalized(); - test_insufficient_maxiter(); - test_accessors(); - - std::printf("\n%d passed, %d failed\n", g_pass, g_fail); - return g_fail > 0 ? 1 : 0; -} diff --git a/tests/test_solver_cuda.cu b/tests/test_solver_cuda.cu new file mode 100644 index 0000000..b7d4c2d --- /dev/null +++ b/tests/test_solver_cuda.cu @@ -0,0 +1,282 @@ +// CUDA-backend tests for arnoldi::Arnoldi. +// +// Mirror of tests/test_solver.cpp for the Sym / Herm path, but with the +// matvec implemented as a CUDA kernel receiving device pointers. The user +// matvec callback receives DEVICE pointers (per the cuda.hpp contract); +// solver workspace lives in device_vector under CudaBackend. +// +// Cases: +// - Sym Laplacian (n=128): eigenvalues match the exact analytical formula +// 2 - 2 cos(k pi / (n+1)). +// - Herm complex tridiagonal (n=128): eigenvalues real, residual +// ||A v - lambda v|| / ||A|| below tolerance. +// - which="LM": largest-magnitude eigenvalue is large. +// - Eigenvector residual: ||A v - lambda v|| is small. +// - initial_resid_device: solve seeded from a device-resident vector. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +// Bridge legacy check("desc", cond) calls onto Catch2 assertions. +#define check(msg, cond) \ + do { \ + INFO(msg); \ + CHECK((cond)); \ + } while (0) + +// Skip a CUDA test case gracefully when no device is present. +static bool cuda_device_present() { + int c = 0; + return cudaGetDeviceCount(&c) == cudaSuccess && c > 0; +} +#define REQUIRE_CUDA() \ + do { \ + if (!cuda_device_present()) \ + SKIP("no CUDA device available"); \ + } while (0) + +static double exact_eig_laplacian(int n, int k) { + return 2.0 - 2.0 * std::cos(k * M_PI / (n + 1)); +} + +// -------- Kernels ------------------------------------------------------------ +// 1D Dirichlet Laplacian: y = A x with A(i,i)=2, A(i,i±1)=-1. +__global__ void d_av_laplacian(int n, const double* x, double* y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + double left = (i == 0) ? 0.0 : x[i - 1]; + double right = (i == n - 1) ? 0.0 : x[i + 1]; + y[i] = 2.0 * x[i] - left - right; +} + +// Hermitian complex tridiagonal: +// H(i,i) = 2 +// H(i,i+1) = off (and H(i+1,i) = conj(off)) +// with off = -1 + 0.25 i. The eigenvalues are real because H is Hermitian. +__global__ void d_av_hermitian(int n, const cuDoubleComplex* x, cuDoubleComplex* y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + const cuDoubleComplex off = make_cuDoubleComplex(-1.0, 0.25); + const cuDoubleComplex off_conj = make_cuDoubleComplex(-1.0, -0.25); + cuDoubleComplex left = (i == 0) ? make_cuDoubleComplex(0.0, 0.0) : cuCmul(off_conj, x[i - 1]); + cuDoubleComplex right = (i == n - 1) ? make_cuDoubleComplex(0.0, 0.0) : cuCmul(off, x[i + 1]); + cuDoubleComplex diag = make_cuDoubleComplex(2.0 * cuCreal(x[i]), 2.0 * cuCimag(x[i])); + y[i] = cuCadd(cuCadd(left, diag), right); +} + +static int blocks_for(int n, int threads = 256) { return (n + threads - 1) / threads; } + +// -------- Tests -------------------------------------------------------------- + +TEST_CASE("test_sym_cuda_laplacian", "[cuda]") { + REQUIRE_CUDA(); + const int n = 128, nev = 4, ncv = 12; + arnoldi::Arnoldi s("I", n, "SM", nev, ncv); + s.tol(1e-12).maxiter(500); + cudaStream_t stream = s.backend().stream(); + + s.solve([&](const double* x, double* y) { + d_av_laplacian<<>>(n, x, y); + }); + + check("Sym: converged", s.converged()); + check("Sym: nconv >= nev", s.num_converged() >= nev); + check("Sym: info == 0", s.info() == 0); + + auto r = s.eigenpairs(false); + // seupd does not guarantee a particular order within the converged set; + // compare as sorted sets. The nev smallest exact Laplacian eigenvalues + // are exact_eig_laplacian(n, 1..nev). + std::vector got(r.values.begin(), r.values.end()); + std::sort(got.begin(), got.end()); + std::vector want(nev); + for (int k = 0; k < nev; ++k) want[k] = exact_eig_laplacian(n, k + 1); + std::sort(want.begin(), want.end()); + double max_err = 0.0; + for (int k = 0; k < nev; ++k) max_err = std::max(max_err, std::abs(got[k] - want[k])); + check("Sym: eigenvalues match exact to 1e-10", max_err < 1e-10); +} + +TEST_CASE("test_sym_cuda_which_LM", "[cuda]") { + REQUIRE_CUDA(); + const int n = 64, nev = 3, ncv = 10; + arnoldi::Arnoldi s("I", n, "LM", nev, ncv); + s.tol(1e-12).maxiter(500); + cudaStream_t stream = s.backend().stream(); + + s.solve([&](const double* x, double* y) { + d_av_laplacian<<>>(n, x, y); + }); + + check("Sym which=LM: converged", s.converged()); + auto r = s.eigenpairs(false); + check("Sym which=LM: largest eig > 3.5", r.values[0] > 3.5); +} + +TEST_CASE("test_sym_cuda_eigenvectors", "[cuda]") { + REQUIRE_CUDA(); + const int n = 64, nev = 3, ncv = 10; + arnoldi::Arnoldi s("I", n, "SM", nev, ncv); + s.tol(1e-12).maxiter(500); + cudaStream_t stream = s.backend().stream(); + + s.solve([&](const double* x, double* y) { + d_av_laplacian<<>>(n, x, y); + }); + + auto r = s.eigenpairs(true); + check("Sym evecs: vectors.size() == n*nev", (int)r.vectors.size() == n * nev); + + // Residual check: for each (lambda, v), compute ||A v - lambda v|| / ||v|| + // on the host (host-side A applied to the host-resident eigenvectors). + double max_res = 0.0; + std::vector Av(n); + for (int k = 0; k < nev; ++k) { + const double* vk = &r.vectors[k * n]; + Av[0] = 2.0 * vk[0] - vk[1]; + for (int i = 1; i < n - 1; ++i) Av[i] = -vk[i - 1] + 2.0 * vk[i] - vk[i + 1]; + Av[n - 1] = -vk[n - 2] + 2.0 * vk[n - 1]; + + double rnorm = 0.0, vnorm = 0.0; + for (int i = 0; i < n; ++i) { + double d = Av[i] - r.values[k] * vk[i]; + rnorm += d * d; + vnorm += vk[i] * vk[i]; + } + rnorm = std::sqrt(rnorm / vnorm); + if (rnorm > max_res) max_res = rnorm; + } + check("Sym evecs: max residual < 1e-10", max_res < 1e-10); +} + +TEST_CASE("test_herm_cuda", "[cuda]") { + REQUIRE_CUDA(); + using cplx = std::complex; + const int n = 128, nev = 4, ncv = 12; + arnoldi::Arnoldi s("I", n, "SM", nev, ncv); + s.tol(1e-12).maxiter(500); + cudaStream_t stream = s.backend().stream(); + + s.solve([&](const cplx* x, cplx* y) { + d_av_hermitian<<>>( + n, + reinterpret_cast(x), + reinterpret_cast(y)); + }); + + check("Herm: converged", s.converged()); + check("Herm: nconv >= nev", s.num_converged() >= nev); + + auto r = s.eigenpairs(true); + check("Herm: vectors.size() == n*nev", (int)r.vectors.size() == n * nev); + + // Hermitian eigenvalues must be real-valued (the solver returns them + // as Real, so this is automatic; just verify finiteness). + for (int k = 0; k < nev; ++k) + check("Herm: eigenvalue finite", std::isfinite(r.values[k])); + + // Residual: ||H v - lambda v|| / ||v|| on the host. + const cplx off(-1.0, 0.25); + std::vector Hv(n); + double max_res = 0.0; + for (int k = 0; k < nev; ++k) { + const cplx* vk = &r.vectors[k * n]; + Hv[0] = 2.0 * vk[0] + off * vk[1]; + for (int i = 1; i < n - 1; ++i) + Hv[i] = std::conj(off) * vk[i - 1] + 2.0 * vk[i] + off * vk[i + 1]; + Hv[n - 1] = std::conj(off) * vk[n - 2] + 2.0 * vk[n - 1]; + + double rnorm = 0.0, vnorm = 0.0; + for (int i = 0; i < n; ++i) { + cplx d = Hv[i] - r.values[k] * vk[i]; + rnorm += std::norm(d); + vnorm += std::norm(vk[i]); + } + rnorm = std::sqrt(rnorm / vnorm); + if (rnorm > max_res) max_res = rnorm; + } + check("Herm: max residual < 1e-9", max_res < 1e-9); +} + +TEST_CASE("test_initial_resid_device", "[cuda]") { + REQUIRE_CUDA(); + const int n = 64, nev = 3, ncv = 10; + + // Allocate a device buffer of size n, seed with 1.0 from the host. + double* d_resid = nullptr; + cudaMalloc(&d_resid, n * sizeof(double)); + std::vector h_resid(n, 1.0); + cudaMemcpy(d_resid, h_resid.data(), n * sizeof(double), cudaMemcpyHostToDevice); + + arnoldi::Arnoldi s("I", n, "SM", nev, ncv); + s.tol(1e-12).maxiter(500); + s.initial_resid_device(d_resid); // device pointer overload + cudaStream_t stream = s.backend().stream(); + + s.solve([&](const double* x, double* y) { + d_av_laplacian<<>>(n, x, y); + }); + + check("initial_resid_device: converged", s.converged()); + auto r = s.eigenpairs(false); + // Smallest converged eigenvalue (order within the set is unspecified). + double got_min = *std::min_element(r.values.begin(), r.values.end()); + double err = std::abs(got_min - exact_eig_laplacian(n, 1)); + check("initial_resid_device: smallest eigenvalue accurate", err < 1e-10); + + cudaFree(d_resid); +} + +TEST_CASE("test_eigenpairs_device", "[cuda]") { + REQUIRE_CUDA(); + const int n = 64, nev = 3, ncv = 10; + arnoldi::Arnoldi s("I", n, "SM", nev, ncv); + s.tol(1e-12).maxiter(500); + cudaStream_t stream = s.backend().stream(); + + s.solve([&](const double* x, double* y) { + d_av_laplacian<<>>(n, x, y); + }); + check("eigenpairs_device: converged", s.converged()); + + // Eigenvectors stay on the device; eigenvalues come back on the host. + auto dr = s.eigenpairs_device(true); + check("eigenpairs_device: values.size() == nev", (int)dr.values.size() == nev); + + // Pull the device eigenvectors down ourselves and residual-check. + std::vector hv(n * nev); + cudaMemcpy(hv.data(), dr.vectors.data(), n * nev * sizeof(double), cudaMemcpyDeviceToHost); + + double max_res = 0.0; + std::vector Av(n); + for (int k = 0; k < nev; ++k) { + const double* vk = &hv[k * n]; + Av[0] = 2.0 * vk[0] - vk[1]; + for (int i = 1; i < n - 1; ++i) Av[i] = -vk[i - 1] + 2.0 * vk[i] - vk[i + 1]; + Av[n - 1] = -vk[n - 2] + 2.0 * vk[n - 1]; + double rn = 0.0, vn = 0.0; + for (int i = 0; i < n; ++i) { + double d = Av[i] - dr.values[k] * vk[i]; + rn += d * d; + vn += vk[i] * vk[i]; + } + max_res = std::max(max_res, std::sqrt(rn / vn)); + } + check("eigenpairs_device: device-evec residual < 1e-10", max_res < 1e-10); +} diff --git a/tests/test_validation.cpp b/tests/test_validation.cpp index bea7541..6032c8b 100644 --- a/tests/test_validation.cpp +++ b/tests/test_validation.cpp @@ -7,139 +7,112 @@ #include #include -static int g_pass = 0, g_fail = 0; +#include -template +// Catch2-backed expectation helpers. Signatures unchanged so the existing +// call sites need no edits; assertions are recorded against the running +// TEST_CASE. expect_throw requires a std::invalid_argument whose message +// contains `substr` (when non-empty). +template void expect_throw(const char* name, F&& f, const std::string& substr = "") { + INFO(name); try { f(); - std::printf(" FAIL %s — no exception thrown\n", name); - ++g_fail; + FAIL("no exception thrown"); } catch (const std::invalid_argument& e) { - if (!substr.empty() && std::string(e.what()).find(substr) == std::string::npos) { - std::printf(" FAIL %s — wrong message: %s\n", name, e.what()); - ++g_fail; - } else { - std::printf(" OK %s\n", name); - ++g_pass; - } + INFO("message: " << e.what()); + if (!substr.empty()) + CHECK(std::string(e.what()).find(substr) != std::string::npos); + else + SUCCEED(); } catch (const std::exception& e) { - std::printf(" FAIL %s — unexpected exception: %s\n", name, e.what()); - ++g_fail; + FAIL("unexpected exception: " << e.what()); } } -template +template void expect_no_throw(const char* name, F&& f) { + INFO(name); try { f(); - std::printf(" OK %s\n", name); - ++g_pass; + SUCCEED(); } catch (const std::exception& e) { - std::printf(" FAIL %s — exception: %s\n", name, e.what()); - ++g_fail; + FAIL("exception: " << e.what()); } } -void test_n_zero() { +TEST_CASE("test_n_zero", "[validation]") { expect_throw("Sym: n=0", []{ arnoldi::Arnoldi("I", 0, "LM", 1, 3); }, "n must be"); } -void test_n_negative() { +TEST_CASE("test_n_negative", "[validation]") { expect_throw("Sym: n=-1", []{ arnoldi::Arnoldi("I", -1, "LM", 1, 3); }, "n must be"); } -void test_nev_zero() { +TEST_CASE("test_nev_zero", "[validation]") { expect_throw("Sym: nev=0", []{ arnoldi::Arnoldi("I", 10, "LM", 0, 5); }, "nev must be"); } -void test_nev_negative() { +TEST_CASE("test_nev_negative", "[validation]") { expect_throw("Nonsym: nev=-1", []{ arnoldi::Arnoldi("I", 10, "LM", -1, 5); }, "nev must be"); } -void test_ncv_too_large() { +TEST_CASE("test_ncv_too_large", "[validation]") { expect_throw("Sym: ncv > n", []{ arnoldi::Arnoldi("I", 5, "LM", 2, 10); }, "ncv must be <= global"); } -void test_ncv_sym_too_small() { +TEST_CASE("test_ncv_sym_too_small", "[validation]") { expect_throw("Sym: ncv == nev", []{ arnoldi::Arnoldi("I", 20, "LM", 5, 5); }, "ncv must satisfy"); } -void test_ncv_nonsym_too_small() { +TEST_CASE("test_ncv_nonsym_too_small", "[validation]") { expect_throw("Nonsym: ncv == nev+1", []{ arnoldi::Arnoldi("I", 20, "LM", 4, 5); }, "ncv must satisfy"); } -void test_ncv_nonsym_boundary() { +TEST_CASE("test_ncv_nonsym_boundary", "[validation]") { expect_no_throw("Nonsym: ncv == nev+2 (valid)", []{ arnoldi::Arnoldi("I", 20, "LM", 4, 6); }); } -void test_ncv_sym_boundary() { +TEST_CASE("test_ncv_sym_boundary", "[validation]") { expect_no_throw("Sym: ncv == nev+1 (valid)", []{ arnoldi::Arnoldi("I", 20, "LM", 5, 6); }); } -void test_bmat_invalid() { +TEST_CASE("test_bmat_invalid", "[validation]") { expect_throw("Sym: bmat='X'", []{ arnoldi::Arnoldi("X", 10, "LM", 2, 5); }, "bmat must be"); } -void test_bmat_empty() { +TEST_CASE("test_bmat_empty", "[validation]") { expect_throw("Sym: bmat=''", []{ arnoldi::Arnoldi("", 10, "LM", 2, 5); }, "bmat must be"); } -void test_bmat_generalized() { +TEST_CASE("test_bmat_generalized", "[validation]") { expect_no_throw("Sym: bmat='G' (valid)", []{ arnoldi::Arnoldi("G", 10, "LM", 2, 5); }); } -void test_which_sym_invalid() { +TEST_CASE("test_which_sym_invalid", "[validation]") { expect_throw("Sym: which='XX'", []{ arnoldi::Arnoldi("I", 10, "XX", 2, 5); }, "which must be"); } -void test_which_sym_LI() { +TEST_CASE("test_which_sym_LI", "[validation]") { // LI is valid for Nonsym but not Sym. expect_throw("Sym: which='LI'", []{ arnoldi::Arnoldi("I", 10, "LI", 2, 5); }, "which must be"); } -void test_which_sym_all_valid() { +TEST_CASE("test_which_sym_all_valid", "[validation]") { for (auto w : {"LM", "SM", "LA", "SA", "BE"}) { std::string name = std::string("Sym: which='") + w + "'"; expect_no_throw(name.c_str(), [w]{ arnoldi::Arnoldi("I", 20, w, 2, 5); }); } } -void test_which_nonsym_invalid() { +TEST_CASE("test_which_nonsym_invalid", "[validation]") { expect_throw("Nonsym: which='LA'", []{ arnoldi::Arnoldi("I", 20, "LA", 2, 5); }, "which must be"); } -void test_which_nonsym_all_valid() { +TEST_CASE("test_which_nonsym_all_valid", "[validation]") { for (auto w : {"LM", "SM", "LR", "SR", "LI", "SI"}) { std::string name = std::string("Nonsym: which='") + w + "'"; expect_no_throw(name.c_str(), [w]{ arnoldi::Arnoldi("I", 20, w, 2, 5); }); } } -void test_which_herm_invalid() { +TEST_CASE("test_which_herm_invalid", "[validation]") { using cplx = std::complex; expect_throw("Herm: which='LI'", []{ arnoldi::Arnoldi("I", 20, "LI", 2, 5); }, "which must be"); } -void test_which_herm_all_valid() { +TEST_CASE("test_which_herm_all_valid", "[validation]") { using cplx = std::complex; for (auto w : {"LM", "SM", "LA", "SA", "BE"}) { std::string name = std::string("Herm: which='") + w + "'"; expect_no_throw(name.c_str(), [w]{ arnoldi::Arnoldi("I", 20, w, 2, 5); }); } } - -int main() { - std::printf("test_validation:\n"); - - test_n_zero(); - test_n_negative(); - test_nev_zero(); - test_nev_negative(); - test_ncv_too_large(); - test_ncv_sym_too_small(); - test_ncv_nonsym_too_small(); - test_ncv_nonsym_boundary(); - test_ncv_sym_boundary(); - test_bmat_invalid(); - test_bmat_empty(); - test_bmat_generalized(); - test_which_sym_invalid(); - test_which_sym_LI(); - test_which_sym_all_valid(); - test_which_nonsym_invalid(); - test_which_nonsym_all_valid(); - test_which_herm_invalid(); - test_which_herm_all_valid(); - - std::printf("\n%d passed, %d failed\n", g_pass, g_fail); - return g_fail > 0 ? 1 : 0; -}