Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
# Build directories
build/
Build/
build-*/
*build*/
/cmake-build-*/

# CMake generated files
Expand Down
9 changes: 9 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)

option(ARNOLDI_BUILD_EXAMPLES "Build example programs" OFF)
option(ARNOLDI_REGRESSION "Build regression benchmark against original ARPACK (requires arpackng)" OFF)
option(ARNOLDI_USE_CUDA "Build with CUDA + cuBLAS backend (Sym / Herm only)" OFF)

find_package(BLAS REQUIRED)
find_package(LAPACK REQUIRED)
Expand All @@ -25,6 +26,14 @@ else()
target_link_libraries(arnoldi INTERFACE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
endif()

if(ARNOLDI_USE_CUDA)
# cuBLAS / cudart are C-API libraries; no nvcc required for the wrappers.
find_package(CUDAToolkit REQUIRED)
target_link_libraries(arnoldi INTERFACE CUDA::cudart CUDA::cublas)
target_compile_definitions(arnoldi INTERFACE ARNOLDI_USE_CUDA)
message(STATUS "Arnoldi: CUDA backend enabled (cuBLAS=${CUDAToolkit_VERSION})")
endif()

include(GNUInstallDirs)
install(TARGETS arnoldi EXPORT arnoldiTargets)
install(DIRECTORY include/arnoldi DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
Expand Down
76 changes: 75 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Header-only API for a C++ port of ARPACK’s symmetric and nonsymmetric eigenval

- C++17 compiler
- BLAS and LAPACK (Fortran calling convention, `_` suffix)
- *(optional, GPU backend)* CUDA Toolkit with cuBLAS — see [GPU (CUDA) backend](#gpu-cuda-backend)
- *(optional, tests)* none to install: Catch2 v3 is fetched automatically via CMake `FetchContent` (needs network on first configure)

## Build (library interface + examples)

Expand All @@ -19,6 +21,20 @@ cmake --build build
./build/examples/arpack_cpp_examples dnsimp # Fortran EXAMPLES catalog (see examples/README.md)
```

Options:

| Option | Default | Effect |
| --- | --- | --- |
| `ARNOLDI_BUILD_EXAMPLES` | OFF | build the `examples/` programs |
| `ARNOLDI_BUILD_TESTS` | OFF | build the Catch2 test suite (`ctest`) |
| `ARNOLDI_USE_CUDA` | OFF | enable the CUDA + cuBLAS backend (Sym / Herm) |

```bash
# tests
cmake -S . -B build -DARNOLDI_BUILD_TESTS=ON
cmake --build build && ctest --test-dir build
```

## Use in your project

```cmake
Expand All @@ -31,11 +47,69 @@ target_link_libraries(your_target PRIVATE arpack::callback)
// arnoldi::detail::naupd, neupd, saupd, seupd, ...
```

## GPU (CUDA) backend

`Arnoldi<K, Scalar, Comm, Backend>` is templated on a backend. The default
`CpuBackend` runs every length-`n` BLAS on the host (unchanged behaviour).
`CudaBackend` keeps the Lanczos workspace (`resid`, `v`, `workd`) in device
memory and routes those operations through cuBLAS, so the **matvec runs
entirely on the GPU**. Supported for `Kind::Sym` and `Kind::Herm` (Nonsym on
device is rejected at compile time — no cuSOLVER analogue for its real-Schur
path). The small implicitly-restarted shift machinery (`ncv²`-sized `h`,
`q`, `workl`) stays host-resident by design.

Enable it:

```bash
cmake -S . -B build -DARNOLDI_USE_CUDA=ON \
-DCMAKE_CUDA_ARCHITECTURES=<your_sm> # e.g. 61 for Quadro P1000
cmake --build build
```

`cuda.hpp` is plain C++ calling the cuBLAS C API — only the `.cu`
tests/examples need `nvcc`. If your system compiler is newer than what your
`nvcc` supports (e.g. CUDA 12.1 + GCC ≥ 13), point `nvcc` at an older host
compiler:

```bash
cmake -S . -B build -DARNOLDI_USE_CUDA=ON \
-DCMAKE_CUDA_HOST_COMPILER=/path/to/g++-12 \
-DCMAKE_CUDA_ARCHITECTURES=61
```

Usage — the matvec callback receives **device** pointers and should launch on
the backend's stream:

```cpp
#include <arnoldi/arnoldi.hpp>
#include <arnoldi/cuda.hpp>

arnoldi::Arnoldi<arnoldi::Kind::Sym, double,
arnoldi::SerialComm, arnoldi::CudaBackend>
s("I", n, "LM", nev, ncv);
s.tol(1e-10).maxiter(1000);

cudaStream_t stream = s.backend().stream();
s.solve([&](const double* d_x, double* d_y) { // device pointers
my_matvec_kernel<<<blocks, threads, 0, stream>>>(n, d_x, d_y);
});

auto r = s.eigenpairs(); // values + vectors copied to host
auto dr = s.eigenpairs_device(); // eigenvectors left on the GPU
```

Also available under a device backend: `s.initial_resid_device(devptr)` to
seed from a device-resident vector, and `s.backend().handle()` for the cuBLAS
handle. See `examples/arnoldi_sym_cuda.cu` (and `arnoldi_sym_cpu.cpp` for the
matching CPU baseline / benchmark).

## Layout

- `include/arnoldi/arnoldi.hpp` — include this
- `include/arnoldi/cuda.hpp` — opt-in CUDA backend (`-DARNOLDI_USE_CUDA=ON`)
- `include/arnoldi/detail/` — implementation headers
- `examples/` — `example_symmetric_laplacian`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog)
- `examples/` — `example_symmetric_laplacian`, `arnoldi_sym_cuda` / `arnoldi_sym_cpu`, plus `arpack_cpp_examples` (Fortran `EXAMPLES/` name catalog)
- `tests/` — Catch2 suite (`ctest`), incl. `test_solver_cuda` when CUDA is on

## Origin

Expand Down
38 changes: 38 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
# Included only when ARNOLDI_BUILD_EXAMPLES=ON (see parent CMakeLists.txt).

if(ARNOLDI_USE_CUDA AND NOT CMAKE_CUDA_ARCHITECTURES)
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
set(CMAKE_CUDA_ARCHITECTURES native)
else()
set(CMAKE_CUDA_ARCHITECTURES 61) # Pascal — safe minimum baseline
endif()
message(STATUS "Arnoldi examples: CMAKE_CUDA_ARCHITECTURES defaulted to "
"${CMAKE_CUDA_ARCHITECTURES} (override with -DCMAKE_CUDA_ARCHITECTURES=<cc>)")
endif()

if(ARNOLDI_BUILD_EXAMPLES)
add_executable(example_symmetric_laplacian symmetric_laplacian.cpp)
target_link_libraries(example_symmetric_laplacian PRIVATE arnoldi)

# CPU twin of arnoldi_sym_cuda — same problem, for backend benchmarking.
add_executable(example_arnoldi_sym_cpu arnoldi_sym_cpu.cpp)
target_link_libraries(example_arnoldi_sym_cpu PRIVATE arnoldi)

# Class-based Arnoldi<Kind, Scalar> demos (one per kind)
foreach(_demo IN ITEMS arnoldi_sym arnoldi_nonsym arnoldi_herm)
add_executable(example_${_demo} ${_demo}.cpp)
Expand All @@ -18,6 +32,16 @@ if(ARNOLDI_BUILD_EXAMPLES)
target_link_libraries(example_${_demo} PRIVATE arnoldi)
endforeach()

# CUDA device-matvec example (only when the CUDA backend is enabled)
if(ARNOLDI_USE_CUDA)
enable_language(CUDA)
add_executable(example_arnoldi_sym_cuda arnoldi_sym_cuda.cu)
set_target_properties(example_arnoldi_sym_cuda PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON)
target_link_libraries(example_arnoldi_sym_cuda PRIVATE arnoldi)
endif()

# MPI parallel example (optional — only if MPI found)
find_package(MPI COMPONENTS CXX QUIET)
if(MPI_CXX_FOUND)
Expand All @@ -30,8 +54,22 @@ endif()
# Performance + correctness benchmark against original ARPACK
if(ARNOLDI_BUILD_EXAMPLES AND ORIGINAL_ARPACK_LIB)
enable_language(Fortran)

# With the CUDA backend the benchmark adds a GPU-vs-Fortran tier with
# device matvec kernels (__global__ / <<<>>>), so the .cpp must be
# compiled by nvcc.
if(ARNOLDI_USE_CUDA)
enable_language(CUDA)
set_source_files_properties(benchmark.cpp PROPERTIES LANGUAGE CUDA)
endif()

add_executable(bench_arpack benchmark.cpp)
target_include_directories(bench_arpack PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
if(ARNOLDI_USE_CUDA)
set_target_properties(bench_arpack PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON)
endif()
if(TARGET BLAS::BLAS AND TARGET LAPACK::LAPACK)
target_link_libraries(bench_arpack PRIVATE
arnoldi ${ORIGINAL_ARPACK_LIB} LAPACK::LAPACK BLAS::BLAS)
Expand Down
52 changes: 52 additions & 0 deletions examples/arnoldi_sym_cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// arnoldi_sym_cpu — CPU twin of examples/arnoldi_sym_cuda.cu.
//
// Same problem (diagonal operator A(i,i) = i+1, n = 8192, nev = 5,
// ncv = 8*nev, "LM", tol 1e-10) solved with the default CpuBackend and a
// host matvec. Times the solve() call so it can be compared directly
// against the CUDA example as a backend performance benchmark.
//
// Build: configure with -DARNOLDI_BUILD_EXAMPLES=ON (no CUDA needed).

#include <arnoldi/arnoldi.hpp>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
const int n = 1 << 18; // 8192 — same size as the CUDA example
const int nev = 5;
const int ncv = 8 * nev; // matches arnoldi_sym_cuda.cu

// "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1.
arnoldi::Arnoldi<arnoldi::Kind::Sym, double> solver("I", n, "LM", nev, ncv);
solver.tol(1e-10).maxiter(1000);

auto t0 = std::chrono::steady_clock::now();
solver.solve([n](const double* x, double* y) {
for (int i = 0; i < n; ++i) y[i] = static_cast<double>(i + 1) * x[i];
});
auto t1 = std::chrono::steady_clock::now();
double secs = std::chrono::duration<double>(t1 - t0).count();

if (!solver.converged()) {
std::printf("CPU saupd failed, info = %d\n", solver.info());
return 1;
}

auto r = solver.eigenpairs(false); // values only
std::sort(r.values.begin(), r.values.end(), std::greater<double>());
std::printf("Largest %d eigenvalues of diag(1..%d), CPU backend:\n", nev, n);
for (int i = 0; i < nev; ++i) {
double exact = static_cast<double>(n - i);
std::printf(" lambda[%d] = %.6f (exact %.0f, |err| = %.2e)\n",
i, r.values[i], exact, std::fabs(r.values[i] - exact));
}
std::printf("iterations=%d, OP applies=%d\n",
solver.num_iterations(), solver.num_op_applies());
std::printf("solve() wall time: %.4f s\n", secs);
return 0;
}
94 changes: 94 additions & 0 deletions examples/arnoldi_sym_cuda.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// arnoldi_sym_cuda — smallest eigenvalues of the 1D Dirichlet Laplacian
// with the matvec running entirely on the GPU.
//
// Demonstrates the CudaBackend device-matvec pattern:
// - Arnoldi<Kind::Sym, double, SerialComm, CudaBackend> keeps the Lanczos
// workspace (resid/v/workd) in device memory.
// - The user matvec callback receives DEVICE pointers and launches a CUDA
// kernel on the backend's stream.
// - eigenpairs() copies results to the host; eigenpairs_device() leaves
// the eigenvectors on the GPU for downstream device work.
//
// Build: configure with -DARNOLDI_USE_CUDA=ON -DARNOLDI_BUILD_EXAMPLES=ON.

#include <arnoldi/arnoldi.hpp>
#include <arnoldi/cuda.hpp>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>
#include <functional>
#include <vector>

// y = A x for the diagonal operator A(i,i) = i+1. The spectrum {1,2,...,n}
// is perfectly separated, so IRA converges fast at any n — a clean showcase
// of a large GPU matvec with exactly-known eigenvalues. (The tridiagonal
// Laplacian is clustered at both spectral ends for large n and would need
// shift-invert to target either end quickly — out of scope for a demo.)
__global__ void diag_matvec(int n, const double* x, double* y) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
y[i] = static_cast<double>(i + 1) * x[i];
}

int main() {
int dev_count = 0;
cudaGetDeviceCount(&dev_count);
if (dev_count == 0) {
std::printf("no CUDA device available\n");
return 0;
}

const int n = 1 << 18; // 8192 — a non-trivial GPU matvec
const int nev = 5;
const int ncv = 8 * nev; // larger Krylov subspace: the relative gap of
// the top eigenvalues is ~1/n, so a bigger ncv
// keeps IRA convergence fast at large n

// "LM": the nev largest eigenvalues, i.e. n, n-1, ..., n-nev+1.
arnoldi::Arnoldi<arnoldi::Kind::Sym, double, arnoldi::SerialComm,
arnoldi::CudaBackend> solver("I", n, "LM", nev, ncv);
solver.tol(1e-10).maxiter(1000);

cudaStream_t stream = solver.backend().stream();
const int threads = 256;
const int blocks = (n + threads - 1) / threads;

// x, y are device pointers (workspace lives on the GPU). Launch on the
// backend's stream so the surrounding cuBLAS calls order correctly.
auto t0 = std::chrono::steady_clock::now();
solver.solve([&](const double* x, double* y) {
diag_matvec<<<blocks, threads, 0, stream>>>(n, x, y);
});
cudaStreamSynchronize(stream); // ensure all device work is finished
auto t1 = std::chrono::steady_clock::now();
double secs = std::chrono::duration<double>(t1 - t0).count();

if (!solver.converged()) {
std::printf("CUDA saupd failed, info = %d\n", solver.info());
return 1;
}

auto r = solver.eigenpairs(false); // values only (host)
// eigenpairs(false) returns the wanted set in which-selection order, not
// sorted by value — sort descending for a clean presentation.
std::sort(r.values.begin(), r.values.end(), std::greater<double>());
std::printf("Largest %d eigenvalues of diag(1..%d), CUDA backend:\n", nev, n);
for (int i = 0; i < nev; ++i) {
double exact = static_cast<double>(n - i); // n, n-1, ..., n-nev+1
std::printf(" lambda[%d] = %.6f (exact %.0f, |err| = %.2e)\n",
i, r.values[i], exact, std::fabs(r.values[i] - exact));
}
std::printf("iterations=%d, OP applies=%d\n",
solver.num_iterations(), solver.num_op_applies());
std::printf("solve() wall time: %.4f s\n", secs);

// Eigenvectors can also be kept on the device for further GPU work.
auto dr = solver.eigenpairs_device(true);
std::printf("eigenpairs_device: %d eigenvalues on host, "
"%zu-element eigenvector matrix left on device\n",
(int)dr.values.size(), (size_t)(n) * nev);
return 0;
}
Loading
Loading