CUDA
NVIDIA's parallel computing platform and the de-facto standard for GPU compute. The richest ecosystem — cuDNN, cuBLAS, CUTLASS, Nsight tooling — and the target every major ML framework optimises for first.
NVIDIA · C/C++ · Python · NVIDIA only · most mature
Install
# Install the CUDA Toolkit (Linux example)
# The URL below points at NVIDIA's repository path for Ubuntu 24.04 on x86_64;
# other distributions/architectures need a different path.
# Step 1: download the cuda-keyring package.
# NOTE(review): presumably this package registers NVIDIA's apt repository and
# signing key — confirm against NVIDIA's installation guide.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
# Step 2: install the package that was just downloaded.
sudo dpkg -i cuda-keyring_1.1-1_all.deb
# Step 3: refresh the package index, then install cuda-toolkit non-interactively (-y).
sudo apt-get update && sudo apt-get install -y cuda-toolkit
nvcc --version # verify
Hello, GPU
vecadd.cu — add two vectors on the GPU
#include <cuda_runtime.h>
#include <stdio.h>
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Parameters:
//   a, b  input vectors, read-only, at least n elements each
//   c     output vector, at least n elements
//   n     number of elements to process; n <= 0 does nothing
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread walks the array in steps of the total thread count, so a grid
// smaller than n still covers every element. No shared memory is used.
__global__ void add(const float* a, const float* b, float* c, int n) {
    // Index math is done in 64 bits: blockIdx.x * blockDim.x is otherwise a
    // 32-bit product that can wrap when n is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Evaluates a CUDA runtime call and, on failure, prints the file, line and
// error string to stderr and makes main() return 1. Only usable inside a
// function returning int. Allocations are not released on this path; the
// process is about to exit anyway.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Adds two 1M-element vectors on the GPU and prints the first result.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int n = 1 << 20;
    const size_t bytes = n * sizeof(float);

    // Unified (managed) memory: the same pointers are used by the host loop
    // below and by the kernel.
    float *a = nullptr, *b = nullptr, *c = nullptr;
    CUDA_CHECK(cudaMallocManaged(&a, bytes));
    CUDA_CHECK(cudaMallocManaged(&b, bytes));
    CUDA_CHECK(cudaMallocManaged(&c, bytes));

    for (int i = 0; i < n; i++) { a[i] = 1.0f; b[i] = 2.0f; }

    // Ceil-division so the grid has at least n threads even when n is not a
    // multiple of the block size.
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;
    add<<<blocks, threads>>>(a, b, c, n);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // wait for the kernel; surfaces execution errors

    printf("c[0] = %f\n", c[0]); // 3.0

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
    return 0;
}
Run it:
# Compile vecadd.cu into an executable named vecadd; the && runs it only if compilation succeeded.
nvcc vecadd.cu -o vecadd && ./vecadd