CUDA

NVIDIA's parallel computing platform and the de-facto standard for GPU compute. It has the richest ecosystem — cuDNN, cuBLAS, CUTLASS, Nsight tooling — and is the target every major ML framework optimises for first.

NVIDIA · C/C++ · Python · NVIDIA only · most mature

Official docs ↗ ← All libraries

Install

# Install the CUDA Toolkit (Linux example)
# Download the cuda-keyring package from the ubuntu2404/x86_64 path of NVIDIA's
# repository (the path encodes distro and architecture; adjust for other systems).
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
# Install the downloaded keyring package with dpkg (needs root).
sudo dpkg -i cuda-keyring_1.1-1_all.deb
# Refresh the package index and, only if that succeeds, install the toolkit
# non-interactively (-y).
sudo apt-get update && sudo apt-get install -y cuda-toolkit
# NOTE(review): if nvcc is not found afterwards, the toolkit's bin directory is
# presumably not on PATH yet — confirm against the official install guide.
nvcc --version   # verify: prints the nvcc compiler version if the install worked

Hello, GPU

vecadd.cu — add two vectors on the GPU

#include <cuda_runtime.h>
#include <stdio.h>

// Element-wise vector sum: writes c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, selected by its flat index blockIdx.x * blockDim.x + threadIdx.x.
// Threads whose index is >= n exit without touching memory, so the launch may
// be rounded up past n. Uses no shared memory.
//
//   a, b  input arrays, read at indices [0, n)
//   c     output array, written at indices [0, n)
//   n     number of elements to process
__global__ void add(const float* a, const float* b, float* c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) return;   // tail threads of the last block have no element
    c[idx] = a[idx] + b[idx];
}

// Evaluate a CUDA runtime call; on failure print file, line and the runtime's
// error string to stderr and make main() return 1. It expands to a `return`
// statement, so it is only meant to be used directly inside main().
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Host driver: allocates three managed arrays of 2^20 floats, fills a with 1.0
// and b with 2.0 on the host, launches add() to compute c = a + b, waits for
// the kernel, prints c[0] and frees the arrays.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
// reports an error (buffers are not freed on that path; the process exits).
int main() {
    const int n = 1 << 20;
    const size_t bytes = n * sizeof(float);

    float *a, *b, *c;                 // unified memory
    CUDA_CHECK(cudaMallocManaged(&a, bytes));
    CUDA_CHECK(cudaMallocManaged(&b, bytes));
    CUDA_CHECK(cudaMallocManaged(&c, bytes));
    for (int i = 0; i < n; i++) { a[i] = 1.0f; b[i] = 2.0f; }

    // Ceil-divide so the grid covers all n elements even when n is not a
    // multiple of the block size; the kernel's bounds check handles the excess.
    int threads = 256, blocks = (n + threads - 1) / threads;
    add<<<blocks, threads>>>(a, b, c, n);
    CUDA_CHECK(cudaGetLastError());        // launch itself returns no status
    CUDA_CHECK(cudaDeviceSynchronize());   // wait for the kernel before the host reads c

    printf("c[0] = %f\n", c[0]);     // 3.0
    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
    return 0;
}

Run it:

# Compile vecadd.cu with nvcc into an executable named vecadd, then run it only if compilation succeeds.
nvcc vecadd.cu -o vecadd && ./vecadd

Learn more