GPU

using BenchmarkTools
using CUDA
using Random
using Test
using LinearAlgebra
using ForwardDiff
using ProgressMeter
using Plots
CUDA.version()

Useful function to enable GPU version of your code

CUDA.functional()
for device in CUDA.devices()
    @show capability(device)
    @show name(device)
end

Array programming

Construction

a = CuArray{Float32,2}(undef, 2, 2)
similar(a)
a = CuArray([1,2,3])

Transfer to CPU

b is allocated on the CPU, a data transfer is made

b = Array(a)
collect(a)

API compatibilty with Base.Array

CUDA.ones(2)
a = CUDA.zeros(Float32, 2)
a isa AbstractArray
CUDA.fill(42, (3,4))

Random numbers

CUDA.rand(2, 2)
CUDA.randn(2, 1)
x = CUDA.CuArray(0:0.01:1.0)
nt = length(x)
y = 0.2 .+ 0.5 .* x + 0.1 .* CUDA.randn(nt);
scatter( Array(x), Array(y))
plot!( x -> 0.2 + 0.5x)
xlims!(0,1)
ylims!(0,1)
X = hcat(CUDA.ones(nt), x);
β = X'X \ X'y
sum( ( β[1] .+ β[2] .* x .- y).^2 )
a = CuArray([1 2 3])
view(a, 2:3)
a = CuArray{Float64}([1 2 3])
b = CuArray{Float64}([4 5 6])

map(a) do x
    x + 1
end
reduce(+, a)
accumulate(+, b; dims=2)
findfirst(isequal(2), a)
a = CuArray([1 2 3])
b = CuArray([4 5 6])

map(a) do x
    x + 1
end

a .+ 2b

reduce(+, a)

accumulate(+, b; dims=2)

findfirst(isequal(2), a)

CURAND

CUDA.rand!(a)

CUBLAS

a * b'

CUSOLVER

L, ipiv = CUDA.CUSOLVER.getrf!(a'b)
CUDA.CUSOLVER.getrs!('N', L, ipiv, CUDA.ones(Float64, 3))

CUFFT

fft = CUFFT.plan_fft(a) 
fft * a
ifft = CUFFT.plan_ifft(a)
real(ifft * (fft * a))

CUDDN

CUDA.CUDNN.softmax(real(ans))

CUSPARSE

CUDA.CUSPARSE.CuSparseMatrixCSR(a)

Workflow

A typical approach for porting or developing an application for the GPU is as follows:

  1. develop an application using generic array functionality, and test it on the CPU with the Array type
  2. port your application to the GPU by switching to the CuArray type
  3. disallow the CPU fallback (“scalar indexing”) to find operations that are not implemented for or incompatible with GPU execution
  4. (optional) use lower-level, CUDA-specific interfaces to implement missing functionality or optimize performance

Linear regression example

# squared error loss function
loss(w, b, x, y) = sum(abs2, y - (w*x .+ b)) / size(y, 2)
# get gradient w.r.t to `w`
loss∇w(w, b, x, y) = ForwardDiff.gradient(w -> loss(w, b, x, y), w)
# get derivative w.r.t to `b` (`ForwardDiff.derivative` is
# used instead of `ForwardDiff.gradient` because `b` is
# a scalar instead of an array)
lossdb(w, b, x, y) = ForwardDiff.derivative(b -> loss(w, b, x, y), b)
# proximal gradient descent function
function train(w, b, x, y; lr=0.1)
    w -= lmul!(lr, loss∇w(w, b, x, y))
    b -= lr * lossdb(w, b, x, y)
    return w, b
end
function cpu_test(n = 1000, p = 100, iter = 100)
    x = randn(n, p)'
    y = sum(x[1:5,:]; dims=1) .+ randn(n)' * 0.1
    w = 0.0001 * randn(1, p)
    b = 0.0
    for i = 1:iter
       w, b = train(w, b, x, y)
    end
    return loss(w,b,x,y)
end
@time cpu_test()

Moving to GPU

function gpu_test( n = 1000, p = 100, iter = 100)
    x = randn(n, p)'
    y = sum(x[1:5,:]; dims=1) .+ randn(n)' * 0.1
    w = 0.0001 * randn(1, p)
    b = 0.0
    x = CuArray(x)
    y = CuArray(y)
    w = CuArray(w)
    
    for i = 1:iter
       w, b = train(w, b, x, y)
       
    end
    return loss(w,b,x,y)
end
@time gpu_test()
@btime cpu_test( 10000, 100, 100)
@btime gpu_test( 10000, 100, 100);

Custom Kernel

  • you cannot allocate memory,
  • I/O is disallowed,
  • badly-typed code will not compile.

Keep kernels simple, and only incrementally port code while continuously verifying that it still compiles and executes as expected.

a = CUDA.zeros(1024)

function kernel(a)
    i = threadIdx().x
    a[i] += 1
    return
end

@cuda threads=length(a) kernel(a)
a = CUDA.rand(Int, 1000)
norm(a)
@btime norm($a)
@btime norm($(Array(a)))

The norm computation is much faster on the CPU

CUDA.allowscalar(false)
a = CuArray(1:9_999_999);
@time a .+ reverse(a);

You need two kernels to perfom this last computation. @time is not the right tool to evaluate the elasped time. The task is scheduled on the GPU device but not executed. It will be executed when you will fetch the result on the CPU.

@time CUDA.@sync a .+ reverse(a);
CUDA.@time a .+ reverse(a);
@btime CUDA.@sync $a .+ reverse($a);
@btime CUDA.@sync $(Array(a)) .+ reverse($(Array(a)));

NVIDIA Nsight Compute

$ nv-nsight-cu-cli --profile-from-start off julia
julia> CUDAdrv.@profile a .+ reverse(a)
julia> exit()
$ nsys launch julia
julia> CUDAdrv.@profile a .+ reverse(a)

Interactive development

kernel() = (@cuprintln("foo"); return)
@cuda kernel()
kernel() = (@cuprintln("bar"); return)
@cuda kernel()

There is a significant overhead when you launch several kernels

a = CuArray(1:9_999_999)
c = similar(a)
c .= a .+ reverse(a);
function vadd_reverse(c, a, b)
    i = threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + reverse(b)[i]
    end
    return
end

Unsupported

@cuda threads = length(a) vadd_reverse(c, a, a)

will raise an error because some features are not supported on the GPU

  • Dynamic allocations
  • Garbage collection
  • Dynamic dispatch
  • Input/Output
function vadd_reverse(c, a, b)
    
    i = threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[end - i + 1]
    end
    return
end

The following expression will also raise an error because you can’t loop over an array on GPU in the same way.

@cuda threads = length(a) vadd_reverse(c, a, a)
attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
function vadd_reverse(c, a, b)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[end - i + 1]
    end
    return
end

The kernel you built is twice faster

@btime CUDA.@sync @cuda threads=1024 blocks=length($a)÷1024+1 vadd_reverse($c, $a, $a)
@btime CUDA.@sync $a .+ reverse($a);

To adapt the code to your device, use this configurator function

function configurator(kernel)
    config = launch_configuration(kernel.fun)
    threads = min(length(a), config.threads)
    blocks = cld(length(a), threads)
    return (threads=threads, blocks=blocks)
end
@cuda config=configurator vadd_reverse(c, a, a)

12 Performance optimization

  • Avoid thread divergence

  • Reduce and coalesce global accesses

  • Improve occuancy, optimize launch configuration

  • Early-free array using CUDA.unsafe_free!

  • Annotate with @inbounds to avoid exception branches

  • Use Int32 and floating-point values

@device_code_llvm @cuda vadd_reverse(c, a, a)

Cooperative groups

https://devblogs.nvidia.com/cooperative-groups/

@cuda cooperative=true kernel(...)

Shared memory

https://devblogs.nvidia.com/using-shared-memory-cuda-cc/

a = @cuStaticSharedMem(Int, 64)

Dynamic parallelism

https://devblogs.nvidia.com/cuda-dynamic-parallelism-api-principles/

@cuda dynamic=true kernel(…)

Standard output

@cuprintln("Thread $(threadIdx().x)")

Atomics

@atomic a[...] += val

GPU programming performance tips