using BenchmarkTools
using CUDA
using Random
using Test
using LinearAlgebra
using ForwardDiff
using ProgressMeter
using Plots
GPU
CUDA.version()
Useful function to enable GPU version of your code
CUDA.functional()
for device in CUDA.devices()
@show capability(device)
@show name(device)
end
Array programming
Construction
a = CuArray{Float32,2}(undef, 2, 2)
similar(a)
a = CuArray([1,2,3])
Transfer to CPU
b is allocated on the CPU, a data transfer is made
b = Array(a)
collect(a)
API compatibility with Base.Array
CUDA.ones(2)
a = CUDA.zeros(Float32, 2)
a isa AbstractArray
CUDA.fill(42, (3,4))
Random numbers
CUDA.rand(2, 2)
CUDA.randn(2, 1)
x = CUDA.CuArray(0:0.01:1.0)
nt = length(x)
y = 0.2 .+ 0.5 .* x + 0.1 .* CUDA.randn(nt);
scatter( Array(x), Array(y))
plot!( x -> 0.2 + 0.5x)
xlims!(0,1)
ylims!(0,1)
X = hcat(CUDA.ones(nt), x);
β = X'X \ X'y
sum( ( β[1] .+ β[2] .* x .- y).^2 )
a = CuArray([1 2 3])
view(a, 2:3)
a = CuArray{Float64}([1 2 3])
b = CuArray{Float64}([4 5 6])
map(a) do x
    x + 1
end
reduce(+, a)
accumulate(+, b; dims=2)
findfirst(isequal(2), a)
a = CuArray([1 2 3])
b = CuArray([4 5 6])
map(a) do x
    x + 1
end
a .+ 2b
reduce(+, a)
accumulate(+, b; dims=2)
findfirst(isequal(2), a)
CURAND
CUDA.rand!(a)
CUBLAS
a * b'
CUSOLVER
L, ipiv = CUDA.CUSOLVER.getrf!(a'b)
CUDA.CUSOLVER.getrs!('N', L, ipiv, CUDA.ones(Float64, 3))
CUFFT
fft = CUFFT.plan_fft(a)
fft * a
ifft = CUFFT.plan_ifft(a)
real(ifft * (fft * a))
CUDNN
CUDA.CUDNN.softmax(real(ans))
CUSPARSE
CUDA.CUSPARSE.CuSparseMatrixCSR(a)
Workflow
A typical approach for porting or developing an application for the GPU is as follows:
- develop an application using generic array functionality, and test it on the CPU with the Array type
- port your application to the GPU by switching to the CuArray type
- disallow the CPU fallback (“scalar indexing”) to find operations that are not implemented for or incompatible with GPU execution
- (optional) use lower-level, CUDA-specific interfaces to implement missing functionality or optimize performance
Linear regression example
# Mean squared-error loss: average squared residual over the
# observations (the columns of `y`).
function loss(w, b, x, y)
    residual = y - (w * x .+ b)
    return sum(abs2, residual) / size(y, 2)
end

# Gradient of the loss with respect to the weight array `w`
# (`ForwardDiff.gradient` works on array-valued arguments).
loss∇w(w, b, x, y) = ForwardDiff.gradient(ω -> loss(ω, b, x, y), w)

# Derivative of the loss with respect to the bias `b`
# (`ForwardDiff.derivative` is used rather than `ForwardDiff.gradient`
# because `b` is a scalar, not an array).
lossdb(w, b, x, y) = ForwardDiff.derivative(β -> loss(w, β, x, y), b)
# proximal gradient descent function
function train(w, b, x, y; lr=0.1)
-= lmul!(lr, loss∇w(w, b, x, y))
w -= lr * lossdb(w, b, x, y)
b return w, b
end
function cpu_test(n = 1000, p = 100, iter = 100)
= randn(n, p)'
x = sum(x[1:5,:]; dims=1) .+ randn(n)' * 0.1
y = 0.0001 * randn(1, p)
w = 0.0
b for i = 1:iter
= train(w, b, x, y)
w, b end
return loss(w,b,x,y)
end
@time cpu_test()
Moving to GPU
# GPU variant of `cpu_test`: the synthetic data is generated on the CPU,
# then moved to the device with `CuArray` before the training loop.
function gpu_test( n = 1000, p = 100, iter = 100)
    x = randn(n, p)'
    y = sum(x[1:5, :]; dims=1) .+ randn(n)' * 0.1
    w = 0.0001 * randn(1, p)
    b = 0.0
    # Transfer data and parameters to the GPU (`b` stays a CPU scalar).
    x = CuArray(x)
    y = CuArray(y)
    w = CuArray(w)
    for i = 1:iter
        w, b = train(w, b, x, y)
    end
    return loss(w, b, x, y)
end
@time gpu_test()
@btime cpu_test( 10000, 100, 100)
@btime gpu_test( 10000, 100, 100);
Custom Kernel
- you cannot allocate memory,
- I/O is disallowed,
- badly-typed code will not compile.
Keep kernels simple, and only incrementally port code while continuously verifying that it still compiles and executes as expected.
a = CUDA.zeros(1024)
# Minimal CUDA kernel: each thread increments the element of `a` at its
# own thread index. Kernels must return `nothing`, hence the bare `return`.
function kernel(a)
    i = threadIdx().x
    a[i] += 1
    return
end
@cuda threads=length(a) kernel(a)
a = CUDA.rand(Int, 1000)
norm(a)
@btime norm($a)
@btime norm($(Array(a)))
The norm
computation is much faster on the CPU
CUDA.allowscalar(false)
a = CuArray(1:9_999_999);
@time a .+ reverse(a);
You need two kernels to perform this last computation. `@time` is not the right tool to evaluate the elapsed time: the task is scheduled on the GPU device but not yet executed — it runs only when you fetch the result back on the CPU.
@time CUDA.@sync a .+ reverse(a);
CUDA.@time a .+ reverse(a);
@btime CUDA.@sync $a .+ reverse($a);
@btime CUDA.@sync $(Array(a)) .+ reverse($(Array(a)));
NVIDIA Nsight Compute
$ nv-nsight-cu-cli --profile-from-start off julia
julia> CUDAdrv.@profile a .+ reverse(a)
julia> exit()
$ nsys launch julia
julia> CUDAdrv.@profile a .+ reverse(a)
Interactive development
kernel() = (@cuprintln("foo"); return)
@cuda kernel()
kernel() = (@cuprintln("bar"); return)
@cuda kernel()
There is a significant overhead when you launch several kernels
a = CuArray(1:9_999_999)
c = similar(a)
c .= a .+ reverse(a);
# First attempt at `c .= a .+ reverse(b)` as a kernel.
# NOTE: `reverse(b)` allocates a new array, and dynamic allocation is not
# allowed inside a GPU kernel — launching this version raises an error
# (see the "Unsupported" section below).
function vadd_reverse(c, a, b)
    i = threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + reverse(b)[i]
    end
    return
end
Unsupported
@cuda threads = length(a) vadd_reverse(c, a, a)
will raise an error because some features are not supported on the GPU
- Dynamic allocations
- Garbage collection
- Dynamic dispatch
- Input/Output
# Allocation-free version: index `b` from the end instead of materializing
# `reverse(b)`. Still uses only `threadIdx`, so it is limited to a single
# block — `length(c)` must not exceed the device's max threads per block.
function vadd_reverse(c, a, b)
    i = threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[end - i + 1]
    end
    return
end
The following expression will also raise an error because you can’t loop over an array on GPU in the same way.
@cuda threads = length(a) vadd_reverse(c, a, a)
attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
# Multi-block version: compute a global index from the block and thread
# coordinates so the launch can span several blocks, covering arrays
# larger than the per-block thread limit.
function vadd_reverse(c, a, b)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[end - i + 1]
    end
    return
end
The kernel you built is twice as fast
@btime CUDA.@sync @cuda threads=1024 blocks=length($a)÷1024+1 vadd_reverse($c, $a, $a)
@btime CUDA.@sync $a .+ reverse($a);
To adapt the code to your device, use this configurator function
# Query the occupancy API for a suitable launch configuration of the
# compiled `kernel`, sized for the array to be processed.
# NOTE(review): this relies on the global array `a` for the problem size;
# consider passing the length as an argument instead.
function configurator(kernel)
    config = launch_configuration(kernel.fun)
    threads = min(length(a), config.threads)
    blocks = cld(length(a), threads)   # enough blocks to cover all elements
    return (threads=threads, blocks=blocks)
end
@cuda config=configurator vadd_reverse(c, a, a)
12 Performance optimization
Avoid thread divergence
Reduce and coalesce global accesses
Improve occupancy, optimize launch configuration
Early-free array using CUDA.unsafe_free!
Annotate with @inbounds to avoid exception branches
Use 32-bit integer and floating-point values
@device_code_llvm @cuda vadd_reverse(c, a, a)
Cooperative groups
https://devblogs.nvidia.com/cooperative-groups/
@cuda cooperative=true kernel(...)
Dynamic parallelism
https://devblogs.nvidia.com/cuda-dynamic-parallelism-api-principles/
@cuda dynamic=true kernel(…)
Standard output
@cuprintln("Thread $(threadIdx().x)")
Atomics
@atomic a[...] += val
GPU programming performance tips
- Avoid thread divergence
- Reduce and coalesce global accesses
- Improve occupancy
- Early-free arrays with CUDA.unsafe_free!
- Annotate with
@inbounds
- Use 32 bits for float and integers