using CUDA
using CUDA.CUFFT
CUDA.allowscalar(false)
# CuArray: GPU array type that works like a regular Julia array but lives in GPU memory.
# CUFFT: CUDA-accelerated FFT library (much faster than CPU FFTW).
# allowscalar(false): strict mode — prevents accidental per-element GPU↔CPU transfers
# (a common performance trap).
using BenchmarkTools
# BenchmarkTools is used to measure execution time and compare GPU vs CPU performance.
"""
    matsuno!(h, u, p, pinv, Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

GPU-optimized in-place computation of the Matsuno model right-hand side.

On entry `h` and `u` hold the current spectra η̂ (elevation) and φ̂ (velocity);
on exit they are overwritten with the time derivatives η̂_t and φ̂_t.
All arrays are expected to be `CuArray`s living on the GPU.

Key differences from a CPU version:
1. All arrays are `CuArray` (live in GPU memory)
2. Uses the `@.` macro so element-wise operations fuse into single GPU kernels
3. In-place FFT plans (`p * x`, `pinv * x`) transform arrays directly, no copies
4. No explicit threading — the GPU provides implicit parallelism
5. No GPU↔CPU transfers occur inside this function

# Arguments
- `h`: η̂ (elevation spectrum, complex; overwritten with η̂_t)
- `u`: φ̂ (velocity spectrum, complex; overwritten with φ̂_t)
- `p`: forward FFT plan; assumed in-place (`p * x` overwrites `x`) — planned in
  `loop_over_time` with `plan_fft!`
- `pinv`: inverse FFT plan; likewise assumed in-place
- `Γ`: |k| (dispersion array)
- `ϵ`: nonlinearity parameter (scalar)
- `Dx`: ∂_x operator in Fourier space (i·k)
- `H`: Hilbert transform symbol (-i·sign(k))
- `Π⅔`: 2/3-rule dealiasing filter (Bool mask on wavenumbers)
- `unew, hnew, I₁, I₂, I₃`: preallocated complex work arrays, clobbered freely
- `I₀`: accepted for interface compatibility but NOT used in this function

The statement order below is load-bearing: work arrays are reused across
stages, and each FFT is applied exactly when its operand holds the intended
quantity. Do not reorder without re-deriving the algebra.
"""
function matsuno!(h, u, p, pinv, Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)
# === First group of terms ===
# unew = Γ·η̂  (|k| times the elevation spectrum)
@. unew = Γ * h
# ... to physical space (in-place inverse FFT)
pinv * unew
# hnew = Dx·η̂  (spectral x-derivative of elevation)
@. hnew = Dx * h
# Keep a copy of Dx·η̂ in I₁; hnew still holds the spectral version for L44
@. I₁ = hnew
# ... copy to physical space
pinv * I₁
# Pointwise nonlinear product (Γη)·(∂ₓη) in physical space
@. I₁ = unew * I₁
# Back to Fourier space
p * I₁
# Scale by ϵ, dealias, and subtract the (still spectral) linear term Dx·η̂
@. I₁ = I₁ * ϵ * Π⅔ - hnew
# === Second nonlinear product ===
# Bring η̂ and φ̂ to physical space (via work arrays so h, u stay intact)
@. hnew = h
pinv * hnew # η in physical space
@. unew = u
pinv * unew # φ in physical space
# Pointwise product η·φ in physical space
@. I₃ = hnew * unew
# Back to Fourier space
p * I₃
# === Hilbert-transform terms ===
# h now becomes an accumulator: start with H·φ̂ (h's old value is no longer needed)
@. h = H * u
# I₂ = Γ·φ̂
@. I₂ = Γ * u
# ... to physical space
pinv * I₂
# Pointwise product (Γφ)·η in physical space (hnew still holds physical η)
@. hnew = I₂ * hnew
# Back to Fourier space
p * hnew
# === Final combination ===
# η̂_t: combine linear Hilbert term with the dealiased nonlinear terms
@. h = h - (I₃ * Dx + hnew * H) * ϵ * Π⅔
# Kinetic-energy-type term: square φ in physical space (unew holds physical φ)
@. unew = unew^2
# Back to Fourier space
p * unew
# φ̂_t: first-group terms minus the dealiased derivative of φ²/2
@. u = I₁ - unew * Dx * ϵ/2 * Π⅔
end
# GPU optimization techniques used above:
# - `@.` macro: fuses multiple element-wise operations into a single kernel,
#   e.g. `@. unew = Γ * h` — no intermediate array allocation.
# - In-place FFTs: `pinv * unew` modifies `unew` directly, no copies.
# - Work-array reuse: allocations are minimized by recycling buffers.
# - Kernel fusion: combined operations reduce memory-bandwidth requirements.
"""
    loop_over_time(h, u, N, Nt, dt, ϵ, k, kmax)

Main GPU simulation loop implementing RK4 time integration of the Matsuno model.

This function:
1. Allocates all stage/work arrays on the GPU
2. Pre-plans in-place FFTs once, before the loop
3. Runs the full RK4 time stepping on the GPU
4. Transfers only one solution snapshot per step back to the CPU

# Arguments
- `h`: initial η (a `CuArray`; transformed to Fourier space in place)
- `u`: initial φ (a `CuArray`; transformed to Fourier space in place)
- `N`: number of grid points
- `Nt`: number of time steps
- `dt`: time step size (scalar)
- `ϵ`: nonlinearity parameter (scalar)
- `k`: wavenumber array (CPU array; operators built from it are moved to GPU)
- `kmax`: maximum wavenumber, used for the 2/3-rule dealiasing cutoff

# Returns
- `data`: N×2×Nt CPU array with the spectra [η̂, φ̂] at every time step
"""
function loop_over_time(h, u, N, Nt, dt, ϵ, k, kmax)
# === GPU memory allocation ===
# Uhat doubles as RK4 stage state AND stage derivative: matsuno! overwrites
# it in place, so after each call Uhat holds kᵢ.
Uhat = CuArray{ComplexF64,2}(undef, (N, 2)) # RK4 stages
dU = CuArray{ComplexF64,2}(undef, (N, 2)) # Accumulated weighted derivatives
# Solution history lives on the CPU; one snapshot is transferred per step
# rather than keeping the entire history in GPU memory.
data = zeros(ComplexF64, (N, 2, Nt))
# === Transfer spectral operators to the GPU ===
Γ = CuArray(abs.(k)) # |k| dispersion
Dx = CuArray(1im * k) # ∂_x operator
H = CuArray(-1im * sign.(k)) # Hilbert transform symbol
Π⅔ = CuArray(abs.(k) .< kmax * 2/3) # 2/3-rule dealiasing mask (Bool)
# === Work arrays for matsuno! (allocated once, reused every stage) ===
# NOTE(review): I₀ is allocated and passed but never used by matsuno!.
hnew = CuArray{ComplexF64}(undef, N)
unew = CuArray{ComplexF64}(undef, N)
I₀ = CuArray{ComplexF64}(undef, N)
I₁ = CuArray{ComplexF64}(undef, N)
I₂ = CuArray{ComplexF64}(undef, N)
I₃ = CuArray{ComplexF64}(undef, N)
# === Pre-plan in-place FFTs ===
# Planning once before the loop is critical for performance; the plans are
# applied with `*` inside matsuno! and transform their operand in place.
p = plan_fft!(h) # Forward in-place FFT plan
pinv = plan_ifft!(h) # Inverse in-place FFT plan
# === Initialize solution in Fourier space ===
p * h # η → η̂
p * u # φ → φ̂
# Dealias the initial data so the first nonlinear products are clean
h .= Π⅔ .* h
u .= Π⅔ .* u
# Pack [η̂ φ̂] as the two columns of one GPU state array
U = CuArray(hcat(h, u))
# === Main time-stepping loop (classic RK4) ===
for j in 1:Nt
# Stage 1: Uhat ← U, then matsuno! turns Uhat into k₁
Uhat .= U # Copy current state
matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)
# Stage 2: save k₁, evaluate at U + dt/2·k₁
dU .= Uhat # dU = k₁
Uhat .= U .+ dt/2 .* Uhat # Y = U + dt/2·k₁
matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)
# Stage 3: accumulate 2·k₂, evaluate at U + dt/2·k₂
dU .+= 2 .* Uhat # dU = k₁ + 2k₂
Uhat .= U .+ dt/2 .* Uhat # Y = U + dt/2·k₂
matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)
# Stage 4: accumulate 2·k₃, evaluate at U + dt·k₃
dU .+= 2 .* Uhat # dU = k₁ + 2k₂ + 2k₃
Uhat .= U .+ dt .* Uhat # Y = U + dt·k₃
matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)
# Final RK4 update: U ← U + dt/6·(k₁ + 2k₂ + 2k₃ + k₄)
dU .+= Uhat # dU = k₁ + 2k₂ + 2k₃ + k₄
U .+= dt/6 .* dU # U^(n+1) = U + dt/6·dU
# === Transfer this step's snapshot to the CPU ===
# NOTE(review): collect(U) allocates a fresh CPU array every iteration;
# a direct device→host copyto! into the slice would avoid that, but only
# if the copy path supports SubArray destinations — verify before changing.
data[:, :, j] .= collect(U)
end
data
end
# Key GPU optimization strategies:
# - Allocate once, reuse: all arrays are allocated before the loop and reused
#   in every iteration.
# - FFT planning: plans are created once and applied with `*`, e.g. `p * h`
#   applies the cached forward plan to h in place.
# - Vectorized operations: the `@.` macro ensures element-wise operations fuse.
# - Minimized transfers: only one solution snapshot moves to the CPU per step.
# - GPU kernels: matsuno! runs entirely on the GPU (no CPU↔GPU communication).
"""
    main_gpu(N::Int64; animation=true)

Run the complete deep-water wave simulation on the GPU.

# Arguments
- `N`: number of grid points (should be a power of 2 for FFT efficiency)

# Keywords
- `animation`: whether to create animated output via `create_animation`

# Returns
- `data`: the N×2×Nt CPU array of spectra produced by `loop_over_time`, so
  results remain available even when `animation=false` (previously the
  computed solution was silently discarded in that case).
"""
function main_gpu(N::Int64; animation=true)
# Set up parameters (same as the CPU version)
param = (ϵ=1/2,
N=N,
L=10.,
T=5.,
dt=0.001)
# Initialize solver components (CPU-side structures)
mesh = Mesh(param)
times = Times(param.dt, param.T)
init = BellCurve(param, 2.5)
model = Matsuno(param)
# Transfer initial conditions to the GPU
h = CuArray(init.h)
u = CuArray(init.u)
# Run the GPU simulation.
# NOTE: dt and ϵ are plain scalars — wrapping them in cu() was a no-op
# (cu adapts arrays, not numbers), so they are passed directly.
# All mesh-derived operators are moved to the GPU inside loop_over_time.
data = loop_over_time(h, u, mesh.N, times.Nt,
param.dt, param.ϵ,
mesh.k, mesh.kmax)
# Create visualization if requested
if animation
create_animation(mesh, times, model, data)
end
# Return the solution history so callers can post-process it
return data
end
# Data flow: