5.1 Import CUDA Libraries

using CUDA
using CUDA.CUFFT
CUDA.allowscalar(false)

5.1.1 Performance Benchmarking

using BenchmarkTools

Used to measure execution time and compare GPU vs CPU performance.


5.1.2 GPU-Optimized Matsuno Kernel

"""
    matsuno!(h, u, p, pinv, Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

GPU-optimized, in-place evaluation of the Matsuno model right-hand side.

On entry `h` and `u` hold the spectra η̂ and φ̂; on exit they hold the time
derivatives η̂_t and φ̂_t. Key differences from the CPU version:

1. All arrays are `CuArray` (live on GPU)
2. Uses the `@.` macro so element-wise operations fuse into single kernels
3. In-place FFT plans (`p * x`, `pinv * x`) modify arrays directly
4. No explicit threading (the GPU provides implicit parallelism)
5. No GPU↔CPU transfers occur inside this function

# Arguments
- `h`: η̂ (elevation spectrum, complex; overwritten with η̂_t)
- `u`: φ̂ (velocity spectrum, complex; overwritten with φ̂_t)
- `p`: in-place forward FFT plan
- `pinv`: in-place inverse FFT plan
- `Γ`: |k| (dispersion array)
- `ϵ`: nonlinearity parameter (scalar)
- `Dx`: ∂_x operator (i·k)
- `H`: Hilbert transform symbol (-i·sign(k))
- `Π⅔`: 2/3-rule dealiasing filter
- `unew, hnew, I₀, I₁, I₂, I₃`: preallocated complex work arrays.
  NOTE(review): `I₀` is accepted but never used below — presumably kept for
  signature compatibility with a CPU version; confirm before removing.
"""
function matsuno!(h, u, p, pinv, Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

    # === First nonlinear term, built from η̂ alone ===
    # unew = Γ·η̂ = |k|·η̂ (spectral)
    @. unew = Γ * h

    # unew → physical space: (|D|η)(x)
    pinv * unew

    # hnew = Dx·η̂ = ∂x η̂ (spectral); needed again at the end of this section
    @. hnew = Dx * h

    # Work on a copy so hnew keeps ∂x η̂ intact
    @. I₁ = hnew

    # I₁ → physical space: η_x(x)
    pinv * I₁

    # Pointwise product in physical space: (|D|η)·η_x
    @. I₁ = unew * I₁

    # I₁ → spectral space
    p * I₁

    # I₁ = ϵ·Π⅔·F[(|D|η)·η_x] − ∂x η̂   (only the nonlinear part is dealiased)
    @. I₁ = I₁ * ϵ * Π⅔ - hnew

    # === Second nonlinear term: products involving φ ===
    # hnew ← η in physical space (copy the spectrum, then transform in place)
    @. hnew = h
    pinv * hnew  # η(x)

    @. unew = u
    pinv * unew  # φ(x)

    # Pointwise product: η·φ in physical space
    @. I₃ = hnew * unew

    # I₃ → spectral space: F[η·φ]
    p * I₃

    # === Linear + Hilbert-transform terms ===
    # h ← H·φ̂ (linear part of η̂_t). The original η̂ is overwritten here, but
    # its physical-space copy survives in hnew for the product below.
    @. h = H * u

    # I₂ = Γ·φ̂ = |k|·φ̂ (spectral)
    @. I₂ = Γ * u

    # I₂ → physical space: (|D|φ)(x)
    pinv * I₂

    # hnew = (|D|φ)·η in physical space
    @. hnew = I₂ * hnew

    # hnew → spectral space: F[(|D|φ)·η]
    p * hnew

    # === Assemble η̂_t ===
    # η̂_t = H·φ̂ − ϵ·Π⅔·( Dx·F[η·φ] + H·F[(|D|φ)·η] )
    @. h = h - (I₃ * Dx + hnew * H) * ϵ * Π⅔

    # unew still holds φ(x); square it for the kinetic-energy term
    @. unew = unew^2

    # unew → spectral space: F[φ²]
    p * unew

    # === Assemble φ̂_t ===
    # φ̂_t = I₁ − (ϵ/2)·Π⅔·Dx·F[φ²], with I₁ computed in the first section
    @. u = I₁ - unew * Dx * ϵ/2 * Π⅔

end

GPU Optimization Techniques Used:

  1. @. Macro: Fuses multiple element-wise operations into single kernels

    @. unew = Γ * h     # No intermediate array allocation
  2. In-place operations: Arrays modified directly without copies

    pinv * unew         # Modifies unew in place
  3. Work array reuse: Minimize allocations by reusing arrays

  4. Kernel fusion: Combined operations reduce memory bandwidth requirements

5.1.3 Time Stepping Loop on GPU

"""
    loop_over_time(h, u, N, Nt, dt, ϵ, k, kmax)

Run the full Matsuno simulation on the GPU with classical RK4 time integration.

This function:
1. Moves the initial condition and all spectral operators to the GPU,
2. pre-plans in-place FFTs once,
3. advances `Nt` RK4 steps entirely on the device,
4. transfers one (N, 2) snapshot back to the host per step.

# Arguments
- `h`: initial η (CPU or GPU array; converted to `CuArray{ComplexF64}` here)
- `u`: initial φ (CPU or GPU array; converted to `CuArray{ComplexF64}` here)
- `N`: number of grid points
- `Nt`: number of time steps
- `dt`: time step size (plain scalar)
- `ϵ`: nonlinearity parameter (plain scalar)
- `k`: wavenumber array (host)
- `kmax`: maximum wavenumber, used for the 2/3-rule dealiasing cutoff

# Returns
- `data`: (N, 2, Nt) CPU array holding the spectra [η̂ φ̂] at every step
"""
function loop_over_time(h, u, N, Nt, dt, ϵ, k, kmax)

    # Normalize the state onto the GPU as complex data. This makes the claim
    # "h/u can be CPU or GPU arrays" actually hold: without it, a CPU input
    # would produce CPU FFT plans below that are later applied to CuArrays
    # inside matsuno!. It also accepts real-valued initial conditions, which
    # plan_fft! (complex in-place) would otherwise reject.
    h = CuArray{ComplexF64}(h)
    u = CuArray{ComplexF64}(u)

    # === GPU memory allocation (once, reused every iteration) ===
    Uhat = CuArray{ComplexF64,2}(undef, (N, 2))  # RK4 stage state / stage slope
    dU = CuArray{ComplexF64,2}(undef, (N, 2))    # accumulated weighted slopes

    # Solution history lives on the CPU; only snapshots cross the bus.
    data = zeros(ComplexF64, (N, 2, Nt))
    # Preallocated host staging buffer: avoids allocating a fresh CPU array
    # (as `collect` would) on each of the Nt device→host transfers.
    snapshot = Array{ComplexF64}(undef, N, 2)

    # === Spectral operators, transferred to the GPU once ===
    Γ = CuArray(abs.(k))                         # |k| (dispersion)
    Dx = CuArray(1im * k)                        # ∂_x operator
    H = CuArray(-1im * sign.(k))                 # Hilbert transform symbol
    Π⅔ = CuArray(abs.(k) .< kmax * 2/3)          # 2/3-rule dealiasing filter

    # === Work arrays consumed by matsuno! ===
    hnew = CuArray{ComplexF64}(undef, N)
    unew = CuArray{ComplexF64}(undef, N)
    I₀ = CuArray{ComplexF64}(undef, N)
    I₁ = CuArray{ComplexF64}(undef, N)
    I₂ = CuArray{ComplexF64}(undef, N)
    I₃ = CuArray{ComplexF64}(undef, N)

    # === Pre-plan in-place FFTs ===
    # Planning once and reusing via `*` is critical for performance.
    p = plan_fft!(h)          # forward FFT plan
    pinv = plan_ifft!(h)      # inverse FFT plan

    # === Initialize: move to Fourier space and dealias ===
    p * h   # η → η̂ (in place)
    p * u   # φ → φ̂ (in place)
    h .= Π⅔ .* h
    u .= Π⅔ .* u

    # Two-component state U = [η̂ φ̂]
    U = CuArray(hcat(h, u))

    # === Main time-stepping loop (classical RK4) ===
    for j in 1:Nt

        # Stage 1: k₁ = f(U); matsuno! overwrites its input with the slope
        Uhat .= U
        matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
                 Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

        # Stage 2: k₂ = f(U + dt/2·k₁)
        dU .= Uhat                             # dU = k₁
        Uhat .= U .+ dt/2 .* Uhat
        matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
                 Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

        # Stage 3: k₃ = f(U + dt/2·k₂)
        dU .+= 2 .* Uhat                       # dU = k₁ + 2k₂
        Uhat .= U .+ dt/2 .* Uhat
        matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
                 Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

        # Stage 4: k₄ = f(U + dt·k₃)
        dU .+= 2 .* Uhat                       # dU = k₁ + 2k₂ + 2k₃
        Uhat .= U .+ dt .* Uhat
        matsuno!(view(Uhat, :, 1), view(Uhat, :, 2), p, pinv,
                 Γ, ϵ, Dx, H, Π⅔, unew, hnew, I₀, I₁, I₂, I₃)

        # Final combination: U ← U + dt/6·(k₁ + 2k₂ + 2k₃ + k₄)
        dU .+= Uhat
        U .+= dt/6 .* dU

        # One contiguous device→host copy into the preallocated buffer,
        # then a plain host-side copy into the history array.
        copyto!(snapshot, U)
        data[:, :, j] .= snapshot
    end

    return data
end

Key GPU Optimization Strategies:

  1. Allocate once, reuse: All arrays allocated before loop, reused in each iteration

  2. FFT planning: Plans created once and applied with * operator

    p * h    # Apply cached FFT plan to h
  3. Vectorized operations: @. macro ensures element-wise operations fuse

  4. Minimize transfers: Only snapshots transferred to CPU at each time step

    • Not the entire work array history
    • Not intermediate stages
  5. GPU kernels: The Matsuno function runs entirely on GPU (no CPU-GPU communication)

5.1.4 GPU Main Function

"""
    main_gpu(N::Int64; animation=true)

Run the complete deep-water wave simulation on the GPU.

# Arguments
- `N`: number of grid points (should be a power of 2 for FFT efficiency)
- `animation`: whether to create an animated output via `create_animation`

# Returns
- `data`: the (N, 2, Nt) CPU solution history from `loop_over_time`, so the
  results remain usable even when `animation=false`
"""
function main_gpu(N::Int64; animation=true)

    # Simulation parameters (same as the CPU version); consumed by
    # Mesh / Times / BellCurve / Matsuno below.
    param = (ϵ=1/2,
             N=N,
             L=10.,
             T=5.,
             dt=0.001)

    # Solver components are built on the CPU
    mesh = Mesh(param)
    times = Times(param.dt, param.T)
    init = BellCurve(param, 2.5)
    model = Matsuno(param)

    # Transfer initial conditions to the GPU
    h = CuArray(init.h)
    u = CuArray(init.u)

    # Run the GPU simulation. dt and ϵ are plain scalars and are passed as
    # such: wrapping scalars in `cu` is a no-op at best (`cu` adapts arrays),
    # and scalars broadcast against GPU arrays directly.
    # All remaining mesh data is transferred to the GPU inside loop_over_time.
    data = loop_over_time(h, u, mesh.N, times.Nt,
                          param.dt, param.ϵ,
                          mesh.k, mesh.kmax)

    # Optional visualization (performed on the CPU data)
    if animation
        create_animation(mesh, times, model, data)
    end

    return data
end

Data Flow:

  1. CPU: Create initial conditions and mesh
  2. GPU: Transfer and run simulation
  3. CPU: Store results and create visualizations


CC BY-NC-SA 4.0 Pierre Navaro