Explicit transpose

3.6 Explicit transpose

Multidimensional arrays in Julia are stored in column-major order.
FFTs along y are slower than FFTs along x
We can speed-up the computation by allocating the transposed f and transpose f for each advection along y.

function with_fft_transposed(tf, nt, mesh::Mesh)

    dt = tf/nt

    f  = zeros(Complex{Float64},(mesh.nx,mesh.ny))
    f̂  = similar(f)
    fᵗ = zeros(Complex{Float64},(mesh.ny,mesh.nx))
    f̂ᵗ = similar(fᵗ)

    exky = exp.( 1im*tan(dt/2) .* mesh.x' .* mesh.ky )
    ekxy = exp.(-1im*sin(dt)   .* mesh.y' .* mesh.kx )
    
    FFTW.set_num_threads(4)
    Px = plan_fft(f,  1, flags=FFTW.PATIENT)    
    Py = plan_fft(fᵗ, 1, flags=FFTW.PATIENT)
    
    f .= exact(0.0, mesh)
    
    for n = 1:nt
        transpose!(fᵗ,f)
        mul!(f̂ᵗ, Py, fᵗ)
        f̂ᵗ .= f̂ᵗ .* exky
        ldiv!(fᵗ, Py, f̂ᵗ)
        transpose!(f,fᵗ)
        
        mul!(f̂, Px, f)
        f̂ .= f̂ .* ekxy 
        ldiv!(f, Px, f̂)
        
        transpose!(fᵗ,f)
        mul!(f̂ᵗ, Py, fᵗ)
        f̂ᵗ .= f̂ᵗ .* exky
        ldiv!(fᵗ, Py, f̂ᵗ)
        transpose!(f,fᵗ)
    end
    real(f)
end

f = with_fft_transposed(0.1, 1, mesh) # trigger build
@time norm(with_fft_transposed(tf, nt, mesh) .- exact(tf, mesh))

vectorized_bench = @benchmark vectorized(tf, nt, mesh)
inplace_bench = @benchmark inplace(tf, nt, mesh)
with_fft_plans_inplace_bench = @benchmark with_fft_plans_inplace(tf, nt, mesh)
with_fft_transposed_bench = @benchmark with_fft_transposed(tf, nt, mesh)

d = Dict() 
d["inplace"] = minimum(inplace_bench.times) / 1e6
d["vectorized"] = minimum(vectorized_bench.times) / 1e6
d["with_fft_plans_inplace"] = minimum(with_fft_plans_inplace_bench.times) / 1e6
d["with_fft_transposed"] = minimum(with_fft_transposed_bench.times) / 1e6;

for (key, value) in sort(collect(d), by=last)
    println(rpad(key, 25, "."), lpad(round(value, digits=1), 6, "."))
end

3.5 Inplace computation an.. ← → 3.7 Conclusion

CC BY-SA 4.0 Pierre Navaro