Explicit transpose
- Multidimensional arrays in Julia are stored in column-major order.
- FFTs along y are slower than FFTs along x
- We can speed-up the computation by allocating the transposed
f
and transpose f for each advection along y.
function with_fft_transposed(tf, nt, mesh::Mesh)
dt = tf/nt
f = zeros(Complex{Float64},(mesh.nx,mesh.ny))
f̂ = similar(f)
fᵗ = zeros(Complex{Float64},(mesh.ny,mesh.nx))
f̂ᵗ = similar(fᵗ)
exky = exp.( 1im*tan(dt/2) .* mesh.x' .* mesh.ky )
ekxy = exp.(-1im*sin(dt) .* mesh.y' .* mesh.kx )
FFTW.set_num_threads(4)
Px = plan_fft(f, 1, flags=FFTW.PATIENT)
Py = plan_fft(fᵗ, 1, flags=FFTW.PATIENT)
f .= exact(0.0, mesh)
for n = 1:nt
transpose!(fᵗ,f)
mul!(f̂ᵗ, Py, fᵗ)
f̂ᵗ .= f̂ᵗ .* exky
ldiv!(fᵗ, Py, f̂ᵗ)
transpose!(f,fᵗ)
mul!(f̂, Px, f)
f̂ .= f̂ .* ekxy
ldiv!(f, Px, f̂)
transpose!(fᵗ,f)
mul!(f̂ᵗ, Py, fᵗ)
f̂ᵗ .= f̂ᵗ .* exky
ldiv!(fᵗ, Py, f̂ᵗ)
transpose!(f,fᵗ)
end
real(f)
end
f = with_fft_transposed(0.1, 1, mesh) # trigger build
@time norm(with_fft_transposed(tf, nt, mesh) .- exact(tf, mesh))
vectorized_bench = @benchmark vectorized(tf, nt, mesh)
inplace_bench = @benchmark inplace(tf, nt, mesh)
with_fft_plans_inplace_bench = @benchmark with_fft_plans_inplace(tf, nt, mesh)
with_fft_transposed_bench = @benchmark with_fft_transposed(tf, nt, mesh)
d = Dict()
d["inplace"] = minimum(inplace_bench.times) / 1e6
d["vectorized"] = minimum(vectorized_bench.times) / 1e6
d["with_fft_plans_inplace"] = minimum(with_fft_plans_inplace_bench.times) / 1e6
d["with_fft_transposed"] = minimum(with_fft_transposed_bench.times) / 1e6;
for (key, value) in sort(collect(d), by=last)
println(rpad(key, 25, "."), lpad(round(value, digits=1), 6, "."))
end