Memory order

“N-dimensional arrays are stored in column-major layout. The elements from the first (leftmost) dimension or index are contiguous in memory.”

"""
    compute_dist!(x, dist)

Fill `dist` in place with the pairwise absolute differences of `x`, so that
`dist[i, j] == abs(x[i] - x[j])` for every pair of indices `i`, `j` of `x`.
Returns `nothing`.

In this variant the second index of `dist` varies fastest. With column-major
storage, consecutive writes therefore land in different columns and are not
contiguous in memory.
"""
function compute_dist!(x, dist)
    # In a multi-range `for`, the rightmost range is the innermost loop.
    for row in eachindex(x), col in eachindex(x)
        dist[row, col] = abs(x[row] - x[col])
    end
    return nothing
end

# Problem size: the distance matrix holds N × N entries.
N = 10_000
x = rand(N)                            # N uniformly distributed Float64 samples
dist = Matrix{Float64}(undef, N, N)    # uninitialised; every entry is overwritten below

compute_dist!(x, dist)                 # warm-up call, so compilation is not part of the timing
@time compute_dist!(x, dist)
  0.188357 seconds

"""
    compute_dist!(x, dist)

Fill `dist` in place with the pairwise absolute differences of `x`, so that
`dist[i, j] == abs(x[i] - x[j])` for every pair of indices `i`, `j` of `x`.
Returns `nothing`.

In this variant the first index of `dist` varies fastest. With column-major
storage, consecutive writes therefore walk down one column and are contiguous
in memory.
"""
function compute_dist!(x, dist)
    # In a multi-range `for`, the rightmost range is the innermost loop.
    for col in eachindex(x), row in eachindex(x)
        dist[row, col] = abs(x[row] - x[col])
    end
    return nothing
end

# Same experiment as before, rebuilt from scratch for the redefined function.
N = 10_000
x = rand(N)                            # N uniformly distributed Float64 samples
dist = Matrix{Float64}(undef, N, N)    # uninitialised; every entry is overwritten below

compute_dist!(x, dist)                 # warm-up call, so compilation is not part of the timing
@time compute_dist!(x, dist)
  0.042062 seconds

using BenchmarkTools, FFTW
# Domain bounds and resolution of the 2-D grid.
xmin, xmax = 0, 4π
ymin, ymax = 0, 4π
nx, ny = 1024, 1024

# Take n + 1 evenly spaced points over each interval and keep the first n, so
# the right endpoint is excluded (presumably because the FFT-based code below
# treats the domain as periodic — confirm).
x = LinRange(xmin, xmax, nx + 1)[1:nx]
y = LinRange(ymin, ymax, ny + 1)[1:ny]

"""
    df_dy!(f)

Transform `f` in place in Fourier space along its second dimension and return `f`.

The function takes the FFT of `f` along dimension 2, multiplies each coefficient
by `exp(im * ky * x)`, transforms back, and stores the real part in `f`.
It reads the globals `x`, `ymin`, `ymax` and `ny`.

The transforms run along dimension 2, which is not the contiguous dimension of a
column-major array.

NOTE(review): despite the name, the multiplier is `exp(im * ky * x)` rather than
`im * ky`. By the Fourier shift theorem that looks like a translation along `y`
by `x`, not a derivative — confirm the intended operation.
"""
function df_dy!(f)
    # Angular wavenumbers for ny samples over a domain of length ymax - ymin.
    wavenumber = 2π ./ (ymax - ymin) .* fftfreq(ny, ny)
    # One phase factor per (x, ky) pair: x along rows, ky along columns.
    phase = exp.(1im .* wavenumber' .* x)
    spectrum = fft(f, 2)
    f .= real(ifft(phase .* spectrum, 2))
    return f
end

# Sample sin(x) * cos(y) on the grid: x runs along dimension 1, y' along dimension 2.
f1 = sin.(x) .* cos.(y') 
# Transform f1 in place; the trailing `;` suppresses the echoed result.
df_dy!( f1 );

"""
    df_dy_transposed!(f)

Transform `f` in place in Fourier space, working on the transpose of `f`, and
return `f`.

The function takes the FFT of `transpose(f)` along dimension 1, multiplies each
coefficient by `exp(im * ky * x)`, transforms back, and writes the transposed
real part into `f`. It reads the globals `x`, `ymin`, `ymax` and `ny`.

Transposing first means the transforms run along dimension 1, the contiguous
dimension of a column-major array.

NOTE(review): despite the name, the multiplier is `exp(im * ky * x)` rather than
`im * ky`. By the Fourier shift theorem that looks like a translation along `y`
by `x`, not a derivative — confirm the intended operation.
"""
function df_dy_transposed!(f)
    # Angular wavenumbers for ny samples over a domain of length ymax - ymin.
    wavenumber = 2π ./ (ymax - ymin) .* fftfreq(ny, ny)
    # One phase factor per (ky, x) pair: ky along rows, x along columns.
    phase = exp.(1im .* wavenumber .* x')
    # `transpose` is lazy; the y direction of `f` becomes dimension 1 here.
    spectrum = fft(transpose(f), 1)
    f .= transpose(real(ifft(phase .* spectrum, 1)))
    return f
end
# Build the same initial field as f1 and transform it with the transposed variant.
f2 = sin.(x) .* cos.(y') 
df_dy_transposed!( f2 );

# Check that both variants produced identical results.
isequal(f1, f2)
true
# Benchmark both variants, each starting from a freshly built field.
# `$f` interpolates the global into the benchmark expression.
# Both functions overwrite `f`, so it is rebuilt before the second run.
f = sin.(x) .* cos.(y')
@btime df_dy!($f);
f = sin.(x) .* cos.(y')
@btime df_dy_transposed!($f);
  78.131 ms (36 allocations: 88.01 MiB)
  35.443 ms (37 allocations: 88.01 MiB)