Memory allocation

Functions with preallocated memory run faster

# Build the first `n` terms of the sequence 1, 1, 2, 3, 5, ... (each term is the sum of
# the two before it) in a vector that is allocated once, at its final length.
#
# `n` must be at least 2; the `@assert` fails otherwise.
# Returns a `Vector{Int64}` of length `n`.
# Note: the elements are `Int64`, so terms past index 92 exceed `typemax(Int64)` and
# wrap around silently.
function build_preallocate(n::Int)
    @assert n >= 2
    v = zeros(Int64,n) # the only allocation: the whole result, zero-filled
    v[1] = 1
    v[2] = 1
    for i = 3:n
        v[i] = v[i-1] + v[i-2] # written in place into the existing storage
    end
    return v
end
build_preallocate (generic function with 1 method)
# Build the first `n` terms of the sequence 1, 1, 2, 3, 5, ... (each term is the sum of
# the two before it), starting from an empty vector and growing it one element at a
# time with `push!`.
#
# `n` must be at least 2; the `@assert` fails otherwise.
# Returns a `Vector{Int64}` of length `n`.
# Note: the elements are `Int64`, so terms past index 92 exceed `typemax(Int64)` and
# wrap around silently.
function build_no_allocation(n::Int)
    @assert n >= 2
    v = Vector{Int64}() # starts empty; no size is reserved up front
    push!(v,1)
    push!(v,1)
    for i = 3:n
        push!(v,v[i-1]+v[i-2]) # appending may force the vector's storage to grow
    end
    return v
end
build_no_allocation (generic function with 1 method)

Whenever possible, preallocate memory

# Sanity check: both implementations must produce the same sequence.
isequal(build_preallocate(10),build_no_allocation(10))
true
using BenchmarkTools

# `n` is a non-constant global. Interpolating it with `$` makes `@btime` treat it as a
# local value, so the measurement covers only the function call and not the lookup of
# an untyped global (same convention as the `$b, $h` benchmarks further down).
n = 100

@btime build_no_allocation($n);

@btime build_preallocate($n);
  667.213 ns (4 allocations: 1.92 KiB)
  164.842 ns (1 allocation: 896 bytes)

julia --check-bounds=no -O3 --track-allocation=user build_no_allocation.jl

cat build_no_allocation.jl.*.mem

   - function build_no_allocation(n::Int)
   0     @assert n >= 2
  64     v = Vector{Int64}()
  80     push!(v,1)
   0     push!(v,1)
   0     for i = 3:n
1824         push!(v,v[i-1]+v[i-2])
   0     end
   0     return v
   - end

julia --check-bounds=no -O3 --track-allocation=user build_preallocate.jl

cat build_preallocate.jl.*.mem

  - function build_preallocate(n::Int)
  0     @assert n >= 2
896     v = zeros(Int64,n)
  0     v[1] = 1
  0     v[2] = 1
  0     for i = 3:n
  0         v[i] = v[i-1] + v[i-2]
  0     end
  0     return v
  - end

Pre-allocating outputs

Whenever you can reuse memory, reuse it.

Suppose you have two vectors `b` and `h`, where `b[i]` is the base and `h[i]` is the height of right triangle `i`. The goal is to compute the hypotenuse of every triangle.

using BenchmarkTools

# Test data: 1000 random bases and 1000 random heights, each `rand` value scaled by 10.
b = rand(1000)*10
h = rand(1000)*10
"""
    find_hypotenuse(b::Vector{T}, h::Vector{T}) where T <: Real

Return a new vector whose `i`-th entry is `sqrt(b[i]^2 + h[i]^2)`.

This is the straightforward version: each step (`b .^ 2`, `h .^ 2`, their sum, and the
final `sqrt.`) produces its own array. The un-dotted `+` throws a `DimensionMismatch`
when `b` and `h` have different lengths.
"""
function find_hypotenuse(b::Vector{T}, h::Vector{T}) where T <: Real
    base_sq = b .^ 2
    height_sq = h .^ 2
    # Plain `+` (not `.+`) on purpose: the sum is materialised as its own array.
    return sqrt.(base_sq + height_sq)
end
find_hypotenuse (generic function with 1 method)
# `$b` and `$h` interpolate the global arrays into the benchmark expression.
@btime find_hypotenuse($b, $h);
  4.026 μs (4 allocations: 31.75 KiB)

"""
    find_hypotenuse_optimized(b::Vector{T}, h::Vector{T}) where T <: Real

Return a new vector whose `i`-th entry is `sqrt(b[i]^2 + h[i]^2)`, allocating only the
output array.

The output element type is `float(T)`, so integer inputs yield floating-point results
(matching `sqrt.(b.^2 + h.^2)`) instead of failing with an `InexactError` when a square
root is not an integer.

# Throws
- `DimensionMismatch` if `b` and `h` do not have the same length.
"""
function find_hypotenuse_optimized(b::Vector{T},h::Vector{T}) where T <: Real
    length(b) == length(h) || throw(DimensionMismatch(
        "b has length $(length(b)) but h has length $(length(h))"))
    # Allocate the result once; `float(T)` can hold a non-integer square root.
    hyp = similar(b, float(T))
    for i in eachindex(hyp, b, h)
        # The squares and their sum are scalars, so they need no array storage.
        hyp[i] = sqrt(b[i]^2 + h[i]^2)
    end
    return hyp
end
find_hypotenuse_optimized (generic function with 1 method)
# Same inputs as the baseline benchmark, again interpolated with `$`.
@btime find_hypotenuse_optimized($b, $h);
  2.791 μs (1 allocation: 7.94 KiB)

using FFTW, LinearAlgebra

# Grid bounds and number of points in each direction.
xmin, xmax, nx = 0, 4π, 1024
ymin, ymax, ny = 0, 4π, 1024

# nx (resp. ny) equally spaced points; the `[1:end-1]` slice drops the right endpoint.
x = LinRange(xmin, xmax, nx+1)[1:end-1]
y = LinRange(ymin, ymax, ny+1)[1:end-1]
# `fftfreq` output scaled by 2π / (ymax - ymin) — presumably the angular wavenumbers
# in y; TODO confirm against the FFTW documentation.
ky  = 2π ./ (ymax-ymin) .* fftfreq(ny, ny)
# ny × nx matrix with entries exp(im * ky[j] * x[i]) (column `ky` times row `x'`).
exky = exp.( 1im .* ky .* x')

# `f` holds the data (nx × ny); `fᵗ` and `f̂ᵗ` are work buffers with the dimensions reversed.
f  = zeros(ComplexF64, (nx,ny))
fᵗ = zeros(ComplexF64, reverse(size(f)))
f̂ᵗ = zeros(ComplexF64, reverse(size(f)))
# f[i, j] = sin(x[i]) * cos(y[j]), written in place.
f .= sin.(x) .* cos.(y')

# FFT plan along dimension 1 for arrays shaped like `fᵗ`.
# NOTE(review): FFTW planning with flags other than ESTIMATE may overwrite the array
# passed to the planner; `fᵗ` contains only zeros at this point — confirm this is intended.
plan = plan_fft(fᵗ, 1, flags=FFTW.PATIENT)
FFTW forward plan for 1024×1024 array of ComplexF64
(dft-ct-dit/64
  (dftw-direct-64/504-x1024 "t2fv_64_avx2")
  (dft-vrank>=1-x1024/1
    (dft-direct-16-x64 "n2fv_16_avx2")))

"""
    df_dy_optimized!(f, fᵗ, f̂ᵗ, plan, exky)

Transform `f` in place using only the caller-supplied buffers.

Steps, in order:
1. transpose `f` into `fᵗ`;
2. apply `plan` to `fᵗ` with `mul!`, storing the result in `f̂ᵗ`;
3. multiply `f̂ᵗ` element-wise by `exky`;
4. apply the inverse of `plan` with `ldiv!`, storing the result in `fᵗ`;
5. transpose `fᵗ` back into `f`.

All of `f`, `fᵗ` and `f̂ᵗ` are overwritten. Returns the value of the final
`transpose!` call, i.e. `f`.

NOTE(review): despite the name, the body multiplies by `exky` as given; what that
represents depends on how the caller builds `exky` — confirm.
"""
function df_dy_optimized!(f, fᵗ, f̂ᵗ, plan, exky)
    transpose!(fᵗ, f)
    mul!(f̂ᵗ, plan, fᵗ)
    f̂ᵗ .*= exky
    ldiv!(fᵗ, plan, f̂ᵗ)
    return transpose!(f, fᵗ)
end

# All arguments are interpolated with `$`; note that `f` is overwritten on every call.
@btime df_dy_optimized!($f, $fᵗ, $f̂ᵗ, $plan, $exky );
  13.664 ms (2 allocations: 112 bytes)