"""
    build_preallocate(n::Int)

Return the first `n` Fibonacci numbers (starting `1, 1`) as a `Vector{Int64}`.

The output vector is allocated once at its final length and then filled in
place, so the only allocation is the result itself.

# Arguments
- `n::Int`: number of terms to generate; must be at least 2.

# Throws
- `AssertionError` if `n < 2`.
"""
function build_preallocate(n::Int)
    # Both seed values are written unconditionally below, hence n >= 2.
    @assert n >= 2
    v = zeros(Int64, n)
    v[1] = 1
    v[2] = 1
    for i in 3:n
        v[i] = v[i-1] + v[i-2]
    end
    return v
end
build_preallocate (generic function with 1 method)
667.213 ns (4 allocations: 1.92 KiB)
164.842 ns (1 allocation: 896 bytes)
julia --check-bounds=no -O3 --track-allocation=user build_no_allocation.jl
cat build_no_allocation.jl.*.mem
- function build_no_allocation(n::Int)
0 @assert n >= 2
64 v = Vector{Int64}()
80 push!(v,1)
0 push!(v,1)
0 for i = 3:n
1824 push!(v,v[i-1]+v[i-2])
0 end
0 return v
- end
julia --check-bounds=no -O3 --track-allocation=user build_preallocate.jl
cat build_preallocate.jl.*.mem
- function build_preallocate(n::Int)
0 @assert n >= 2
896 v = zeros(Int64,n)
0 v[1] = 1
0 v[2] = 1
0 for i = 3:n
0 v[i] = v[i-1] + v[i-2]
0 end
0 return v
- end
Whenever you can reuse memory, reuse it.
You have a vector `b` and a vector `h`, where `b[i]` is the base length of triangle `i` and `h[i]` is its height. The experiment is to compute the hypotenuse of every triangle.
using BenchmarkTools
# Sample data for the experiment: 1000 base lengths and 1000 heights,
# each drawn uniformly at random and scaled by 10.
b = rand(1000)*10
h = rand(1000)*10
"""
    find_hypotenuse(b::Vector{T}, h::Vector{T}) where {T<:Real}

Return a vector whose `i`-th entry is `sqrt(b[i]^2 + h[i]^2)`.

This is the straightforward baseline: each squared vector, their sum and the
final square root are each materialized as a separate array.
"""
function find_hypotenuse(b::Vector{T}, h::Vector{T}) where {T<:Real}
    base_sq = b .^ 2
    height_sq = h .^ 2
    # Plain (non-broadcast) `+` on arrays: builds one more temporary.
    sum_sq = base_sq + height_sq
    return sqrt.(sum_sq)
end
find_hypotenuse (generic function with 1 method)
"""
    find_hypotenuse_optimized(b::Vector{T}, h::Vector{T}) where {T<:Real}

Return a vector whose `i`-th entry is `sqrt(b[i]^2 + h[i]^2)`, using a single
output buffer that is reused for every intermediate value.

# Arguments
- `b`: base lengths.
- `h`: heights; must have the same length as `b`.

# Throws
- `DimensionMismatch` if `b` and `h` do not share the same indices.
"""
function find_hypotenuse_optimized(b::Vector{T}, h::Vector{T}) where {T<:Real}
    # The buffer must be able to hold the result of `sqrt`. With `similar(b)`
    # alone, integer inputs would raise `InexactError` on non-integral roots.
    # For floating-point `T` this is still just `T`.
    R = promote_type(T, typeof(sqrt(zero(T))))
    accum_vec = similar(b, R)
    # `eachindex` over all three arrays verifies that they share the same
    # indices, so `h[i]` can never be read out of bounds.
    for i in eachindex(accum_vec, b, h)
        accum_vec[i] = b[i]^2
        accum_vec[i] += h[i]^2 # here, we used the same space in memory to hold the sum
        accum_vec[i] = sqrt(accum_vec[i]) # same thing here, to hold the sqrt
    end
    return accum_vec
end
find_hypotenuse_optimized (generic function with 1 method)
using FFTW, LinearAlgebra
# Grid extents and resolution in each direction.
xmin, xmax, nx = 0, 4π, 1024
ymin, ymax, ny = 0, 4π, 1024

# Grid points: build nx+1 (resp. ny+1) equally spaced points and drop the last
# one, so the right endpoint is excluded.
x = LinRange(xmin, xmax, nx + 1)[1:nx]
y = LinRange(ymin, ymax, ny + 1)[1:ny]

# Frequencies from `fftfreq`, scaled by 2π / (ymax - ymin), and the complex
# exponential of their outer product with `x` (an ny × nx array).
ky = 2π ./ (ymax - ymin) .* fftfreq(ny, ny)
exky = exp.(1im .* ky .* x')

# Work arrays: `f` on the (nx, ny) grid, plus two buffers with the dimensions
# swapped.
f = zeros(ComplexF64, (nx, ny))
fᵗ = zeros(ComplexF64, (ny, nx))
f̂ᵗ = zeros(ComplexF64, (ny, nx))

# Fill f(x, y) = sin(x) * cos(y) in place.
f .= sin.(x) .* cos.(y')

# FFT plan along the first dimension of the swapped-dimension buffer, built
# with the PATIENT planner flag.
plan = plan_fft(fᵗ, 1; flags=FFTW.PATIENT)
FFTW forward plan for 1024×1024 array of ComplexF64
(dft-ct-dit/64
(dftw-direct-64/504-x1024 "t2fv_64_avx2")
(dft-vrank>=1-x1024/1
(dft-direct-16-x64 "n2fv_16_avx2")))