# Partially dotted: only `.^` is broadcast, so each `*` and `+` is a separate
# whole-array operation.
function f(x)
    return 3x.^2 + 4x + 7x.^3
end

# Fully dotted: `@.` turns the expression into
# 3 .* x.^2 .+ 4 .* x .+ 7 .* x.^3, which runs as one fused loop.
function fdot(x)
    return @. 3x^2 + 4x + 7x^3
end
Vectorized operations
Both `f` and `fdot` compute the same thing.
# Benchmark input: one million uniform random Float64 values.
x = rand(10^6);
f(x) # warmup: the first call compiles `f`, so it is kept out of the timing below
@time f(x);
0.016093 seconds (12 allocations: 45.777 MiB, 39.86% gc time)
# Time the fully dotted (`@.`) version on the same input.
fdot(x) # warmup: run once first so the timed call below excludes compilation
@time fdot(x);
0.000559 seconds (2 allocations: 7.629 MiB)
# Time `f` applied to `x` with dot-call (broadcast) syntax.
f.(x) # warmup: run once first so the timed call below excludes compilation
@time f.(x);
0.001560 seconds (4 allocations: 7.629 MiB)
`fdot(x)` is faster and allocates less memory, because each `*` and `+` operation in `f(x)` allocates a new temporary array and executes in a separate loop.
Consider using views for slices
# Compare an in-place fused update written with plain slices against the same
# update written with `@views`. Everything lives in a `let` block so the large
# arrays stay local; the block evaluates to `nothing`.
let
    N = 50_000_000
    a = 1.2
    x = rand(Float64, N)
    y = rand(Float64, N)
    nn = 100
    # Skip `nn` elements at each end so the update touches an interior slice.
    n_start = 1 + nn
    n_end = N - nn
    # timing: plain slices on the right-hand side are copied before the loop runs
    @time @. y[n_start:n_end] += a * x[n_start:n_end];
    # timing: `@views` makes the same slices non-copying views
    @time @. @views y[n_start:n_end] += a * x[n_start:n_end];
    nothing
end
0.159749 seconds (4 allocations: 762.936 MiB, 2.57% gc time)
0.044672 seconds
Copy irregularly-accessed data into a contiguous array before operating on it
using Random
# Source data: a long vector and a wide matrix with one column per vector entry.
x = randn(1_000_000);
# 800_000 distinct indices in random order, i.e. an irregular access pattern.
inds = shuffle(1:1_000_000)[1:800_000];
A = randn(50, 1_000_000);
# Contiguous buffers sized to hold the selected entries of `x` and columns of `A`.
xtmp = zeros(800_000);
Atmp = zeros(50, 800_000);
# Multiply directly on the non-contiguous views; run twice so that the second
# timing excludes compilation.
@time sum(view(A, :, inds) * view(x, inds))
@time sum(view(A, :, inds) * view(x, inds))
0.280936 seconds (214.16 k allocations: 13.800 MiB, 42.26% compilation time)
0.162002 seconds (5 allocations: 624 bytes)
6642.838110703082
Irregular access patterns and non-contiguous views can drastically slow down computations on arrays because of non-sequential memory access. Copying the views into plain arrays speeds up the multiplication even with the cost of the copying operation.
# Copy the irregularly indexed data into the contiguous buffers, then multiply
# the plain arrays. The copies are inside the timed region, and the recorded
# output for this run includes compilation time.
@time begin
# selected entries of x -> xtmp
copyto!(xtmp, view(x, inds))
# selected columns of A -> Atmp
copyto!(Atmp, view(A, :, inds))
sum(Atmp * xtmp)
end
0.302451 seconds (209.90 k allocations: 14.376 MiB, 42.12% compilation time)
6642.838110702842
# Repeat of the copy-then-multiply measurement. The copies are still inside
# the timed region; the recorded output for this run reports no compilation time.
@time begin
# selected entries of x -> xtmp
copyto!(xtmp, view(x, inds))
# selected columns of A -> Atmp
copyto!(Atmp, view(A, :, inds))
sum(Atmp * xtmp)
end
0.174616 seconds (5 allocations: 624 bytes)
6642.838110702842