# Scaled vector update y ← y + a*x, executed directly at global (top-level) scope.
n = 100_000               # number of elements in each vector
a = 1.2                   # scalar multiplier
x = rand(Float64, n)
y = rand(Float64, n)
# The loop body reads the non-constant globals `a`, `x` and `y`; `@time` reports the
# elapsed time and allocations of the whole loop, including any compilation it triggers.
@time for i in eachindex(y, x)
    y[i] += a * x[i]
end
0.045648 seconds (699.26 k allocations: 12.205 MiB, 38.42% gc time, 33.61% compilation time)
Performance critical code should be inside a function
Let’s compute the in-place update \(y \leftarrow y + a\,x\)
# Scaled vector update (y[i] += a * x[i]) written directly at top level, outside any function.
n = 100_000
a = 1.2
x = rand(Float64, n)
y = rand(Float64, n)
# `a`, `x` and `y` are non-constant globals here; `@time` measures the whole loop,
# including any compilation triggered by running it.
@time for i in eachindex(y, x)
y[i] += a * x[i]
end
0.045648 seconds (699.26 k allocations: 12.205 MiB, 38.42% gc time, 33.61% compilation time)
To optimize the code, Julia needs it to be inside a function.
"""
    axpy!(y, a, x)

Update `y` in place so that `y[i]` becomes `y[i] + a * x[i]` for every index shared by
`y` and `x`. `x` is only read. Returns `nothing`.

The indices come from `eachindex(y, x)`, so `y` and `x` must have matching indices.
"""
function axpy!(y, a, x)
    for idx in eachindex(y, x)
        y[idx] = y[idx] + a * x[idx]
    end
    return nothing
end
# Warm-up: the first call to `axpy!` with these argument types includes compilation,
# so it is run once before measuring.
axpy!(y, a, x)
# Timing: this second call measures the already-compiled function.
@time axpy!(y, a, x)
0.000045 seconds
"""
    add_using_function_arg(x, y)

Return `x + y`. Both operands are passed in as arguments; nothing is read from
global scope.
"""
add_using_function_arg(x, y) = x + y
# `$variable` interpolates the value of the global `variable` into the benchmark
# expression, so the call is timed with that value passed as an ordinary argument.
@btime add_using_function_arg(10, $variable);
2.775 ns (0 allocations: 0 bytes)
; @ In[5]:1 within `add_using_function_arg`
define i64 @julia_add_using_function_arg_989(i64 signext %0, i64 signext %1) #0 {
top:
; @ In[5]:2 within `add_using_function_arg`
; ┌ @ int.jl:87 within `+`
%2 = add i64 %1, %0
; └
ret i64 %2
}
; @ In[4]:5 within `add_using_global_variable`
define nonnull {}* @julia_add_using_global_variable_1012(i64 signext %0) #0 {
top:
%1 = alloca [2 x {}*], align 8
%gcframe2 = alloca [4 x {}*], align 16
%gcframe2.sub = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 0
%.sub = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 0
%2 = bitcast [4 x {}*]* %gcframe2 to i8*
call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(32) %2, i8 0, i32 32, i1 false)
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #4
%ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
%ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
%pgcstack = load {}***, {}**** %ppgcstack, align 8
; @ In[4]:6 within `add_using_global_variable`
%3 = bitcast [4 x {}*]* %gcframe2 to i64*
store i64 8, i64* %3, align 16
%4 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 1
%5 = bitcast {}** %4 to {}***
%6 = load {}**, {}*** %pgcstack, align 8
store {}** %6, {}*** %5, align 8
%7 = bitcast {}*** %pgcstack to {}***
store {}** %gcframe2.sub, {}*** %7, align 8
%8 = load atomic {}*, {}** inttoptr (i64 139955336514392 to {}**) unordered, align 8
%9 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 2
store {}* %8, {}** %9, align 16
%10 = call nonnull {}* @ijl_box_int64(i64 signext %0)
%11 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 3
store {}* %10, {}** %11, align 8
store {}* %10, {}** %.sub, align 8
%12 = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 1
store {}* %8, {}** %12, align 8
%13 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 139956047507120 to {}*), {}** nonnull %.sub, i32 2)
%14 = load {}*, {}** %4, align 8
%15 = bitcast {}*** %pgcstack to {}**
store {}* %14, {}** %15, align 8
ret {}* %13
}
const
# Global binding declared `const`, unlike a plain (non-const) global assignment.
const constant = 10
"""
    add_by_passing_global_constant(x, v)

Return `x + v`. The second operand is received as the argument `v`; the function
itself does not read any global.
"""
add_by_passing_global_constant(x, v) = x + v
# `$constant` interpolates the value of the global `constant` into the benchmark
# expression, so it reaches the function as an ordinary argument.
@btime add_by_passing_global_constant(10, $constant);
3.085 ns (0 allocations: 0 bytes)
# Plain (non-`const`) global; `sum_variable_many_times` reads it from inside its body.
variable = 10
"""
    sum_variable_many_times(n)

Start from `rand(variable)` and add a fresh `rand(variable)` to it element-wise `n`
times, returning the accumulated array.

`variable` is read from global scope on every call to `rand`; it is not an argument
of this function.
"""
function sum_variable_many_times(n)
    acc = rand(variable)
    for _ in 1:n
        # In-place element-wise accumulation; `rand(variable)` still allocates each time.
        acc .= acc .+ rand(variable)
    end
    return acc
end
# Benchmark with n = 100. Nothing is interpolated here: the global `variable` is read
# inside the function body rather than passed as an argument.
@btime sum_variable_many_times(100);
57.287 μs (301 allocations: 20.45 KiB)