Put code inside functions

Performance-critical code should be inside a function.

Let’s compute \(y \leftarrow y + a \, x\) (the AXPY update)

# Problem size and scalar coefficient, defined as untyped globals.
n = 100_000
a = 1.2
# Input vector `x` and the vector `y` that is updated in place.
x = rand(Float64, n)
y = rand(Float64, n)

# Time the update loop run directly at top level, where it reads the globals above.
@time for i in eachindex(y, x)
    y[i] += a * x[i]
end
  0.045648 seconds (699.26 k allocations: 12.205 MiB, 38.42% gc time, 33.61% compilation time)

To optimize the code, Julia needs it to be inside a function.

"""
    axpy!(y, a, x)

Update `y` in place so that `y[i]` becomes `y[i] + a * x[i]` for every shared index of
`y` and `x`. Returns `nothing`.
"""
function axpy!(y, a, x)
    for idx in eachindex(y, x)
        y[idx] = y[idx] + a * x[idx]
    end
    return nothing
end

# warmup: call once before measuring, since the first call also includes compilation
axpy!(y, a, x)

# timing: measure a subsequent call
@time axpy!(y, a, x)
  0.000045 seconds

Avoid untyped global variables

Using a global variable

using BenchmarkTools

# Untyped, non-constant global read by the function below.
variable = 10

"""
    add_using_global_variable(x)

Return `x + variable`, where `variable` is the global defined above (not an argument).
"""
add_using_global_variable(x) = x + variable

# Benchmark the function that reads the untyped global `variable`.
@btime add_using_global_variable(10);
  19.121 ns (0 allocations: 0 bytes)

Pass the variable as an argument to the function

"""
    add_using_function_arg(x, y)

Return `x + y`; both operands are received as arguments.
"""
add_using_function_arg(x, y) = x + y

# Benchmark with the global's value interpolated via `$` and passed as an argument.
@btime add_using_function_arg(10, $variable);
  2.775 ns (0 allocations: 0 bytes)

# Show the LLVM IR generated for the argument-passing version.
@code_llvm add_using_function_arg(10, variable)
;  @ In[5]:1 within `add_using_function_arg`
define i64 @julia_add_using_function_arg_989(i64 signext %0, i64 signext %1) #0 {
top:
;  @ In[5]:2 within `add_using_function_arg`
; ┌ @ int.jl:87 within `+`
   %2 = add i64 %1, %0
; └
  ret i64 %2
}

# Show the LLVM IR generated for the version that reads the untyped global.
@code_llvm add_using_global_variable(10)
;  @ In[4]:5 within `add_using_global_variable`
define nonnull {}* @julia_add_using_global_variable_1012(i64 signext %0) #0 {
top:
  %1 = alloca [2 x {}*], align 8
  %gcframe2 = alloca [4 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 0
  %.sub = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 0
  %2 = bitcast [4 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(32) %2, i8 0, i32 32, i1 false)
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #4
  %ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
  %ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
  %pgcstack = load {}***, {}**** %ppgcstack, align 8
;  @ In[4]:6 within `add_using_global_variable`
  %3 = bitcast [4 x {}*]* %gcframe2 to i64*
  store i64 8, i64* %3, align 16
  %4 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 1
  %5 = bitcast {}** %4 to {}***
  %6 = load {}**, {}*** %pgcstack, align 8
  store {}** %6, {}*** %5, align 8
  %7 = bitcast {}*** %pgcstack to {}***
  store {}** %gcframe2.sub, {}*** %7, align 8
  %8 = load atomic {}*, {}** inttoptr (i64 139955336514392 to {}**) unordered, align 8
  %9 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 2
  store {}* %8, {}** %9, align 16
  %10 = call nonnull {}* @ijl_box_int64(i64 signext %0)
  %11 = getelementptr inbounds [4 x {}*], [4 x {}*]* %gcframe2, i64 0, i64 3
  store {}* %10, {}** %11, align 8
  store {}* %10, {}** %.sub, align 8
  %12 = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 1
  store {}* %8, {}** %12, align 8
  %13 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 139956047507120 to {}*), {}** nonnull %.sub, i32 2)
  %14 = load {}*, {}** %4, align 8
  %15 = bitcast {}*** %pgcstack to {}**
  store {}* %14, {}** %15, align 8
  ret {}* %13
}

Set the type of the global variable

# Non-constant global annotated with the concrete type `Int`.
variable_typed::Int = 10

"""
    add_using_global_variable_typed(x)

Return `x + variable_typed`, where `variable_typed` is the type-annotated global
defined above (not an argument).
"""
add_using_global_variable_typed(x) = x + variable_typed

# Benchmark the function that reads the type-annotated global `variable_typed`.
@btime add_using_global_variable_typed(10);
  3.085 ns (0 allocations: 0 bytes)

Use the keyword const to declare the global as a constant

# Global declared with `const`.
const constant = 10

"""
    add_by_passing_global_constant(x, v)

Return `x + v`; the second operand is received as the argument `v`.
"""
add_by_passing_global_constant(x, v) = x + v

# Benchmark with the constant's value interpolated via `$` and passed as an argument.
@btime add_by_passing_global_constant(10, $constant);
  3.085 ns (0 allocations: 0 bytes)

# Untyped, non-constant global used as the vector length in the function below.
variable = 10

"""
    sum_variable_many_times(n)

Draw a random vector of length `variable`, add `n` further random vectors of the same
length to it element-wise, and return the accumulated vector.
"""
function sum_variable_many_times(n)
    acc = rand(variable)
    for _ in 1:n
        acc .= acc .+ rand(variable)
    end
    return acc
end

# Benchmark the version whose vector length comes from the untyped global `variable`.
@btime sum_variable_many_times(100);
  57.287 μs (301 allocations: 20.45 KiB)

# Constant global used as the vector length in the function below.
const constant = 10

"""
    sum_constant_many_times(n)

Draw a random vector of length `constant`, add `n` further random vectors of the same
length to it element-wise, and return the accumulated vector.
"""
function sum_constant_many_times(n)
    acc = rand(constant)
    for _ in 1:n
        acc .= acc .+ rand(constant)
    end
    return acc
end

# Benchmark the version whose vector length comes from the `const` global `constant`.
@btime sum_constant_many_times(100);
  8.810 μs (101 allocations: 14.20 KiB)