Skip to content

Instantly share code, notes, and snippets.

@mdmaas
Last active September 29, 2023 14:00
Show Gist options
  • Select an option

  • Save mdmaas/d1b6b1a69a6b235143d7110237ff4ae8 to your computer and use it in GitHub Desktop.

Select an option

Save mdmaas/d1b6b1a69a6b235143d7110237ff4ae8 to your computer and use it in GitHub Desktop.

Revisions

  1. mdmaas revised this gist Sep 17, 2023. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions jarrays.jl
    Original file line number Diff line number Diff line change
    @@ -3,6 +3,8 @@ using Bumper
    using StrideArraysCore
    using StaticTools

    set_default_buffer_size!(1000)

    @inline function sumArray_alloc(N)
    smallarray = Array{Float64}(undef,N)
    @turbo for i 1:N
  2. mdmaas revised this gist Sep 17, 2023. 1 changed file with 12 additions and 42 deletions.
    54 changes: 12 additions & 42 deletions jarrays.jl
    Original file line number Diff line number Diff line change
    @@ -3,14 +3,6 @@ using Bumper
    using StrideArraysCore
    using StaticTools

    # Return the sum of `1.0 / i^2` for `i` in `1:N`, accumulated directly in a scalar
    # (no intermediate array is used).
    function sum_cheat(N)
        total = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        @turbo for i in 1:N
            total += 1.0 / i^2
        end
        return total
    end

    @inline function sumArray_alloc(N)
    smallarray = Array{Float64}(undef,N)
    @turbo for i 1:N
    @@ -81,26 +73,6 @@ function test_bumper(N)
    end
    end

    # Call `sumArray_prealloc` 10000 times for size `N`, reusing a single `Array{Float64}`
    # that is allocated once before the loop. The computed sum is discarded; returns `nothing`.
    function test_prealloc(N)
        rep = 10000
        smallarray = Array{Float64}(undef, N)
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumArray_prealloc(N, smallarray)
        end
        return nothing
    end




    # Call `sum_cheat(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_cheat(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sum_cheat(N)
        end
        return nothing
    end

    function test_malloc(N)
    rep = 10000
    x = 0.0
    @@ -109,19 +81,17 @@ function test_malloc(N)
    end
    end



    function test_pre_malloc(N)
    function test_prealloc(N)
    rep = 10000
    smallarray = MallocArray{Float64}(undef,N)
    smallarray = Array{Float64}(undef,N)
    x = 0.0
    for i 1:rep
    x = sumArray_prealloc(N,smallarray)
    end
    free(smallarray)
    end



    using Libdl
    C_code = """
    #include <stdlib.h>
    @@ -185,7 +155,7 @@ end
    time_sumCArray(N, REP) = @ccall "./libarray.so".timesumCArray(N::Cint, REP::Cint)::Cdouble
    test_c_timing(N) = time_sumCArray(N, 10000)

    Ns = [3^p for p in 2:10]
    Ns = 5:2:100
    t_cheat = [(@elapsed test_cheat(N))*1e6 for N Ns]
    t_alloc = [(@elapsed test_alloc(N))*1e6 for N Ns]
    t_malloc = [(@elapsed test_malloc(N))*1e6 for N Ns]
    @@ -200,11 +170,11 @@ mean(x) = sum(x) / length(x)
    using Plots
    gr()

    scatter(log10.(Ns), t_alloc./t_cstack, label="Julia Arrays")
    scatter!(log10.(Ns), t_prealloc./t_cstack, label="Julia Pre-allocated Arrays")
    scatter!(log10.(Ns), t_malloc./t_cstack, label="Julia MallocArrays")
    scatter!(log10.(Ns), t_pre_malloc./t_cstack, label="Julia Pre-MallocArrays")
    scatter!(log10.(Ns), t_bumper./t_cstack, label="Bumper+StrideArrays")
    scatter!(log10.(Ns), t_cheap./t_cstack, label="C Heap Array")
    plot!(xlabel="log10(N)", ylabel="Cost wrt C stack-allocation")
    plot!(log10.(Ns), ones(size(Ns)), color=:black, label="C Stack Arrays")
    # plot(Ns, t_alloc./t_cstack, label="Julia Arrays")
    plot(Ns, t_prealloc./t_cstack, label="Julia Pre-allocated Arrays")
    plot!(Ns, t_malloc./t_cstack, label="Julia MallocArrays")
    plot!(Ns, t_pre_malloc./t_cstack, label="Julia Pre-MallocArrays")
    plot!(Ns, t_bumper./t_cstack, label="Bumper+StrideArrays")
    plot!(Ns, t_cheap./t_cstack, label="C Heap Array")
    plot!(xlabel="N", ylabel="Cost wrt C stack-allocation")
    plot!(Ns, ones(size(Ns)), color=:black, label="C Stack Arrays")
  3. mdmaas revised this gist Sep 17, 2023. 1 changed file with 54 additions and 19 deletions.
    73 changes: 54 additions & 19 deletions jarrays.jl
    Original file line number Diff line number Diff line change
    @@ -1,9 +1,17 @@
    using LoopVectorization
    using Bumper
    using StrideArrays
    using StrideArraysCore
    using StaticTools

    function sumArray_alloc(N)
    # Return the sum of `1.0 / i^2` for `i` in `1:N`, accumulated directly in a scalar
    # (no intermediate array is used).
    function sum_cheat(N)
        total = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        @turbo for i in 1:N
            total += 1.0 / i^2
        end
        return total
    end

    @inline function sumArray_alloc(N)
    smallarray = Array{Float64}(undef,N)
    @turbo for i 1:N
    smallarray[i] = 1.0 / i^2
    @@ -15,7 +23,7 @@ function sumArray_alloc(N)
    return sum
    end

    function sumArray_malloc(N)
    @inline function sumArray_malloc(N)
    smallarray = MallocArray{Float64}(undef, N)
    @turbo for i 1:N
    smallarray[i] = 1.0 / i^2
    @@ -28,7 +36,7 @@ function sumArray_malloc(N)
    return sum
    end

    function sumArray_bumper(N)
    @inline function sumArray_bumper(N)
    @no_escape begin
    smallarray = alloc(Float64, N)
    @turbo for i 1:N
    @@ -44,7 +52,7 @@ end



    function sumArray_prealloc(N, smallarray)
    @inline function sumArray_prealloc(N, smallarray)
    @turbo for i 1:N
    smallarray[i] = 1.0 / i^2
    end
    @@ -55,6 +63,8 @@ function sumArray_prealloc(N, smallarray)
    return sum
    end



    function test_alloc(N)
    rep = 10000
    x = 0.0
    @@ -63,30 +73,52 @@ function test_alloc(N)
    end
    end

    function test_malloc(N)
    function test_bumper(N)
    rep = 10000
    x = 0.0
    for i 1:rep
    x = sumArray_malloc(N)
    x = sumArray_bumper(N)
    end
    end

    function test_bumper(N)
    function test_prealloc(N)
    rep = 10000
    smallarray = Array{Float64}(undef,N)
    x = 0.0
    for i 1:rep
    x = sumArray_bumper(N)
    x = sumArray_prealloc(N,smallarray)
    end
    end


    function test_prealloc(N)


    # Call `sum_cheat(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_cheat(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sum_cheat(N)
        end
        return nothing
    end

    function test_malloc(N)
    rep = 10000
    smallarray = Array{Float64}(undef,N)
    x = 0.0
    for i 1:rep
    x = sumArray_malloc(N)
    end
    end



    function test_pre_malloc(N)
    rep = 10000
    smallarray = MallocArray{Float64}(undef,N)
    x = 0.0
    for i 1:rep
    x = sumArray_prealloc(N,smallarray)
    end
    free(smallarray)
    end


    @@ -153,11 +185,13 @@ end
    time_sumCArray(N, REP) = @ccall "./libarray.so".timesumCArray(N::Cint, REP::Cint)::Cdouble
    test_c_timing(N) = time_sumCArray(N, 10000)

    Ns = [2^p for p in 2:14]
    Ns = [3^p for p in 2:10]
    t_cheat = [(@elapsed test_cheat(N))*1e6 for N Ns]
    t_alloc = [(@elapsed test_alloc(N))*1e6 for N Ns]
    t_malloc = [(@elapsed test_malloc(N))*1e6 for N Ns]
    t_bumper = [(@elapsed test_bumper(N))*1e6 for N Ns]
    t_prealloc = [(@elapsed test_prealloc(N))*1e6 for N Ns]
    t_pre_malloc = [(@elapsed test_pre_malloc(N))*1e6 for N Ns]
    t_cstack = [(@elapsed test_cstackarray(N))*1e6 for N Ns]
    t_cheap = [(@elapsed test_cheaparray(N))*1e6 for N Ns]

    @@ -166,10 +200,11 @@ mean(x) = sum(x) / length(x)
    using Plots
    gr()

    scatter(log2.(Ns), t_alloc./t_cstack, label="Julia Arrays")
    scatter!(log2.(Ns), t_prealloc./t_cstack, label="Julia Pre-allocated Arrays")
    scatter!(log2.(Ns), t_malloc./t_cstack, label="Julia MallocArrays")
    scatter!(log2.(Ns), t_bumper./t_cstack, label="Bumper+StrideArrays")
    scatter!(log2.(Ns), t_cheap./t_cstack, label="C Heap Array")
    plot!(xlabel="log2(N)", ylabel="Cost wrt C stack-allocation")
    plot!(log2.(Ns), ones(size(Ns)), color=:black, label="C Stack Arrays")
    scatter(log10.(Ns), t_alloc./t_cstack, label="Julia Arrays")
    scatter!(log10.(Ns), t_prealloc./t_cstack, label="Julia Pre-allocated Arrays")
    scatter!(log10.(Ns), t_malloc./t_cstack, label="Julia MallocArrays")
    scatter!(log10.(Ns), t_pre_malloc./t_cstack, label="Julia Pre-MallocArrays")
    scatter!(log10.(Ns), t_bumper./t_cstack, label="Bumper+StrideArrays")
    scatter!(log10.(Ns), t_cheap./t_cstack, label="C Heap Array")
    plot!(xlabel="log10(N)", ylabel="Cost wrt C stack-allocation")
    plot!(log10.(Ns), ones(size(Ns)), color=:black, label="C Stack Arrays")
  4. mdmaas created this gist Sep 15, 2023.
    175 changes: 175 additions & 0 deletions jarrays.jl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,175 @@
    using LoopVectorization
    using Bumper
    using StrideArrays
    using StaticTools

    """
        sumArray_alloc(N)

    Allocate a new `Array{Float64}` of length `N`, store `1.0 / i^2` at index `i`,
    and return the sum of the stored values.
    """
    function sumArray_alloc(N)
        smallarray = Array{Float64}(undef, N)
        # The iteration keyword was missing from both loop headers; restored as `in`.
        @turbo for i in 1:N
            smallarray[i] = 1.0 / i^2
        end
        # Named `total` rather than `sum` so `Base.sum` is not shadowed.
        total = 0.0
        @turbo for i in 1:N
            total += smallarray[i]
        end
        return total
    end

    """
        sumArray_malloc(N)

    Create a `MallocArray{Float64}` of length `N`, store `1.0 / i^2` at index `i`,
    sum the stored values, call `free` on the array, and return the sum.
    """
    function sumArray_malloc(N)
        smallarray = MallocArray{Float64}(undef, N)
        # The iteration keyword was missing from both loop headers; restored as `in`.
        @turbo for i in 1:N
            smallarray[i] = 1.0 / i^2
        end
        # Named `total` rather than `sum` so `Base.sum` is not shadowed.
        total = 0.0
        @turbo for i in 1:N
            total += smallarray[i]
        end
        # The array is released explicitly before returning.
        free(smallarray)
        return total
    end

    """
        sumArray_bumper(N)

    Inside a `@no_escape` block, obtain a length-`N` `Float64` array with `alloc`,
    store `1.0 / i^2` at index `i`, sum the stored values, and return the sum.
    """
    function sumArray_bumper(N)
        @no_escape begin
            smallarray = alloc(Float64, N)
            # The iteration keyword was missing from both loop headers; restored as `in`.
            @turbo for i in 1:N
                smallarray[i] = 1.0 / i^2
            end
            # Named `total` rather than `sum` so `Base.sum` is not shadowed.
            total = 0.0
            @turbo for i in 1:N
                total += smallarray[i]
            end
        end
        # NOTE(review): the accumulator is assigned inside the `@no_escape` block and read
        # here, exactly as in the original; this presumes the macro does not introduce a
        # new scope -- confirm against the Bumper version in use.
        return total
    end



    """
        sumArray_prealloc(N, smallarray)

    Overwrite `smallarray[i]` with `1.0 / i^2` for `i` in `1:N`, then return the sum of
    those `N` entries. No array is created here; `smallarray` is supplied by the caller.
    """
    function sumArray_prealloc(N, smallarray)
        # assumes `smallarray` has at least `N` elements -- TODO confirm at call sites
        # The iteration keyword was missing from both loop headers; restored as `in`.
        @turbo for i in 1:N
            smallarray[i] = 1.0 / i^2
        end
        # Named `total` rather than `sum` so `Base.sum` is not shadowed.
        total = 0.0
        @turbo for i in 1:N
            total += smallarray[i]
        end
        return total
    end

    # Call `sumArray_alloc(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_alloc(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumArray_alloc(N)
        end
        return nothing
    end

    # Call `sumArray_malloc(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_malloc(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumArray_malloc(N)
        end
        return nothing
    end

    # Call `sumArray_bumper(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_bumper(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumArray_bumper(N)
        end
        return nothing
    end


    # Call `sumArray_prealloc` 10000 times for size `N`, reusing a single `Array{Float64}`
    # that is allocated once before the loop. The computed sum is discarded; returns `nothing`.
    function test_prealloc(N)
        rep = 10000
        smallarray = Array{Float64}(undef, N)
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumArray_prealloc(N, smallarray)
        end
        return nothing
    end


    using Libdl

    # C reference kernels. Both fill an array with 1/(k+1)^2 for k = 0..N-1 and sum it:
    # `sumCStackArray` uses a variable-length array declared inside the function,
    # `sumCHeapArray` uses `malloc`/`free`.
    C_code = """
    #include <stdlib.h>
    #include <math.h>
    #include <omp.h>
    double sumCStackArray( int N ) {
    double smallarray[N];
    for(unsigned int k = 0; k<N; k++){
    smallarray[k] = 1.0/pow(k+1,2);
    }
    double sum = 0.0;
    #pragma omp simd reduction(+:sum)
    for(unsigned int k = 0; k<N; k++){
    sum += smallarray[k];
    }
    return sum;
    }
    double sumCHeapArray( int N ) {
    double * smallarray = malloc(N * sizeof(double));
    for(unsigned int k = 0; k<N; k++){
    smallarray[k] = 1.0/pow(k+1,2);
    }
    double sum = 0.0;
    #pragma omp simd reduction(+:sum)
    for(unsigned int k = 0; k<N; k++){
    sum += smallarray[k];
    }
    free(smallarray);
    return sum;
    }
    """
    Clib = "libarray"
    # Build and load the library under the same name. The output file uses the platform
    # extension `Libdl.dlext`, so the wrappers below must not hard-code ".so".
    const Clib_path = "./" * Clib * "." * Libdl.dlext
    # The C source is piped to gcc on stdin (`-xc ... -`).
    open(`gcc -fPIC -O3 -fargument-noalias -fopenmp -xc -shared -o $Clib_path -`, "w") do f
        print(f, C_code)
    end
    # Julia wrappers around the two C kernels; `N` is passed as `Cint`, result is `Cdouble`.
    sumCStackArray(N) = @ccall Clib_path.sumCStackArray(N::Cint)::Cdouble
    sumCHeapArray(N) = @ccall Clib_path.sumCHeapArray(N::Cint)::Cdouble

    # Call `sumCStackArray(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_cstackarray(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumCStackArray(N)
        end
        return nothing
    end

    # Call `sumCHeapArray(N)` 10000 times. The computed sum is discarded; returns `nothing`.
    function test_cheaparray(N)
        rep = 10000
        x = 0.0
        # The iteration keyword was missing from the loop header; restored as `in`.
        for _ in 1:rep
            x = sumCHeapArray(N)
        end
        return nothing
    end

    # Wrapper that calls the C symbol `timesumCArray` in "./libarray.so", passing `N` and
    # `REP` as `Cint` and returning a `Cdouble`.
    # NOTE(review): the C source embedded in this file defines only `sumCStackArray` and
    # `sumCHeapArray`; no `timesumCArray` is visible here -- confirm the symbol exists
    # before calling this.
    time_sumCArray(N, REP) = @ccall "./libarray.so".timesumCArray(N::Cint, REP::Cint)::Cdouble
    # Calls `time_sumCArray` with `REP` fixed to 10000.
    test_c_timing(N) = time_sumCArray(N, 10000)

    # Problem sizes: powers of two from 2^2 to 2^14.
    Ns = [2^p for p in 2:14]

    # Run every benchmark once before timing so that JIT compilation of the first call
    # is not included in the first sample of each series.
    for bench in (
        test_alloc, test_malloc, test_bumper, test_prealloc, test_cstackarray, test_cheaparray,
    )
        bench(first(Ns))
    end

    # Elapsed time of one `test_*` call per size; `@elapsed` yields seconds, scaled by 1e6.
    # The iteration keyword was missing from the comprehensions; restored as `in`.
    t_alloc = [(@elapsed test_alloc(N)) * 1e6 for N in Ns]
    t_malloc = [(@elapsed test_malloc(N)) * 1e6 for N in Ns]
    t_bumper = [(@elapsed test_bumper(N)) * 1e6 for N in Ns]
    t_prealloc = [(@elapsed test_prealloc(N)) * 1e6 for N in Ns]
    t_cstack = [(@elapsed test_cstackarray(N)) * 1e6 for N in Ns]
    t_cheap = [(@elapsed test_cheaparray(N)) * 1e6 for N in Ns]

    # Arithmetic mean of a collection.
    mean(x) = sum(x) / length(x)

    using Plots
    gr()

    # x-axis: log2 of the problem size; y-axis: each timing divided by the C stack timing.
    log2N = log2.(Ns)
    scatter(log2N, t_alloc ./ t_cstack, label="Julia Arrays")
    for (timings, series_label) in (
        (t_prealloc, "Julia Pre-allocated Arrays"),
        (t_malloc, "Julia MallocArrays"),
        (t_bumper, "Bumper+StrideArrays"),
        (t_cheap, "C Heap Array"),
    )
        scatter!(log2N, timings ./ t_cstack, label=series_label)
    end
    plot!(xlabel="log2(N)", ylabel="Cost wrt C stack-allocation")
    # Reference line at ratio 1 for the C stack baseline itself.
    plot!(log2N, ones(size(Ns)), color=:black, label="C Stack Arrays")