Search code examples
dataframe, julia, control-flow, dataframes.jl

Multiple conditionals in Julia DataFrame


I have a DataFrame with 3 columns, named :x, :y, and :z, all of type Float64. :x and :y are iid uniform on (0,1), and :z is the sum of :x and :y.
I want to do a simple task: if x and y are both greater than 0.5, I want to print z and replace its value with 1.0. For some reason the following code runs but does not work as intended:

if df.x .> 0.5 && df.y .> 0.5
  println(df.z)
  replace!(df, :z) .= 1.0
end

I would appreciate any help with this.


Solution

  • Your code operates on whole columns, but you want it to operate on individual rows. The simplest way to do this is shown below (there are faster ways, but this one is the simplest):

    julia> using DataFrames
    
    julia> df = DataFrame(rand(10, 2), [:x, :y]);
    
    julia> df.z = df.x + df.y;
    
    julia> df
    10×3 DataFrame
     Row │ x           y         z
         │ Float64     Float64   Float64
    ─────┼────────────────────────────────
       1 │ 0.00461518  0.767149  0.771764
       2 │ 0.670752    0.891172  1.56192
       3 │ 0.531777    0.78527   1.31705
       4 │ 0.0666402   0.265558  0.332198
       5 │ 0.700547    0.25959   0.960137
       6 │ 0.764978    0.84093   1.60591
       7 │ 0.720063    0.795599  1.51566
       8 │ 0.524065    0.260897  0.784962
       9 │ 0.577509    0.62598   1.20349
      10 │ 0.363896    0.266637  0.630533
    
    julia> for row in eachrow(df)
               if row.x > 0.5 && row.y > 0.5
                   println(row.z)
                   row.z = 1.0
               end
           end
    1.5619237447442418
    1.3170464579861205
    1.6059082278386194
    1.515661749106264
    1.2034891678047939
    
    julia> df
    10×3 DataFrame
     Row │ x           y         z
         │ Float64     Float64   Float64
    ─────┼────────────────────────────────
       1 │ 0.00461518  0.767149  0.771764
       2 │ 0.670752    0.891172  1.0
       3 │ 0.531777    0.78527   1.0
       4 │ 0.0666402   0.265558  0.332198
       5 │ 0.700547    0.25959   0.960137
       6 │ 0.764978    0.84093   1.0
       7 │ 0.720063    0.795599  1.0
       8 │ 0.524065    0.260897  0.784962
       9 │ 0.577509    0.62598   1.0
      10 │ 0.363896    0.266637  0.630533
    

    Edit

    Assuming you do not need to print, here is a benchmark of several options:

    julia> df = DataFrame(rand(10^7, 2), [:x, :y]);
    
    julia> df.z = df.x + df.y;
    
    julia> @time for row in eachrow(df) # slowest
               if row.x > 0.5 && row.y > 0.5
                   row.z = 1.0
               end
           end
      3.469350 seconds (90.00 M allocations: 2.533 GiB, 10.07% gc time)
    
    julia> @time df.z[df.x .> 0.5 .&& df.y .> 0.5] .= 1.0; # fast and simple
      0.026041 seconds (15 allocations: 20.270 MiB)
    
    julia> function update_condition!(x, y, z)
               # Overwrite z[i] with 1.0, in place, wherever both x[i] and y[i] exceed 0.5.
               # eachindex(x, y, z) yields indices shared by all three arrays, which is
               # what makes dropping bounds checks with @inbounds reasonable here.
               @inbounds for i in eachindex(x, y, z)
                   if x[i] > 0.5 && y[i] > 0.5
                       z[i] = 1.0
                   end
               end
               return nothing  # called for its effect on z; there is no result to return
           end
    update_condition! (generic function with 1 method)
    
    julia> update_condition!(df.x, df.y, df.z); # compilation
    
    julia> @time update_condition!(df.x, df.y, df.z); # faster but more complex
      0.011243 seconds (3 allocations: 96 bytes)