Search code examples
dataframeazurescalaapache-sparkapache-spark-sql

How to split the original dataframe and merge it back together


I have a dataframe(let's say originalDf) with 230 columns and 10 rows. I need to split it based on number of columns (say = 150) i.e df1 with 150 columns and df2 with 80 columns. When i merged it back, the columns are merged correctly but i see number of rows as 20. I am using scala and spark (3.2.0). Please suggest the solution.

// Specify the number of columns in each split
    val columnsPerSplit = 100

    // Get the total number of columns in the original DataFrame
    val totalColumns = originalDF.columns.length

    // Calculate the number of splits required
    val numSplits = (totalColumns.toDouble / columnsPerSplit).ceil.toInt

    // Split the original DataFrame into multiple DataFrames based on the specified number of columns
    val splitDataFrames = (0 until numSplits).map { splitIndex =>
      val startColIndex = splitIndex * columnsPerSplit
      val endColIndex = Math.min((splitIndex + 1) * columnsPerSplit, totalColumns)

      // Select columns for the current split
      val selectedColumns = originalDF.columns.slice(startColIndex, endColIndex).map(col)

      // Create a new DataFrame with selected columns
      val splitDataFrame = originalDF.select(selectedColumns: _*)

      splitDataFrame
    }

    // Merge all the splitted transformed dataframes
    val mergedDF = splitDataFrames.reduce((df1, df2) => df1.unionByName(df2, true))```


Expected: Number of rows should remain same both in originalDF and mergedDF

For ex: 

Actual:

df1 = spark.createDataFrame([[0, 1, 2]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[3, 4, 5]], ["col3", "col4", "col5"])
df1.unionByName(df2, allowMissingColumns=True).show()
+----+----+----+----+----+----+
|col0|col1|col2|col3|col4|col5|
+----+----+----+----+----+----+
|   0|   1|   2|NULL|NULL|NULL|
|NULL|NULL|NULL|   3|   4|   5|
+----+----+----+----+----+----+


Expected:

df1 = spark.createDataFrame([[0, 1, 2]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[3, 4, 5]], ["col3", "col4", "col5"])
df1.unionByName(df2, allowMissingColumns=True).show()
+----+----+----+----+----+----+
|col0|col1|col2|col3|col4|col5|
+----+----+----+----+----+----+
|   0|   1|   2|3   |4   |5   |
+----+----+----+----+----+----+

Solution

  • Add one unique id column using monotonically_increasing_id function to actual DataFrame. Using this column you can join DataFrame's back.

    val inputDF = df.withColumn("id", monotonically_increasing_id())
    
    val splitAt = 15 // You can change as per your need.
    
    val splitColumns = inputDF.columns.init.splitAt(splitAt)
    
    val leftColumns  = splitColumns._1 ++ Seq("id") 
    
    // leftColumns are col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8, col_9, col_10, col_11, col_12, col_13, col_14, col_15, id
    
    val rightColumns = splitColumns._2 ++ Seq("id") 
    
    // rightColumns are col_16, col_17, col_18, col_19, col_20, col_21, col_22, col_23, id
    
    // Splitting original DataFrame
    val leftDF = inputDF.selectExpr(leftColumns: _*) 
    val rightDF = inputDF.selectExpr(rightColumns: _*)
    
    // Merging DataFrame's back using join 
    val mergedDF = leftDF.join(rightDF, "id", "inner")
    

    Updated Code Below

      val splitAt = 5
    
      val output = df.columns.init.zipWithIndex
        .grouped(splitAt)
        .map { cols =>
          val columns = cols.map(_._1) ++ Seq("id")
          df.selectExpr(columns: _*)
        }
        .toList
    
     // output list contains 5 DataFrame's as split size is 5 & total columns are 24
    
      output.foreach{ sdf =>
        sdf.printSchema()
        sdf.show(false)
      }
    
    // Exiting paste mode, now interpreting.
    
    root
     |-- col_1: string (nullable = true)
     |-- col_2: string (nullable = true)
     |-- col_3: string (nullable = true)
     |-- col_4: string (nullable = true)
     |-- col_5: string (nullable = true)
     |-- id: long (nullable = false)
    
    +-----+-----+-----+-----+-----+---+
    |col_1|col_2|col_3|col_4|col_5|id |
    +-----+-----+-----+-----+-----+---+
    |A1   |A2   |A3   |A4   |A5   |0  |
    |B1   |B2   |B3   |B4   |B5   |1  |
    |C1   |C2   |C3   |C4   |C5   |2  |
    |D1   |D2   |D3   |D4   |D5   |3  |
    |E1   |E2   |E3   |E4   |E5   |4  |
    |F1   |F2   |F3   |F4   |F5   |5  |
    |G1   |G2   |G3   |G4   |G5   |6  |
    |H1   |H2   |H3   |H4   |H5   |7  |
    |I1   |I2   |I3   |I4   |I5   |8  |
    +-----+-----+-----+-----+-----+---+
    
    root
     |-- col_6: string (nullable = true)
     |-- col_7: string (nullable = true)
     |-- col_8: string (nullable = true)
     |-- col_9: string (nullable = true)
     |-- col_10: string (nullable = true)
     |-- id: long (nullable = false)
    
    +-----+-----+-----+-----+------+---+
    |col_6|col_7|col_8|col_9|col_10|id |
    +-----+-----+-----+-----+------+---+
    |A6   |A7   |A8   |A9   |A10   |0  |
    |B6   |B7   |B8   |B9   |B10   |1  |
    |C6   |C7   |C8   |C9   |C10   |2  |
    |D6   |D7   |D8   |D9   |D10   |3  |
    |E6   |E7   |E8   |E9   |E10   |4  |
    |F6   |F7   |F8   |F9   |F10   |5  |
    |G6   |G7   |G8   |G9   |G10   |6  |
    |H6   |H7   |H8   |H9   |H10   |7  |
    |I6   |I7   |I8   |I9   |I10   |8  |
    +-----+-----+-----+-----+------+---+
    
    root
     |-- col_11: string (nullable = true)
     |-- col_12: string (nullable = true)
     |-- col_13: string (nullable = true)
     |-- col_14: string (nullable = true)
     |-- col_15: string (nullable = true)
     |-- id: long (nullable = false)
    
    +------+------+------+------+------+---+
    |col_11|col_12|col_13|col_14|col_15|id |
    +------+------+------+------+------+---+
    |A11   |A12   |A13   |A14   |A15   |0  |
    |B11   |B12   |B13   |B14   |B15   |1  |
    |C11   |C12   |C13   |C14   |C15   |2  |
    |D11   |D12   |D13   |D14   |D15   |3  |
    |E11   |E12   |E13   |E14   |E15   |4  |
    |F11   |F12   |F13   |F14   |F15   |5  |
    |G11   |G12   |G13   |G14   |G15   |6  |
    |H11   |H12   |H13   |H14   |H15   |7  |
    |I11   |I12   |I13   |I14   |I15   |8  |
    +------+------+------+------+------+---+
    
    root
     |-- col_16: string (nullable = true)
     |-- col_17: string (nullable = true)
     |-- col_18: string (nullable = true)
     |-- col_19: string (nullable = true)
     |-- col_20: string (nullable = true)
     |-- id: long (nullable = false)
    
    +------+------+------+------+------+---+
    |col_16|col_17|col_18|col_19|col_20|id |
    +------+------+------+------+------+---+
    |A16   |A17   |A18   |A19   |A20   |0  |
    |B16   |B17   |B18   |B19   |B20   |1  |
    |C16   |C17   |C18   |C19   |C20   |2  |
    |D16   |D17   |D18   |D19   |D20   |3  |
    |E16   |E17   |E18   |E19   |E20   |4  |
    |F16   |F17   |F18   |F19   |F20   |5  |
    |G16   |G17   |G18   |G19   |G20   |6  |
    |H16   |H17   |H18   |H19   |H20   |7  |
    |I16   |I17   |I18   |I19   |I20   |8  |
    +------+------+------+------+------+---+
    
    root
     |-- col_21: string (nullable = true)
     |-- col_22: string (nullable = true)
     |-- col_23: string (nullable = true)
     |-- id: long (nullable = false)
    
    +------+------+------+---+
    |col_21|col_22|col_23|id |
    +------+------+------+---+
    |A21   |A22   |A23   |0  |
    |B21   |B22   |B23   |1  |
    |C21   |C22   |C23   |2  |
    |D21   |D22   |D23   |3  |
    |E21   |E22   |E23   |4  |
    |F21   |F22   |F23   |5  |
    |G21   |G22   |G23   |6  |
    |H21   |H22   |H23   |7  |
    |I21   |I22   |I23   |8  |
    +------+------+------+---+
    
    
    output.reduce(_.join(_, "id", "inner")).show(false)
    +---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
    |id |col_1|col_2|col_3|col_4|col_5|col_6|col_7|col_8|col_9|col_10|col_11|col_12|col_13|col_14|col_15|col_16|col_17|col_18|col_19|col_20|col_21|col_22|col_23|
    +---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
    |0  |A1   |A2   |A3   |A4   |A5   |A6   |A7   |A8   |A9   |A10   |A11   |A12   |A13   |A14   |A15   |A16   |A17   |A18   |A19   |A20   |A21   |A22   |A23   |
    |1  |B1   |B2   |B3   |B4   |B5   |B6   |B7   |B8   |B9   |B10   |B11   |B12   |B13   |B14   |B15   |B16   |B17   |B18   |B19   |B20   |B21   |B22   |B23   |
    |2  |C1   |C2   |C3   |C4   |C5   |C6   |C7   |C8   |C9   |C10   |C11   |C12   |C13   |C14   |C15   |C16   |C17   |C18   |C19   |C20   |C21   |C22   |C23   |
    |3  |D1   |D2   |D3   |D4   |D5   |D6   |D7   |D8   |D9   |D10   |D11   |D12   |D13   |D14   |D15   |D16   |D17   |D18   |D19   |D20   |D21   |D22   |D23   |
    |4  |E1   |E2   |E3   |E4   |E5   |E6   |E7   |E8   |E9   |E10   |E11   |E12   |E13   |E14   |E15   |E16   |E17   |E18   |E19   |E20   |E21   |E22   |E23   |
    |5  |F1   |F2   |F3   |F4   |F5   |F6   |F7   |F8   |F9   |F10   |F11   |F12   |F13   |F14   |F15   |F16   |F17   |F18   |F19   |F20   |F21   |F22   |F23   |
    |6  |G1   |G2   |G3   |G4   |G5   |G6   |G7   |G8   |G9   |G10   |G11   |G12   |G13   |G14   |G15   |G16   |G17   |G18   |G19   |G20   |G21   |G22   |G23   |
    |7  |H1   |H2   |H3   |H4   |H5   |H6   |H7   |H8   |H9   |H10   |H11   |H12   |H13   |H14   |H15   |H16   |H17   |H18   |H19   |H20   |H21   |H22   |H23   |
    |8  |I1   |I2   |I3   |I4   |I5   |I6   |I7   |I8   |I9   |I10   |I11   |I12   |I13   |I14   |I15   |I16   |I17   |I18   |I19   |I20   |I21   |I22   |I23   |
    +---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+