I have a dataframe(let's say originalDf) with 230 columns and 10 rows. I need to split it based on number of columns (say = 150) i.e df1 with 150 columns and df2 with 80 columns. When i merged it back, the columns are merged correctly but i see number of rows as 20. I am using scala and spark (3.2.0). Please suggest the solution.
// Specify the number of columns in each split
val columnsPerSplit = 100
// Get the total number of columns in the original DataFrame
val totalColumns = originalDF.columns.length
// Calculate the number of splits required
val numSplits = (totalColumns.toDouble / columnsPerSplit).ceil.toInt
// Split the original DataFrame into multiple DataFrames based on the specified number of columns
val splitDataFrames = (0 until numSplits).map { splitIndex =>
val startColIndex = splitIndex * columnsPerSplit
val endColIndex = Math.min((splitIndex + 1) * columnsPerSplit, totalColumns)
// Select columns for the current split
val selectedColumns = originalDF.columns.slice(startColIndex, endColIndex).map(col)
// Create a new DataFrame with selected columns
val splitDataFrame = originalDF.select(selectedColumns: _*)
splitDataFrame
}
// Merge all the splitted transformed dataframes
val mergedDF = splitDataFrames.reduce((df1, df2) => df1.unionByName(df2, true))```
Expected: Number of rows should remain same both in originalDF and mergedDF
For ex:
Actual:
df1 = spark.createDataFrame([[0, 1, 2]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[3, 4, 5]], ["col3", "col4", "col5"])
df1.unionByName(df2, allowMissingColumns=True).show()
+----+----+----+----+----+----+
|col0|col1|col2|col3|col4|col5|
+----+----+----+----+----+----+
| 0| 1| 2|NULL|NULL|NULL|
|NULL|NULL|NULL| 3| 4| 5|
+----+----+----+----+----+----+
Expected:
df1 = spark.createDataFrame([[0, 1, 2]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[3, 4, 5]], ["col3", "col4", "col5"])
df1.unionByName(df2, allowMissingColumns=True).show()
+----+----+----+----+----+----+
|col0|col1|col2|col3|col4|col5|
+----+----+----+----+----+----+
| 0| 1| 2|3 |4 |5 |
+----+----+----+----+----+----+
Add one unique id column using monotonically_increasing_id
function to actual DataFrame. Using this column you can join DataFrame's back.
val inputDF = df.withColumn("id", monotonically_increasing_id())
val splitAt = 15 // You can change as per your need.
val splitColumns = inputDF.columns.init.splitAt(splitAt)
val leftColumns = splitColumns._1 ++ Seq("id")
// leftColumns are col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8, col_9, col_10, col_11, col_12, col_13, col_14, col_15, id
val rightColumns = splitColumns._2 ++ Seq("id")
// rightColumns are col_16, col_17, col_18, col_19, col_20, col_21, col_22, col_23, id
// Splitting original DataFrame
val leftDF = inputDF.selectExpr(leftColumns: _*)
val rightDF = inputDF.selectExpr(rightColumns: _*)
// Merging DataFrame's back using join
val mergedDF = leftDF.join(rightDF, "id", "inner")
Updated Code Below
val splitAt = 5
val output = df.columns.init.zipWithIndex
.grouped(splitAt)
.map { cols =>
val columns = cols.map(_._1) ++ Seq("id")
df.selectExpr(columns: _*)
}
.toList
// output list contains 5 DataFrame's as split size is 5 & total columns are 24
output.foreach{ sdf =>
sdf.printSchema()
sdf.show(false)
}
// Exiting paste mode, now interpreting.
root
|-- col_1: string (nullable = true)
|-- col_2: string (nullable = true)
|-- col_3: string (nullable = true)
|-- col_4: string (nullable = true)
|-- col_5: string (nullable = true)
|-- id: long (nullable = false)
+-----+-----+-----+-----+-----+---+
|col_1|col_2|col_3|col_4|col_5|id |
+-----+-----+-----+-----+-----+---+
|A1 |A2 |A3 |A4 |A5 |0 |
|B1 |B2 |B3 |B4 |B5 |1 |
|C1 |C2 |C3 |C4 |C5 |2 |
|D1 |D2 |D3 |D4 |D5 |3 |
|E1 |E2 |E3 |E4 |E5 |4 |
|F1 |F2 |F3 |F4 |F5 |5 |
|G1 |G2 |G3 |G4 |G5 |6 |
|H1 |H2 |H3 |H4 |H5 |7 |
|I1 |I2 |I3 |I4 |I5 |8 |
+-----+-----+-----+-----+-----+---+
root
|-- col_6: string (nullable = true)
|-- col_7: string (nullable = true)
|-- col_8: string (nullable = true)
|-- col_9: string (nullable = true)
|-- col_10: string (nullable = true)
|-- id: long (nullable = false)
+-----+-----+-----+-----+------+---+
|col_6|col_7|col_8|col_9|col_10|id |
+-----+-----+-----+-----+------+---+
|A6 |A7 |A8 |A9 |A10 |0 |
|B6 |B7 |B8 |B9 |B10 |1 |
|C6 |C7 |C8 |C9 |C10 |2 |
|D6 |D7 |D8 |D9 |D10 |3 |
|E6 |E7 |E8 |E9 |E10 |4 |
|F6 |F7 |F8 |F9 |F10 |5 |
|G6 |G7 |G8 |G9 |G10 |6 |
|H6 |H7 |H8 |H9 |H10 |7 |
|I6 |I7 |I8 |I9 |I10 |8 |
+-----+-----+-----+-----+------+---+
root
|-- col_11: string (nullable = true)
|-- col_12: string (nullable = true)
|-- col_13: string (nullable = true)
|-- col_14: string (nullable = true)
|-- col_15: string (nullable = true)
|-- id: long (nullable = false)
+------+------+------+------+------+---+
|col_11|col_12|col_13|col_14|col_15|id |
+------+------+------+------+------+---+
|A11 |A12 |A13 |A14 |A15 |0 |
|B11 |B12 |B13 |B14 |B15 |1 |
|C11 |C12 |C13 |C14 |C15 |2 |
|D11 |D12 |D13 |D14 |D15 |3 |
|E11 |E12 |E13 |E14 |E15 |4 |
|F11 |F12 |F13 |F14 |F15 |5 |
|G11 |G12 |G13 |G14 |G15 |6 |
|H11 |H12 |H13 |H14 |H15 |7 |
|I11 |I12 |I13 |I14 |I15 |8 |
+------+------+------+------+------+---+
root
|-- col_16: string (nullable = true)
|-- col_17: string (nullable = true)
|-- col_18: string (nullable = true)
|-- col_19: string (nullable = true)
|-- col_20: string (nullable = true)
|-- id: long (nullable = false)
+------+------+------+------+------+---+
|col_16|col_17|col_18|col_19|col_20|id |
+------+------+------+------+------+---+
|A16 |A17 |A18 |A19 |A20 |0 |
|B16 |B17 |B18 |B19 |B20 |1 |
|C16 |C17 |C18 |C19 |C20 |2 |
|D16 |D17 |D18 |D19 |D20 |3 |
|E16 |E17 |E18 |E19 |E20 |4 |
|F16 |F17 |F18 |F19 |F20 |5 |
|G16 |G17 |G18 |G19 |G20 |6 |
|H16 |H17 |H18 |H19 |H20 |7 |
|I16 |I17 |I18 |I19 |I20 |8 |
+------+------+------+------+------+---+
root
|-- col_21: string (nullable = true)
|-- col_22: string (nullable = true)
|-- col_23: string (nullable = true)
|-- id: long (nullable = false)
+------+------+------+---+
|col_21|col_22|col_23|id |
+------+------+------+---+
|A21 |A22 |A23 |0 |
|B21 |B22 |B23 |1 |
|C21 |C22 |C23 |2 |
|D21 |D22 |D23 |3 |
|E21 |E22 |E23 |4 |
|F21 |F22 |F23 |5 |
|G21 |G22 |G23 |6 |
|H21 |H22 |H23 |7 |
|I21 |I22 |I23 |8 |
+------+------+------+---+
output.reduce(_.join(_, "id", "inner")).show(false)
+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|id |col_1|col_2|col_3|col_4|col_5|col_6|col_7|col_8|col_9|col_10|col_11|col_12|col_13|col_14|col_15|col_16|col_17|col_18|col_19|col_20|col_21|col_22|col_23|
+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|0 |A1 |A2 |A3 |A4 |A5 |A6 |A7 |A8 |A9 |A10 |A11 |A12 |A13 |A14 |A15 |A16 |A17 |A18 |A19 |A20 |A21 |A22 |A23 |
|1 |B1 |B2 |B3 |B4 |B5 |B6 |B7 |B8 |B9 |B10 |B11 |B12 |B13 |B14 |B15 |B16 |B17 |B18 |B19 |B20 |B21 |B22 |B23 |
|2 |C1 |C2 |C3 |C4 |C5 |C6 |C7 |C8 |C9 |C10 |C11 |C12 |C13 |C14 |C15 |C16 |C17 |C18 |C19 |C20 |C21 |C22 |C23 |
|3 |D1 |D2 |D3 |D4 |D5 |D6 |D7 |D8 |D9 |D10 |D11 |D12 |D13 |D14 |D15 |D16 |D17 |D18 |D19 |D20 |D21 |D22 |D23 |
|4 |E1 |E2 |E3 |E4 |E5 |E6 |E7 |E8 |E9 |E10 |E11 |E12 |E13 |E14 |E15 |E16 |E17 |E18 |E19 |E20 |E21 |E22 |E23 |
|5 |F1 |F2 |F3 |F4 |F5 |F6 |F7 |F8 |F9 |F10 |F11 |F12 |F13 |F14 |F15 |F16 |F17 |F18 |F19 |F20 |F21 |F22 |F23 |
|6 |G1 |G2 |G3 |G4 |G5 |G6 |G7 |G8 |G9 |G10 |G11 |G12 |G13 |G14 |G15 |G16 |G17 |G18 |G19 |G20 |G21 |G22 |G23 |
|7 |H1 |H2 |H3 |H4 |H5 |H6 |H7 |H8 |H9 |H10 |H11 |H12 |H13 |H14 |H15 |H16 |H17 |H18 |H19 |H20 |H21 |H22 |H23 |
|8 |I1 |I2 |I3 |I4 |I5 |I6 |I7 |I8 |I9 |I10 |I11 |I12 |I13 |I14 |I15 |I16 |I17 |I18 |I19 |I20 |I21 |I22 |I23 |
+---+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+