Search code examples
pythonjsonapache-sparkpysparkapache-spark-sql

PySpark : How to merge two json columns to new column


Pyspark Table:

Col1    
{'table': [{'name': 'XXS',
    'ranges': {'chestc': {'min': 87.88, 'max': 87.88},
     'waistc': {'min': 58.42, 'max': 58.42}}},
   {'name': 'XS',
    'ranges': {'chestc': {'min': 94.22, 'max': 94.22},
     'waistc': {'min': 66.04, 'max': 66.04}}},
   {'name': 'S',
    'ranges': {'chestc': {'min': 100.58, 'max': 100.58},
     'waistc': {'min': 73.66, 'max': 73.66}}},
   {'name': 'M',
    'ranges': {'chestc': {'min': 106.92, 'max': 106.92},
     'waistc': {'min': 81.28, 'max': 81.28}}},
   {'name': 'L',
    'ranges': {'chestc': {'min': 114.54, 'max': 114.54},
     'waistc': {'min': 93.98, 'max': 93.98}}},
   {'name': 'XL',
    'ranges': {'chestc': {'min': 122.16, 'max': 122.16},
     'waistc': {'min': 106.68, 'max': 106.68}}},
   {'name': 'XXL',
    'ranges': {'chestc': {'min': 131.06, 'max': 131.06},
     'waistc': {'min': 121.92, 'max': 121.92}}}],
  'measurement_system': 'metric'}

Col2

{
  "gender": "male",
  "measurement_system": "metric",
  "measurements": {
    "height": 178,
    "weight": 99
  }
}

enter image description here

I need create Col3 like :

{params:{'table': [{'name': 'XXS',
    'ranges': {'chestc': {'min': 87.88, 'max': 87.88},
     'waistc': {'min': 58.42, 'max': 58.42}}},
   {'name': 'XS',
    'ranges': {'chestc': {'min': 94.22, 'max': 94.22},
     'waistc': {'min': 66.04, 'max': 66.04}}},
   {'name': 'S',
    'ranges': {'chestc': {'min': 100.58, 'max': 100.58},
     'waistc': {'min': 73.66, 'max': 73.66}}},
   {'name': 'M',
    'ranges': {'chestc': {'min': 106.92, 'max': 106.92},
     'waistc': {'min': 81.28, 'max': 81.28}}},
   {'name': 'L',
    'ranges': {'chestc': {'min': 114.54, 'max': 114.54},
     'waistc': {'min': 93.98, 'max': 93.98}}},
   {'name': 'XL',
    'ranges': {'chestc': {'min': 122.16, 'max': 122.16},
     'waistc': {'min': 106.68, 'max': 106.68}}},
   {'name': 'XXL',
    'ranges': {'chestc': {'min': 131.06, 'max': 131.06},
     'waistc': {'min': 121.92, 'max': 121.92}}}],
  'measurement_system': 'metric'},
  "gender": "male",
  "measurement_system": "metric",
  "measurements": {
    "height": 178,
    "weight": 99
  }
}

When I try :

table = table.withColumn(
    "params",
    F.struct(
        F.col("Col1"),
        F.col("Col2")
    )
)

I get Row object like :

Row(Col1='{"table":[{"name":"26W/30L","ranges":{"height":{"min":120.000000,"max":175.000000},"hipc":{"min":80.000000,"max":86.000000}}},{"name":"28W/32L","ranges":{"height":{"min":175.000000,"max":220.000000},"hipc":{"min":88.000000,"max":90.000000}}},{"name":"31W/32L","ranges":{"height":{"min":175.000000,"max":220.000000},"hipc":{"min":98.000000,"max":99.000000}}},{"name":"34W/30L","ranges":{"height":{"min":120.000000,"max":175.000000},"hipc":{"min":103.000000,"max":106.500000}}},{"name":"38W/30L","ranges":{"height":{"min":120.000000,"max":175.000000},"hipc":{"min":115.000000,"max":120.000000}}},{"name":"26W/32L","ranges":{"height":{"min":175.000000,"max":220.000000},"hipc{"min":80.000000,"max":86.000000}}}}],"measurement_system":"metric"}',
Col2='{"gender":"male","measurement_system":"metric","measurements":{"height":150,"weight":50}}')

**This example of row

and when I try convert to JSON:

table = table.withColumn(
    "params",
    F.to_json(
        F.struct(
            F.col("Col1"),
            F.col("Col2")
        )
    )
)

I get json with slashes like:

{
  "params": {
    "Col1": "{\"table\":[{\"name\":\"XXS\",\"ranges\":{\"height\":{\"min\":120.000000,\"max\":175.000000},\"hipc\":{\"min\":80.000000,\"max\":86.000000}}},{\"name\":\"XS\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":88.000000,\"max\":90.000000}}},{\"name\":\"M\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":98.000000,\"max\":99.000000}}},{\"name\":\"S\",\"ranges\":{\"height\":{\"min\":120.000000,\"max\":175.000000},\"hipc\":{\"min\":103.000000,\"max\":106.500000}}},{\"name\":\"L\",\"ranges\":{\"height\":{\"min\":120.000000,\"max\":175.000000},\"hipc\":{\"min\":115.000000,\"max\":120.000000}}},{\"name\":\"XL\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":80.000000,\"max\":86.000000}}},{\"name\":\"XXL\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":86.000000,\"max\":88.000000}}}}}],\"measurement_system\":\"metric\"}",
    "Col2": "{\"gender\":\"male\",\"measurement_system\":\"metric\",\"measurements\":{\"height\":150,\"weight\":50}"
  }
}

The challenge is not to use the UDF or any Python non-Pyspark functions or functions like collect(). I need to stay in Pyspark.

How I can get a column in normal JSON format?


Solution

  • This would work:

    # Take first row to infer schema
    first_row = df.first()
    col_1_schema = spark.read.json(sc.parallelize([first_row[('col1')]])).schema
    col_2_schema = spark.read.json(sc.parallelize([first_row[('col2')]])).schema
    
    
    df=df.withColumn("col1", F.from_json("col1", col_1_schema))\
    .withColumn("col2", F.from_json("col2", col_2_schema))\
    .select(F.struct(F.col("col1.*"),F.col("col2.*")).alias("params"))
    
    # Check Schema
    df.printSchema()
    
    # Convert struct back to string JSON
    df.select(F.to_json(F.struct(F.col("col1.*"),F.col("col2.*")).alias("params")).alias("asJson"))\
    .show(truncate=False)
    

    Input:

    +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
    |col1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |col2                                                                                     |
    +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
    |{"table":[{"name":"XXS","ranges":{"chestc":{"min":87.88,"max":87.88},"waistc":{"min":58.42,"max":58.42}}},{"name":"XS","ranges":{"chestc":{"min":94.22,"max":94.22},"waistc":{"min":66.04,"max":66.04}}},{"name":"S","ranges":{"chestc":{"min":100.58,"max":100.58},"waistc":{"min":73.66,"max":73.66}}},{"name":"M","ranges":{"chestc":{"min":106.92,"max":106.92},"waistc":{"min":81.28,"max":81.28}}},{"name":"L","ranges":{"chestc":{"min":114.54,"max":114.54},"waistc":{"min":93.98,"max":93.98}}},{"name":"XL","ranges":{"chestc":{"min":122.16,"max":122.16},"waistc":{"min":106.68,"max":106.68}}},{"name":"XXL","ranges":{"chestc":{"min":131.06,"max":131.06},"waistc":{"min":121.92,"max":121.92}}}],"measurement_system":"metric"}|{"gender":"male","measurement_system":"metric","measurements":{"height":150,"weight":50}}|
    +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
    

    Output:

    Output Schema:

    root
    
    
     |-- params: struct (nullable = false)
     |    |-- measurement_system: string (nullable = true)
     |    |-- table: array (nullable = true)
     |    |    |-- element: struct (containsNull = true)
     |    |    |    |-- name: string (nullable = true)
     |    |    |    |-- ranges: struct (nullable = true)
     |    |    |    |    |-- chestc: struct (nullable = true)
     |    |    |    |    |    |-- max: double (nullable = true)
     |    |    |    |    |    |-- min: double (nullable = true)
     |    |    |    |    |-- waistc: struct (nullable = true)
     |    |    |    |    |    |-- max: double (nullable = true)
     |    |    |    |    |    |-- min: double (nullable = true)
     |    |-- gender: string (nullable = true)
     |    |-- measurement_system: string (nullable = true)
     |    |-- measurements: struct (nullable = true)
     |    |    |-- height: long (nullable = true)
     |    |    |-- weight: long (nullable = true)
    

    Final DF (with String JSON):

    +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |asJson                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
    +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |{"measurement_system":"metric","table":[{"name":"XXS","ranges":{"chestc":{"max":87.88,"min":87.88},"waistc":{"max":58.42,"min":58.42}}},{"name":"XS","ranges":{"chestc":{"max":94.22,"min":94.22},"waistc":{"max":66.04,"min":66.04}}},{"name":"S","ranges":{"chestc":{"max":100.58,"min":100.58},"waistc":{"max":73.66,"min":73.66}}},{"name":"M","ranges":{"chestc":{"max":106.92,"min":106.92},"waistc":{"max":81.28,"min":81.28}}},{"name":"L","ranges":{"chestc":{"max":114.54,"min":114.54},"waistc":{"max":93.98,"min":93.98}}},{"name":"XL","ranges":{"chestc":{"max":122.16,"min":122.16},"waistc":{"max":106.68,"min":106.68}}},{"name":"XXL","ranges":{"chestc":{"max":131.06,"min":131.06},"waistc":{"max":121.92,"min":121.92}}}],"gender":"male","measurement_system":"metric","measurements":{"height":150,"weight":50}}|
    +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+