Pyspark Table:
Col1
{'table': [{'name': 'XXS',
'ranges': {'chestc': {'min': 87.88, 'max': 87.88},
'waistc': {'min': 58.42, 'max': 58.42}}},
{'name': 'XS',
'ranges': {'chestc': {'min': 94.22, 'max': 94.22},
'waistc': {'min': 66.04, 'max': 66.04}}},
{'name': 'S',
'ranges': {'chestc': {'min': 100.58, 'max': 100.58},
'waistc': {'min': 73.66, 'max': 73.66}}},
{'name': 'M',
'ranges': {'chestc': {'min': 106.92, 'max': 106.92},
'waistc': {'min': 81.28, 'max': 81.28}}},
{'name': 'L',
'ranges': {'chestc': {'min': 114.54, 'max': 114.54},
'waistc': {'min': 93.98, 'max': 93.98}}},
{'name': 'XL',
'ranges': {'chestc': {'min': 122.16, 'max': 122.16},
'waistc': {'min': 106.68, 'max': 106.68}}},
{'name': 'XXL',
'ranges': {'chestc': {'min': 131.06, 'max': 131.06},
'waistc': {'min': 121.92, 'max': 121.92}}}],
'measurement_system': 'metric'}
Col2
{
"gender": "male",
"measurement_system": "metric",
"measurements": {
"height": 178,
"weight": 99
}
}
I need create Col3 like :
{params:{'table': [{'name': 'XXS',
'ranges': {'chestc': {'min': 87.88, 'max': 87.88},
'waistc': {'min': 58.42, 'max': 58.42}}},
{'name': 'XS',
'ranges': {'chestc': {'min': 94.22, 'max': 94.22},
'waistc': {'min': 66.04, 'max': 66.04}}},
{'name': 'S',
'ranges': {'chestc': {'min': 100.58, 'max': 100.58},
'waistc': {'min': 73.66, 'max': 73.66}}},
{'name': 'M',
'ranges': {'chestc': {'min': 106.92, 'max': 106.92},
'waistc': {'min': 81.28, 'max': 81.28}}},
{'name': 'L',
'ranges': {'chestc': {'min': 114.54, 'max': 114.54},
'waistc': {'min': 93.98, 'max': 93.98}}},
{'name': 'XL',
'ranges': {'chestc': {'min': 122.16, 'max': 122.16},
'waistc': {'min': 106.68, 'max': 106.68}}},
{'name': 'XXL',
'ranges': {'chestc': {'min': 131.06, 'max': 131.06},
'waistc': {'min': 121.92, 'max': 121.92}}}],
'measurement_system': 'metric'},
"gender": "male",
"measurement_system": "metric",
"measurements": {
"height": 178,
"weight": 99
}
}
When I try :
table = table.withColumn(
"params",
F.struct(
F.col("Col1"),
F.col("Col2")
)
)
I get Row object like :
Row(Col1='{"table":[{"name":"26W/30L","ranges":{"height":{"min":120.000000,"max":175.000000},"hipc":{"min":80.000000,"max":86.000000}}},{"name":"28W/32L","ranges":{"height":{"min":175.000000,"max":220.000000},"hipc":{"min":88.000000,"max":90.000000}}},{"name":"31W/32L","ranges":{"height":{"min":175.000000,"max":220.000000},"hipc":{"min":98.000000,"max":99.000000}}},{"name":"34W/30L","ranges":{"height":{"min":120.000000,"max":175.000000},"hipc":{"min":103.000000,"max":106.500000}}},{"name":"38W/30L","ranges":{"height":{"min":120.000000,"max":175.000000},"hipc":{"min":115.000000,"max":120.000000}}},{"name":"26W/32L","ranges":{"height":{"min":175.000000,"max":220.000000},"hipc{"min":80.000000,"max":86.000000}}}}],"measurement_system":"metric"}',
Col2='{"gender":"male","measurement_system":"metric","measurements":{"height":150,"weight":50}}')
**This example of row
and when I try convert to JSON:
table = table.withColumn(
"params",
F.to_json(
F.struct(
F.col("Col1"),
F.col("Col2")
)
)
)
I get json with slashes like:
{
"params": {
"Col1": "{\"table\":[{\"name\":\"XXS\",\"ranges\":{\"height\":{\"min\":120.000000,\"max\":175.000000},\"hipc\":{\"min\":80.000000,\"max\":86.000000}}},{\"name\":\"XS\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":88.000000,\"max\":90.000000}}},{\"name\":\"M\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":98.000000,\"max\":99.000000}}},{\"name\":\"S\",\"ranges\":{\"height\":{\"min\":120.000000,\"max\":175.000000},\"hipc\":{\"min\":103.000000,\"max\":106.500000}}},{\"name\":\"L\",\"ranges\":{\"height\":{\"min\":120.000000,\"max\":175.000000},\"hipc\":{\"min\":115.000000,\"max\":120.000000}}},{\"name\":\"XL\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":80.000000,\"max\":86.000000}}},{\"name\":\"XXL\",\"ranges\":{\"height\":{\"min\":175.000000,\"max\":220.000000},\"hipc\":{\"min\":86.000000,\"max\":88.000000}}}}}],\"measurement_system\":\"metric\"}",
"Col2": "{\"gender\":\"male\",\"measurement_system\":\"metric\",\"measurements\":{\"height\":150,\"weight\":50}"
}
}
The challenge is not to use the UDF or any Python non-Pyspark functions or functions like collect(). I need to stay in Pyspark.
How I can get a column in normal JSON format?
This would work:
# Take first row to infer schema
first_row = df.first()
col_1_schema = spark.read.json(sc.parallelize([first_row[('col1')]])).schema
col_2_schema = spark.read.json(sc.parallelize([first_row[('col2')]])).schema
df=df.withColumn("col1", F.from_json("col1", col_1_schema))\
.withColumn("col2", F.from_json("col2", col_2_schema))\
.select(F.struct(F.col("col1.*"),F.col("col2.*")).alias("params"))
# Check Schema
df.printSchema()
# Convert struct back to string JSON
df.select(F.to_json(F.struct(F.col("col1.*"),F.col("col2.*")).alias("params")).alias("asJson"))\
.show(truncate=False)
Input:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|col1 |col2 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|{"table":[{"name":"XXS","ranges":{"chestc":{"min":87.88,"max":87.88},"waistc":{"min":58.42,"max":58.42}}},{"name":"XS","ranges":{"chestc":{"min":94.22,"max":94.22},"waistc":{"min":66.04,"max":66.04}}},{"name":"S","ranges":{"chestc":{"min":100.58,"max":100.58},"waistc":{"min":73.66,"max":73.66}}},{"name":"M","ranges":{"chestc":{"min":106.92,"max":106.92},"waistc":{"min":81.28,"max":81.28}}},{"name":"L","ranges":{"chestc":{"min":114.54,"max":114.54},"waistc":{"min":93.98,"max":93.98}}},{"name":"XL","ranges":{"chestc":{"min":122.16,"max":122.16},"waistc":{"min":106.68,"max":106.68}}},{"name":"XXL","ranges":{"chestc":{"min":131.06,"max":131.06},"waistc":{"min":121.92,"max":121.92}}}],"measurement_system":"metric"}|{"gender":"male","measurement_system":"metric","measurements":{"height":150,"weight":50}}|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
Output:
Output Schema:
root
|-- params: struct (nullable = false)
| |-- measurement_system: string (nullable = true)
| |-- table: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- name: string (nullable = true)
| | | |-- ranges: struct (nullable = true)
| | | | |-- chestc: struct (nullable = true)
| | | | | |-- max: double (nullable = true)
| | | | | |-- min: double (nullable = true)
| | | | |-- waistc: struct (nullable = true)
| | | | | |-- max: double (nullable = true)
| | | | | |-- min: double (nullable = true)
| |-- gender: string (nullable = true)
| |-- measurement_system: string (nullable = true)
| |-- measurements: struct (nullable = true)
| | |-- height: long (nullable = true)
| | |-- weight: long (nullable = true)
Final DF (with String JSON):
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|asJson |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"measurement_system":"metric","table":[{"name":"XXS","ranges":{"chestc":{"max":87.88,"min":87.88},"waistc":{"max":58.42,"min":58.42}}},{"name":"XS","ranges":{"chestc":{"max":94.22,"min":94.22},"waistc":{"max":66.04,"min":66.04}}},{"name":"S","ranges":{"chestc":{"max":100.58,"min":100.58},"waistc":{"max":73.66,"min":73.66}}},{"name":"M","ranges":{"chestc":{"max":106.92,"min":106.92},"waistc":{"max":81.28,"min":81.28}}},{"name":"L","ranges":{"chestc":{"max":114.54,"min":114.54},"waistc":{"max":93.98,"min":93.98}}},{"name":"XL","ranges":{"chestc":{"max":122.16,"min":122.16},"waistc":{"max":106.68,"min":106.68}}},{"name":"XXL","ranges":{"chestc":{"max":131.06,"min":131.06},"waistc":{"max":121.92,"min":121.92}}}],"gender":"male","measurement_system":"metric","measurements":{"height":150,"weight":50}}|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+