I have a dataframe with a column named attributes, and its type is
|-- attributes: string (nullable = true)
In that column I have values like the following:
[
{'name': 'sfcc.created_by', 'type': 'STRING', 'value': 'DummyUser'},
{'name': 'shippingLines', 'type': 'JSON', 'value': {'code': 'DUMMYCODE', 'price': '0', 'discountedPrice': '0'}},
{'name': 'sourceChannel', 'type': 'STRING', 'value': 'dummyOS'},
{'name': 'leaveAtDoor', 'type': 'BOOLEAN', 'value': False},
{'name': 'actualCreateDate', 'type': 'STRING', 'value': '2024-01-01T00:00:00.000Z'},
{'name': 'orderCompleteDate', 'type': 'STRING', 'value': '2024-01-02T00:00:00.000Z'},
{'name': 'sfcc.order_date', 'type': 'STRING', 'value': '2024-01-01T00:00:00.000Z'},
{'name': 'orderSuccessfullyCreatedInSAP', 'type': 'BOOLEAN', 'value': True},
{'name': 'splitPayments', 'type': 'JSON', 'value': [{'price': '100', 'gateway': 'DUMMYGATEWAY', 'paymentGatewayName': 'DUMMY_CARD'}]},
{'name': 'sfcc.customer_locale', 'type': 'STRING', 'value': 'en_US'},
{'name': 'customerId', 'type': 'STRING', 'value': '000DummyID0000'},
{'name': 'financialStatus', 'type': 'STRING', 'value': 'DUMMYSTATUS'},
{'name': 'addressLine1', 'type': 'STRING', 'value': 'Dummy Street 1'},
{'name': 'sourceLocation', 'type': 'STRING', 'value': 'DUMMYWEBSITE'},
{'name': 'addressLine2', 'type': 'STRING', 'value': 'Dummy Street 2'},
{'name': 'subtotalPrice', 'type': 'STRING', 'value': '100'},
{'name': 'carrierId', 'type': 'STRING', 'value': 'DummyCarrier'},
{'name': 'isCancelable', 'type': 'BOOLEAN', 'value': False},
{'name': 'consignments', 'type': 'JSON', 'value': ['0000-0000-0000-0000']},
{'name': 'sfcc.shipments', 'type': 'JSON', 'value': {'shipments': [{'items': [{'skuRef': '0000000000', 'quantity': 1}], 'isGift': False, 'shipmentId': '00000000', 'giftMessage': None, 'shippingAddress': {'area': 'Dummy Area', 'city': 'Dummy City', 'name': 'John Doe', 'phone': '+00000000000', 'street': 'Dummy Street', 'country': 'XX', 'lastName': 'Doe', 'postcode': '00000', 'firstName': 'John'}, 'shippingMethodRef': 'XXX000'}]}}
]
I want to get each attribute as a separate column, like this:
sfcc.created_by shippingLines
DummyUser {'code': 'DUMMYCODE', 'price': '0', 'discountedPrice': '0'}
from pyspark.sql.functions import (
regexp_replace,
from_json,
schema_of_json,
map_from_entries,
col,
expr,
explode,
)
# Create the dataframe.
data = [
(
"""[{"name": "sfcc.created_by", "type": "STRING", "value": "XXXXX"},
{'name': 'shippingLines', 'type': 'JSON', 'value': {'code': 'XXXXX', 'price': '0', 'discountedPrice': '0'}},
{'name': 'sourceChannel', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'leaveAtDoor', 'type': 'BOOLEAN', 'value': False},
{'name': 'actualCreateDate', 'type': 'STRING', 'value': '2020-04-13T14:02:21.529Z'},
{'name': 'orderCompleteDate', 'type': 'STRING', 'value': '2020-04-15T11:16:17.086Z'},
{'name': 'sfcc.order_date', 'type': 'STRING', 'value': '2020-04-13T14:01:36.000Z'},
{'name': 'orderSuccessfullyCreatedInSAP', 'type': 'BOOLEAN', 'value': True},
{'name': 'splitPayments', 'type': 'JSON', 'value': [{'price': 'XXXXX', 'gateway': 'XXXXX', 'paymentGatewayName': 'XXXXX'}]},
{'name': 'sfcc.customer_locale', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'customerId', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'financialStatus', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'addressLine1', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'sourceLocation', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'addressLine2', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'subtotalPrice', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'carrierId', 'type': 'STRING', 'value': 'XXXXX'},
{'name': 'isCancelable', 'type': 'BOOLEAN', 'value': False},
{'name': 'consignments', 'type': 'JSON', 'value': ['XXXXX']},
{'name': 'sfcc.shipments', 'type': 'JSON', 'value': {'shipments': [{'items': [{'skuRef': 'XXXXX', 'quantity': 1}], 'isGift': False, 'shipmentId': 'XXXXX', 'giftMessage': None, 'shippingAddress': {'area': 'Other', 'city': 'XXXXX', 'name': 'XXXXX', 'phone': '+910000000', 'street': 'XXXXX', 'country': 'XXXXX', 'lastName': 'XXXXX', 'postcode': None, 'firstName': 'XXXXX'}, 'shippingMethodRef': 'XXXXX'}]}}
]
""",
),
(
"""[{"name": "sfcc.created_by", "type": "STRING", "value": "AAAAA"},
{'name': 'shippingLines', 'type': 'JSON', 'value': {'code': 'BBBBB', 'price': '0', 'discountedPrice': '0'}},
{'name': 'sourceChannel', 'type': 'STRING', 'value': 'CCCC'},
{'name': 'leaveAtDoor', 'type': 'BOOLEAN', 'value': False},
{'name': 'actualCreateDate', 'type': 'STRING', 'value': '2020-04-13T14:02:21.529Z'}
]
""",
)
]
columns = ["attributes"]
df = spark.createDataFrame(data, schema=columns)
EDIT: The sample data is not valid JSON (it uses the Python literals False, True and None), so it must be cleaned up first. The rest of the code works as is.
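# Replace the Python literals (False/True/None) with their valid JSON equivalents.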
df = (
df.withColumn("attributes", regexp_replace("attributes", "False", "false"))
.withColumn("attributes", regexp_replace("attributes", "True", "true"))
.withColumn("attributes", regexp_replace("attributes", "None", "null"))
)
Assuming that your data is in the dataframe df, you can parse the data into a Map column:
# Parse the string data as a JSON and then into a MAP type.
df = (
df.withColumn(
"attributes",
from_json(
col("attributes"), schema_of_json(df.select(col("attributes")).first()[0])
),
)
.withColumn(
"attributes", expr("transform(attributes, x -> struct(x.name, x.value))")
)
.withColumn("attributes", map_from_entries("attributes"))
)
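At this point attributes should be a single map column. Because the value field mixes strings, booleans and nested JSON, the inferred value type falls back to string, so every map value is stored as a string. You can verify this with:
# Optional sanity check: attributes should now be map<string,string>.
df.printSchema()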
Once you have the data in a Map type column, you can extract the keys and then select them as individual columns:
# Extract all possible keys from the dataframe.
keys = (
df.select(explode("attributes"))
.select("key")
.distinct()
.rdd.flatMap(lambda x: x)
.collect()
)
# Create the desired dataframe.
df = df.select(*[col("attributes").getItem(k).alias(k) for k in keys])
df.show()
Result:
+--------------------+------------+---------+--------------------+-------------+-----------------------------+--------------------+--------------------+--------------------+------------+---------------+--------------+--------------------+----------+---------------+--------------------+-------------+------------+-----------+------------+
|sfcc.customer_locale|isCancelable|carrierId| orderCompleteDate|subtotalPrice|orderSuccessfullyCreatedInSAP| actualCreateDate| sfcc.order_date| splitPayments|consignments|financialStatus|sourceLocation| sfcc.shipments|customerId|sfcc.created_by| shippingLines|sourceChannel|addressLine2|leaveAtDoor|addressLine1|
+--------------------+------------+---------+--------------------+-------------+-----------------------------+--------------------+--------------------+--------------------+------------+---------------+--------------+--------------------+----------+---------------+--------------------+-------------+------------+-----------+------------+
| XXXXX| false| XXXXX|2020-04-15T11:16:...| XXXXX| true|2020-04-13T14:02:...|2020-04-13T14:01:...|[{"price":"XXXXX"...| ["XXXXX"]| XXXXX| XXXXX|{"shipments":[{"i...| XXXXX| XXXXX|{"code":"XXXXX","...| XXXXX| XXXXX| false| XXXXX|
| null| null| null| null| null| null|2020-04-13T14:02:...| null| null| null| null| null| null| null| AAAAA|{"code":"BBBBB","...| CCCC| null| false| null|
+--------------------+------------+---------+--------------------+-------------+-----------------------------+--------------------+--------------------+--------------------+------------+---------------+--------------+--------------------+----------+---------------+--------------------+-------------+------------+-----------+------------+
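Note that JSON-typed attributes such as shippingLines and sfcc.shipments stay as JSON strings in the result. If you need them as structs, you can apply the same from_json/schema_of_json pattern to the extracted columns. A minimal sketch, assuming shippingLines contains at least one non-null value to infer the schema from:
# Optional: parse a JSON-typed attribute (here: shippingLines) into a struct.
shipping_schema = schema_of_json(
    df.select("shippingLines").where(col("shippingLines").isNotNull()).first()[0]
)
df = df.withColumn("shippingLines", from_json(col("shippingLines"), shipping_schema))
df.select("shippingLines").show(truncate=False)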