I have a data frame which looks like this:
df1 = spark.createDataFrame(
[
(1, "google", "1", "search_engine"),
(1, "pinterest", "1", "social_media"),
(1, "yahoo", "2", "search_engine"),
(1, "youtube", "2", "social_media")
],
["id", "social_media", "rank", "types"]
)
df1.show(truncate=False)
+---+------------+----+-------------+
|id |social_media|rank|types |
+---+------------+----+-------------+
|1 |google |1 |search_engine|
|1 |pinterest |1 |social_media |
|1 |yahoo |2 |search_engine|
|1 |youtube |2 |social_media |
+---+------------+----+-------------+
I want a 2 column output with one as id and the other as list of dictionaries like below:
1, [{'rank':1,'search engine':'google','social media': 'pinterest'},
{'rank':2,'search engine':'yahoo','social media': 'youtube'}]
Closest I got is the following:
import pyspark
import pyspark.sql.functions as F
df1.groupBy("id").agg(
F.map_from_entries(
F.collect_list(
F.struct(F.struct("types", "social_media"), F.struct(F.lit("rank") , "rank")))).alias("key_value")
).show(truncate=False)
id | key_value |
---|---|
1 | {{search_engine, google} -> {rank, 1}, {social_media, pinterest} -> {rank, 1}, {search_engine, yahoo} -> {rank, 2}, {social_media, youtube} -> {rank, 2}} |
Any help is greatly appreciated. Thank you!!
One way to do it is this
df2 = df1.groupBy("id", "rank").agg(F.collect_list(F.struct("types", "social_media")).alias("agg_col"))
# Using MapType
def map_fn(rank, agg_cols):
result = {"rank": rank}
for agg in agg_cols:
result[agg[0]] = agg[1]
return result
map_udf = F.udf(map_fn, MapType(StringType(), StringType()))
df2 = df2.withColumn("agg_map", map_udf(df2.rank, df2.agg_col))
df2 = df2.groupBy("id").agg(F.collect_list("agg_map").alias("agg_map_list"))
df2.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------+
|id |agg_map_list |
+---+-------------------------------------------------------------------------------------------------------------------------------+
|1 |[{rank -> 1, search_engine -> google, social_media -> pinterest}, {rank -> 2, search_engine -> yahoo, social_media -> youtube}]|
+---+-------------------------------------------------------------------------------------------------------------------------------+