Search code examples
pythonpyspark

Creating List of Dictionaries from a Pyspark Dataframe


I have a data frame which looks like this:

df1 = spark.createDataFrame(
    [ 
        (1, "google", "1", "search_engine"), 
        (1, "pinterest", "1", "social_media"), 
        (1, "yahoo", "2", "search_engine"),
        (1, "youtube", "2", "social_media")
    ],
    ["id", "social_media", "rank", "types"]
)

df1.show(truncate=False)

+---+------------+----+-------------+
|id |social_media|rank|types        |
+---+------------+----+-------------+
|1  |google      |1   |search_engine|
|1  |pinterest   |1   |social_media |
|1  |yahoo       |2   |search_engine|
|1  |youtube     |2   |social_media |
+---+------------+----+-------------+

I want a 2 column output with one as id and the other as list of dictionaries like below:

1, [{'rank':1,'search engine':'google','social media': 'pinterest'},

{'rank':2,'search engine':'yahoo','social media': 'youtube'}]

Closest I got is the following:

import pyspark
import pyspark.sql.functions as F

df1.groupBy("id").agg(
    F.map_from_entries(
        F.collect_list(
            F.struct(F.struct("types", "social_media"), F.struct(F.lit("rank") , "rank")))).alias("key_value")
).show(truncate=False)
id key_value
{{search_engine, google} -> {rank, 1}, {social_media, pinterest} -> {rank, 1}, {search_engine, yahoo} -> {rank, 2}, {social_media, youtube} -> {rank, 2}}

Any help is greatly appreciated. Thank you!!


Solution

  • One way to do it is this

    df2 = df1.groupBy("id", "rank").agg(F.collect_list(F.struct("types", "social_media")).alias("agg_col"))
    
    
    # Using MapType 
    def map_fn(rank, agg_cols):
        result = {"rank": rank}
        for agg in agg_cols:
            result[agg[0]] = agg[1]
        return result
    
    map_udf = F.udf(map_fn, MapType(StringType(), StringType()))
    df2 = df2.withColumn("agg_map", map_udf(df2.rank, df2.agg_col))
    df2 = df2.groupBy("id").agg(F.collect_list("agg_map").alias("agg_map_list"))
    df2.show(truncate=False)
    
    
    +---+-------------------------------------------------------------------------------------------------------------------------------+
    |id |agg_map_list                                                                                                                   |
    +---+-------------------------------------------------------------------------------------------------------------------------------+
    |1  |[{rank -> 1, search_engine -> google, social_media -> pinterest}, {rank -> 2, search_engine -> yahoo, social_media -> youtube}]|
    +---+-------------------------------------------------------------------------------------------------------------------------------+