I'm new to PySpark RDDs and have a DataFrame obtained from a JSON file; its first row looks like this:
Row(created_at='2021-05-05 23:37:51', hash_tags=None, id=1390088382659895296, replyto_id=None, replyto_user_id=None, retweet_id=1390027514332991489, retweet_user_id=807095, text='RT @nytimes: Breaking News: The Biden administration will support lifting patent protections for Covid-19 vaccines, a breakthrough for glob…', user_id=17799542, user_mentions=[Row(id=807095, indices=[3, 11])])
This is all of my code:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

data_rdd = spark.read.option("multiline", "true") \
    .json("tweets.json")
print(data_rdd.collect()[0])

# Key by user_id (index 8); value is [replyto_user_id, retweet_user_id] (indices 4 and 6)
def extractColumns(record):
    return (record[8], [record[4], record[6]])

data_frame = data_rdd.rdd.map(extractColumns) \
    .groupByKey() \
    .map(lambda r: (r[0], list(r[1])))
This gives me an RDD of the form:
[(17799542, [[None, 807095]]),
...
(3094649957, [[None, 3094649957], [None, None], [None, 3094649957], [None, None], [None, 3094649957], [None, None]])]
How can I eliminate the None values to get the following?
[(17799542, [807095]),
...
(3094649957, [3094649957, 3094649957, 3094649957])]
I've tried the code below, but it doesn't work: the filter(...) call tangles a lambda with a generator expression, mapValues passes only the value (the list of lists) to the function rather than the whole (key, value) pair, and the result is never assigned (RDD transformations return a new RDD instead of modifying data_frame in place):

def eliminateNone(record):
    s = list(filter(lambda s: each != None for each in s))
    return (record[0], s)

data_frame.mapValues(eliminateNone)
print(data_frame.collect())
I'm grateful for any help.
You can try this. Note that the question's sample output pairs 17799542 with 807095, which is retweet_user_id, so extractColumns must pick indices 4 and 6 (replyto_user_id and retweet_user_id), not 3 and 5:

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

data_rdd = spark.read.option("multiline", "true") \
    .json("tweets.json")
print(data_rdd.collect()[0])

# Key by user_id (index 8); value is [replyto_user_id, retweet_user_id] (indices 4 and 6)
def extractColumns(record):
    return (record[8], [record[4], record[6]])

# Flatten each grouped list of pairs and drop the None entries
def merge_values(data):
    result = []
    for l in data:
        for x in l:
            if x is not None:
                result.append(x)
    return result

data_frame = data_rdd.rdd.map(extractColumns) \
    .groupByKey() \
    .map(lambda r: (r[0], list(r[1])))
data_frame = data_frame.mapValues(merge_values)
print(data_frame.collect())
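merge_values flattens each grouped list of [replyto_user_id, retweet_user_id] pairs and discards the Nones, so the final collect() prints pairs in the shape you asked for:

[(17799542, [807095]),
 ...
 (3094649957, [3094649957, 3094649957, 3094649957])]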
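If you prefer to avoid the explicit Python loop, a more RDD-idiomatic sketch (assuming the same tweets.json file and the same extractColumns indices as above) is to expand each pair with flatMapValues and filter out the Nones before grouping:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()
data_rdd = spark.read.option("multiline", "true").json("tweets.json")

def extractColumns(record):
    return (record[8], [record[4], record[6]])

# Expand (key, [a, b]) into (key, a) and (key, b), drop None values, then regroup
data_frame = data_rdd.rdd.map(extractColumns) \
    .flatMapValues(lambda pair: pair) \
    .filter(lambda kv: kv[1] is not None) \
    .groupByKey() \
    .mapValues(list)
print(data_frame.collect())

One behavioral difference to be aware of: a user whose values are all None disappears from this result entirely, whereas the merge_values version keeps that key with an empty list.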