I am new to AWS Glue. I am trying to read a CSV file and transform it into a JSON object. As far as I have seen, the approach is to read the CSV via a crawler, convert it to a PySpark DataFrame, and then convert that to a JSON object. So far, I have converted the data to a JSON object. Now I need to write this JSON back to an S3 bucket. How can I do that?
#########################################
### IMPORT LIBRARIES AND SET VARIABLES
#########################################
#Import python modules
from datetime import datetime
#Import pyspark modules
from pyspark.context import SparkContext
import pyspark.sql.functions as f
#Import glue modules
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
import json
import boto3
# Glue Data Catalog coordinates of the source table
glue_db = "umesh-db"
glue_tbl = "read"
# Spark / Glue entry points (getOrCreate reuses a running SparkContext if any)
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)
session = glue_context.spark_session
# boto3 handle for S3
s3_source = boto3.resource("s3")
#########################################
### EXTRACT (READ DATA)
#########################################
# Load the catalog table into a Glue DynamicFrame
dynamic_frame_read = glue_context.create_dynamic_frame.from_catalog(
    database=glue_db,
    table_name=glue_tbl,
)
# Switch to a Spark DataFrame so the regular pyspark API is available
data_frame = dynamic_frame_read.toDF()
# Print a preview of the DataFrame
print("Showing Df data")
data_frame.show()
### Convert the DF to JSON
# toJSON() yields one JSON-encoded string per row.
jsonContent = data_frame.toJSON()
jsonValue = {}
arrraYObj = []
# NOTE: collect() pulls every row to the driver, so this only suits data
# that fits in driver memory.
for row in jsonContent.collect():
    print("Row data ", row)
    # Parse the row so the result nests real objects instead of JSON strings
    arrraYObj.append(json.loads(row))
print("Array Obj", arrraYObj)
jsonValue['Employee'] = arrraYObj
print("Json object ", jsonValue)
#Log end time
#dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
#print("Start time:", dt_end)
I would appreciate it if anyone could help by suggesting the right approach. Thanks.
# Write the DataFrame to S3 as JSON (Spark writes part files under this path)
data_frame.write.format('json').save('s3://bucket/key')
Or write directly from the dynamic frame:
# Write the DynamicFrame straight to S3 as JSON, without converting to a DataFrame
glue_context.write_dynamic_frame.from_options(
    frame=dynamic_frame_read,
    connection_type="s3",
    connection_options={"path": "s3://bucket/key"},
    format="json",
)