I have 2 GB of JSON data from a MongoDB export. I am trying to run some analytics on that data and post the results into a remote Postgres table. This is my code:
import json
from psycopg2 import connect, Error
import dateutil.parser as parser
from datetime import datetime

record_list = []
for line in open('december.json', 'r'):
    record_list.append(json.loads(line))
#print(record_list[0])

devices = []
for key, values in record_list[0].items():
    if key == "deviceSerial":
        devices.append(values)
test_list = list(set(devices))

device_data = []
for x in test_list:
    device_1 = [i for i in record_list if (i['deviceSerial'] == x)]
    datetime_object_first = parser.parse(device_1[0]['createdAt'], fuzzy=True)
    datetime_object_last = parser.parse(device_1[len(device_1)-1]['createdAt'], fuzzy=True)
    devices = device_1[0]['deviceSerial']
    device_model = device_1[0]['device']
    device_Usage = round((datetime.timestamp(datetime_object_last) - datetime.timestamp(datetime_object_first))/60/1000, 3)
    #calculating device usage
    device_data_elements = {
        'deviceModel': device_model,
        'deviceSerial': devices,
        'deviceUsageInMin': device_Usage
    }
    device_data.append(device_data_elements)

if type(device_data) == list:
    first_record = device_data[0]
    columns = list(first_record.keys())
#print ("\ncolumn names:", columns)
#print(device_data)

table_name = "test_data"
sql_string = 'INSERT INTO {} '.format(table_name)
sql_string += "(" + ', '.join(columns) + ")\nVALUES "
# enumerate over the record
for i, record_dict in enumerate(device_data):
    # iterate over the values of each record dict object
    values = []
    for col_names, val in record_dict.items():
        if type(val) == str:
            val = val.replace("'", "''")
            val = "'" + val + "'"
        values += [str(val)]
    # join the list of values and enclose record in parenthesis
    sql_string += "(" + ', '.join(values) + "),\n"
# remove the last comma and end statement with a semicolon
sql_string = sql_string[:-2] + ";"
print("\nSQL statement:")
print(sql_string)

try:
    # declare a new PostgreSQL connection object
    conn = connect(
        dbname = "events_data",
        user = "chao",
        host = "localhost",
        # attempt to connect for 10 seconds then raise exception
        connect_timeout = 10
    )
    cur = conn.cursor()
    print("\ncreated cursor object:", cur)
    # Post data into postgres table
except (Exception, Error) as err:
    print("\npsycopg2 connect error:", err)
    conn = None
    cur = None

if cur != None:
    try:
        cur.execute(sql_string)
        conn.commit()
        print('\nfinished INSERT INTO execution')
    except (Exception, Error) as error:
        print("\nexecute_sql() error:", error)
        conn.rollback()
    # close the cursor and connection
    cur.close()
    conn.close()
The script does some calculations (analytics), so it contains a few for loops. When the data is small, the script inserts the results into Postgres successfully. When the data is large, it takes far too long; I have waited up to 12 hours without success. Right now the script runs locally and also reads the data locally. What would be the best approach to read and process a large volume of data and post the analytics to a Postgres table? This is my sample JSON data:
[
    {
        "createdAt": "Fri Nov 27 2020 08:07:39 GMT+0000 ",
        "sessionId": null,
        "text": null,
        "device": null,
        "deviceSerial": null
    },
    {
        "createdAt": "Tue Sep 01 2020 06:59:18 GMT+0000",
        "sessionId": null,
        "text": "Approve",
        "device": "Android",
        "deviceSerial": null
    },
    {
        "createdAt": "Wed Sep 02 2020 08:40:10 GMT+0000",
        "pageTitle": "submit option",
        "sessionId": null,
        "text": "launchComponent",
        "device": "Android",
        "deviceSerial": "636363636890"
    },
    {
        "createdAt": "Wed Sep 02 2020 08:40:11",
        "pageTitle": "quick check",
        "sessionId": "88958d89c65f4fcea56e148a5a2838cfhdhdhd",
        "text": "",
        "device": "Android",
        "deviceSerial": "6625839827"
    }
]
I would recommend dumping the JSON into Postgres and doing the analysis in Postgres. That's what Postgres is good at. No preprocessing of the JSON is necessary; you can turn the JSON array directly into Postgres rows.
One option is to make a table with a single jsonb column and insert each item as a row using jsonb_array_elements.
create table devices_json (
    data jsonb
);

insert into devices_json (data)
select * from jsonb_array_elements('
[
    {
        "createdAt": "Fri Nov 27 2020 08:07:39 GMT+0000 ",
        "sessionId": null,
        "text": null,
        "device": null,
        "deviceSerial": null
    },
    ...and so on...
]
');
Then do your analysis in Postgres using its JSON functions.
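For example, the per-device usage that the question's script computes (minutes between the first and last event for each deviceSerial) can be pushed down into a single aggregate query. This is only a sketch against the devices_json table above; the left(..., 24) trims the trailing "GMT+0000" so the value casts cleanly to a timestamp, and you may need a different conversion if your real createdAt values vary.

-- Sketch: per-device usage in minutes, computed entirely in Postgres.
-- left(..., 24) keeps e.g. "Fri Nov 27 2020 08:07:39" and drops the
-- trailing "GMT+0000" (the sample values are all UTC).
select
    data->>'deviceSerial' as "deviceSerial",
    min(data->>'device')  as "deviceModel",
    round((extract(epoch from
              max(left(data->>'createdAt', 24)::timestamp)
            - min(left(data->>'createdAt', 24)::timestamp)))::numeric / 60,
          3) as "deviceUsageInMin"
from devices_json
where data->>'deviceSerial' is not null
group by data->>'deviceSerial';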
If the fields are well known, you can use json_populate_recordset to insert the fields into individual columns of a traditional SQL table. Then you have a traditional SQL table, which may be easier to work with.
-- NOTE Postgres folds unquoted identifiers to lower case, so the column
-- names must be quoted to match the JSON keys exactly
create table devices (
    "createdAt" timestamp,
    "sessionId" text,
    "text" text,
    device text,
    "deviceSerial" text,
    "pageTitle" text
);
insert into devices
select * from json_populate_recordset(NULL::devices, '
[
    {
        "createdAt": "Fri Nov 27 2020 08:07:39 GMT+0000 ",
        "sessionId": null,
        "text": null,
        "device": null,
        "deviceSerial": null
    },
    ...and so on...
]
');
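With the data in ordinary columns, the same report needs no JSON operators at all, and the result can be written straight into a results table, which is what the question is ultimately after. A sketch that reuses the question's test_data table, assuming it simply has the three columns the script inserts (unquoted names fold to lower case, matching the INSERT the script generates):

-- Sketch: compute the analytics in Postgres and insert the result directly
insert into test_data (deviceModel, deviceSerial, deviceUsageInMin)
select
    min(device),
    "deviceSerial",
    round((extract(epoch from max("createdAt") - min("createdAt")))::numeric / 60, 3)
from devices
where "deviceSerial" is not null
group by "deviceSerial";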
You can also do a hybrid of the two: dump the JSON array into a jsonb column and pull it apart into individual columns inside Postgres.
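A sketch of that extraction, reusing the two tables defined above (again, left(..., 24) is only a guess at how to make the createdAt strings castable; adjust it to match your real data):

-- Sketch: pull the jsonb apart into the typed columns of the devices table
insert into devices
select
    left(data->>'createdAt', 24)::timestamp,  -- drops the trailing "GMT+0000"
    data->>'sessionId',
    data->>'text',
    data->>'device',
    data->>'deviceSerial',
    data->>'pageTitle'
from devices_json;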