The JSON file has one object per line, structured like this:
{"text":"I","meta":{"paper_id":"cadf94cda790ae1bd90c32fbe441bb68a8637d83","title":"title1"}}
{"text":"love","meta":{"paper_id":"cadf94cda790ae1bd90c32fbe441bb68a8637d83","title":"title1"}}
{"text":"Coca-cola.","meta":{"paper_id":"cadf94cda790ae1bd90c32fbe441bb68a8637d83","title":"title1"}}
{"text":"He","meta":{"paper_id":"0f3402fa5b44e121d410ec73dfc21937074e5fa3","title":"title2"}}
{"text":"loves","meta":{"paper_id":"0f3402fa5b44e121d410ec73dfc21937074e5fa3","title":"title2"}}
{"text":"Pepsi.","meta":{"paper_id":"0f3402fa5b44e121d410ec73dfc21937074e5fa3","title":"title2"}}
I want to concatenate the sentences that belong to the same paper (same paper_id), so that in the end I have:
{"text":"I love Coca-cola. ","meta":{"paper_id":"cadf94cda790ae1bd90c32fbe441bb68a8637d83","title":"title1"}}
{"text":"He loves Pepsi.","meta":{"paper_id":"0f3402fa5b44e121d410ec73dfc21937074e5fa3","title":"title2"}}
Any ideas on how to tackle this? I am stuck iterating over those nested dictionaries.
I loaded the data into a list and tried to iterate over it:
# Read the JSON Lines file: every line is parsed into its own dict.
# The context manager closes the file once all lines have been read.
with open('datafile_path', 'r') as data_file:
    data = [json.loads(line) for line in data_file]

for sentence in data:
    for key, dict_n in sentence.items():
        # NOTE: the first value yielded here is the string stored under
        # "text", not a dict, so the .items() call below raises
        # AttributeError before the nested "meta" dict is ever reached.
        for key2, value in dict_n.items():
            print(value)
This throws an error, because the value stored under "text" is a string rather than a dictionary: AttributeError: 'str' object has no attribute 'items'
First, collect the unique paper IDs like this:
def getIds(data):
    """Return the distinct paper ids found in *data*, in first-seen order.

    Args:
        data: Iterable of records that each provide
            ``record['meta']['paper_id']``. The ids must be hashable
            (they are strings in the sample data).

    Returns:
        A list containing every ``paper_id`` exactly once, ordered by its
        first appearance in *data*. An empty input yields an empty list.

    Raises:
        KeyError: If a record lacks the ``'meta'`` or ``'paper_id'`` key.
    """
    # Dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass instead of rescanning a list for every record.
    return list(dict.fromkeys(record['meta']['paper_id'] for record in data))
Then iterate over that list of IDs and build one concatenated record per paper:
# Inputs: ``data_list`` is the list of parsed sentence records and
# ``paper_ids`` holds the distinct paper ids in the order they should be
# emitted (presumably the result of getIds(data_list) -- confirm at the
# call site). Paper ids must be hashable.

# Bucket the sentences by paper in a single pass so that building each
# paper's text does not require rescanning the whole list for every id.
sentences_by_paper = {}
for sentence in data_list:
    sentences_by_paper.setdefault(sentence['meta']['paper_id'], []).append(sentence)

# The result list has to exist before anything can be appended to it.
new_data = []
for paper_id in paper_ids:  # not named ``id``: that would shadow the builtin
    concatenate_sentence = {"text": "", "meta": {"paper_id": "", "title": ""}}
    matching = sentences_by_paper.get(paper_id, [])
    if matching:
        # Every fragment is followed by a single space, so the joined text
        # ends with a trailing space, exactly as the nested-loop version did.
        concatenate_sentence['text'] = ''.join(s['text'] + ' ' for s in matching)
        concatenate_sentence['meta']['paper_id'] = paper_id
        # If titles differ within a paper, the last matching sentence wins.
        concatenate_sentence['meta']['title'] = matching[-1]['meta']['title']
    # An id without any sentences still yields the empty template record.
    new_data.append(concatenate_sentence)
print(new_data)
Output:
[{'text': 'I love Coca-cola. ', 'meta': {'paper_id': 'cadf94cda790ae1bd90c32fbe441bb68a8637d83', 'title': 'title1'}}, {'text': 'He loves Pepsi. ', 'meta': {'paper_id': '0f3402fa5b44e121d410ec73dfc21937074e5fa3', 'title': 'title2'}}]