I'm wondering if you could help me fill JSON documents with their original filenames. Here is a sample: jsv is a list of JSON objects (the top-level key of each document is its number: document_0, document_1, ...).
jsv =
[
{
{
"document_0":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"09-12-2022",
"source_filename":"None",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2020-11-30",
"comment":"None",
"confidence":97,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
},
{
"document_1":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"09-12-2022",
"source_filename":"None",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2020-11-30",
"comment":"None",
"confidence":97,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
}
}
]
Inside each of these JSON documents there is a key, source_filename, which I want to update with the real name of the JSON file.
The files in my folder, as an example:
'11111.pdf.json',
'11112.pdf.json',
'11113.pdf.json',
'11114.pdf.json',
'11115.pdf.json'
What I want to achieve:
jsv =
[
{
{
"document_0":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"09-12-2022",
"source_filename":"11111.pdf.json",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2222-22-22",
"comment":"None",
"confidence":22,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
},
{
"document_1":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"22-22-2222",
"source_filename":"11111.pdf.json",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2222-11-22",
"comment":"None",
"confidence":22,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
}
}
]
document_0 and document_1 share the same filename.
What I've managed to get so far:
from os import listdir
from os.path import isfile, join

# Directory that holds the JSON files whose names should be recorded.
dir_name = 'path_name'

# Keep regular files only (sub-directories are skipped).  os.listdir()
# returns entries in arbitrary order, so the names are sorted to make any
# index-based pairing with this list reproducible across runs and platforms.
# NOTE(review): this assumes jsv was built in the same sorted-filename
# order -- confirm against the code that creates jsv.
onlyfiles = sorted(f for f in listdir(dir_name) if isfile(join(dir_name, f)))
onlyfiles is a list of the filenames of my JSON files. Now I was thinking of updating jsv with it in a loop. I'm also looking for a method that is efficient, because I have a large amount of data to process.
EDIT: I've managed to do it with a for loop, but maybe there is a more efficient way:
# Stamp the documents of each jsv entry with the file name found at the same
# position in onlyfiles.
for i, entry in enumerate(jsv):
    if isinstance(entry, dict):
        # An entry may hold several documents (document_0, document_1, ...)
        # that all come from the same file, so every one of them is updated
        # instead of only "document_0" (which also raised KeyError for an
        # entry that does not contain that key).
        for document in entry.values():
            document["source_filename"] = onlyfiles[i]
    else:
        # Not a dict: there is nothing to update, so report the file name.
        print(onlyfiles[i])
If your jsv is:
# Sample input: a list of dicts, each mapping a single "document_N" key to
# that document's record.  "source_filename" still holds the placeholder
# string "None" in every record.
jsv = [
{
"document_0": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "None",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
},
],
}
},
{
"document_1": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "None",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
},
],
},
},
]
In Python, you can do something like this:
# File names, paired by position with the entries of jsv.
arq = ['11111.pdf.json', '11112.pdf.json']

# Only pair files with entries when the two lists line up one-to-one; on a
# length mismatch nothing is modified.
if len(arq) == len(jsv):
    for filename, entry in zip(arq, jsv):
        # Every document stored in an entry receives that entry's file name.
        # (The loop variable is no longer called `json`, which shadowed the
        # standard-library module of the same name.)
        for document in entry.values():
            document['source_filename'] = filename
Note that you need to check that the length of the file list is the same as the length of the jsv list!
This results in the following jsv:
[
{
"document_0": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "11111.pdf.json",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
}
],
}
},
{
"document_1": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "11112.pdf.json",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
}
],
}
},
]