Search code examples
pythonjsonpandasdataframeapi

JSON to Python dataframe: mapping values from another API


I have an API with student data like this, for every student id there will be a corresponding API link with mark details.

for example:

https://api.school.com/2020/students.json

{
    "Students": [
        {
            "StudentName": "AAA",
            "Sid": 1020,
            "Saddress": "st.aaa",
            "Sdob": "10-11-1999"
        },
        {
            "StudentName": "BBB",
            "Sid": 1021,
            "Saddress": "st.bbb",
            "Sdob": "11-11-1999"
        },
        {
            "StudentName": "CCC",
            "Sid": 1022,
            "Saddress": "st.fff",
            "Sdob": "05-12-1999"
        },
        {
            "StudentName": "DDD",
            "Sid": 1023,
            "Saddress": "st.ddd",
            "Sdob": "15-09-1999"
        },
        {
            "StudentName": "EEE",
            "Sid": 1024,
            "Saddress": "st.eee",
            "Sdob": "10-11-1999"
        },
        {
            "StudentName": "FFF",
            "Sid": 1025,
            "Saddress": "st.ddd",
            "Sdob": "20-11-1999"
        },
        {
            "StudentName": "GGG",
            "Sid": 1026,
            "Saddress": "st.ggg",
            "Sdob": "25-11-1999"
        },
        {
            "StudentName": "JJJ",
            "Sid": 1019,
            "Saddress": "st.aaa",
            "Sdob": "18-11-1999"
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1020

   {
    "marks": [
        {
            "English": 11,
            "Math": 12,
            "Art": 13,
            "Science": 14,
            "History": 15,
            "Geography": 16,
            "Physical Education": 17,
            "Chemistry": 18,
            "Physics": 19,
            "Biology": 20
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1021

 {
    "marks": [
        {
            "English": 21,
            "Math": 22,
            "Art": 23,
            "Science": 24,
            "History": 25,
            "Geography": 26,
            "Physical Education": 27,
            "Chemistry": 28,
            "Physics": 29,
            "Biology": 30
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1022

{
    "marks": [
        {
            "English": 31,
            "Math": 32,
            "Art": 33,
            "Science": 34,
            "History": 35,
            "Geography": 36,
            "Physical Education": 37,
            "Chemistry": 38,
            "Physics": 39,
            "Biology": 40
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1023

{
    "marks": [
        {
            "English": 41,
            "Math": 42,
            "Art": 43,
            "Science": 44,
            "History": 45,
            "Geography": 46,
            "Physical Education": 47,
            "Chemistry": 48,
            "Physics": 49,
            "Biology": 50
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1024

{
    "marks": [
        {
            "English": 51,
            "Math": 52,
            "Art": 53,
            "Science": 54,
            "History": 55,
            "Geography": 56,
            "Physical Education": 57,
            "Chemistry": 58,
            "Physics": 59,
            "Biology": 60
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1025

{
    "marks": [
        {
            "English": 61,
            "Math": 62,
            "Art": 63,
            "Science": 64,
            "History": 65,
            "Geography": 66,
            "Physical Education": 67,
            "Chemistry": 68,
            "Physics": 69,
            "Biology": 70
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1026

{
    "marks": [
        {
            "English": 71,
            "Math": 72,
            "Art": 73,
            "Science": 74,
            "History": 75,
            "Geography": 76,
            "Physical Education": 77,
            "Chemistry": 78,
            "Physics": 79,
            "Biology": 80
        }
    ]
}

https://api.school.com/2020/mark.json?sid=1019

   {
    "marks": [
        {
            "English": 1,
            "Math": 2,
            "Art": 3,
            "Science": 4,
            "History": 5,
            "Geography": 6,
            "Physical Education": 7,
            "Chemistry": 8,
            "Physics": 9,
            "Biology": 10
        }
    ]
}

I need to get an output like this in a csv file what i want

My code(not completed):

import requests
import json
import pandas as pd

def getdata(link):
    try:
        response = s.get(link)
    except requests.exceptions.RequestException as e:
        print('Request Exception Found!')
    json_data = response.json()
    return json_data

s = requests.session()
json_data = getdata('https://api.school.com/2020/students.json')

print(json_data)

def normalize(json_data,record_path):
    temp = json.dumps(json_data)
    ar = json.loads(temp)
    df = pd.json_normalize(ar[record_path])
    return df

Student_df = normalize(json_data, 'Students')

Student_ids = Student_df["Sid"].tolist()

print(Student_ids)

links = []

for i, row in Student_df.groupby('Sid').size().items():
    link = "https://api.school.com/2020/mark.json?sid="+str(i)
    links.append(link)

for x in links:
    normalize(getdata(x),'marks')

is it possible to insert the StudentName and Sid along with the mark? when the program fetches the mark from each API, it should be able to map and add the StudentName and Sid.

if i saves each mark api data to a json file in the local machine, then i can use this code and it works perfectly


Solution

  • Using concurrent.futures to asynchronously fetch mark data from api. (you can drop any columns you don't need in merge_frames() method.)

    import concurrent.futures
    import json
    import os
    
    import pandas as pd
    import requests
    
    
    class StudentsMarks:
        def __init__(self):
            self.root_url = "https://api.school.com/2020/"
            self.students = self.get_students()
            self.processors = os.cpu_count()
    
        def get_students(self) -> pd.DataFrame:
            url = "students.json"
    
            return self.get_data(url=url, record="Students")
    
        def process_marks(self) -> pd.DataFrame:
            student_ids = self.students["Sid"].tolist()
    
            with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
                grades = pd.concat(executor.map(self.get_data, student_ids)).reset_index(drop=True)
    
            return self.merge_frames(grades)
    
    
        def get_data(self, sid: int = "", url: str = "mark.json?sid=", record: str = "marks") -> pd.DataFrame:
            url = f"{self.root_url}{url}{sid}"
    
            with requests.Session() as request:
                response = request.get(url, timeout=30)
            if response.status_code != 200:
                print(response.raise_for_status())
    
            data = json.loads(response.text)
    
            return pd.json_normalize(data=data, record_path=record).assign(id=sid)
    
        def merge_frames(self, grades: pd.DataFrame) -> pd.DataFrame:
            df = pd.merge(left=self.students, right=grades, left_on="Sid", right_on="id", suffixes=("_students", "_grades"))
    
            return df[df.columns.drop(list(df.filter(regex="students|grades")))]
    
    
    if __name__ == "__main__":
        final = StudentsMarks().process_marks()
        print(final)
    

    Output:

      Saddress     Sdob     Sid StudentName  Art  Biology  Chemistry  English  Geography  History  Math  Physical Education  Physics  Science
    0  st.aaa  10-31-1999  1020     AAA      13       20         18       11         16       15    12                  17       19       14
    1  st.bbb  11-19-1999  1021     BBB      23       30         28       21         26       25    22                  27       29       24
    2  st.fff  12-22-1999  1022     CCC      33       40         38       31         36       35    32                  37       39       34
    3  st.ddd  09-15-1999  1023     DDD      43       50         48       41         46       45    42                  47       49       44
    4  st.eee  02-08-1999  1024     EEE      53       60         58       51         56       55    52                  57       59       54
    5  st.ddd  05-14-1999  1025     FFF      63       70         68       61         66       65    62                  67       69       64
    6  st.ggg  11-25-1999  1026     GGG      73       80         78       71         76       75    72                  77       79       74
    7  st.aaa  11-18-1999  1019     JJJ       3       10          8        1          6        5     2                   7        9        4