Search code examples
pythonpandassqlalchemyazure-sql-databasepyodbc

Ignoring duplicate entries of primary key when inserting API data to SQL with Python


Problem

This code enters data to a sql table that already exists for data from the previous day (09/10/23), the primary key in this table is a column called "id". When trying to enter yesterdays data and previous days for example changing the api parameters to 06/10/23-09/10/23 it will not enter any of the data as the data for 09/10/23 already exists. I have tried multiple solutions already on here and none work. I basically need a loop that can iterate through rows and if the primary key already exists in the table then carry on and ignore integrity errors and if it's not in the table insert the entry. Any help would be much appreciated.

Code

### Libraries ###

import json
import requests
import pandas as pd
from sqlalchemy import create_engine, exc
from datetime import datetime, timedelta
import pyodbc

### API Call ###

# Calculate the date for previous day
previous_day = datetime.now() - timedelta(days=1)
date_from = previous_day.strftime('%Y-%m-%dT00:00:00')
date_to = previous_day.strftime('%Y-%m-%dT23:59:59')

# API Credentials
url = f"XXXX"
payload = {}
headers = {'Authorization': 'XXXX'}

# API Request
response = requests.request("GET", url, headers=headers, data=payload)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response into a DataFrame
    data = response.json()
    # Extract relevant data from dictionaries within dictionaries
    df = pd.json_normalize(data['data'])
    
    # Modify 'charging_periods' column
    def extract_tariff_id(charging_periods):
        if isinstance(charging_periods, list) and len(charging_periods) > 0:
            # Get the first dictionary in the list (if it exists)
            first_entry = charging_periods[0]
            # Extract 'tariff_id' from the first dictionary
            tariff_id = first_entry.get('tariff_id', 'Unknown Tariff ID')
            return tariff_id
        else:
            return 'Unknown Tariff ID'

    # Apply the modification to 'charging_periods' and rename the column
    df['tariff_id'] = df['charging_periods'].apply(extract_tariff_id)

    # Drop the original 'charging_periods' column
    df.drop(columns=['charging_periods'], inplace=True)

### Azure DB ###

    # Parameters
    server = 'XXXX'
    database = 'XXXX'
    username = 'XXXX'
    password = 'XXXX'
    driver = 'XXXX'

    # Create a connection to the Azure SQL Database
    conn = pyodbc.connect(f'DRIVER={{XXXX}};SERVER={server};DATABASE={database};UID={username};PWD={password}')

    # Create a cursor
    cursor = conn.cursor()

    # Engine (Mini SQL Enviroment)
    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={f'DRIVER={{XXXX}};SERVER={server};DATABASE={database};UID={username};PWD={password}'}")
    
    # Insert to SQL
    df.to_sql('TABLE_NAME', engine, if_exists='append', index=False, schema='ev')

    # Commit the changes and close the connection
    conn.commit()
    conn.close()

    # Print tabular format and data types
    print('Success!')
    print(df)
    print(df.info())

else:
    print(f"API request failed with status code: {response.status_code}")
    
# Remove Duplicates
for i in range(len(df)):
    try:
        df.iloc[i:i+1].to_sql(name="TABLE_NAME",if_exists='append',con = Engine)
    except exc.IntegrityError as e:
        pass

Solution

  • Needed to remove periods in title:

    #Remove periods from column names
        df.columns = df.columns.str.replace("[.]","_", regex=True)
    

    Then to remove duplicates did the following:

    #Engine (Mini SQL Enviroment)
        engine = create_engine(f"mssql+pyodbc:///?odbc_connect={f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}'}")
        try:    
            # Insert to SQL
            df.to_sql('table_name', engine, if_exists='append', index=False, schema='blah')
        except:
            # Remove Duplicates
            for index, row in df.iterrows():
                    # Insert the row into the database
                    columns = ", ".join(df.columns)
                    values = ", ".join([f"'{row[col]}'" for col in df.columns])
                    statement = f"INSERT INTO blah.table_name({columns}) values({values})"
                    status = row["status"]
                    statement_2 = f'SELECT id FROM blah.table_name WHERE id = {row["id"]}'
                    with conn as connection:
                        with conn.cursor() as cursor:
                                cursor.execute(statement_2)
                                row = cursor.fetchone()
                                if not row and status != 
                                "variable_messing_script":
                                    # cursor.execute(statement)
                                    try: 
                                        cursor.execute(statement)
                                    except: 
                                        print(statement)