Search code examples
pythonsqlpostgresqlyamlpyyaml

Assign/Substitute a list of values in a SQL statement for PostgreSQL using Python


I have a huge table with lot of partition tables underneath, but i only want to query/update few(18) of the partitions. So i have to prepare and run 18 python scripts (because i have to create and close the connection after inserting into every partition table as per our DBA instructions, because each period consists of millions of records) to run my Insert statements. So i tried two ways to solve this: One is to run a for loop to substitute the list of periods in my SQL statement which involves writing two python scripts:

import os
import logging
#import psycopg2
import socket
logging.basicConfig(level=logging.INFO)
cpc_name = socket.gethostname()

list_of_periods = [
    {'period': '1940'},
    {'period': '1941_1945'},
    {'period': '1946_1950'},
    {'period': '1951_1955'},
    {'period': '1956_1960'},
    {'period': '1961_1965'},
    {'period': '1966_1970'},
    {'period': '1971_1975'},
    {'period': '1976_1980'},
    {'period': '1981_1985'},
    {'period': '1986_1990'},
    {'period': '1991_1995'},
    {'period': '1996_2000'},
    {'period': '2001_2005'},
    {'period': '2006_2010'},
    {'period': '2011_2015'},
    {'period': '2016_2020'},
    {'period': '2021_2025'}
]

if __name__ == "__main__":
    logging.info("Starting test process")
    logging.info("  cpc = {}".format(cpc_name) + '\n')
    for period in list_of_periods:
        os.system('python sample_segment.py')


########
sample_segment.py

import os
import logging
import psycopg2
import socket
logging.basicConfig(level=logging.INFO)
cpc_name = socket.gethostname()

if __name__ == "__main__":
    logging.info("Starting test process")
    logging.info("  cpc = {}".format(cpc_name) + '\n')
    connection = psycopg2.connect(user        = os.environ.get("DATABASE_USER", "SVTDATAVANT"),
                                  password    = os.environ.get("DATABASE_PASS", "pass"),
                                  host        = os.environ.get("DATABASE_HOST", "psql.silver.com"),
                                  port        = 5432,
                                  dbname      = os.environ.get("DATABASE_NAME", "psql_db"),
                                  options     = "-c search_path=DATAVANT_O")                    
    with connection.cursor() as cursor:
        logging.info(str(connection.get_dsn_parameters()) + '\n')
        cursor.execute("SELECT version();")
        connection.commit()
        conn = cursor.fetchone()
        logging.info("You are connected to - " + str(conn))
        tempb = '''
        INSERT INTO DATAVANT_STG_O.mortality_index_{period}           
        SELECT * FROM DATAVANT_STG_O.tmp_mortality_{period}; '''
        logging.info("Performing Insert Operation")
        cursor.execute(tempb)
        connection.commit()
        count = cursor.rowcount
        logging.info(str(count) + " - count for the period:  {period}")
        logging.info(" - count for the period:  {period}")
        connection.close()
        print("PostgreSQL connection is closed")

Obviously it did not work and i suspect my python programming skills of-course or maybe i am looking for something like os.sytem.sql_with_parameters() But anyway my intention is to turn the below statement

INSERT INTO DATAVANT_STG_O.mortality_index_{period}           
SELECT * FROM DATAVANT_STG_O.tmp_mortality_{period};

INTO

INSERT INTO DATAVANT_STG_O.mortality_index_1940           
SELECT * FROM DATAVANT_STG_O.tmp_mortality_1940;

INSERT INTO DATAVANT_STG_O.mortality_index_1941_1945           
SELECT * FROM DATAVANT_STG_O.tmp_mortality_1941_1945;

etc...for 18 periods

And the second method i tried is by reading YAML file in my python script.

import os
import logging
import psycopg2
import socket
import yaml
logging.basicConfig(level=logging.INFO)
cpc_name = socket.gethostname()

with open(r'/home/SILVER/user/test/periods.yaml') as file:
    YEARS = yaml.load(file, Loader=yaml.FullLoader)
    print("load yaml file" + '\n')
    print(YEARS)


if __name__ == "__main__":
    logging.info("Starting test process")
    logging.info("  cpc = {}".format(cpc_name) + '\n')
    connection = psycopg2.connect(user        = os.environ.get("DATABASE_USER", "SVTDATAVANT"),
                                  password    = os.environ.get("DATABASE_PASS", "pass"),
                                  host        = os.environ.get("DATABASE_HOST", "psql.host.com"),
                                  port        = 5432,
                                  dbname      = os.environ.get("DATABASE_NAME", "psql_db"),
                                  options     = "-c search_path=DATAVANT_O")                    
    with connection.cursor() as cursor:
        logging.info(str(connection.get_dsn_parameters()) + '\n')
        cursor.execute("SELECT version();")
        connection.commit()
        conn = cursor.fetchone()
        logging.info("You are connected to - " + str(conn))
        tempb = '''
        INSERT INTO DATAVANT_STG_O.mortality_index_YEARS[periods]
        SELECT * FROM DATAVANT_STG_O.tmp_mortality_YEARS[periods]; '''
        cursor.execute(tempb)
        logging.info("Performing Insert Operation")
        connection.commit()
        count = cursor.rowcount
        logging.info(str(count) + " - count for the period:  YEARS[periods]")
        logging.info(" - count for the period:  YEARS[periods]")
        connection.close()
        print("PostgreSQL connection is closed")

##YAML file

---
periods:

  - 1940
  - 1941_1945
  - 1946_1950
  - 1951_1955
  - 1956_1960
  - 1961_1965
  - 1966_1970
  - 1971_1975
  - 1976_1980
  - 1981_1985
  - 1986_1990
  - 1991_1995
  - 1996_2000
  - 2001_2005
  - 2006_2010
  - 2011_2015
  - 2016_2020
  - 2021_2025

This is the part where i learnt that values from YAML file cannot be assigned to a SQL statement in a python file.

My apologies to the Large question but i just want to be clear and provide as much information as i could.

I would be very grateful for any suggestions to help me solve this scenario using either of the methods mentioned above or feel free to recommend me a new one, Thank you!

##Below is output, when i tried Mike's approach:

import os
import logging
import psycopg2
import socket
logging.basicConfig(level=logging.INFO)
cpc_name = socket.gethostname()

periods = ['1940']
periods.extend(['{}_{}'.format(i, i + 4) for i in range(1941, 2026, 5)])

for period in periods:
    # Do your psycopg2 connection here and get your cursor
    connection = psycopg2.connect(user        = os.environ.get("DATABASE_USER", "SVTDATAVANT"),
                                  password    = os.environ.get("DATABASE_PASS", "pass"),
                                  host        = os.environ.get("DATABASE_HOST", "psql.silver.com"),
                                  port        = 5432,
                                  dbname      = os.environ.get("DATABASE_NAME", "psql_db"),
                                  options     = "-c search_path=DATAVANT_O")                    
    with connection.cursor() as cursor:
        logging.info(str(connection.get_dsn_parameters()) + '\n')
        cursor.execute("SELECT version();")
        connection.commit()
        conn = cursor.fetchone()
        logging.info("You are connected to - " + str(conn))
        cursor.execute("""
        SELECT COUNT(*) FROM datavant_stg_o.mortality_index_{};""". format(period, period)
        )
        # Commit and close your connection here
        connection.commit()
        count = cursor.rowcount
        logging.info("Count for the period {} is: " . format(period, period) + str(count) +  '\n')
        connection.close()
        print("PostgreSQL connection is closed" + "\n")

(Currently i am only trying to execute the count(*)s to check the functionality) So i am expecting the counts for the partitions as:

SELECT COUNT(*) FROM datavant_stg_o.mortality_index_1940 - 1001066
SELECT COUNT(*) FROM datavant_stg_o.mortality_index_1941_1945 - 1713850
etc which are the original counts when i query from pgAdmin

but instead i am getting the output as

Count for the period 1940 is: 1, 
Count for the period 1941_1945 is: 1 etc,

Does it have anything to do with the quotes?


Solution

  • If you are on python 3.6 or later:

    periods = ['1940']
    periods.extend([f'{i}_{i + 4}' for i in range(1941, 2026, 5)])
    
    for period in periods:
        # Do your psycopg2 connection here and get your cursor
        cursor.execute(f"""
          INSERT INTO DATAVANT_STG_O.mortality_index_{period}
          SELECT * FROM DATAVANT_STG_O.tmp_mortality_{period};""")
        # Commit and close your connection here
    

    For an earlier python:

    periods = ['1940']
    periods.extend(['{}_{}'.format(i, i + 4) for i in range(1941, 2026, 5)])
    
    for period in periods:
        # Do your psycopg2 connection here and get your cursor
        cursor.execute("""
          INSERT INTO DATAVANT_STG_O.mortality_index_{}
          SELECT * FROM DATAVANT_STG_O.tmp_mortality_{};""". format(period, period)
        )
        # Commit and close your connection here