Search code examples
pythonpython-3.xfilepython-oracledb

Python-Oracle: Process does not write all rows from Oracle table


I'm using a python script to write output from an Oracle table as a .txt file. The Oracle table has around 33,000,000 rows, but when the process finishes, the txt file has around 985,000 rows. Am I missing something in my script that's preventing the process from writing the entire table?

import datetime
import arrow
import cx_Oracle
import pandas as pd
import numpy
import os.path
import warnings

warnings.filterwarnings('ignore')

cx_Oracle.init_oracle_client(lib_dir=r"xxxxxxx")

dsn_tns = cx_Oracle.makedsn('xxxxxxxxx')
conn = cx_Oracle.connect(user=r'xxxxx', password='xxxxx',dsn=xxxx)  

i = datetime.datetime.now()
current_date = arrow.now().format('YYYYMMDD')
year = datetime.datetime.now().strftime('%Y')
month = datetime.datetime.now().strftime('%B')
directory = "\\\xxxxx\\"+"CY"+year+"\\"+month+year+"\\"


SQL_QUERY = pd.read_sql_query("SELECT * FROM CLINICAL_LAB_SUPP ", conn, chunksize=1000000)


file1 = directory + "MyFile_{}".format(current_date) + ".txt"

for df_chunk in SQL_QUERY:
    df = pd.DataFrame(df_chunk)
    df.to_csv("{}".format(file1), index=False, header=True, mode='w+', sep='\t')

Solution

  • For such a large number of rows, you may want to use the native driver to remove any overheads of pandas and SQLAlchemy:

    #! /usr/bin/python
    
    import os
    import csv
    import getpass
    import traceback
    
    import oracledb
    
    un = 'cj'
    pw = getpass.getpass(f'Enter password for {un}: ')
    cs = 'localhost/orclpdb1'
    
    try:
        connection = oracledb.connect(user=un, password=pw, dsn=cs)
    
        with connection.cursor() as cursor:
            cursor.arraysize = 1000  # tune this for large queries
            print('Writing to emp.csv')
            f = open('emp.csv', 'w')
            writer = csv.writer(f, lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
            cursor.execute("""select * from emp""")
            col_names = [row[0] for row in cursor.description]
            writer.writerow(col_names)
            while True:
                rows = cursor.fetchmany()  # extra call at end won't incur extra round-trip
                if not rows:
                    break
                writer.writerows(rows)
            f.close()
    
    except oracledb.Error as e:
        error, = e.args
        traceback.print_tb(e.__traceback__)
        print(error.message)