I'm using a python script to write output from an Oracle table as a .txt file. The Oracle table has around 33,000,000 rows, but when the process finishes, the txt file has around 985,000 rows. Am I missing something in my script that's preventing the process from writing the entire table?
import datetime
import arrow
import cx_Oracle
import pandas as pd
import numpy
import os.path
import warnings
warnings.filterwarnings('ignore')
cx_Oracle.init_oracle_client(lib_dir=r"xxxxxxx")
dsn_tns = cx_Oracle.makedsn('xxxxxxxxx')
conn = cx_Oracle.connect(user=r'xxxxx', password='xxxxx',dsn=xxxx)
i = datetime.datetime.now()
current_date = arrow.now().format('YYYYMMDD')
year = datetime.datetime.now().strftime('%Y')
month = datetime.datetime.now().strftime('%B')
directory = "\\\xxxxx\\"+"CY"+year+"\\"+month+year+"\\"
SQL_QUERY = pd.read_sql_query("SELECT * FROM CLINICAL_LAB_SUPP ", conn, chunksize=1000000)
file1 = directory + "MyFile_{}".format(current_date) + ".txt"
for df_chunk in SQL_QUERY:
    df = pd.DataFrame(df_chunk)
    df.to_csv("{}".format(file1), index=False, header=True, mode='w+', sep='\t')
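The likely cause is the to_csv call inside the loop: mode='w+' reopens and truncates the file for every chunk, so only the final chunk (roughly 985,000 rows here) survives. A minimal sketch of a fix that keeps the pandas approach, reusing the SQL_QUERY and file1 names from the script above, is to write the header once and append the remaining chunks:

first_chunk = True
for df_chunk in SQL_QUERY:
    df_chunk.to_csv(
        file1,
        index=False,
        header=first_chunk,                 # header only on the first chunk
        mode='w' if first_chunk else 'a',   # truncate once, then append
        sep='\t',
    )
    first_chunk = False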
Separately, for such a large number of rows you may want to use the native python-oracledb driver directly and avoid the overhead of pandas and SQLAlchemy:
#! /usr/bin/python
import os
import csv
import getpass
import traceback
import oracledb
un = 'cj'
pw = getpass.getpass(f'Enter password for {un}: ')
cs = 'localhost/orclpdb1'
try:
    connection = oracledb.connect(user=un, password=pw, dsn=cs)

    with connection.cursor() as cursor:
        cursor.arraysize = 1000  # tune this for large queries

        print('Writing to emp.csv')
        f = open('emp.csv', 'w')
        writer = csv.writer(f, lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)

        cursor.execute("""select * from emp""")
        col_names = [row[0] for row in cursor.description]
        writer.writerow(col_names)

        while True:
            rows = cursor.fetchmany()  # extra call at end won't incur extra round-trip
            if not rows:
                break
            writer.writerows(rows)
        f.close()

except oracledb.Error as e:
    error, = e.args
    traceback.print_tb(e.__traceback__)
    print(error.message)
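If the downstream consumer expects the same tab-separated .txt layout as the original script, csv.writer accepts a delimiter argument; a small sketch (the output file name here is just an illustration):

f = open('MyFile.txt', 'w')
writer = csv.writer(f, delimiter='\t', lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)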