Search code examples
pythonamazon-s3boto3buffer-overflowinteger-overflow

Unable to read botocore.response.StreamingBody due to overflow


I am trying to get a file from S3 and read it into Python. The object is returned as a botocore.response.StreamingBody. Usually it can be read using the streamingbody.read() method. But when I try to use read, it throws an OverflowError: Python int too large to convert to C long. All the other solutions available on the internet suggest converting int to int64 or float64, but I cannot use .read() in the first place due to this error. We even tried to pickle the CSV and send it, but that also doesn't work.

import boto3
import pandas as pd

def get_cx_data():
    """Download the CX training CSV from S3 and load it into a DataFrame.

    The object is streamed to a local file in bounded chunks instead of a
    single ``body.read()``: on very large objects a single read can raise
    ``OverflowError: Python int too large to convert to C long`` deep in the
    ssl layer (seen on Windows / Python 3.8), and a StreamingBody can only be
    consumed once anyway.

    Returns:
        pandas.DataFrame: CX index DataFrame loaded from the downloaded CSV.
    """
    # SECURITY NOTE(review): hard-coded credentials should be replaced with a
    # shared credentials file, environment variables, or an IAM role.
    client = boto3.client('s3',
                          aws_access_key_id='key_id_here',
                          aws_secret_access_key='secret_key_here',
                          region_name='us-east-2')
    obj = client.get_object(
        Bucket='bucket name',
        Key='key_here')
    print(type(obj))
    print(obj['Body'])

    # Stream the body to disk in 1 MiB chunks. The file must be opened in
    # binary mode because the chunks are bytes, not str.
    with open('training_data.csv', 'wb') as file:
        for chunk in obj['Body'].iter_chunks(chunk_size=1024 * 1024):
            file.write(chunk)

    # Read from the local copy; the StreamingBody is already exhausted and
    # cannot be .read() a second time.
    df_cx_index = pd.read_csv('training_data.csv')
    print(df_cx_index.head())
    return df_cx_index

The Traceback is given below

<class 'dict'>
<botocore.response.StreamingBody object at 0x0000027EB0533A60>
Traceback (most recent call last):
  File "C:/my_folder/git repos/collections_completed_checklist_items/save_csv.py", line 22, in <module>
    get_cx_data()
  File "C:/my_folder/git repos/collections_completed_checklist_items/save_csv.py", line 18, in get_cx_data
    file_ = obj['Body'].read()
  File "C:\CX_codes\environments\collections_completed_checklist_items\lib\site-packages\botocore\response.py", line 77, in read
    chunk = self._raw_stream.read(amt)
  File "C:\CX_codes\environments\collections_completed_checklist_items\lib\site-packages\urllib3\response.py", line 515, in read
    data = self._fp.read() if not fp_closed else b""
  File "C:\Users\a.mundachal\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 468, in read
    s = self._safe_read(self.length)
  File "C:\Users\a.mundachal\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 609, in _safe_read
    data = self.fp.read(amt)
  File "C:\Users\a.mundachal\AppData\Local\Programs\Python\Python38\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\a.mundachal\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Users\a.mundachal\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
OverflowError: Python int too large to convert to C long

Is there any other way to read or save the botocore.response.StreamingBody object as a CSV without using .read()? Or is there any workaround to use .read() without getting an OverflowError?


Solution

  • df = pd.read_csv('s3://path_to_file/training_data.csv')
    

    should work. Install s3fs if it is not already installed, and make sure your credentials are accessible.

    If that doesn't work, try

    import boto3
    import pandas as pd
    from io import StringIO
    
    s3_root_bucket = 'the_main_bucket_you_start_in'
    s3_path_to_file = 'the rest of the path from there to the csv file including the csv filename'
    
    s3_client = boto3.client('s3') #add credentials if necessary
    
    csv_object = s3_client.get_object(Bucket = s3_root_bucket, Key = s3_path_to_file)
    
    csv_string = csv_object['Body'].read().decode('utf-8')
    
    df = pd.read_csv(StringIO(csv_string))