Search code examples
Tags: python, pandas, csv, tqdm

Progress in bytes when reading CSV from URL with pandas


Because some of the CSV files that I need to read are very large (multiple GB), I am trying to implement a progress bar that indicates the number of bytes read out of the total when reading a CSV file from a URL with pandas.

I am trying to implement something like this:

from tqdm import tqdm
import requests
from sodapy import Socrata
import contextlib
import urllib
import pandas as pd

# CSV export to read. Note that this URL is requested twice below: once with
# requests (for the headers) and once with urllib (for the data).
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"

# Streamed request; in this snippet it is only used to obtain Content-Length
# for the progress-bar total. Its body is never consumed here.
response = requests.get(url, params=None, stream=True)
response.raise_for_status()
# Falls back to 0 when the server does not send a Content-Length header.
total_size = int(response.headers.get('Content-Length', 0))

# Passed to read_csv as chunksize below.
# NOTE(review): pandas documents chunksize as a number of rows, not bytes.
block_size = 1000
# Accumulates the chunks yielded by the reader.
df = []
last_position = 0
# Placeholder: nothing in the loop below assigns a new value to cur_position,
# so after the first iteration the bar is updated by 0.
cur_position = 1
with tqdm(desc=url, total=total_size,
     unit='iB',
     unit_scale=True,
     unit_divisor=1024
     ) as bar:
    with contextlib.closing(urllib.request.urlopen(url=url)) as rd:
        # Create TextFileReader
        reader = pd.read_csv(rd, chunksize=block_size)
        for chunk in reader:
            df.append(chunk)
            # Here I would like to calculate the current file position: cur_position 
            bar.update(cur_position - last_position)
            last_position = cur_position

Is there a way to get the file position from the pandas TextFileReader somehow? Perhaps something equivalent to ftell in C++ for TextFileReader?


Solution

  • Not thoroughly tested, but you can implement a custom class with a read() method that reads from the requests response line by line and updates the tqdm bar as it goes:

    import requests
    import pandas as pd
    from tqdm import tqdm
    
    # URL of the CSV export that the example below downloads and parses.
    url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"
    
    
    class TqdmReader:
        """Minimal file-like wrapper that reports read progress through tqdm.

        The object exposes a ``read()`` method so it can be passed to
        ``pd.read_csv``. Every call hands back one line of the response body
        and advances the progress bar by the number of bytes returned.

        Args:
            resp: Response object providing ``headers``, ``url`` and
                ``iter_lines()``.
        """

        def __init__(self, resp):
            # The bar total is taken from Content-Length; 0 if the header is absent.
            total_size = int(resp.headers.get("Content-Length", 0))

            self.resp = resp
            self.bar = tqdm(
                desc=resp.url,
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
            )

            # Generator is lazy: nothing is pulled from the response until
            # the first read() call.
            self.reader = self.read_from_stream()

        def read_from_stream(self):
            """Yield the body line by line as bytes, updating the bar per line.

            The bar is closed when the stream is exhausted, when iteration
            fails, or when the generator is discarded early.

            NOTE(review): progress is counted as ``len(line) + 1`` per line, so
            the final count may differ from Content-Length if the payload uses
            multi-byte line endings or is compressed in transit -- confirm
            against the server being used.
            """
            try:
                for line in self.resp.iter_lines():
                    # Lines arrive without a terminator; append one so the
                    # CSV parser still sees row boundaries.
                    line += b"\n"
                    self.bar.update(len(line))
                    yield line
            finally:
                self.bar.close()

        def read(self, n=0):
            """Return the next line as bytes, or ``b""`` at end of stream.

            Args:
                n: Accepted for compatibility with the file ``read`` signature
                    but ignored; exactly one line is returned per call.

            Returns:
                bytes: The next newline-terminated line, or ``b""`` once the
                stream is exhausted (same type as the data, unlike ``""``).
            """
            return next(self.reader, b"")
    
    
    # Stream the response so the body is pulled incrementally by TqdmReader
    # rather than downloaded in full before parsing.
    with requests.get(url, params=None, stream=True) as resp:
        # Fail fast on an HTTP error instead of parsing an error page as CSV.
        resp.raise_for_status()
        df = pd.read_csv(TqdmReader(resp))
    
    # Number of rows parsed.
    print(len(df))
    

    Prints:

    https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no: 100%|██████████████████████████████████████████████████████████████████████████████| 2.09M/2.09M [00:00<00:00, 2.64MiB/s]
    7975