I'm using the Python requests library to download the file at 'onionurl' through the Tor SOCKS proxy, running several downloads in parallel with multiprocessing so that I can pull a number of files from an onion service. That is the reasoning behind the code below.
However, each download cuts out after a minute or two: the stream simply stops, no error is raised, and the script just prints 'closing text file'. This makes it impossible to download the files hosted on these onion servers, which are several hundred gigabytes each.
Any help resolving this problem would be greatly appreciated.
import os
import requests

# onionurl, foldername and dataloc come from the surrounding script.
session = requests.session()
session.proxies = {}
session.proxies['http'] = 'socks5h://localhost:9050'
session.proxies['https'] = 'socks5h://localhost:9050'

url = onionurl
try:
    if not os.path.isdir(foldername):
        os.makedirs(foldername)
    # download the body of the response in chunks, not all at once
    with session.get(url, stream=True, verify=False, timeout=1000000) as response:
        # total file size as reported by the server
        file_size = int(response.headers.get("Content-Length", 0))
        print(file_size)
        filename = dataloc
        with open(filename, "wb") as text_file:
            for chunk in response.iter_content(chunk_size=1024):
                text_file.write(chunk)
        if file_size > 1000000:
            filesizemb = file_size / 1000000
        else:
            filesizemb = 1
        print("closing text file")
except Exception as e:
    print(e)
Managed to solve it by accepting that the connection will die and writing a new function that resumes the download at the exact byte offset where it stopped. The theory is explained in this question: How to resume file download in Python?
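The core of the trick is just an HTTP Range request: ask the server for bytes starting at the size of the partial file already on disk, and append to that file. Below is a minimal sketch of the idea; the function name resume_download and the url/dest parameters are placeholders, the proxy settings are the same as above, and it assumes the onion server honours Range requests with a 206 Partial Content response.

import os
import requests

def resume_download(url, dest, chunk_size=1024 * 1024):
    # Append to dest, asking the server only for the bytes we do not have yet.
    session = requests.session()
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'
    # Resume from whatever is already on disk (0 if nothing has been written yet).
    offset = os.path.getsize(dest) if os.path.exists(dest) else 0
    headers = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % offset}
    with session.get(url, stream=True, headers=headers, timeout=600) as response:
        # 206 means the Range header was honoured; a 200 with a non-zero offset
        # means the server is resending the whole file, so appending would corrupt it.
        if offset and response.status_code != 206:
            raise RuntimeError("server did not honour the Range request")
        with open(dest, "ab") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)

If the server ignores the Range header it answers 200 and resends the file from byte 0, which is why the sketch checks the status code before appending.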
My code (warning, messy):
import logging
import os
from pathlib import Path

import requests

# Note: Timeout (used below as a 20-second watchdog on the connection) is a
# helper defined elsewhere in this project; its import is not shown here.


def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
    # Resume an interrupted download: append to the partial file on disk,
    # asking the server for bytes from resume_byte_pos onward.
    print("rerunning")
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        if not os.path.isdir(foldername):
            os.makedirs(foldername)
        try:
            # Watchdog so a stalled circuit does not hang the worker forever.
            seconds = 20
            timeout = Timeout(seconds)
            timeout.start()
        except Exception as ex:
            print(ex)
        # Ask only for the bytes we do not have yet; disable compression so the
        # byte count on disk stays comparable to Content-Length.
        resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % resume_byte_pos}
        try:
            with session.get(url, stream=True, verify=False, headers=resume_header, timeout=600) as response:
                # size of the remaining body as reported by the server
                file_size = int(response.headers['Content-Length'])
                print(file_size)
                filename = dataloc
                try:
                    # Append mode: the file already holds resume_byte_pos bytes.
                    with open(filename, "ab") as text_file:
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                text_file.write(chunk)
                                text_file.flush()
                except Exception as ex:
                    logging.error(f'write failed with error: {ex}')
                    print(ex)
                print("exited with for file")
        except Exception as ex:
            logging.error(f'Request failed with error: {ex}')
            print(ex)
        print("closing text file")
    except Exception as e:
        print("FAILED DOWNLOAD (resume)")
        print(e)
def onionrequestthreadeddataleakdownload2(onionurl):
    # Full download: fetch the file once, then, because the Tor connection
    # usually dies before the whole file arrives, keep resuming at the current
    # offset until the size on disk matches the server-reported Content-Length.
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    filenamebasename = filename
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        try:
            print("url")
            print(url)
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # download the body of the response in chunks, not all at once
            # https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests?rq=1
            try:
                try:
                    # Watchdog so a stalled circuit does not hang the worker forever.
                    seconds = 20
                    timeout = Timeout(seconds)
                    timeout.start()
                except Exception as ex:
                    print(ex)
                # Disable compression so Content-Length matches the bytes written to disk.
                headersac = {'Accept-Encoding': None}
                try:
                    with session.get(url, stream=True, verify=False, headers=headersac, timeout=600) as response:
                        # total file size as reported by the server
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)
                        filename = dataloc
                        try:
                            with open(filename, "wb") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                except Exception as ex:
                    logging.error(f'request failed with error: {ex}')
                    print(ex)
                print("exited with for file")
                # Resume until the file on disk is as large as the server says it should be.
                file_size_offline = Path(filename).stat().st_size
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        onionrequestthreadeddataleakdownloadresume([companyname, onionurl], file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                print("LOOP FINISHED")
                print(file_size)
                print(file_size_offline)
                print(filename)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)
            # list composed of dataleaklocation (path on the external drive), filename
            # (basename after the last slash), dataleakurl (onion url), contentsize (MB)
            returnedlist = []
            returnedlist.append(dataloc)
            returnedlist.append(filenamebasename)
            returnedlist.append(url)
            returnedlist.append(filesizemb)
            return returnedlist
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
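For the "multiprocessed fashion" mentioned at the top: these functions take a (companyname, onionurl) pair, so one way to fan the downloads out is an ordinary multiprocessing pool. A minimal sketch only; the target list and pool size below are made-up examples, not values from the question.

import multiprocessing

# Hypothetical (companyname, onionurl) pairs; replace with the real targets.
targets = [
    ("examplecorp", "http://exampleonionaddress.onion/dump/archive.zip"),
]

if __name__ == "__main__":
    # Each worker process builds its own requests session through the Tor SOCKS proxy.
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(onionrequestthreadeddataleakdownload2, targets)
    for result in results:
        print(result)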