Tags: python, python-3.x, python-requests, multiprocessing, tor

Python Requests Stream Via Tor - Connection Dies


I'm using the Python requests library to download the file at 'onionurl' in a multiprocessed fashion, as part of downloading a number of files from a Tor hidden service.

That is the reasoning behind the code below.

However, as these files download, each one cuts out after a minute or two. The stream simply stops: no error is raised, but 'closing text file' is printed, which means the download ended prematurely. As a result it is impossible to download the files hosted on these onion servers, which are several hundred gigabytes each.

Any help with resolving this problem would be greatly appreciated.

    import os
    import requests

    session = requests.session()
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'

    url = onionurl  # onionurl, foldername and dataloc are set by the calling code

    try:
        if not os.path.isdir(foldername):
            os.makedirs(foldername)
        # download the body of the response in chunks, not all at once
        with session.get(url, stream=True, verify=False, timeout=1000000) as response:
            # total file size as reported by the server
            file_size = int(response.headers.get("Content-Length", 0))
            print(file_size)
            if file_size > 1000000:
                filesizemb = file_size / 1000000
            else:
                filesizemb = 1

            filename = dataloc
            with open(filename, "wb") as text_file:
                for chunk in response.iter_content(chunk_size=1024):
                    text_file.write(chunk)
            print("closing text file")
    except Exception as e:
        print(e)

Solution

  • Managed to solve it by simply accepting that the connection will die, and writing a new function that resumes the download at the exact byte offset where the previous attempt stopped. The theory is explained in this question - How to resume file download in Python?
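    In outline this is just the standard HTTP Range-header resume pattern: see how many bytes are already on disk, request 'bytes=<offset>-' from the server, and append to the partial file. Before the full code, here is a minimal sketch of that pattern (resume_download, url and dest are my illustrative names, not the original code; it assumes the server honours Range requests and that requests[socks] / PySocks is installed for the socks5h proxy):

        import os
        import requests

        def resume_download(url, dest):
            session = requests.session()
            session.proxies = {'http': 'socks5h://localhost:9050',
                               'https': 'socks5h://localhost:9050'}
            # bytes already on disk, if any
            offset = os.path.getsize(dest) if os.path.exists(dest) else 0
            headers = {'Range': 'bytes=%d-' % offset}
            with session.get(url, stream=True, headers=headers, timeout=600) as response:
                # a 206 Partial Content status means the server honoured the Range header
                with open(dest, 'ab') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)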

    My code (warning, messy):

    # Imports needed by the snippets below; Timeout is a watchdog class defined
    # elsewhere in the original script (not shown here).
    import logging
    import os
    from pathlib import Path

    import requests

    def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
        print("rerunning")
        companyname = onionurl[0]
        onionurl = onionurl[1]
        dataloc = '/media/archangel/Elements/clop/dataleaks/'
        foldername = dataloc
        dataloc = dataloc + companyname + "/"
        try:
            if not os.path.isdir(dataloc):
                os.mkdir(dataloc)
        except Exception as e:
            print(e)
            print("folder not created")

        filename = os.path.basename(onionurl)
        filenamebasename = filename
        dataloc = dataloc + filename

        try:
            session = requests.session()
            session.proxies = {}
            session.proxies['http'] = 'socks5h://localhost:9050'
            session.proxies['https'] = 'socks5h://localhost:9050'

            print("dataloc")
            print(dataloc)
            print("onionurl")
            print(onionurl)
            url = onionurl

            try:
                print("url")
                print(url)
                if not os.path.isdir(foldername):
                    os.makedirs(foldername)
                # download the body of the response in chunks, not all at once
                # https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests?rq=1
                try:
                    try:
                        # watchdog so a stalled circuit cannot hang the worker forever
                        seconds = 20
                        timeout = Timeout(seconds)
                        timeout.start()
                    except Exception as ex:
                        print(ex)

                    # ask the server to send only the bytes we do not have yet;
                    # Accept-Encoding: None avoids compressed transfer lengths
                    resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % resume_byte_pos}
                    try:
                        with session.get(url, stream=True, verify=False, headers=resume_header, timeout=600) as response:
                            # size of the remaining body as reported by the server
                            file_size = int(response.headers['Content-Length'])
                            if file_size > 1000000:
                                filesizemb = file_size / 1000000
                            else:
                                filesizemb = 1
                            print(file_size)

                            filename = dataloc
                            try:
                                # append to the partial file rather than overwriting it
                                with open(filename, "ab") as text_file:
                                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                                        if chunk:
                                            text_file.write(chunk)
                                            text_file.flush()
                            except Exception as ex:
                                logging.error(f'write failed with error: {ex}')
                                print(ex)

                            print("exited with for file")
                    except Exception as ex:
                        logging.error(f'Request failed with error: {ex}')
                        print(ex)

                except Exception as ex:
                    logging.error(f'Attempt failed with error: {ex}')
                    print(ex)

                print("closing text file")

            except Exception as e:
                print("FAILED DOWNLOAD 2")
                print(e)
        except Exception as e:
            print("FAILED DOWNLOAD 5")
            print(e)

    def onionrequestthreadeddataleakdownload2(onionurl):
        companyname = onionurl[0]
        onionurl = onionurl[1]
        dataloc = '/media/archangel/Elements/clop/dataleaks/'
        foldername = dataloc
        dataloc = dataloc + companyname + "/"
        try:
            if not os.path.isdir(dataloc):
                os.mkdir(dataloc)
        except Exception as e:
            print(e)
            print("folder not created")

        filename = os.path.basename(onionurl)
        filenamebasename = filename
        dataloc = dataloc + filename

        try:
            session = requests.session()
            session.proxies = {}
            session.proxies['http'] = 'socks5h://localhost:9050'
            session.proxies['https'] = 'socks5h://localhost:9050'

            print("dataloc")
            print(dataloc)
            print("onionurl")
            print(onionurl)
            url = onionurl

            try:
                print("url")
                print(url)
                if not os.path.isdir(foldername):
                    os.makedirs(foldername)
                # download the body of the response in chunks, not all at once
                # https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests?rq=1
                try:
                    try:
                        # watchdog so a stalled circuit cannot hang the worker forever
                        seconds = 20
                        timeout = Timeout(seconds)
                        timeout.start()
                    except Exception as ex:
                        print(ex)

                    # disable compression so Content-Length matches the bytes written to disk
                    headersac = {'Accept-Encoding': None}
                    try:
                        with session.get(url, stream=True, verify=False, headers=headersac, timeout=600) as response:
                            # total file size as reported by the server
                            file_size = int(response.headers['Content-Length'])
                            if file_size > 1000000:
                                filesizemb = file_size / 1000000
                            else:
                                filesizemb = 1
                            print(file_size)

                            filename = dataloc
                            try:
                                with open(filename, "wb") as text_file:
                                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                                        if chunk:
                                            text_file.write(chunk)
                                            text_file.flush()
                            except Exception as ex:
                                logging.error(f'write failed with error: {ex}')
                                print(ex)
                    except Exception as ex:
                        logging.error(f'request failed with error: {ex}')
                        print(ex)
                        print("exited with for file")

                    # keep resuming from the current offset until the file on disk
                    # matches the size the server reported
                    file_size_offline = Path(filename).stat().st_size
                    print("file size offline")
                    while file_size_offline != file_size:
                        try:
                            print(file_size_offline)
                            print(file_size)
                            print("file size incomplete")
                            file_size_offline = Path(filename).stat().st_size
                            onionurllist = []
                            onionurllist.append(companyname)
                            onionurllist.append(onionurl)
                            onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                            file_size_offline = Path(filename).stat().st_size
                        except Exception as ex:
                            print("redownload failed")
                            print(ex)
                    print("LOOP FINISHED")
                    print(file_size)
                    print(file_size_offline)
                    print(filename)
                except Exception as ex:
                    logging.error(f'Attempt failed with error: {ex}')
                    print(ex)

                if file_size_offline != file_size:
                    while file_size_offline != file_size:
                        try:
                            print(file_size_offline)
                            print(file_size)
                            print("file size incomplete")
                            file_size_offline = Path(filename).stat().st_size
                            onionurllist = []
                            onionurllist.append(companyname)
                            onionurllist.append(onionurl)
                            onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                            file_size_offline = Path(filename).stat().st_size
                        except Exception as ex:
                            print("redownload failed")
                            print(ex)
                else:
                    # list composed of dataleaklocation (location on the external drive),
                    # filename (basename after the last slash), dataleakurl (the onion url)
                    # and the content size in MB
                    returnedlist = []
                    returnedlist.append(dataloc)
                    returnedlist.append(filenamebasename)
                    returnedlist.append(url)
                    returnedlist.append(filesizemb)
                    return returnedlist

                if file_size_offline != file_size:
                    print("rerunning a final FINAL time")
                    while file_size_offline != file_size:
                        try:
                            print(file_size_offline)
                            print(file_size)
                            print("file size incomplete")
                            file_size_offline = Path(filename).stat().st_size
                            onionurllist = []
                            onionurllist.append(companyname)
                            onionurllist.append(onionurl)
                            onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                            file_size_offline = Path(filename).stat().st_size
                        except Exception as ex:
                            print("redownload failed")
                            print(ex)
                else:
                    returnedlist = []
                    returnedlist.append(dataloc)
                    returnedlist.append(filenamebasename)
                    returnedlist.append(url)
                    returnedlist.append(filesizemb)
                    return returnedlist

                returnedlist = []
                returnedlist.append(dataloc)
                returnedlist.append(filenamebasename)
                returnedlist.append(url)
                returnedlist.append(filesizemb)
                return returnedlist
            except Exception as e:
                print("FAILED DOWNLOAD 2")
                print(e)
        except Exception as e:
            print("FAILED DOWNLOAD 5")
            print(e)
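
    For completeness, a driver in the multiprocessed spirit of the question might look like the sketch below; the (companyname, url) pairs are placeholders and multiprocessing.Pool is just one way to fan the downloads out:

        from multiprocessing import Pool

        if __name__ == '__main__':
            # hypothetical (companyname, url) work items; each tuple is passed as
            # the single argument the download function expects
            jobs = [
                ('examplecorp', 'http://exampleonionaddress.onion/leak/part1.zip'),
            ]
            with Pool(processes=4) as pool:
                results = pool.map(onionrequestthreadeddataleakdownload2, jobs)
            print(results)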