I'm using the Python requests library to download the file at 'onionurl' through the Tor SOCKS proxy, running several downloads in parallel with multiprocessing so that I can pull a number of files from an onion service. That is the reasoning behind the code below.
However, each download cuts out after a minute or two: the stream simply stops, no error is raised, and the script just prints 'closing text file'. This makes it impossible to download the files hosted on these onion servers, which are several hundred gigabytes each.
Any help resolving this problem would be greatly appreciated.
import os
import requests

# onionurl, foldername and dataloc come from the surrounding script.
session = requests.session()
session.proxies = {}
session.proxies['http'] = 'socks5h://localhost:9050'
session.proxies['https'] = 'socks5h://localhost:9050'

url = onionurl
try:
    if not os.path.isdir(foldername):
        os.makedirs(foldername)
    # download the body of the response in chunks, not all at once
    with session.get(url, stream=True, verify=False, timeout=1000000) as response:
        # total file size as reported by the server
        file_size = int(response.headers.get("Content-Length", 0))
        print(file_size)
        filename = dataloc
        with open(filename, "wb") as text_file:
            for chunk in response.iter_content(chunk_size=1024):
                text_file.write(chunk)
        if file_size > 1000000:
            filesizemb = file_size / 1000000
        else:
            filesizemb = 1
        print("closing text file")
except Exception as e:
    print(e)
Managed to solve it by accepting that the connection will die and writing a new function that resumes the download at the exact byte offset where it stopped. The theory is explained in this question: How to resume file download in Python?
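The core of the trick is just an HTTP Range request: ask the server for bytes starting at the size of the partial file already on disk, and append to that file. Below is a minimal sketch of the idea; the function name resume_download and the url/dest parameters are placeholders, the proxy settings are the same as above, and it assumes the onion server honours Range requests with a 206 Partial Content response.

import os
import requests

def resume_download(url, dest, chunk_size=1024 * 1024):
    # Append to dest, asking the server only for the bytes we do not have yet.
    session = requests.session()
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'
    # Resume from whatever is already on disk (0 if nothing has been written yet).
    offset = os.path.getsize(dest) if os.path.exists(dest) else 0
    headers = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % offset}
    with session.get(url, stream=True, headers=headers, timeout=600) as response:
        # 206 means the Range header was honoured; a 200 with a non-zero offset
        # means the server is resending the whole file, so appending would corrupt it.
        if offset and response.status_code != 206:
            raise RuntimeError("server did not honour the Range request")
        with open(dest, "ab") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)

If the server ignores the Range header it answers 200 and resends the file from byte 0, which is why the sketch checks the status code before appending.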
My code (warning, messy):
import logging
import os
from pathlib import Path

import requests

# Note: Timeout (used below as a 20-second watchdog on the connection) is a
# helper defined elsewhere in this project; its import is not shown here.


def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
    # Resume an interrupted download: append to the partial file on disk,
    # asking the server for bytes from resume_byte_pos onward.
    print("rerunning")
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        if not os.path.isdir(foldername):
            os.makedirs(foldername)
        try:
            # Watchdog so a stalled circuit does not hang the worker forever.
            seconds = 20
            timeout = Timeout(seconds)
            timeout.start()
        except Exception as ex:
            print(ex)
        # Ask only for the bytes we do not have yet; disable compression so the
        # byte count on disk stays comparable to Content-Length.
        resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % resume_byte_pos}
        try:
            with session.get(url, stream=True, verify=False, headers=resume_header, timeout=600) as response:
                # size of the remaining body as reported by the server
                file_size = int(response.headers['Content-Length'])
                print(file_size)
                filename = dataloc
                try:
                    # Append mode: the file already holds resume_byte_pos bytes.
                    with open(filename, "ab") as text_file:
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                text_file.write(chunk)
                                text_file.flush()
                except Exception as ex:
                    logging.error(f'write failed with error: {ex}')
                    print(ex)
                print("exited with for file")
        except Exception as ex:
            logging.error(f'Request failed with error: {ex}')
            print(ex)
        print("closing text file")
    except Exception as e:
        print("FAILED DOWNLOAD (resume)")
        print(e)
def onionrequestthreadeddataleakdownload2(onionurl):
    # Full download: fetch the file once, then, because the Tor connection
    # usually dies before the whole file arrives, keep resuming at the current
    # offset until the size on disk matches the server-reported Content-Length.
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    filenamebasename = filename
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        try:
            print("url")
            print(url)
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # download the body of the response in chunks, not all at once
            # https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests?rq=1
            try:
                try:
                    # Watchdog so a stalled circuit does not hang the worker forever.
                    seconds = 20
                    timeout = Timeout(seconds)
                    timeout.start()
                except Exception as ex:
                    print(ex)
                # Disable compression so Content-Length matches the bytes written to disk.
                headersac = {'Accept-Encoding': None}
                try:
                    with session.get(url, stream=True, verify=False, headers=headersac, timeout=600) as response:
                        # total file size as reported by the server
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)
                        filename = dataloc
                        try:
                            with open(filename, "wb") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                except Exception as ex:
                    logging.error(f'request failed with error: {ex}')
                    print(ex)
                print("exited with for file")
                # Resume until the file on disk is as large as the server says it should be.
                file_size_offline = Path(filename).stat().st_size
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        onionrequestthreadeddataleakdownloadresume([companyname, onionurl], file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                print("LOOP FINISHED")
                print(file_size)
                print(file_size_offline)
                print(filename)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)
            # list composed of dataleaklocation (path on the external drive), filename
            # (basename after the last slash), dataleakurl (onion url), contentsize (MB)
            returnedlist = []
            returnedlist.append(dataloc)
            returnedlist.append(filenamebasename)
            returnedlist.append(url)
            returnedlist.append(filesizemb)
            return returnedlist
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
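For the "multiprocessed fashion" mentioned at the top: these functions take a (companyname, onionurl) pair, so one way to fan the downloads out is an ordinary multiprocessing pool. A minimal sketch only; the target list and pool size below are made-up examples, not values from the question.

import multiprocessing

# Hypothetical (companyname, onionurl) pairs; replace with the real targets.
targets = [
    ("examplecorp", "http://exampleonionaddress.onion/dump/archive.zip"),
]

if __name__ == "__main__":
    # Each worker process builds its own requests session through the Tor SOCKS proxy.
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(onionrequestthreadeddataleakdownload2, targets)
    for result in results:
        print(result)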