Search code examples
pythondownloadmultiprocess

Why the header is not expected?


As you see, I'm trying to make an multiprocess downloader. It works well untill I open the final file: it's broken. I check it but can't find any mistake. But It's probably the header download header is wrong and the Range value error. this is the code

class MultiprocessDownload:
    def __init__(self, url, path, filename, thread_num):
        self.url = url
        self.path = path
        self.filename = filename
        self.thread_num = thread_num
        self.threads = []
        self.head = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}).headers
        self.length = int(self.head.get('Content-Length', False))
        print(self.length)
        self.proc = []
        self.lock = []
        for i in range(self.thread_num):
            self.proc.append(None)
            self.lock.append(None)
        if self.length == False: raise Exception('Thik file does not support multiprocess download')
        num = self.length // self.thread_num
        last = -1
        for i in range(1, thread_num + 1):
            self.threads.append([last + 1, num * i])
            last = num * i
        self.threads[-1:][0][1] += self.length % self.thread_num
        print(self.threads)
            
    def thread(self, num):
        self.lock[num] = _thread.allocate_lock()
        with self.lock[num]:
            header = {'Range': f'bytes=' + str(self.threads[num][0]) + '-' + str(self.threads[num][1])}
            print(header)
            self.proc[num] = 0
            req = requests.get(self.url, headers=header, stream = True)
            blk_size = self.threads[num][1] - self.threads[num][0]
            file = open('dl_block_' + str(num), 'wb')
            i = 0
            for chunk in req.iter_content(chunk_size=512):
                if chunk:
                    file.write(chunk)
                    i += 1
                    self.proc[num] = i * 512 / blk_size
            file.close()
        return 0
    
    def getDownloadInfo(self):
        info = []
        total = 0
        for i in range(self.thread_num):
            if None in self.proc: return 1
            info.append(str(self.proc[i] * 100) + '%')
            total += self.proc[i]
        info.append(str(total / self.thread_num * 100) + '%') #[*threads_info, total_info]
        return info

    def run(self):
        for i in range(self.thread_num):
            print(i)
            _thread.start_new_thread(self.thread, (i,))
        time.sleep(10)
        locked = 1
        print(self.lock)
        while locked:
            print(self.getDownloadInfo())
            locked = 0
            for n in range(self.thread_num):
                if None in self.lock:
                    locked = 1
                    continue
                if self.lock[n].locked():
                    locked += 1
            print(locked)
            time.sleep(1)
        for num in range(self.thread_num):
            blk = open('dl_block_' + str(num), 'rb')
            target = open(self.path + self.filename, 'ab')
            target.write(blk.read())
            target.close()
            blk.close()
            os.remove('dl_block_' + str(num))
        print('file downloaded as', self.path + self.filename)

Some of its output: [[0, 48967091], [48967092, 97934182], [97934183, 146901273], [146901274, 195868364], [195868365, 244835455], [244835456, 293802546], [293802547, 342769637], [342769638, 391736728], [391736729, 440703819], [440703820, 489670910], [489670911, 538638001], [538638002, 587605092], [587605093, 636572183], [636572184, 685539274], [685539275, 734506365], [734506366, 783473471]] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 {'Range': 'bytes=48967092-97934182'} {'Range': 'bytes=244835456-293802546'}{'Range': 'bytes=195868365-244835455'} {'Range': 'bytes=489670911-538638001'}{'Range': 'bytes=391736729-440703819'}{'Range': 'bytes=342769638-391736728'}{'Range': 'bytes=0-48967091'}{'Range': 'bytes=97934183-146901273'} {'Range': 'bytes=146901274-195868364'}{'Range': 'bytes=440703820-489670910'}{'Range': 'bytes=293802547-342769637'}

{'Range': 'bytes=538638002-587605092'}{'Range': 'bytes=587605093-636572183'} {'Range': 'bytes=685539275-734506365'}

{'Range': 'bytes=636572184-685539274'}{'Range': 'bytes=734506366-783473471'}

the thread's range is quite strange but I can't find where's wrong.


Solution

  • It is expected that the program print out the range not in order, depending on which thread is started first rather than depending on which thread is create first.

    You should change your code like this:

            target = open(self.path + self.filename, 'wb')
            for num in range(self.thread_num):
                blk = open('dl_block_' + str(num), 'rb')
                target.write(blk.read())
                blk.close()
                os.remove('dl_block_' + str(num))
            target.close()
    

    This change will create a target file when you first run it or rewrite the target file from the second running.

    The code seems ugly although it works. More advice:

    • use threading library to implement this downloader
    • using barrier to wait all works done instead of sleep and while-loop
    • blk_size = self.threads[num][1] - self.threads[num][0] + 1
    • use ls -l to check the file size

    enter image description here