Tags: python-3.x, multiprocessing

How to use multiprocessing efficiently with the pytube library in Python for quicker downloads?


I am using Python and the pytube library to download videos from a YouTube playlist link. This is what I came up with:

import pytube

playlist = pytube.Playlist(youtube_playlist_link)

for video in playlist.videos:
    pytube.query.StreamQuery.get_by_itag(video.streams,itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

It took over one minute to download 12 audio streams, which is much slower than my internet speed should allow. I tried using async, but I don't think it works with the library, so I switched to multiprocessing.

import pytube
from multiprocessing import Process

yt = pytube.Playlist(youtube_playlist_link)

def first_half():
    # Downloads the first half of the playlist audio
    for video in range(0, 7):
        x = yt.video_urls[video]
        y = pytube.YouTube(x)
        pytube.query.StreamQuery.get_by_itag(y.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

def second_half():
    # Downloads the second half of the playlist audio
    for video in range(7, 12):
        x = yt.video_urls[video]
        y = pytube.YouTube(x)
        pytube.query.StreamQuery.get_by_itag(y.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

if __name__ == '__main__':
    fh = Process(target=first_half)
    sh = Process(target=second_half)
    fh.start()
    sh.start()
    fh.join()
    sh.join()
   

This more than halved my previous one-minute time, but writing a separate function for each half is inefficient.

Since I have 4 more cores in my CPU, is there a way I can use them to download the audio without having to write a new function for each core? All six cores would split up the 12 songs evenly, with one function doing all the downloading.


Also, I'm new to async, so if I used it, would I implement it like this?

import pytube
import asyncio

playlist = pytube.Playlist(playlist_link)

async def main():
    for video in playlist.videos:
        await pytube.query.StreamQuery.get_by_itag(video.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

if __name__ == "__main__":
    asyncio.run(main())

Does the library need to support async for me to use it in my code?


Solution

  • An async approach would be best, or perhaps some combination of async and multiprocessing, since this workload is primarily network and disk IO.

    But I'm not familiar with pytube or async-pytube, so I'll just address not having to create a new function for each CPU core; a rough thread-based async sketch follows after the code.

    import pytube
    from multiprocessing import cpu_count, Process
    import os
    
    
    yt = pytube.Playlist(youtube_playlist_link)
    
    def download_vids(id: int, start: int, end: int):
        for video in range(start, end, 1):
            print(f'[{id}] - downloading video', video)
    
            x = yt.video_urls[video]
            y = pytube.YouTube(x)
            pytube.query.StreamQuery.get_by_itag(y.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')
    
    
    if __name__ == '__main__':
        CPU_COUNT = cpu_count()
    
        # better than cpu_count but only supported on some *nix distros
        try:
            CPU_COUNT = len(os.sched_getaffinity(0))
        except AttributeError:
            pass
    
        NUM_VIDS = len(yt.video_urls)
        
        print('CPU COUNT: ', CPU_COUNT)
        print('NUM VIDS: ', NUM_VIDS)
        print()
    
        start_end = []
        step, rem = divmod(NUM_VIDS, CPU_COUNT)
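        # e.g. 13 videos across 6 processes: step=2, rem=1, so the first process downloads 3 videos and the rest 2 each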
        off = 0
    
        # evenly distribute downloads amongst processes
        for i in range(CPU_COUNT):
            start = i * step + off
            end = start + step
            if rem > 0:
                off += 1
                end += 1
            
            start_end.append({'start': start, 'end': end})
            rem -= 1
    
    
        # create processes
        procs: list[Process] = []
        for i in range(CPU_COUNT):
            proc = Process(target=download_vids, kwargs=dict(id=i, **start_end[i]))
            procs.append(proc)
    
        # start processes
        for i, proc in enumerate(procs):
            proc.start()
            print('-- started process:', i)
    
        # join processes
        for i, proc in enumerate(procs):
            proc.join()
            print('-- joined process:', i)
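
    As for the async question: pytube's download() call is blocking, so the library does not need native async support; you can run each blocking download in a worker thread with asyncio.to_thread (Python 3.9+) and gather the tasks. The sketch below is untested and reuses the same youtube_playlist_link placeholder, itag 251, and output path from the question.

    import asyncio
    import pytube

    playlist = pytube.Playlist(youtube_playlist_link)
    OUT_DIR = r'C:\Users\Anderson\OneDrive\Desktop\Vids'

    def download_one(url: str):
        # blocking pytube calls, executed inside a worker thread
        yt = pytube.YouTube(url)
        yt.streams.get_by_itag(251).download(output_path=OUT_DIR)

    async def main():
        # cap concurrency so we don't open too many connections at once
        sem = asyncio.Semaphore(6)

        async def bounded(url: str):
            async with sem:
                await asyncio.to_thread(download_one, url)

        await asyncio.gather(*(bounded(u) for u in playlist.video_urls))

    if __name__ == '__main__':
        asyncio.run(main())

    Threads are enough here because the work is IO-bound and Python releases the GIL while waiting on the network. A multiprocessing.Pool with pool.map over playlist.video_urls would be another way to avoid both the per-core functions and the index bookkeeping above, at the cost of heavier process startup.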