Tags: python-3.x, multiprocessing

How to use multiprocessing efficiently with the pytube library in Python for quicker downloads?


I am using Python and the pytube library to download videos from a YouTube playlist link. This is what I came up with:

import pytube

playlist = pytube.Playlist(youtube_playlist_link)

for video in playlist.videos:
    pytube.query.StreamQuery.get_by_itag(video.streams,itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

It took over one minute to download 12 audio streams, which is much slower than my internet speed should allow. I tried using async, but I don't think it works with the library, so I switched to multiprocessing.

import pytube
from multiprocessing import Process

yt = pytube.Playlist(youtube_playlist_link)

def first_half():
    # Downloads the first half of the playlist audio
    for video in range(0, 7):
        x = yt.video_urls[video]
        y = pytube.YouTube(x)
        pytube.query.StreamQuery.get_by_itag(y.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

def second_half():
    # Downloads the second half of the playlist audio
    for video in range(7, 12):
        x = yt.video_urls[video]
        y = pytube.YouTube(x)
        pytube.query.StreamQuery.get_by_itag(y.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

if __name__ == '__main__':
    fh = Process(target=first_half)
    sh = Process(target=second_half)
    fh.start()
    sh.start()
    fh.join()
    sh.join()
   

This more than halved my previous one-minute time, but writing a separate function for each half is inefficient.

Since I have 4 more cores in my CPU, is there a way I can use them to download the audio without having to write a new function for each core? All six cores would split up the 12 songs evenly, with one function doing all the downloading.


Also, I'm new to async, so if I used it, would I implement it like this?

import pytube
import asyncio

playlist = pytube.Playlist(playlist_link)

async def main():
    for video in playlist.videos:
        await pytube.query.StreamQuery.get_by_itag(video.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

if __name__ == "__main__":
    asyncio.run(main())

Does the library need to support async for me to use it in my code?


Solution

  • An async approach would be best, or perhaps some combination of async and multiprocessing, since this workload is primarily network and disk IO.

    But I'm not familiar with pytube or async-pytube, so I'll just address not having to create a new function for each CPU core; a rough thread-based async sketch follows after the code.

    import pytube
    from multiprocessing import cpu_count, Process
    import os
    
    
    yt = pytube.Playlist(youtube_playlist_link)
    
    def download_vids(id: int, start: int, end: int):
        for video in range(start, end, 1):
            print(f'[{id}] - downloading video', video)
    
            x = yt.video_urls[video]
            y = pytube.YouTube(x)
            pytube.query.StreamQuery.get_by_itag(y.streams, itag=251).download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')
    
    
    if __name__ == '__main__':
        CPU_COUNT = cpu_count()
    
        # better than cpu_count but only supported on some *nix distros
        try:
            CPU_COUNT = len(os.sched_getaffinity(0))
        except AttributeError:
            pass
    
        NUM_VIDS = len(yt.video_urls)
        
        print('CPU COUNT: ', CPU_COUNT)
        print('NUM VIDS: ', NUM_VIDS)
        print()
    
        start_end = []
        step, rem = divmod(NUM_VIDS, CPU_COUNT)
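        # e.g. 13 videos across 6 processes: step=2, rem=1, so the first process downloads 3 videos and the rest 2 each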
        off = 0
    
        # evenly distribute downloads amongst processes
        for i in range(CPU_COUNT):
            start = i * step + off
            end = start + step
            if rem > 0:
                off += 1
                end += 1
            
            start_end.append({'start': start, 'end': end})
            rem -= 1
    
    
        # create processes
        procs: list[Process] = []
        for i in range(CPU_COUNT):
            proc = Process(target=download_vids, kwargs=dict(id=i, **start_end[i]))
            procs.append(proc)
    
        # start processes
        for i, proc in enumerate(procs):
            proc.start()
            print('-- started process:', i)
    
        # join processes
        for i, proc in enumerate(procs):
            proc.join()
            print('-- joined process:', i)
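
    As for the async question: pytube's download() call is blocking, so the library does not need native async support; you can run each blocking download in a worker thread with asyncio.to_thread (Python 3.9+) and gather the tasks. The sketch below is untested and reuses the same youtube_playlist_link placeholder, itag 251, and output path from the question.

    import asyncio
    import pytube

    playlist = pytube.Playlist(youtube_playlist_link)
    OUT_DIR = r'C:\Users\Anderson\OneDrive\Desktop\Vids'

    def download_one(url: str):
        # blocking pytube calls, executed inside a worker thread
        yt = pytube.YouTube(url)
        yt.streams.get_by_itag(251).download(output_path=OUT_DIR)

    async def main():
        # cap concurrency so we don't open too many connections at once
        sem = asyncio.Semaphore(6)

        async def bounded(url: str):
            async with sem:
                await asyncio.to_thread(download_one, url)

        await asyncio.gather(*(bounded(u) for u in playlist.video_urls))

    if __name__ == '__main__':
        asyncio.run(main())

    Threads are enough here because the work is IO-bound and Python releases the GIL while waiting on the network. A multiprocessing.Pool with pool.map over playlist.video_urls would be another way to avoid both the per-core functions and the index bookkeeping above, at the cost of heavier process startup.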