I am using Python and the pytube library to download the videos of a YouTube playlist, given the playlist's link. This is what I came up with:
import pytube
# Walk the playlist and save each video's itag-251 stream, one video at a time.
playlist = pytube.Playlist(youtube_playlist_link)
for video in playlist.videos:
    audio_stream = video.streams.get_by_itag(251)
    audio_stream.download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')
It took over one minute to download 12 audio streams, which is far slower than my internet speed allows. I tried using async, but I don't think it works with the library, so I switched to multiprocessing.
import pytube
from multiprocessing import Process
# Module-level playlist handle; first_half() reads its video_urls.
# youtube_playlist_link is not defined in this snippet and must be supplied.
yt = pytube.Playlist(youtube_playlist_link)
def first_half():
    """Download the itag-251 stream of playlist entries 0 through 6."""
    # Downloads first half of playlist audio
    for index in range(7):
        video_url = yt.video_urls[index]
        stream = pytube.YouTube(video_url).streams.get_by_itag(251)
        stream.download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')
def second_half():
    """Download the itag-251 stream of playlist entries 7 through 11.

    Same per-video steps as ``first_half``, applied to the remaining
    indices of the 12-entry playlist.
    """
    # Download second half of playlist audio
    for index in range(7, 12):
        video_url = yt.video_urls[index]
        stream = pytube.YouTube(video_url).streams.get_by_itag(251)
        stream.download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')
if __name__ == '__main__':
    # One process per half: start both first so they run side by side,
    # then wait for each to finish.
    workers = [Process(target=first_half), Process(target=second_half)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
This more than halved the previous one-minute run time, but it is tedious to write this way.
If I have 4 more cores in my CPU is there a way I can use them to download audio without having to make a new function for each? All six cores would split up the 12 songs evenly with one function downloading them all.
Also, I'm new to `async`, so if I used `async`, would I implement it like this?
import pytube
import asyncio
# Playlist whose videos main() iterates over.
# playlist_link is not defined in this snippet and must be supplied.
playlist = pytube.Playlist(playlist_link)
async def main():
    """Download the itag-251 stream of every playlist video concurrently.

    pytube's ``download`` is an ordinary blocking call that returns the
    saved file's path, not an awaitable, so ``await``-ing it directly raises
    ``TypeError`` and gives no concurrency. Each blocking download is instead
    handed to a worker thread with ``asyncio.to_thread`` and all of them are
    awaited together.
    """
    def download_audio(video):
        # Runs in a worker thread: the stream lookup and the download both
        # block, so neither may run on the event loop itself.
        stream = video.streams.get_by_itag(251)
        return stream.download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')

    await asyncio.gather(
        *(asyncio.to_thread(download_audio, video) for video in playlist.videos)
    )


if __name__ == "__main__":
    asyncio.run(main())
Does the library need to support `async` for me to use it in my code?
An `async` approach would be best, or perhaps some combination of `async` and `multiprocessing`, since this primarily involves network and disk IO.
But I'm not familiar with `pytube` or `async-pytube`, so I'll just address not having to create a new function for each CPU core.
import pytube
from multiprocessing import cpu_count, Process
import os
# Module-level playlist handle read by download_vids() and the __main__ block.
# youtube_playlist_link is not defined in this snippet and must be supplied.
# NOTE(review): under the "spawn" start method (the default on Windows) each
# child process re-imports this module and so re-runs this line — confirm
# that is acceptable.
yt = pytube.Playlist(youtube_playlist_link)
def download_vids(id: int, start: int, end: int):
    """Download the itag-251 stream of playlist entries ``start`` to ``end - 1``.

    Args:
        id: Worker number, used only to tag the progress messages. The name
            shadows the builtin but is kept because callers pass it by
            keyword.
        start: Index of the first playlist entry to download (inclusive).
        end: Index one past the last playlist entry to download (exclusive).
    """
    for video in range(start, end):
        print(f'[{id}] - downloading video', video)
        url = yt.video_urls[video]
        stream = pytube.YouTube(url).streams.get_by_itag(251)
        if stream is None:
            # get_by_itag returns None when the video offers no stream with
            # that itag. Skip it rather than let an AttributeError end this
            # worker and abandon the rest of its range.
            print(f'[{id}] - no itag 251 stream for video', video)
            continue
        stream.download(output_path=r'C:\Users\Anderson\OneDrive\Desktop\Vids')
def _split_ranges(total: int, parts: int) -> list[tuple[int, int]]:
    """Split ``range(total)`` into contiguous, non-empty ``(start, end)`` ranges.

    At most ``parts`` ranges are returned (fewer when ``total < parts``, none
    when ``total`` is 0). Range sizes differ by at most one, with the larger
    ranges first.
    """
    parts = min(parts, total)
    if parts <= 0:
        return []
    step, rem = divmod(total, parts)
    ranges = []
    start = 0
    for i in range(parts):
        # The first `rem` ranges each absorb one of the leftover items.
        end = start + step + (1 if i < rem else 0)
        ranges.append((start, end))
        start = end
    return ranges


if __name__ == '__main__':
    CPU_COUNT = cpu_count()
    # better than cpu_count but only supported on some *nix distros
    try:
        CPU_COUNT = len(os.sched_getaffinity(0))
    except AttributeError:
        pass

    NUM_VIDS = len(yt.video_urls)

    print('CPU COUNT: ', CPU_COUNT)
    print('NUM VIDS: ', NUM_VIDS)
    print()

    # evenly distribute downloads amongst processes; never create more
    # processes than there are videos, so no worker starts with nothing to do
    procs: list[Process] = [
        Process(target=download_vids, kwargs=dict(id=i, start=start, end=end))
        for i, (start, end) in enumerate(_split_ranges(NUM_VIDS, CPU_COUNT))
    ]

    # start processes
    for i, proc in enumerate(procs):
        proc.start()
        print('-- started process:', i)

    # join processes
    for i, proc in enumerate(procs):
        proc.join()
        print('-- joined process:', i)