I've been struggling for a while to download a PDF with Playwright for Python. The PDF is rendered by a PHP web page (not included in this example because it contains sensitive code; I've included a link to a sample PDF instead).
Here's my code so far, using the JavaScript code from https://github.com/microsoft/playwright/issues/3509 as an example:
from playwright.async_api import Playwright, async_playwright, expect
import asyncio
import os
import json
# Working directories for the script, all located under ./pwtest in the
# current working directory.
tmp_dir = './pwtest/'
user_dir = os.path.join(os.getcwd(), "pwtest", "user_dir")
print("User dir: ", user_dir)
downloads_path = os.path.join(os.getcwd(), "pwtest", "downloads")
print("Downloads path: ", downloads_path)
storage_state_path = "./pwtest/"
# Written to the profile's "Preferences" file below; intended to make Chromium
# hand PDFs to the download machinery instead of opening its built-in viewer.
default_preferences = {
    "plugins": {
        "always_open_pdf_externally": True
    }
}
# os.makedirs creates any missing parent directories and, with exist_ok=True,
# is a no-op when the directory already exists. Genuine failures (for example
# a permission error) now propagate instead of being hidden by a bare except.
os.makedirs(downloads_path, exist_ok=True)
os.makedirs(os.path.join(user_dir, "Default"), exist_ok=True)
with open(os.path.join(user_dir, "Default", "Preferences"), "w") as f:
    f.write(json.dumps(default_preferences))
async def run(playwright: Playwright) -> None:
    """Navigate to the sample PDF and try to save the resulting download.

    Launches a persistent Chromium context on ``user_dir``, opens the PDF URL
    while waiting for a download event, and saves the file as ``test_d.pdf``
    in ``downloads_path``.

    Args:
        playwright: The running Playwright driver instance.

    Raises:
        playwright.async_api.Error: ``page.goto`` is the call reported as
            failing with ``net::ERR_ABORTED`` in the error log of this post.
    """
    context = await playwright.chromium.launch_persistent_context(
        user_dir, accept_downloads=True, headless=False, slow_mo=1000
    )
    try:
        context.set_default_timeout(10000)
        page = await context.new_page()
        file_name = "test_d.pdf"
        # Start waiting for the download
        async with page.expect_download() as download_info:
            # NOTE: this navigation is what raises net::ERR_ABORTED (see the
            # error output quoted in the post).
            await page.goto("https://www.africau.edu/images/default/sample.pdf", timeout=5000)
        await page.wait_for_timeout(200)
        print("Saving file to ", downloads_path, file_name)
        # Wait for the download to start
        download = await download_info.value
        # Wait for the download process to complete
        print(await download.path())
        # Save downloaded file somewhere
        await download.save_as(os.path.join(downloads_path, file_name))
    finally:
        # Close the context even when the navigation fails, so the browser
        # window and profile are not left open.
        await context.close()
async def main() -> None:
    """Start Playwright, pass the driver to run(), and stop it afterwards."""
    async with async_playwright() as pw:
        await run(pw)


asyncio.run(main())
Help will be appreciated.
I keep getting the following error, whether I use the sync or the async API. An alternative might be to intercept the blob transfer, but I don't know how that is done. Please advise.
playwright._impl._api_types.Error: net::ERR_ABORTED at https://www.africau.edu/images/default/sample.pdf
=========================== logs ===========================
navigating to "https://www.africau.edu/images/default/sample.pdf", waiting until "load"
============================================================
This is how I fixed it in the end, as explained in my comment on the original post. It is probably not the best way to do it, but it worked. Please comment if you can improve on the use of try/except for the download portion of the PDF.
from playwright.async_api import Playwright, async_playwright, expect
import asyncio
import os
import json
# Working directories for the script, all located under ./pwtest in the
# current working directory.
tmp_dir = './pwtest/'
user_dir = os.path.join(os.getcwd(), "pwtest", "user_dir")
print("User dir: ", user_dir)
downloads_path = os.path.join(os.getcwd(), "pwtest", "downloads")
print("Downloads path: ", downloads_path)
storage_state_path = "./pwtest/"
# Written to the profile's "Preferences" file below; intended to make Chromium
# hand PDFs to the download machinery instead of opening its built-in viewer.
default_preferences = {
    "plugins": {
        "always_open_pdf_externally": True
    }
}
# One makedirs call per leaf directory: missing parents are created and an
# already-existing directory is accepted (exist_ok=True). Real failures such
# as a permission error are no longer swallowed by a bare except.
os.makedirs(downloads_path, exist_ok=True)
os.makedirs(os.path.join(user_dir, "Default"), exist_ok=True)
with open(os.path.join(user_dir, "Default", "Preferences"), "w") as f:
    f.write(json.dumps(default_preferences))
async def run(playwright: Playwright) -> None:
    """Download the sample PDF through a persistent Chromium context.

    Launches a persistent context on ``user_dir``, navigates to the PDF URL
    while waiting for the download event, and saves the file as
    ``test_d.pdf`` in ``downloads_path``.

    Args:
        playwright: The running Playwright driver instance.

    Raises:
        playwright.async_api.Error: For any navigation failure other than the
            expected ``net::ERR_ABORTED``.
    """
    # Local import: the file's top-level import does not bring in Error.
    from playwright.async_api import Error as PlaywrightError

    context = await playwright.chromium.launch_persistent_context(
        user_dir, accept_downloads=True, headless=False, slow_mo=1000
    )
    try:
        context.set_default_timeout(10000)
        page = await context.new_page()
        file_name = "test_d.pdf"
        # Start waiting for the download
        async with page.expect_download() as download_info:
            try:
                await page.goto("https://www.africau.edu/images/default/sample.pdf", timeout=0)
            except PlaywrightError as exc:
                # The navigation is reported as net::ERR_ABORTED when the
                # response becomes a download; that case is expected here.
                # Anything else is a real failure and must not be hidden.
                if "net::ERR_ABORTED" not in str(exc):
                    raise
        print("Saving file to ", downloads_path, file_name)
        # Wait for the download to start
        download = await download_info.value
        # Wait for the download process to complete
        print(await download.path())
        # Save downloaded file somewhere
        await download.save_as(os.path.join(downloads_path, file_name))
        await page.wait_for_timeout(200)
    finally:
        # Always release the browser window and profile, even on failure.
        await context.close()
async def main() -> None:
    """Run the download routine inside a managed Playwright session."""
    async with async_playwright() as driver:
        await run(driver)


asyncio.run(main())