python, python-3.x, web-scraping, request, urllib

How to download PDF files from URLs leading to sub-URLs using Python


I am trying to download all PDF files from the links on the following URLs:

https://www.adb.org/projects/documents/country/ban/year/2020?terms=education
https://www.adb.org/projects/documents/country/ban/year/2019?terms=education
https://www.adb.org/projects/documents/country/ban/year/2018?terms=education

Each of these URLs contains a list of links, and each of those links leads to a sub-page that contains the PDF files. The lists on the main URLs are the search results for a country, a year, and a search term.

I have tried the following code, changing it in different ways, but it does not seem to work. Any help would be appreciated. Thanks.

import os
import time
from glob import glob 
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
 
url = ["https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
      "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
      "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"]

folder = glob("J:/pdfs/*/")

for i, folder_location in zip(url, folder):
    time.sleep(1)
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")
    for link in soup.select("[href$='.pdf']"):

        filename = os.path.join(folder_location,link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(i,link['href'])).content)


Solution

  • Try this. It will save the files into a pdfs folder in the current working directory.

    import os
    from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
    
    class MySpider(Spider):
        name = 'download_pdf'
        allowed_domains = ["www.adb.org"]
        start_urls = [
            "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
            "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
            "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
        ]  # Entry page
    
        def __init__(self):
            Spider.__init__(self, self.name)  #necessary
            if (not os.path.exists('./pdfs')):
                os.mkdir('./pdfs')
    
        def afterResponse(self, response, url, error=None, extra=None):
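            # Save PDF responses into ./pdfs; anything else is handed back to the framework so crawling continues.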
            try:
                path = './pdfs' + url[url.rindex('/'):]
                index = path.find('?')
                if index > 0: path = path[:index]
                flag = utils.saveResponseAsFile(response, path, fileType="pdf")
                if flag:
                    return None
                else:  # If it's not a PDF, leave it to the framework
                    return Spider.afterResponse(self, response, url, error)
            except Exception as err:
                print(err)
    
        def extract(self, url, html, models, modelNames):
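            # Collect the document links from the listing page; the links returned under "Urls" are queued for the next requests.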
            doc = SimplifiedDoc(html)
            lst = doc.selects('div.list >a').contains("documents/", attr="href")
            if not lst:
                lst = doc.selects('div.hidden-md hidden-lg >a')
            urls = []
            for a in lst:
                a["url"] = utils.absoluteUrl(url.url, a["href"])
                urls.append(a)
    
            return {"Urls": urls}
    
    
    SimplifiedMain.startThread(MySpider())  # Start download
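
    Note that this needs the simplified_scrapy package, which can be installed with pip (pip install simplified_scrapy).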
    

    With the version below, the PDFs from each URL are downloaded into a separate folder for each year.

    from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
    
    class MySpider(Spider):
        name = 'download_pdf'
        allowed_domains = ["www.adb.org"]
        start_urls = [
            "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
            "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
            "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
        ]  # Entry page
    
        def afterResponse(self, response, url, error=None, extra=None):
            if not extra:
                print ("The version of library simplified_scrapy is too old, please update.")
                SimplifiedMain.setRunFlag(False)
                return
            try:
                path = './pdfs'
                # create folder start
                srcUrl = extra.get('srcUrl')
                if srcUrl:
                    index = srcUrl.find('year/')
                    year = ''
                    if index > 0:
                        year = srcUrl[index + 5:]
                        index = year.find('?')
                        if index>0:
                            path = path + year[:index]
                            utils.createDir(path)
                # create folder end
    
                path = path + url[url.rindex('/'):]
                index = path.find('?')
                if index > 0: path = path[:index]
                flag = utils.saveResponseAsFile(response, path, fileType="pdf")
                if flag:
                    return None
                else:  # If it's not a PDF, leave it to the framework
                    return Spider.afterResponse(self, response, url, error, extra)
            except Exception as err:
                print(err)
    
        def extract(self, url, html, models, modelNames):
            doc = SimplifiedDoc(html)
            lst = doc.selects('div.list >a').contains("documents/", attr="href")
            if not lst:
                lst = doc.selects('div.hidden-md hidden-lg >a')
            urls = []
            for a in lst:
                a["url"] = utils.absoluteUrl(url.url, a["href"])
                # Set root url start
                a["srcUrl"] = url.get('srcUrl')
                if not a['srcUrl']:
                    a["srcUrl"] = url.url
                # Set root url end
                urls.append(a)
    
            return {"Urls": urls}
    
        # Reset the stored URLs so the same pages can be downloaded again; call this when you want to re-run the download.
        def resetUrl(self):
            Spider.clearUrl(self)
            Spider.resetUrlsTest(self)
    
    SimplifiedMain.startThread(MySpider())  # Start download
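
  • If you would rather stay with requests and BeautifulSoup from your original attempt, the key change is to add a second request level: the listing pages only link to the document sub-pages, and the direct .pdf links sit on those sub-pages. A rough sketch along those lines is below (the div.list / documents/ selector mirrors the one used above and is an assumption about the site's markup; error handling is left out).

    import os
    import time
    import requests
    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education",
    ]

    os.makedirs("pdfs", exist_ok=True)

    for page_url in urls:
        listing = BeautifulSoup(requests.get(page_url).text, "lxml")
        # The listing page links to document sub-pages, not to the PDFs themselves.
        for doc_link in listing.select("div.list a[href*='documents/']"):
            doc_url = urljoin(page_url, doc_link["href"])
            doc_page = BeautifulSoup(requests.get(doc_url).text, "lxml")
            # The direct .pdf links only appear on the document sub-page.
            for pdf_link in doc_page.select("a[href$='.pdf']"):
                pdf_url = urljoin(doc_url, pdf_link["href"])
                filename = os.path.join("pdfs", pdf_url.split("/")[-1].split("?")[0])
                with open(filename, "wb") as f:
                    f.write(requests.get(pdf_url).content)
            time.sleep(1)  # be polite between document pages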