Search code examples
pythonselenium-webdriverweb-scrapingscrapy

Download embedded PDF on website using Selenium/Scrapy/Python


There are some pages, like this one, from which I want to download the embedded PDF file, but after researching and testing various solutions proposed on SO and other sources, I couldn't find one that works. When I inspect the page, I see that the PDF file is served from a blob URL, and I can't save it from this URL (although I can open it in the browser). Is there a way to use Selenium, Scrapy, or any other Python library to download this PDF file?


Solution

  • If you open devtools in your browser, go to the Network tab, and load the page, you can see how the PDF file is loaded, and you can see that it is Base64-encoded.

    devtools

    If we search for it in the source we can find the JavaScript code, let's look at the relevant part:

    if (document.getElementById('hdnTokenB3').value != '') {
        dataValue = "{ codigoInstituicao: '" + codInstituicao + "', numeroProtocolo: '" + numeroProtocolo + "'";
        buscarPDF(dataValue, document.getElementById('hdnTokenB3').value, '');
        return;
    }
    

    The id is obtained by calling the function getParameterByName, and it decides which protocol number and institution code are used:

    var id = getParameterByName("ID");
    
    if (id != null && id.length > 0) {
        var numeroProtocolo = id;
        var codInstituicao = 2;
    }
    else 
    {
        var numeroProtocolo = getParameterByName("NumeroProtocoloEntrega");
        var codInstituicao = 1;
    }
    

    We can rewrite getParameterByName in Python, and then we just need to recreate the request that fetches the PDF.

    import scrapy
    import re
    import base64
    import logging
    import os
    from urllib.parse import unquote
    
    
    class ExampleSpider(scrapy.Spider):
        """Download the PDF embedded in a CVM document-viewer page.

        The page loads its PDF by POSTing to the ``ExibirPDF`` endpoint and
        reading a Base64 string from the ``d`` key of the JSON reply. This
        spider rebuilds that request from the page URL and hidden form fields,
        decodes the reply, and writes it to ``base_dir`` as
        ``<protocol number>.pdf``.
        """

        name = "example_spider"
        start_urls = ['https://www.rad.cvm.gov.br/ENET/frmExibirArquivoIPEExterno.aspx?NumeroProtocoloEntrega=1106753']
        # Directory the decoded PDFs are written to; created on demand.
        base_dir = './pdf_downloads'

        def parse(self, response):
            """Build and yield the POST request that returns the PDF.

            Args:
                response: The viewer page. Its URL supplies the protocol
                    number and its hidden inputs supply the token and captcha
                    flag.

            Yields:
                A ``scrapy.Request`` to the ``ExibirPDF`` endpoint, handled by
                ``download_pdf``. Nothing is yielded when the URL carries no
                protocol number.
            """
            # Mirrors the page's JavaScript: a non-empty "ID" parameter selects
            # institution 2, otherwise "NumeroProtocoloEntrega" and institution 1.
            id_ = self.get_parameter_by_name("ID", response.url)

            if id_:
                numeroProtocolo = id_
                codInstituicao = 2
            else:
                numeroProtocolo = self.get_parameter_by_name("NumeroProtocoloEntrega", response.url)
                codInstituicao = 1

            # Without a protocol number the request would carry the literal
            # string 'None' and the result would be saved as "None.pdf".
            if not numeroProtocolo:
                self.log(f"No protocol number found in {response.url}", logging.ERROR)
                return

            token = response.xpath('//*[@id="hdnTokenB3"]/@value').get(default='')

            # The captcha version is only sent when the captcha is enabled and
            # no token is available.
            versaoCaptcha = ''
            if response.xpath('//*[@id="hdnHabilitaCaptcha"]/@value').get(default='') == 'S':
                if not token:
                    versaoCaptcha = 'V3'

            # The body is the single-quoted, unquoted-key object literal that
            # the page's own script sends, so it is assembled as text instead
            # of with json.dumps (which would emit different bytes).
            payload = (
                "{ codigoInstituicao: '" + str(codInstituicao)
                + "', numeroProtocolo: '" + str(numeroProtocolo) + "'"
                + ", token: '" + token
                + "', versaoCaptcha: '" + versaoCaptcha + "'}"
            )

            url = 'https://www.rad.cvm.gov.br/ENET/frmExibirArquivoIPEExterno.aspx/ExibirPDF'
            headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.5",
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "application/json; charset=utf-8",
                "DNT": "1",
                "Host": "www.rad.cvm.gov.br",
                "Origin": "https://www.rad.cvm.gov.br",
                "Pragma": "no-cache",
                # Use the page actually fetched, so the header is also right
                # for URLs that identify the document through "ID=".
                "Referer": response.url,
                "Sec-Fetch-Dest": "empty",
                "Sec-Fetch-Mode": "cors",
                "Sec-Fetch-Site": "same-origin",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
                "X-Requested-With": "XMLHttpRequest",
            }

            yield scrapy.Request(
                url=url,
                headers=headers,
                body=payload,
                method='POST',
                callback=self.download_pdf,
                cb_kwargs={'protocol_num': numeroProtocolo},
            )

        def download_pdf(self, response, protocol_num):
            """Decode the Base64 PDF in the JSON reply and write it to disk.

            Args:
                response: Reply from the ``ExibirPDF`` endpoint; the PDF is
                    read from the ``d`` key of its JSON body.
                protocol_num: Protocol number, used as the file name stem.
            """
            try:
                json_data = response.json()
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                self.log("Couldn't download pdf: response is not valid JSON", logging.ERROR)
                return

            b64 = json_data.get('d') if isinstance(json_data, dict) else None
            if not b64:
                self.log("Couldn't download pdf", logging.ERROR)
                return

            try:
                pdf = base64.b64decode(b64)
            except (ValueError, TypeError):
                # binascii.Error is a ValueError subclass; TypeError covers a
                # non-string payload.
                self.log("Couldn't download pdf: payload is not valid Base64", logging.ERROR)
                return

            filename = f'{protocol_num}.pdf'
            p = os.path.join(self.base_dir, filename)

            # exist_ok avoids the check-then-create race and, unlike os.mkdir,
            # also creates missing parent directories.
            os.makedirs(self.base_dir, exist_ok=True)

            with open(p, 'wb') as f:
                f.write(pdf)

            self.log(f"Saved {filename} in {self.base_dir}")

        @staticmethod
        def get_parameter_by_name(name, url):
            """Return the value of query parameter ``name`` in ``url``.

            Args:
                name: Parameter name; matched literally.
                url: URL to search.

            Returns:
                ``None`` when the parameter is absent, ``''`` when it is
                present without a value, otherwise the percent-decoded value
                with ``+`` treated as a space.
            """
            # re.escape covers every regex metacharacter, not only brackets.
            pattern = r"[?&]" + re.escape(name) + r"(=([^&#]*)|&|#|$)"

            results = re.search(pattern, url)
            if not results:
                return None
            # Group 2 is None when the name has no "=value" part and '' when
            # the value is empty; both mean "present but empty".
            if not results[2]:
                return ''

            # In a query string "+" encodes a space, which unquote() alone
            # leaves untouched.
            return unquote(results[2].replace('+', ' '))