python selenium-webdriver web-scraping scrapy

Download embedded PDF on website using Selenium/Scrapy/Python

There are some pages like this one from which I want to download the embedded PDF file, but after researching and testing various solutions proposed on SO and other sources, I couldn't find one that works. When I inspect the page, I see that the PDF file is served from a blob URL, and I can't save it from this URL (although I can open it in the browser). Is there a way to use Selenium, Scrapy or any other Python library to download this PDF file?

Solution

If you open devtools in your browser, go to the Network tab and load the page, you can see how the PDF file is loaded: it is returned Base64-encoded.

If we search for it in the page source, we can find the JavaScript code that requests it. Let's look at the relevant part:

// When the hidden hdnTokenB3 field already holds a value, request the PDF immediately.
if (document.getElementById('hdnTokenB3').value != '') {
    // Only the first part of the payload is built here: the object literal is left
    // unclosed, so buscarPDF presumably appends the remaining fields and the closing brace.
    dataValue = "{ codigoInstituicao: '" + codInstituicao + "', numeroProtocolo: '" + numeroProtocolo + "'";
    // Arguments: payload prefix, token value, and an empty string as third argument.
    buscarPDF(dataValue, document.getElementById('hdnTokenB3').value, '');
    return;
}

The id is obtained by calling the function getParameterByName, which reads a query-string parameter from the page URL:

// Look for an "ID" parameter first.
var id = getParameterByName("ID");

// A non-empty "ID" is used as the protocol number with institution code 2.
if (id != null && id.length > 0) {
    var numeroProtocolo = id;
    var codInstituicao = 2;
}
else 
{
    // Otherwise fall back to "NumeroProtocoloEntrega" with institution code 1.
    var numeroProtocolo = getParameterByName("NumeroProtocoloEntrega");
    var codInstituicao = 1;
}

We can rewrite that function in Python, and then we just need to recreate the request the page makes.

import scrapy
import re
import base64
import logging
import os
from urllib.parse import unquote


class ExampleSpider(scrapy.Spider):
    """Download the PDF embedded in a CVM ``frmExibirArquivoIPEExterno`` page.

    The spider loads the viewer page, rebuilds the payload that the page's
    JavaScript sends to the ``ExibirPDF`` endpoint, and writes the
    Base64-encoded PDF from the JSON response into ``base_dir``.
    """

    name = "example_spider"
    start_urls = ['https://www.rad.cvm.gov.br/ENET/frmExibirArquivoIPEExterno.aspx?NumeroProtocoloEntrega=1106753']
    # Directory the downloaded PDFs are written to (created on demand).
    base_dir = './pdf_downloads'

    def parse(self, response):
        """Build and yield the POST request that returns the PDF.

        Args:
            response: The viewer page; its URL carries the protocol number
                and its hidden inputs carry the token / captcha settings.

        Yields:
            A ``scrapy.Request`` to the ``ExibirPDF`` endpoint, handled by
            :meth:`download_pdf`. Nothing is yielded when the URL carries no
            protocol number.
        """
        # Mirror the page's JavaScript: "ID" wins (institution code 2),
        # otherwise fall back to "NumeroProtocoloEntrega" (institution code 1).
        protocol_num = self.get_parameter_by_name("ID", response.url)
        if protocol_num:
            institution_code = 2
        else:
            protocol_num = self.get_parameter_by_name("NumeroProtocoloEntrega", response.url)
            institution_code = 1

        # Without a protocol number the request would be sent with the literal
        # string 'None' and the result saved as "None.pdf"; stop early instead.
        if not protocol_num:
            self.log(f"No protocol number found in {response.url}", logging.ERROR)
            return

        token = response.xpath('//*[@id="hdnTokenB3"]/@value').get(default='')
        captcha_enabled = response.xpath('//*[@id="hdnHabilitaCaptcha"]/@value').get(default='') == 'S'
        captcha_version = 'V3' if captcha_enabled and not token else ''

        # The endpoint expects this JavaScript-object-style body (unquoted keys,
        # single-quoted values), so it is assembled by hand rather than via json.
        payload = (
            f"{{ codigoInstituicao: '{institution_code}', numeroProtocolo: '{protocol_num}'"
            f", token: '{token}', versaoCaptcha: '{captcha_version}'}}"
        )

        url = 'https://www.rad.cvm.gov.br/ENET/frmExibirArquivoIPEExterno.aspx/ExibirPDF'
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            # NOTE(review): advertising "br" presumably requires a Brotli
            # package so Scrapy can decode the response - confirm.
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=utf-8",
            "DNT": "1",
            "Host": "www.rad.cvm.gov.br",
            "Origin": "https://www.rad.cvm.gov.br",
            "Pragma": "no-cache",
            # Use the page that was actually loaded, so the Referer is also
            # correct when the protocol number came from the "ID" parameter.
            "Referer": response.url,
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
            "X-Requested-With": "XMLHttpRequest",
        }

        yield scrapy.Request(
            url=url,
            headers=headers,
            body=payload,
            method='POST',
            callback=self.download_pdf,
            cb_kwargs={'protocol_num': protocol_num},
        )

    def download_pdf(self, response, protocol_num):
        """Decode the Base64 PDF from the JSON response and save it to disk.

        Args:
            response: Response of the ``ExibirPDF`` request; the PDF is read
                from the ``d`` key of its JSON body.
            protocol_num: Protocol number, used as the file name stem.
        """
        try:
            json_data = response.json()
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            self.log("Couldn't download pdf: response is not valid JSON", logging.ERROR)
            return

        b64 = json_data.get('d') if isinstance(json_data, dict) else None
        if not b64:
            self.log("Couldn't download pdf", logging.ERROR)
            return

        try:
            pdf = base64.b64decode(b64)
        except (ValueError, TypeError):
            # binascii.Error (bad padding / alphabet) is a ValueError subclass.
            self.log("Couldn't download pdf: payload is not valid base64", logging.ERROR)
            return

        filename = f'{protocol_num}.pdf'
        # makedirs(exist_ok=True) is race-free and also creates nested paths.
        os.makedirs(self.base_dir, exist_ok=True)

        with open(os.path.join(self.base_dir, filename), 'wb') as f:
            f.write(pdf)

        self.log(f"Saved {filename} in {self.base_dir}")

    @staticmethod
    def get_parameter_by_name(name, url):
        """Return the value of query parameter *name* in *url*.

        Args:
            name: Parameter name; matched literally.
            url: URL to search.

        Returns:
            ``None`` when the parameter is absent, ``''`` when it is present
            without a value, otherwise the percent-decoded value.
        """
        # re.escape makes every metacharacter in the name literal, not only
        # the square brackets.
        pattern = r"[?&]" + re.escape(name) + r"(=([^&#]*)|&|#|$)"
        match = re.search(pattern, url)
        if match is None:
            return None

        # Group 2 is None when the parameter has no "=value" part.
        value = match[2]
        if not value:
            return ''

        return unquote(value)