I want to reduce the time it takes my code to finish scraping the pages; I'm currently using Selenium. I originally used Scrapy for this scraping project, but JavaScript was hiding the email elements from it. Apart from that, Scrapy was perfect. Is there a way to reduce the time in Selenium, or is there another approach, tool, or package to use in a case like this?
If there's any information or documentation I can read to learn more about it, I'd be thankful.
Here's the code:
import scrapy
import logging


def decode_email_protection(encoded_string):
    if encoded_string:
        encoded_data = encoded_string.split('#')[-1]
        r = int(encoded_data[:2], 16)
        email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(2, len(encoded_data), 2)])
        encoded_data = email.split('#')[-1]
        r = int(encoded_data[4:6], 16)
        encoded_data = encoded_data[:4] + encoded_data[6:]
        email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(0, len(encoded_data), 2)])
        return email
    else:
        return None


class ExampleSpider(scrapy.Spider):
    name = "example_spider"
    allowed_domains = ["mdpi.com"]
    base_url = "https://www.mdpi.com/search?q=biomaterials"

    def start_requests(self):
        yield scrapy.Request(url=self.base_url, callback=self.parse)

    def parse(self, response):
        article_hrefs = response.xpath("//a[@class='title-link']/@href").getall()
        for href in article_hrefs:
            yield response.follow(url=href, callback=self.parse_page)
        next_page_link = response.xpath("//span[contains(@class,'pageSelect')]/a[6]/@href").get()
        if next_page_link:
            yield response.follow(url=next_page_link, callback=self.parse)

    def parse_page(self, response):
        title = response.xpath("//h1[contains(@class,'title')]/text()").get(default="").strip()
        authors = response.xpath("//a[@class='profile-card-drop'][1]//text()").get(default="").strip()
        # authors = [i.strip() for i in authors]
        email_href = response.xpath("//a[contains(@class,'email')]/@href").get(default="")
        email = decode_email_protection(email_href)
        yield {
            "Title": title,
            "Link": response.url,
            "Authors": authors,
            "Email": email
        }
EDIT: This is the new version of the code, thanks to @SuperUser. The problem now is that Scrapy returns 15 articles per page when it should be 50. I used the Scrapy shell and saw that the article_hrefs XPath returns only 15 links. I tried to find a reason and searched around, but got nothing. I even thought the problem was in Scrapy itself, so I used Selenium alongside it, but I still got the same problem and didn't get all the articles on the page.
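Roughly, this is the check I mean in the Scrapy shell (the search URL here is shortened; the exact parameters are the ones in base_url below):

# In a terminal: scrapy shell "https://www.mdpi.com/search?q=biomaterials&page_count=50"
>>> len(response.xpath("//a[@class='title-link']/@href").getall())
15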
The email address is protected by CloudFlare's email protection script (and here it's actually double-encoded). I found the decoding script online, but it was written for a single-encoded string, so I had to modify it.
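For context, here's a minimal sketch of the standard single-layer scheme the decoder builds on (the helper names and the demo key 0x42 are made up): the first hex byte of the payload is an XOR key, and each following hex byte XOR-ed with that key yields one character of the address.

def encode_single(email, key=0x42):
    # Mimics CloudFlare's single-layer obfuscation: key byte first,
    # then each character XOR-ed with the key, all as two-digit hex.
    return f"{key:02x}" + "".join(f"{ord(c) ^ key:02x}" for c in email)

def decode_single(encoded):
    # Inverse of the above; this matches the first decoding pass in
    # decode_email_protection below (the second pass hides its key byte
    # at offset 4:6 instead of the front).
    r = int(encoded[:2], 16)
    return "".join(chr(int(encoded[i:i + 2], 16) ^ r) for i in range(2, len(encoded), 2))

assert decode_single(encode_single("user@example.com")) == "user@example.com"

The double-encoded variant simply runs this XOR pass twice, with the second key sitting at a different offset.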
Here's how you can scrape the website with Scrapy (no Selenium):
EDIT: added dynamic pagination to get all the items on a single page. The search page only renders the first 15 results in its initial HTML; the remaining items are pulled in by JavaScript via XHR calls to /search/set/default/pagination, which the spider now replays itself (the X-Requested-With header marks them as AJAX requests).
import scrapy
import logging


def decode_email_protection(encoded_string):
    # The href is empty when an article page has no email link, so bail out
    # early instead of letting int('', 16) raise a ValueError.
    if not encoded_string:
        return None
    # First pass: standard CloudFlare decoding (key byte, then XOR-ed chars).
    encoded_data = encoded_string.split('#')[-1]
    r = int(encoded_data[:2], 16)
    email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(2, len(encoded_data), 2)])
    # Second pass: same XOR scheme, but the key byte sits at offset 4:6.
    encoded_data = email.split('#')[-1]
    r = int(encoded_data[4:6], 16)
    encoded_data = encoded_data[:4] + encoded_data[6:]
    email = ''.join([chr(int(encoded_data[i:i + 2], 16) ^ r) for i in range(0, len(encoded_data), 2)])
    return email


class ExampleSpider(scrapy.Spider):
    name = "example_spider"
    allowed_domains = ["mdpi.com"]
    base_url = "https://www.mdpi.com/search?sort=pubdate&page_no={}&page_count=50&year_from=1996&year_to=2024&q=biomaterials&view=default"

    def start_requests(self):
        for page_no in range(218, 219):  # demo range: a single page; widen it to crawl more
            yield scrapy.Request(url=self.base_url.format(page_no), cb_kwargs={"page_no": page_no, "index": 0})

    def parse(self, response, page_no, index):
        self.log(f"Scraping page number: {page_no}", logging.INFO)
        article_hrefs = response.xpath("//a[@class='title-link']/@href").getall()
        for href in article_hrefs:
            yield response.follow(url=href, callback=self.parse_page)
        if index < 50 // 15:  # initial page renders 15 articles; 3 more AJAX batches complete the 50 items
            index += 1
            headers = {
                "X-Requested-With": "XMLHttpRequest",
                "Referer": self.base_url.format(page_no),
                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
            }
            yield scrapy.Request(url='https://www.mdpi.com/search/set/default/pagination', headers=headers, cb_kwargs={"page_no": page_no, "index": index}, dont_filter=True)

    def parse_page(self, response):
        self.log(f"Scraping article: {response.url}", logging.INFO)
        title = response.xpath("//h1[contains(@class,'title')]//text()").getall()
        title = "".join(i.strip() for i in title)
        authors = response.xpath("//a[@class='profile-card-drop']//text()").getall()
        authors = [i.strip() for i in authors]
        email_href = response.xpath("//a[contains(@class,'email')]/@href").get(default="")
        email = decode_email_protection(email_href)
        yield {
            "Title": title,
            "Link": response.url,
            "Authors": authors,
            "Email": email
        }
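If you want to try the spider without creating a full Scrapy project, a standalone runner along these lines should work (the feed file name is my own choice):

# Quick standalone runner (a sketch): put it in the same file as the
# spider above and execute the script; items are written to a JSON feed.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"results.json": {"format": "json"}},
})
process.crawl(ExampleSpider)
process.start()  # blocks until the crawl finishes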