I am currently working on a web crawler where I want to call the correct scraper class for a given URL, based on its domain.
So far I have created:
import sys
import tldextract
import requests

class Scraper:
    scrapers = {}

    def __init_subclass__(scraper_class):
        Scraper.scrapers[scraper_class.url] = scraper_class

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        # return Scraper.scrapers[k.domain]()
        # or
        return cls.scrapers[k.domain]()

class BBCScraper(Scraper):
    url = 'bbc.co.uk'

    def scrape(s):
        print(s)
        # FIXME Scrape the correct values for BBC
        return "Scraped BBC News"

url = 'https://www.bbc.co.uk/'
scraper = Scraper.for_url(url)
scraper.scrape(requests.get(url))
What I am trying to do is: if bbc is the domain name, it should go into the class BBCScraper(Scraper), and since we call scraper.scrape(requests.get(url)), it should scrape the web elements inside BBCScraper -> scrape -> return web elements.
However, when I try to run this script it prints:

return cls.scrapers[k.domain]()
KeyError: 'bbc'

I wonder how I can call the correct class depending on the domain that is given to the for_url classmethod.
The problem is that k.domain returns bbc, while you registered the class under url = 'bbc.co.uk', so the dictionary lookup never finds a match. Use one of these combinations:

- url = 'bbc.co.uk' along with k.registered_domain
- url = 'bbc' along with k.domain

You should also add a parameter to the scrape method so it receives the response.
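To make the mismatch concrete, this is what tldextract gives you for the URL in the question (a quick sketch you can run in a REPL):

import tldextract

k = tldextract.extract('https://www.bbc.co.uk/')
print(k.domain)             # bbc        <- key the original lookup used
print(k.registered_domain)  # bbc.co.uk  <- matches the registered key 'bbc.co.uk'

With that fixed, the full example looks like this: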
from abc import abstractmethod

import requests
import tldextract

class Scraper:
    scrapers = {}

    def __init_subclass__(scraper_class):
        # Every subclass registers itself under its `url` class attribute
        Scraper.scrapers[scraper_class.url] = scraper_class

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        return cls.scrapers[k.registered_domain]()

    @abstractmethod
    def scrape(self, content: requests.Response):
        pass

class BBCScraper(Scraper):
    url = 'bbc.co.uk'

    def scrape(self, content: requests.Response):
        return "Scraped BBC News"

if __name__ == "__main__":
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(url)
    r = scraper.scrape(requests.get(url))
    print(r)  # Scraped BBC News
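A nice property of this registry pattern is that supporting another site only means defining one more subclass; nothing else changes. As a sketch (GuardianScraper and its return value are hypothetical, not part of the question):

class GuardianScraper(Scraper):
    url = 'theguardian.com'  # registered_domain of https://www.theguardian.com/

    def scrape(self, content: requests.Response):
        # Real parsing of content.text would go here
        return "Scraped The Guardian"

# Scraper.for_url('https://www.theguardian.com/uk') now returns a GuardianScraper instance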
I'd also suggest storing the url in an attribute and doing the requests.get inside scrape, so there is less code in the main block:
from abc import abstractmethod

import requests
import tldextract

class Scraper:
    scrapers = {}

    def __init_subclass__(scraper_class):
        Scraper.scrapers[scraper_class.domain] = scraper_class

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        return cls.scrapers[k.registered_domain](url)

    @abstractmethod
    def scrape(self):
        pass

class BBCScraper(Scraper):
    domain = 'bbc.co.uk'

    def __init__(self, url):
        self.url = url

    def scrape(self):
        rep: requests.Response = requests.get(self.url)
        content = rep.text  # ALL HTML CONTENT
        return "Scraped BBC News" + content[:20]

if __name__ == "__main__":
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(url)
    r = scraper.scrape()
    print(r)  # Scraped BBC News<!DOCTYPE html><html
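Since the original failure was a bare KeyError, one optional refinement (my own suggestion, not part of the code above) is to make for_url fail with a clearer message when no scraper is registered for a domain:

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        try:
            scraper_class = cls.scrapers[k.registered_domain]
        except KeyError:
            raise ValueError(f"No scraper registered for {k.registered_domain!r}") from None
        return scraper_class(url)

This is a drop-in replacement for the for_url classmethod in the last version of Scraper.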