I am currently writing my own small web scraper, and I am trying to write a function that starts a thread whenever a URL is added to a list and terminates that thread when the URL is removed. This is what I have created so far:
import concurrent.futures
import time
import random
import requests
class WebScraper:
    """Minimal polling scraper: repeatedly fetches a single URL on a pooled session."""

    def __init__(self):
        # One Session per scraper so TCP connections are reused across polls.
        self.session = requests.Session()

    def run(self, url: str, stop_event=None):
        """Poll *url* forever, or until *stop_event* is set.

        stop_event -- optional threading.Event-like object. Passing one lets
        the thread's owner request a clean shutdown; cooperatively checking a
        flag like this is the only safe way to "terminate" a Python thread.
        Omitting it preserves the original run-forever behaviour.
        """
        while stop_event is None or not stop_event.is_set():
            response = self.do_request(url)
            if response.status_code != 200:
                # Back off instead of re-requesting in a tight loop, which
                # would hammer the server on persistent errors.
                time.sleep(5)
                continue
            data = self.scrape_data(response)
            ...
            # NOTE(review): 500 seconds (~8 min) between successful polls —
            # confirm this was not meant to be 0.5 s.
            time.sleep(500)

    def do_request(self, url):
        """GET *url* and return the raw Response; status handling is the caller's job."""
        return self.session.get(url)

    def scrape_data(self, response):
        # TODO: Implement your web scraping logic here
        return {}
if __name__ == '__main__':
    URLS_TO_TEST = [
        "http://books.toscrape.com/catalogue/category/books/travel_2/index.html",
        "http://books.toscrape.com/catalogue/category/books/mystery_3/index.html",
        "http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html",
        "http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html",
        "http://books.toscrape.com/catalogue/category/books/classics_6/index.html",
    ]
    # NOTE(review): submitting run() per URL starts the workers, but mutating
    # URLS_TO_TEST afterwards does NOT affect them -- run() loops forever and
    # never re-reads the list. Stopping a worker requires a per-URL stop
    # signal (e.g. threading.Event) that run() checks cooperatively.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Keep the futures keyed by URL so they are not silently discarded.
        futures = {}
        for url in URLS_TO_TEST:
            scraper = WebScraper()
            futures[url] = executor.submit(scraper.run, url)
        time.sleep(random.randint(10, 20))
        # The removed url should also terminate the thread
        URLS_TO_TEST.pop(random.randint(0, len(URLS_TO_TEST) - 1))
        time.sleep(random.randint(10, 20))
        # The added url should also start a new thread
        URLS_TO_TEST.append('http://books.toscrape.com/catalogue/category/books/health_47/index.html')
My problem is that I am not sure whether it's possible to terminate a running thread whenever its URL has been removed from the list in the main block, and likewise to start a new thread when a URL is added. Is this possible using threading?
The idea is to later populate URLS_TO_TEST from a database instead of a static list, so it will be dynamic, but that is for later.
Expected:
I want the thread to be terminated if its URL is removed from the list, and I want a new thread to start whenever a new URL appears in the list.
It can be achieved using the Observer pattern (see "Python Observer Pattern: Examples, Tips").
Or you can create a class of URLs. Something like this:
import threading
import concurrent.futures
class Url:
    """Registry of URLs, each scraped by its own dedicated worker thread.

    All state is class-level so the ``Url.append_url(url)`` call style from
    the question keeps working. A per-URL threading.Event is used as the
    stop signal: Python threads cannot be killed from outside (there is no
    Thread.exit()); the worker must check the event cooperatively.
    """

    URLs = []
    _registry_lock = threading.Lock()   # guards the three structures below
    _threads = {}                       # url -> threading.Thread
    _stop_events = {}                   # url -> threading.Event

    @classmethod
    def append_url(cls, url):
        """Register *url* and start its scraper thread (no-op if already present)."""
        with cls._registry_lock:
            if url in cls._threads:
                return
            cls.URLs.append(url)
            stop = threading.Event()
            cls._stop_events[url] = stop
            # daemon=True so lingering workers cannot block interpreter exit.
            # NOTE(review): for the stop signal to actually work, run() must
            # accept and poll this event; do NOT join() here -- run() loops
            # until stopped, so joining would block append_url forever.
            worker = threading.Thread(target=WebScraper().run, args=(url,), daemon=True)
            cls._threads[url] = worker
            worker.start()

    @classmethod
    def delete_url(cls, url):
        """Signal *url*'s worker to stop and drop it from the registry.

        Safe to call for unknown URLs (does nothing).
        """
        with cls._registry_lock:
            stop = cls._stop_events.pop(url, None)
            if stop is not None:
                stop.set()  # cooperative shutdown request; thread exits on next check
            cls._threads.pop(url, None)
            if url in cls.URLs:
                cls.URLs.remove(url)

    @classmethod
    def append_list(cls, list_urls):
        """Register every URL in *list_urls* concurrently; return url -> Future."""
        # max(1, ...) because ThreadPoolExecutor rejects max_workers=0.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, len(list_urls))) as executor:
            return {url: executor.submit(cls.append_url, url) for url in list_urls}