So I messing around with BeautifulSoup. I wrote some code and, with your permision past it here. With the following question - Is there any way use multithreading or multiprocessing to speed it up? Bet this code is far from the ideal :) Should Pool be used for such ocasions?
ps. I took this website as an example.
Thank you in advance.
import requests
from bs4 import BeautifulSoup
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
pages = [str(i) for i in range(100,2000)]
for page in pages:
html = requests.get('https://statesassembly.gov.je/Pages/Members.aspxMemberID='+page).text
def get_page_data():
soup = BeautifulSoup(html, 'lxml')
name = soup.find('h1').text
title = soup.find(class_='gel-layout__item gel-2/3@m gel-1/1@s').find('h2').text
data = {'name': name,
'title': title,
}
return (data)
data = get_page_data()
with open('Members.csv','a') as output_file:
writer = csv.writer(output_file, delimiter=';')
writer.writerow((data['name'],
data['title'],
))
brute force a government website can be an illegal in some countries. please make sure you read copyright laws of your country and the country you are fetching data from.
first of all please divide your list into parts after that make threads of it to parallel execute them.
import threading
import os
def task1():
print("Task 1 assigned to thread: {}".format(threading.current_thread().name))
print("ID of process running task 1: {}".format(os.getpid()))
def task2():
print("Task 2 assigned to thread: {}".format(threading.current_thread().name))
print("ID of process running task 2: {}".format(os.getpid()))
if __name__ == "__main__":
# print ID of current process
print("ID of process running main program: {}".format(os.getpid()))
# print name of main thread
print("Main thread name: {}".format(threading.main_thread().name))
# creating threads
t1 = threading.Thread(target=task1, name='t1')
t2 = threading.Thread(target=task2, name='t2')
# starting threads
t1.start()
t2.start()
# wait until all threads finish
t1.join()
t2.join()