I recently wrote a two-level scraper for the Hong Kong election website, and it worked out well. The code lets me retrieve candidates' platform information one district at a time. The code is below:
from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin


class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls(self, response):
        raw_tree = etree.HTML(response)
        # grab the platform links in columns 4 and 6 of the candidates table
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href'
                                       '|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        # each href is relative, e.g. "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath('//*[@id="main"]/p[2]/span[2]/text()')
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath('//*[@id="main"]/p[5]/span[2]/text()')
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath('//*[@id="main"]/p[8]/span/text()')
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def save_information(self, raw_json):
        with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:  # empty file: write the header row first
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        platform_urls = self.extract_info_urls(response)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
Nonetheless, as I wanted to advance my skills, I tried to write a three-level scraper instead, so that I could scrape all the politicians' platforms across the 18 districts in a single run.
class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        if r.text:
            html_result = r.text
            print('get result la')
            return html_result
        else:
            print('get result fail la')
            return ''

    def extract_info_urls_district(self, response):
        raw_tree = etree.HTML(response)
        district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
        scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
        # pdf_url = "../eng/intro_to_can/A.html"
        district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
        return district_urls

    def extract_info_urls_platform(self, district_urls):
        raw_tree = etree.HTML(district_urls)
        platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href'
                                       '|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
        scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
        # pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
        platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
        return platform_urls

    def extract_info(self, platform_urls):
        raw_tree = etree.HTML(platform_urls)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath('//*[@id="main"]/p[2]/span[2]/text()')
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath('//*[@id="main"]/p[5]/span[2]/text()')
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath('//*[@id="main"]/p[8]/span/text()')
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def save_information(self, raw_json):
        with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
            csv_writer = csv.DictWriter(out_f, raw_json.keys())
            if out_f.tell() == 0:
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

    def run(self):
        response = self.send_request(self.url)
        district_urls = self.extract_info_urls_district(response)
        platform_urls = self.extract_info_urls_platform(district_urls)
        for url in platform_urls:
            info_response = self.send_request(url)
            raw_json = self.extract_info(info_response)
            raw_json['platform_url'] = url
            self.save_information(raw_json)


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
But it failed with the traceback below, and I can't work out what I did wrong.
Full traceback:
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1477, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
    runner.run()
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
    platform_urls = self.extract_info_urls_platform(district_urls)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
    raw_tree = etree.HTML(district_urls)
  File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings
I appreciate your help and time - looking forward to learning from this amazing community!
You were trying to parse the content with the lxml parser without ever sending the requests. In run(), district_urls is a Python list of URLs, and you pass it straight into extract_info_urls_platform(), which hands it to etree.HTML(); lxml can only parse strings, hence the ValueError. I've also made some changes to your XPaths, which weren't strictly necessary, and used generators to keep things efficient. Make sure to add your save_information method back into the script, as I had to take it out to see what was happening. First, a quick standalone snippet (mine, not from your code, with a made-up URL) showing the type mismatch:
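from lxml import etree

urls = ['https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html']

# etree.HTML() expects a string (or bytes) of markup, never a list of URLs
try:
    etree.HTML(urls)  # what run() effectively did
except ValueError as e:
    print(e)  # -> can only parse strings

# parsing an actual HTML string is fine
tree = etree.HTML('<html><body><p>hello</p></body></html>')
print(tree.xpath('//p/text()'))  # -> ['hello']

And here is the reworked script: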
import csv
import time
import random
import requests
from lxml import etree
from typing import List
from urllib.parse import urljoin


class hongkongelection(object):
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

    def send_request(self, url):
        r = requests.get(url)
        r.raise_for_status()  # fail loudly on HTTP errors instead of returning ''
        return r.text

    def extract_info_urls_district(self, url):
        # level 1: fetch the index page, yield each district page URL
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
            yield urljoin(url, pdf_url)

    def extract_info_urls_platform(self, url):
        # level 2: fetch a district page, yield each candidate's "Text" platform URL
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]'
                                      '//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
            yield urljoin(url, pdf_url)

    def extract_info(self, url):
        # level 3: fetch a platform page and pull out the fields
        res = self.send_request(url)
        raw_tree = etree.HTML(res)
        dict_result = {}
        dict_result['namelist'] = raw_tree.xpath('//*[@id="main"]/p[2]/span[2]/text()')
        dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
        dict_result['partylist'] = raw_tree.xpath('//*[@id="main"]/p[5]/span[2]/text()')
        dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
        dict_result['message_list'] = raw_tree.xpath('//*[@id="main"]/p[8]/span/text()')
        dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
        return dict_result

    def run(self):
        for district_url in self.extract_info_urls_district(self.url):
            for url in self.extract_info_urls_platform(district_url):
                raw_json = self.extract_info(url)
                raw_json['platform_url'] = url
                print(raw_json)
                time.sleep(random.randint(3, 8))  # be polite to the server


if __name__ == '__main__':
    runner = hongkongelection()
    runner.run()
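By the way, when you put save_information() back into the class, consider opening the file with newline='' so the csv module doesn't write blank rows on Windows. A minimal sketch based on your original method (the newline argument and the comments are my additions):

    def save_information(self, raw_json):
        # newline='' stops csv from inserting blank rows on Windows
        with open('platform.csv', 'a+', encoding='UTF-8', newline='') as out_f:
            csv_writer = csv.DictWriter(out_f, fieldnames=raw_json.keys())
            if out_f.tell() == 0:  # empty file: write the header once
                csv_writer.writeheader()
            csv_writer.writerow(raw_json)

Then call self.save_information(raw_json) in run(), in place of (or in addition to) the print(raw_json).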