Search code examples
pythonweb-scrapingxpathvalueerror

Web Scraper issue: can only parse strings


I have recently wrote a 2-level scraper on the hong kong election platform, and it worked out well. The code allows me to retrieve information on a district level basis. The code is here below:

from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin


class hongkongelection:
    def __init__(self):
        self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'

def send_request(self, url):
    r = requests.get(url)
    if r.text:
        html_result = r.text
        print('get result la')
        return html_result
    else:
        print('get result fail la')
        return ''

def extract_info_urls(self, response):
    raw_tree = etree.HTML(response)
    platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
    scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
    #self.pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
    platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
    return platform_urls

def extract_info(self, platform_urls):
    raw_tree = etree.HTML(platform_urls)
    dict_result = {}
    dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
    dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
    dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
    dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
    dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
    dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
    return dict_result


def save_information(self, raw_json):
    with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
        csv_writer =csv.DictWriter(out_f, raw_json.keys())
        if out_f.tell() == 0:
            csv_writer.writeheader()

        csv_writer.writerow(raw_json)

def run(self):
    response = self.send_request(self.url)
    platform_urls = self.extract_info_urls(response)
    for url in platform_urls:
        info_response = self.send_request(url)
        raw_json =self.extract_info(info_response)
        raw_json['platform_url'] = url
        self.save_information(raw_json)



if __name__ == '__main__' :
    runner = hongkongelection()
    runner.run()

Nonetheless, as I wanted advance my skills, I tried to do a 3-level scraper instead. I wanted to scrape all politicians' platforms in the 18 districts simultaneously.

class hongkongelection:
def __init__(self):
    self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'

def send_request(self, url):
    r = requests.get(url)
    if r.text:
        html_result = r.text
        print('get result la')
        return html_result
    else:
        print('get result fail la')
        return ''

def extract_info_urls_district(self, response):
    raw_tree = etree.HTML(response)
    district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
    scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
    #pdf_url = "../eng/intro_to_can/A.html"
    district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
    return district_urls

def extract_info_urls_platform(self, district_urls):
    raw_tree = etree.HTML(district_urls)
    platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
    scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
    #pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
    platform_urls: list[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
    return platform_urls

def extract_info(self, platform_urls):
    raw_tree = etree.HTML(platform_urls)
    dict_result = {}
    dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
    dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
    dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
    dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
    dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
    dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
    return dict_result


def save_information(self, raw_json):
    with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
        csv_writer =csv.DictWriter(out_f, raw_json.keys())
        if out_f.tell() == 0:
            csv_writer.writeheader()

        csv_writer.writerow(raw_json)

def run(self):
    response = self.send_request(self.url)
    district_urls = self.extract_info_urls_district(response)
    platform_urls = self.extract_info_urls_platform(district_urls)
    for url in platform_urls:
        info_response = self.send_request(url)
        raw_json =self.extract_info(info_response)
        raw_json['platform_url'] = url
        self.save_information(raw_json)


if __name__ == '__main__' :
    runner = hongkongelection()
    runner.run()

But it failed. I wonder what I did wrong.

Full traceback:

Traceback (most recent call last):


 File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1477, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
    runner.run()
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
    platform_urls = self.extract_info_urls_platform(district_urls)
  File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
    raw_tree = etree.HTML(district_urls)
  File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings

I appreciate your help and time - looking forward to learning from this amazing community!


Solution

  • You were trying to scrape the content directly using lxml parser without even sending requests. I've brought about some changes in your xpaths which was not necessary though. I've also used generator to make it efficient. Make sure to add this save_information method within the script as I had to kick that out to see what was happening:

    import csv
    import time
    import random
    import requests
    from lxml import etree
    from typing import List
    from urllib.parse import urljoin
    
    class hongkongelection(object):
    
        def __init__(self):
            self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'
    
        def send_request(self, url):
            r = requests.get(url)
            r.raise_for_status()
            return r.text
    
        def extract_info_urls_district(self, url):
            res = self.send_request(url)
            raw_tree = etree.HTML(res)
            for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
                yield urljoin(url,pdf_url)
    
        def extract_info_urls_platform(self, url):
            res = self.send_request(url)
            raw_tree = etree.HTML(res)
            for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
                yield urljoin(url,pdf_url) 
    
        def extract_info(self, url):
            res = self.send_request(url)
            raw_tree = etree.HTML(res)
            dict_result = {}
            dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
            dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
            dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
            dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
            dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
            dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
            return dict_result
    
        def run(self):
            for district_url in self.extract_info_urls_district(self.url):
                for url in self.extract_info_urls_platform(district_url):
                    raw_json = self.extract_info(url)
                    raw_json['platform_url'] = url
                    print(raw_json)
                time.sleep(random.randint(3,8))
    
    
    if __name__ == '__main__' :
        runner = hongkongelection()
        runner.run()